/* AesOpt.c -- AES optimized code for x86 AES hardware instructions
Igor Pavlov : Public domain */

#include "Precomp.h"

#include "Aes.h"
#include "CpuArch.h"

#ifdef MY_CPU_X86_OR_AMD64

#if defined(__INTEL_COMPILER)
#if (__INTEL_COMPILER >= 1110)
#define USE_INTEL_AES
#if (__INTEL_COMPILER >= 1900)
#define USE_INTEL_VAES
#endif
#endif
#elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
   || defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40400)
#define USE_INTEL_AES
#if !defined(__AES__)
#define ATTRIB_AES __attribute__((__target__("aes")))
#endif
#if defined(__clang__) && (__clang_major__ >= 8) \
   || defined(__GNUC__) && (__GNUC__ >= 8)
#define USE_INTEL_VAES
#if !defined(__AES__) || !defined(__VAES__) || !defined(__AVX__) || !defined(__AVX2__)
#define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx,avx2")))
#endif
#endif
#elif defined(_MSC_VER)
#if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
#define USE_INTEL_AES
#if (_MSC_VER >= 1910)
#define USE_INTEL_VAES
#endif
#endif
#ifndef USE_INTEL_AES
#define Z7_USE_AES_HW_STUB
#endif
#ifndef USE_INTEL_VAES
#define Z7_USE_VAES_HW_STUB
#endif
#endif

#ifndef USE_INTEL_AES
// #define Z7_USE_AES_HW_STUB // for debug
#endif
#ifndef USE_INTEL_VAES
// #define Z7_USE_VAES_HW_STUB // for debug
#endif


#ifdef USE_INTEL_AES

#include <wmmintrin.h>

#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB)
#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte
// #define AES_TYPE_keys __m128i
// #define AES_TYPE_data __m128i
#endif

#ifndef ATTRIB_AES
#define ATTRIB_AES
#endif

#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
    // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks)

#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)
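
/*
  Layout of the block pointed to by ivAes, as the functions below read it
  (derived from the code in this file):
    p[0]                : IV (CBC) or counter (CTR)
    low 32 bits of p[1] : numRounds / 2  (5 for AES-128, 6 for AES-192, 7 for AES-256)
    p[2], p[3], ...     : round keys
  where p is (__m128i *)ivAes.
*/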

#define MM_OP(op, dest, src)  dest = op(dest, src);
#define MM_OP_m(op, src)      MM_OP(op, m, src)

#define MM_XOR( dest, src)    MM_OP(_mm_xor_si128, dest, src)

#if 1
// use aligned SSE load/store for data.
// Our AES functions require the data to be 16-byte aligned,
// so we can use this branch of code,
// and the compiler can use fused load-op SSE instructions:
//   xorps xmm0, XMMWORD PTR [rdx]
#define LOAD_128(pp)      (*(__m128i *)(void *)(pp))
#define STORE_128(pp, _v) *(__m128i *)(void *)(pp) = _v
// Alternative: aligned SSE load/store via intrinsics instead of direct access:
// #define LOAD_128(pp)      _mm_load_si128(pp)
// #define STORE_128(pp, _v) _mm_store_si128(pp, _v)
#else
// use unaligned load/store for data: movdqu XMMWORD PTR [rdx]
#define LOAD_128(pp)      _mm_loadu_si128(pp)
#define STORE_128(pp, _v) _mm_storeu_si128(pp, _v)
#endif

AES_FUNC_START2 (AesCbc_Encode_HW)
{
  if (numBlocks == 0)
    return;
  {
    __m128i *p = (__m128i *)(void *)ivAes;
    __m128i *data = (__m128i *)(void *)data8;
    __m128i m = *p;
    const __m128i k0 = p[2];
    const __m128i k1 = p[3];
    const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
    do
    {
      UInt32 r = numRounds2;
      const __m128i *w = p + 4;
      __m128i temp = LOAD_128(data);
      MM_XOR (temp, k0)
      MM_XOR (m, temp)
      MM_OP_m (_mm_aesenc_si128, k1)
      do
      {
        MM_OP_m (_mm_aesenc_si128, w[0])
        MM_OP_m (_mm_aesenc_si128, w[1])
        w += 2;
      }
      while (--r);
      MM_OP_m (_mm_aesenclast_si128, w[0])
      STORE_128(data, m);
      data++;
    }
    while (--numBlocks);
    *p = m;
  }
}
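
// Note: CBC encryption is inherently sequential: each ciphertext block is fed
// into the encryption of the next block, so unlike the decode / CTR functions
// below, this function cannot process several blocks in parallel.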


#define WOP_1(op)
#define WOP_2(op)   WOP_1 (op)  op (m1, 1)
#define WOP_3(op)   WOP_2 (op)  op (m2, 2)
#define WOP_4(op)   WOP_3 (op)  op (m3, 3)
#ifdef MY_CPU_AMD64
#define WOP_5(op)   WOP_4 (op)  op (m4, 4)
#define WOP_6(op)   WOP_5 (op)  op (m5, 5)
#define WOP_7(op)   WOP_6 (op)  op (m6, 6)
#define WOP_8(op)   WOP_7 (op)  op (m7, 7)
#endif
/*
#define WOP_9(op)   WOP_8 (op)  op (m8, 8);
#define WOP_10(op)  WOP_9 (op)  op (m9, 9);
#define WOP_11(op)  WOP_10(op)  op (m10, 10);
#define WOP_12(op)  WOP_11(op)  op (m11, 11);
#define WOP_13(op)  WOP_12(op)  op (m12, 12);
#define WOP_14(op)  WOP_13(op)  op (m13, 13);
*/

#ifdef MY_CPU_AMD64
  #define NUM_WAYS  8
  #define WOP_M1    WOP_8
#else
  #define NUM_WAYS  4
  #define WOP_M1    WOP_4
#endif

#define WOP(op)  op (m0, 0)  WOP_M1(op)
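
// WOP(op) repeats (op) for NUM_WAYS independent block registers, which lets the
// pipelined AES unit work on several blocks at once. For example (with NUM_WAYS == 4),
//   WOP (AES_ENC)
// expands to:
//   AES_ENC (m0, 0)  AES_ENC (m1, 1)  AES_ENC (m2, 2)  AES_ENC (m3, 3)
// and WOP_KEY(op, n) (defined below) loads the round key w[n] once into (key)
// and applies (op) with that key to all NUM_WAYS registers.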

#define DECLARE_VAR(reg, ii)  __m128i reg;
#define LOAD_data_ii(ii)      LOAD_128(data + (ii))
#define LOAD_data(  reg, ii)  reg = LOAD_data_ii(ii);
#define STORE_data( reg, ii)  STORE_128(data + (ii), reg);
#if (NUM_WAYS > 1)
#define XOR_data_M1(reg, ii)  MM_XOR (reg, LOAD_128(data + (ii- 1)))
#endif

#define MM_OP_key(op, reg)  MM_OP(op, reg, key);

#define AES_DEC(      reg, ii)  MM_OP_key (_mm_aesdec_si128,     reg)
#define AES_DEC_LAST( reg, ii)  MM_OP_key (_mm_aesdeclast_si128, reg)
#define AES_ENC(      reg, ii)  MM_OP_key (_mm_aesenc_si128,     reg)
#define AES_ENC_LAST( reg, ii)  MM_OP_key (_mm_aesenclast_si128, reg)
#define AES_XOR(      reg, ii)  MM_OP_key (_mm_xor_si128,        reg)

#define CTR_START(reg, ii)  MM_OP (_mm_add_epi64, ctr, one)  reg = ctr;
#define CTR_END(  reg, ii)  STORE_128(data + (ii), _mm_xor_si128(reg, \
    LOAD_128 (data + (ii))));

#define WOP_KEY(op, n) { \
    const __m128i key = w[n]; \
    WOP(op) }

#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    { dataEnd -= NUM_WAYS; do {  \

#define WIDE_LOOP_END  \
    data += NUM_WAYS;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS;  } \

#define SINGLE_LOOP \
    for (; data < dataEnd; data++)
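
// Processing pattern used by the functions below:
//   WIDE_LOOP_START ... WIDE_LOOP_END  processes NUM_WAYS blocks per iteration;
//   SINGLE_LOOP                        processes the remaining tail of fewer than NUM_WAYS blocks one at a time.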



#ifdef USE_INTEL_VAES

#define AVX_XOR(dest, src)        MM_OP(_mm256_xor_si256, dest, src)
#define AVX_DECLARE_VAR(reg, ii)  __m256i reg;
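
// VAES path: each 256-bit register holds two consecutive 16-byte blocks,
// so the AVX wide loop below processes NUM_WAYS * 2 blocks per iteration.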

#if 1
// use unaligned AVX load/store for data.
// Our AES functions only require the data to be 16-byte aligned,
// but here we read 32 bytes at a time,
// so we use intrinsics for unaligned AVX load/store.
// notes for _mm256_storeu_si256:
//   msvc2022: uses vmovdqu and keeps the order of the instruction sequence.
//   new gcc11 uses vmovdqu
//   old gcc9 could use a pair of instructions:
//     vmovups       %xmm7, -224(%rax)
//     vextracti128  $0x1, %ymm7, -208(%rax)
#define AVX_LOAD(p)       _mm256_loadu_si256((const __m256i *)(const void *)(p))
#define AVX_STORE(p, _v)  _mm256_storeu_si256((__m256i *)(void *)(p), _v);
#else
// use aligned AVX load/store for data.
// for debug: we can use this branch, if we are sure that data is 32-byte aligned.
// msvc2022 still uses vmovdqu
// gcc uses vmovdqa (which requires 32-byte alignment)
#define AVX_LOAD(p)       (*(const __m256i *)(const void *)(p))
#define AVX_STORE(p, _v)  (*(__m256i *)(void *)(p)) = _v;
#endif

#define AVX_LOAD_data(  reg, ii)  reg = AVX_LOAD((const __m256i *)(const void *)data + (ii));
#define AVX_STORE_data( reg, ii)  AVX_STORE((__m256i *)(void *)data + (ii), reg)
/*
AVX_XOR_data_M1() needs an unaligned memory load, even if (data)
is 256-bit aligned, because we read a 32-byte chunk that
crosses the (data) position: from (data - 16 bytes) to (data + 16 bytes).
*/
#define AVX_XOR_data_M1(reg, ii)  AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii)))

#define AVX_AES_DEC(      reg, ii)  MM_OP_key (_mm256_aesdec_epi128,     reg)
#define AVX_AES_DEC_LAST( reg, ii)  MM_OP_key (_mm256_aesdeclast_epi128, reg)
#define AVX_AES_ENC(      reg, ii)  MM_OP_key (_mm256_aesenc_epi128,     reg)
#define AVX_AES_ENC_LAST( reg, ii)  MM_OP_key (_mm256_aesenclast_epi128, reg)
#define AVX_AES_XOR(      reg, ii)  MM_OP_key (_mm256_xor_si256,         reg)
#define AVX_CTR_START(reg, ii) \
    MM_OP (_mm256_add_epi64, ctr2, two) \
    reg = _mm256_xor_si256(ctr2, key);

#define AVX_CTR_END(reg, ii) \
    AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \
    AVX_LOAD ((__m256i *)(void *)data + (ii))));

#define AVX_WOP_KEY(op, n) { \
    const __m256i key = w[n]; \
    WOP(op) }

#define NUM_AES_KEYS_MAX 15
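
// 15 is the maximum number of AES round keys: AES-256 uses 14 rounds,
// that is, 14 round keys plus the initial key.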

#define WIDE_LOOP_START_AVX(OP) \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS * 2) \
    { __m256i keys[NUM_AES_KEYS_MAX]; \
      OP \
      { UInt32 ii; for (ii = 0; ii < numRounds; ii++) \
        keys[ii] = _mm256_broadcastsi128_si256(p[ii]); } \
      dataEnd -= NUM_WAYS * 2; \
      do { \

#define WIDE_LOOP_END_AVX(OP) \
    data += NUM_WAYS * 2; \
    } while (data <= dataEnd); \
    dataEnd += NUM_WAYS * 2; \
    OP \
    _mm256_zeroupper(); \
    } \

/* MSVC for x86: If we don't call _mm256_zeroupper(), and -arch:IA32 is not specified,
   MSVC can still insert a vzeroupper instruction. */

#endif


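// CBC decryption can be parallelized: each plaintext block is
//   Decrypt(C[i]) XOR C[i-1]
// and the ciphertext blocks are independent inputs, so the wide loop
// below decrypts NUM_WAYS blocks at a time.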
AES_FUNC_START2 (AesCbc_Decode_HW)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i iv = *p;
  const __m128i * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1;
  const __m128i *dataEnd;
  p += 2;

  WIDE_LOOP_START
  {
    const __m128i *w = wStart;
    WOP (DECLARE_VAR)
    WOP (LOAD_data)
    WOP_KEY (AES_XOR, 1)
    do
    {
      WOP_KEY (AES_DEC, 0)
      w--;
    }
    while (w != p);
    WOP_KEY (AES_DEC_LAST, 0)

    MM_XOR (m0, iv)
    WOP_M1 (XOR_data_M1)
    LOAD_data(iv, NUM_WAYS - 1)
    WOP (STORE_data)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const __m128i *w = wStart - 1;
    __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));

    do
    {
      MM_OP_m (_mm_aesdec_si128, w[1])
      MM_OP_m (_mm_aesdec_si128, w[0])
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128, w[1])
    MM_OP_m (_mm_aesdeclast_si128, w[0])
    MM_XOR (m, iv)
    LOAD_data(iv, 0)
    STORE_data(m, 0)
  }

  p[-2] = iv;
}


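// CTR mode: the low 64-bit lane of the counter is incremented by 1 for each block
// (one = _mm_cvtsi32_si128(1)), the counter block is encrypted, and the result is
// XORed into the data in place; this works for both encryption and decryption.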
AES_FUNC_START2 (AesCtr_Code_HW)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i ctr = *p;
  const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
  const __m128i *dataEnd;
  const __m128i one = _mm_cvtsi32_si128(1);

  p += 2;

  WIDE_LOOP_START
  {
    const __m128i *w = p;
    UInt32 r = numRoundsMinus2;
    WOP (DECLARE_VAR)
    WOP (CTR_START)
    WOP_KEY (AES_XOR, 0)
    w += 1;
    do
    {
      WOP_KEY (AES_ENC, 0)
      w += 1;
    }
    while (--r);
    WOP_KEY (AES_ENC_LAST, 0)
    WOP (CTR_END)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one)
    m = _mm_xor_si128 (ctr, p[0]);
    w += 1;
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128, w[0])
    MM_OP_m (_mm_aesenclast_si128, w[1])
    CTR_END (m, 0)
  }

  p[-2] = ctr;
}



#ifdef USE_INTEL_VAES

/*
GCC before 2013-Jun:
  <immintrin.h>:
    #ifdef __AVX__
    #include <avxintrin.h>
    #endif
GCC after 2013-Jun:
  <immintrin.h>:
    #include <avxintrin.h>
CLANG 3.8+:
{
  <immintrin.h>:
    #if !defined(_MSC_VER) || defined(__AVX__)
    #include <avxintrin.h>
    #endif

  if (the compiler is clang for Windows and the global arch does not set __AVX__)
  [ i.e. if (defined(_MSC_VER) && !defined(__AVX__)) ]
  {
    <immintrin.h> doesn't include <avxintrin.h>,
    and we have 2 ways to fix it:
      1) we can define the required __AVX__ before <immintrin.h>
      or
      2) we can include <avxintrin.h> after <immintrin.h>
  }
}

If we include <avxintrin.h> manually for GCC/CLANG,
<immintrin.h> must be included before <avxintrin.h>.
*/

/*
#if defined(__clang__) && defined(_MSC_VER)
#define __AVX__
#define __AVX2__
#define __VAES__
#endif
*/

#include <immintrin.h>
#if defined(__clang__) && defined(_MSC_VER)
#if !defined(__AVX__)
#include <avxintrin.h>
#endif
#if !defined(__AVX2__)
#include <avx2intrin.h>
#endif
#if !defined(__VAES__)
#include <vaesintrin.h>
#endif
#endif // __clang__ && _MSC_VER

#ifndef ATTRIB_VAES
#define ATTRIB_VAES
#endif

#define VAES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_VAES \
AES_FUNC_START (name)

VAES_FUNC_START2 (AesCbc_Decode_HW_256)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i iv = *p;
  const __m128i *dataEnd;
  const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
  p += 2;

  WIDE_LOOP_START_AVX(;)
  {
    const __m256i *w = keys + numRounds - 2;

    WOP (AVX_DECLARE_VAR)
    WOP (AVX_LOAD_data)
    AVX_WOP_KEY (AVX_AES_XOR, 1)

    do
    {
      AVX_WOP_KEY (AVX_AES_DEC, 0)
      w--;
    }
    while (w != keys);
    AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)

    AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0)))
    WOP_M1 (AVX_XOR_data_M1)
    LOAD_data (iv, NUM_WAYS * 2 - 1)
    WOP (AVX_STORE_data)
  }
  WIDE_LOOP_END_AVX(;)

  SINGLE_LOOP
  {
    const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2;
    __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
    do
    {
      MM_OP_m (_mm_aesdec_si128, w[1])
      MM_OP_m (_mm_aesdec_si128, w[0])
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128, w[1])
    MM_OP_m (_mm_aesdeclast_si128, w[0])

    MM_XOR (m, iv)
    LOAD_data(iv, 0)
    STORE_data(m, 0)
  }

  p[-2] = iv;
}


/*
  SSE2: _mm_cvtsi32_si128              : movd
  AVX:  _mm256_setr_m128i              : vinsertf128
  AVX2: _mm256_add_epi64               : vpaddq ymm, ymm, ymm
        _mm256_extracti128_si256       : vextracti128
        _mm256_broadcastsi128_si256    : vbroadcasti128
*/

#define AVX_CTR_LOOP_START \
    ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \
    two = _mm256_setr_m128i(one, one); \
    two = _mm256_add_epi64(two, two); \

// two = _mm256_setr_epi64x(2, 0, 2, 0);

#define AVX_CTR_LOOP_ENC \
    ctr = _mm256_extracti128_si256 (ctr2, 1); \

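// AVX_CTR_LOOP_START seeds ctr2 with two consecutive counter values:
//   low 128-bit lane = ctr - 1, high 128-bit lane = ctr,
// and (two) = (2, 0, 2, 0), so AVX_CTR_START advances both lanes by 2 per use;
// AVX_CTR_LOOP_ENC then extracts the high lane so the SINGLE_LOOP can continue
// from the last counter value used.
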
VAES_FUNC_START2 (AesCtr_Code_HW_256)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i ctr = *p;
  const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
  const __m128i *dataEnd;
  const __m128i one = _mm_cvtsi32_si128(1);
  __m256i ctr2, two;
  p += 2;

  WIDE_LOOP_START_AVX (AVX_CTR_LOOP_START)
  {
    const __m256i *w = keys;
    UInt32 r = numRounds - 2;
    WOP (AVX_DECLARE_VAR)
    AVX_WOP_KEY (AVX_CTR_START, 0)

    w += 1;
    do
    {
      AVX_WOP_KEY (AVX_AES_ENC, 0)
      w += 1;
    }
    while (--r);
    AVX_WOP_KEY (AVX_AES_ENC_LAST, 0)

    WOP (AVX_CTR_END)
  }
  WIDE_LOOP_END_AVX (AVX_CTR_LOOP_ENC)

  SINGLE_LOOP
  {
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one)
    m = _mm_xor_si128 (ctr, p[0]);
    w += 1;
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128, w[0])
    MM_OP_m (_mm_aesenclast_si128, w[1])
    CTR_END (m, 0)
  }

  p[-2] = ctr;
}

#endif // USE_INTEL_VAES

#else // USE_INTEL_AES

/* no USE_INTEL_AES */

#if defined(Z7_USE_AES_HW_STUB)
// We can compile this file with another C compiler,
// or we can compile an asm version.
// So real code can be generated instead of these stub functions.
// #if defined(_MSC_VER)
#pragma message("AES HW_SW stub was used")
// #endif

#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB)
#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte
#endif

#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \

#define AES_COMPAT_STUB(name) \
    AES_FUNC_START(name); \
    AES_FUNC_START(name ## _HW) \
    { name(p, data, numBlocks); }
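
// The stub just forwards each *_HW entry point to the corresponding portable
// software implementation (AesCbc_Encode / AesCbc_Decode / AesCtr_Code),
// so callers can always link and call the *_HW symbols.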

AES_COMPAT_STUB (AesCbc_Encode)
AES_COMPAT_STUB (AesCbc_Decode)
AES_COMPAT_STUB (AesCtr_Code)
#endif // Z7_USE_AES_HW_STUB

#endif // USE_INTEL_AES


#ifndef USE_INTEL_VAES
#if defined(Z7_USE_VAES_HW_STUB)
// #if defined(_MSC_VER)
#pragma message("VAES HW_SW stub was used")
// #endif

#define VAES_COMPAT_STUB(name) \
    void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
    void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \
    { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); }

VAES_COMPAT_STUB (AesCbc_Decode_HW)
VAES_COMPAT_STUB (AesCtr_Code_HW)
#endif
#endif // ! USE_INTEL_VAES




#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)

#if defined(__ARM_FEATURE_AES) \
 || defined(__ARM_FEATURE_CRYPTO)
#define USE_HW_AES
#else
#if defined(MY_CPU_ARM64) \
 || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
 || defined(Z7_MSC_VER_ORIGINAL)
#if defined(__ARM_FP) && \
    (  defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
    || defined(__GNUC__) && (__GNUC__ >= 6) \
    ) \
 || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
#if defined(MY_CPU_ARM64) \
 || !defined(Z7_CLANG_VERSION) \
 || defined(__ARM_NEON) && \
      (Z7_CLANG_VERSION < 170000 || \
       Z7_CLANG_VERSION > 170001)
#define USE_HW_AES
#endif
#endif
#endif
#endif

#ifdef USE_HW_AES

// #pragma message("=== AES HW === ")
// __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_AES

#if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_AES) && \
    !defined(__ARM_FEATURE_CRYPTO)
#ifdef MY_CPU_ARM64
#if defined(__clang__)
#define ATTRIB_AES __attribute__((__target__("crypto")))
#else
#define ATTRIB_AES __attribute__((__target__("+crypto")))
#endif
#else
#if defined(__clang__)
#define ATTRIB_AES __attribute__((__target__("armv8-a,aes")))
#else
#define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
#endif
#endif
#endif
#else
// _MSC_VER
// for arm32
#define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#ifndef ATTRIB_AES
#define ATTRIB_AES
#endif

#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else
/*
  clang-17.0.1: error : Cannot select: intrinsic %llvm.arm.neon.aese
  clang:
    3.8.1  : __ARM_NEON             : defined(__ARM_FEATURE_CRYPTO)
    7.0.1  : __ARM_NEON             : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
    11.?.0 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
    13.0.1 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_AES)
    16     : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8
*/
#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_AES) && \
    !defined(__ARM_FEATURE_CRYPTO)
// #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
// #if defined(__clang__) && __clang_major__ < 13
#define __ARM_FEATURE_CRYPTO 1
// #else
#define __ARM_FEATURE_AES 1
// #endif
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang

#if defined(__clang__)

#if defined(__ARM_ARCH) && __ARM_ARCH < 8
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
// #pragma message("#define __ARM_ARCH 8")
#undef  __ARM_ARCH
#define __ARM_ARCH 8
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif

#endif // clang

#include <arm_neon.h>

#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
    defined(__ARM_FEATURE_CRYPTO) && \
    defined(__ARM_FEATURE_AES)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
#undef __ARM_FEATURE_CRYPTO
#undef __ARM_FEATURE_AES
#undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
// #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
#endif

#endif // Z7_MSC_VER_ORIGINAL

typedef uint8x16_t v128;

#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
    // void Z7_FASTCALL name(v128 *p, v128 *data, size_t numBlocks)

#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

#define MM_OP(op, dest, src)  dest = op(dest, src);
#define MM_OP_m(op, src)      MM_OP(op, m, src)
#define MM_OP1_m(op)          m = op(m);

#define MM_XOR( dest, src)    MM_OP(veorq_u8, dest, src)
#define MM_XOR_m( src)        MM_XOR(m, src)

#define AES_E_m(k)     MM_OP_m (vaeseq_u8, k)
#define AES_E_MC_m(k)  AES_E_m (k)  MM_OP1_m(vaesmcq_u8)
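
// Note on the ARMv8 AES instructions (as used below):
//   vaeseq_u8(m, k)  computes ShiftRows(SubBytes(m ^ k))   (AESE : the round key is XORed first)
//   vaesmcq_u8(m)    computes MixColumns(m)                 (AESMC)
// so one full encryption round is AES_E_MC_m(k), and the last round is
// AES_E_m(k) followed by XOR with the final round key; this is why the loop
// structure differs from the x86 path, where aesenc XORs the round key last.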


AES_FUNC_START2 (AesCbc_Encode_HW)
{
  if (numBlocks == 0)
    return;
  {
    v128 * const p = (v128 *)(void *)ivAes;
    v128 *data = (v128 *)(void *)data8;
    v128 m = *p;
    const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
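    // numRounds2 == numRounds / 2 : 5 for AES-128, 6 for AES-192, 7 for AES-256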
    const v128 *w = p + (size_t)numRounds2 * 2;
    const v128 k0 = p[2];
    const v128 k1 = p[3];
    const v128 k2 = p[4];
    const v128 k3 = p[5];
    const v128 k4 = p[6];
    const v128 k5 = p[7];
    const v128 k6 = p[8];
    const v128 k7 = p[9];
    const v128 k8 = p[10];
    const v128 k9 = p[11];
    const v128 k_z4 = w[-2];
    const v128 k_z3 = w[-1];
    const v128 k_z2 = w[0];
    const v128 k_z1 = w[1];
    const v128 k_z0 = w[2];
    // we don't use the veorq_u8(*data, k_z0) optimization that could save one cycle,
    // because gcc/clang compilers do not handle that optimization well.
    do
    {
      MM_XOR_m (*data)
      AES_E_MC_m (k0)
      AES_E_MC_m (k1)
      AES_E_MC_m (k2)
      AES_E_MC_m (k3)
      AES_E_MC_m (k4)
      AES_E_MC_m (k5)
      if (numRounds2 >= 6)
      {
        AES_E_MC_m (k6)
        AES_E_MC_m (k7)
        if (numRounds2 != 6)
        {
          AES_E_MC_m (k8)
          AES_E_MC_m (k9)
        }
      }
      AES_E_MC_m (k_z4)
      AES_E_MC_m (k_z3)
      AES_E_MC_m (k_z2)
      AES_E_m (k_z1)
      MM_XOR_m (k_z0)
      *data++ = m;
    }
    while (--numBlocks);
    *p = m;
  }
}


#define WOP_1(op)
#define WOP_2(op)  WOP_1 (op)  op (m1, 1)
#define WOP_3(op)  WOP_2 (op)  op (m2, 2)
#define WOP_4(op)  WOP_3 (op)  op (m3, 3)
#define WOP_5(op)  WOP_4 (op)  op (m4, 4)
#define WOP_6(op)  WOP_5 (op)  op (m5, 5)
#define WOP_7(op)  WOP_6 (op)  op (m6, 6)
#define WOP_8(op)  WOP_7 (op)  op (m7, 7)

#define NUM_WAYS  8
#define WOP_M1    WOP_8

#define WOP(op)  op (m0, 0)  WOP_M1(op)

#define DECLARE_VAR(reg, ii)  v128 reg;
#define LOAD_data(  reg, ii)  reg = data[ii];
#define STORE_data( reg, ii)  data[ii] = reg;
#if (NUM_WAYS > 1)
#define XOR_data_M1(reg, ii)  MM_XOR (reg, data[ii- 1])
#endif

#define MM_OP_key(op, reg)  MM_OP (op, reg, key)

#define AES_D_m(k)      MM_OP_m (vaesdq_u8, k)
#define AES_D_IMC_m(k)  AES_D_m (k)  MM_OP1_m (vaesimcq_u8)
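
// Similarly for decryption (as used below):
//   vaesdq_u8(m, k)  computes InvShiftRows(InvSubBytes(m ^ k))   (AESD)
//   vaesimcq_u8(m)   computes InvMixColumns(m)                   (AESIMC)
// and the round keys are applied in reverse order.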

#define AES_XOR( reg, ii)  MM_OP_key (veorq_u8,  reg)
#define AES_D(   reg, ii)  MM_OP_key (vaesdq_u8, reg)
#define AES_E(   reg, ii)  MM_OP_key (vaeseq_u8, reg)

#define AES_D_IMC( reg, ii)  AES_D (reg, ii)  reg = vaesimcq_u8(reg);
#define AES_E_MC(  reg, ii)  AES_E (reg, ii)  reg = vaesmcq_u8(reg);

#define CTR_START(reg, ii)  MM_OP (vaddq_u64, ctr, one)  reg = vreinterpretq_u8_u64(ctr);
#define CTR_END(  reg, ii)  MM_XOR (data[ii], reg)

#define WOP_KEY(op, n) { \
    const v128 key = w[n]; \
    WOP(op) }

#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    { dataEnd -= NUM_WAYS; do {  \

#define WIDE_LOOP_END  \
    data += NUM_WAYS;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS;  } \

#define SINGLE_LOOP \
    for (; data < dataEnd; data++)


AES_FUNC_START2 (AesCbc_Decode_HW)
{
  v128 *p = (v128 *)(void *)ivAes;
  v128 *data = (v128 *)(void *)data8;
  v128 iv = *p;
  const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2;
  const v128 *dataEnd;
  p += 2;

  WIDE_LOOP_START
  {
    const v128 *w = wStart;
    WOP (DECLARE_VAR)
    WOP (LOAD_data)
    WOP_KEY (AES_D_IMC, 2)
    do
    {
      WOP_KEY (AES_D_IMC, 1)
      WOP_KEY (AES_D_IMC, 0)
      w -= 2;
    }
    while (w != p);
    WOP_KEY (AES_D, 1)
    WOP_KEY (AES_XOR, 0)
    MM_XOR (m0, iv)
    WOP_M1 (XOR_data_M1)
    LOAD_data(iv, NUM_WAYS - 1)
    WOP (STORE_data)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = wStart;
    v128 m;  LOAD_data(m, 0)
    AES_D_IMC_m (w[2])
    do
    {
      AES_D_IMC_m (w[1])
      AES_D_IMC_m (w[0])
      w -= 2;
    }
    while (w != p);
    AES_D_m  (w[1])
    MM_XOR_m (w[0])
    MM_XOR_m (iv)
    LOAD_data(iv, 0)
    STORE_data(m, 0)
  }

  p[-2] = iv;
}


AES_FUNC_START2 (AesCtr_Code_HW)
{
  v128 *p = (v128 *)(void *)ivAes;
  v128 *data = (v128 *)(void *)data8;
  uint64x2_t ctr = vreinterpretq_u64_u8(*p);
  const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2;
  const v128 *dataEnd;
  // the bug in clang:
  // __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2);
#if defined(__clang__) && (__clang_major__ <= 9)
#pragma GCC diagnostic ignored "-Wvector-conversion"
#endif
  const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0);
  p += 2;

  WIDE_LOOP_START
  {
    const v128 *w = p;
    WOP (DECLARE_VAR)
    WOP (CTR_START)
    do
    {
      WOP_KEY (AES_E_MC, 0)
      WOP_KEY (AES_E_MC, 1)
      w += 2;
    }
    while (w != wEnd);
    WOP_KEY (AES_E_MC, 0)
    WOP_KEY (AES_E,    1)
    WOP_KEY (AES_XOR,  2)
    WOP (CTR_END)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = p;
    v128 m;
    CTR_START (m, 0)
    do
    {
      AES_E_MC_m (w[0])
      AES_E_MC_m (w[1])
      w += 2;
    }
    while (w != wEnd);
    AES_E_MC_m (w[0])
    AES_E_m    (w[1])
    MM_XOR_m   (w[2])
    CTR_END (m, 0)
  }

  p[-2] = vreinterpretq_u8_u64(ctr);
}

#endif // USE_HW_AES

#endif // MY_CPU_ARM_OR_ARM64

#undef NUM_WAYS
#undef WOP_M1
#undef WOP
#undef DECLARE_VAR
#undef LOAD_data
#undef STORE_data
#undef USE_INTEL_AES
#undef USE_HW_AES