/* AesOpt.c -- AES optimized code for x86 AES hardware instructions
Igor Pavlov : Public domain */

#include "Precomp.h"

#include "Aes.h"
#include "CpuArch.h"

#ifdef MY_CPU_X86_OR_AMD64

  #if defined(__INTEL_COMPILER)
    #if (__INTEL_COMPILER >= 1110)
      #define USE_INTEL_AES
      #if (__INTEL_COMPILER >= 1900)
        #define USE_INTEL_VAES
      #endif
    #endif
  #elif defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
     || defined(Z7_GCC_VERSION)   && (Z7_GCC_VERSION   >= 40400)
        #define USE_INTEL_AES
        #if !defined(__AES__)
          #define ATTRIB_AES __attribute__((__target__("aes")))
        #endif
      #if defined(__clang__) && (__clang_major__ >= 8) \
          || defined(__GNUC__) && (__GNUC__ >= 8)
        #define USE_INTEL_VAES
        #if !defined(__AES__) || !defined(__VAES__) || !defined(__AVX__) || !defined(__AVX2__)
          #define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx,avx2")))
        #endif
      #endif
  #elif defined(_MSC_VER)
    #if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
      #define USE_INTEL_AES
      #if (_MSC_VER >= 1910)
        #define USE_INTEL_VAES
      #endif
    #endif
    #ifndef USE_INTEL_AES
      #define Z7_USE_AES_HW_STUB
    #endif
    #ifndef USE_INTEL_VAES
      #define Z7_USE_VAES_HW_STUB
    #endif
  #endif

    #ifndef USE_INTEL_AES
      // #define Z7_USE_AES_HW_STUB // for debug
    #endif
    #ifndef USE_INTEL_VAES
      // #define Z7_USE_VAES_HW_STUB // for debug
    #endif


#ifdef USE_INTEL_AES

#include <wmmintrin.h>

#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB)
#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte
// #define AES_TYPE_keys __m128i
// #define AES_TYPE_data __m128i
#endif

#ifndef ATTRIB_AES
  #define ATTRIB_AES
#endif

#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
    // void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks)

#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

#define MM_OP(op, dest, src)  dest = op(dest, src);
#define MM_OP_m(op, src)      MM_OP(op, m, src)

#define MM_XOR( dest, src)    MM_OP(_mm_xor_si128,    dest, src)

#if 1
// use aligned SSE load/store for data.
// Our Aes functions require the data to be 16-byte aligned,
// so we can use this branch of code,
// and the compiler can use fused load-op SSE instructions:
//   xorps xmm0, XMMWORD PTR [rdx]
#define LOAD_128(pp)        (*(__m128i *)(void *)(pp))
#define STORE_128(pp, _v)    *(__m128i *)(void *)(pp) = _v
// Alternative: aligned SSE load/store intrinsics instead of direct pointer access:
// #define LOAD_128(pp)        _mm_load_si128(pp)
// #define STORE_128(pp, _v)   _mm_store_si128(pp, _v)
#else
// use unaligned load/store for data: movdqu XMMWORD PTR [rdx]
#define LOAD_128(pp)        _mm_loadu_si128(pp)
#define STORE_128(pp, _v)   _mm_storeu_si128(pp, _v)
#endif

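/*
  Layout of the (ivAes) buffer, as assumed from the way the functions below index (p)
  when it is viewed as an array of __m128i words:
    p[0]      : IV (for CBC) or counter block (for CTR)
    p[1]      : the low 32 bits hold numRounds / 2
    p[2] ...  : expanded round keys
*/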
AES_FUNC_START2 (AesCbc_Encode_HW)
{
  if (numBlocks == 0)
    return;
  {
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i m = *p;
  const __m128i k0 = p[2];
  const __m128i k1 = p[3];
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1) - 1;
  do
  {
    UInt32 r = numRounds2;
    const __m128i *w = p + 4;
    __m128i temp = LOAD_128(data);
    MM_XOR (temp, k0)
    MM_XOR (m, temp)
    MM_OP_m (_mm_aesenc_si128, k1)
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--r);
    MM_OP_m (_mm_aesenclast_si128, w[0])
    STORE_128(data, m);
    data++;
  }
  while (--numBlocks);
  *p = m;
  }
}


#define WOP_1(op)
#define WOP_2(op)   WOP_1 (op)  op (m1, 1)
#define WOP_3(op)   WOP_2 (op)  op (m2, 2)
#define WOP_4(op)   WOP_3 (op)  op (m3, 3)
#ifdef MY_CPU_AMD64
#define WOP_5(op)   WOP_4 (op)  op (m4, 4)
#define WOP_6(op)   WOP_5 (op)  op (m5, 5)
#define WOP_7(op)   WOP_6 (op)  op (m6, 6)
#define WOP_8(op)   WOP_7 (op)  op (m7, 7)
#endif
/*
#define WOP_9(op)   WOP_8 (op)  op (m8, 8);
#define WOP_10(op)  WOP_9 (op)  op (m9, 9);
#define WOP_11(op)  WOP_10(op)  op (m10, 10);
#define WOP_12(op)  WOP_11(op)  op (m11, 11);
#define WOP_13(op)  WOP_12(op)  op (m12, 12);
#define WOP_14(op)  WOP_13(op)  op (m13, 13);
*/

#ifdef MY_CPU_AMD64
  #define NUM_WAYS      8
  #define WOP_M1    WOP_8
#else
  #define NUM_WAYS      4
  #define WOP_M1    WOP_4
#endif

#define WOP(op)  op (m0, 0)  WOP_M1(op)
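/*
  Note: WOP(op) applies (op) to NUM_WAYS independent block registers.
  For example, with MY_CPU_AMD64 (NUM_WAYS == 8) WOP(DECLARE_VAR) expands to
    __m128i m0; __m128i m1; ... __m128i m7;
  and on 32-bit x86 (NUM_WAYS == 4) it covers only m0..m3.
*/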

#define DECLARE_VAR(reg, ii)  __m128i reg;
#define LOAD_data_ii(ii)      LOAD_128(data + (ii))
#define LOAD_data(  reg, ii)  reg = LOAD_data_ii(ii);
#define STORE_data( reg, ii)  STORE_128(data + (ii), reg);
#if (NUM_WAYS > 1)
#define XOR_data_M1(reg, ii)  MM_XOR (reg, LOAD_128(data + (ii- 1)))
#endif

#define MM_OP_key(op, reg)  MM_OP(op, reg, key);

#define AES_DEC(      reg, ii)   MM_OP_key (_mm_aesdec_si128,     reg)
#define AES_DEC_LAST( reg, ii)   MM_OP_key (_mm_aesdeclast_si128, reg)
#define AES_ENC(      reg, ii)   MM_OP_key (_mm_aesenc_si128,     reg)
#define AES_ENC_LAST( reg, ii)   MM_OP_key (_mm_aesenclast_si128, reg)
#define AES_XOR(      reg, ii)   MM_OP_key (_mm_xor_si128,        reg)

#define CTR_START(reg, ii)  MM_OP (_mm_add_epi64, ctr, one)  reg = ctr;
#define CTR_END(  reg, ii)  STORE_128(data + (ii), _mm_xor_si128(reg, \
                            LOAD_128 (data + (ii))));
#define WOP_KEY(op, n) { \
    const __m128i key = w[n]; \
    WOP(op) }

#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    { dataEnd -= NUM_WAYS; do {  \

#define WIDE_LOOP_END  \
    data += NUM_WAYS;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS; }  \

#define SINGLE_LOOP  \
    for (; data < dataEnd; data++)
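/*
  Processing pattern used below: WIDE_LOOP_START / WIDE_LOOP_END handle
  NUM_WAYS blocks per iteration to keep several independent AES chains in
  flight; SINGLE_LOOP then processes the remaining (numBlocks % NUM_WAYS)
  blocks one at a time.
*/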



#ifdef USE_INTEL_VAES

#define AVX_XOR(dest, src)    MM_OP(_mm256_xor_si256, dest, src)
#define AVX_DECLARE_VAR(reg, ii)  __m256i reg;

#if 1
// use unaligned AVX load/store for data.
// Our Aes functions only require the data to be 16-byte aligned,
// but here we read 32 bytes at a time,
// so we use intrinsics for unaligned AVX load/store.
// notes for _mm256_storeu_si256:
// msvc2022: uses vmovdqu and keeps the order of instruction sequence.
// new gcc11 uses vmovdqu
// old gcc9 could use pair of instructions:
//   vmovups        %xmm7, -224(%rax)
//   vextracti128   $0x1, %ymm7, -208(%rax)
#define AVX_LOAD(p)         _mm256_loadu_si256((const __m256i *)(const void *)(p))
#define AVX_STORE(p, _v)    _mm256_storeu_si256((__m256i *)(void *)(p), _v);
#else
// use aligned AVX load/store for data.
// for debug: we can use this branch if we are sure that data is 32-byte aligned.
// msvc2022 still uses vmovdqu
// gcc      uses vmovdqa (which requires 32-byte alignment)
#define AVX_LOAD(p)         (*(const __m256i *)(const void *)(p))
#define AVX_STORE(p, _v)    (*(__m256i *)(void *)(p)) = _v;
#endif

#define AVX_LOAD_data(  reg, ii)  reg = AVX_LOAD((const __m256i *)(const void *)data + (ii));
#define AVX_STORE_data( reg, ii)  AVX_STORE((__m256i *)(void *)data + (ii), reg)
/*
AVX_XOR_data_M1() needs an unaligned memory load even if (data)
is 256-bit aligned, because we read a 32-byte chunk that
crosses the (data) position: from (data - 16 bytes) to (data + 16 bytes).
*/
#define AVX_XOR_data_M1(reg, ii)  AVX_XOR (reg, _mm256_loadu_si256((const __m256i *)(const void *)(data - 1) + (ii)))

#define AVX_AES_DEC(      reg, ii)   MM_OP_key (_mm256_aesdec_epi128,     reg)
#define AVX_AES_DEC_LAST( reg, ii)   MM_OP_key (_mm256_aesdeclast_epi128, reg)
#define AVX_AES_ENC(      reg, ii)   MM_OP_key (_mm256_aesenc_epi128,     reg)
#define AVX_AES_ENC_LAST( reg, ii)   MM_OP_key (_mm256_aesenclast_epi128, reg)
#define AVX_AES_XOR(      reg, ii)   MM_OP_key (_mm256_xor_si256,         reg)
#define AVX_CTR_START(reg, ii)  \
    MM_OP (_mm256_add_epi64, ctr2, two) \
    reg = _mm256_xor_si256(ctr2, key);

#define AVX_CTR_END(reg, ii)  \
    AVX_STORE((__m256i *)(void *)data + (ii), _mm256_xor_si256(reg, \
    AVX_LOAD ((__m256i *)(void *)data + (ii))));

#define AVX_WOP_KEY(op, n) { \
    const __m256i key = w[n]; \
    WOP(op) }

#define NUM_AES_KEYS_MAX 15
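// 15 is the maximum number of round keys: AES-256 uses 14 rounds,
// i.e. 14 round keys plus the initial whitening key.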

#define WIDE_LOOP_START_AVX(OP)  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS * 2)  \
    { __m256i keys[NUM_AES_KEYS_MAX];  \
      OP  \
      { UInt32 ii; for (ii = 0; ii < numRounds; ii++)  \
        keys[ii] = _mm256_broadcastsi128_si256(p[ii]); }  \
      dataEnd -= NUM_WAYS * 2; \
      do {  \

#define WIDE_LOOP_END_AVX(OP)  \
        data += NUM_WAYS * 2;  \
      } while (data <= dataEnd);  \
      dataEnd += NUM_WAYS * 2;  \
      OP  \
      _mm256_zeroupper();  \
    }  \

/* MSVC for x86: if we don't call _mm256_zeroupper(), and -arch:IA32 is not specified,
   MSVC can still insert a vzeroupper instruction. */

#endif



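/*
  Unlike CBC encryption (which is serial, because each block depends on the
  previous ciphertext block), CBC decryption of a block needs only already
  available ciphertext, so NUM_WAYS blocks can be decrypted in parallel here.
*/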
AES_FUNC_START2 (AesCbc_Decode_HW)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i iv = *p;
  const __m128i * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2 + 2 - 1;
  const __m128i *dataEnd;
  p += 2;

  WIDE_LOOP_START
  {
    const __m128i *w = wStart;
    WOP (DECLARE_VAR)
    WOP (LOAD_data)
    WOP_KEY (AES_XOR, 1)
    do
    {
      WOP_KEY (AES_DEC, 0)

      w--;
    }
    while (w != p);
    WOP_KEY (AES_DEC_LAST, 0)

    MM_XOR (m0, iv)
    WOP_M1 (XOR_data_M1)
    LOAD_data(iv, NUM_WAYS - 1)
    WOP (STORE_data)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const __m128i *w = wStart - 1;
    __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));

    do
    {
      MM_OP_m (_mm_aesdec_si128, w[1])
      MM_OP_m (_mm_aesdec_si128, w[0])
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128,     w[1])
    MM_OP_m (_mm_aesdeclast_si128, w[0])
    MM_XOR (m, iv)
    LOAD_data(iv, 0)
    STORE_data(m, 0)
  }

  p[-2] = iv;
}


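/*
  CTR mode: the counter blocks are independent, so the wide loop keeps
  NUM_WAYS encryptions in flight; each output block is keystream XOR data.
*/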
AES_FUNC_START2 (AesCtr_Code_HW)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i ctr = *p;
  const UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
  const __m128i *dataEnd;
  const __m128i one = _mm_cvtsi32_si128(1);

  p += 2;

  WIDE_LOOP_START
  {
    const __m128i *w = p;
    UInt32 r = numRoundsMinus2;
    WOP (DECLARE_VAR)
    WOP (CTR_START)
    WOP_KEY (AES_XOR, 0)
    w += 1;
    do
    {
      WOP_KEY (AES_ENC, 0)
      w += 1;
    }
    while (--r);
    WOP_KEY (AES_ENC_LAST, 0)
    WOP (CTR_END)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one)
    m = _mm_xor_si128 (ctr, p[0]);
    w += 1;
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128,     w[0])
    MM_OP_m (_mm_aesenclast_si128, w[1])
    CTR_END (m, 0)
  }

  p[-2] = ctr;
}



#ifdef USE_INTEL_VAES

/*
GCC before 2013-Jun:
  <immintrin.h>:
    #ifdef __AVX__
     #include <avxintrin.h>
    #endif
GCC after 2013-Jun:
  <immintrin.h>:
    #include <avxintrin.h>
CLANG 3.8+:
{
  <immintrin.h>:
    #if !defined(_MSC_VER) || defined(__AVX__)
      #include <avxintrin.h>
    #endif

  if (the compiler is clang for Windows and if global arch is not set for __AVX__)
    [ if (defined(_MSC_VER) && !defined(__AVX__)) ]
  {
    <immintrin.h> doesn't include <avxintrin.h>
    and we have 2 ways to fix it:
      1) we can define required __AVX__ before <immintrin.h>
      or
      2) we can include <avxintrin.h> after <immintrin.h>
  }
}

If we include <avxintrin.h> manually for GCC/CLANG, then
<immintrin.h> must be included before <avxintrin.h>.
*/

/*
#if defined(__clang__) && defined(_MSC_VER)
#define __AVX__
#define __AVX2__
#define __VAES__
#endif
*/

#include <immintrin.h>
#if defined(__clang__) && defined(_MSC_VER)
  #if !defined(__AVX__)
    #include <avxintrin.h>
  #endif
  #if !defined(__AVX2__)
    #include <avx2intrin.h>
  #endif
  #if !defined(__VAES__)
    #include <vaesintrin.h>
  #endif
#endif  // __clang__ && _MSC_VER

#ifndef ATTRIB_VAES
  #define ATTRIB_VAES
#endif

#define VAES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_VAES \
AES_FUNC_START (name)

VAES_FUNC_START2 (AesCbc_Decode_HW_256)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i iv = *p;
  const __m128i *dataEnd;
  const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
  p += 2;

  WIDE_LOOP_START_AVX(;)
  {
    const __m256i *w = keys + numRounds - 2;

    WOP (AVX_DECLARE_VAR)
    WOP (AVX_LOAD_data)
    AVX_WOP_KEY (AVX_AES_XOR, 1)

    do
    {
      AVX_WOP_KEY (AVX_AES_DEC, 0)
      w--;
    }
    while (w != keys);
    AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)

    AVX_XOR (m0, _mm256_setr_m128i(iv, LOAD_data_ii(0)))
    WOP_M1 (AVX_XOR_data_M1)
    LOAD_data (iv, NUM_WAYS * 2 - 1)
    WOP (AVX_STORE_data)
  }
  WIDE_LOOP_END_AVX(;)

  SINGLE_LOOP
  {
    const __m128i *w = p - 2 + (size_t)*(const UInt32 *)(p + 1 - 2) * 2;
    __m128i m = _mm_xor_si128 (w[2], LOAD_data_ii(0));
    do
    {
      MM_OP_m (_mm_aesdec_si128, w[1])
      MM_OP_m (_mm_aesdec_si128, w[0])
      w -= 2;
    }
    while (w != p);
    MM_OP_m (_mm_aesdec_si128,     w[1])
    MM_OP_m (_mm_aesdeclast_si128, w[0])

    MM_XOR (m, iv)
    LOAD_data(iv, 0)
    STORE_data(m, 0)
  }

  p[-2] = iv;
}


/*
SSE2: _mm_cvtsi32_si128 : movd
AVX:  _mm256_setr_m128i            : vinsertf128
AVX2: _mm256_add_epi64             : vpaddq ymm, ymm, ymm
      _mm256_extracti128_si256     : vextracti128
      _mm256_broadcastsi128_si256  : vbroadcasti128
*/

#define AVX_CTR_LOOP_START  \
    ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \
    two = _mm256_setr_m128i(one, one); \
    two = _mm256_add_epi64(two, two); \

// two = _mm256_setr_epi64x(2, 0, 2, 0);

#define AVX_CTR_LOOP_ENC  \
    ctr = _mm256_extracti128_si256 (ctr2, 1); \

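/*
  In the 256-bit CTR loop, ctr2 holds two consecutive 128-bit counter blocks
  per YMM register, and (two) adds 2 to the low 64-bit lane of each half, so
  the counter still advances by 1 per processed block. AVX_CTR_START also
  folds the whitening XOR with keys[0] into the counter setup.
*/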
VAES_FUNC_START2 (AesCtr_Code_HW_256)
{
  __m128i *p = (__m128i *)(void *)ivAes;
  __m128i *data = (__m128i *)(void *)data8;
  __m128i ctr = *p;
  const UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
  const __m128i *dataEnd;
  const __m128i one = _mm_cvtsi32_si128(1);
  __m256i ctr2, two;
  p += 2;

  WIDE_LOOP_START_AVX (AVX_CTR_LOOP_START)
  {
    const __m256i *w = keys;
    UInt32 r = numRounds - 2;
    WOP (AVX_DECLARE_VAR)
    AVX_WOP_KEY (AVX_CTR_START, 0)

    w += 1;
    do
    {
      AVX_WOP_KEY (AVX_AES_ENC, 0)
      w += 1;
    }
    while (--r);
    AVX_WOP_KEY (AVX_AES_ENC_LAST, 0)

    WOP (AVX_CTR_END)
  }
  WIDE_LOOP_END_AVX (AVX_CTR_LOOP_ENC)

  SINGLE_LOOP
  {
    UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
    const __m128i *w = p;
    __m128i m;
    MM_OP (_mm_add_epi64, ctr, one)
    m = _mm_xor_si128 (ctr, p[0]);
    w += 1;
    do
    {
      MM_OP_m (_mm_aesenc_si128, w[0])
      MM_OP_m (_mm_aesenc_si128, w[1])
      w += 2;
    }
    while (--numRounds2);
    MM_OP_m (_mm_aesenc_si128,     w[0])
    MM_OP_m (_mm_aesenclast_si128, w[1])
    CTR_END (m, 0)
  }

  p[-2] = ctr;
}

#endif // USE_INTEL_VAES

#else // USE_INTEL_AES

/* no USE_INTEL_AES */

#if defined(Z7_USE_AES_HW_STUB)
// We can compile this file with another C compiler,
// or we can compile an asm version,
// so that real code is generated instead of these stub functions.
// #if defined(_MSC_VER)
#pragma message("AES  HW_SW stub was used")
// #endif

#if !defined(USE_INTEL_VAES) && defined(Z7_USE_VAES_HW_STUB)
#define AES_TYPE_keys UInt32
#define AES_TYPE_data Byte
#endif

#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \

#define AES_COMPAT_STUB(name) \
    AES_FUNC_START(name); \
    AES_FUNC_START(name ## _HW) \
    { name(p, data, numBlocks); }

AES_COMPAT_STUB (AesCbc_Encode)
AES_COMPAT_STUB (AesCbc_Decode)
AES_COMPAT_STUB (AesCtr_Code)
#endif // Z7_USE_AES_HW_STUB

#endif // USE_INTEL_AES


#ifndef USE_INTEL_VAES
#if defined(Z7_USE_VAES_HW_STUB)
// #if defined(_MSC_VER)
#pragma message("VAES HW_SW stub was used")
// #endif

#define VAES_COMPAT_STUB(name) \
    void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
    void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \
    { name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); }

VAES_COMPAT_STUB (AesCbc_Decode_HW)
VAES_COMPAT_STUB (AesCtr_Code_HW)
#endif
#endif // ! USE_INTEL_VAES




#elif defined(MY_CPU_ARM_OR_ARM64) && defined(MY_CPU_LE)

  #if   defined(__ARM_FEATURE_AES) \
     || defined(__ARM_FEATURE_CRYPTO)
    #define USE_HW_AES
  #else
    #if  defined(MY_CPU_ARM64) \
      || defined(__ARM_ARCH) && (__ARM_ARCH >= 4) \
      || defined(Z7_MSC_VER_ORIGINAL)
    #if  defined(__ARM_FP) && \
          (   defined(Z7_CLANG_VERSION) && (Z7_CLANG_VERSION >= 30800) \
           || defined(__GNUC__) && (__GNUC__ >= 6) \
          ) \
      || defined(Z7_MSC_VER_ORIGINAL) && (_MSC_VER >= 1910)
    #if  defined(MY_CPU_ARM64) \
      || !defined(Z7_CLANG_VERSION) \
      || defined(__ARM_NEON) && \
          (Z7_CLANG_VERSION < 170000 || \
           Z7_CLANG_VERSION > 170001)
      #define USE_HW_AES
    #endif
    #endif
    #endif
  #endif

#ifdef USE_HW_AES

// #pragma message("=== AES HW === ")
// __ARM_FEATURE_CRYPTO macro is deprecated in favor of the finer grained feature macro __ARM_FEATURE_AES

#if defined(__clang__) || defined(__GNUC__)
#if !defined(__ARM_FEATURE_AES) && \
    !defined(__ARM_FEATURE_CRYPTO)
  #ifdef MY_CPU_ARM64
#if defined(__clang__)
    #define ATTRIB_AES __attribute__((__target__("crypto")))
#else
    #define ATTRIB_AES __attribute__((__target__("+crypto")))
#endif
  #else
#if defined(__clang__)
    #define ATTRIB_AES __attribute__((__target__("armv8-a,aes")))
#else
    #define ATTRIB_AES __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
#endif
  #endif
#endif
#else
  // _MSC_VER
  // for arm32
  #define _ARM_USE_NEW_NEON_INTRINSICS
#endif

#ifndef ATTRIB_AES
  #define ATTRIB_AES
#endif

#if defined(Z7_MSC_VER_ORIGINAL) && defined(MY_CPU_ARM64)
#include <arm64_neon.h>
#else
/*
  clang-17.0.1: error : Cannot select: intrinsic %llvm.arm.neon.aese
  clang
   3.8.1 : __ARM_NEON             :                    defined(__ARM_FEATURE_CRYPTO)
   7.0.1 : __ARM_NEON             : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
  11.?.0 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_CRYPTO)
  13.0.1 : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8 && defined(__ARM_FEATURE_AES)
  16     : __ARM_NEON && __ARM_FP : __ARM_ARCH >= 8
*/
#if defined(__clang__) && __clang_major__ < 16
#if !defined(__ARM_FEATURE_AES) && \
    !defined(__ARM_FEATURE_CRYPTO)
//     #pragma message("=== we set __ARM_FEATURE_CRYPTO 1 === ")
    Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
    #define Z7_ARM_FEATURE_CRYPTO_WAS_SET 1
// #if defined(__clang__) && __clang_major__ < 13
    #define __ARM_FEATURE_CRYPTO 1
// #else
    #define __ARM_FEATURE_AES 1
// #endif
    Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif
#endif // clang

#if defined(__clang__)

#if defined(__ARM_ARCH) && __ARM_ARCH < 8
    Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
//    #pragma message("#define __ARM_ARCH 8")
    #undef  __ARM_ARCH
    #define __ARM_ARCH 8
    Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
#endif

#endif // clang

#include <arm_neon.h>

#if defined(Z7_ARM_FEATURE_CRYPTO_WAS_SET) && \
    defined(__ARM_FEATURE_CRYPTO) && \
    defined(__ARM_FEATURE_AES)
Z7_DIAGNOSTIC_IGNORE_BEGIN_RESERVED_MACRO_IDENTIFIER
    #undef __ARM_FEATURE_CRYPTO
    #undef __ARM_FEATURE_AES
    #undef Z7_ARM_FEATURE_CRYPTO_WAS_SET
Z7_DIAGNOSTIC_IGNORE_END_RESERVED_MACRO_IDENTIFIER
//    #pragma message("=== we undefine __ARM_FEATURE_CRYPTO === ")
#endif

#endif // Z7_MSC_VER_ORIGINAL

typedef uint8x16_t v128;

#define AES_FUNC_START(name) \
    void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
    // void Z7_FASTCALL name(v128 *p, v128 *data, size_t numBlocks)

#define AES_FUNC_START2(name) \
AES_FUNC_START (name); \
ATTRIB_AES \
AES_FUNC_START (name)

#define MM_OP(op, dest, src)  dest = op(dest, src);
#define MM_OP_m(op, src)      MM_OP(op, m, src)
#define MM_OP1_m(op)          m = op(m);

#define MM_XOR( dest, src)    MM_OP(veorq_u8, dest, src)
#define MM_XOR_m( src)        MM_XOR(m, src)

#define AES_E_m(k)     MM_OP_m (vaeseq_u8, k)
#define AES_E_MC_m(k)  AES_E_m (k)  MM_OP1_m(vaesmcq_u8)
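/*
  Note: Arm AESE performs AddRoundKey + SubBytes + ShiftRows (the round key is
  XORed first), and AESMC performs MixColumns, so the round structure below
  differs from the x86 code, where AESENC applies the round key last.
*/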


AES_FUNC_START2 (AesCbc_Encode_HW)
{
  if (numBlocks == 0)
    return;
  {
  v128 * const p = (v128 *)(void *)ivAes;
  v128 *data = (v128 *)(void *)data8;
  v128 m = *p;
  const UInt32 numRounds2 = *(const UInt32 *)(p + 1);
  const v128 *w = p + (size_t)numRounds2 * 2;
  const v128 k0 = p[2];
  const v128 k1 = p[3];
  const v128 k2 = p[4];
  const v128 k3 = p[5];
  const v128 k4 = p[6];
  const v128 k5 = p[7];
  const v128 k6 = p[8];
  const v128 k7 = p[9];
  const v128 k8 = p[10];
  const v128 k9 = p[11];
  const v128 k_z4 = w[-2];
  const v128 k_z3 = w[-1];
  const v128 k_z2 = w[0];
  const v128 k_z1 = w[1];
  const v128 k_z0 = w[2];
  // we don't use the veorq_u8(*data, k_z0) optimization that could save one cycle,
  // because gcc/clang don't handle that optimization well.
  do
  {
    MM_XOR_m (*data)
    AES_E_MC_m (k0)
    AES_E_MC_m (k1)
    AES_E_MC_m (k2)
    AES_E_MC_m (k3)
    AES_E_MC_m (k4)
    AES_E_MC_m (k5)
    if (numRounds2 >= 6)
    {
      AES_E_MC_m (k6)
      AES_E_MC_m (k7)
      if (numRounds2 != 6)
      {
        AES_E_MC_m (k8)
        AES_E_MC_m (k9)
      }
    }
    AES_E_MC_m (k_z4)
    AES_E_MC_m (k_z3)
    AES_E_MC_m (k_z2)
    AES_E_m    (k_z1)
    MM_XOR_m   (k_z0)
    *data++ = m;
  }
  while (--numBlocks);
  *p = m;
  }
}


#define WOP_1(op)
#define WOP_2(op)   WOP_1 (op)  op (m1, 1)
#define WOP_3(op)   WOP_2 (op)  op (m2, 2)
#define WOP_4(op)   WOP_3 (op)  op (m3, 3)
#define WOP_5(op)   WOP_4 (op)  op (m4, 4)
#define WOP_6(op)   WOP_5 (op)  op (m5, 5)
#define WOP_7(op)   WOP_6 (op)  op (m6, 6)
#define WOP_8(op)   WOP_7 (op)  op (m7, 7)

  #define NUM_WAYS      8
  #define WOP_M1    WOP_8

#define WOP(op)  op (m0, 0)   WOP_M1(op)

#define DECLARE_VAR(reg, ii)  v128 reg;
#define LOAD_data(  reg, ii)  reg = data[ii];
#define STORE_data( reg, ii)  data[ii] = reg;
#if (NUM_WAYS > 1)
#define XOR_data_M1(reg, ii)  MM_XOR (reg, data[ii- 1])
#endif

#define MM_OP_key(op, reg)  MM_OP (op, reg, key)

#define AES_D_m(k)      MM_OP_m (vaesdq_u8, k)
#define AES_D_IMC_m(k)  AES_D_m (k)  MM_OP1_m (vaesimcq_u8)

#define AES_XOR(   reg, ii)  MM_OP_key (veorq_u8,  reg)
#define AES_D(     reg, ii)  MM_OP_key (vaesdq_u8, reg)
#define AES_E(     reg, ii)  MM_OP_key (vaeseq_u8, reg)

#define AES_D_IMC( reg, ii)  AES_D (reg, ii)  reg = vaesimcq_u8(reg);
#define AES_E_MC(  reg, ii)  AES_E (reg, ii)  reg = vaesmcq_u8(reg);

#define CTR_START(reg, ii)  MM_OP (vaddq_u64, ctr, one)  reg = vreinterpretq_u8_u64(ctr);
#define CTR_END(  reg, ii)  MM_XOR (data[ii], reg)

#define WOP_KEY(op, n) { \
    const v128 key = w[n]; \
    WOP(op) }

#define WIDE_LOOP_START  \
    dataEnd = data + numBlocks;  \
    if (numBlocks >= NUM_WAYS)  \
    { dataEnd -= NUM_WAYS; do {  \

#define WIDE_LOOP_END  \
    data += NUM_WAYS;  \
    } while (data <= dataEnd);  \
    dataEnd += NUM_WAYS; }  \

#define SINGLE_LOOP  \
    for (; data < dataEnd; data++)


AES_FUNC_START2 (AesCbc_Decode_HW)
{
  v128 *p = (v128 *)(void *)ivAes;
  v128 *data = (v128 *)(void *)data8;
  v128 iv = *p;
  const v128 * const wStart = p + (size_t)*(const UInt32 *)(p + 1) * 2;
  const v128 *dataEnd;
  p += 2;

  WIDE_LOOP_START
  {
    const v128 *w = wStart;
    WOP (DECLARE_VAR)
    WOP (LOAD_data)
    WOP_KEY (AES_D_IMC, 2)
    do
    {
      WOP_KEY (AES_D_IMC, 1)
      WOP_KEY (AES_D_IMC, 0)
      w -= 2;
    }
    while (w != p);
    WOP_KEY (AES_D,   1)
    WOP_KEY (AES_XOR, 0)
    MM_XOR (m0, iv)
    WOP_M1 (XOR_data_M1)
    LOAD_data(iv, NUM_WAYS - 1)
    WOP (STORE_data)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = wStart;
    v128 m;  LOAD_data(m, 0)
    AES_D_IMC_m (w[2])
    do
    {
      AES_D_IMC_m (w[1])
      AES_D_IMC_m (w[0])
      w -= 2;
    }
    while (w != p);
    AES_D_m  (w[1])
    MM_XOR_m (w[0])
    MM_XOR_m (iv)
    LOAD_data(iv, 0)
    STORE_data(m, 0)
  }

  p[-2] = iv;
}


AES_FUNC_START2 (AesCtr_Code_HW)
{
  v128 *p = (v128 *)(void *)ivAes;
  v128 *data = (v128 *)(void *)data8;
  uint64x2_t ctr = vreinterpretq_u64_u8(*p);
  const v128 * const wEnd = p + (size_t)*(const UInt32 *)(p + 1) * 2;
  const v128 *dataEnd;
// the bug in clang:
// __builtin_neon_vsetq_lane_i64(__s0, (int8x16_t)__s1, __p2);
#if defined(__clang__) && (__clang_major__ <= 9)
#pragma GCC diagnostic ignored "-Wvector-conversion"
#endif
  const uint64x2_t one = vsetq_lane_u64(1, vdupq_n_u64(0), 0);
  p += 2;

  WIDE_LOOP_START
  {
    const v128 *w = p;
    WOP (DECLARE_VAR)
    WOP (CTR_START)
    do
    {
      WOP_KEY (AES_E_MC, 0)
      WOP_KEY (AES_E_MC, 1)
      w += 2;
    }
    while (w != wEnd);
    WOP_KEY (AES_E_MC, 0)
    WOP_KEY (AES_E,    1)
    WOP_KEY (AES_XOR,  2)
    WOP (CTR_END)
  }
  WIDE_LOOP_END

  SINGLE_LOOP
  {
    const v128 *w = p;
    v128 m;
    CTR_START (m, 0)
    do
    {
      AES_E_MC_m (w[0])
      AES_E_MC_m (w[1])
      w += 2;
    }
    while (w != wEnd);
    AES_E_MC_m (w[0])
    AES_E_m    (w[1])
    MM_XOR_m   (w[2])
    CTR_END (m, 0)
  }

  p[-2] = vreinterpretq_u8_u64(ctr);
}

#endif // USE_HW_AES

#endif // MY_CPU_ARM_OR_ARM64

#undef NUM_WAYS
#undef WOP_M1
#undef WOP
#undef DECLARE_VAR
#undef LOAD_data
#undef STORE_data
#undef USE_INTEL_AES
#undef USE_HW_AES