xref: /aosp_15_r20/external/mesa3d/src/util/blake3/blake3_avx512.c (revision 6104692788411f58d303aa86923a9ff6ecaded22)
1*61046927SAndroid Build Coastguard Worker #include "blake3_impl.h"
2*61046927SAndroid Build Coastguard Worker 
3*61046927SAndroid Build Coastguard Worker #include <immintrin.h>
4*61046927SAndroid Build Coastguard Worker 
// Integer-typed wrapper around _mm_shuffle_ps: reinterprets the two __m128i
// operands as float vectors, shuffles them with the immediate `c`, and
// reinterprets the result back to __m128i. The casts only change the static
// type; no lane values are converted.
#define _mm_shuffle_ps2(a, b, c)                                               \
  (_mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b),   \
                                   (c))))
8*61046927SAndroid Build Coastguard Worker 
loadu_128(const uint8_t src[16])9*61046927SAndroid Build Coastguard Worker INLINE __m128i loadu_128(const uint8_t src[16]) {
10*61046927SAndroid Build Coastguard Worker   return _mm_loadu_si128((const __m128i *)src);
11*61046927SAndroid Build Coastguard Worker }
12*61046927SAndroid Build Coastguard Worker 
loadu_256(const uint8_t src[32])13*61046927SAndroid Build Coastguard Worker INLINE __m256i loadu_256(const uint8_t src[32]) {
14*61046927SAndroid Build Coastguard Worker   return _mm256_loadu_si256((const __m256i *)src);
15*61046927SAndroid Build Coastguard Worker }
16*61046927SAndroid Build Coastguard Worker 
loadu_512(const uint8_t src[64])17*61046927SAndroid Build Coastguard Worker INLINE __m512i loadu_512(const uint8_t src[64]) {
18*61046927SAndroid Build Coastguard Worker   return _mm512_loadu_si512((const __m512i *)src);
19*61046927SAndroid Build Coastguard Worker }
20*61046927SAndroid Build Coastguard Worker 
storeu_128(__m128i src,uint8_t dest[16])21*61046927SAndroid Build Coastguard Worker INLINE void storeu_128(__m128i src, uint8_t dest[16]) {
22*61046927SAndroid Build Coastguard Worker   _mm_storeu_si128((__m128i *)dest, src);
23*61046927SAndroid Build Coastguard Worker }
24*61046927SAndroid Build Coastguard Worker 
storeu_256(__m256i src,uint8_t dest[16])25*61046927SAndroid Build Coastguard Worker INLINE void storeu_256(__m256i src, uint8_t dest[16]) {
26*61046927SAndroid Build Coastguard Worker   _mm256_storeu_si256((__m256i *)dest, src);
27*61046927SAndroid Build Coastguard Worker }
28*61046927SAndroid Build Coastguard Worker 
add_128(__m128i a,__m128i b)29*61046927SAndroid Build Coastguard Worker INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
30*61046927SAndroid Build Coastguard Worker 
add_256(__m256i a,__m256i b)31*61046927SAndroid Build Coastguard Worker INLINE __m256i add_256(__m256i a, __m256i b) { return _mm256_add_epi32(a, b); }
32*61046927SAndroid Build Coastguard Worker 
add_512(__m512i a,__m512i b)33*61046927SAndroid Build Coastguard Worker INLINE __m512i add_512(__m512i a, __m512i b) { return _mm512_add_epi32(a, b); }
34*61046927SAndroid Build Coastguard Worker 
xor_128(__m128i a,__m128i b)35*61046927SAndroid Build Coastguard Worker INLINE __m128i xor_128(__m128i a, __m128i b) { return _mm_xor_si128(a, b); }
36*61046927SAndroid Build Coastguard Worker 
xor_256(__m256i a,__m256i b)37*61046927SAndroid Build Coastguard Worker INLINE __m256i xor_256(__m256i a, __m256i b) { return _mm256_xor_si256(a, b); }
38*61046927SAndroid Build Coastguard Worker 
xor_512(__m512i a,__m512i b)39*61046927SAndroid Build Coastguard Worker INLINE __m512i xor_512(__m512i a, __m512i b) { return _mm512_xor_si512(a, b); }
40*61046927SAndroid Build Coastguard Worker 
set1_128(uint32_t x)41*61046927SAndroid Build Coastguard Worker INLINE __m128i set1_128(uint32_t x) { return _mm_set1_epi32((int32_t)x); }
42*61046927SAndroid Build Coastguard Worker 
set1_256(uint32_t x)43*61046927SAndroid Build Coastguard Worker INLINE __m256i set1_256(uint32_t x) { return _mm256_set1_epi32((int32_t)x); }
44*61046927SAndroid Build Coastguard Worker 
set1_512(uint32_t x)45*61046927SAndroid Build Coastguard Worker INLINE __m512i set1_512(uint32_t x) { return _mm512_set1_epi32((int32_t)x); }
46*61046927SAndroid Build Coastguard Worker 
set4(uint32_t a,uint32_t b,uint32_t c,uint32_t d)47*61046927SAndroid Build Coastguard Worker INLINE __m128i set4(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
48*61046927SAndroid Build Coastguard Worker   return _mm_setr_epi32((int32_t)a, (int32_t)b, (int32_t)c, (int32_t)d);
49*61046927SAndroid Build Coastguard Worker }
50*61046927SAndroid Build Coastguard Worker 
rot16_128(__m128i x)51*61046927SAndroid Build Coastguard Worker INLINE __m128i rot16_128(__m128i x) { return _mm_ror_epi32(x, 16); }
52*61046927SAndroid Build Coastguard Worker 
rot16_256(__m256i x)53*61046927SAndroid Build Coastguard Worker INLINE __m256i rot16_256(__m256i x) { return _mm256_ror_epi32(x, 16); }
54*61046927SAndroid Build Coastguard Worker 
rot16_512(__m512i x)55*61046927SAndroid Build Coastguard Worker INLINE __m512i rot16_512(__m512i x) { return _mm512_ror_epi32(x, 16); }
56*61046927SAndroid Build Coastguard Worker 
rot12_128(__m128i x)57*61046927SAndroid Build Coastguard Worker INLINE __m128i rot12_128(__m128i x) { return _mm_ror_epi32(x, 12); }
58*61046927SAndroid Build Coastguard Worker 
rot12_256(__m256i x)59*61046927SAndroid Build Coastguard Worker INLINE __m256i rot12_256(__m256i x) { return _mm256_ror_epi32(x, 12); }
60*61046927SAndroid Build Coastguard Worker 
rot12_512(__m512i x)61*61046927SAndroid Build Coastguard Worker INLINE __m512i rot12_512(__m512i x) { return _mm512_ror_epi32(x, 12); }
62*61046927SAndroid Build Coastguard Worker 
rot8_128(__m128i x)63*61046927SAndroid Build Coastguard Worker INLINE __m128i rot8_128(__m128i x) { return _mm_ror_epi32(x, 8); }
64*61046927SAndroid Build Coastguard Worker 
rot8_256(__m256i x)65*61046927SAndroid Build Coastguard Worker INLINE __m256i rot8_256(__m256i x) { return _mm256_ror_epi32(x, 8); }
66*61046927SAndroid Build Coastguard Worker 
rot8_512(__m512i x)67*61046927SAndroid Build Coastguard Worker INLINE __m512i rot8_512(__m512i x) { return _mm512_ror_epi32(x, 8); }
68*61046927SAndroid Build Coastguard Worker 
rot7_128(__m128i x)69*61046927SAndroid Build Coastguard Worker INLINE __m128i rot7_128(__m128i x) { return _mm_ror_epi32(x, 7); }
70*61046927SAndroid Build Coastguard Worker 
rot7_256(__m256i x)71*61046927SAndroid Build Coastguard Worker INLINE __m256i rot7_256(__m256i x) { return _mm256_ror_epi32(x, 7); }
72*61046927SAndroid Build Coastguard Worker 
rot7_512(__m512i x)73*61046927SAndroid Build Coastguard Worker INLINE __m512i rot7_512(__m512i x) { return _mm512_ror_epi32(x, 7); }
74*61046927SAndroid Build Coastguard Worker 
75*61046927SAndroid Build Coastguard Worker /*
76*61046927SAndroid Build Coastguard Worker  * ----------------------------------------------------------------------------
77*61046927SAndroid Build Coastguard Worker  * compress_avx512
78*61046927SAndroid Build Coastguard Worker  * ----------------------------------------------------------------------------
79*61046927SAndroid Build Coastguard Worker  */
80*61046927SAndroid Build Coastguard Worker 
g1(__m128i * row0,__m128i * row1,__m128i * row2,__m128i * row3,__m128i m)81*61046927SAndroid Build Coastguard Worker INLINE void g1(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
82*61046927SAndroid Build Coastguard Worker                __m128i m) {
83*61046927SAndroid Build Coastguard Worker   *row0 = add_128(add_128(*row0, m), *row1);
84*61046927SAndroid Build Coastguard Worker   *row3 = xor_128(*row3, *row0);
85*61046927SAndroid Build Coastguard Worker   *row3 = rot16_128(*row3);
86*61046927SAndroid Build Coastguard Worker   *row2 = add_128(*row2, *row3);
87*61046927SAndroid Build Coastguard Worker   *row1 = xor_128(*row1, *row2);
88*61046927SAndroid Build Coastguard Worker   *row1 = rot12_128(*row1);
89*61046927SAndroid Build Coastguard Worker }
90*61046927SAndroid Build Coastguard Worker 
g2(__m128i * row0,__m128i * row1,__m128i * row2,__m128i * row3,__m128i m)91*61046927SAndroid Build Coastguard Worker INLINE void g2(__m128i *row0, __m128i *row1, __m128i *row2, __m128i *row3,
92*61046927SAndroid Build Coastguard Worker                __m128i m) {
93*61046927SAndroid Build Coastguard Worker   *row0 = add_128(add_128(*row0, m), *row1);
94*61046927SAndroid Build Coastguard Worker   *row3 = xor_128(*row3, *row0);
95*61046927SAndroid Build Coastguard Worker   *row3 = rot8_128(*row3);
96*61046927SAndroid Build Coastguard Worker   *row2 = add_128(*row2, *row3);
97*61046927SAndroid Build Coastguard Worker   *row1 = xor_128(*row1, *row2);
98*61046927SAndroid Build Coastguard Worker   *row1 = rot7_128(*row1);
99*61046927SAndroid Build Coastguard Worker }
100*61046927SAndroid Build Coastguard Worker 
101*61046927SAndroid Build Coastguard Worker // Note the optimization here of leaving row1 as the unrotated row, rather than
102*61046927SAndroid Build Coastguard Worker // row0. All the message loads below are adjusted to compensate for this. See
103*61046927SAndroid Build Coastguard Worker // discussion at https://github.com/sneves/blake2-avx2/pull/4
diagonalize(__m128i * row0,__m128i * row2,__m128i * row3)104*61046927SAndroid Build Coastguard Worker INLINE void diagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
105*61046927SAndroid Build Coastguard Worker   *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(2, 1, 0, 3));
106*61046927SAndroid Build Coastguard Worker   *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
107*61046927SAndroid Build Coastguard Worker   *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(0, 3, 2, 1));
108*61046927SAndroid Build Coastguard Worker }
109*61046927SAndroid Build Coastguard Worker 
undiagonalize(__m128i * row0,__m128i * row2,__m128i * row3)110*61046927SAndroid Build Coastguard Worker INLINE void undiagonalize(__m128i *row0, __m128i *row2, __m128i *row3) {
111*61046927SAndroid Build Coastguard Worker   *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE(0, 3, 2, 1));
112*61046927SAndroid Build Coastguard Worker   *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE(1, 0, 3, 2));
113*61046927SAndroid Build Coastguard Worker   *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE(2, 1, 0, 3));
114*61046927SAndroid Build Coastguard Worker }
115*61046927SAndroid Build Coastguard Worker 
compress_pre(__m128i rows[4],const uint32_t cv[8],const uint8_t block[BLAKE3_BLOCK_LEN],uint8_t block_len,uint64_t counter,uint8_t flags)116*61046927SAndroid Build Coastguard Worker INLINE void compress_pre(__m128i rows[4], const uint32_t cv[8],
117*61046927SAndroid Build Coastguard Worker                          const uint8_t block[BLAKE3_BLOCK_LEN],
118*61046927SAndroid Build Coastguard Worker                          uint8_t block_len, uint64_t counter, uint8_t flags) {
119*61046927SAndroid Build Coastguard Worker   rows[0] = loadu_128((uint8_t *)&cv[0]);
120*61046927SAndroid Build Coastguard Worker   rows[1] = loadu_128((uint8_t *)&cv[4]);
121*61046927SAndroid Build Coastguard Worker   rows[2] = set4(IV[0], IV[1], IV[2], IV[3]);
122*61046927SAndroid Build Coastguard Worker   rows[3] = set4(counter_low(counter), counter_high(counter),
123*61046927SAndroid Build Coastguard Worker                  (uint32_t)block_len, (uint32_t)flags);
124*61046927SAndroid Build Coastguard Worker 
125*61046927SAndroid Build Coastguard Worker   __m128i m0 = loadu_128(&block[sizeof(__m128i) * 0]);
126*61046927SAndroid Build Coastguard Worker   __m128i m1 = loadu_128(&block[sizeof(__m128i) * 1]);
127*61046927SAndroid Build Coastguard Worker   __m128i m2 = loadu_128(&block[sizeof(__m128i) * 2]);
128*61046927SAndroid Build Coastguard Worker   __m128i m3 = loadu_128(&block[sizeof(__m128i) * 3]);
129*61046927SAndroid Build Coastguard Worker 
130*61046927SAndroid Build Coastguard Worker   __m128i t0, t1, t2, t3, tt;
131*61046927SAndroid Build Coastguard Worker 
132*61046927SAndroid Build Coastguard Worker   // Round 1. The first round permutes the message words from the original
133*61046927SAndroid Build Coastguard Worker   // input order, into the groups that get mixed in parallel.
134*61046927SAndroid Build Coastguard Worker   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(2, 0, 2, 0)); //  6  4  2  0
135*61046927SAndroid Build Coastguard Worker   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
136*61046927SAndroid Build Coastguard Worker   t1 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 3, 1)); //  7  5  3  1
137*61046927SAndroid Build Coastguard Worker   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
138*61046927SAndroid Build Coastguard Worker   diagonalize(&rows[0], &rows[2], &rows[3]);
139*61046927SAndroid Build Coastguard Worker   t2 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(2, 0, 2, 0)); // 14 12 10  8
140*61046927SAndroid Build Coastguard Worker   t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE(2, 1, 0, 3));   // 12 10  8 14
141*61046927SAndroid Build Coastguard Worker   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
142*61046927SAndroid Build Coastguard Worker   t3 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 1, 3, 1)); // 15 13 11  9
143*61046927SAndroid Build Coastguard Worker   t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE(2, 1, 0, 3));   // 13 11  9 15
144*61046927SAndroid Build Coastguard Worker   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
145*61046927SAndroid Build Coastguard Worker   undiagonalize(&rows[0], &rows[2], &rows[3]);
146*61046927SAndroid Build Coastguard Worker   m0 = t0;
147*61046927SAndroid Build Coastguard Worker   m1 = t1;
148*61046927SAndroid Build Coastguard Worker   m2 = t2;
149*61046927SAndroid Build Coastguard Worker   m3 = t3;
150*61046927SAndroid Build Coastguard Worker 
151*61046927SAndroid Build Coastguard Worker   // Round 2. This round and all following rounds apply a fixed permutation
152*61046927SAndroid Build Coastguard Worker   // to the message words from the round before.
153*61046927SAndroid Build Coastguard Worker   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
154*61046927SAndroid Build Coastguard Worker   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
155*61046927SAndroid Build Coastguard Worker   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
156*61046927SAndroid Build Coastguard Worker   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
157*61046927SAndroid Build Coastguard Worker   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
158*61046927SAndroid Build Coastguard Worker   t1 = _mm_blend_epi16(tt, t1, 0xCC);
159*61046927SAndroid Build Coastguard Worker   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
160*61046927SAndroid Build Coastguard Worker   diagonalize(&rows[0], &rows[2], &rows[3]);
161*61046927SAndroid Build Coastguard Worker   t2 = _mm_unpacklo_epi64(m3, m1);
162*61046927SAndroid Build Coastguard Worker   tt = _mm_blend_epi16(t2, m2, 0xC0);
163*61046927SAndroid Build Coastguard Worker   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
164*61046927SAndroid Build Coastguard Worker   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
165*61046927SAndroid Build Coastguard Worker   t3 = _mm_unpackhi_epi32(m1, m3);
166*61046927SAndroid Build Coastguard Worker   tt = _mm_unpacklo_epi32(m2, t3);
167*61046927SAndroid Build Coastguard Worker   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
168*61046927SAndroid Build Coastguard Worker   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
169*61046927SAndroid Build Coastguard Worker   undiagonalize(&rows[0], &rows[2], &rows[3]);
170*61046927SAndroid Build Coastguard Worker   m0 = t0;
171*61046927SAndroid Build Coastguard Worker   m1 = t1;
172*61046927SAndroid Build Coastguard Worker   m2 = t2;
173*61046927SAndroid Build Coastguard Worker   m3 = t3;
174*61046927SAndroid Build Coastguard Worker 
175*61046927SAndroid Build Coastguard Worker   // Round 3
176*61046927SAndroid Build Coastguard Worker   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
177*61046927SAndroid Build Coastguard Worker   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
178*61046927SAndroid Build Coastguard Worker   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
179*61046927SAndroid Build Coastguard Worker   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
180*61046927SAndroid Build Coastguard Worker   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
181*61046927SAndroid Build Coastguard Worker   t1 = _mm_blend_epi16(tt, t1, 0xCC);
182*61046927SAndroid Build Coastguard Worker   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
183*61046927SAndroid Build Coastguard Worker   diagonalize(&rows[0], &rows[2], &rows[3]);
184*61046927SAndroid Build Coastguard Worker   t2 = _mm_unpacklo_epi64(m3, m1);
185*61046927SAndroid Build Coastguard Worker   tt = _mm_blend_epi16(t2, m2, 0xC0);
186*61046927SAndroid Build Coastguard Worker   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
187*61046927SAndroid Build Coastguard Worker   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
188*61046927SAndroid Build Coastguard Worker   t3 = _mm_unpackhi_epi32(m1, m3);
189*61046927SAndroid Build Coastguard Worker   tt = _mm_unpacklo_epi32(m2, t3);
190*61046927SAndroid Build Coastguard Worker   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
191*61046927SAndroid Build Coastguard Worker   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
192*61046927SAndroid Build Coastguard Worker   undiagonalize(&rows[0], &rows[2], &rows[3]);
193*61046927SAndroid Build Coastguard Worker   m0 = t0;
194*61046927SAndroid Build Coastguard Worker   m1 = t1;
195*61046927SAndroid Build Coastguard Worker   m2 = t2;
196*61046927SAndroid Build Coastguard Worker   m3 = t3;
197*61046927SAndroid Build Coastguard Worker 
198*61046927SAndroid Build Coastguard Worker   // Round 4
199*61046927SAndroid Build Coastguard Worker   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
200*61046927SAndroid Build Coastguard Worker   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
201*61046927SAndroid Build Coastguard Worker   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
202*61046927SAndroid Build Coastguard Worker   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
203*61046927SAndroid Build Coastguard Worker   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
204*61046927SAndroid Build Coastguard Worker   t1 = _mm_blend_epi16(tt, t1, 0xCC);
205*61046927SAndroid Build Coastguard Worker   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
206*61046927SAndroid Build Coastguard Worker   diagonalize(&rows[0], &rows[2], &rows[3]);
207*61046927SAndroid Build Coastguard Worker   t2 = _mm_unpacklo_epi64(m3, m1);
208*61046927SAndroid Build Coastguard Worker   tt = _mm_blend_epi16(t2, m2, 0xC0);
209*61046927SAndroid Build Coastguard Worker   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
210*61046927SAndroid Build Coastguard Worker   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
211*61046927SAndroid Build Coastguard Worker   t3 = _mm_unpackhi_epi32(m1, m3);
212*61046927SAndroid Build Coastguard Worker   tt = _mm_unpacklo_epi32(m2, t3);
213*61046927SAndroid Build Coastguard Worker   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
214*61046927SAndroid Build Coastguard Worker   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
215*61046927SAndroid Build Coastguard Worker   undiagonalize(&rows[0], &rows[2], &rows[3]);
216*61046927SAndroid Build Coastguard Worker   m0 = t0;
217*61046927SAndroid Build Coastguard Worker   m1 = t1;
218*61046927SAndroid Build Coastguard Worker   m2 = t2;
219*61046927SAndroid Build Coastguard Worker   m3 = t3;
220*61046927SAndroid Build Coastguard Worker 
221*61046927SAndroid Build Coastguard Worker   // Round 5
222*61046927SAndroid Build Coastguard Worker   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
223*61046927SAndroid Build Coastguard Worker   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
224*61046927SAndroid Build Coastguard Worker   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
225*61046927SAndroid Build Coastguard Worker   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
226*61046927SAndroid Build Coastguard Worker   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
227*61046927SAndroid Build Coastguard Worker   t1 = _mm_blend_epi16(tt, t1, 0xCC);
228*61046927SAndroid Build Coastguard Worker   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
229*61046927SAndroid Build Coastguard Worker   diagonalize(&rows[0], &rows[2], &rows[3]);
230*61046927SAndroid Build Coastguard Worker   t2 = _mm_unpacklo_epi64(m3, m1);
231*61046927SAndroid Build Coastguard Worker   tt = _mm_blend_epi16(t2, m2, 0xC0);
232*61046927SAndroid Build Coastguard Worker   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
233*61046927SAndroid Build Coastguard Worker   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
234*61046927SAndroid Build Coastguard Worker   t3 = _mm_unpackhi_epi32(m1, m3);
235*61046927SAndroid Build Coastguard Worker   tt = _mm_unpacklo_epi32(m2, t3);
236*61046927SAndroid Build Coastguard Worker   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
237*61046927SAndroid Build Coastguard Worker   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
238*61046927SAndroid Build Coastguard Worker   undiagonalize(&rows[0], &rows[2], &rows[3]);
239*61046927SAndroid Build Coastguard Worker   m0 = t0;
240*61046927SAndroid Build Coastguard Worker   m1 = t1;
241*61046927SAndroid Build Coastguard Worker   m2 = t2;
242*61046927SAndroid Build Coastguard Worker   m3 = t3;
243*61046927SAndroid Build Coastguard Worker 
244*61046927SAndroid Build Coastguard Worker   // Round 6
245*61046927SAndroid Build Coastguard Worker   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
246*61046927SAndroid Build Coastguard Worker   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
247*61046927SAndroid Build Coastguard Worker   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
248*61046927SAndroid Build Coastguard Worker   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
249*61046927SAndroid Build Coastguard Worker   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
250*61046927SAndroid Build Coastguard Worker   t1 = _mm_blend_epi16(tt, t1, 0xCC);
251*61046927SAndroid Build Coastguard Worker   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
252*61046927SAndroid Build Coastguard Worker   diagonalize(&rows[0], &rows[2], &rows[3]);
253*61046927SAndroid Build Coastguard Worker   t2 = _mm_unpacklo_epi64(m3, m1);
254*61046927SAndroid Build Coastguard Worker   tt = _mm_blend_epi16(t2, m2, 0xC0);
255*61046927SAndroid Build Coastguard Worker   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
256*61046927SAndroid Build Coastguard Worker   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
257*61046927SAndroid Build Coastguard Worker   t3 = _mm_unpackhi_epi32(m1, m3);
258*61046927SAndroid Build Coastguard Worker   tt = _mm_unpacklo_epi32(m2, t3);
259*61046927SAndroid Build Coastguard Worker   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
260*61046927SAndroid Build Coastguard Worker   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
261*61046927SAndroid Build Coastguard Worker   undiagonalize(&rows[0], &rows[2], &rows[3]);
262*61046927SAndroid Build Coastguard Worker   m0 = t0;
263*61046927SAndroid Build Coastguard Worker   m1 = t1;
264*61046927SAndroid Build Coastguard Worker   m2 = t2;
265*61046927SAndroid Build Coastguard Worker   m3 = t3;
266*61046927SAndroid Build Coastguard Worker 
267*61046927SAndroid Build Coastguard Worker   // Round 7
268*61046927SAndroid Build Coastguard Worker   t0 = _mm_shuffle_ps2(m0, m1, _MM_SHUFFLE(3, 1, 1, 2));
269*61046927SAndroid Build Coastguard Worker   t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE(0, 3, 2, 1));
270*61046927SAndroid Build Coastguard Worker   g1(&rows[0], &rows[1], &rows[2], &rows[3], t0);
271*61046927SAndroid Build Coastguard Worker   t1 = _mm_shuffle_ps2(m2, m3, _MM_SHUFFLE(3, 3, 2, 2));
272*61046927SAndroid Build Coastguard Worker   tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE(0, 0, 3, 3));
273*61046927SAndroid Build Coastguard Worker   t1 = _mm_blend_epi16(tt, t1, 0xCC);
274*61046927SAndroid Build Coastguard Worker   g2(&rows[0], &rows[1], &rows[2], &rows[3], t1);
275*61046927SAndroid Build Coastguard Worker   diagonalize(&rows[0], &rows[2], &rows[3]);
276*61046927SAndroid Build Coastguard Worker   t2 = _mm_unpacklo_epi64(m3, m1);
277*61046927SAndroid Build Coastguard Worker   tt = _mm_blend_epi16(t2, m2, 0xC0);
278*61046927SAndroid Build Coastguard Worker   t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(1, 3, 2, 0));
279*61046927SAndroid Build Coastguard Worker   g1(&rows[0], &rows[1], &rows[2], &rows[3], t2);
280*61046927SAndroid Build Coastguard Worker   t3 = _mm_unpackhi_epi32(m1, m3);
281*61046927SAndroid Build Coastguard Worker   tt = _mm_unpacklo_epi32(m2, t3);
282*61046927SAndroid Build Coastguard Worker   t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE(0, 1, 3, 2));
283*61046927SAndroid Build Coastguard Worker   g2(&rows[0], &rows[1], &rows[2], &rows[3], t3);
284*61046927SAndroid Build Coastguard Worker   undiagonalize(&rows[0], &rows[2], &rows[3]);
285*61046927SAndroid Build Coastguard Worker }
286*61046927SAndroid Build Coastguard Worker 
// Compresses one block and writes the full 64-byte extended output to `out`.
//
// After compress_pre has filled `rows`, the output is:
//   out[ 0..15] = rows[0] ^ rows[2]
//   out[16..31] = rows[1] ^ rows[3]
//   out[32..47] = rows[2] ^ cv[0..3]
//   out[48..63] = rows[3] ^ cv[4..7]
//
// `cv` is only read; the casts below keep its const qualifier when viewing
// the words as bytes. `out` is written with unaligned stores.
void blake3_compress_xof_avx512(const uint32_t cv[8],
                                const uint8_t block[BLAKE3_BLOCK_LEN],
                                uint8_t block_len, uint64_t counter,
                                uint8_t flags, uint8_t out[64]) {
  __m128i rows[4];
  compress_pre(rows, cv, block, block_len, counter, flags);
  storeu_128(xor_128(rows[0], rows[2]), &out[0]);
  storeu_128(xor_128(rows[1], rows[3]), &out[16]);
  storeu_128(xor_128(rows[2], loadu_128((const uint8_t *)&cv[0])), &out[32]);
  storeu_128(xor_128(rows[3], loadu_128((const uint8_t *)&cv[4])), &out[48]);
}
298*61046927SAndroid Build Coastguard Worker 
blake3_compress_in_place_avx512(uint32_t cv[8],const uint8_t block[BLAKE3_BLOCK_LEN],uint8_t block_len,uint64_t counter,uint8_t flags)299*61046927SAndroid Build Coastguard Worker void blake3_compress_in_place_avx512(uint32_t cv[8],
300*61046927SAndroid Build Coastguard Worker                                      const uint8_t block[BLAKE3_BLOCK_LEN],
301*61046927SAndroid Build Coastguard Worker                                      uint8_t block_len, uint64_t counter,
302*61046927SAndroid Build Coastguard Worker                                      uint8_t flags) {
303*61046927SAndroid Build Coastguard Worker   __m128i rows[4];
304*61046927SAndroid Build Coastguard Worker   compress_pre(rows, cv, block, block_len, counter, flags);
305*61046927SAndroid Build Coastguard Worker   storeu_128(xor_128(rows[0], rows[2]), (uint8_t *)&cv[0]);
306*61046927SAndroid Build Coastguard Worker   storeu_128(xor_128(rows[1], rows[3]), (uint8_t *)&cv[4]);
307*61046927SAndroid Build Coastguard Worker }
308*61046927SAndroid Build Coastguard Worker 
309*61046927SAndroid Build Coastguard Worker /*
310*61046927SAndroid Build Coastguard Worker  * ----------------------------------------------------------------------------
311*61046927SAndroid Build Coastguard Worker  * hash4_avx512
312*61046927SAndroid Build Coastguard Worker  * ----------------------------------------------------------------------------
313*61046927SAndroid Build Coastguard Worker  */
314*61046927SAndroid Build Coastguard Worker 
round_fn4(__m128i v[16],__m128i m[16],size_t r)315*61046927SAndroid Build Coastguard Worker INLINE void round_fn4(__m128i v[16], __m128i m[16], size_t r) {
316*61046927SAndroid Build Coastguard Worker   v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
317*61046927SAndroid Build Coastguard Worker   v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
318*61046927SAndroid Build Coastguard Worker   v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
319*61046927SAndroid Build Coastguard Worker   v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
320*61046927SAndroid Build Coastguard Worker   v[0] = add_128(v[0], v[4]);
321*61046927SAndroid Build Coastguard Worker   v[1] = add_128(v[1], v[5]);
322*61046927SAndroid Build Coastguard Worker   v[2] = add_128(v[2], v[6]);
323*61046927SAndroid Build Coastguard Worker   v[3] = add_128(v[3], v[7]);
324*61046927SAndroid Build Coastguard Worker   v[12] = xor_128(v[12], v[0]);
325*61046927SAndroid Build Coastguard Worker   v[13] = xor_128(v[13], v[1]);
326*61046927SAndroid Build Coastguard Worker   v[14] = xor_128(v[14], v[2]);
327*61046927SAndroid Build Coastguard Worker   v[15] = xor_128(v[15], v[3]);
328*61046927SAndroid Build Coastguard Worker   v[12] = rot16_128(v[12]);
329*61046927SAndroid Build Coastguard Worker   v[13] = rot16_128(v[13]);
330*61046927SAndroid Build Coastguard Worker   v[14] = rot16_128(v[14]);
331*61046927SAndroid Build Coastguard Worker   v[15] = rot16_128(v[15]);
332*61046927SAndroid Build Coastguard Worker   v[8] = add_128(v[8], v[12]);
333*61046927SAndroid Build Coastguard Worker   v[9] = add_128(v[9], v[13]);
334*61046927SAndroid Build Coastguard Worker   v[10] = add_128(v[10], v[14]);
335*61046927SAndroid Build Coastguard Worker   v[11] = add_128(v[11], v[15]);
336*61046927SAndroid Build Coastguard Worker   v[4] = xor_128(v[4], v[8]);
337*61046927SAndroid Build Coastguard Worker   v[5] = xor_128(v[5], v[9]);
338*61046927SAndroid Build Coastguard Worker   v[6] = xor_128(v[6], v[10]);
339*61046927SAndroid Build Coastguard Worker   v[7] = xor_128(v[7], v[11]);
340*61046927SAndroid Build Coastguard Worker   v[4] = rot12_128(v[4]);
341*61046927SAndroid Build Coastguard Worker   v[5] = rot12_128(v[5]);
342*61046927SAndroid Build Coastguard Worker   v[6] = rot12_128(v[6]);
343*61046927SAndroid Build Coastguard Worker   v[7] = rot12_128(v[7]);
344*61046927SAndroid Build Coastguard Worker   v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
345*61046927SAndroid Build Coastguard Worker   v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
346*61046927SAndroid Build Coastguard Worker   v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
347*61046927SAndroid Build Coastguard Worker   v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
348*61046927SAndroid Build Coastguard Worker   v[0] = add_128(v[0], v[4]);
349*61046927SAndroid Build Coastguard Worker   v[1] = add_128(v[1], v[5]);
350*61046927SAndroid Build Coastguard Worker   v[2] = add_128(v[2], v[6]);
351*61046927SAndroid Build Coastguard Worker   v[3] = add_128(v[3], v[7]);
352*61046927SAndroid Build Coastguard Worker   v[12] = xor_128(v[12], v[0]);
353*61046927SAndroid Build Coastguard Worker   v[13] = xor_128(v[13], v[1]);
354*61046927SAndroid Build Coastguard Worker   v[14] = xor_128(v[14], v[2]);
355*61046927SAndroid Build Coastguard Worker   v[15] = xor_128(v[15], v[3]);
356*61046927SAndroid Build Coastguard Worker   v[12] = rot8_128(v[12]);
357*61046927SAndroid Build Coastguard Worker   v[13] = rot8_128(v[13]);
358*61046927SAndroid Build Coastguard Worker   v[14] = rot8_128(v[14]);
359*61046927SAndroid Build Coastguard Worker   v[15] = rot8_128(v[15]);
360*61046927SAndroid Build Coastguard Worker   v[8] = add_128(v[8], v[12]);
361*61046927SAndroid Build Coastguard Worker   v[9] = add_128(v[9], v[13]);
362*61046927SAndroid Build Coastguard Worker   v[10] = add_128(v[10], v[14]);
363*61046927SAndroid Build Coastguard Worker   v[11] = add_128(v[11], v[15]);
364*61046927SAndroid Build Coastguard Worker   v[4] = xor_128(v[4], v[8]);
365*61046927SAndroid Build Coastguard Worker   v[5] = xor_128(v[5], v[9]);
366*61046927SAndroid Build Coastguard Worker   v[6] = xor_128(v[6], v[10]);
367*61046927SAndroid Build Coastguard Worker   v[7] = xor_128(v[7], v[11]);
368*61046927SAndroid Build Coastguard Worker   v[4] = rot7_128(v[4]);
369*61046927SAndroid Build Coastguard Worker   v[5] = rot7_128(v[5]);
370*61046927SAndroid Build Coastguard Worker   v[6] = rot7_128(v[6]);
371*61046927SAndroid Build Coastguard Worker   v[7] = rot7_128(v[7]);
372*61046927SAndroid Build Coastguard Worker 
373*61046927SAndroid Build Coastguard Worker   v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
374*61046927SAndroid Build Coastguard Worker   v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
375*61046927SAndroid Build Coastguard Worker   v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
376*61046927SAndroid Build Coastguard Worker   v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
377*61046927SAndroid Build Coastguard Worker   v[0] = add_128(v[0], v[5]);
378*61046927SAndroid Build Coastguard Worker   v[1] = add_128(v[1], v[6]);
379*61046927SAndroid Build Coastguard Worker   v[2] = add_128(v[2], v[7]);
380*61046927SAndroid Build Coastguard Worker   v[3] = add_128(v[3], v[4]);
381*61046927SAndroid Build Coastguard Worker   v[15] = xor_128(v[15], v[0]);
382*61046927SAndroid Build Coastguard Worker   v[12] = xor_128(v[12], v[1]);
383*61046927SAndroid Build Coastguard Worker   v[13] = xor_128(v[13], v[2]);
384*61046927SAndroid Build Coastguard Worker   v[14] = xor_128(v[14], v[3]);
385*61046927SAndroid Build Coastguard Worker   v[15] = rot16_128(v[15]);
386*61046927SAndroid Build Coastguard Worker   v[12] = rot16_128(v[12]);
387*61046927SAndroid Build Coastguard Worker   v[13] = rot16_128(v[13]);
388*61046927SAndroid Build Coastguard Worker   v[14] = rot16_128(v[14]);
389*61046927SAndroid Build Coastguard Worker   v[10] = add_128(v[10], v[15]);
390*61046927SAndroid Build Coastguard Worker   v[11] = add_128(v[11], v[12]);
391*61046927SAndroid Build Coastguard Worker   v[8] = add_128(v[8], v[13]);
392*61046927SAndroid Build Coastguard Worker   v[9] = add_128(v[9], v[14]);
393*61046927SAndroid Build Coastguard Worker   v[5] = xor_128(v[5], v[10]);
394*61046927SAndroid Build Coastguard Worker   v[6] = xor_128(v[6], v[11]);
395*61046927SAndroid Build Coastguard Worker   v[7] = xor_128(v[7], v[8]);
396*61046927SAndroid Build Coastguard Worker   v[4] = xor_128(v[4], v[9]);
397*61046927SAndroid Build Coastguard Worker   v[5] = rot12_128(v[5]);
398*61046927SAndroid Build Coastguard Worker   v[6] = rot12_128(v[6]);
399*61046927SAndroid Build Coastguard Worker   v[7] = rot12_128(v[7]);
400*61046927SAndroid Build Coastguard Worker   v[4] = rot12_128(v[4]);
401*61046927SAndroid Build Coastguard Worker   v[0] = add_128(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
402*61046927SAndroid Build Coastguard Worker   v[1] = add_128(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
403*61046927SAndroid Build Coastguard Worker   v[2] = add_128(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
404*61046927SAndroid Build Coastguard Worker   v[3] = add_128(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
405*61046927SAndroid Build Coastguard Worker   v[0] = add_128(v[0], v[5]);
406*61046927SAndroid Build Coastguard Worker   v[1] = add_128(v[1], v[6]);
407*61046927SAndroid Build Coastguard Worker   v[2] = add_128(v[2], v[7]);
408*61046927SAndroid Build Coastguard Worker   v[3] = add_128(v[3], v[4]);
409*61046927SAndroid Build Coastguard Worker   v[15] = xor_128(v[15], v[0]);
410*61046927SAndroid Build Coastguard Worker   v[12] = xor_128(v[12], v[1]);
411*61046927SAndroid Build Coastguard Worker   v[13] = xor_128(v[13], v[2]);
412*61046927SAndroid Build Coastguard Worker   v[14] = xor_128(v[14], v[3]);
413*61046927SAndroid Build Coastguard Worker   v[15] = rot8_128(v[15]);
414*61046927SAndroid Build Coastguard Worker   v[12] = rot8_128(v[12]);
415*61046927SAndroid Build Coastguard Worker   v[13] = rot8_128(v[13]);
416*61046927SAndroid Build Coastguard Worker   v[14] = rot8_128(v[14]);
417*61046927SAndroid Build Coastguard Worker   v[10] = add_128(v[10], v[15]);
418*61046927SAndroid Build Coastguard Worker   v[11] = add_128(v[11], v[12]);
419*61046927SAndroid Build Coastguard Worker   v[8] = add_128(v[8], v[13]);
420*61046927SAndroid Build Coastguard Worker   v[9] = add_128(v[9], v[14]);
421*61046927SAndroid Build Coastguard Worker   v[5] = xor_128(v[5], v[10]);
422*61046927SAndroid Build Coastguard Worker   v[6] = xor_128(v[6], v[11]);
423*61046927SAndroid Build Coastguard Worker   v[7] = xor_128(v[7], v[8]);
424*61046927SAndroid Build Coastguard Worker   v[4] = xor_128(v[4], v[9]);
425*61046927SAndroid Build Coastguard Worker   v[5] = rot7_128(v[5]);
426*61046927SAndroid Build Coastguard Worker   v[6] = rot7_128(v[6]);
427*61046927SAndroid Build Coastguard Worker   v[7] = rot7_128(v[7]);
428*61046927SAndroid Build Coastguard Worker   v[4] = rot7_128(v[4]);
429*61046927SAndroid Build Coastguard Worker }
430*61046927SAndroid Build Coastguard Worker 
transpose_vecs_128(__m128i vecs[4])431*61046927SAndroid Build Coastguard Worker INLINE void transpose_vecs_128(__m128i vecs[4]) {
432*61046927SAndroid Build Coastguard Worker   // Interleave 32-bit lanes. The low unpack is lanes 00/11 and the high is
433*61046927SAndroid Build Coastguard Worker   // 22/33. Note that this doesn't split the vector into two lanes, as the
434*61046927SAndroid Build Coastguard Worker   // AVX2 counterparts do.
435*61046927SAndroid Build Coastguard Worker   __m128i ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
436*61046927SAndroid Build Coastguard Worker   __m128i ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
437*61046927SAndroid Build Coastguard Worker   __m128i cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
438*61046927SAndroid Build Coastguard Worker   __m128i cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
439*61046927SAndroid Build Coastguard Worker 
440*61046927SAndroid Build Coastguard Worker   // Interleave 64-bit lanes.
441*61046927SAndroid Build Coastguard Worker   __m128i abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01);
442*61046927SAndroid Build Coastguard Worker   __m128i abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01);
443*61046927SAndroid Build Coastguard Worker   __m128i abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23);
444*61046927SAndroid Build Coastguard Worker   __m128i abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23);
445*61046927SAndroid Build Coastguard Worker 
446*61046927SAndroid Build Coastguard Worker   vecs[0] = abcd_0;
447*61046927SAndroid Build Coastguard Worker   vecs[1] = abcd_1;
448*61046927SAndroid Build Coastguard Worker   vecs[2] = abcd_2;
449*61046927SAndroid Build Coastguard Worker   vecs[3] = abcd_3;
450*61046927SAndroid Build Coastguard Worker }
451*61046927SAndroid Build Coastguard Worker 
transpose_msg_vecs4(const uint8_t * const * inputs,size_t block_offset,__m128i out[16])452*61046927SAndroid Build Coastguard Worker INLINE void transpose_msg_vecs4(const uint8_t *const *inputs,
453*61046927SAndroid Build Coastguard Worker                                 size_t block_offset, __m128i out[16]) {
454*61046927SAndroid Build Coastguard Worker   out[0] = loadu_128(&inputs[0][block_offset + 0 * sizeof(__m128i)]);
455*61046927SAndroid Build Coastguard Worker   out[1] = loadu_128(&inputs[1][block_offset + 0 * sizeof(__m128i)]);
456*61046927SAndroid Build Coastguard Worker   out[2] = loadu_128(&inputs[2][block_offset + 0 * sizeof(__m128i)]);
457*61046927SAndroid Build Coastguard Worker   out[3] = loadu_128(&inputs[3][block_offset + 0 * sizeof(__m128i)]);
458*61046927SAndroid Build Coastguard Worker   out[4] = loadu_128(&inputs[0][block_offset + 1 * sizeof(__m128i)]);
459*61046927SAndroid Build Coastguard Worker   out[5] = loadu_128(&inputs[1][block_offset + 1 * sizeof(__m128i)]);
460*61046927SAndroid Build Coastguard Worker   out[6] = loadu_128(&inputs[2][block_offset + 1 * sizeof(__m128i)]);
461*61046927SAndroid Build Coastguard Worker   out[7] = loadu_128(&inputs[3][block_offset + 1 * sizeof(__m128i)]);
462*61046927SAndroid Build Coastguard Worker   out[8] = loadu_128(&inputs[0][block_offset + 2 * sizeof(__m128i)]);
463*61046927SAndroid Build Coastguard Worker   out[9] = loadu_128(&inputs[1][block_offset + 2 * sizeof(__m128i)]);
464*61046927SAndroid Build Coastguard Worker   out[10] = loadu_128(&inputs[2][block_offset + 2 * sizeof(__m128i)]);
465*61046927SAndroid Build Coastguard Worker   out[11] = loadu_128(&inputs[3][block_offset + 2 * sizeof(__m128i)]);
466*61046927SAndroid Build Coastguard Worker   out[12] = loadu_128(&inputs[0][block_offset + 3 * sizeof(__m128i)]);
467*61046927SAndroid Build Coastguard Worker   out[13] = loadu_128(&inputs[1][block_offset + 3 * sizeof(__m128i)]);
468*61046927SAndroid Build Coastguard Worker   out[14] = loadu_128(&inputs[2][block_offset + 3 * sizeof(__m128i)]);
469*61046927SAndroid Build Coastguard Worker   out[15] = loadu_128(&inputs[3][block_offset + 3 * sizeof(__m128i)]);
470*61046927SAndroid Build Coastguard Worker   for (size_t i = 0; i < 4; ++i) {
471*61046927SAndroid Build Coastguard Worker     _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
472*61046927SAndroid Build Coastguard Worker   }
473*61046927SAndroid Build Coastguard Worker   transpose_vecs_128(&out[0]);
474*61046927SAndroid Build Coastguard Worker   transpose_vecs_128(&out[4]);
475*61046927SAndroid Build Coastguard Worker   transpose_vecs_128(&out[8]);
476*61046927SAndroid Build Coastguard Worker   transpose_vecs_128(&out[12]);
477*61046927SAndroid Build Coastguard Worker }
478*61046927SAndroid Build Coastguard Worker 
load_counters4(uint64_t counter,bool increment_counter,__m128i * out_lo,__m128i * out_hi)479*61046927SAndroid Build Coastguard Worker INLINE void load_counters4(uint64_t counter, bool increment_counter,
480*61046927SAndroid Build Coastguard Worker                            __m128i *out_lo, __m128i *out_hi) {
481*61046927SAndroid Build Coastguard Worker   uint64_t mask = (increment_counter ? ~0 : 0);
482*61046927SAndroid Build Coastguard Worker   __m256i mask_vec = _mm256_set1_epi64x(mask);
483*61046927SAndroid Build Coastguard Worker   __m256i deltas = _mm256_setr_epi64x(0, 1, 2, 3);
484*61046927SAndroid Build Coastguard Worker   deltas = _mm256_and_si256(mask_vec, deltas);
485*61046927SAndroid Build Coastguard Worker   __m256i counters =
486*61046927SAndroid Build Coastguard Worker       _mm256_add_epi64(_mm256_set1_epi64x((int64_t)counter), deltas);
487*61046927SAndroid Build Coastguard Worker   *out_lo = _mm256_cvtepi64_epi32(counters);
488*61046927SAndroid Build Coastguard Worker   *out_hi = _mm256_cvtepi64_epi32(_mm256_srli_epi64(counters, 32));
489*61046927SAndroid Build Coastguard Worker }
490*61046927SAndroid Build Coastguard Worker 
491*61046927SAndroid Build Coastguard Worker static
blake3_hash4_avx512(const uint8_t * const * inputs,size_t blocks,const uint32_t key[8],uint64_t counter,bool increment_counter,uint8_t flags,uint8_t flags_start,uint8_t flags_end,uint8_t * out)492*61046927SAndroid Build Coastguard Worker void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
493*61046927SAndroid Build Coastguard Worker                          const uint32_t key[8], uint64_t counter,
494*61046927SAndroid Build Coastguard Worker                          bool increment_counter, uint8_t flags,
495*61046927SAndroid Build Coastguard Worker                          uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
496*61046927SAndroid Build Coastguard Worker   __m128i h_vecs[8] = {
497*61046927SAndroid Build Coastguard Worker       set1_128(key[0]), set1_128(key[1]), set1_128(key[2]), set1_128(key[3]),
498*61046927SAndroid Build Coastguard Worker       set1_128(key[4]), set1_128(key[5]), set1_128(key[6]), set1_128(key[7]),
499*61046927SAndroid Build Coastguard Worker   };
500*61046927SAndroid Build Coastguard Worker   __m128i counter_low_vec, counter_high_vec;
501*61046927SAndroid Build Coastguard Worker   load_counters4(counter, increment_counter, &counter_low_vec,
502*61046927SAndroid Build Coastguard Worker                  &counter_high_vec);
503*61046927SAndroid Build Coastguard Worker   uint8_t block_flags = flags | flags_start;
504*61046927SAndroid Build Coastguard Worker 
505*61046927SAndroid Build Coastguard Worker   for (size_t block = 0; block < blocks; block++) {
506*61046927SAndroid Build Coastguard Worker     if (block + 1 == blocks) {
507*61046927SAndroid Build Coastguard Worker       block_flags |= flags_end;
508*61046927SAndroid Build Coastguard Worker     }
509*61046927SAndroid Build Coastguard Worker     __m128i block_len_vec = set1_128(BLAKE3_BLOCK_LEN);
510*61046927SAndroid Build Coastguard Worker     __m128i block_flags_vec = set1_128(block_flags);
511*61046927SAndroid Build Coastguard Worker     __m128i msg_vecs[16];
512*61046927SAndroid Build Coastguard Worker     transpose_msg_vecs4(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
513*61046927SAndroid Build Coastguard Worker 
514*61046927SAndroid Build Coastguard Worker     __m128i v[16] = {
515*61046927SAndroid Build Coastguard Worker         h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
516*61046927SAndroid Build Coastguard Worker         h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
517*61046927SAndroid Build Coastguard Worker         set1_128(IV[0]), set1_128(IV[1]),  set1_128(IV[2]), set1_128(IV[3]),
518*61046927SAndroid Build Coastguard Worker         counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
519*61046927SAndroid Build Coastguard Worker     };
520*61046927SAndroid Build Coastguard Worker     round_fn4(v, msg_vecs, 0);
521*61046927SAndroid Build Coastguard Worker     round_fn4(v, msg_vecs, 1);
522*61046927SAndroid Build Coastguard Worker     round_fn4(v, msg_vecs, 2);
523*61046927SAndroid Build Coastguard Worker     round_fn4(v, msg_vecs, 3);
524*61046927SAndroid Build Coastguard Worker     round_fn4(v, msg_vecs, 4);
525*61046927SAndroid Build Coastguard Worker     round_fn4(v, msg_vecs, 5);
526*61046927SAndroid Build Coastguard Worker     round_fn4(v, msg_vecs, 6);
527*61046927SAndroid Build Coastguard Worker     h_vecs[0] = xor_128(v[0], v[8]);
528*61046927SAndroid Build Coastguard Worker     h_vecs[1] = xor_128(v[1], v[9]);
529*61046927SAndroid Build Coastguard Worker     h_vecs[2] = xor_128(v[2], v[10]);
530*61046927SAndroid Build Coastguard Worker     h_vecs[3] = xor_128(v[3], v[11]);
531*61046927SAndroid Build Coastguard Worker     h_vecs[4] = xor_128(v[4], v[12]);
532*61046927SAndroid Build Coastguard Worker     h_vecs[5] = xor_128(v[5], v[13]);
533*61046927SAndroid Build Coastguard Worker     h_vecs[6] = xor_128(v[6], v[14]);
534*61046927SAndroid Build Coastguard Worker     h_vecs[7] = xor_128(v[7], v[15]);
535*61046927SAndroid Build Coastguard Worker 
536*61046927SAndroid Build Coastguard Worker     block_flags = flags;
537*61046927SAndroid Build Coastguard Worker   }
538*61046927SAndroid Build Coastguard Worker 
539*61046927SAndroid Build Coastguard Worker   transpose_vecs_128(&h_vecs[0]);
540*61046927SAndroid Build Coastguard Worker   transpose_vecs_128(&h_vecs[4]);
541*61046927SAndroid Build Coastguard Worker   // The first four vecs now contain the first half of each output, and the
542*61046927SAndroid Build Coastguard Worker   // second four vecs contain the second half of each output.
543*61046927SAndroid Build Coastguard Worker   storeu_128(h_vecs[0], &out[0 * sizeof(__m128i)]);
544*61046927SAndroid Build Coastguard Worker   storeu_128(h_vecs[4], &out[1 * sizeof(__m128i)]);
545*61046927SAndroid Build Coastguard Worker   storeu_128(h_vecs[1], &out[2 * sizeof(__m128i)]);
546*61046927SAndroid Build Coastguard Worker   storeu_128(h_vecs[5], &out[3 * sizeof(__m128i)]);
547*61046927SAndroid Build Coastguard Worker   storeu_128(h_vecs[2], &out[4 * sizeof(__m128i)]);
548*61046927SAndroid Build Coastguard Worker   storeu_128(h_vecs[6], &out[5 * sizeof(__m128i)]);
549*61046927SAndroid Build Coastguard Worker   storeu_128(h_vecs[3], &out[6 * sizeof(__m128i)]);
550*61046927SAndroid Build Coastguard Worker   storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]);
551*61046927SAndroid Build Coastguard Worker }
552*61046927SAndroid Build Coastguard Worker 
553*61046927SAndroid Build Coastguard Worker /*
554*61046927SAndroid Build Coastguard Worker  * ----------------------------------------------------------------------------
555*61046927SAndroid Build Coastguard Worker  * hash8_avx512
556*61046927SAndroid Build Coastguard Worker  * ----------------------------------------------------------------------------
557*61046927SAndroid Build Coastguard Worker  */
558*61046927SAndroid Build Coastguard Worker 
round_fn8(__m256i v[16],__m256i m[16],size_t r)559*61046927SAndroid Build Coastguard Worker INLINE void round_fn8(__m256i v[16], __m256i m[16], size_t r) {
560*61046927SAndroid Build Coastguard Worker   v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
561*61046927SAndroid Build Coastguard Worker   v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
562*61046927SAndroid Build Coastguard Worker   v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
563*61046927SAndroid Build Coastguard Worker   v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
564*61046927SAndroid Build Coastguard Worker   v[0] = add_256(v[0], v[4]);
565*61046927SAndroid Build Coastguard Worker   v[1] = add_256(v[1], v[5]);
566*61046927SAndroid Build Coastguard Worker   v[2] = add_256(v[2], v[6]);
567*61046927SAndroid Build Coastguard Worker   v[3] = add_256(v[3], v[7]);
568*61046927SAndroid Build Coastguard Worker   v[12] = xor_256(v[12], v[0]);
569*61046927SAndroid Build Coastguard Worker   v[13] = xor_256(v[13], v[1]);
570*61046927SAndroid Build Coastguard Worker   v[14] = xor_256(v[14], v[2]);
571*61046927SAndroid Build Coastguard Worker   v[15] = xor_256(v[15], v[3]);
572*61046927SAndroid Build Coastguard Worker   v[12] = rot16_256(v[12]);
573*61046927SAndroid Build Coastguard Worker   v[13] = rot16_256(v[13]);
574*61046927SAndroid Build Coastguard Worker   v[14] = rot16_256(v[14]);
575*61046927SAndroid Build Coastguard Worker   v[15] = rot16_256(v[15]);
576*61046927SAndroid Build Coastguard Worker   v[8] = add_256(v[8], v[12]);
577*61046927SAndroid Build Coastguard Worker   v[9] = add_256(v[9], v[13]);
578*61046927SAndroid Build Coastguard Worker   v[10] = add_256(v[10], v[14]);
579*61046927SAndroid Build Coastguard Worker   v[11] = add_256(v[11], v[15]);
580*61046927SAndroid Build Coastguard Worker   v[4] = xor_256(v[4], v[8]);
581*61046927SAndroid Build Coastguard Worker   v[5] = xor_256(v[5], v[9]);
582*61046927SAndroid Build Coastguard Worker   v[6] = xor_256(v[6], v[10]);
583*61046927SAndroid Build Coastguard Worker   v[7] = xor_256(v[7], v[11]);
584*61046927SAndroid Build Coastguard Worker   v[4] = rot12_256(v[4]);
585*61046927SAndroid Build Coastguard Worker   v[5] = rot12_256(v[5]);
586*61046927SAndroid Build Coastguard Worker   v[6] = rot12_256(v[6]);
587*61046927SAndroid Build Coastguard Worker   v[7] = rot12_256(v[7]);
588*61046927SAndroid Build Coastguard Worker   v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
589*61046927SAndroid Build Coastguard Worker   v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
590*61046927SAndroid Build Coastguard Worker   v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
591*61046927SAndroid Build Coastguard Worker   v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
592*61046927SAndroid Build Coastguard Worker   v[0] = add_256(v[0], v[4]);
593*61046927SAndroid Build Coastguard Worker   v[1] = add_256(v[1], v[5]);
594*61046927SAndroid Build Coastguard Worker   v[2] = add_256(v[2], v[6]);
595*61046927SAndroid Build Coastguard Worker   v[3] = add_256(v[3], v[7]);
596*61046927SAndroid Build Coastguard Worker   v[12] = xor_256(v[12], v[0]);
597*61046927SAndroid Build Coastguard Worker   v[13] = xor_256(v[13], v[1]);
598*61046927SAndroid Build Coastguard Worker   v[14] = xor_256(v[14], v[2]);
599*61046927SAndroid Build Coastguard Worker   v[15] = xor_256(v[15], v[3]);
600*61046927SAndroid Build Coastguard Worker   v[12] = rot8_256(v[12]);
601*61046927SAndroid Build Coastguard Worker   v[13] = rot8_256(v[13]);
602*61046927SAndroid Build Coastguard Worker   v[14] = rot8_256(v[14]);
603*61046927SAndroid Build Coastguard Worker   v[15] = rot8_256(v[15]);
604*61046927SAndroid Build Coastguard Worker   v[8] = add_256(v[8], v[12]);
605*61046927SAndroid Build Coastguard Worker   v[9] = add_256(v[9], v[13]);
606*61046927SAndroid Build Coastguard Worker   v[10] = add_256(v[10], v[14]);
607*61046927SAndroid Build Coastguard Worker   v[11] = add_256(v[11], v[15]);
608*61046927SAndroid Build Coastguard Worker   v[4] = xor_256(v[4], v[8]);
609*61046927SAndroid Build Coastguard Worker   v[5] = xor_256(v[5], v[9]);
610*61046927SAndroid Build Coastguard Worker   v[6] = xor_256(v[6], v[10]);
611*61046927SAndroid Build Coastguard Worker   v[7] = xor_256(v[7], v[11]);
612*61046927SAndroid Build Coastguard Worker   v[4] = rot7_256(v[4]);
613*61046927SAndroid Build Coastguard Worker   v[5] = rot7_256(v[5]);
614*61046927SAndroid Build Coastguard Worker   v[6] = rot7_256(v[6]);
615*61046927SAndroid Build Coastguard Worker   v[7] = rot7_256(v[7]);
616*61046927SAndroid Build Coastguard Worker 
617*61046927SAndroid Build Coastguard Worker   v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
618*61046927SAndroid Build Coastguard Worker   v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
619*61046927SAndroid Build Coastguard Worker   v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
620*61046927SAndroid Build Coastguard Worker   v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
621*61046927SAndroid Build Coastguard Worker   v[0] = add_256(v[0], v[5]);
622*61046927SAndroid Build Coastguard Worker   v[1] = add_256(v[1], v[6]);
623*61046927SAndroid Build Coastguard Worker   v[2] = add_256(v[2], v[7]);
624*61046927SAndroid Build Coastguard Worker   v[3] = add_256(v[3], v[4]);
625*61046927SAndroid Build Coastguard Worker   v[15] = xor_256(v[15], v[0]);
626*61046927SAndroid Build Coastguard Worker   v[12] = xor_256(v[12], v[1]);
627*61046927SAndroid Build Coastguard Worker   v[13] = xor_256(v[13], v[2]);
628*61046927SAndroid Build Coastguard Worker   v[14] = xor_256(v[14], v[3]);
629*61046927SAndroid Build Coastguard Worker   v[15] = rot16_256(v[15]);
630*61046927SAndroid Build Coastguard Worker   v[12] = rot16_256(v[12]);
631*61046927SAndroid Build Coastguard Worker   v[13] = rot16_256(v[13]);
632*61046927SAndroid Build Coastguard Worker   v[14] = rot16_256(v[14]);
633*61046927SAndroid Build Coastguard Worker   v[10] = add_256(v[10], v[15]);
634*61046927SAndroid Build Coastguard Worker   v[11] = add_256(v[11], v[12]);
635*61046927SAndroid Build Coastguard Worker   v[8] = add_256(v[8], v[13]);
636*61046927SAndroid Build Coastguard Worker   v[9] = add_256(v[9], v[14]);
637*61046927SAndroid Build Coastguard Worker   v[5] = xor_256(v[5], v[10]);
638*61046927SAndroid Build Coastguard Worker   v[6] = xor_256(v[6], v[11]);
639*61046927SAndroid Build Coastguard Worker   v[7] = xor_256(v[7], v[8]);
640*61046927SAndroid Build Coastguard Worker   v[4] = xor_256(v[4], v[9]);
641*61046927SAndroid Build Coastguard Worker   v[5] = rot12_256(v[5]);
642*61046927SAndroid Build Coastguard Worker   v[6] = rot12_256(v[6]);
643*61046927SAndroid Build Coastguard Worker   v[7] = rot12_256(v[7]);
644*61046927SAndroid Build Coastguard Worker   v[4] = rot12_256(v[4]);
645*61046927SAndroid Build Coastguard Worker   v[0] = add_256(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
646*61046927SAndroid Build Coastguard Worker   v[1] = add_256(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
647*61046927SAndroid Build Coastguard Worker   v[2] = add_256(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
648*61046927SAndroid Build Coastguard Worker   v[3] = add_256(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
649*61046927SAndroid Build Coastguard Worker   v[0] = add_256(v[0], v[5]);
650*61046927SAndroid Build Coastguard Worker   v[1] = add_256(v[1], v[6]);
651*61046927SAndroid Build Coastguard Worker   v[2] = add_256(v[2], v[7]);
652*61046927SAndroid Build Coastguard Worker   v[3] = add_256(v[3], v[4]);
653*61046927SAndroid Build Coastguard Worker   v[15] = xor_256(v[15], v[0]);
654*61046927SAndroid Build Coastguard Worker   v[12] = xor_256(v[12], v[1]);
655*61046927SAndroid Build Coastguard Worker   v[13] = xor_256(v[13], v[2]);
656*61046927SAndroid Build Coastguard Worker   v[14] = xor_256(v[14], v[3]);
657*61046927SAndroid Build Coastguard Worker   v[15] = rot8_256(v[15]);
658*61046927SAndroid Build Coastguard Worker   v[12] = rot8_256(v[12]);
659*61046927SAndroid Build Coastguard Worker   v[13] = rot8_256(v[13]);
660*61046927SAndroid Build Coastguard Worker   v[14] = rot8_256(v[14]);
661*61046927SAndroid Build Coastguard Worker   v[10] = add_256(v[10], v[15]);
662*61046927SAndroid Build Coastguard Worker   v[11] = add_256(v[11], v[12]);
663*61046927SAndroid Build Coastguard Worker   v[8] = add_256(v[8], v[13]);
664*61046927SAndroid Build Coastguard Worker   v[9] = add_256(v[9], v[14]);
665*61046927SAndroid Build Coastguard Worker   v[5] = xor_256(v[5], v[10]);
666*61046927SAndroid Build Coastguard Worker   v[6] = xor_256(v[6], v[11]);
667*61046927SAndroid Build Coastguard Worker   v[7] = xor_256(v[7], v[8]);
668*61046927SAndroid Build Coastguard Worker   v[4] = xor_256(v[4], v[9]);
669*61046927SAndroid Build Coastguard Worker   v[5] = rot7_256(v[5]);
670*61046927SAndroid Build Coastguard Worker   v[6] = rot7_256(v[6]);
671*61046927SAndroid Build Coastguard Worker   v[7] = rot7_256(v[7]);
672*61046927SAndroid Build Coastguard Worker   v[4] = rot7_256(v[4]);
673*61046927SAndroid Build Coastguard Worker }
674*61046927SAndroid Build Coastguard Worker 
transpose_vecs_256(__m256i vecs[8])675*61046927SAndroid Build Coastguard Worker INLINE void transpose_vecs_256(__m256i vecs[8]) {
676*61046927SAndroid Build Coastguard Worker   // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
677*61046927SAndroid Build Coastguard Worker   // is 22/33/66/77.
678*61046927SAndroid Build Coastguard Worker   __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
679*61046927SAndroid Build Coastguard Worker   __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
680*61046927SAndroid Build Coastguard Worker   __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
681*61046927SAndroid Build Coastguard Worker   __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
682*61046927SAndroid Build Coastguard Worker   __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
683*61046927SAndroid Build Coastguard Worker   __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
684*61046927SAndroid Build Coastguard Worker   __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
685*61046927SAndroid Build Coastguard Worker   __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
686*61046927SAndroid Build Coastguard Worker 
687*61046927SAndroid Build Coastguard Worker   // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
688*61046927SAndroid Build Coastguard Worker   // 11/33.
689*61046927SAndroid Build Coastguard Worker   __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
690*61046927SAndroid Build Coastguard Worker   __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
691*61046927SAndroid Build Coastguard Worker   __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
692*61046927SAndroid Build Coastguard Worker   __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
693*61046927SAndroid Build Coastguard Worker   __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
694*61046927SAndroid Build Coastguard Worker   __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
695*61046927SAndroid Build Coastguard Worker   __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
696*61046927SAndroid Build Coastguard Worker   __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
697*61046927SAndroid Build Coastguard Worker 
698*61046927SAndroid Build Coastguard Worker   // Interleave 128-bit lanes.
699*61046927SAndroid Build Coastguard Worker   vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20);
700*61046927SAndroid Build Coastguard Worker   vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
701*61046927SAndroid Build Coastguard Worker   vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
702*61046927SAndroid Build Coastguard Worker   vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
703*61046927SAndroid Build Coastguard Worker   vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31);
704*61046927SAndroid Build Coastguard Worker   vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
705*61046927SAndroid Build Coastguard Worker   vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
706*61046927SAndroid Build Coastguard Worker   vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
707*61046927SAndroid Build Coastguard Worker }
708*61046927SAndroid Build Coastguard Worker 
transpose_msg_vecs8(const uint8_t * const * inputs,size_t block_offset,__m256i out[16])709*61046927SAndroid Build Coastguard Worker INLINE void transpose_msg_vecs8(const uint8_t *const *inputs,
710*61046927SAndroid Build Coastguard Worker                                 size_t block_offset, __m256i out[16]) {
711*61046927SAndroid Build Coastguard Worker   out[0] = loadu_256(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
712*61046927SAndroid Build Coastguard Worker   out[1] = loadu_256(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
713*61046927SAndroid Build Coastguard Worker   out[2] = loadu_256(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
714*61046927SAndroid Build Coastguard Worker   out[3] = loadu_256(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
715*61046927SAndroid Build Coastguard Worker   out[4] = loadu_256(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
716*61046927SAndroid Build Coastguard Worker   out[5] = loadu_256(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
717*61046927SAndroid Build Coastguard Worker   out[6] = loadu_256(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
718*61046927SAndroid Build Coastguard Worker   out[7] = loadu_256(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
719*61046927SAndroid Build Coastguard Worker   out[8] = loadu_256(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
720*61046927SAndroid Build Coastguard Worker   out[9] = loadu_256(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
721*61046927SAndroid Build Coastguard Worker   out[10] = loadu_256(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
722*61046927SAndroid Build Coastguard Worker   out[11] = loadu_256(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
723*61046927SAndroid Build Coastguard Worker   out[12] = loadu_256(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
724*61046927SAndroid Build Coastguard Worker   out[13] = loadu_256(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
725*61046927SAndroid Build Coastguard Worker   out[14] = loadu_256(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
726*61046927SAndroid Build Coastguard Worker   out[15] = loadu_256(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
727*61046927SAndroid Build Coastguard Worker   for (size_t i = 0; i < 8; ++i) {
728*61046927SAndroid Build Coastguard Worker     _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
729*61046927SAndroid Build Coastguard Worker   }
730*61046927SAndroid Build Coastguard Worker   transpose_vecs_256(&out[0]);
731*61046927SAndroid Build Coastguard Worker   transpose_vecs_256(&out[8]);
732*61046927SAndroid Build Coastguard Worker }
733*61046927SAndroid Build Coastguard Worker 
load_counters8(uint64_t counter,bool increment_counter,__m256i * out_lo,__m256i * out_hi)734*61046927SAndroid Build Coastguard Worker INLINE void load_counters8(uint64_t counter, bool increment_counter,
735*61046927SAndroid Build Coastguard Worker                            __m256i *out_lo, __m256i *out_hi) {
736*61046927SAndroid Build Coastguard Worker   uint64_t mask = (increment_counter ? ~0 : 0);
737*61046927SAndroid Build Coastguard Worker   __m512i mask_vec = _mm512_set1_epi64(mask);
738*61046927SAndroid Build Coastguard Worker   __m512i deltas = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 6, 7);
739*61046927SAndroid Build Coastguard Worker   deltas = _mm512_and_si512(mask_vec, deltas);
740*61046927SAndroid Build Coastguard Worker   __m512i counters =
741*61046927SAndroid Build Coastguard Worker       _mm512_add_epi64(_mm512_set1_epi64((int64_t)counter), deltas);
742*61046927SAndroid Build Coastguard Worker   *out_lo = _mm512_cvtepi64_epi32(counters);
743*61046927SAndroid Build Coastguard Worker   *out_hi = _mm512_cvtepi64_epi32(_mm512_srli_epi64(counters, 32));
744*61046927SAndroid Build Coastguard Worker }
745*61046927SAndroid Build Coastguard Worker 
746*61046927SAndroid Build Coastguard Worker static
blake3_hash8_avx512(const uint8_t * const * inputs,size_t blocks,const uint32_t key[8],uint64_t counter,bool increment_counter,uint8_t flags,uint8_t flags_start,uint8_t flags_end,uint8_t * out)747*61046927SAndroid Build Coastguard Worker void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
748*61046927SAndroid Build Coastguard Worker                          const uint32_t key[8], uint64_t counter,
749*61046927SAndroid Build Coastguard Worker                          bool increment_counter, uint8_t flags,
750*61046927SAndroid Build Coastguard Worker                          uint8_t flags_start, uint8_t flags_end, uint8_t *out) {
751*61046927SAndroid Build Coastguard Worker   __m256i h_vecs[8] = {
752*61046927SAndroid Build Coastguard Worker       set1_256(key[0]), set1_256(key[1]), set1_256(key[2]), set1_256(key[3]),
753*61046927SAndroid Build Coastguard Worker       set1_256(key[4]), set1_256(key[5]), set1_256(key[6]), set1_256(key[7]),
754*61046927SAndroid Build Coastguard Worker   };
755*61046927SAndroid Build Coastguard Worker   __m256i counter_low_vec, counter_high_vec;
756*61046927SAndroid Build Coastguard Worker   load_counters8(counter, increment_counter, &counter_low_vec,
757*61046927SAndroid Build Coastguard Worker                  &counter_high_vec);
758*61046927SAndroid Build Coastguard Worker   uint8_t block_flags = flags | flags_start;
759*61046927SAndroid Build Coastguard Worker 
760*61046927SAndroid Build Coastguard Worker   for (size_t block = 0; block < blocks; block++) {
761*61046927SAndroid Build Coastguard Worker     if (block + 1 == blocks) {
762*61046927SAndroid Build Coastguard Worker       block_flags |= flags_end;
763*61046927SAndroid Build Coastguard Worker     }
764*61046927SAndroid Build Coastguard Worker     __m256i block_len_vec = set1_256(BLAKE3_BLOCK_LEN);
765*61046927SAndroid Build Coastguard Worker     __m256i block_flags_vec = set1_256(block_flags);
766*61046927SAndroid Build Coastguard Worker     __m256i msg_vecs[16];
767*61046927SAndroid Build Coastguard Worker     transpose_msg_vecs8(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
768*61046927SAndroid Build Coastguard Worker 
769*61046927SAndroid Build Coastguard Worker     __m256i v[16] = {
770*61046927SAndroid Build Coastguard Worker         h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
771*61046927SAndroid Build Coastguard Worker         h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
772*61046927SAndroid Build Coastguard Worker         set1_256(IV[0]), set1_256(IV[1]),  set1_256(IV[2]), set1_256(IV[3]),
773*61046927SAndroid Build Coastguard Worker         counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
774*61046927SAndroid Build Coastguard Worker     };
775*61046927SAndroid Build Coastguard Worker     round_fn8(v, msg_vecs, 0);
776*61046927SAndroid Build Coastguard Worker     round_fn8(v, msg_vecs, 1);
777*61046927SAndroid Build Coastguard Worker     round_fn8(v, msg_vecs, 2);
778*61046927SAndroid Build Coastguard Worker     round_fn8(v, msg_vecs, 3);
779*61046927SAndroid Build Coastguard Worker     round_fn8(v, msg_vecs, 4);
780*61046927SAndroid Build Coastguard Worker     round_fn8(v, msg_vecs, 5);
781*61046927SAndroid Build Coastguard Worker     round_fn8(v, msg_vecs, 6);
782*61046927SAndroid Build Coastguard Worker     h_vecs[0] = xor_256(v[0], v[8]);
783*61046927SAndroid Build Coastguard Worker     h_vecs[1] = xor_256(v[1], v[9]);
784*61046927SAndroid Build Coastguard Worker     h_vecs[2] = xor_256(v[2], v[10]);
785*61046927SAndroid Build Coastguard Worker     h_vecs[3] = xor_256(v[3], v[11]);
786*61046927SAndroid Build Coastguard Worker     h_vecs[4] = xor_256(v[4], v[12]);
787*61046927SAndroid Build Coastguard Worker     h_vecs[5] = xor_256(v[5], v[13]);
788*61046927SAndroid Build Coastguard Worker     h_vecs[6] = xor_256(v[6], v[14]);
789*61046927SAndroid Build Coastguard Worker     h_vecs[7] = xor_256(v[7], v[15]);
790*61046927SAndroid Build Coastguard Worker 
791*61046927SAndroid Build Coastguard Worker     block_flags = flags;
792*61046927SAndroid Build Coastguard Worker   }
793*61046927SAndroid Build Coastguard Worker 
794*61046927SAndroid Build Coastguard Worker   transpose_vecs_256(h_vecs);
795*61046927SAndroid Build Coastguard Worker   storeu_256(h_vecs[0], &out[0 * sizeof(__m256i)]);
796*61046927SAndroid Build Coastguard Worker   storeu_256(h_vecs[1], &out[1 * sizeof(__m256i)]);
797*61046927SAndroid Build Coastguard Worker   storeu_256(h_vecs[2], &out[2 * sizeof(__m256i)]);
798*61046927SAndroid Build Coastguard Worker   storeu_256(h_vecs[3], &out[3 * sizeof(__m256i)]);
799*61046927SAndroid Build Coastguard Worker   storeu_256(h_vecs[4], &out[4 * sizeof(__m256i)]);
800*61046927SAndroid Build Coastguard Worker   storeu_256(h_vecs[5], &out[5 * sizeof(__m256i)]);
801*61046927SAndroid Build Coastguard Worker   storeu_256(h_vecs[6], &out[6 * sizeof(__m256i)]);
802*61046927SAndroid Build Coastguard Worker   storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]);
803*61046927SAndroid Build Coastguard Worker }
804*61046927SAndroid Build Coastguard Worker 
805*61046927SAndroid Build Coastguard Worker /*
806*61046927SAndroid Build Coastguard Worker  * ----------------------------------------------------------------------------
807*61046927SAndroid Build Coastguard Worker  * hash16_avx512
808*61046927SAndroid Build Coastguard Worker  * ----------------------------------------------------------------------------
809*61046927SAndroid Build Coastguard Worker  */
810*61046927SAndroid Build Coastguard Worker 
round_fn16(__m512i v[16],__m512i m[16],size_t r)811*61046927SAndroid Build Coastguard Worker INLINE void round_fn16(__m512i v[16], __m512i m[16], size_t r) {
812*61046927SAndroid Build Coastguard Worker   v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][0]]);
813*61046927SAndroid Build Coastguard Worker   v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
814*61046927SAndroid Build Coastguard Worker   v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
815*61046927SAndroid Build Coastguard Worker   v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
816*61046927SAndroid Build Coastguard Worker   v[0] = add_512(v[0], v[4]);
817*61046927SAndroid Build Coastguard Worker   v[1] = add_512(v[1], v[5]);
818*61046927SAndroid Build Coastguard Worker   v[2] = add_512(v[2], v[6]);
819*61046927SAndroid Build Coastguard Worker   v[3] = add_512(v[3], v[7]);
820*61046927SAndroid Build Coastguard Worker   v[12] = xor_512(v[12], v[0]);
821*61046927SAndroid Build Coastguard Worker   v[13] = xor_512(v[13], v[1]);
822*61046927SAndroid Build Coastguard Worker   v[14] = xor_512(v[14], v[2]);
823*61046927SAndroid Build Coastguard Worker   v[15] = xor_512(v[15], v[3]);
824*61046927SAndroid Build Coastguard Worker   v[12] = rot16_512(v[12]);
825*61046927SAndroid Build Coastguard Worker   v[13] = rot16_512(v[13]);
826*61046927SAndroid Build Coastguard Worker   v[14] = rot16_512(v[14]);
827*61046927SAndroid Build Coastguard Worker   v[15] = rot16_512(v[15]);
828*61046927SAndroid Build Coastguard Worker   v[8] = add_512(v[8], v[12]);
829*61046927SAndroid Build Coastguard Worker   v[9] = add_512(v[9], v[13]);
830*61046927SAndroid Build Coastguard Worker   v[10] = add_512(v[10], v[14]);
831*61046927SAndroid Build Coastguard Worker   v[11] = add_512(v[11], v[15]);
832*61046927SAndroid Build Coastguard Worker   v[4] = xor_512(v[4], v[8]);
833*61046927SAndroid Build Coastguard Worker   v[5] = xor_512(v[5], v[9]);
834*61046927SAndroid Build Coastguard Worker   v[6] = xor_512(v[6], v[10]);
835*61046927SAndroid Build Coastguard Worker   v[7] = xor_512(v[7], v[11]);
836*61046927SAndroid Build Coastguard Worker   v[4] = rot12_512(v[4]);
837*61046927SAndroid Build Coastguard Worker   v[5] = rot12_512(v[5]);
838*61046927SAndroid Build Coastguard Worker   v[6] = rot12_512(v[6]);
839*61046927SAndroid Build Coastguard Worker   v[7] = rot12_512(v[7]);
840*61046927SAndroid Build Coastguard Worker   v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][1]]);
841*61046927SAndroid Build Coastguard Worker   v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
842*61046927SAndroid Build Coastguard Worker   v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
843*61046927SAndroid Build Coastguard Worker   v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
844*61046927SAndroid Build Coastguard Worker   v[0] = add_512(v[0], v[4]);
845*61046927SAndroid Build Coastguard Worker   v[1] = add_512(v[1], v[5]);
846*61046927SAndroid Build Coastguard Worker   v[2] = add_512(v[2], v[6]);
847*61046927SAndroid Build Coastguard Worker   v[3] = add_512(v[3], v[7]);
848*61046927SAndroid Build Coastguard Worker   v[12] = xor_512(v[12], v[0]);
849*61046927SAndroid Build Coastguard Worker   v[13] = xor_512(v[13], v[1]);
850*61046927SAndroid Build Coastguard Worker   v[14] = xor_512(v[14], v[2]);
851*61046927SAndroid Build Coastguard Worker   v[15] = xor_512(v[15], v[3]);
852*61046927SAndroid Build Coastguard Worker   v[12] = rot8_512(v[12]);
853*61046927SAndroid Build Coastguard Worker   v[13] = rot8_512(v[13]);
854*61046927SAndroid Build Coastguard Worker   v[14] = rot8_512(v[14]);
855*61046927SAndroid Build Coastguard Worker   v[15] = rot8_512(v[15]);
856*61046927SAndroid Build Coastguard Worker   v[8] = add_512(v[8], v[12]);
857*61046927SAndroid Build Coastguard Worker   v[9] = add_512(v[9], v[13]);
858*61046927SAndroid Build Coastguard Worker   v[10] = add_512(v[10], v[14]);
859*61046927SAndroid Build Coastguard Worker   v[11] = add_512(v[11], v[15]);
860*61046927SAndroid Build Coastguard Worker   v[4] = xor_512(v[4], v[8]);
861*61046927SAndroid Build Coastguard Worker   v[5] = xor_512(v[5], v[9]);
862*61046927SAndroid Build Coastguard Worker   v[6] = xor_512(v[6], v[10]);
863*61046927SAndroid Build Coastguard Worker   v[7] = xor_512(v[7], v[11]);
864*61046927SAndroid Build Coastguard Worker   v[4] = rot7_512(v[4]);
865*61046927SAndroid Build Coastguard Worker   v[5] = rot7_512(v[5]);
866*61046927SAndroid Build Coastguard Worker   v[6] = rot7_512(v[6]);
867*61046927SAndroid Build Coastguard Worker   v[7] = rot7_512(v[7]);
868*61046927SAndroid Build Coastguard Worker 
869*61046927SAndroid Build Coastguard Worker   v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][8]]);
870*61046927SAndroid Build Coastguard Worker   v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
871*61046927SAndroid Build Coastguard Worker   v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
872*61046927SAndroid Build Coastguard Worker   v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
873*61046927SAndroid Build Coastguard Worker   v[0] = add_512(v[0], v[5]);
874*61046927SAndroid Build Coastguard Worker   v[1] = add_512(v[1], v[6]);
875*61046927SAndroid Build Coastguard Worker   v[2] = add_512(v[2], v[7]);
876*61046927SAndroid Build Coastguard Worker   v[3] = add_512(v[3], v[4]);
877*61046927SAndroid Build Coastguard Worker   v[15] = xor_512(v[15], v[0]);
878*61046927SAndroid Build Coastguard Worker   v[12] = xor_512(v[12], v[1]);
879*61046927SAndroid Build Coastguard Worker   v[13] = xor_512(v[13], v[2]);
880*61046927SAndroid Build Coastguard Worker   v[14] = xor_512(v[14], v[3]);
881*61046927SAndroid Build Coastguard Worker   v[15] = rot16_512(v[15]);
882*61046927SAndroid Build Coastguard Worker   v[12] = rot16_512(v[12]);
883*61046927SAndroid Build Coastguard Worker   v[13] = rot16_512(v[13]);
884*61046927SAndroid Build Coastguard Worker   v[14] = rot16_512(v[14]);
885*61046927SAndroid Build Coastguard Worker   v[10] = add_512(v[10], v[15]);
886*61046927SAndroid Build Coastguard Worker   v[11] = add_512(v[11], v[12]);
887*61046927SAndroid Build Coastguard Worker   v[8] = add_512(v[8], v[13]);
888*61046927SAndroid Build Coastguard Worker   v[9] = add_512(v[9], v[14]);
889*61046927SAndroid Build Coastguard Worker   v[5] = xor_512(v[5], v[10]);
890*61046927SAndroid Build Coastguard Worker   v[6] = xor_512(v[6], v[11]);
891*61046927SAndroid Build Coastguard Worker   v[7] = xor_512(v[7], v[8]);
892*61046927SAndroid Build Coastguard Worker   v[4] = xor_512(v[4], v[9]);
893*61046927SAndroid Build Coastguard Worker   v[5] = rot12_512(v[5]);
894*61046927SAndroid Build Coastguard Worker   v[6] = rot12_512(v[6]);
895*61046927SAndroid Build Coastguard Worker   v[7] = rot12_512(v[7]);
896*61046927SAndroid Build Coastguard Worker   v[4] = rot12_512(v[4]);
897*61046927SAndroid Build Coastguard Worker   v[0] = add_512(v[0], m[(size_t)MSG_SCHEDULE[r][9]]);
898*61046927SAndroid Build Coastguard Worker   v[1] = add_512(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
899*61046927SAndroid Build Coastguard Worker   v[2] = add_512(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
900*61046927SAndroid Build Coastguard Worker   v[3] = add_512(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
901*61046927SAndroid Build Coastguard Worker   v[0] = add_512(v[0], v[5]);
902*61046927SAndroid Build Coastguard Worker   v[1] = add_512(v[1], v[6]);
903*61046927SAndroid Build Coastguard Worker   v[2] = add_512(v[2], v[7]);
904*61046927SAndroid Build Coastguard Worker   v[3] = add_512(v[3], v[4]);
905*61046927SAndroid Build Coastguard Worker   v[15] = xor_512(v[15], v[0]);
906*61046927SAndroid Build Coastguard Worker   v[12] = xor_512(v[12], v[1]);
907*61046927SAndroid Build Coastguard Worker   v[13] = xor_512(v[13], v[2]);
908*61046927SAndroid Build Coastguard Worker   v[14] = xor_512(v[14], v[3]);
909*61046927SAndroid Build Coastguard Worker   v[15] = rot8_512(v[15]);
910*61046927SAndroid Build Coastguard Worker   v[12] = rot8_512(v[12]);
911*61046927SAndroid Build Coastguard Worker   v[13] = rot8_512(v[13]);
912*61046927SAndroid Build Coastguard Worker   v[14] = rot8_512(v[14]);
913*61046927SAndroid Build Coastguard Worker   v[10] = add_512(v[10], v[15]);
914*61046927SAndroid Build Coastguard Worker   v[11] = add_512(v[11], v[12]);
915*61046927SAndroid Build Coastguard Worker   v[8] = add_512(v[8], v[13]);
916*61046927SAndroid Build Coastguard Worker   v[9] = add_512(v[9], v[14]);
917*61046927SAndroid Build Coastguard Worker   v[5] = xor_512(v[5], v[10]);
918*61046927SAndroid Build Coastguard Worker   v[6] = xor_512(v[6], v[11]);
919*61046927SAndroid Build Coastguard Worker   v[7] = xor_512(v[7], v[8]);
920*61046927SAndroid Build Coastguard Worker   v[4] = xor_512(v[4], v[9]);
921*61046927SAndroid Build Coastguard Worker   v[5] = rot7_512(v[5]);
922*61046927SAndroid Build Coastguard Worker   v[6] = rot7_512(v[6]);
923*61046927SAndroid Build Coastguard Worker   v[7] = rot7_512(v[7]);
924*61046927SAndroid Build Coastguard Worker   v[4] = rot7_512(v[4]);
925*61046927SAndroid Build Coastguard Worker }
926*61046927SAndroid Build Coastguard Worker 
927*61046927SAndroid Build Coastguard Worker // 0b10001000, or lanes a0/a2/b0/b2 in little-endian order
928*61046927SAndroid Build Coastguard Worker #define LO_IMM8 0x88
929*61046927SAndroid Build Coastguard Worker 
unpack_lo_128(__m512i a,__m512i b)930*61046927SAndroid Build Coastguard Worker INLINE __m512i unpack_lo_128(__m512i a, __m512i b) {
931*61046927SAndroid Build Coastguard Worker   return _mm512_shuffle_i32x4(a, b, LO_IMM8);
932*61046927SAndroid Build Coastguard Worker }
933*61046927SAndroid Build Coastguard Worker 
934*61046927SAndroid Build Coastguard Worker // 0b11011101, or lanes a1/a3/b1/b3 in little-endian order
935*61046927SAndroid Build Coastguard Worker #define HI_IMM8 0xdd
936*61046927SAndroid Build Coastguard Worker 
unpack_hi_128(__m512i a,__m512i b)937*61046927SAndroid Build Coastguard Worker INLINE __m512i unpack_hi_128(__m512i a, __m512i b) {
938*61046927SAndroid Build Coastguard Worker   return _mm512_shuffle_i32x4(a, b, HI_IMM8);
939*61046927SAndroid Build Coastguard Worker }
940*61046927SAndroid Build Coastguard Worker 
transpose_vecs_512(__m512i vecs[16])941*61046927SAndroid Build Coastguard Worker INLINE void transpose_vecs_512(__m512i vecs[16]) {
942*61046927SAndroid Build Coastguard Worker   // Interleave 32-bit lanes. The _0 unpack is lanes
943*61046927SAndroid Build Coastguard Worker   // 0/0/1/1/4/4/5/5/8/8/9/9/12/12/13/13, and the _2 unpack is lanes
944*61046927SAndroid Build Coastguard Worker   // 2/2/3/3/6/6/7/7/10/10/11/11/14/14/15/15.
945*61046927SAndroid Build Coastguard Worker   __m512i ab_0 = _mm512_unpacklo_epi32(vecs[0], vecs[1]);
946*61046927SAndroid Build Coastguard Worker   __m512i ab_2 = _mm512_unpackhi_epi32(vecs[0], vecs[1]);
947*61046927SAndroid Build Coastguard Worker   __m512i cd_0 = _mm512_unpacklo_epi32(vecs[2], vecs[3]);
948*61046927SAndroid Build Coastguard Worker   __m512i cd_2 = _mm512_unpackhi_epi32(vecs[2], vecs[3]);
949*61046927SAndroid Build Coastguard Worker   __m512i ef_0 = _mm512_unpacklo_epi32(vecs[4], vecs[5]);
950*61046927SAndroid Build Coastguard Worker   __m512i ef_2 = _mm512_unpackhi_epi32(vecs[4], vecs[5]);
951*61046927SAndroid Build Coastguard Worker   __m512i gh_0 = _mm512_unpacklo_epi32(vecs[6], vecs[7]);
952*61046927SAndroid Build Coastguard Worker   __m512i gh_2 = _mm512_unpackhi_epi32(vecs[6], vecs[7]);
953*61046927SAndroid Build Coastguard Worker   __m512i ij_0 = _mm512_unpacklo_epi32(vecs[8], vecs[9]);
954*61046927SAndroid Build Coastguard Worker   __m512i ij_2 = _mm512_unpackhi_epi32(vecs[8], vecs[9]);
955*61046927SAndroid Build Coastguard Worker   __m512i kl_0 = _mm512_unpacklo_epi32(vecs[10], vecs[11]);
956*61046927SAndroid Build Coastguard Worker   __m512i kl_2 = _mm512_unpackhi_epi32(vecs[10], vecs[11]);
957*61046927SAndroid Build Coastguard Worker   __m512i mn_0 = _mm512_unpacklo_epi32(vecs[12], vecs[13]);
958*61046927SAndroid Build Coastguard Worker   __m512i mn_2 = _mm512_unpackhi_epi32(vecs[12], vecs[13]);
959*61046927SAndroid Build Coastguard Worker   __m512i op_0 = _mm512_unpacklo_epi32(vecs[14], vecs[15]);
960*61046927SAndroid Build Coastguard Worker   __m512i op_2 = _mm512_unpackhi_epi32(vecs[14], vecs[15]);
961*61046927SAndroid Build Coastguard Worker 
962*61046927SAndroid Build Coastguard Worker   // Interleave 64-bit lanes. The _0 unpack is lanes
963*61046927SAndroid Build Coastguard Worker   // 0/0/0/0/4/4/4/4/8/8/8/8/12/12/12/12, the _1 unpack is lanes
964*61046927SAndroid Build Coastguard Worker   // 1/1/1/1/5/5/5/5/9/9/9/9/13/13/13/13, the _2 unpack is lanes
965*61046927SAndroid Build Coastguard Worker   // 2/2/2/2/6/6/6/6/10/10/10/10/14/14/14/14, and the _3 unpack is lanes
966*61046927SAndroid Build Coastguard Worker   // 3/3/3/3/7/7/7/7/11/11/11/11/15/15/15/15.
967*61046927SAndroid Build Coastguard Worker   __m512i abcd_0 = _mm512_unpacklo_epi64(ab_0, cd_0);
968*61046927SAndroid Build Coastguard Worker   __m512i abcd_1 = _mm512_unpackhi_epi64(ab_0, cd_0);
969*61046927SAndroid Build Coastguard Worker   __m512i abcd_2 = _mm512_unpacklo_epi64(ab_2, cd_2);
970*61046927SAndroid Build Coastguard Worker   __m512i abcd_3 = _mm512_unpackhi_epi64(ab_2, cd_2);
971*61046927SAndroid Build Coastguard Worker   __m512i efgh_0 = _mm512_unpacklo_epi64(ef_0, gh_0);
972*61046927SAndroid Build Coastguard Worker   __m512i efgh_1 = _mm512_unpackhi_epi64(ef_0, gh_0);
973*61046927SAndroid Build Coastguard Worker   __m512i efgh_2 = _mm512_unpacklo_epi64(ef_2, gh_2);
974*61046927SAndroid Build Coastguard Worker   __m512i efgh_3 = _mm512_unpackhi_epi64(ef_2, gh_2);
975*61046927SAndroid Build Coastguard Worker   __m512i ijkl_0 = _mm512_unpacklo_epi64(ij_0, kl_0);
976*61046927SAndroid Build Coastguard Worker   __m512i ijkl_1 = _mm512_unpackhi_epi64(ij_0, kl_0);
977*61046927SAndroid Build Coastguard Worker   __m512i ijkl_2 = _mm512_unpacklo_epi64(ij_2, kl_2);
978*61046927SAndroid Build Coastguard Worker   __m512i ijkl_3 = _mm512_unpackhi_epi64(ij_2, kl_2);
979*61046927SAndroid Build Coastguard Worker   __m512i mnop_0 = _mm512_unpacklo_epi64(mn_0, op_0);
980*61046927SAndroid Build Coastguard Worker   __m512i mnop_1 = _mm512_unpackhi_epi64(mn_0, op_0);
981*61046927SAndroid Build Coastguard Worker   __m512i mnop_2 = _mm512_unpacklo_epi64(mn_2, op_2);
982*61046927SAndroid Build Coastguard Worker   __m512i mnop_3 = _mm512_unpackhi_epi64(mn_2, op_2);
983*61046927SAndroid Build Coastguard Worker 
984*61046927SAndroid Build Coastguard Worker   // Interleave 128-bit lanes. The _0 unpack is
985*61046927SAndroid Build Coastguard Worker   // 0/0/0/0/8/8/8/8/0/0/0/0/8/8/8/8, the _1 unpack is
986*61046927SAndroid Build Coastguard Worker   // 1/1/1/1/9/9/9/9/1/1/1/1/9/9/9/9, and so on.
987*61046927SAndroid Build Coastguard Worker   __m512i abcdefgh_0 = unpack_lo_128(abcd_0, efgh_0);
988*61046927SAndroid Build Coastguard Worker   __m512i abcdefgh_1 = unpack_lo_128(abcd_1, efgh_1);
989*61046927SAndroid Build Coastguard Worker   __m512i abcdefgh_2 = unpack_lo_128(abcd_2, efgh_2);
990*61046927SAndroid Build Coastguard Worker   __m512i abcdefgh_3 = unpack_lo_128(abcd_3, efgh_3);
991*61046927SAndroid Build Coastguard Worker   __m512i abcdefgh_4 = unpack_hi_128(abcd_0, efgh_0);
992*61046927SAndroid Build Coastguard Worker   __m512i abcdefgh_5 = unpack_hi_128(abcd_1, efgh_1);
993*61046927SAndroid Build Coastguard Worker   __m512i abcdefgh_6 = unpack_hi_128(abcd_2, efgh_2);
994*61046927SAndroid Build Coastguard Worker   __m512i abcdefgh_7 = unpack_hi_128(abcd_3, efgh_3);
995*61046927SAndroid Build Coastguard Worker   __m512i ijklmnop_0 = unpack_lo_128(ijkl_0, mnop_0);
996*61046927SAndroid Build Coastguard Worker   __m512i ijklmnop_1 = unpack_lo_128(ijkl_1, mnop_1);
997*61046927SAndroid Build Coastguard Worker   __m512i ijklmnop_2 = unpack_lo_128(ijkl_2, mnop_2);
998*61046927SAndroid Build Coastguard Worker   __m512i ijklmnop_3 = unpack_lo_128(ijkl_3, mnop_3);
999*61046927SAndroid Build Coastguard Worker   __m512i ijklmnop_4 = unpack_hi_128(ijkl_0, mnop_0);
1000*61046927SAndroid Build Coastguard Worker   __m512i ijklmnop_5 = unpack_hi_128(ijkl_1, mnop_1);
1001*61046927SAndroid Build Coastguard Worker   __m512i ijklmnop_6 = unpack_hi_128(ijkl_2, mnop_2);
1002*61046927SAndroid Build Coastguard Worker   __m512i ijklmnop_7 = unpack_hi_128(ijkl_3, mnop_3);
1003*61046927SAndroid Build Coastguard Worker 
1004*61046927SAndroid Build Coastguard Worker   // Interleave 128-bit lanes again for the final outputs.
1005*61046927SAndroid Build Coastguard Worker   vecs[0] = unpack_lo_128(abcdefgh_0, ijklmnop_0);
1006*61046927SAndroid Build Coastguard Worker   vecs[1] = unpack_lo_128(abcdefgh_1, ijklmnop_1);
1007*61046927SAndroid Build Coastguard Worker   vecs[2] = unpack_lo_128(abcdefgh_2, ijklmnop_2);
1008*61046927SAndroid Build Coastguard Worker   vecs[3] = unpack_lo_128(abcdefgh_3, ijklmnop_3);
1009*61046927SAndroid Build Coastguard Worker   vecs[4] = unpack_lo_128(abcdefgh_4, ijklmnop_4);
1010*61046927SAndroid Build Coastguard Worker   vecs[5] = unpack_lo_128(abcdefgh_5, ijklmnop_5);
1011*61046927SAndroid Build Coastguard Worker   vecs[6] = unpack_lo_128(abcdefgh_6, ijklmnop_6);
1012*61046927SAndroid Build Coastguard Worker   vecs[7] = unpack_lo_128(abcdefgh_7, ijklmnop_7);
1013*61046927SAndroid Build Coastguard Worker   vecs[8] = unpack_hi_128(abcdefgh_0, ijklmnop_0);
1014*61046927SAndroid Build Coastguard Worker   vecs[9] = unpack_hi_128(abcdefgh_1, ijklmnop_1);
1015*61046927SAndroid Build Coastguard Worker   vecs[10] = unpack_hi_128(abcdefgh_2, ijklmnop_2);
1016*61046927SAndroid Build Coastguard Worker   vecs[11] = unpack_hi_128(abcdefgh_3, ijklmnop_3);
1017*61046927SAndroid Build Coastguard Worker   vecs[12] = unpack_hi_128(abcdefgh_4, ijklmnop_4);
1018*61046927SAndroid Build Coastguard Worker   vecs[13] = unpack_hi_128(abcdefgh_5, ijklmnop_5);
1019*61046927SAndroid Build Coastguard Worker   vecs[14] = unpack_hi_128(abcdefgh_6, ijklmnop_6);
1020*61046927SAndroid Build Coastguard Worker   vecs[15] = unpack_hi_128(abcdefgh_7, ijklmnop_7);
1021*61046927SAndroid Build Coastguard Worker }
1022*61046927SAndroid Build Coastguard Worker 
transpose_msg_vecs16(const uint8_t * const * inputs,size_t block_offset,__m512i out[16])1023*61046927SAndroid Build Coastguard Worker INLINE void transpose_msg_vecs16(const uint8_t *const *inputs,
1024*61046927SAndroid Build Coastguard Worker                                  size_t block_offset, __m512i out[16]) {
1025*61046927SAndroid Build Coastguard Worker   out[0] = loadu_512(&inputs[0][block_offset]);
1026*61046927SAndroid Build Coastguard Worker   out[1] = loadu_512(&inputs[1][block_offset]);
1027*61046927SAndroid Build Coastguard Worker   out[2] = loadu_512(&inputs[2][block_offset]);
1028*61046927SAndroid Build Coastguard Worker   out[3] = loadu_512(&inputs[3][block_offset]);
1029*61046927SAndroid Build Coastguard Worker   out[4] = loadu_512(&inputs[4][block_offset]);
1030*61046927SAndroid Build Coastguard Worker   out[5] = loadu_512(&inputs[5][block_offset]);
1031*61046927SAndroid Build Coastguard Worker   out[6] = loadu_512(&inputs[6][block_offset]);
1032*61046927SAndroid Build Coastguard Worker   out[7] = loadu_512(&inputs[7][block_offset]);
1033*61046927SAndroid Build Coastguard Worker   out[8] = loadu_512(&inputs[8][block_offset]);
1034*61046927SAndroid Build Coastguard Worker   out[9] = loadu_512(&inputs[9][block_offset]);
1035*61046927SAndroid Build Coastguard Worker   out[10] = loadu_512(&inputs[10][block_offset]);
1036*61046927SAndroid Build Coastguard Worker   out[11] = loadu_512(&inputs[11][block_offset]);
1037*61046927SAndroid Build Coastguard Worker   out[12] = loadu_512(&inputs[12][block_offset]);
1038*61046927SAndroid Build Coastguard Worker   out[13] = loadu_512(&inputs[13][block_offset]);
1039*61046927SAndroid Build Coastguard Worker   out[14] = loadu_512(&inputs[14][block_offset]);
1040*61046927SAndroid Build Coastguard Worker   out[15] = loadu_512(&inputs[15][block_offset]);
1041*61046927SAndroid Build Coastguard Worker   for (size_t i = 0; i < 16; ++i) {
1042*61046927SAndroid Build Coastguard Worker     _mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
1043*61046927SAndroid Build Coastguard Worker   }
1044*61046927SAndroid Build Coastguard Worker   transpose_vecs_512(out);
1045*61046927SAndroid Build Coastguard Worker }
1046*61046927SAndroid Build Coastguard Worker 
load_counters16(uint64_t counter,bool increment_counter,__m512i * out_lo,__m512i * out_hi)1047*61046927SAndroid Build Coastguard Worker INLINE void load_counters16(uint64_t counter, bool increment_counter,
1048*61046927SAndroid Build Coastguard Worker                             __m512i *out_lo, __m512i *out_hi) {
1049*61046927SAndroid Build Coastguard Worker   const __m512i mask = _mm512_set1_epi32(-(int32_t)increment_counter);
1050*61046927SAndroid Build Coastguard Worker   const __m512i deltas = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
1051*61046927SAndroid Build Coastguard Worker   const __m512i masked_deltas = _mm512_and_si512(deltas, mask);
1052*61046927SAndroid Build Coastguard Worker   const __m512i low_words = _mm512_add_epi32(
1053*61046927SAndroid Build Coastguard Worker     _mm512_set1_epi32((int32_t)counter),
1054*61046927SAndroid Build Coastguard Worker     masked_deltas);
1055*61046927SAndroid Build Coastguard Worker   // The carry bit is 1 if the high bit of the word was 1 before addition and is
1056*61046927SAndroid Build Coastguard Worker   // 0 after.
1057*61046927SAndroid Build Coastguard Worker   // NOTE: It would be a bit more natural to use _mm512_cmp_epu32_mask to
1058*61046927SAndroid Build Coastguard Worker   // compute the carry bits here, and originally we did, but that intrinsic is
1059*61046927SAndroid Build Coastguard Worker   // broken under GCC 5.4. See https://github.com/BLAKE3-team/BLAKE3/issues/271.
1060*61046927SAndroid Build Coastguard Worker   const __m512i carries = _mm512_srli_epi32(
1061*61046927SAndroid Build Coastguard Worker     _mm512_andnot_si512(
1062*61046927SAndroid Build Coastguard Worker         low_words, // 0 after (gets inverted by andnot)
1063*61046927SAndroid Build Coastguard Worker         _mm512_set1_epi32((int32_t)counter)), // and 1 before
1064*61046927SAndroid Build Coastguard Worker     31);
1065*61046927SAndroid Build Coastguard Worker   const __m512i high_words = _mm512_add_epi32(
1066*61046927SAndroid Build Coastguard Worker     _mm512_set1_epi32((int32_t)(counter >> 32)),
1067*61046927SAndroid Build Coastguard Worker     carries);
1068*61046927SAndroid Build Coastguard Worker   *out_lo = low_words;
1069*61046927SAndroid Build Coastguard Worker   *out_hi = high_words;
1070*61046927SAndroid Build Coastguard Worker }
1071*61046927SAndroid Build Coastguard Worker 
1072*61046927SAndroid Build Coastguard Worker static
blake3_hash16_avx512(const uint8_t * const * inputs,size_t blocks,const uint32_t key[8],uint64_t counter,bool increment_counter,uint8_t flags,uint8_t flags_start,uint8_t flags_end,uint8_t * out)1073*61046927SAndroid Build Coastguard Worker void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
1074*61046927SAndroid Build Coastguard Worker                           const uint32_t key[8], uint64_t counter,
1075*61046927SAndroid Build Coastguard Worker                           bool increment_counter, uint8_t flags,
1076*61046927SAndroid Build Coastguard Worker                           uint8_t flags_start, uint8_t flags_end,
1077*61046927SAndroid Build Coastguard Worker                           uint8_t *out) {
1078*61046927SAndroid Build Coastguard Worker   __m512i h_vecs[8] = {
1079*61046927SAndroid Build Coastguard Worker       set1_512(key[0]), set1_512(key[1]), set1_512(key[2]), set1_512(key[3]),
1080*61046927SAndroid Build Coastguard Worker       set1_512(key[4]), set1_512(key[5]), set1_512(key[6]), set1_512(key[7]),
1081*61046927SAndroid Build Coastguard Worker   };
1082*61046927SAndroid Build Coastguard Worker   __m512i counter_low_vec, counter_high_vec;
1083*61046927SAndroid Build Coastguard Worker   load_counters16(counter, increment_counter, &counter_low_vec,
1084*61046927SAndroid Build Coastguard Worker                   &counter_high_vec);
1085*61046927SAndroid Build Coastguard Worker   uint8_t block_flags = flags | flags_start;
1086*61046927SAndroid Build Coastguard Worker 
1087*61046927SAndroid Build Coastguard Worker   for (size_t block = 0; block < blocks; block++) {
1088*61046927SAndroid Build Coastguard Worker     if (block + 1 == blocks) {
1089*61046927SAndroid Build Coastguard Worker       block_flags |= flags_end;
1090*61046927SAndroid Build Coastguard Worker     }
1091*61046927SAndroid Build Coastguard Worker     __m512i block_len_vec = set1_512(BLAKE3_BLOCK_LEN);
1092*61046927SAndroid Build Coastguard Worker     __m512i block_flags_vec = set1_512(block_flags);
1093*61046927SAndroid Build Coastguard Worker     __m512i msg_vecs[16];
1094*61046927SAndroid Build Coastguard Worker     transpose_msg_vecs16(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
1095*61046927SAndroid Build Coastguard Worker 
1096*61046927SAndroid Build Coastguard Worker     __m512i v[16] = {
1097*61046927SAndroid Build Coastguard Worker         h_vecs[0],       h_vecs[1],        h_vecs[2],       h_vecs[3],
1098*61046927SAndroid Build Coastguard Worker         h_vecs[4],       h_vecs[5],        h_vecs[6],       h_vecs[7],
1099*61046927SAndroid Build Coastguard Worker         set1_512(IV[0]), set1_512(IV[1]),  set1_512(IV[2]), set1_512(IV[3]),
1100*61046927SAndroid Build Coastguard Worker         counter_low_vec, counter_high_vec, block_len_vec,   block_flags_vec,
1101*61046927SAndroid Build Coastguard Worker     };
1102*61046927SAndroid Build Coastguard Worker     round_fn16(v, msg_vecs, 0);
1103*61046927SAndroid Build Coastguard Worker     round_fn16(v, msg_vecs, 1);
1104*61046927SAndroid Build Coastguard Worker     round_fn16(v, msg_vecs, 2);
1105*61046927SAndroid Build Coastguard Worker     round_fn16(v, msg_vecs, 3);
1106*61046927SAndroid Build Coastguard Worker     round_fn16(v, msg_vecs, 4);
1107*61046927SAndroid Build Coastguard Worker     round_fn16(v, msg_vecs, 5);
1108*61046927SAndroid Build Coastguard Worker     round_fn16(v, msg_vecs, 6);
1109*61046927SAndroid Build Coastguard Worker     h_vecs[0] = xor_512(v[0], v[8]);
1110*61046927SAndroid Build Coastguard Worker     h_vecs[1] = xor_512(v[1], v[9]);
1111*61046927SAndroid Build Coastguard Worker     h_vecs[2] = xor_512(v[2], v[10]);
1112*61046927SAndroid Build Coastguard Worker     h_vecs[3] = xor_512(v[3], v[11]);
1113*61046927SAndroid Build Coastguard Worker     h_vecs[4] = xor_512(v[4], v[12]);
1114*61046927SAndroid Build Coastguard Worker     h_vecs[5] = xor_512(v[5], v[13]);
1115*61046927SAndroid Build Coastguard Worker     h_vecs[6] = xor_512(v[6], v[14]);
1116*61046927SAndroid Build Coastguard Worker     h_vecs[7] = xor_512(v[7], v[15]);
1117*61046927SAndroid Build Coastguard Worker 
1118*61046927SAndroid Build Coastguard Worker     block_flags = flags;
1119*61046927SAndroid Build Coastguard Worker   }
1120*61046927SAndroid Build Coastguard Worker 
1121*61046927SAndroid Build Coastguard Worker   // transpose_vecs_512 operates on a 16x16 matrix of words, but we only have 8
1122*61046927SAndroid Build Coastguard Worker   // state vectors. Pad the matrix with zeros. After transposition, store the
1123*61046927SAndroid Build Coastguard Worker   // lower half of each vector.
1124*61046927SAndroid Build Coastguard Worker   __m512i padded[16] = {
1125*61046927SAndroid Build Coastguard Worker       h_vecs[0],   h_vecs[1],   h_vecs[2],   h_vecs[3],
1126*61046927SAndroid Build Coastguard Worker       h_vecs[4],   h_vecs[5],   h_vecs[6],   h_vecs[7],
1127*61046927SAndroid Build Coastguard Worker       set1_512(0), set1_512(0), set1_512(0), set1_512(0),
1128*61046927SAndroid Build Coastguard Worker       set1_512(0), set1_512(0), set1_512(0), set1_512(0),
1129*61046927SAndroid Build Coastguard Worker   };
1130*61046927SAndroid Build Coastguard Worker   transpose_vecs_512(padded);
1131*61046927SAndroid Build Coastguard Worker   _mm256_mask_storeu_epi32(&out[0 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[0]));
1132*61046927SAndroid Build Coastguard Worker   _mm256_mask_storeu_epi32(&out[1 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[1]));
1133*61046927SAndroid Build Coastguard Worker   _mm256_mask_storeu_epi32(&out[2 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[2]));
1134*61046927SAndroid Build Coastguard Worker   _mm256_mask_storeu_epi32(&out[3 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[3]));
1135*61046927SAndroid Build Coastguard Worker   _mm256_mask_storeu_epi32(&out[4 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[4]));
1136*61046927SAndroid Build Coastguard Worker   _mm256_mask_storeu_epi32(&out[5 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[5]));
1137*61046927SAndroid Build Coastguard Worker   _mm256_mask_storeu_epi32(&out[6 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[6]));
1138*61046927SAndroid Build Coastguard Worker   _mm256_mask_storeu_epi32(&out[7 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[7]));
1139*61046927SAndroid Build Coastguard Worker   _mm256_mask_storeu_epi32(&out[8 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[8]));
1140*61046927SAndroid Build Coastguard Worker   _mm256_mask_storeu_epi32(&out[9 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[9]));
1141*61046927SAndroid Build Coastguard Worker   _mm256_mask_storeu_epi32(&out[10 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[10]));
1142*61046927SAndroid Build Coastguard Worker   _mm256_mask_storeu_epi32(&out[11 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[11]));
1143*61046927SAndroid Build Coastguard Worker   _mm256_mask_storeu_epi32(&out[12 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[12]));
1144*61046927SAndroid Build Coastguard Worker   _mm256_mask_storeu_epi32(&out[13 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[13]));
1145*61046927SAndroid Build Coastguard Worker   _mm256_mask_storeu_epi32(&out[14 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[14]));
1146*61046927SAndroid Build Coastguard Worker   _mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));
1147*61046927SAndroid Build Coastguard Worker }
1148*61046927SAndroid Build Coastguard Worker 
1149*61046927SAndroid Build Coastguard Worker /*
1150*61046927SAndroid Build Coastguard Worker  * ----------------------------------------------------------------------------
1151*61046927SAndroid Build Coastguard Worker  * hash_many_avx512
1152*61046927SAndroid Build Coastguard Worker  * ----------------------------------------------------------------------------
1153*61046927SAndroid Build Coastguard Worker  */
1154*61046927SAndroid Build Coastguard Worker 
hash_one_avx512(const uint8_t * input,size_t blocks,const uint32_t key[8],uint64_t counter,uint8_t flags,uint8_t flags_start,uint8_t flags_end,uint8_t out[BLAKE3_OUT_LEN])1155*61046927SAndroid Build Coastguard Worker INLINE void hash_one_avx512(const uint8_t *input, size_t blocks,
1156*61046927SAndroid Build Coastguard Worker                             const uint32_t key[8], uint64_t counter,
1157*61046927SAndroid Build Coastguard Worker                             uint8_t flags, uint8_t flags_start,
1158*61046927SAndroid Build Coastguard Worker                             uint8_t flags_end, uint8_t out[BLAKE3_OUT_LEN]) {
1159*61046927SAndroid Build Coastguard Worker   uint32_t cv[8];
1160*61046927SAndroid Build Coastguard Worker   memcpy(cv, key, BLAKE3_KEY_LEN);
1161*61046927SAndroid Build Coastguard Worker   uint8_t block_flags = flags | flags_start;
1162*61046927SAndroid Build Coastguard Worker   while (blocks > 0) {
1163*61046927SAndroid Build Coastguard Worker     if (blocks == 1) {
1164*61046927SAndroid Build Coastguard Worker       block_flags |= flags_end;
1165*61046927SAndroid Build Coastguard Worker     }
1166*61046927SAndroid Build Coastguard Worker     blake3_compress_in_place_avx512(cv, input, BLAKE3_BLOCK_LEN, counter,
1167*61046927SAndroid Build Coastguard Worker                                     block_flags);
1168*61046927SAndroid Build Coastguard Worker     input = &input[BLAKE3_BLOCK_LEN];
1169*61046927SAndroid Build Coastguard Worker     blocks -= 1;
1170*61046927SAndroid Build Coastguard Worker     block_flags = flags;
1171*61046927SAndroid Build Coastguard Worker   }
1172*61046927SAndroid Build Coastguard Worker   memcpy(out, cv, BLAKE3_OUT_LEN);
1173*61046927SAndroid Build Coastguard Worker }
1174*61046927SAndroid Build Coastguard Worker 
blake3_hash_many_avx512(const uint8_t * const * inputs,size_t num_inputs,size_t blocks,const uint32_t key[8],uint64_t counter,bool increment_counter,uint8_t flags,uint8_t flags_start,uint8_t flags_end,uint8_t * out)1175*61046927SAndroid Build Coastguard Worker void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
1176*61046927SAndroid Build Coastguard Worker                              size_t blocks, const uint32_t key[8],
1177*61046927SAndroid Build Coastguard Worker                              uint64_t counter, bool increment_counter,
1178*61046927SAndroid Build Coastguard Worker                              uint8_t flags, uint8_t flags_start,
1179*61046927SAndroid Build Coastguard Worker                              uint8_t flags_end, uint8_t *out) {
1180*61046927SAndroid Build Coastguard Worker   while (num_inputs >= 16) {
1181*61046927SAndroid Build Coastguard Worker     blake3_hash16_avx512(inputs, blocks, key, counter, increment_counter, flags,
1182*61046927SAndroid Build Coastguard Worker                          flags_start, flags_end, out);
1183*61046927SAndroid Build Coastguard Worker     if (increment_counter) {
1184*61046927SAndroid Build Coastguard Worker       counter += 16;
1185*61046927SAndroid Build Coastguard Worker     }
1186*61046927SAndroid Build Coastguard Worker     inputs += 16;
1187*61046927SAndroid Build Coastguard Worker     num_inputs -= 16;
1188*61046927SAndroid Build Coastguard Worker     out = &out[16 * BLAKE3_OUT_LEN];
1189*61046927SAndroid Build Coastguard Worker   }
1190*61046927SAndroid Build Coastguard Worker   while (num_inputs >= 8) {
1191*61046927SAndroid Build Coastguard Worker     blake3_hash8_avx512(inputs, blocks, key, counter, increment_counter, flags,
1192*61046927SAndroid Build Coastguard Worker                         flags_start, flags_end, out);
1193*61046927SAndroid Build Coastguard Worker     if (increment_counter) {
1194*61046927SAndroid Build Coastguard Worker       counter += 8;
1195*61046927SAndroid Build Coastguard Worker     }
1196*61046927SAndroid Build Coastguard Worker     inputs += 8;
1197*61046927SAndroid Build Coastguard Worker     num_inputs -= 8;
1198*61046927SAndroid Build Coastguard Worker     out = &out[8 * BLAKE3_OUT_LEN];
1199*61046927SAndroid Build Coastguard Worker   }
1200*61046927SAndroid Build Coastguard Worker   while (num_inputs >= 4) {
1201*61046927SAndroid Build Coastguard Worker     blake3_hash4_avx512(inputs, blocks, key, counter, increment_counter, flags,
1202*61046927SAndroid Build Coastguard Worker                         flags_start, flags_end, out);
1203*61046927SAndroid Build Coastguard Worker     if (increment_counter) {
1204*61046927SAndroid Build Coastguard Worker       counter += 4;
1205*61046927SAndroid Build Coastguard Worker     }
1206*61046927SAndroid Build Coastguard Worker     inputs += 4;
1207*61046927SAndroid Build Coastguard Worker     num_inputs -= 4;
1208*61046927SAndroid Build Coastguard Worker     out = &out[4 * BLAKE3_OUT_LEN];
1209*61046927SAndroid Build Coastguard Worker   }
1210*61046927SAndroid Build Coastguard Worker   while (num_inputs > 0) {
1211*61046927SAndroid Build Coastguard Worker     hash_one_avx512(inputs[0], blocks, key, counter, flags, flags_start,
1212*61046927SAndroid Build Coastguard Worker                     flags_end, out);
1213*61046927SAndroid Build Coastguard Worker     if (increment_counter) {
1214*61046927SAndroid Build Coastguard Worker       counter += 1;
1215*61046927SAndroid Build Coastguard Worker     }
1216*61046927SAndroid Build Coastguard Worker     inputs += 1;
1217*61046927SAndroid Build Coastguard Worker     num_inputs -= 1;
1218*61046927SAndroid Build Coastguard Worker     out = &out[BLAKE3_OUT_LEN];
1219*61046927SAndroid Build Coastguard Worker   }
1220*61046927SAndroid Build Coastguard Worker }
1221