1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <assert.h>
7 
8 #include <immintrin.h>
9 
10 #include <xnnpack/common.h>
11 #include <xnnpack/dwconv.h>
12 #include <xnnpack/gemm.h>
13 #include <xnnpack/igemm.h>
14 #include <xnnpack/intrinsics-polyfill.h>
15 #include <xnnpack/lut.h>
16 #include <xnnpack/math.h>
17 #include <xnnpack/prelu.h>
18 #include <xnnpack/unaligned.h>
19 #include <xnnpack/vadd.h>
20 #include <xnnpack/vbinary.h>
21 #include <xnnpack/vcvt.h>
22 #include <xnnpack/vlrelu.h>
23 #include <xnnpack/vmul.h>
24 #include <xnnpack/vunary.h>
25 
26 
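// Converts IEEE FP16 values (passed as uint16_t) to FP32, 16 elements per main-loop iteration.
// The sign bit is split off first; normal values are rebuilt by shifting the exponent and
// mantissa into FP32 position and rescaling by exp_scale (2**-112); subnormal values are
// recovered with the magic-bias trick (interleave with magic_mask, reinterpret as FP32, then
// subtract magic_bias). The denorm_cutoff comparison selects between the two results per lane,
// and the sign is OR-ed back in at the end.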
27 void xnn_f16_f32_vcvt_ukernel__avx_int16_x16(
28     size_t n,
29     const void* input,
30     float* output,
31     const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
32 {
33   assert(n != 0);
34   assert(n % sizeof(uint16_t) == 0);
35   assert(input != NULL);
36   assert(output != NULL);
37 
38   const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask);
39   const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset);
40   const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale);
41   const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask);
42   const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias);
43   const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff);
44 
45   const uint16_t* i = (const uint16_t*) input;
46   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
47     const __m128i vh0 = _mm_loadu_si128((const __m128i*) i);
48     const __m128i vh1 = _mm_loadu_si128((const __m128i*) (i + 8));
49     i += 16;
50 
51     const __m128i vsign0 = _mm_and_si128(vh0, vsign_mask);
52     const __m128i vsign1 = _mm_and_si128(vh1, vsign_mask);
53 
54     const __m128i vnonsign0 = _mm_xor_si128(vh0, vsign0);
55     const __m128i vnonsign1 = _mm_xor_si128(vh1, vsign1);
56 
57     const __m128i vprenorm0 = _mm_slli_epi16(vnonsign0, 13);
58     const __m128i vprenorm1 = _mm_add_epi16(_mm_srli_epi16(vnonsign0, 3), vexp_offset);
59     const __m128i vprenorm2 = _mm_slli_epi16(vnonsign1, 13);
60     const __m128i vprenorm3 = _mm_add_epi16(_mm_srli_epi16(vnonsign1, 3), vexp_offset);
61 
62     const __m128i vnorm0 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm0, vprenorm1)), vexp_scale));
63     const __m128i vnorm1 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm0, vprenorm1)), vexp_scale));
64     const __m128i vnorm2 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm2, vprenorm3)), vexp_scale));
65     const __m128i vnorm3 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm2, vprenorm3)), vexp_scale));
66 
67     const __m128i vdenorm0 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign0, vmagic_mask)), vmagic_bias));
68     const __m128i vdenorm1 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign0, vmagic_mask)), vmagic_bias));
69     const __m128i vdenorm2 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign1, vmagic_mask)), vmagic_bias));
70     const __m128i vdenorm3 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign1, vmagic_mask)), vmagic_bias));
71 
72     const __m128i vmask0 = _mm_cmpgt_epi16(vnonsign0, vdenorm_cutoff);
73     const __m128i vmask1 = _mm_cmpgt_epi16(vnonsign1, vdenorm_cutoff);
74 
75     const __m128i vf0 = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign0),
76       _mm_blendv_epi8(vdenorm0, vnorm0, _mm_cvtepi16_epi32(vmask0)));
77     const __m128i vf1 = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign0),
78       _mm_blendv_epi8(vdenorm1, vnorm1, _mm_unpackhi_epi16(vmask0, vmask0)));
79     const __m128i vf2 = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign1),
80       _mm_blendv_epi8(vdenorm2, vnorm2, _mm_cvtepi16_epi32(vmask1)));
81     const __m128i vf3 = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign1),
82       _mm_blendv_epi8(vdenorm3, vnorm3, _mm_unpackhi_epi16(vmask1, vmask1)));
83 
84     _mm_storeu_ps(output, _mm_castsi128_ps(vf0));
85     _mm_storeu_ps(output + 4, _mm_castsi128_ps(vf1));
86     _mm_storeu_ps(output + 8, _mm_castsi128_ps(vf2));
87     _mm_storeu_ps(output + 12, _mm_castsi128_ps(vf3));
88     output += 16;
89   }
90   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
91     const __m128i vh = _mm_loadu_si128((const __m128i*) i);
92     i += 8;
93 
94     const __m128i vsign = _mm_and_si128(vh, vsign_mask);
95 
96     const __m128i vnonsign = _mm_xor_si128(vh, vsign);
97 
98     const __m128i vprenorm_lo = _mm_slli_epi16(vnonsign, 13);
99     const __m128i vprenorm_hi = _mm_add_epi16(_mm_srli_epi16(vnonsign, 3), vexp_offset);
100 
101     const __m128i vnorm_lo = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));
102     const __m128i vnorm_hi = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));
103 
104     const __m128i vdenorm_lo = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign, vmagic_mask)), vmagic_bias));
105     const __m128i vdenorm_hi = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign, vmagic_mask)), vmagic_bias));
106 
107     const __m128i vmask = _mm_cmpgt_epi16(vnonsign, vdenorm_cutoff);
108 
109     const __m128i vf_lo = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign),
110       _mm_blendv_epi8(vdenorm_lo, vnorm_lo, _mm_cvtepi16_epi32(vmask)));
111 
112     const __m128i vf_hi = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign),
113       _mm_blendv_epi8(vdenorm_hi, vnorm_hi, _mm_unpackhi_epi16(vmask, vmask)));
114 
115     _mm_storeu_ps(output, _mm_castsi128_ps(vf_lo));
116     _mm_storeu_ps(output + 4, _mm_castsi128_ps(vf_hi));
117     output += 8;
118   }
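  // Remainder of 1-7 elements: load a full vector (the kernel is annotated XNN_OOB_READS, so
  // reading past the end of the input is allowed), convert as above, and store only the valid
  // lanes in 4/2/1-element pieces.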
119   if XNN_UNPREDICTABLE(n != 0) {
120     const __m128i vh = _mm_loadu_si128((const __m128i*) i);
121 
122     const __m128i vsign = _mm_and_si128(vh, vsign_mask);
123 
124     const __m128i vnonsign = _mm_xor_si128(vh, vsign);
125 
126     const __m128i vprenorm_lo = _mm_slli_epi16(vnonsign, 13);
127     const __m128i vprenorm_hi = _mm_add_epi16(_mm_srli_epi16(vnonsign, 3), vexp_offset);
128 
129     const __m128i vnorm_lo = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));
130     const __m128i vnorm_hi = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));
131 
132     const __m128i vdenorm_lo = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign, vmagic_mask)), vmagic_bias));
133     const __m128i vdenorm_hi = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign, vmagic_mask)), vmagic_bias));
134 
135     const __m128i vmask = _mm_cmpgt_epi16(vnonsign, vdenorm_cutoff);
136 
137     __m128i vf = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign),
138       _mm_blendv_epi8(vdenorm_lo, vnorm_lo, _mm_cvtepi16_epi32(vmask)));
139 
140     if (n & (4 * sizeof(uint16_t))) {
141       _mm_storeu_ps(output, _mm_castsi128_ps(vf));
142       output += 4;
143 
144       vf = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign),
145         _mm_blendv_epi8(vdenorm_hi, vnorm_hi, _mm_unpackhi_epi16(vmask, vmask)));
146     }
147     if (n & (2 * sizeof(uint16_t))) {
148       _mm_storel_pi((__m64*) output, _mm_castsi128_ps(vf));
149       output += 2;
150 
151       vf = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(vf), _mm_castsi128_ps(vf)));
152     }
153     if (n & (1 * sizeof(uint16_t))) {
154       _mm_store_ss(output, _mm_castsi128_ps(vf));
155     }
156   }
157 }
158 
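// Depthwise-convolution microkernel with 3 taps per output element and a channel tile of 16
// (plus 8-channel and 1..7-channel remainder paths below). For every output pixel the
// accumulator starts from the packed bias, adds input*kernel products for rows i0..i2, and is
// clamped to [min, max] before being stored. The weights are packed per 16-channel group as
// [bias x16, k0 x16, k1 x16, k2 x16], matching the w offsets used below.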
159 void xnn_f32_dwconv_minmax_ukernel_up16x3__avx(
160     size_t channels,
161     size_t output_width,
162     const float** input,
163     const float* weights,
164     float* output,
165     size_t input_stride,
166     size_t output_increment,
167     size_t input_offset,
168     const float* zero,
169     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
170 {
171   assert(channels != 0);
172   assert(output_width != 0);
173 
174   const __m256 vmax = _mm256_load_ps(params->avx.max);
175   const __m256 vmin = _mm256_load_ps(params->avx.min);
176   do {
177     const float* i0 = input[0];
178     assert(i0 != NULL);
179     if XNN_UNPREDICTABLE(i0 != zero) {
180       i0 = (const float*) ((uintptr_t) i0 + input_offset);
181     }
182     const float* i1 = input[1];
183     assert(i1 != NULL);
184     if XNN_UNPREDICTABLE(i1 != zero) {
185       i1 = (const float*) ((uintptr_t) i1 + input_offset);
186     }
187     const float* i2 = input[2];
188     assert(i2 != NULL);
189     if XNN_UNPREDICTABLE(i2 != zero) {
190       i2 = (const float*) ((uintptr_t) i2 + input_offset);
191     }
192     input = (const float**) ((uintptr_t) input + input_stride);
193 
194     size_t c = channels;
195     const float* w = weights;
196     for (; c >= 16; c -= 16) {
197       __m256 vacc01234567p0 = _mm256_load_ps(w);
198       __m256 vacc89ABCDEFp0 = _mm256_load_ps(w + 8);
199 
200 
201       const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
202       const __m256 vi0x89ABCDEF = _mm256_loadu_ps(i0 + 8);
203       i0 += 16;
204 
205       const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
206       const __m256 vk0x89ABCDEF = _mm256_load_ps(w + 24);
207       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
208       vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi0x89ABCDEF, vk0x89ABCDEF));
209 
210       const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
211       const __m256 vi1x89ABCDEF = _mm256_loadu_ps(i1 + 8);
212       i1 += 16;
213 
214       const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
215       const __m256 vk1x89ABCDEF = _mm256_load_ps(w + 40);
216       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
217       vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi1x89ABCDEF, vk1x89ABCDEF));
218 
219       const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
220       const __m256 vi2x89ABCDEF = _mm256_loadu_ps(i2 + 8);
221       i2 += 16;
222 
223       const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
224       const __m256 vk2x89ABCDEF = _mm256_load_ps(w + 56);
225       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
226       vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi2x89ABCDEF, vk2x89ABCDEF));
227 
228       w += 64;
229 
230 
231       __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
232       __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin);
233       vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
234       vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vmax);
235 
236       _mm256_storeu_ps(output, vacc01234567);
237       _mm256_storeu_ps(output + 8, vacc89ABCDEF);
238       output += 16;
239     }
240     for (; c >= 8; c -= 8) {
241       __m256 vacc01234567p0 = _mm256_load_ps(w);
242 
243       const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
244       i0 += 8;
245 
246       const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
247       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
248 
249       const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
250       i1 += 8;
251 
252       const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
253       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
254 
255       const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
256       i2 += 8;
257 
258       const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
259       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
260 
261       w += 8;
262 
263 
264       __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
265       vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
266 
267       _mm256_storeu_ps(output, vacc01234567);
268       output += 8;
269     }
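    // 1-7 remaining channels: a mask loaded from params->avx.mask_table selects only the
    // valid channels for _mm256_maskload_ps, and the clamped result is written out in
    // 4/2/1-element pieces.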
270     if XNN_UNLIKELY(c != 0) {
271       assert(c >= 1);
272       assert(c <= 7);
273       const __m256i vmask = _mm256_loadu_si256((const __m256i*) &params->avx.mask_table[7 - c]);
274 
275       __m256 vacc01234567p0 = _mm256_load_ps(w);
276 
277       const __m256 vi0x01234567 = _mm256_maskload_ps(i0, vmask);
278       const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
279       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
280 
281       const __m256 vi1x01234567 = _mm256_maskload_ps(i1, vmask);
282       const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
283       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
284 
285       const __m256 vi2x01234567 = _mm256_maskload_ps(i2, vmask);
286       const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
287       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
288 
289 
290       __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
291       vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
292 
293       __m128 vacc0123 = _mm256_castps256_ps128(vacc01234567);
294       if (c & 4) {
295         _mm_storeu_ps(output, vacc0123);
296         vacc0123 = _mm256_extractf128_ps(vacc01234567, 1);
297         output += 4;
298       }
299       if (c & 2) {
300         _mm_storel_pi((__m64*) output, vacc0123);
301         vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
302         output += 2;
303       }
304       if (c & 1) {
305         _mm_store_ss(output, vacc0123);
306         output += 1;
307       }
308     }
309 
310     output = (float*) ((uintptr_t) output + output_increment);
311   } while (--output_width != 0);
312 }
313 
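// Same structure as the 3-tap kernel above, but with 4 taps (input rows i0..i3); weights are
// packed per 16-channel group as [bias x16, k0 x16, ..., k3 x16] (80 floats).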
314 void xnn_f32_dwconv_minmax_ukernel_up16x4__avx(
315     size_t channels,
316     size_t output_width,
317     const float** input,
318     const float* weights,
319     float* output,
320     size_t input_stride,
321     size_t output_increment,
322     size_t input_offset,
323     const float* zero,
324     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
325 {
326   assert(channels != 0);
327   assert(output_width != 0);
328 
329   const __m256 vmax = _mm256_load_ps(params->avx.max);
330   const __m256 vmin = _mm256_load_ps(params->avx.min);
331   do {
332     const float* i0 = input[0];
333     assert(i0 != NULL);
334     if XNN_UNPREDICTABLE(i0 != zero) {
335       i0 = (const float*) ((uintptr_t) i0 + input_offset);
336     }
337     const float* i1 = input[1];
338     assert(i1 != NULL);
339     if XNN_UNPREDICTABLE(i1 != zero) {
340       i1 = (const float*) ((uintptr_t) i1 + input_offset);
341     }
342     const float* i2 = input[2];
343     assert(i2 != NULL);
344     if XNN_UNPREDICTABLE(i2 != zero) {
345       i2 = (const float*) ((uintptr_t) i2 + input_offset);
346     }
347     const float* i3 = input[3];
348     assert(i3 != NULL);
349     if XNN_UNPREDICTABLE(i3 != zero) {
350       i3 = (const float*) ((uintptr_t) i3 + input_offset);
351     }
352     input = (const float**) ((uintptr_t) input + input_stride);
353 
354     size_t c = channels;
355     const float* w = weights;
356     for (; c >= 16; c -= 16) {
357       __m256 vacc01234567p0 = _mm256_load_ps(w);
358       __m256 vacc89ABCDEFp0 = _mm256_load_ps(w + 8);
359 
360 
361       const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
362       const __m256 vi0x89ABCDEF = _mm256_loadu_ps(i0 + 8);
363       i0 += 16;
364 
365       const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
366       const __m256 vk0x89ABCDEF = _mm256_load_ps(w + 24);
367       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
368       vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi0x89ABCDEF, vk0x89ABCDEF));
369 
370       const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
371       const __m256 vi1x89ABCDEF = _mm256_loadu_ps(i1 + 8);
372       i1 += 16;
373 
374       const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
375       const __m256 vk1x89ABCDEF = _mm256_load_ps(w + 40);
376       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
377       vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi1x89ABCDEF, vk1x89ABCDEF));
378 
379       const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
380       const __m256 vi2x89ABCDEF = _mm256_loadu_ps(i2 + 8);
381       i2 += 16;
382 
383       const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
384       const __m256 vk2x89ABCDEF = _mm256_load_ps(w + 56);
385       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
386       vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi2x89ABCDEF, vk2x89ABCDEF));
387 
388       const __m256 vi3x01234567 = _mm256_loadu_ps(i3);
389       const __m256 vi3x89ABCDEF = _mm256_loadu_ps(i3 + 8);
390       i3 += 16;
391 
392       const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
393       const __m256 vk3x89ABCDEF = _mm256_load_ps(w + 72);
394       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567));
395       vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi3x89ABCDEF, vk3x89ABCDEF));
396 
397       w += 80;
398 
399 
400       __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
401       __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin);
402       vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
403       vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vmax);
404 
405       _mm256_storeu_ps(output, vacc01234567);
406       _mm256_storeu_ps(output + 8, vacc89ABCDEF);
407       output += 16;
408     }
409     for (; c >= 8; c -= 8) {
410       __m256 vacc01234567p0 = _mm256_load_ps(w);
411 
412       const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
413       i0 += 8;
414 
415       const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
416       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
417 
418       const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
419       i1 += 8;
420 
421       const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
422       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
423 
424       const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
425       i2 += 8;
426 
427       const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
428       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
429 
430       const __m256 vi3x01234567 = _mm256_loadu_ps(i3);
431       i3 += 8;
432 
433       const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
434       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567));
435 
436       w += 8;
437 
438 
439       __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
440       vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
441 
442       _mm256_storeu_ps(output, vacc01234567);
443       output += 8;
444     }
445     if XNN_UNLIKELY(c != 0) {
446       assert(c >= 1);
447       assert(c <= 7);
448       const __m256i vmask = _mm256_loadu_si256((const __m256i*) &params->avx.mask_table[7 - c]);
449 
450       __m256 vacc01234567p0 = _mm256_load_ps(w);
451 
452       const __m256 vi0x01234567 = _mm256_maskload_ps(i0, vmask);
453       const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
454       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
455 
456       const __m256 vi1x01234567 = _mm256_maskload_ps(i1, vmask);
457       const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
458       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
459 
460       const __m256 vi2x01234567 = _mm256_maskload_ps(i2, vmask);
461       const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
462       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
463 
464       const __m256 vi3x01234567 = _mm256_maskload_ps(i3, vmask);
465       const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
466       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567));
467 
468 
469       __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
470       vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
471 
472       __m128 vacc0123 = _mm256_castps256_ps128(vacc01234567);
473       if (c & 4) {
474         _mm_storeu_ps(output, vacc0123);
475         vacc0123 = _mm256_extractf128_ps(vacc01234567, 1);
476         output += 4;
477       }
478       if (c & 2) {
479         _mm_storel_pi((__m64*) output, vacc0123);
480         vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
481         output += 2;
482       }
483       if (c & 1) {
484         _mm_store_ss(output, vacc0123);
485         output += 1;
486       }
487     }
488 
489     output = (float*) ((uintptr_t) output + output_increment);
490   } while (--output_width != 0);
491 }
492 
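// 9-tap variant (e.g. a 3x3 depthwise window): input rows i0..i8 and 160 packed floats per
// 16-channel weight group, otherwise identical in structure to the kernels above.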
493 void xnn_f32_dwconv_minmax_ukernel_up16x9__avx(
494     size_t channels,
495     size_t output_width,
496     const float** input,
497     const float* weights,
498     float* output,
499     size_t input_stride,
500     size_t output_increment,
501     size_t input_offset,
502     const float* zero,
503     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
504 {
505   assert(channels != 0);
506   assert(output_width != 0);
507 
508   const __m256 vmax = _mm256_load_ps(params->avx.max);
509   const __m256 vmin = _mm256_load_ps(params->avx.min);
510   do {
511     const float* i0 = input[0];
512     assert(i0 != NULL);
513     if XNN_UNPREDICTABLE(i0 != zero) {
514       i0 = (const float*) ((uintptr_t) i0 + input_offset);
515     }
516     const float* i1 = input[1];
517     assert(i1 != NULL);
518     if XNN_UNPREDICTABLE(i1 != zero) {
519       i1 = (const float*) ((uintptr_t) i1 + input_offset);
520     }
521     const float* i2 = input[2];
522     assert(i2 != NULL);
523     if XNN_UNPREDICTABLE(i2 != zero) {
524       i2 = (const float*) ((uintptr_t) i2 + input_offset);
525     }
526     const float* i3 = input[3];
527     assert(i3 != NULL);
528     if XNN_UNPREDICTABLE(i3 != zero) {
529       i3 = (const float*) ((uintptr_t) i3 + input_offset);
530     }
531     const float* i4 = input[4];
532     assert(i4 != NULL);
533     if XNN_UNPREDICTABLE(i4 != zero) {
534       i4 = (const float*) ((uintptr_t) i4 + input_offset);
535     }
536     const float* i5 = input[5];
537     assert(i5 != NULL);
538     if XNN_UNPREDICTABLE(i5 != zero) {
539       i5 = (const float*) ((uintptr_t) i5 + input_offset);
540     }
541     const float* i6 = input[6];
542     assert(i6 != NULL);
543     if XNN_UNPREDICTABLE(i6 != zero) {
544       i6 = (const float*) ((uintptr_t) i6 + input_offset);
545     }
546     const float* i7 = input[7];
547     assert(i7 != NULL);
548     if XNN_UNPREDICTABLE(i7 != zero) {
549       i7 = (const float*) ((uintptr_t) i7 + input_offset);
550     }
551     const float* i8 = input[8];
552     assert(i8 != NULL);
553     if XNN_UNPREDICTABLE(i8 != zero) {
554       i8 = (const float*) ((uintptr_t) i8 + input_offset);
555     }
556     input = (const float**) ((uintptr_t) input + input_stride);
557 
558     size_t c = channels;
559     const float* w = weights;
560     for (; c >= 16; c -= 16) {
561       __m256 vacc01234567p0 = _mm256_load_ps(w);
562       __m256 vacc89ABCDEFp0 = _mm256_load_ps(w + 8);
563 
564 
565       const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
566       const __m256 vi0x89ABCDEF = _mm256_loadu_ps(i0 + 8);
567       i0 += 16;
568 
569       const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
570       const __m256 vk0x89ABCDEF = _mm256_load_ps(w + 24);
571       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
572       vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi0x89ABCDEF, vk0x89ABCDEF));
573 
574       const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
575       const __m256 vi1x89ABCDEF = _mm256_loadu_ps(i1 + 8);
576       i1 += 16;
577 
578       const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
579       const __m256 vk1x89ABCDEF = _mm256_load_ps(w + 40);
580       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
581       vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi1x89ABCDEF, vk1x89ABCDEF));
582 
583       const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
584       const __m256 vi2x89ABCDEF = _mm256_loadu_ps(i2 + 8);
585       i2 += 16;
586 
587       const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
588       const __m256 vk2x89ABCDEF = _mm256_load_ps(w + 56);
589       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
590       vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi2x89ABCDEF, vk2x89ABCDEF));
591 
592       const __m256 vi3x01234567 = _mm256_loadu_ps(i3);
593       const __m256 vi3x89ABCDEF = _mm256_loadu_ps(i3 + 8);
594       i3 += 16;
595 
596       const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
597       const __m256 vk3x89ABCDEF = _mm256_load_ps(w + 72);
598       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567));
599       vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi3x89ABCDEF, vk3x89ABCDEF));
600 
601       const __m256 vi4x01234567 = _mm256_loadu_ps(i4);
602       const __m256 vi4x89ABCDEF = _mm256_loadu_ps(i4 + 8);
603       i4 += 16;
604 
605       const __m256 vk4x01234567 = _mm256_load_ps(w + 80);
606       const __m256 vk4x89ABCDEF = _mm256_load_ps(w + 88);
607       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));
608       vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi4x89ABCDEF, vk4x89ABCDEF));
609 
610       const __m256 vi5x01234567 = _mm256_loadu_ps(i5);
611       const __m256 vi5x89ABCDEF = _mm256_loadu_ps(i5 + 8);
612       i5 += 16;
613 
614       const __m256 vk5x01234567 = _mm256_load_ps(w + 96);
615       const __m256 vk5x89ABCDEF = _mm256_load_ps(w + 104);
616       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi5x01234567, vk5x01234567));
617       vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi5x89ABCDEF, vk5x89ABCDEF));
618 
619       const __m256 vi6x01234567 = _mm256_loadu_ps(i6);
620       const __m256 vi6x89ABCDEF = _mm256_loadu_ps(i6 + 8);
621       i6 += 16;
622 
623       const __m256 vk6x01234567 = _mm256_load_ps(w + 112);
624       const __m256 vk6x89ABCDEF = _mm256_load_ps(w + 120);
625       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi6x01234567, vk6x01234567));
626       vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi6x89ABCDEF, vk6x89ABCDEF));
627 
628       const __m256 vi7x01234567 = _mm256_loadu_ps(i7);
629       const __m256 vi7x89ABCDEF = _mm256_loadu_ps(i7 + 8);
630       i7 += 16;
631 
632       const __m256 vk7x01234567 = _mm256_load_ps(w + 128);
633       const __m256 vk7x89ABCDEF = _mm256_load_ps(w + 136);
634       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi7x01234567, vk7x01234567));
635       vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi7x89ABCDEF, vk7x89ABCDEF));
636 
637       const __m256 vi8x01234567 = _mm256_loadu_ps(i8);
638       const __m256 vi8x89ABCDEF = _mm256_loadu_ps(i8 + 8);
639       i8 += 16;
640 
641       const __m256 vk8x01234567 = _mm256_load_ps(w + 144);
642       const __m256 vk8x89ABCDEF = _mm256_load_ps(w + 152);
643       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567));
644       vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi8x89ABCDEF, vk8x89ABCDEF));
645 
646       w += 160;
647 
648 
649       __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
650       __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin);
651       vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
652       vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vmax);
653 
654       _mm256_storeu_ps(output, vacc01234567);
655       _mm256_storeu_ps(output + 8, vacc89ABCDEF);
656       output += 16;
657     }
658     for (; c >= 8; c -= 8) {
659       __m256 vacc01234567p0 = _mm256_load_ps(w);
660 
661       const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
662       i0 += 8;
663 
664       const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
665       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
666 
667       const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
668       i1 += 8;
669 
670       const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
671       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
672 
673       const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
674       i2 += 8;
675 
676       const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
677       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
678 
679       const __m256 vi3x01234567 = _mm256_loadu_ps(i3);
680       i3 += 8;
681 
682       const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
683       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567));
684 
685       const __m256 vi4x01234567 = _mm256_loadu_ps(i4);
686       i4 += 8;
687 
688       const __m256 vk4x01234567 = _mm256_load_ps(w + 80);
689       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));
690 
691       const __m256 vi5x01234567 = _mm256_loadu_ps(i5);
692       i5 += 8;
693 
694       const __m256 vk5x01234567 = _mm256_load_ps(w + 96);
695       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi5x01234567, vk5x01234567));
696 
697       const __m256 vi6x01234567 = _mm256_loadu_ps(i6);
698       i6 += 8;
699 
700       const __m256 vk6x01234567 = _mm256_load_ps(w + 112);
701       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi6x01234567, vk6x01234567));
702 
703       const __m256 vi7x01234567 = _mm256_loadu_ps(i7);
704       i7 += 8;
705 
706       const __m256 vk7x01234567 = _mm256_load_ps(w + 128);
707       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi7x01234567, vk7x01234567));
708 
709       const __m256 vi8x01234567 = _mm256_loadu_ps(i8);
710       i8 += 8;
711 
712       const __m256 vk8x01234567 = _mm256_load_ps(w + 144);
713       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567));
714 
715       w += 8;
716 
717 
718       __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
719       vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
720 
721       _mm256_storeu_ps(output, vacc01234567);
722       output += 8;
723     }
724     if XNN_UNLIKELY(c != 0) {
725       assert(c >= 1);
726       assert(c <= 7);
727       const __m256i vmask = _mm256_loadu_si256((const __m256i*) &params->avx.mask_table[7 - c]);
728 
729       __m256 vacc01234567p0 = _mm256_load_ps(w);
730 
731       const __m256 vi0x01234567 = _mm256_maskload_ps(i0, vmask);
732       const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
733       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
734 
735       const __m256 vi1x01234567 = _mm256_maskload_ps(i1, vmask);
736       const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
737       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
738 
739       const __m256 vi2x01234567 = _mm256_maskload_ps(i2, vmask);
740       const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
741       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
742 
743       const __m256 vi3x01234567 = _mm256_maskload_ps(i3, vmask);
744       const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
745       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567));
746 
747       const __m256 vi4x01234567 = _mm256_maskload_ps(i4, vmask);
748       const __m256 vk4x01234567 = _mm256_load_ps(w + 80);
749       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));
750 
751       const __m256 vi5x01234567 = _mm256_maskload_ps(i5, vmask);
752       const __m256 vk5x01234567 = _mm256_load_ps(w + 96);
753       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi5x01234567, vk5x01234567));
754 
755       const __m256 vi6x01234567 = _mm256_maskload_ps(i6, vmask);
756       const __m256 vk6x01234567 = _mm256_load_ps(w + 112);
757       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi6x01234567, vk6x01234567));
758 
759       const __m256 vi7x01234567 = _mm256_maskload_ps(i7, vmask);
760       const __m256 vk7x01234567 = _mm256_load_ps(w + 128);
761       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi7x01234567, vk7x01234567));
762 
763       const __m256 vi8x01234567 = _mm256_maskload_ps(i8, vmask);
764       const __m256 vk8x01234567 = _mm256_load_ps(w + 144);
765       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567));
766 
767 
768       __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
769       vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
770 
771       __m128 vacc0123 = _mm256_castps256_ps128(vacc01234567);
772       if (c & 4) {
773         _mm_storeu_ps(output, vacc0123);
774         vacc0123 = _mm256_extractf128_ps(vacc01234567, 1);
775         output += 4;
776       }
777       if (c & 2) {
778         _mm_storel_pi((__m64*) output, vacc0123);
779         vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
780         output += 2;
781       }
782       if (c & 1) {
783         _mm_store_ss(output, vacc0123);
784         output += 1;
785       }
786     }
787 
788     output = (float*) ((uintptr_t) output + output_increment);
789   } while (--output_width != 0);
790 }
791 
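// 25-tap variant (e.g. a 5x5 depthwise window) with a channel tile of 8: input rows i0..i24
// and 208 packed floats per 8-channel weight group ([bias x8] followed by 25 kernel rows of
// 8 floats each).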
792 void xnn_f32_dwconv_minmax_ukernel_up8x25__avx(
793     size_t channels,
794     size_t output_width,
795     const float** input,
796     const float* weights,
797     float* output,
798     size_t input_stride,
799     size_t output_increment,
800     size_t input_offset,
801     const float* zero,
802     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
803 {
804   assert(channels != 0);
805   assert(output_width != 0);
806 
807   const __m256 vmax = _mm256_load_ps(params->avx.max);
808   const __m256 vmin = _mm256_load_ps(params->avx.min);
809   do {
810     const float* i0 = input[0];
811     assert(i0 != NULL);
812     if XNN_UNPREDICTABLE(i0 != zero) {
813       i0 = (const float*) ((uintptr_t) i0 + input_offset);
814     }
815     const float* i1 = input[1];
816     assert(i1 != NULL);
817     if XNN_UNPREDICTABLE(i1 != zero) {
818       i1 = (const float*) ((uintptr_t) i1 + input_offset);
819     }
820     const float* i2 = input[2];
821     assert(i2 != NULL);
822     if XNN_UNPREDICTABLE(i2 != zero) {
823       i2 = (const float*) ((uintptr_t) i2 + input_offset);
824     }
825     const float* i3 = input[3];
826     assert(i3 != NULL);
827     if XNN_UNPREDICTABLE(i3 != zero) {
828       i3 = (const float*) ((uintptr_t) i3 + input_offset);
829     }
830     const float* i4 = input[4];
831     assert(i4 != NULL);
832     if XNN_UNPREDICTABLE(i4 != zero) {
833       i4 = (const float*) ((uintptr_t) i4 + input_offset);
834     }
835     const float* i5 = input[5];
836     assert(i5 != NULL);
837     if XNN_UNPREDICTABLE(i5 != zero) {
838       i5 = (const float*) ((uintptr_t) i5 + input_offset);
839     }
840     const float* i6 = input[6];
841     assert(i6 != NULL);
842     if XNN_UNPREDICTABLE(i6 != zero) {
843       i6 = (const float*) ((uintptr_t) i6 + input_offset);
844     }
845     const float* i7 = input[7];
846     assert(i7 != NULL);
847     if XNN_UNPREDICTABLE(i7 != zero) {
848       i7 = (const float*) ((uintptr_t) i7 + input_offset);
849     }
850     const float* i8 = input[8];
851     assert(i8 != NULL);
852     if XNN_UNPREDICTABLE(i8 != zero) {
853       i8 = (const float*) ((uintptr_t) i8 + input_offset);
854     }
855     const float* i9 = input[9];
856     assert(i9 != NULL);
857     if XNN_UNPREDICTABLE(i9 != zero) {
858       i9 = (const float*) ((uintptr_t) i9 + input_offset);
859     }
860     const float* i10 = input[10];
861     assert(i10 != NULL);
862     if XNN_UNPREDICTABLE(i10 != zero) {
863       i10 = (const float*) ((uintptr_t) i10 + input_offset);
864     }
865     const float* i11 = input[11];
866     assert(i11 != NULL);
867     if XNN_UNPREDICTABLE(i11 != zero) {
868       i11 = (const float*) ((uintptr_t) i11 + input_offset);
869     }
870     const float* i12 = input[12];
871     assert(i12 != NULL);
872     if XNN_UNPREDICTABLE(i12 != zero) {
873       i12 = (const float*) ((uintptr_t) i12 + input_offset);
874     }
875     const float* i13 = input[13];
876     assert(i13 != NULL);
877     if XNN_UNPREDICTABLE(i13 != zero) {
878       i13 = (const float*) ((uintptr_t) i13 + input_offset);
879     }
880     const float* i14 = input[14];
881     assert(i14 != NULL);
882     if XNN_UNPREDICTABLE(i14 != zero) {
883       i14 = (const float*) ((uintptr_t) i14 + input_offset);
884     }
885     const float* i15 = input[15];
886     assert(i15 != NULL);
887     if XNN_UNPREDICTABLE(i15 != zero) {
888       i15 = (const float*) ((uintptr_t) i15 + input_offset);
889     }
890     const float* i16 = input[16];
891     assert(i16 != NULL);
892     if XNN_UNPREDICTABLE(i16 != zero) {
893       i16 = (const float*) ((uintptr_t) i16 + input_offset);
894     }
895     const float* i17 = input[17];
896     assert(i17 != NULL);
897     if XNN_UNPREDICTABLE(i17 != zero) {
898       i17 = (const float*) ((uintptr_t) i17 + input_offset);
899     }
900     const float* i18 = input[18];
901     assert(i18 != NULL);
902     if XNN_UNPREDICTABLE(i18 != zero) {
903       i18 = (const float*) ((uintptr_t) i18 + input_offset);
904     }
905     const float* i19 = input[19];
906     assert(i19 != NULL);
907     if XNN_UNPREDICTABLE(i19 != zero) {
908       i19 = (const float*) ((uintptr_t) i19 + input_offset);
909     }
910     const float* i20 = input[20];
911     assert(i20 != NULL);
912     if XNN_UNPREDICTABLE(i20 != zero) {
913       i20 = (const float*) ((uintptr_t) i20 + input_offset);
914     }
915     const float* i21 = input[21];
916     assert(i21 != NULL);
917     if XNN_UNPREDICTABLE(i21 != zero) {
918       i21 = (const float*) ((uintptr_t) i21 + input_offset);
919     }
920     const float* i22 = input[22];
921     assert(i22 != NULL);
922     if XNN_UNPREDICTABLE(i22 != zero) {
923       i22 = (const float*) ((uintptr_t) i22 + input_offset);
924     }
925     const float* i23 = input[23];
926     assert(i23 != NULL);
927     if XNN_UNPREDICTABLE(i23 != zero) {
928       i23 = (const float*) ((uintptr_t) i23 + input_offset);
929     }
930     const float* i24 = input[24];
931     assert(i24 != NULL);
932     if XNN_UNPREDICTABLE(i24 != zero) {
933       i24 = (const float*) ((uintptr_t) i24 + input_offset);
934     }
935     input = (const float**) ((uintptr_t) input + input_stride);
936 
937     size_t c = channels;
938     const float* w = weights;
939     for (; c >= 8; c -= 8) {
940       __m256 vacc01234567p0 = _mm256_load_ps(w);
941 
942 
943       const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
944       i0 += 8;
945 
946       const __m256 vk0x01234567 = _mm256_load_ps(w + 8);
947       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
948 
949       const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
950       i1 += 8;
951 
952       const __m256 vk1x01234567 = _mm256_load_ps(w + 16);
953       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
954 
955       const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
956       i2 += 8;
957 
958       const __m256 vk2x01234567 = _mm256_load_ps(w + 24);
959       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
960 
961       const __m256 vi3x01234567 = _mm256_loadu_ps(i3);
962       i3 += 8;
963 
964       const __m256 vk3x01234567 = _mm256_load_ps(w + 32);
965       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567));
966 
967       const __m256 vi4x01234567 = _mm256_loadu_ps(i4);
968       i4 += 8;
969 
970       const __m256 vk4x01234567 = _mm256_load_ps(w + 40);
971       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));
972 
973       const __m256 vi5x01234567 = _mm256_loadu_ps(i5);
974       i5 += 8;
975 
976       const __m256 vk5x01234567 = _mm256_load_ps(w + 48);
977       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi5x01234567, vk5x01234567));
978 
979       const __m256 vi6x01234567 = _mm256_loadu_ps(i6);
980       i6 += 8;
981 
982       const __m256 vk6x01234567 = _mm256_load_ps(w + 56);
983       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi6x01234567, vk6x01234567));
984 
985       const __m256 vi7x01234567 = _mm256_loadu_ps(i7);
986       i7 += 8;
987 
988       const __m256 vk7x01234567 = _mm256_load_ps(w + 64);
989       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi7x01234567, vk7x01234567));
990 
991       const __m256 vi8x01234567 = _mm256_loadu_ps(i8);
992       i8 += 8;
993 
994       const __m256 vk8x01234567 = _mm256_load_ps(w + 72);
995       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567));
996 
997       const __m256 vi9x01234567 = _mm256_loadu_ps(i9);
998       i9 += 8;
999 
1000       const __m256 vk9x01234567 = _mm256_load_ps(w + 80);
1001       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi9x01234567, vk9x01234567));
1002 
1003       const __m256 vi10x01234567 = _mm256_loadu_ps(i10);
1004       i10 += 8;
1005 
1006       const __m256 vk10x01234567 = _mm256_load_ps(w + 88);
1007       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi10x01234567, vk10x01234567));
1008 
1009       const __m256 vi11x01234567 = _mm256_loadu_ps(i11);
1010       i11 += 8;
1011 
1012       const __m256 vk11x01234567 = _mm256_load_ps(w + 96);
1013       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi11x01234567, vk11x01234567));
1014 
1015       const __m256 vi12x01234567 = _mm256_loadu_ps(i12);
1016       i12 += 8;
1017 
1018       const __m256 vk12x01234567 = _mm256_load_ps(w + 104);
1019       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi12x01234567, vk12x01234567));
1020 
1021       const __m256 vi13x01234567 = _mm256_loadu_ps(i13);
1022       i13 += 8;
1023 
1024       const __m256 vk13x01234567 = _mm256_load_ps(w + 112);
1025       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi13x01234567, vk13x01234567));
1026 
1027       const __m256 vi14x01234567 = _mm256_loadu_ps(i14);
1028       i14 += 8;
1029 
1030       const __m256 vk14x01234567 = _mm256_load_ps(w + 120);
1031       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi14x01234567, vk14x01234567));
1032 
1033       const __m256 vi15x01234567 = _mm256_loadu_ps(i15);
1034       i15 += 8;
1035 
1036       const __m256 vk15x01234567 = _mm256_load_ps(w + 128);
1037       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi15x01234567, vk15x01234567));
1038 
1039       const __m256 vi16x01234567 = _mm256_loadu_ps(i16);
1040       i16 += 8;
1041 
1042       const __m256 vk16x01234567 = _mm256_load_ps(w + 136);
1043       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi16x01234567, vk16x01234567));
1044 
1045       const __m256 vi17x01234567 = _mm256_loadu_ps(i17);
1046       i17 += 8;
1047 
1048       const __m256 vk17x01234567 = _mm256_load_ps(w + 144);
1049       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi17x01234567, vk17x01234567));
1050 
1051       const __m256 vi18x01234567 = _mm256_loadu_ps(i18);
1052       i18 += 8;
1053 
1054       const __m256 vk18x01234567 = _mm256_load_ps(w + 152);
1055       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi18x01234567, vk18x01234567));
1056 
1057       const __m256 vi19x01234567 = _mm256_loadu_ps(i19);
1058       i19 += 8;
1059 
1060       const __m256 vk19x01234567 = _mm256_load_ps(w + 160);
1061       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi19x01234567, vk19x01234567));
1062 
1063       const __m256 vi20x01234567 = _mm256_loadu_ps(i20);
1064       i20 += 8;
1065 
1066       const __m256 vk20x01234567 = _mm256_load_ps(w + 168);
1067       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi20x01234567, vk20x01234567));
1068 
1069       const __m256 vi21x01234567 = _mm256_loadu_ps(i21);
1070       i21 += 8;
1071 
1072       const __m256 vk21x01234567 = _mm256_load_ps(w + 176);
1073       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi21x01234567, vk21x01234567));
1074 
1075       const __m256 vi22x01234567 = _mm256_loadu_ps(i22);
1076       i22 += 8;
1077 
1078       const __m256 vk22x01234567 = _mm256_load_ps(w + 184);
1079       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi22x01234567, vk22x01234567));
1080 
1081       const __m256 vi23x01234567 = _mm256_loadu_ps(i23);
1082       i23 += 8;
1083 
1084       const __m256 vk23x01234567 = _mm256_load_ps(w + 192);
1085       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi23x01234567, vk23x01234567));
1086 
1087       const __m256 vi24x01234567 = _mm256_loadu_ps(i24);
1088       i24 += 8;
1089 
1090       const __m256 vk24x01234567 = _mm256_load_ps(w + 200);
1091       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi24x01234567, vk24x01234567));
1092 
1093       w += 208;
1094 
1095 
1096       __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
1097       vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
1098 
1099       _mm256_storeu_ps(output, vacc01234567);
1100       output += 8;
1101     }
1102     if XNN_UNLIKELY(c != 0) {
1103       assert(c >= 1);
1104       assert(c <= 7);
1105       const __m256i vmask = _mm256_loadu_si256((const __m256i*) &params->avx.mask_table[7 - c]);
1106 
1107       __m256 vacc01234567p0 = _mm256_load_ps(w);
1108 
1109       const __m256 vi0x01234567 = _mm256_maskload_ps(i0, vmask);
1110       const __m256 vk0x01234567 = _mm256_load_ps(w + 8);
1111       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
1112 
1113       const __m256 vi1x01234567 = _mm256_maskload_ps(i1, vmask);
1114       const __m256 vk1x01234567 = _mm256_load_ps(w + 16);
1115       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
1116 
1117       const __m256 vi2x01234567 = _mm256_maskload_ps(i2, vmask);
1118       const __m256 vk2x01234567 = _mm256_load_ps(w + 24);
1119       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
1120 
1121       const __m256 vi3x01234567 = _mm256_maskload_ps(i3, vmask);
1122       const __m256 vk3x01234567 = _mm256_load_ps(w + 32);
1123       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567));
1124 
1125       const __m256 vi4x01234567 = _mm256_maskload_ps(i4, vmask);
1126       const __m256 vk4x01234567 = _mm256_load_ps(w + 40);
1127       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));
1128 
1129       const __m256 vi5x01234567 = _mm256_maskload_ps(i5, vmask);
1130       const __m256 vk5x01234567 = _mm256_load_ps(w + 48);
1131       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi5x01234567, vk5x01234567));
1132 
1133       const __m256 vi6x01234567 = _mm256_maskload_ps(i6, vmask);
1134       const __m256 vk6x01234567 = _mm256_load_ps(w + 56);
1135       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi6x01234567, vk6x01234567));
1136 
1137       const __m256 vi7x01234567 = _mm256_maskload_ps(i7, vmask);
1138       const __m256 vk7x01234567 = _mm256_load_ps(w + 64);
1139       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi7x01234567, vk7x01234567));
1140 
1141       const __m256 vi8x01234567 = _mm256_maskload_ps(i8, vmask);
1142       const __m256 vk8x01234567 = _mm256_load_ps(w + 72);
1143       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567));
1144 
1145       const __m256 vi9x01234567 = _mm256_maskload_ps(i9, vmask);
1146       const __m256 vk9x01234567 = _mm256_load_ps(w + 80);
1147       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi9x01234567, vk9x01234567));
1148 
1149       const __m256 vi10x01234567 = _mm256_maskload_ps(i10, vmask);
1150       const __m256 vk10x01234567 = _mm256_load_ps(w + 88);
1151       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi10x01234567, vk10x01234567));
1152 
1153       const __m256 vi11x01234567 = _mm256_maskload_ps(i11, vmask);
1154       const __m256 vk11x01234567 = _mm256_load_ps(w + 96);
1155       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi11x01234567, vk11x01234567));
1156 
1157       const __m256 vi12x01234567 = _mm256_maskload_ps(i12, vmask);
1158       const __m256 vk12x01234567 = _mm256_load_ps(w + 104);
1159       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi12x01234567, vk12x01234567));
1160 
1161       const __m256 vi13x01234567 = _mm256_maskload_ps(i13, vmask);
1162       const __m256 vk13x01234567 = _mm256_load_ps(w + 112);
1163       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi13x01234567, vk13x01234567));
1164 
1165       const __m256 vi14x01234567 = _mm256_maskload_ps(i14, vmask);
1166       const __m256 vk14x01234567 = _mm256_load_ps(w + 120);
1167       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi14x01234567, vk14x01234567));
1168 
1169       const __m256 vi15x01234567 = _mm256_maskload_ps(i15, vmask);
1170       const __m256 vk15x01234567 = _mm256_load_ps(w + 128);
1171       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi15x01234567, vk15x01234567));
1172 
1173       const __m256 vi16x01234567 = _mm256_maskload_ps(i16, vmask);
1174       const __m256 vk16x01234567 = _mm256_load_ps(w + 136);
1175       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi16x01234567, vk16x01234567));
1176 
1177       const __m256 vi17x01234567 = _mm256_maskload_ps(i17, vmask);
1178       const __m256 vk17x01234567 = _mm256_load_ps(w + 144);
1179       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi17x01234567, vk17x01234567));
1180 
1181       const __m256 vi18x01234567 = _mm256_maskload_ps(i18, vmask);
1182       const __m256 vk18x01234567 = _mm256_load_ps(w + 152);
1183       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi18x01234567, vk18x01234567));
1184 
1185       const __m256 vi19x01234567 = _mm256_maskload_ps(i19, vmask);
1186       const __m256 vk19x01234567 = _mm256_load_ps(w + 160);
1187       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi19x01234567, vk19x01234567));
1188 
1189       const __m256 vi20x01234567 = _mm256_maskload_ps(i20, vmask);
1190       const __m256 vk20x01234567 = _mm256_load_ps(w + 168);
1191       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi20x01234567, vk20x01234567));
1192 
1193       const __m256 vi21x01234567 = _mm256_maskload_ps(i21, vmask);
1194       const __m256 vk21x01234567 = _mm256_load_ps(w + 176);
1195       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi21x01234567, vk21x01234567));
1196 
1197       const __m256 vi22x01234567 = _mm256_maskload_ps(i22, vmask);
1198       const __m256 vk22x01234567 = _mm256_load_ps(w + 184);
1199       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi22x01234567, vk22x01234567));
1200 
1201       const __m256 vi23x01234567 = _mm256_maskload_ps(i23, vmask);
1202       const __m256 vk23x01234567 = _mm256_load_ps(w + 192);
1203       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi23x01234567, vk23x01234567));
1204 
1205       const __m256 vi24x01234567 = _mm256_maskload_ps(i24, vmask);
1206       const __m256 vk24x01234567 = _mm256_load_ps(w + 200);
1207       vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi24x01234567, vk24x01234567));
1208 
1209 
1210       __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
1211       vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
1212 
1213       __m128 vacc0123 = _mm256_castps256_ps128(vacc01234567);
1214       if (c & 4) {
1215         _mm_storeu_ps(output, vacc0123);
1216         vacc0123 = _mm256_extractf128_ps(vacc01234567, 1);
1217         output += 4;
1218       }
1219       if (c & 2) {
1220         _mm_storel_pi((__m64*) output, vacc0123);
1221         vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
1222         output += 2;
1223       }
1224       if (c & 1) {
1225         _mm_store_ss(output, vacc0123);
1226         output += 1;
1227       }
1228     }
1229 
1230     output = (float*) ((uintptr_t) output + output_increment);
1231   } while (--output_width != 0);
1232 }
1233 
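// Converts FP32 values to IEEE FP16 (stored as uint16_t), 24 elements per main-loop iteration,
// without relying on the F16C instruction set. The magnitude is scaled by scale_to_inf and
// scale_to_zero so that the mantissa is rounded and overflow saturates, the FP16 exponent bias
// is derived via exp_bias/expw_max/bias_min, the 32-bit intermediates are packed down to 16
// bits, NaN inputs are replaced with the canonical FP16 NaN (nanh), and the sign bits are
// re-attached from the packed sign words.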
1234 void xnn_f32_f16_vcvt_ukernel__avx_x24(
1235     size_t n,
1236     const float* input,
1237     void* output,
1238     const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1239 {
1240   assert(n != 0);
1241   assert(n % sizeof(float) == 0);
1242   assert(input != NULL);
1243   assert(output != NULL);
1244 
1245   const __m128 vnonsign_mask = _mm_load_ps((const float*) params->sse2.nonsign_mask);
1246   const __m128i vexp_bias = _mm_load_si128((const __m128i*) params->sse2.exp_bias);
1247   const __m128 vscale_to_inf = _mm_load_ps(params->sse2.scale_to_inf);
1248   const __m128i vexpw_max = _mm_load_si128((const __m128i*) params->sse2.expw_max);
1249   const __m128 vscale_to_zero = _mm_load_ps(params->sse2.scale_to_zero);
1250   const __m128i vbias_min = _mm_load_si128((const __m128i*) params->sse2.bias_min);
1251   const __m128i vmanth_mask = _mm_load_si128((const __m128i*) params->sse2.manth_mask);
1252   const __m128i vexph_mask = _mm_load_si128((const __m128i*) params->sse2.exph_mask);
1253   const __m128i vnanh = _mm_load_si128((const __m128i*) params->sse2.nanh);
1254 
1255   uint16_t* o = (uint16_t*) output;
1256   for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
1257     const __m128 vx0 = _mm_loadu_ps(input);
1258     const __m128 vx1 = _mm_loadu_ps(input + 4);
1259     const __m128 vx2 = _mm_loadu_ps(input + 8);
1260     const __m128 vx3 = _mm_loadu_ps(input + 12);
1261     const __m128 vx4 = _mm_loadu_ps(input + 16);
1262     const __m128 vx5 = _mm_loadu_ps(input + 20);
1263     input += 24;
1264 
1265     const __m128 vabsx0 = _mm_and_ps(vx0, vnonsign_mask);
1266     const __m128 vabsx1 = _mm_and_ps(vx1, vnonsign_mask);
1267     const __m128 vabsx2 = _mm_and_ps(vx2, vnonsign_mask);
1268     const __m128 vabsx3 = _mm_and_ps(vx3, vnonsign_mask);
1269     const __m128 vabsx4 = _mm_and_ps(vx4, vnonsign_mask);
1270     const __m128 vabsx5 = _mm_and_ps(vx5, vnonsign_mask);
1271 
1272     const __m128 vsignx0 = _mm_xor_ps(vx0, vabsx0);
1273     const __m128 vsignx1 = _mm_xor_ps(vx1, vabsx1);
1274     const __m128 vsignx2 = _mm_xor_ps(vx2, vabsx2);
1275     const __m128 vsignx3 = _mm_xor_ps(vx3, vabsx3);
1276     const __m128 vsignx4 = _mm_xor_ps(vx4, vabsx4);
1277     const __m128 vsignx5 = _mm_xor_ps(vx5, vabsx5);
1278 
1279     __m128i vbias0 = _mm_add_epi32(_mm_castps_si128(vabsx0), vexp_bias);
1280     __m128i vbias1 = _mm_add_epi32(_mm_castps_si128(vabsx1), vexp_bias);
1281     __m128i vbias2 = _mm_add_epi32(_mm_castps_si128(vabsx2), vexp_bias);
1282     __m128i vbias3 = _mm_add_epi32(_mm_castps_si128(vabsx3), vexp_bias);
1283     __m128i vbias4 = _mm_add_epi32(_mm_castps_si128(vabsx4), vexp_bias);
1284     __m128i vbias5 = _mm_add_epi32(_mm_castps_si128(vabsx5), vexp_bias);
1285 
1286     __m128 vf0 = _mm_mul_ps(vabsx0, vscale_to_inf);
1287     __m128 vf1 = _mm_mul_ps(vabsx1, vscale_to_inf);
1288     __m128 vf2 = _mm_mul_ps(vabsx2, vscale_to_inf);
1289     __m128 vf3 = _mm_mul_ps(vabsx3, vscale_to_inf);
1290     __m128 vf4 = _mm_mul_ps(vabsx4, vscale_to_inf);
1291     __m128 vf5 = _mm_mul_ps(vabsx5, vscale_to_inf);
1292 
1293     const __m128i vnanmaskw0 = _mm_cmpgt_epi32(_mm_castps_si128(vabsx0), vexpw_max);
1294     const __m128i vnanmaskw1 = _mm_cmpgt_epi32(_mm_castps_si128(vabsx1), vexpw_max);
1295     const __m128i vnanmaskw2 = _mm_cmpgt_epi32(_mm_castps_si128(vabsx2), vexpw_max);
1296     const __m128i vnanmaskw3 = _mm_cmpgt_epi32(_mm_castps_si128(vabsx3), vexpw_max);
1297     const __m128i vnanmaskw4 = _mm_cmpgt_epi32(_mm_castps_si128(vabsx4), vexpw_max);
1298     const __m128i vnanmaskw5 = _mm_cmpgt_epi32(_mm_castps_si128(vabsx5), vexpw_max);
1299 
1300     vbias0 = _mm_and_si128(vbias0, vexpw_max);
1301     vbias1 = _mm_and_si128(vbias1, vexpw_max);
1302     vbias2 = _mm_and_si128(vbias2, vexpw_max);
1303     vbias3 = _mm_and_si128(vbias3, vexpw_max);
1304     vbias4 = _mm_and_si128(vbias4, vexpw_max);
1305     vbias5 = _mm_and_si128(vbias5, vexpw_max);
1306 
1307     vf0 = _mm_mul_ps(vf0, vscale_to_zero);
1308     vf1 = _mm_mul_ps(vf1, vscale_to_zero);
1309     vf2 = _mm_mul_ps(vf2, vscale_to_zero);
1310     vf3 = _mm_mul_ps(vf3, vscale_to_zero);
1311     vf4 = _mm_mul_ps(vf4, vscale_to_zero);
1312     vf5 = _mm_mul_ps(vf5, vscale_to_zero);
1313 
1314     const __m128i vnanmaskh0 = _mm_packs_epi32(vnanmaskw0, vnanmaskw1);
1315     const __m128i vnanmaskh1 = _mm_packs_epi32(vnanmaskw2, vnanmaskw3);
1316     const __m128i vnanmaskh2 = _mm_packs_epi32(vnanmaskw4, vnanmaskw5);
1317 
1318     const __m128i vsignh0 = _mm_packs_epi32(_mm_castps_si128(vsignx0), _mm_castps_si128(vsignx1));
1319     const __m128i vsignh1 = _mm_packs_epi32(_mm_castps_si128(vsignx2), _mm_castps_si128(vsignx3));
1320     const __m128i vsignh2 = _mm_packs_epi32(_mm_castps_si128(vsignx4), _mm_castps_si128(vsignx5));
1321 
1322     vbias0 = _mm_max_epi16(vbias0, vbias_min);
1323     vbias1 = _mm_max_epi16(vbias1, vbias_min);
1324     vbias2 = _mm_max_epi16(vbias2, vbias_min);
1325     vbias3 = _mm_max_epi16(vbias3, vbias_min);
1326     vbias4 = _mm_max_epi16(vbias4, vbias_min);
1327     vbias5 = _mm_max_epi16(vbias5, vbias_min);
1328 
1329 
1330     vf0 = _mm_add_ps(vf0, _mm_castsi128_ps(vbias0));
1331     vf1 = _mm_add_ps(vf1, _mm_castsi128_ps(vbias1));
1332     vf2 = _mm_add_ps(vf2, _mm_castsi128_ps(vbias2));
1333     vf3 = _mm_add_ps(vf3, _mm_castsi128_ps(vbias3));
1334     vf4 = _mm_add_ps(vf4, _mm_castsi128_ps(vbias4));
1335     vf5 = _mm_add_ps(vf5, _mm_castsi128_ps(vbias5));
1336 
1337 
1338     __m128i vexpw0 = _mm_srli_epi32(_mm_castps_si128(vf0), 13);
1339     __m128i vexpw1 = _mm_srli_epi32(_mm_castps_si128(vf1), 13);
1340     __m128i vexpw2 = _mm_srli_epi32(_mm_castps_si128(vf2), 13);
1341     __m128i vexpw3 = _mm_srli_epi32(_mm_castps_si128(vf3), 13);
1342     __m128i vexpw4 = _mm_srli_epi32(_mm_castps_si128(vf4), 13);
1343     __m128i vexpw5 = _mm_srli_epi32(_mm_castps_si128(vf5), 13);
1344 
1345     const __m128i vmantw0 = _mm_and_si128(_mm_castps_si128(vf0), vmanth_mask);
1346     const __m128i vmantw1 = _mm_and_si128(_mm_castps_si128(vf1), vmanth_mask);
1347     const __m128i vmantw2 = _mm_and_si128(_mm_castps_si128(vf2), vmanth_mask);
1348     const __m128i vmantw3 = _mm_and_si128(_mm_castps_si128(vf3), vmanth_mask);
1349     const __m128i vmantw4 = _mm_and_si128(_mm_castps_si128(vf4), vmanth_mask);
1350     const __m128i vmantw5 = _mm_and_si128(_mm_castps_si128(vf5), vmanth_mask);
1351 
1352     vexpw0 = _mm_and_si128(vexpw0, vexph_mask);
1353     vexpw1 = _mm_and_si128(vexpw1, vexph_mask);
1354     vexpw2 = _mm_and_si128(vexpw2, vexph_mask);
1355     vexpw3 = _mm_and_si128(vexpw3, vexph_mask);
1356     vexpw4 = _mm_and_si128(vexpw4, vexph_mask);
1357     vexpw5 = _mm_and_si128(vexpw5, vexph_mask);
1358 
1359     const __m128i vnonsignw0 = _mm_add_epi32(vmantw0, vexpw0);
1360     const __m128i vnonsignw1 = _mm_add_epi32(vmantw1, vexpw1);
1361     const __m128i vnonsignw2 = _mm_add_epi32(vmantw2, vexpw2);
1362     const __m128i vnonsignw3 = _mm_add_epi32(vmantw3, vexpw3);
1363     const __m128i vnonsignw4 = _mm_add_epi32(vmantw4, vexpw4);
1364     const __m128i vnonsignw5 = _mm_add_epi32(vmantw5, vexpw5);
1365 
1366     const __m128i vnonsignh0 = _mm_packs_epi32(vnonsignw0, vnonsignw1);
1367     const __m128i vnonsignh1 = _mm_packs_epi32(vnonsignw2, vnonsignw3);
1368     const __m128i vnonsignh2 = _mm_packs_epi32(vnonsignw4, vnonsignw5);
1369 
1370     const __m128i vabsh0 = _mm_blendv_epi8(vnonsignh0, vnanh, vnanmaskh0);
1371     const __m128i vabsh1 = _mm_blendv_epi8(vnonsignh1, vnanh, vnanmaskh1);
1372     const __m128i vabsh2 = _mm_blendv_epi8(vnonsignh2, vnanh, vnanmaskh2);
1373 
1374     const __m128i vh0 = _mm_or_si128(vabsh0, vsignh0);
1375     const __m128i vh1 = _mm_or_si128(vabsh1, vsignh1);
1376     const __m128i vh2 = _mm_or_si128(vabsh2, vsignh2);
1377 
1378     _mm_storeu_si128((__m128i*) o, vh0);
1379     _mm_storeu_si128((__m128i*) (o + 8), vh1);
1380     _mm_storeu_si128((__m128i*) (o + 16), vh2);
1381     o += 24;
1382   }
1383   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
1384     const __m128 vx_lo = _mm_loadu_ps(input);
1385     const __m128 vx_hi = _mm_loadu_ps(input + 4);
1386     input += 8;
1387 
1388     const __m128 vabsx_lo = _mm_and_ps(vx_lo, vnonsign_mask);
1389     const __m128 vabsx_hi = _mm_and_ps(vx_hi, vnonsign_mask);
1390 
1391     const __m128 vsignx_lo = _mm_xor_ps(vx_lo, vabsx_lo);
1392     const __m128 vsignx_hi = _mm_xor_ps(vx_hi, vabsx_hi);
1393     __m128i vbias_lo = _mm_add_epi32(_mm_castps_si128(vabsx_lo), vexp_bias);
1394     __m128i vbias_hi = _mm_add_epi32(_mm_castps_si128(vabsx_hi), vexp_bias);
1395     __m128 vf_lo = _mm_mul_ps(vabsx_lo, vscale_to_inf);
1396     __m128 vf_hi = _mm_mul_ps(vabsx_hi, vscale_to_inf);
1397     const __m128i vnanmaskw_lo = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_lo), vexpw_max);
1398     const __m128i vnanmaskw_hi = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_hi), vexpw_max);
1399 
1400     vbias_lo = _mm_and_si128(vbias_lo, vexpw_max);
1401     vbias_hi = _mm_and_si128(vbias_hi, vexpw_max);
1402     vf_lo = _mm_mul_ps(vf_lo, vscale_to_zero);
1403     vf_hi = _mm_mul_ps(vf_hi, vscale_to_zero);
1404     const __m128i vnanmaskh = _mm_packs_epi32(vnanmaskw_lo, vnanmaskw_hi);
1405     const __m128i vsignh = _mm_packs_epi32(_mm_castps_si128(vsignx_lo), _mm_castps_si128(vsignx_hi));
1406 
1407     vbias_lo = _mm_max_epi16(vbias_lo, vbias_min);
1408     vbias_hi = _mm_max_epi16(vbias_hi, vbias_min);
1409 
1410     vf_lo = _mm_add_ps(vf_lo, _mm_castsi128_ps(vbias_lo));
1411     vf_hi = _mm_add_ps(vf_hi, _mm_castsi128_ps(vbias_hi));
1412 
1413     __m128i vexpw_lo = _mm_srli_epi32(_mm_castps_si128(vf_lo), 13);
1414     __m128i vexpw_hi = _mm_srli_epi32(_mm_castps_si128(vf_hi), 13);
1415     const __m128i vmantw_lo = _mm_and_si128(_mm_castps_si128(vf_lo), vmanth_mask);
1416     const __m128i vmantw_hi = _mm_and_si128(_mm_castps_si128(vf_hi), vmanth_mask);
1417 
1418     vexpw_lo = _mm_and_si128(vexpw_lo, vexph_mask);
1419     vexpw_hi = _mm_and_si128(vexpw_hi, vexph_mask);
1420 
1421     const __m128i vnonsignw_lo = _mm_add_epi32(vmantw_lo, vexpw_lo);
1422     const __m128i vnonsignw_hi = _mm_add_epi32(vmantw_hi, vexpw_hi);
1423 
1424     const __m128i vnonsignh = _mm_packs_epi32(vnonsignw_lo, vnonsignw_hi);
1425 
1426     const __m128i vabsh = _mm_blendv_epi8(vnonsignh, vnanh, vnanmaskh);
1427 
1428     const __m128i vh = _mm_or_si128(vabsh, vsignh);
1429 
1430     _mm_storeu_si128((__m128i*) o, vh);
1431     o += 8;
1432   }
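  // Handle the final 1-7 floats. The two 128-bit loads may read past the end of the
  // input buffer; the XNN_OOB_READS annotation on this kernel permits those reads.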
1433   if XNN_UNPREDICTABLE(n != 0) {
1434     const __m128 vx_lo = _mm_loadu_ps(input);
1435     const float* input_hi = (const float*) ((uintptr_t) input + (n & (4 * sizeof(float))));
1436     const __m128 vx_hi = _mm_loadu_ps(input_hi);
1437 
1438     const __m128 vabsx_lo = _mm_and_ps(vx_lo, vnonsign_mask);
1439     const __m128 vabsx_hi = _mm_and_ps(vx_hi, vnonsign_mask);
1440 
1441     const __m128 vsignx_lo = _mm_xor_ps(vx_lo, vabsx_lo);
1442     const __m128 vsignx_hi = _mm_xor_ps(vx_hi, vabsx_hi);
1443     __m128i vbias_lo = _mm_add_epi32(_mm_castps_si128(vabsx_lo), vexp_bias);
1444     __m128i vbias_hi = _mm_add_epi32(_mm_castps_si128(vabsx_hi), vexp_bias);
1445     __m128 vf_lo = _mm_mul_ps(vabsx_lo, vscale_to_inf);
1446     __m128 vf_hi = _mm_mul_ps(vabsx_hi, vscale_to_inf);
1447     const __m128i vnanmaskw_lo = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_lo), vexpw_max);
1448     const __m128i vnanmaskw_hi = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_hi), vexpw_max);
1449 
1450     vbias_lo = _mm_and_si128(vbias_lo, vexpw_max);
1451     vbias_hi = _mm_and_si128(vbias_hi, vexpw_max);
1452     vf_lo = _mm_mul_ps(vf_lo, vscale_to_zero);
1453     vf_hi = _mm_mul_ps(vf_hi, vscale_to_zero);
1454     const __m128i vnanmaskh = _mm_packs_epi32(vnanmaskw_lo, vnanmaskw_hi);
1455     const __m128i vsignh = _mm_packs_epi32(_mm_castps_si128(vsignx_lo), _mm_castps_si128(vsignx_hi));
1456 
1457     vbias_lo = _mm_max_epi16(vbias_lo, vbias_min);
1458     vbias_hi = _mm_max_epi16(vbias_hi, vbias_min);
1459 
1460     vf_lo = _mm_add_ps(vf_lo, _mm_castsi128_ps(vbias_lo));
1461     vf_hi = _mm_add_ps(vf_hi, _mm_castsi128_ps(vbias_hi));
1462 
1463     __m128i vexpw_lo = _mm_srli_epi32(_mm_castps_si128(vf_lo), 13);
1464     __m128i vexpw_hi = _mm_srli_epi32(_mm_castps_si128(vf_hi), 13);
1465     const __m128i vmantw_lo = _mm_and_si128(_mm_castps_si128(vf_lo), vmanth_mask);
1466     const __m128i vmantw_hi = _mm_and_si128(_mm_castps_si128(vf_hi), vmanth_mask);
1467 
1468     vexpw_lo = _mm_and_si128(vexpw_lo, vexph_mask);
1469     vexpw_hi = _mm_and_si128(vexpw_hi, vexph_mask);
1470 
1471     const __m128i vnonsignw_lo = _mm_add_epi32(vmantw_lo, vexpw_lo);
1472     const __m128i vnonsignw_hi = _mm_add_epi32(vmantw_hi, vexpw_hi);
1473 
1474     const __m128i vnonsignh = _mm_packs_epi32(vnonsignw_lo, vnonsignw_hi);
1475 
1476     const __m128i vabsh = _mm_blendv_epi8(vnonsignh, vnanh, vnanmaskh);
1477 
1478     __m128i vh = _mm_or_si128(vabsh, vsignh);
1479 
1480     if (n & (4 * sizeof(float))) {
1481       _mm_storel_epi64((__m128i*) o, vh);
1482       vh = _mm_unpackhi_epi64(vh, vh);
1483       o += 4;
1484     }
1485     if (n & (2 * sizeof(float))) {
1486       unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(vh));
1487       vh = _mm_srli_epi64(vh, 32);
1488       o += 2;
1489     }
1490     if (n & (1 * sizeof(float))) {
1491       *o = (uint16_t) _mm_extract_epi16(vh, 0);
1492     }
1493   }
1494 }
1495 
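// F32 GEMM microkernel producing a 1x16 output tile. Each k iteration broadcasts one
// element of the A row and accumulates it against 16 packed B values using separate
// multiply and add (plain AVX has no FMA); the result is clamped to [min, max].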
1496 void xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast(
1497     size_t mr,
1498     size_t nc,
1499     size_t kc,
1500     const float*restrict a,
1501     size_t a_stride,
1502     const float*restrict w,
1503     float*restrict c,
1504     size_t cm_stride,
1505     size_t cn_stride,
1506     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
1507 {
1508   assert(mr != 0);
1509   assert(mr <= 1);
1510   assert(nc != 0);
1511   assert(kc != 0);
1512   assert(kc % sizeof(float) == 0);
1513   assert(a != NULL);
1514   assert(w != NULL);
1515   assert(c != NULL);
1516 
1517   const float* a0 = a;
1518   float* c0 = c;
1519 
1520   do {
1521     __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
1522     __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8);
1523     w += 16;
1524 
1525     size_t k = kc;
1526     do {
1527       const __m256 va0 = _mm256_broadcast_ss(a0);
1528       a0 += 1;
1529 
1530       const __m256 vb01234567 = _mm256_load_ps(w);
1531       const __m256 vb89ABCDEF = _mm256_load_ps(w + 8);
1532       w += 16;
1533 
1534       vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
1535       vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF));
1536 
1537       k -= sizeof(float);
1538     } while (k != 0);
1539 
1540     const __m256 vmin = _mm256_load_ps(params->avx.min);
1541     vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
1542     vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
1543 
1544     const __m256 vmax = _mm256_load_ps(params->avx.max);
1545     vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
1546     vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
1547 
1548     if XNN_LIKELY(nc >= 16) {
1549       _mm256_storeu_ps(c0, vacc0x01234567);
1550       _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF);
1551       c0 = (float*) ((uintptr_t) c0 + cn_stride);
1552 
1553       a0 = (const float*) ((uintptr_t) a0 - kc);
1554 
1555       nc -= 16;
1556     } else {
1557       if (nc & 8) {
1558         _mm256_storeu_ps(c0, vacc0x01234567);
1559 
1560         vacc0x01234567 = vacc0x89ABCDEF;
1561 
1562         c0 += 8;
1563       }
1564       __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
1565       if (nc & 4) {
1566         _mm_storeu_ps(c0, vacc0x0123);
1567 
1568         vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
1569 
1570         c0 += 4;
1571       }
1572       if (nc & 2) {
1573         _mm_storel_pi((__m64*) c0, vacc0x0123);
1574 
1575         vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
1576 
1577         c0 += 2;
1578       }
1579       if (nc & 1) {
1580         _mm_store_ss(c0, vacc0x0123);
1581       }
1582 
1583       nc = 0;
1584     }
1585   } while (nc != 0);
1586 }
1587 
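// F32 GEMM microkernel producing a 5x16 output tile. Row pointers a1..a4 and c1..c4
// are derived from a_stride/cm_stride and clamped back to the previous row when
// mr < 5, so edge tiles with fewer than five rows never read or write out of bounds.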
1588 void xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast(
1589     size_t mr,
1590     size_t nc,
1591     size_t kc,
1592     const float*restrict a,
1593     size_t a_stride,
1594     const float*restrict w,
1595     float*restrict c,
1596     size_t cm_stride,
1597     size_t cn_stride,
1598     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
1599 {
1600   assert(mr != 0);
1601   assert(mr <= 5);
1602   assert(nc != 0);
1603   assert(kc != 0);
1604   assert(kc % sizeof(float) == 0);
1605   assert(a != NULL);
1606   assert(w != NULL);
1607   assert(c != NULL);
1608 
1609   const float* a0 = a;
1610   float* c0 = c;
1611   const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
1612   float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
1613   if XNN_UNPREDICTABLE(mr < 2) {
1614     a1 = a0;
1615     c1 = c0;
1616   }
1617   const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
1618   float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
1619   if XNN_UNPREDICTABLE(mr <= 2) {
1620     a2 = a1;
1621     c2 = c1;
1622   }
1623   const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
1624   float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
1625   if XNN_UNPREDICTABLE(mr < 4) {
1626     a3 = a2;
1627     c3 = c2;
1628   }
1629   const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
1630   float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
1631   if XNN_UNPREDICTABLE(mr <= 4) {
1632     a4 = a3;
1633     c4 = c3;
1634   }
1635 
1636   do {
1637     __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
1638     __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8);
1639     __m256 vacc1x01234567 = vacc0x01234567;
1640     __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
1641     __m256 vacc2x01234567 = vacc0x01234567;
1642     __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
1643     __m256 vacc3x01234567 = vacc0x01234567;
1644     __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
1645     __m256 vacc4x01234567 = vacc0x01234567;
1646     __m256 vacc4x89ABCDEF = vacc0x89ABCDEF;
1647     w += 16;
1648 
1649     size_t k = kc;
1650     do {
1651       const __m256 va0 = _mm256_broadcast_ss(a0);
1652       a0 += 1;
1653       const __m256 va1 = _mm256_broadcast_ss(a1);
1654       a1 += 1;
1655       const __m256 va2 = _mm256_broadcast_ss(a2);
1656       a2 += 1;
1657       const __m256 va3 = _mm256_broadcast_ss(a3);
1658       a3 += 1;
1659       const __m256 va4 = _mm256_broadcast_ss(a4);
1660       a4 += 1;
1661 
1662       const __m256 vb01234567 = _mm256_load_ps(w);
1663       const __m256 vb89ABCDEF = _mm256_load_ps(w + 8);
1664       w += 16;
1665 
1666       vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
1667       vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
1668       vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
1669       vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
1670       vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
1671       vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF));
1672       vacc1x89ABCDEF = _mm256_add_ps(vacc1x89ABCDEF, _mm256_mul_ps(va1, vb89ABCDEF));
1673       vacc2x89ABCDEF = _mm256_add_ps(vacc2x89ABCDEF, _mm256_mul_ps(va2, vb89ABCDEF));
1674       vacc3x89ABCDEF = _mm256_add_ps(vacc3x89ABCDEF, _mm256_mul_ps(va3, vb89ABCDEF));
1675       vacc4x89ABCDEF = _mm256_add_ps(vacc4x89ABCDEF, _mm256_mul_ps(va4, vb89ABCDEF));
1676 
1677       k -= sizeof(float);
1678     } while (k != 0);
1679 
1680     const __m256 vmin = _mm256_load_ps(params->avx.min);
1681     vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
1682     vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
1683     vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
1684     vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
1685     vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
1686     vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
1687     vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
1688     vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
1689     vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
1690     vacc4x89ABCDEF = _mm256_max_ps(vacc4x89ABCDEF, vmin);
1691 
1692     const __m256 vmax = _mm256_load_ps(params->avx.max);
1693     vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
1694     vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
1695     vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
1696     vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
1697     vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
1698     vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
1699     vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
1700     vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
1701     vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
1702     vacc4x89ABCDEF = _mm256_min_ps(vacc4x89ABCDEF, vmax);
1703 
1704     if XNN_LIKELY(nc >= 16) {
1705       _mm256_storeu_ps(c4, vacc4x01234567);
1706       _mm256_storeu_ps(c4 + 8, vacc4x89ABCDEF);
1707       c4 = (float*) ((uintptr_t) c4 + cn_stride);
1708       _mm256_storeu_ps(c3, vacc3x01234567);
1709       _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF);
1710       c3 = (float*) ((uintptr_t) c3 + cn_stride);
1711       _mm256_storeu_ps(c2, vacc2x01234567);
1712       _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF);
1713       c2 = (float*) ((uintptr_t) c2 + cn_stride);
1714       _mm256_storeu_ps(c1, vacc1x01234567);
1715       _mm256_storeu_ps(c1 + 8, vacc1x89ABCDEF);
1716       c1 = (float*) ((uintptr_t) c1 + cn_stride);
1717       _mm256_storeu_ps(c0, vacc0x01234567);
1718       _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF);
1719       c0 = (float*) ((uintptr_t) c0 + cn_stride);
1720 
1721       a4 = (const float*) ((uintptr_t) a4 - kc);
1722       a3 = (const float*) ((uintptr_t) a3 - kc);
1723       a2 = (const float*) ((uintptr_t) a2 - kc);
1724       a1 = (const float*) ((uintptr_t) a1 - kc);
1725       a0 = (const float*) ((uintptr_t) a0 - kc);
1726 
1727       nc -= 16;
1728     } else {
1729       if (nc & 8) {
1730         _mm256_storeu_ps(c4, vacc4x01234567);
1731         _mm256_storeu_ps(c3, vacc3x01234567);
1732         _mm256_storeu_ps(c2, vacc2x01234567);
1733         _mm256_storeu_ps(c1, vacc1x01234567);
1734         _mm256_storeu_ps(c0, vacc0x01234567);
1735 
1736         vacc4x01234567 = vacc4x89ABCDEF;
1737         vacc3x01234567 = vacc3x89ABCDEF;
1738         vacc2x01234567 = vacc2x89ABCDEF;
1739         vacc1x01234567 = vacc1x89ABCDEF;
1740         vacc0x01234567 = vacc0x89ABCDEF;
1741 
1742         c4 += 8;
1743         c3 += 8;
1744         c2 += 8;
1745         c1 += 8;
1746         c0 += 8;
1747       }
1748       __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
1749       __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
1750       __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
1751       __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
1752       __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
1753       if (nc & 4) {
1754         _mm_storeu_ps(c4, vacc4x0123);
1755         _mm_storeu_ps(c3, vacc3x0123);
1756         _mm_storeu_ps(c2, vacc2x0123);
1757         _mm_storeu_ps(c1, vacc1x0123);
1758         _mm_storeu_ps(c0, vacc0x0123);
1759 
1760         vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
1761         vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
1762         vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
1763         vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
1764         vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
1765 
1766         c4 += 4;
1767         c3 += 4;
1768         c2 += 4;
1769         c1 += 4;
1770         c0 += 4;
1771       }
1772       if (nc & 2) {
1773         _mm_storel_pi((__m64*) c4, vacc4x0123);
1774         _mm_storel_pi((__m64*) c3, vacc3x0123);
1775         _mm_storel_pi((__m64*) c2, vacc2x0123);
1776         _mm_storel_pi((__m64*) c1, vacc1x0123);
1777         _mm_storel_pi((__m64*) c0, vacc0x0123);
1778 
1779         vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
1780         vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
1781         vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
1782         vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
1783         vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
1784 
1785         c4 += 2;
1786         c3 += 2;
1787         c2 += 2;
1788         c1 += 2;
1789         c0 += 2;
1790       }
1791       if (nc & 1) {
1792         _mm_store_ss(c4, vacc4x0123);
1793         _mm_store_ss(c3, vacc3x0123);
1794         _mm_store_ss(c2, vacc2x0123);
1795         _mm_store_ss(c1, vacc1x0123);
1796         _mm_store_ss(c0, vacc0x0123);
1797       }
1798 
1799       nc = 0;
1800     }
1801   } while (nc != 0);
1802 }
1803 
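// F32 IGEMM (indirect GEMM) microkernel producing a 1x16 output tile. Instead of a
// strided A matrix, the row is fetched through the pointer array `a`; each pointer is
// rebased by a_offset unless it equals the `zero` buffer used for padding.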
1804 void xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast(
1805     size_t mr,
1806     size_t nc,
1807     size_t kc,
1808     size_t ks,
1809     const float**restrict a,
1810     const float*restrict w,
1811     float*restrict c,
1812     size_t cm_stride,
1813     size_t cn_stride,
1814     size_t a_offset,
1815     const float* zero,
1816     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
1817 {
1818   assert(mr != 0);
1819   assert(mr <= 1);
1820   assert(nc != 0);
1821   assert(kc != 0);
1822   assert(kc % sizeof(float) == 0);
1823   assert(ks != 0);
1824   assert(ks % (1 * sizeof(void*)) == 0);
1825   assert(a_offset % sizeof(float) == 0);
1826   assert(a != NULL);
1827   assert(w != NULL);
1828   assert(c != NULL);
1829 
1830   float* c0 = c;
1831 
1832   do {
1833     __m256 vacc0x01234567 = _mm256_load_ps(w);
1834     __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8);
1835     w += 16;
1836 
1837     size_t p = ks;
1838     do {
1839       const float* restrict a0 = a[0];
1840       assert(a0 != NULL);
1841       if XNN_UNPREDICTABLE(a0 != zero) {
1842         a0 = (const float*) ((uintptr_t) a0 + a_offset);
1843       }
1844       a += 1;
1845 
1846       size_t k = kc;
1847       do {
1848         const __m256 vb01234567 = _mm256_load_ps(w);
1849         const __m256 vb89ABCDEF = _mm256_load_ps(w + 8);
1850         w += 16;
1851 
1852         const __m256 va0 = _mm256_broadcast_ss(a0);
1853         a0 += 1;
1854 
1855         vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
1856         vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF));
1857         k -= sizeof(float);
1858       } while (k != 0);
1859       p -= 1 * sizeof(void*);
1860     } while (p != 0);
1861 
1862     const __m256 vmin = _mm256_load_ps(params->avx.min);
1863     vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
1864     vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
1865 
1866     const __m256 vmax = _mm256_load_ps(params->avx.max);
1867     vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
1868     vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
1869 
1870     if XNN_LIKELY(nc >= 16) {
1871       _mm256_storeu_ps(c0, vacc0x01234567);
1872       _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF);
1873       c0 = (float*) ((uintptr_t) c0 + cn_stride);
1874 
1875       a = (const float**restrict) ((uintptr_t) a - ks);
1876       nc -= 16;
1877     } else {
1878       if (nc & 8) {
1879         _mm256_storeu_ps(c0, vacc0x01234567);
1880 
1881         vacc0x01234567 = vacc0x89ABCDEF;
1882 
1883         c0 += 8;
1884       }
1885       __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
1886       if (nc & 4) {
1887         _mm_storeu_ps(c0, vacc0x0123);
1888 
1889         vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
1890 
1891         c0 += 4;
1892       }
1893       if (nc & 2) {
1894         _mm_storel_pi((__m64*) c0, vacc0x0123);
1895 
1896         vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
1897 
1898         c0 += 2;
1899       }
1900       if (nc & 1) {
1901         _mm_store_ss(c0, vacc0x0123);
1902       }
1903 
1904       nc = 0;
1905     }
1906   } while (nc != 0);
1907 }
1908 
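// F32 IGEMM microkernel producing a 5x16 output tile: same indirection scheme as the
// 1x16 variant, consuming five row pointers per indirection step (ks is asserted to
// be a multiple of 5 * sizeof(void*)).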
1909 void xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast(
1910     size_t mr,
1911     size_t nc,
1912     size_t kc,
1913     size_t ks,
1914     const float**restrict a,
1915     const float*restrict w,
1916     float*restrict c,
1917     size_t cm_stride,
1918     size_t cn_stride,
1919     size_t a_offset,
1920     const float* zero,
1921     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
1922 {
1923   assert(mr != 0);
1924   assert(mr <= 5);
1925   assert(nc != 0);
1926   assert(kc != 0);
1927   assert(kc % sizeof(float) == 0);
1928   assert(ks != 0);
1929   assert(ks % (5 * sizeof(void*)) == 0);
1930   assert(a_offset % sizeof(float) == 0);
1931   assert(a != NULL);
1932   assert(w != NULL);
1933   assert(c != NULL);
1934 
1935   float* c0 = c;
1936   float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
1937   if XNN_UNPREDICTABLE(mr < 2) {
1938     c1 = c0;
1939   }
1940   float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
1941   if XNN_UNPREDICTABLE(mr <= 2) {
1942     c2 = c1;
1943   }
1944   float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
1945   if XNN_UNPREDICTABLE(mr < 4) {
1946     c3 = c2;
1947   }
1948   float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
1949   if XNN_UNPREDICTABLE(mr <= 4) {
1950     c4 = c3;
1951   }
1952 
1953   do {
1954     __m256 vacc0x01234567 = _mm256_load_ps(w);
1955     __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8);
1956     __m256 vacc1x01234567 = vacc0x01234567;
1957     __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
1958     __m256 vacc2x01234567 = vacc0x01234567;
1959     __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
1960     __m256 vacc3x01234567 = vacc0x01234567;
1961     __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
1962     __m256 vacc4x01234567 = vacc0x01234567;
1963     __m256 vacc4x89ABCDEF = vacc0x89ABCDEF;
1964     w += 16;
1965 
1966     size_t p = ks;
1967     do {
1968       const float* restrict a0 = a[0];
1969       assert(a0 != NULL);
1970       if XNN_UNPREDICTABLE(a0 != zero) {
1971         a0 = (const float*) ((uintptr_t) a0 + a_offset);
1972       }
1973       const float* restrict a1 = a[1];
1974       assert(a1 != NULL);
1975       if XNN_UNPREDICTABLE(a1 != zero) {
1976         a1 = (const float*) ((uintptr_t) a1 + a_offset);
1977       }
1978       const float* restrict a2 = a[2];
1979       assert(a2 != NULL);
1980       if XNN_UNPREDICTABLE(a2 != zero) {
1981         a2 = (const float*) ((uintptr_t) a2 + a_offset);
1982       }
1983       const float* restrict a3 = a[3];
1984       assert(a3 != NULL);
1985       if XNN_UNPREDICTABLE(a3 != zero) {
1986         a3 = (const float*) ((uintptr_t) a3 + a_offset);
1987       }
1988       const float* restrict a4 = a[4];
1989       assert(a4 != NULL);
1990       if XNN_UNPREDICTABLE(a4 != zero) {
1991         a4 = (const float*) ((uintptr_t) a4 + a_offset);
1992       }
1993       a += 5;
1994 
1995       size_t k = kc;
1996       do {
1997         const __m256 vb01234567 = _mm256_load_ps(w);
1998         const __m256 vb89ABCDEF = _mm256_load_ps(w + 8);
1999         w += 16;
2000 
2001         const __m256 va0 = _mm256_broadcast_ss(a0);
2002         a0 += 1;
2003         const __m256 va1 = _mm256_broadcast_ss(a1);
2004         a1 += 1;
2005         const __m256 va2 = _mm256_broadcast_ss(a2);
2006         a2 += 1;
2007         const __m256 va3 = _mm256_broadcast_ss(a3);
2008         a3 += 1;
2009         const __m256 va4 = _mm256_broadcast_ss(a4);
2010         a4 += 1;
2011 
2012         vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
2013         vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF));
2014         vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
2015         vacc1x89ABCDEF = _mm256_add_ps(vacc1x89ABCDEF, _mm256_mul_ps(va1, vb89ABCDEF));
2016         vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
2017         vacc2x89ABCDEF = _mm256_add_ps(vacc2x89ABCDEF, _mm256_mul_ps(va2, vb89ABCDEF));
2018         vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
2019         vacc3x89ABCDEF = _mm256_add_ps(vacc3x89ABCDEF, _mm256_mul_ps(va3, vb89ABCDEF));
2020         vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
2021         vacc4x89ABCDEF = _mm256_add_ps(vacc4x89ABCDEF, _mm256_mul_ps(va4, vb89ABCDEF));
2022         k -= sizeof(float);
2023       } while (k != 0);
2024       p -= 5 * sizeof(void*);
2025     } while (p != 0);
2026 
2027     const __m256 vmin = _mm256_load_ps(params->avx.min);
2028     vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
2029     vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
2030     vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
2031     vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
2032     vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
2033     vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
2034     vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
2035     vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
2036     vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
2037     vacc4x89ABCDEF = _mm256_max_ps(vacc4x89ABCDEF, vmin);
2038 
2039     const __m256 vmax = _mm256_load_ps(params->avx.max);
2040     vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
2041     vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
2042     vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
2043     vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
2044     vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
2045     vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
2046     vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
2047     vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
2048     vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
2049     vacc4x89ABCDEF = _mm256_min_ps(vacc4x89ABCDEF, vmax);
2050 
2051     if XNN_LIKELY(nc >= 16) {
2052       _mm256_storeu_ps(c4, vacc4x01234567);
2053       _mm256_storeu_ps(c4 + 8, vacc4x89ABCDEF);
2054       c4 = (float*) ((uintptr_t) c4 + cn_stride);
2055       _mm256_storeu_ps(c3, vacc3x01234567);
2056       _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF);
2057       c3 = (float*) ((uintptr_t) c3 + cn_stride);
2058       _mm256_storeu_ps(c2, vacc2x01234567);
2059       _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF);
2060       c2 = (float*) ((uintptr_t) c2 + cn_stride);
2061       _mm256_storeu_ps(c1, vacc1x01234567);
2062       _mm256_storeu_ps(c1 + 8, vacc1x89ABCDEF);
2063       c1 = (float*) ((uintptr_t) c1 + cn_stride);
2064       _mm256_storeu_ps(c0, vacc0x01234567);
2065       _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF);
2066       c0 = (float*) ((uintptr_t) c0 + cn_stride);
2067 
2068       a = (const float**restrict) ((uintptr_t) a - ks);
2069       nc -= 16;
2070     } else {
2071       if (nc & 8) {
2072         _mm256_storeu_ps(c4, vacc4x01234567);
2073         _mm256_storeu_ps(c3, vacc3x01234567);
2074         _mm256_storeu_ps(c2, vacc2x01234567);
2075         _mm256_storeu_ps(c1, vacc1x01234567);
2076         _mm256_storeu_ps(c0, vacc0x01234567);
2077 
2078         vacc4x01234567 = vacc4x89ABCDEF;
2079         vacc3x01234567 = vacc3x89ABCDEF;
2080         vacc2x01234567 = vacc2x89ABCDEF;
2081         vacc1x01234567 = vacc1x89ABCDEF;
2082         vacc0x01234567 = vacc0x89ABCDEF;
2083 
2084         c4 += 8;
2085         c3 += 8;
2086         c2 += 8;
2087         c1 += 8;
2088         c0 += 8;
2089       }
2090       __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
2091       __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
2092       __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
2093       __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
2094       __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
2095       if (nc & 4) {
2096         _mm_storeu_ps(c4, vacc4x0123);
2097         _mm_storeu_ps(c3, vacc3x0123);
2098         _mm_storeu_ps(c2, vacc2x0123);
2099         _mm_storeu_ps(c1, vacc1x0123);
2100         _mm_storeu_ps(c0, vacc0x0123);
2101 
2102         vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
2103         vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
2104         vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
2105         vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
2106         vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
2107 
2108         c4 += 4;
2109         c3 += 4;
2110         c2 += 4;
2111         c1 += 4;
2112         c0 += 4;
2113       }
2114       if (nc & 2) {
2115         _mm_storel_pi((__m64*) c4, vacc4x0123);
2116         _mm_storel_pi((__m64*) c3, vacc3x0123);
2117         _mm_storel_pi((__m64*) c2, vacc2x0123);
2118         _mm_storel_pi((__m64*) c1, vacc1x0123);
2119         _mm_storel_pi((__m64*) c0, vacc0x0123);
2120 
2121         vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
2122         vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
2123         vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
2124         vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
2125         vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
2126 
2127         c4 += 2;
2128         c3 += 2;
2129         c2 += 2;
2130         c1 += 2;
2131         c0 += 2;
2132       }
2133       if (nc & 1) {
2134         _mm_store_ss(c4, vacc4x0123);
2135         _mm_store_ss(c3, vacc3x0123);
2136         _mm_store_ss(c2, vacc2x0123);
2137         _mm_store_ss(c1, vacc1x0123);
2138         _mm_store_ss(c0, vacc0x0123);
2139       }
2140 
2141       nc = 0;
2142     }
2143   } while (nc != 0);
2144 }
2145 
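// Sliding mask table for masked remainder handling: loading 8 consecutive entries
// starting at (uintptr_t) &mask_table[7] - c (with c the remaining byte count) yields
// an all-ones mask in exactly the first c / sizeof(float) lanes for _mm256_maskload_ps.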
2146 static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
2147 
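// PReLU kernel processing 2 rows x 16 channels per iteration. _mm256_blendv_ps uses
// the sign bit of the input as the selector, so negative lanes take input * weight
// while non-negative lanes pass through unchanged.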
2148 void xnn_f32_prelu_ukernel__avx_2x16(
2149     size_t rows,
2150     size_t channels,
2151     const float*restrict input,
2152     size_t input_stride,
2153     const float*restrict weights,
2154     float*restrict output,
2155     size_t output_stride)
2156 {
2157   assert(rows != 0);
2158   assert(channels != 0);
2159   assert(channels % sizeof(float) == 0);
2160 
2161   const float* i0 = input;
2162   float* o0 = output;
2163   const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
2164   float* o1 = (float*) ((uintptr_t) o0 + output_stride);
2165 
2166   const size_t input_increment = input_stride * 2 - channels;
2167   const size_t output_increment = output_stride * 2 - channels;
2168 
2169   do {
2170     if XNN_UNPREDICTABLE(rows < 2) {
2171       i1 = i0;
2172       o1 = o0;
2173     }
2174 
2175     const float* w = weights;
2176     size_t c = channels;
2177     for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) {
2178       const __m256 vw01234567 = _mm256_load_ps(w);
2179       const __m256 vw89ABCDEF = _mm256_load_ps(w + 8);
2180       w += 16;
2181 
2182       const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
2183       const __m256 vi0x89ABCDEF = _mm256_loadu_ps(i0 + 8);
2184       i0 += 16;
2185       const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
2186       const __m256 vi1x89ABCDEF = _mm256_loadu_ps(i1 + 8);
2187       i1 += 16;
2188 
2189       const __m256 vprod0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567);
2190       const __m256 vprod0x89ABCDEF = _mm256_mul_ps(vi0x89ABCDEF, vw89ABCDEF);
2191       const __m256 vprod1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567);
2192       const __m256 vprod1x89ABCDEF = _mm256_mul_ps(vi1x89ABCDEF, vw89ABCDEF);
2193 
2194       const __m256 vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vprod0x01234567, vi0x01234567);
2195       const __m256 vacc0x89ABCDEF = _mm256_blendv_ps(vi0x89ABCDEF, vprod0x89ABCDEF, vi0x89ABCDEF);
2196       const __m256 vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vprod1x01234567, vi1x01234567);
2197       const __m256 vacc1x89ABCDEF = _mm256_blendv_ps(vi1x89ABCDEF, vprod1x89ABCDEF, vi1x89ABCDEF);
2198 
2199       _mm256_storeu_ps(o0, vacc0x01234567);
2200       _mm256_storeu_ps(o0 + 8, vacc0x89ABCDEF);
2201       o0 += 16;
2202       _mm256_storeu_ps(o1, vacc1x01234567);
2203       _mm256_storeu_ps(o1 + 8, vacc1x89ABCDEF);
2204       o1 += 16;
2205     }
2206     for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) {
2207       const __m256 vw = _mm256_load_ps(w);
2208       w += 8;
2209 
2210       const __m256 vi0 = _mm256_loadu_ps(i0);
2211       i0 += 8;
2212       const __m256 vi1 = _mm256_loadu_ps(i1);
2213       i1 += 8;
2214 
2215       const __m256 vprod0 = _mm256_mul_ps(vi0, vw);
2216       const __m256 vprod1 = _mm256_mul_ps(vi1, vw);
2217 
2218       const __m256 vacc0 = _mm256_blendv_ps(vi0, vprod0, vi0);
2219       const __m256 vacc1 = _mm256_blendv_ps(vi1, vprod1, vi1);
2220 
2221       _mm256_storeu_ps(o0, vacc0);
2222       o0 += 8;
2223       _mm256_storeu_ps(o1, vacc1);
2224       o1 += 8;
2225     }
2226     if XNN_UNLIKELY(c != 0) {
2227       assert(c >= 1 * sizeof(float));
2228       assert(c <= 7 * sizeof(float));
2229       __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - c));
2230 
2231       const __m256 vw = _mm256_maskload_ps(w, vmask);
2232 
2233       const __m256 vi0 = _mm256_maskload_ps(i0, vmask);
2234       i0 = (const float*) ((uintptr_t) i0 + c);
2235       const __m256 vi1 = _mm256_maskload_ps(i1, vmask);
2236       i1 = (const float*) ((uintptr_t) i1 + c);
2237 
2238       const __m256 vprod0 = _mm256_mul_ps(vi0, vw);
2239       const __m256 vprod1 = _mm256_mul_ps(vi1, vw);
2240 
2241       __m256 vacc0 = _mm256_blendv_ps(vi0, vprod0, vi0);
2242       __m256 vacc1 = _mm256_blendv_ps(vi1, vprod1, vi1);
2243 
2244       __m128 vacc0_lo = _mm256_castps256_ps128(vacc0);
2245       __m128 vacc1_lo = _mm256_castps256_ps128(vacc1);
2246       if (c & (4 * sizeof(float))) {
2247         _mm_storeu_ps(o0, vacc0_lo);
2248         _mm_storeu_ps(o1, vacc1_lo);
2249 
2250         vacc0_lo = _mm256_extractf128_ps(vacc0, 1);
2251         vacc1_lo = _mm256_extractf128_ps(vacc1, 1);
2252 
2253         o0 += 4;
2254         o1 += 4;
2255       }
2256       if (c & (2 * sizeof(float))) {
2257         _mm_storel_pi((__m64*) o0, vacc0_lo);
2258         _mm_storel_pi((__m64*) o1, vacc1_lo);
2259 
2260         vacc0_lo = _mm_movehl_ps(vacc0_lo, vacc0_lo);
2261         vacc1_lo = _mm_movehl_ps(vacc1_lo, vacc1_lo);
2262 
2263         o0 += 2;
2264         o1 += 2;
2265       }
2266       if (c & (1 * sizeof(float))) {
2267         _mm_store_ss(o0, vacc0_lo);
2268         _mm_store_ss(o1, vacc1_lo);
2269 
2270         o0 += 1;
2271         o1 += 1;
2272       }
2273     }
2274     i0 = (const float*) ((uintptr_t) i0 + input_increment);
2275     o0 = (float*) ((uintptr_t) o0 + output_increment);
2276     i1 = (const float*) ((uintptr_t) i1 + input_increment);
2277     o1 = (float*) ((uintptr_t) o1 + output_increment);
2278     rows = doz(rows, 2);
2279   } while (rows != 0);
2280 }
2281 
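// F32 -> QS8 quantization kernel, 32 elements per main-loop iteration: scale, clamp
// against (qmax - zero_point) in float, convert to int32 with rounding, pack to int16
// with saturation, add the zero point, pack to int8, and clamp against qmin.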
2282 void xnn_f32_qs8_vcvt_ukernel__avx_x32(
2283     size_t n,
2284     const float* x,
2285     int8_t* y,
2286     const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
2287 {
2288   assert(n != 0);
2289   assert(n % sizeof(float) == 0);
2290   assert(x != NULL);
2291   assert(y != NULL);
2292 
2293   const __m256 vscale = _mm256_load_ps(params->avx.scale);
2294   const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
2295   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
2296   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
2297 
2298   for (; n >= 32 * sizeof(float); n -= 32 * sizeof(float)) {
2299     __m256 vx01234567 = _mm256_loadu_ps(x);
2300     __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
2301     __m256 vxGHIJKLMN = _mm256_loadu_ps(x + 16);
2302     __m256 vxOPQRSTUV = _mm256_loadu_ps(x + 24);
2303     x += 32;
2304 
2305     vx01234567 = _mm256_mul_ps(vx01234567, vscale);
2306     vx89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vscale);
2307     vxGHIJKLMN = _mm256_mul_ps(vxGHIJKLMN, vscale);
2308     vxOPQRSTUV = _mm256_mul_ps(vxOPQRSTUV, vscale);
2309 
2310     vx01234567 = _mm256_min_ps(vx01234567, voutput_max_less_zero_point);
2311     vx89ABCDEF = _mm256_min_ps(vx89ABCDEF, voutput_max_less_zero_point);
2312     vxGHIJKLMN = _mm256_min_ps(vxGHIJKLMN, voutput_max_less_zero_point);
2313     vxOPQRSTUV = _mm256_min_ps(vxOPQRSTUV, voutput_max_less_zero_point);
2314 
2315     const __m256i vacc01234567 = _mm256_cvtps_epi32(vx01234567);
2316     const __m256i vacc89ABCDEF = _mm256_cvtps_epi32(vx89ABCDEF);
2317     const __m256i vaccGHIJKLMN = _mm256_cvtps_epi32(vxGHIJKLMN);
2318     const __m256i vaccOPQRSTUV = _mm256_cvtps_epi32(vxOPQRSTUV);
2319 
2320     __m128i vy01234567 = _mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extractf128_si256(vacc01234567, 1));
2321     __m128i vy89ABCDEF = _mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extractf128_si256(vacc89ABCDEF, 1));
2322     __m128i vyGHIJKLMN = _mm_packs_epi32(_mm256_castsi256_si128(vaccGHIJKLMN), _mm256_extractf128_si256(vaccGHIJKLMN, 1));
2323     __m128i vyOPQRSTUV = _mm_packs_epi32(_mm256_castsi256_si128(vaccOPQRSTUV), _mm256_extractf128_si256(vaccOPQRSTUV, 1));
2324 
2325     vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
2326     vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
2327     vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point);
2328     vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point);
2329 
2330     __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF);
2331     __m128i vyGHIJKLMNOPQRSTUV = _mm_packs_epi16(vyGHIJKLMN, vyOPQRSTUV);
2332 
2333     vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min);
2334     vyGHIJKLMNOPQRSTUV = _mm_max_epi8(vyGHIJKLMNOPQRSTUV, voutput_min);
2335 
2336     _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
2337     _mm_storeu_si128((__m128i*) (y + 16), vyGHIJKLMNOPQRSTUV);
2338     y += 32;
2339   }
2340   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2341     __m256 vx = _mm256_loadu_ps(x);
2342     vx = _mm256_mul_ps(vx, vscale);
2343     vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
2344     x += 8;
2345 
2346     const __m256i vacc = _mm256_cvtps_epi32(vx);
2347 
2348     __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
2349     vy = _mm_adds_epi16(vy, voutput_zero_point);
2350     vy = _mm_packs_epi16(vy, vy);
2351     vy = _mm_max_epi8(vy, voutput_min);
2352 
2353     _mm_storel_epi64((__m128i*) y, vy);
2354     y += 8;
2355   }
2356   if XNN_UNLIKELY(n != 0) {
2357     assert(n >= 1 * sizeof(float));
2358     assert(n <= 7 * sizeof(float));
2359     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2360 
2361     __m256 vx = _mm256_maskload_ps(x, vmask);
2362     vx = _mm256_mul_ps(vx, vscale);
2363     vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
2364 
2365     const __m256i vacc = _mm256_cvtps_epi32(vx);
2366 
2367     __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
2368     vy = _mm_adds_epi16(vy, voutput_zero_point);
2369     vy = _mm_packs_epi16(vy, vy);
2370     vy = _mm_max_epi8(vy, voutput_min);
2371 
2372     if (n & (4 * sizeof(float))) {
2373       _mm_storeu_si32(y, vy);
2374       y += 4;
2375       vy = _mm_srli_epi64(vy, 32);
2376     }
2377     if (n & (2 * sizeof(float))) {
2378       _mm_storeu_si16(y, vy);
2379       y += 2;
2380       vy = _mm_srli_epi32(vy, 16);
2381     }
2382     if (n & (1 * sizeof(float))) {
2383       *y = (int8_t) _mm_extract_epi8(vy, 0);
2384     }
2385   }
2386 }
2387 
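// F32 -> QU8 quantization kernel: identical structure to the QS8 variant above, but
// packs with unsigned saturation (_mm_packus_epi16) and clamps with _mm_max_epu8.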
2388 void xnn_f32_qu8_vcvt_ukernel__avx_x32(
2389     size_t n,
2390     const float* x,
2391     uint8_t* y,
2392     const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
2393 {
2394   assert(n != 0);
2395   assert(n % sizeof(float) == 0);
2396   assert(x != NULL);
2397   assert(y != NULL);
2398 
2399   const __m256 vscale = _mm256_load_ps(params->avx.scale);
2400   const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
2401   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
2402   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
2403 
2404   for (; n >= 32 * sizeof(float); n -= 32 * sizeof(float)) {
2405     __m256 vx01234567 = _mm256_loadu_ps(x);
2406     __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
2407     __m256 vxGHIJKLMN = _mm256_loadu_ps(x + 16);
2408     __m256 vxOPQRSTUV = _mm256_loadu_ps(x + 24);
2409     x += 32;
2410 
2411     vx01234567 = _mm256_mul_ps(vx01234567, vscale);
2412     vx89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vscale);
2413     vxGHIJKLMN = _mm256_mul_ps(vxGHIJKLMN, vscale);
2414     vxOPQRSTUV = _mm256_mul_ps(vxOPQRSTUV, vscale);
2415 
2416     vx01234567 = _mm256_min_ps(vx01234567, voutput_max_less_zero_point);
2417     vx89ABCDEF = _mm256_min_ps(vx89ABCDEF, voutput_max_less_zero_point);
2418     vxGHIJKLMN = _mm256_min_ps(vxGHIJKLMN, voutput_max_less_zero_point);
2419     vxOPQRSTUV = _mm256_min_ps(vxOPQRSTUV, voutput_max_less_zero_point);
2420 
2421     const __m256i vacc01234567 = _mm256_cvtps_epi32(vx01234567);
2422     const __m256i vacc89ABCDEF = _mm256_cvtps_epi32(vx89ABCDEF);
2423     const __m256i vaccGHIJKLMN = _mm256_cvtps_epi32(vxGHIJKLMN);
2424     const __m256i vaccOPQRSTUV = _mm256_cvtps_epi32(vxOPQRSTUV);
2425 
2426     __m128i vy01234567 = _mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extractf128_si256(vacc01234567, 1));
2427     __m128i vy89ABCDEF = _mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extractf128_si256(vacc89ABCDEF, 1));
2428     __m128i vyGHIJKLMN = _mm_packs_epi32(_mm256_castsi256_si128(vaccGHIJKLMN), _mm256_extractf128_si256(vaccGHIJKLMN, 1));
2429     __m128i vyOPQRSTUV = _mm_packs_epi32(_mm256_castsi256_si128(vaccOPQRSTUV), _mm256_extractf128_si256(vaccOPQRSTUV, 1));
2430 
2431     vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
2432     vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
2433     vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point);
2434     vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point);
2435 
2436     __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF);
2437     __m128i vyGHIJKLMNOPQRSTUV = _mm_packus_epi16(vyGHIJKLMN, vyOPQRSTUV);
2438 
2439     vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min);
2440     vyGHIJKLMNOPQRSTUV = _mm_max_epu8(vyGHIJKLMNOPQRSTUV, voutput_min);
2441 
2442     _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
2443     _mm_storeu_si128((__m128i*) (y + 16), vyGHIJKLMNOPQRSTUV);
2444     y += 32;
2445   }
2446   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2447     __m256 vx = _mm256_loadu_ps(x);
2448     vx = _mm256_mul_ps(vx, vscale);
2449     vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
2450     x += 8;
2451 
2452     const __m256i vacc = _mm256_cvtps_epi32(vx);
2453 
2454     __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
2455     vy = _mm_adds_epi16(vy, voutput_zero_point);
2456     vy = _mm_packus_epi16(vy, vy);
2457     vy = _mm_max_epu8(vy, voutput_min);
2458 
2459     _mm_storel_epi64((__m128i*) y, vy);
2460     y += 8;
2461   }
2462   if XNN_UNLIKELY(n != 0) {
2463     assert(n >= 1 * sizeof(float));
2464     assert(n <= 7 * sizeof(float));
2465     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2466 
2467     __m256 vx = _mm256_maskload_ps(x, vmask);
2468     vx = _mm256_mul_ps(vx, vscale);
2469     vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
2470 
2471     const __m256i vacc = _mm256_cvtps_epi32(vx);
2472 
2473     __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
2474     vy = _mm_adds_epi16(vy, voutput_zero_point);
2475     vy = _mm_packus_epi16(vy, vy);
2476     vy = _mm_max_epu8(vy, voutput_min);
2477 
2478     if (n & (4 * sizeof(float))) {
2479       _mm_storeu_si32(y, vy);
2480       y += 4;
2481       vy = _mm_srli_epi64(vy, 32);
2482     }
2483     if (n & (2 * sizeof(float))) {
2484       _mm_storeu_si16(y, vy);
2485       y += 2;
2486       vy = _mm_srli_epi32(vy, 16);
2487     }
2488     if (n & (1 * sizeof(float))) {
2489       *y = (uint8_t) _mm_extract_epi8(vy, 0);
2490     }
2491   }
2492 }
2493 
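// Elementwise addition with output clamping: y[i] = clamp(a[i] + b[i], min, max),
// 16 floats per main-loop iteration, then an 8-wide loop and a masked tail.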
2494 void xnn_f32_vadd_minmax_ukernel__avx_x16(
2495     size_t n,
2496     const float* a,
2497     const float* b,
2498     float* y,
2499     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2500 {
2501   assert(n != 0);
2502   assert(n % sizeof(float) == 0);
2503   assert(a != NULL);
2504   assert(b != NULL);
2505   assert(y != NULL);
2506 
2507   const __m256 vy_min = _mm256_load_ps(params->avx.min);
2508   const __m256 vy_max = _mm256_load_ps(params->avx.max);
2509 
2510   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2511     const __m256 va01234567 = _mm256_loadu_ps(a);
2512     const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
2513     a += 16;
2514 
2515     const __m256 vb01234567 = _mm256_loadu_ps(b);
2516     const __m256 vb89ABCDEF = _mm256_loadu_ps(b + 8);
2517     b += 16;
2518 
2519     __m256 vy01234567 = _mm256_add_ps(va01234567, vb01234567);
2520     __m256 vy89ABCDEF = _mm256_add_ps(va89ABCDEF, vb89ABCDEF);
2521 
2522 
2523     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2524     vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
2525 
2526     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2527     vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
2528 
2529     _mm256_storeu_ps(y, vy01234567);
2530     _mm256_storeu_ps(y + 8, vy89ABCDEF);
2531     y += 16;
2532   }
2533   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2534     const __m256 va = _mm256_loadu_ps(a);
2535     a += 8;
2536 
2537     const __m256 vb = _mm256_loadu_ps(b);
2538     b += 8;
2539 
2540     __m256 vy = _mm256_add_ps(va, vb);
2541     vy = _mm256_max_ps(vy, vy_min);
2542     vy = _mm256_min_ps(vy, vy_max);
2543     _mm256_storeu_ps(y, vy);
2544     y += 8;
2545   }
2546   if XNN_UNLIKELY(n != 0) {
2547     assert(n >= 1 * sizeof(float));
2548     assert(n <= 7 * sizeof(float));
2549     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2550 
2551     const __m256 va = _mm256_maskload_ps(a, vmask);
2552     const __m256 vb = _mm256_maskload_ps(b, vmask);
2553 
2554     __m256 vy = _mm256_add_ps(va, vb);
2555     vy = _mm256_max_ps(vy, vy_min);
2556     vy = _mm256_min_ps(vy, vy_max);
2557 
2558     __m128 vy_lo = _mm256_castps256_ps128(vy);
2559     if (n & (4 * sizeof(float))) {
2560       _mm_storeu_ps(y, vy_lo);
2561       vy_lo = _mm256_extractf128_ps(vy, 1);
2562       y += 4;
2563     }
2564     if (n & (2 * sizeof(float))) {
2565       _mm_storel_pi((__m64*) y, vy_lo);
2566       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
2567       y += 2;
2568     }
2569     if (n & (1 * sizeof(float))) {
2570       _mm_store_ss(y, vy_lo);
2571     }
2572   }
2573 }
2574 
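// Elementwise addition of a broadcast scalar: y[i] = clamp(a[i] + *b, min, max).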
2575 void xnn_f32_vaddc_minmax_ukernel__avx_x16(
2576     size_t n,
2577     const float* a,
2578     const float* b,
2579     float* y,
2580     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2581 {
2582   assert(n != 0);
2583   assert(n % sizeof(float) == 0);
2584   assert(a != NULL);
2585   assert(b != NULL);
2586   assert(y != NULL);
2587 
2588   const __m256 vy_min = _mm256_load_ps(params->avx.min);
2589   const __m256 vy_max = _mm256_load_ps(params->avx.max);
2590 
2591   const __m256 vb = _mm256_broadcast_ss(b);
2592   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2593     const __m256 va01234567 = _mm256_loadu_ps(a);
2594     const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
2595     a += 16;
2596 
2597     __m256 vy01234567 = _mm256_add_ps(va01234567, vb);
2598     __m256 vy89ABCDEF = _mm256_add_ps(va89ABCDEF, vb);
2599 
2600 
2601     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2602     vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
2603 
2604     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2605     vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
2606 
2607     _mm256_storeu_ps(y, vy01234567);
2608     _mm256_storeu_ps(y + 8, vy89ABCDEF);
2609     y += 16;
2610   }
2611   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2612     const __m256 va = _mm256_loadu_ps(a);
2613     a += 8;
2614 
2615     __m256 vy = _mm256_add_ps(va, vb);
2616     vy = _mm256_max_ps(vy, vy_min);
2617     vy = _mm256_min_ps(vy, vy_max);
2618     _mm256_storeu_ps(y, vy);
2619     y += 8;
2620   }
2621   if XNN_UNLIKELY(n != 0) {
2622     assert(n >= 1 * sizeof(float));
2623     assert(n <= 7 * sizeof(float));
2624     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2625 
2626     const __m256 va = _mm256_maskload_ps(a, vmask);
2627 
2628     __m256 vy = _mm256_add_ps(va, vb);
2629     vy = _mm256_max_ps(vy, vy_min);
2630     vy = _mm256_min_ps(vy, vy_max);
2631 
2632     __m128 vy_lo = _mm256_castps256_ps128(vy);
2633     if (n & (4 * sizeof(float))) {
2634       _mm_storeu_ps(y, vy_lo);
2635       vy_lo = _mm256_extractf128_ps(vy, 1);
2636       y += 4;
2637     }
2638     if (n & (2 * sizeof(float))) {
2639       _mm_storel_pi((__m64*) y, vy_lo);
2640       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
2641       y += 2;
2642     }
2643     if (n & (1 * sizeof(float))) {
2644       _mm_store_ss(y, vy_lo);
2645     }
2646   }
2647 }
2648 
2649 void xnn_f32_vdiv_minmax_ukernel__avx_x16(
2650     size_t n,
2651     const float* a,
2652     const float* b,
2653     float* y,
2654     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2655 {
2656   assert(n != 0);
2657   assert(n % sizeof(float) == 0);
2658   assert(a != NULL);
2659   assert(b != NULL);
2660   assert(y != NULL);
2661 
2662   const __m256 vy_min = _mm256_load_ps(params->avx.min);
2663   const __m256 vy_max = _mm256_load_ps(params->avx.max);
2664 
2665   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2666     const __m256 va01234567 = _mm256_loadu_ps(a);
2667     const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
2668     a += 16;
2669 
2670     const __m256 vb01234567 = _mm256_loadu_ps(b);
2671     const __m256 vb89ABCDEF = _mm256_loadu_ps(b + 8);
2672     b += 16;
2673 
2674     __m256 vy01234567 = _mm256_div_ps(va01234567, vb01234567);
2675     __m256 vy89ABCDEF = _mm256_div_ps(va89ABCDEF, vb89ABCDEF);
2676 
2677 
2678     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2679     vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
2680 
2681     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2682     vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
2683 
2684     _mm256_storeu_ps(y, vy01234567);
2685     _mm256_storeu_ps(y + 8, vy89ABCDEF);
2686     y += 16;
2687   }
2688   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2689     const __m256 va = _mm256_loadu_ps(a);
2690     a += 8;
2691 
2692     const __m256 vb = _mm256_loadu_ps(b);
2693     b += 8;
2694 
2695     __m256 vy = _mm256_div_ps(va, vb);
2696     vy = _mm256_max_ps(vy, vy_min);
2697     vy = _mm256_min_ps(vy, vy_max);
2698     _mm256_storeu_ps(y, vy);
2699     y += 8;
2700   }
2701   if XNN_UNLIKELY(n != 0) {
2702     assert(n >= 1 * sizeof(float));
2703     assert(n <= 7 * sizeof(float));
2704     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2705 
2706     const __m256 va = _mm256_maskload_ps(a, vmask);
2707     const __m256 vb = _mm256_maskload_ps(b, vmask);
2708 
2709     __m256 vy = _mm256_div_ps(va, vb);
2710     vy = _mm256_max_ps(vy, vy_min);
2711     vy = _mm256_min_ps(vy, vy_max);
2712 
2713     __m128 vy_lo = _mm256_castps256_ps128(vy);
2714     if (n & (4 * sizeof(float))) {
2715       _mm_storeu_ps(y, vy_lo);
2716       vy_lo = _mm256_extractf128_ps(vy, 1);
2717       y += 4;
2718     }
2719     if (n & (2 * sizeof(float))) {
2720       _mm_storel_pi((__m64*) y, vy_lo);
2721       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
2722       y += 2;
2723     }
2724     if (n & (1 * sizeof(float))) {
2725       _mm_store_ss(y, vy_lo);
2726     }
2727   }
2728 }
2729 
2730 void xnn_f32_vdivc_minmax_ukernel__avx_x16(
2731     size_t n,
2732     const float* a,
2733     const float* b,
2734     float* y,
2735     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2736 {
2737   assert(n != 0);
2738   assert(n % sizeof(float) == 0);
2739   assert(a != NULL);
2740   assert(b != NULL);
2741   assert(y != NULL);
2742 
2743   const __m256 vy_min = _mm256_load_ps(params->avx.min);
2744   const __m256 vy_max = _mm256_load_ps(params->avx.max);
2745 
2746   const __m256 vb = _mm256_broadcast_ss(b);
2747   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2748     const __m256 va01234567 = _mm256_loadu_ps(a);
2749     const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
2750     a += 16;
2751 
2752     __m256 vy01234567 = _mm256_div_ps(va01234567, vb);
2753     __m256 vy89ABCDEF = _mm256_div_ps(va89ABCDEF, vb);
2754 
2755 
2756     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2757     vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
2758 
2759     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2760     vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
2761 
2762     _mm256_storeu_ps(y, vy01234567);
2763     _mm256_storeu_ps(y + 8, vy89ABCDEF);
2764     y += 16;
2765   }
2766   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2767     const __m256 va = _mm256_loadu_ps(a);
2768     a += 8;
2769 
2770     __m256 vy = _mm256_div_ps(va, vb);
2771     vy = _mm256_max_ps(vy, vy_min);
2772     vy = _mm256_min_ps(vy, vy_max);
2773     _mm256_storeu_ps(y, vy);
2774     y += 8;
2775   }
2776   if XNN_UNLIKELY(n != 0) {
2777     assert(n >= 1 * sizeof(float));
2778     assert(n <= 7 * sizeof(float));
2779     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2780 
2781     const __m256 va = _mm256_maskload_ps(a, vmask);
2782 
2783     __m256 vy = _mm256_div_ps(va, vb);
2784     vy = _mm256_max_ps(vy, vy_min);
2785     vy = _mm256_min_ps(vy, vy_max);
2786 
2787     __m128 vy_lo = _mm256_castps256_ps128(vy);
2788     if (n & (4 * sizeof(float))) {
2789       _mm_storeu_ps(y, vy_lo);
2790       vy_lo = _mm256_extractf128_ps(vy, 1);
2791       y += 4;
2792     }
2793     if (n & (2 * sizeof(float))) {
2794       _mm_storel_pi((__m64*) y, vy_lo);
2795       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
2796       y += 2;
2797     }
2798     if (n & (1 * sizeof(float))) {
2799       _mm_store_ss(y, vy_lo);
2800     }
2801   }
2802 }
2803 
2804 void xnn_f32_vmax_ukernel__avx_x16(
2805     size_t n,
2806     const float* a,
2807     const float* b,
2808     float* y,
2809     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
2810 {
2811   assert(n != 0);
2812   assert(n % sizeof(float) == 0);
2813   assert(a != NULL);
2814   assert(b != NULL);
2815   assert(y != NULL);
2816 
2817 
2818   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2819     const __m256 va01234567 = _mm256_loadu_ps(a);
2820     const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
2821     a += 16;
2822 
2823     const __m256 vb01234567 = _mm256_loadu_ps(b);
2824     const __m256 vb89ABCDEF = _mm256_loadu_ps(b + 8);
2825     b += 16;
2826 
2827     __m256 vy01234567 = _mm256_max_ps(va01234567, vb01234567);
2828     __m256 vy89ABCDEF = _mm256_max_ps(va89ABCDEF, vb89ABCDEF);
2829 
2830 
2831 
2832     _mm256_storeu_ps(y, vy01234567);
2833     _mm256_storeu_ps(y + 8, vy89ABCDEF);
2834     y += 16;
2835   }
2836   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2837     const __m256 va = _mm256_loadu_ps(a);
2838     a += 8;
2839 
2840     const __m256 vb = _mm256_loadu_ps(b);
2841     b += 8;
2842 
2843     __m256 vy = _mm256_max_ps(va, vb);
2844     _mm256_storeu_ps(y, vy);
2845     y += 8;
2846   }
2847   if XNN_UNLIKELY(n != 0) {
2848     assert(n >= 1 * sizeof(float));
2849     assert(n <= 7 * sizeof(float));
2850     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2851 
2852     const __m256 va = _mm256_maskload_ps(a, vmask);
2853     const __m256 vb = _mm256_maskload_ps(b, vmask);
2854 
2855     __m256 vy = _mm256_max_ps(va, vb);
2856 
2857     __m128 vy_lo = _mm256_castps256_ps128(vy);
2858     if (n & (4 * sizeof(float))) {
2859       _mm_storeu_ps(y, vy_lo);
2860       vy_lo = _mm256_extractf128_ps(vy, 1);
2861       y += 4;
2862     }
2863     if (n & (2 * sizeof(float))) {
2864       _mm_storel_pi((__m64*) y, vy_lo);
2865       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
2866       y += 2;
2867     }
2868     if (n & (1 * sizeof(float))) {
2869       _mm_store_ss(y, vy_lo);
2870     }
2871   }
2872 }
2873 
2874 void xnn_f32_vmaxc_ukernel__avx_x16(
2875     size_t n,
2876     const float* a,
2877     const float* b,
2878     float* y,
2879     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
2880 {
2881   assert(n != 0);
2882   assert(n % sizeof(float) == 0);
2883   assert(a != NULL);
2884   assert(b != NULL);
2885   assert(y != NULL);
2886 
2887 
2888   const __m256 vb = _mm256_broadcast_ss(b);
2889   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2890     const __m256 va01234567 = _mm256_loadu_ps(a);
2891     const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
2892     a += 16;
2893 
2894     __m256 vy01234567 = _mm256_max_ps(va01234567, vb);
2895     __m256 vy89ABCDEF = _mm256_max_ps(va89ABCDEF, vb);
2896 
2897 
2898 
2899     _mm256_storeu_ps(y, vy01234567);
2900     _mm256_storeu_ps(y + 8, vy89ABCDEF);
2901     y += 16;
2902   }
2903   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2904     const __m256 va = _mm256_loadu_ps(a);
2905     a += 8;
2906 
2907     __m256 vy = _mm256_max_ps(va, vb);
2908     _mm256_storeu_ps(y, vy);
2909     y += 8;
2910   }
2911   if XNN_UNLIKELY(n != 0) {
2912     assert(n >= 1 * sizeof(float));
2913     assert(n <= 7 * sizeof(float));
2914     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2915 
2916     const __m256 va = _mm256_maskload_ps(a, vmask);
2917 
2918     __m256 vy = _mm256_max_ps(va, vb);
2919 
2920     __m128 vy_lo = _mm256_castps256_ps128(vy);
2921     if (n & (4 * sizeof(float))) {
2922       _mm_storeu_ps(y, vy_lo);
2923       vy_lo = _mm256_extractf128_ps(vy, 1);
2924       y += 4;
2925     }
2926     if (n & (2 * sizeof(float))) {
2927       _mm_storel_pi((__m64*) y, vy_lo);
2928       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
2929       y += 2;
2930     }
2931     if (n & (1 * sizeof(float))) {
2932       _mm_store_ss(y, vy_lo);
2933     }
2934   }
2935 }
2936 
2937 void xnn_f32_vmin_ukernel__avx_x16(
2938     size_t n,
2939     const float* a,
2940     const float* b,
2941     float* y,
2942     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
2943 {
2944   assert(n != 0);
2945   assert(n % sizeof(float) == 0);
2946   assert(a != NULL);
2947   assert(b != NULL);
2948   assert(y != NULL);
2949 
2950 
2951   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2952     const __m256 va01234567 = _mm256_loadu_ps(a);
2953     const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
2954     a += 16;
2955 
2956     const __m256 vb01234567 = _mm256_loadu_ps(b);
2957     const __m256 vb89ABCDEF = _mm256_loadu_ps(b + 8);
2958     b += 16;
2959 
2960     __m256 vy01234567 = _mm256_min_ps(va01234567, vb01234567);
2961     __m256 vy89ABCDEF = _mm256_min_ps(va89ABCDEF, vb89ABCDEF);
2962 
2963 
2964 
2965     _mm256_storeu_ps(y, vy01234567);
2966     _mm256_storeu_ps(y + 8, vy89ABCDEF);
2967     y += 16;
2968   }
2969   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2970     const __m256 va = _mm256_loadu_ps(a);
2971     a += 8;
2972 
2973     const __m256 vb = _mm256_loadu_ps(b);
2974     b += 8;
2975 
2976     __m256 vy = _mm256_min_ps(va, vb);
2977     _mm256_storeu_ps(y, vy);
2978     y += 8;
2979   }
2980   if XNN_UNLIKELY(n != 0) {
2981     assert(n >= 1 * sizeof(float));
2982     assert(n <= 7 * sizeof(float));
2983     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2984 
2985     const __m256 va = _mm256_maskload_ps(a, vmask);
2986     const __m256 vb = _mm256_maskload_ps(b, vmask);
2987 
2988     __m256 vy = _mm256_min_ps(va, vb);
2989 
2990     __m128 vy_lo = _mm256_castps256_ps128(vy);
2991     if (n & (4 * sizeof(float))) {
2992       _mm_storeu_ps(y, vy_lo);
2993       vy_lo = _mm256_extractf128_ps(vy, 1);
2994       y += 4;
2995     }
2996     if (n & (2 * sizeof(float))) {
2997       _mm_storel_pi((__m64*) y, vy_lo);
2998       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
2999       y += 2;
3000     }
3001     if (n & (1 * sizeof(float))) {
3002       _mm_store_ss(y, vy_lo);
3003     }
3004   }
3005 }
3006 
3007 void xnn_f32_vminc_ukernel__avx_x16(
3008     size_t n,
3009     const float* a,
3010     const float* b,
3011     float* y,
3012     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
3013 {
3014   assert(n != 0);
3015   assert(n % sizeof(float) == 0);
3016   assert(a != NULL);
3017   assert(b != NULL);
3018   assert(y != NULL);
3019 
3020 
3021   const __m256 vb = _mm256_broadcast_ss(b);
3022   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3023     const __m256 va01234567 = _mm256_loadu_ps(a);
3024     const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3025     a += 16;
3026 
3027     __m256 vy01234567 = _mm256_min_ps(va01234567, vb);
3028     __m256 vy89ABCDEF = _mm256_min_ps(va89ABCDEF, vb);
3029 
3030 
3031 
3032     _mm256_storeu_ps(y, vy01234567);
3033     _mm256_storeu_ps(y + 8, vy89ABCDEF);
3034     y += 16;
3035   }
3036   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3037     const __m256 va = _mm256_loadu_ps(a);
3038     a += 8;
3039 
3040     __m256 vy = _mm256_min_ps(va, vb);
3041     _mm256_storeu_ps(y, vy);
3042     y += 8;
3043   }
3044   if XNN_UNLIKELY(n != 0) {
3045     assert(n >= 1 * sizeof(float));
3046     assert(n <= 7 * sizeof(float));
3047     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3048 
3049     const __m256 va = _mm256_maskload_ps(a, vmask);
3050 
3051     __m256 vy = _mm256_min_ps(va, vb);
3052 
3053     __m128 vy_lo = _mm256_castps256_ps128(vy);
3054     if (n & (4 * sizeof(float))) {
3055       _mm_storeu_ps(y, vy_lo);
3056       vy_lo = _mm256_extractf128_ps(vy, 1);
3057       y += 4;
3058     }
3059     if (n & (2 * sizeof(float))) {
3060       _mm_storel_pi((__m64*) y, vy_lo);
3061       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3062       y += 2;
3063     }
3064     if (n & (1 * sizeof(float))) {
3065       _mm_store_ss(y, vy_lo);
3066     }
3067   }
3068 }
3069 
3070 void xnn_f32_vmul_minmax_ukernel__avx_x16(
3071     size_t n,
3072     const float* a,
3073     const float* b,
3074     float* y,
3075     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3076 {
3077   assert(n != 0);
3078   assert(n % sizeof(float) == 0);
3079   assert(a != NULL);
3080   assert(b != NULL);
3081   assert(y != NULL);
3082 
3083   const __m256 vy_min = _mm256_load_ps(params->avx.min);
3084   const __m256 vy_max = _mm256_load_ps(params->avx.max);
3085 
3086   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3087     const __m256 va01234567 = _mm256_loadu_ps(a);
3088     const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3089     a += 16;
3090 
3091     const __m256 vb01234567 = _mm256_loadu_ps(b);
3092     const __m256 vb89ABCDEF = _mm256_loadu_ps(b + 8);
3093     b += 16;
3094 
3095     __m256 vy01234567 = _mm256_mul_ps(va01234567, vb01234567);
3096     __m256 vy89ABCDEF = _mm256_mul_ps(va89ABCDEF, vb89ABCDEF);
3097 
3098 
3099     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
3100     vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
3101 
3102     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
3103     vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
3104 
3105     _mm256_storeu_ps(y, vy01234567);
3106     _mm256_storeu_ps(y + 8, vy89ABCDEF);
3107     y += 16;
3108   }
3109   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3110     const __m256 va = _mm256_loadu_ps(a);
3111     a += 8;
3112 
3113     const __m256 vb = _mm256_loadu_ps(b);
3114     b += 8;
3115 
3116     __m256 vy = _mm256_mul_ps(va, vb);
3117     vy = _mm256_max_ps(vy, vy_min);
3118     vy = _mm256_min_ps(vy, vy_max);
3119     _mm256_storeu_ps(y, vy);
3120     y += 8;
3121   }
3122   if XNN_UNLIKELY(n != 0) {
3123     assert(n >= 1 * sizeof(float));
3124     assert(n <= 7 * sizeof(float));
3125     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3126 
3127     const __m256 va = _mm256_maskload_ps(a, vmask);
3128     const __m256 vb = _mm256_maskload_ps(b, vmask);
3129 
3130     __m256 vy = _mm256_mul_ps(va, vb);
3131     vy = _mm256_max_ps(vy, vy_min);
3132     vy = _mm256_min_ps(vy, vy_max);
3133 
3134     __m128 vy_lo = _mm256_castps256_ps128(vy);
3135     if (n & (4 * sizeof(float))) {
3136       _mm_storeu_ps(y, vy_lo);
3137       vy_lo = _mm256_extractf128_ps(vy, 1);
3138       y += 4;
3139     }
3140     if (n & (2 * sizeof(float))) {
3141       _mm_storel_pi((__m64*) y, vy_lo);
3142       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3143       y += 2;
3144     }
3145     if (n & (1 * sizeof(float))) {
3146       _mm_store_ss(y, vy_lo);
3147     }
3148   }
3149 }
3150 
3151 void xnn_f32_vmulc_minmax_ukernel__avx_x16(
3152     size_t n,
3153     const float* a,
3154     const float* b,
3155     float* y,
3156     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3157 {
3158   assert(n != 0);
3159   assert(n % sizeof(float) == 0);
3160   assert(a != NULL);
3161   assert(b != NULL);
3162   assert(y != NULL);
3163 
3164   const __m256 vy_min = _mm256_load_ps(params->avx.min);
3165   const __m256 vy_max = _mm256_load_ps(params->avx.max);
3166 
3167   const __m256 vb = _mm256_broadcast_ss(b);
3168   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3169     const __m256 va01234567 = _mm256_loadu_ps(a);
3170     const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3171     a += 16;
3172 
3173     __m256 vy01234567 = _mm256_mul_ps(va01234567, vb);
3174     __m256 vy89ABCDEF = _mm256_mul_ps(va89ABCDEF, vb);
3175 
3176 
3177     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
3178     vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
3179 
3180     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
3181     vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
3182 
3183     _mm256_storeu_ps(y, vy01234567);
3184     _mm256_storeu_ps(y + 8, vy89ABCDEF);
3185     y += 16;
3186   }
3187   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3188     const __m256 va = _mm256_loadu_ps(a);
3189     a += 8;
3190 
3191     __m256 vy = _mm256_mul_ps(va, vb);
3192     vy = _mm256_max_ps(vy, vy_min);
3193     vy = _mm256_min_ps(vy, vy_max);
3194     _mm256_storeu_ps(y, vy);
3195     y += 8;
3196   }
3197   if XNN_UNLIKELY(n != 0) {
3198     assert(n >= 1 * sizeof(float));
3199     assert(n <= 7 * sizeof(float));
3200     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3201 
3202     const __m256 va = _mm256_maskload_ps(a, vmask);
3203 
3204     __m256 vy = _mm256_mul_ps(va, vb);
3205     vy = _mm256_max_ps(vy, vy_min);
3206     vy = _mm256_min_ps(vy, vy_max);
3207 
3208     __m128 vy_lo = _mm256_castps256_ps128(vy);
3209     if (n & (4 * sizeof(float))) {
3210       _mm_storeu_ps(y, vy_lo);
3211       vy_lo = _mm256_extractf128_ps(vy, 1);
3212       y += 4;
3213     }
3214     if (n & (2 * sizeof(float))) {
3215       _mm_storel_pi((__m64*) y, vy_lo);
3216       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3217       y += 2;
3218     }
3219     if (n & (1 * sizeof(float))) {
3220       _mm_store_ss(y, vy_lo);
3221     }
3222   }
3223 }
3224 
3225 void xnn_f32_vrdivc_minmax_ukernel__avx_x16(
3226     size_t n,
3227     const float* a,
3228     const float* b,
3229     float* y,
3230     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3231 {
3232   assert(n != 0);
3233   assert(n % sizeof(float) == 0);
3234   assert(a != NULL);
3235   assert(b != NULL);
3236   assert(y != NULL);
3237 
3238   const __m256 vy_min = _mm256_load_ps(params->avx.min);
3239   const __m256 vy_max = _mm256_load_ps(params->avx.max);
3240 
3241   const __m256 vb = _mm256_broadcast_ss(b);
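  // "rdivc": reversed operands relative to vdivc - the broadcast scalar b is the numerator.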
3242   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3243     const __m256 va01234567 = _mm256_loadu_ps(a);
3244     const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3245     a += 16;
3246 
3247     __m256 vy01234567 = _mm256_div_ps(vb, va01234567);
3248     __m256 vy89ABCDEF = _mm256_div_ps(vb, va89ABCDEF);
3249 
3250 
3251     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
3252     vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
3253 
3254     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
3255     vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
3256 
3257     _mm256_storeu_ps(y, vy01234567);
3258     _mm256_storeu_ps(y + 8, vy89ABCDEF);
3259     y += 16;
3260   }
3261   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3262     const __m256 va = _mm256_loadu_ps(a);
3263     a += 8;
3264 
3265     __m256 vy = _mm256_div_ps(vb, va);
3266     vy = _mm256_max_ps(vy, vy_min);
3267     vy = _mm256_min_ps(vy, vy_max);
3268     _mm256_storeu_ps(y, vy);
3269     y += 8;
3270   }
3271   if XNN_UNLIKELY(n != 0) {
3272     assert(n >= 1 * sizeof(float));
3273     assert(n <= 7 * sizeof(float));
3274     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3275 
3276     const __m256 va = _mm256_maskload_ps(a, vmask);
3277 
3278     __m256 vy = _mm256_div_ps(vb, va);
3279     vy = _mm256_max_ps(vy, vy_min);
3280     vy = _mm256_min_ps(vy, vy_max);
3281 
3282     __m128 vy_lo = _mm256_castps256_ps128(vy);
3283     if (n & (4 * sizeof(float))) {
3284       _mm_storeu_ps(y, vy_lo);
3285       vy_lo = _mm256_extractf128_ps(vy, 1);
3286       y += 4;
3287     }
3288     if (n & (2 * sizeof(float))) {
3289       _mm_storel_pi((__m64*) y, vy_lo);
3290       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3291       y += 2;
3292     }
3293     if (n & (1 * sizeof(float))) {
3294       _mm_store_ss(y, vy_lo);
3295     }
3296   }
3297 }
3298 
3299 void xnn_f32_vrsubc_minmax_ukernel__avx_x16(
3300     size_t n,
3301     const float* a,
3302     const float* b,
3303     float* y,
3304     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3305 {
3306   assert(n != 0);
3307   assert(n % sizeof(float) == 0);
3308   assert(a != NULL);
3309   assert(b != NULL);
3310   assert(y != NULL);
3311 
3312   const __m256 vy_min = _mm256_load_ps(params->avx.min);
3313   const __m256 vy_max = _mm256_load_ps(params->avx.max);
3314 
3315   const __m256 vb = _mm256_broadcast_ss(b);
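  // "rsubc": reversed subtraction - computes b - a[i] with the broadcast scalar as the minuend.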
3316   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3317     const __m256 va01234567 = _mm256_loadu_ps(a);
3318     const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3319     a += 16;
3320 
3321     __m256 vy01234567 = _mm256_sub_ps(vb, va01234567);
3322     __m256 vy89ABCDEF = _mm256_sub_ps(vb, va89ABCDEF);
3323 
3324 
3325     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
3326     vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
3327 
3328     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
3329     vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
3330 
3331     _mm256_storeu_ps(y, vy01234567);
3332     _mm256_storeu_ps(y + 8, vy89ABCDEF);
3333     y += 16;
3334   }
3335   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3336     const __m256 va = _mm256_loadu_ps(a);
3337     a += 8;
3338 
3339     __m256 vy = _mm256_sub_ps(vb, va);
3340     vy = _mm256_max_ps(vy, vy_min);
3341     vy = _mm256_min_ps(vy, vy_max);
3342     _mm256_storeu_ps(y, vy);
3343     y += 8;
3344   }
3345   if XNN_UNLIKELY(n != 0) {
3346     assert(n >= 1 * sizeof(float));
3347     assert(n <= 7 * sizeof(float));
3348     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3349 
3350     const __m256 va = _mm256_maskload_ps(a, vmask);
3351 
3352     __m256 vy = _mm256_sub_ps(vb, va);
3353     vy = _mm256_max_ps(vy, vy_min);
3354     vy = _mm256_min_ps(vy, vy_max);
3355 
3356     __m128 vy_lo = _mm256_castps256_ps128(vy);
3357     if (n & (4 * sizeof(float))) {
3358       _mm_storeu_ps(y, vy_lo);
3359       vy_lo = _mm256_extractf128_ps(vy, 1);
3360       y += 4;
3361     }
3362     if (n & (2 * sizeof(float))) {
3363       _mm_storel_pi((__m64*) y, vy_lo);
3364       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3365       y += 2;
3366     }
3367     if (n & (1 * sizeof(float))) {
3368       _mm_store_ss(y, vy_lo);
3369     }
3370   }
3371 }
3372 
3373 void xnn_f32_vsqrdiff_ukernel__avx_x16(
3374     size_t n,
3375     const float* a,
3376     const float* b,
3377     float* y,
3378     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
3379 {
3380   assert(n != 0);
3381   assert(n % sizeof(float) == 0);
3382   assert(a != NULL);
3383   assert(b != NULL);
3384   assert(y != NULL);
3385 
3386 
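  // Squared difference: y[i] = (a[i] - b[i])^2; no min/max clamping in this variant.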
3387   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3388     const __m256 va01234567 = _mm256_loadu_ps(a);
3389     const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3390     a += 16;
3391 
3392     const __m256 vb01234567 = _mm256_loadu_ps(b);
3393     const __m256 vb89ABCDEF = _mm256_loadu_ps(b + 8);
3394     b += 16;
3395 
3396     __m256 vy01234567 = _mm256_sub_ps(va01234567, vb01234567);
3397     __m256 vy89ABCDEF = _mm256_sub_ps(va89ABCDEF, vb89ABCDEF);
3398 
3399     vy01234567 = _mm256_mul_ps(vy01234567, vy01234567);
3400     vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vy89ABCDEF);
3401 
3402 
3403     _mm256_storeu_ps(y, vy01234567);
3404     _mm256_storeu_ps(y + 8, vy89ABCDEF);
3405     y += 16;
3406   }
3407   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3408     const __m256 va = _mm256_loadu_ps(a);
3409     a += 8;
3410 
3411     const __m256 vb = _mm256_loadu_ps(b);
3412     b += 8;
3413 
3414     __m256 vy = _mm256_sub_ps(va, vb);
3415     vy = _mm256_mul_ps(vy, vy);
3416     _mm256_storeu_ps(y, vy);
3417     y += 8;
3418   }
3419   if XNN_UNLIKELY(n != 0) {
3420     assert(n >= 1 * sizeof(float));
3421     assert(n <= 7 * sizeof(float));
3422     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3423 
3424     const __m256 va = _mm256_maskload_ps(a, vmask);
3425     const __m256 vb = _mm256_maskload_ps(b, vmask);
3426 
3427     __m256 vy = _mm256_sub_ps(va, vb);
3428     vy = _mm256_mul_ps(vy, vy);
3429 
3430     __m128 vy_lo = _mm256_castps256_ps128(vy);
3431     if (n & (4 * sizeof(float))) {
3432       _mm_storeu_ps(y, vy_lo);
3433       vy_lo = _mm256_extractf128_ps(vy, 1);
3434       y += 4;
3435     }
3436     if (n & (2 * sizeof(float))) {
3437       _mm_storel_pi((__m64*) y, vy_lo);
3438       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3439       y += 2;
3440     }
3441     if (n & (1 * sizeof(float))) {
3442       _mm_store_ss(y, vy_lo);
3443     }
3444   }
3445 }
3446 
3447 void xnn_f32_vsqrdiffc_ukernel__avx_x16(
3448     size_t n,
3449     const float* a,
3450     const float* b,
3451     float* y,
3452     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
3453 {
3454   assert(n != 0);
3455   assert(n % sizeof(float) == 0);
3456   assert(a != NULL);
3457   assert(b != NULL);
3458   assert(y != NULL);
3459 
3460 
3461   const __m256 vb = _mm256_broadcast_ss(b);
3462   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3463     const __m256 va01234567 = _mm256_loadu_ps(a);
3464     const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3465     a += 16;
3466 
3467     __m256 vy01234567 = _mm256_sub_ps(va01234567, vb);
3468     __m256 vy89ABCDEF = _mm256_sub_ps(va89ABCDEF, vb);
3469 
3470     vy01234567 = _mm256_mul_ps(vy01234567, vy01234567);
3471     vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vy89ABCDEF);
3472 
3473 
3474     _mm256_storeu_ps(y, vy01234567);
3475     _mm256_storeu_ps(y + 8, vy89ABCDEF);
3476     y += 16;
3477   }
3478   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3479     const __m256 va = _mm256_loadu_ps(a);
3480     a += 8;
3481 
3482     __m256 vy = _mm256_sub_ps(va, vb);
3483     vy = _mm256_mul_ps(vy, vy);
3484     _mm256_storeu_ps(y, vy);
3485     y += 8;
3486   }
3487   if XNN_UNLIKELY(n != 0) {
3488     assert(n >= 1 * sizeof(float));
3489     assert(n <= 7 * sizeof(float));
3490     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3491 
3492     const __m256 va = _mm256_maskload_ps(a, vmask);
3493 
3494     __m256 vy = _mm256_sub_ps(va, vb);
3495     vy = _mm256_mul_ps(vy, vy);
3496 
3497     __m128 vy_lo = _mm256_castps256_ps128(vy);
3498     if (n & (4 * sizeof(float))) {
3499       _mm_storeu_ps(y, vy_lo);
3500       vy_lo = _mm256_extractf128_ps(vy, 1);
3501       y += 4;
3502     }
3503     if (n & (2 * sizeof(float))) {
3504       _mm_storel_pi((__m64*) y, vy_lo);
3505       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3506       y += 2;
3507     }
3508     if (n & (1 * sizeof(float))) {
3509       _mm_store_ss(y, vy_lo);
3510     }
3511   }
3512 }
3513 
3514 void xnn_f32_vsub_minmax_ukernel__avx_x16(
3515     size_t n,
3516     const float* a,
3517     const float* b,
3518     float* y,
3519     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3520 {
3521   assert(n != 0);
3522   assert(n % sizeof(float) == 0);
3523   assert(a != NULL);
3524   assert(b != NULL);
3525   assert(y != NULL);
3526 
3527   const __m256 vy_min = _mm256_load_ps(params->avx.min);
3528   const __m256 vy_max = _mm256_load_ps(params->avx.max);
3529 
3530   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3531     const __m256 va01234567 = _mm256_loadu_ps(a);
3532     const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3533     a += 16;
3534 
3535     const __m256 vb01234567 = _mm256_loadu_ps(b);
3536     const __m256 vb89ABCDEF = _mm256_loadu_ps(b + 8);
3537     b += 16;
3538 
3539     __m256 vy01234567 = _mm256_sub_ps(va01234567, vb01234567);
3540     __m256 vy89ABCDEF = _mm256_sub_ps(va89ABCDEF, vb89ABCDEF);
3541 
3542 
3543     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
3544     vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
3545 
3546     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
3547     vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
3548 
3549     _mm256_storeu_ps(y, vy01234567);
3550     _mm256_storeu_ps(y + 8, vy89ABCDEF);
3551     y += 16;
3552   }
3553   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3554     const __m256 va = _mm256_loadu_ps(a);
3555     a += 8;
3556 
3557     const __m256 vb = _mm256_loadu_ps(b);
3558     b += 8;
3559 
3560     __m256 vy = _mm256_sub_ps(va, vb);
3561     vy = _mm256_max_ps(vy, vy_min);
3562     vy = _mm256_min_ps(vy, vy_max);
3563     _mm256_storeu_ps(y, vy);
3564     y += 8;
3565   }
3566   if XNN_UNLIKELY(n != 0) {
3567     assert(n >= 1 * sizeof(float));
3568     assert(n <= 7 * sizeof(float));
3569     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3570 
3571     const __m256 va = _mm256_maskload_ps(a, vmask);
3572     const __m256 vb = _mm256_maskload_ps(b, vmask);
3573 
3574     __m256 vy = _mm256_sub_ps(va, vb);
3575     vy = _mm256_max_ps(vy, vy_min);
3576     vy = _mm256_min_ps(vy, vy_max);
3577 
3578     __m128 vy_lo = _mm256_castps256_ps128(vy);
3579     if (n & (4 * sizeof(float))) {
3580       _mm_storeu_ps(y, vy_lo);
3581       vy_lo = _mm256_extractf128_ps(vy, 1);
3582       y += 4;
3583     }
3584     if (n & (2 * sizeof(float))) {
3585       _mm_storel_pi((__m64*) y, vy_lo);
3586       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3587       y += 2;
3588     }
3589     if (n & (1 * sizeof(float))) {
3590       _mm_store_ss(y, vy_lo);
3591     }
3592   }
3593 }
3594 
3595 void xnn_f32_vsubc_minmax_ukernel__avx_x16(
3596     size_t n,
3597     const float* a,
3598     const float* b,
3599     float* y,
3600     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3601 {
3602   assert(n != 0);
3603   assert(n % sizeof(float) == 0);
3604   assert(a != NULL);
3605   assert(b != NULL);
3606   assert(y != NULL);
3607 
3608   const __m256 vy_min = _mm256_load_ps(params->avx.min);
3609   const __m256 vy_max = _mm256_load_ps(params->avx.max);
3610 
3611   const __m256 vb = _mm256_broadcast_ss(b);
3612   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3613     const __m256 va01234567 = _mm256_loadu_ps(a);
3614     const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3615     a += 16;
3616 
3617     __m256 vy01234567 = _mm256_sub_ps(va01234567, vb);
3618     __m256 vy89ABCDEF = _mm256_sub_ps(va89ABCDEF, vb);
3619 
3620 
3621     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
3622     vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
3623 
3624     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
3625     vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
3626 
3627     _mm256_storeu_ps(y, vy01234567);
3628     _mm256_storeu_ps(y + 8, vy89ABCDEF);
3629     y += 16;
3630   }
3631   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3632     const __m256 va = _mm256_loadu_ps(a);
3633     a += 8;
3634 
3635     __m256 vy = _mm256_sub_ps(va, vb);
3636     vy = _mm256_max_ps(vy, vy_min);
3637     vy = _mm256_min_ps(vy, vy_max);
3638     _mm256_storeu_ps(y, vy);
3639     y += 8;
3640   }
3641   if XNN_UNLIKELY(n != 0) {
3642     assert(n >= 1 * sizeof(float));
3643     assert(n <= 7 * sizeof(float));
3644     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3645 
3646     const __m256 va = _mm256_maskload_ps(a, vmask);
3647 
3648     __m256 vy = _mm256_sub_ps(va, vb);
3649     vy = _mm256_max_ps(vy, vy_min);
3650     vy = _mm256_min_ps(vy, vy_max);
3651 
3652     __m128 vy_lo = _mm256_castps256_ps128(vy);
3653     if (n & (4 * sizeof(float))) {
3654       _mm_storeu_ps(y, vy_lo);
3655       vy_lo = _mm256_extractf128_ps(vy, 1);
3656       y += 4;
3657     }
3658     if (n & (2 * sizeof(float))) {
3659       _mm_storel_pi((__m64*) y, vy_lo);
3660       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3661       y += 2;
3662     }
3663     if (n & (1 * sizeof(float))) {
3664       _mm_store_ss(y, vy_lo);
3665     }
3666   }
3667 }
3668 
3669 void xnn_f32_vclamp_ukernel__avx_x16(
3670     size_t n,
3671     const float* x,
3672     float* y,
3673     const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3674 {
3675   assert(n != 0);
3676   assert(n % sizeof(float) == 0);
3677   assert(x != NULL);
3678   assert(y != NULL);
3679 
3680   const __m256 vy_min = _mm256_load_ps(params->avx.min);
3681   const __m256 vy_max = _mm256_load_ps(params->avx.max);
3682 
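  // Clamp: y[i] = min(max(x[i], y_min), y_max) with the bounds preloaded from params.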
3683   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3684     __m256 vacc01234567 = _mm256_loadu_ps(x);
3685     __m256 vacc89ABCDEF = _mm256_loadu_ps(x + 8);
3686     x += 16;
3687 
3688     vacc01234567 = _mm256_max_ps(vacc01234567, vy_min);
3689     vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEF, vy_min);
3690 
3691     vacc01234567 = _mm256_min_ps(vacc01234567, vy_max);
3692     vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vy_max);
3693 
3694     _mm256_storeu_ps(y, vacc01234567);
3695     _mm256_storeu_ps(y + 8, vacc89ABCDEF);
3696     y += 16;
3697   }
3698   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3699     __m256 vacc = _mm256_loadu_ps(x);
3700     x += 8;
3701 
3702     vacc = _mm256_max_ps(vacc, vy_min);
3703     vacc = _mm256_min_ps(vacc, vy_max);
3704 
3705     _mm256_storeu_ps(y, vacc);
3706     y += 8;
3707   }
3708   if XNN_UNLIKELY(n != 0) {
3709     assert(n >= 1 * sizeof(float));
3710     assert(n <= 7 * sizeof(float));
3711     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3712 
3713     __m256 vacc = _mm256_maskload_ps(x, vmask);
3714     vacc = _mm256_max_ps(vacc, vy_min);
3715     vacc = _mm256_min_ps(vacc, vy_max);
3716 
3717     __m128 vacc_lo = _mm256_castps256_ps128(vacc);
3718     if (n & (4 * sizeof(float))) {
3719       _mm_storeu_ps(y, vacc_lo);
3720       vacc_lo = _mm256_extractf128_ps(vacc, 1);
3721       y += 4;
3722     }
3723     if (n & (2 * sizeof(float))) {
3724       _mm_storel_pi((__m64*) y, vacc_lo);
3725       vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo);
3726       y += 2;
3727     }
3728     if (n & (1 * sizeof(float))) {
3729       _mm_store_ss(y, vacc_lo);
3730     }
3731   }
3732 }
3733 
3734 void xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32(
3735     size_t n,
3736     const float* x,
3737     float* y,
3738     const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)])
3739 {
3740   assert(n % sizeof(float) == 0);
3741 
3742   const __m256 vprescale = _mm256_load_ps(params->avx_rr2_lut4_p4.prescale);
3743   const __m256 valpha = _mm256_load_ps(params->avx_rr2_lut4_p4.alpha);
3744   const __m256 vbeta = _mm256_load_ps(params->avx_rr2_lut4_p4.beta);
3745   const __m256 vsat_cutoff = _mm256_load_ps(params->avx_rr2_lut4_p4.sat_cutoff);
3746   const __m256 vmagic_bias = _mm256_load_ps(params->avx_rr2_lut4_p4.magic_bias);
3747   const __m256 vlog2e = _mm256_load_ps(params->avx_rr2_lut4_p4.log2e);
3748   const __m256 vindex_mask = _mm256_load_ps((const float*) params->avx_rr2_lut4_p4.index_mask);
3749   const __m256 vtable = _mm256_load_ps(params->avx_rr2_lut4_p4.table);
3750   const __m256 vminus_ln2_hi = _mm256_load_ps(params->avx_rr2_lut4_p4.minus_ln2_hi);
3751   const __m256 vminus_ln2_lo = _mm256_load_ps(params->avx_rr2_lut4_p4.minus_ln2_lo);
3752   const __m256 vc4 = _mm256_load_ps(params->avx_rr2_lut4_p4.c4);
3753   const __m256 vc3 = _mm256_load_ps(params->avx_rr2_lut4_p4.c3);
3754   const __m256 vc2 = _mm256_load_ps(params->avx_rr2_lut4_p4.c2);
3755   const __m256 vone = _mm256_load_ps(params->avx_rr2_lut4_p4.one);
3756 
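  // ELU with two-word ("rr2") range reduction and a 4-entry lookup table:
  //   z = max(prescale*x, sat_cutoff); n = z*log2e + magic_bias (log2e pre-scaled for the
  //   4-entry table). The two low bits of n select 2^(i/4) via _mm256_permutevar_ps, the
  //   higher bits are shifted by 21 into the float exponent field, and their product gives
  //   the scale s. A degree-4 polynomial in the reduced argument t then reconstructs
  //   exp(z) ~= s*(1 + t + t*p), so y = alpha*(exp(z) - 1) for x <= 0 and y = beta*x
  //   otherwise, selected with a sign-bit blend.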
3757   for (; n >= 32 * sizeof(float); n -= 32 * sizeof(float)) {
3758     __m256 vx0 = _mm256_loadu_ps(x);
3759     __m256 vx1 = _mm256_loadu_ps(x + 8);
3760     __m256 vx2 = _mm256_loadu_ps(x + 16);
3761     __m256 vx3 = _mm256_loadu_ps(x + 24);
3762     x += 32;
3763 
3764     const __m256 vz0 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx0, vprescale));
3765     const __m256 vz1 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx1, vprescale));
3766     const __m256 vz2 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx2, vprescale));
3767     const __m256 vz3 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx3, vprescale));
3768 
3769     __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vz0, vlog2e), vmagic_bias);
3770     __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vz1, vlog2e), vmagic_bias);
3771     __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vz2, vlog2e), vmagic_bias);
3772     __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vz3, vlog2e), vmagic_bias);
3773 
3774     __m256 ven0 = _mm256_andnot_ps(vindex_mask, vn0);
3775     const __m256 vl0 = _mm256_permutevar_ps(vtable, _mm256_castps_si256(vn0));
3776     const __m128 ven0_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven0)), 21));
3777     __m256 ven1 = _mm256_andnot_ps(vindex_mask, vn1);
3778     const __m256 vl1 = _mm256_permutevar_ps(vtable, _mm256_castps_si256(vn1));
3779     const __m128 ven1_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven1)), 21));
3780     __m256 ven2 = _mm256_andnot_ps(vindex_mask, vn2);
3781     const __m256 vl2 = _mm256_permutevar_ps(vtable, _mm256_castps_si256(vn2));
3782     const __m128 ven2_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven2)), 21));
3783     __m256 ven3 = _mm256_andnot_ps(vindex_mask, vn3);
3784     const __m256 vl3 = _mm256_permutevar_ps(vtable, _mm256_castps_si256(vn3));
3785     const __m128 ven3_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven3)), 21));
3786 
3787     vn0 = _mm256_sub_ps(vn0, vmagic_bias);
3788     const __m128 ven0_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven0, 1)), 21));
3789     vn1 = _mm256_sub_ps(vn1, vmagic_bias);
3790     const __m128 ven1_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven1, 1)), 21));
3791     vn2 = _mm256_sub_ps(vn2, vmagic_bias);
3792     const __m128 ven2_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven2, 1)), 21));
3793     vn3 = _mm256_sub_ps(vn3, vmagic_bias);
3794     const __m128 ven3_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven3, 1)), 21));
3795 
3796     __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vz0);
3797     ven0 = _mm256_insertf128_ps(_mm256_castps128_ps256(ven0_lo), ven0_hi, 1);
3798     __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vz1);
3799     ven1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ven1_lo), ven1_hi, 1);
3800     __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vz2);
3801     ven2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ven2_lo), ven2_hi, 1);
3802     __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vz3);
3803     ven3 = _mm256_insertf128_ps(_mm256_castps128_ps256(ven3_lo), ven3_hi, 1);
3804 
3805     vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0);
3806     __m256 vs0 = _mm256_mul_ps(vl0, ven0);
3807     vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1);
3808     __m256 vs1 = _mm256_mul_ps(vl1, ven1);
3809     vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2);
3810     __m256 vs2 = _mm256_mul_ps(vl2, ven2);
3811     vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3);
3812     __m256 vs3 = _mm256_mul_ps(vl3, ven3);
3813 
3814     __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc4, vt0), vc3);
3815     __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc4, vt1), vc3);
3816     __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc4, vt2), vc3);
3817     __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc4, vt3), vc3);
3818 
3819     vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2);
3820     vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2);
3821     vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2);
3822     vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2);
3823 
3824     vp0 = _mm256_mul_ps(vp0, vt0);
3825     vp1 = _mm256_mul_ps(vp1, vt1);
3826     vp2 = _mm256_mul_ps(vp2, vt2);
3827     vp3 = _mm256_mul_ps(vp3, vt3);
3828 
3829     vt0 = _mm256_mul_ps(vt0, vs0);
3830     vs0 = _mm256_sub_ps(vs0, vone);
3831     vt1 = _mm256_mul_ps(vt1, vs1);
3832     vs1 = _mm256_sub_ps(vs1, vone);
3833     vt2 = _mm256_mul_ps(vt2, vs2);
3834     vs2 = _mm256_sub_ps(vs2, vone);
3835     vt3 = _mm256_mul_ps(vt3, vs3);
3836     vs3 = _mm256_sub_ps(vs3, vone);
3837 
3838     vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vt0);
3839     vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vt1);
3840     vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vt2);
3841     vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vt3);
3842 
3843     const __m256 ve0 = _mm256_mul_ps(_mm256_add_ps(vp0, vs0), valpha);
3844     vx0 = _mm256_mul_ps(vx0, vbeta);
3845     const __m256 ve1 = _mm256_mul_ps(_mm256_add_ps(vp1, vs1), valpha);
3846     vx1 = _mm256_mul_ps(vx1, vbeta);
3847     const __m256 ve2 = _mm256_mul_ps(_mm256_add_ps(vp2, vs2), valpha);
3848     vx2 = _mm256_mul_ps(vx2, vbeta);
3849     const __m256 ve3 = _mm256_mul_ps(_mm256_add_ps(vp3, vs3), valpha);
3850     vx3 = _mm256_mul_ps(vx3, vbeta);
3851 
3852     const __m256 vy0 = _mm256_blendv_ps(vx0, ve0, vx0);
3853     const __m256 vy1 = _mm256_blendv_ps(vx1, ve1, vx1);
3854     const __m256 vy2 = _mm256_blendv_ps(vx2, ve2, vx2);
3855     const __m256 vy3 = _mm256_blendv_ps(vx3, ve3, vx3);
3856 
3857     _mm256_storeu_ps(y, vy0);
3858     _mm256_storeu_ps(y + 8, vy1);
3859     _mm256_storeu_ps(y + 16, vy2);
3860     _mm256_storeu_ps(y + 24, vy3);
3861     y += 32;
3862   }
3863   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3864     __m256 vx = _mm256_loadu_ps(x);
3865     x += 8;
3866 
3867     const __m256 vz = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx, vprescale));
3868 
3869     __m256 vn = _mm256_add_ps(_mm256_mul_ps(vz, vlog2e), vmagic_bias);
3870     __m256 ven = _mm256_andnot_ps(vindex_mask, vn);
3871     const __m256 vl = _mm256_permutevar_ps(vtable, _mm256_castps_si256(vn));
3872     const __m128 ven_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven)), 21));
3873     vn = _mm256_sub_ps(vn, vmagic_bias);
3874     const __m128 ven_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven, 1)), 21));
3875 
3876     __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vz);
3877     ven = _mm256_insertf128_ps(_mm256_castps128_ps256(ven_lo), ven_hi, 1);
3878     vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);
3879     __m256 vs = _mm256_mul_ps(vl, ven);
3880 
3881     __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc4, vt), vc3);
3882     vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2);
3883     vp = _mm256_mul_ps(vp, vt);
3884 
3885     vt = _mm256_mul_ps(vt, vs);
3886     vs = _mm256_sub_ps(vs, vone);
3887     vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vt);
3888 
3889     const __m256 ve = _mm256_mul_ps(_mm256_add_ps(vp, vs), valpha);
3890     vx = _mm256_mul_ps(vx, vbeta);
3891     const __m256 vy = _mm256_blendv_ps(vx, ve, vx);
3892 
3893     _mm256_storeu_ps(y, vy);
3894     y += 8;
3895   }
3896   if XNN_UNLIKELY(n != 0) {
3897     assert(n >= 1 * sizeof(float));
3898     assert(n <= 7 * sizeof(float));
3899     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx_rr2_lut4_p4.mask_table[7] - n));
3900 
3901     __m256 vx = _mm256_maskload_ps(x, vmask);
3902 
3903     const __m256 vz = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx, vprescale));
3904 
3905     __m256 vn = _mm256_add_ps(_mm256_mul_ps(vz, vlog2e), vmagic_bias);
3906     __m256 ven = _mm256_andnot_ps(vindex_mask, vn);
3907     const __m256 vl = _mm256_permutevar_ps(vtable, _mm256_castps_si256(vn));
3908     const __m128 ven_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven)), 21));
3909     vn = _mm256_sub_ps(vn, vmagic_bias);
3910     const __m128 ven_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven, 1)), 21));
3911 
3912     __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vz);
3913     ven = _mm256_insertf128_ps(_mm256_castps128_ps256(ven_lo), ven_hi, 1);
3914     vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);
3915     __m256 vs = _mm256_mul_ps(vl, ven);
3916 
3917     __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc4, vt), vc3);
3918     vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2);
3919     vp = _mm256_mul_ps(vp, vt);
3920 
3921     vt = _mm256_mul_ps(vt, vs);
3922     vs = _mm256_sub_ps(vs, vone);
3923     vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vt);
3924 
3925     const __m256 ve = _mm256_mul_ps(_mm256_add_ps(vp, vs), valpha);
3926     vx = _mm256_mul_ps(vx, vbeta);
3927     const __m256 vy = _mm256_blendv_ps(vx, ve, vx);
3928 
3929     __m128 vy_lo = _mm256_castps256_ps128(vy);
3930     if (n & (4 * sizeof(float))) {
3931       _mm_storeu_ps(y, vy_lo);
3932       vy_lo = _mm256_extractf128_ps(vy, 1);
3933       y += 4;
3934     }
3935     if (n & (2 * sizeof(float))) {
3936       _mm_storel_pi((__m64*) y, vy_lo);
3937       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3938       y += 2;
3939     }
3940     if (n & (1 * sizeof(float))) {
3941       _mm_store_ss(y, vy_lo);
3942     }
3943   }
3944 }
3945 
3946 void xnn_f32_vhswish_ukernel__avx_x16(
3947     size_t n,
3948     const float* x,
3949     float* y,
3950     const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)])
3951 {
3952   assert(n != 0);
3953   assert(n % sizeof(float) == 0);
3954 
3955   const __m256 vsixth = _mm256_load_ps(params->avx.sixth);
3956   const __m256 vhalf = _mm256_load_ps(params->avx.half);
3957   const __m256 vone = _mm256_load_ps(params->avx.one);
3958   const __m256 vzero = _mm256_setzero_ps();
3959 
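  // HardSwish: y = x * clamp(x/6 + 0.5, 0, 1), with 1/6 and 0.5 preloaded from params.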
3960   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3961     const __m256 vx01234567 = _mm256_loadu_ps(x);
3962     const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
3963     x += 16;
3964 
3965     __m256 vacc01234567 = _mm256_mul_ps(vx01234567, vsixth);
3966     __m256 vacc89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vsixth);
3967 
3968     vacc01234567 = _mm256_add_ps(vacc01234567, vhalf);
3969     vacc89ABCDEF = _mm256_add_ps(vacc89ABCDEF, vhalf);
3970 
3971     vacc01234567 = _mm256_max_ps(vacc01234567, vzero);
3972     vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEF, vzero);
3973 
3974     vacc01234567 = _mm256_min_ps(vacc01234567, vone);
3975     vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vone);
3976 
3977     vacc01234567 = _mm256_mul_ps(vacc01234567, vx01234567);
3978     vacc89ABCDEF = _mm256_mul_ps(vacc89ABCDEF, vx89ABCDEF);
3979 
3980     _mm256_storeu_ps(y, vacc01234567);
3981     _mm256_storeu_ps(y + 8, vacc89ABCDEF);
3982     y += 16;
3983   }
3984   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3985     const __m256 vx = _mm256_loadu_ps(x);
3986     x += 8;
3987     __m256 vacc = _mm256_mul_ps(vx, vsixth);
3988     vacc = _mm256_add_ps(vacc, vhalf);
3989     vacc = _mm256_max_ps(vacc, vzero);
3990     vacc = _mm256_min_ps(vacc, vone);
3991     vacc = _mm256_mul_ps(vacc, vx);
3992     _mm256_storeu_ps(y, vacc);
3993     y += 8;
3994   }
3995   if XNN_UNLIKELY(n != 0) {
3996     assert(n >= 1 * sizeof(float));
3997     assert(n <= 7 * sizeof(float));
3998     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3999 
4000     const __m256 vx = _mm256_maskload_ps(x, vmask);
4001     __m256 vacc = _mm256_mul_ps(vx, vsixth);
4002     vacc = _mm256_add_ps(vacc, vhalf);
4003     vacc = _mm256_max_ps(vacc, vzero);
4004     vacc = _mm256_min_ps(vacc, vone);
4005     vacc = _mm256_mul_ps(vacc, vx);
4006 
4007     __m128 vacc_lo = _mm256_castps256_ps128(vacc);
4008     if (n & (4 * sizeof(float))) {
4009       _mm_storeu_ps(y, vacc_lo);
4010       vacc_lo = _mm256_extractf128_ps(vacc, 1);
4011       y += 4;
4012     }
4013     if (n & (2 * sizeof(float))) {
4014       _mm_storel_pi((__m64*) y, vacc_lo);
4015       vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo);
4016       y += 2;
4017     }
4018     if (n & (1 * sizeof(float))) {
4019       _mm_store_ss(y, vacc_lo);
4020     }
4021   }
4022 }
4023 
4024 void xnn_f32_vlrelu_ukernel__avx_x16(
4025     size_t n,
4026     const float* x,
4027     float* y,
4028     const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)])
4029 {
4030   assert(n != 0);
4031   assert(n % sizeof(float) == 0);
4032 
4033   const __m256 vslope = _mm256_load_ps(params->avx.slope);
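  // LeakyReLU: compute x*slope for every lane, then keep it only where x is negative;
  // _mm256_blendv_ps picks the second source wherever the sign bit of the mask (vx) is set.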
4034   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4035     const __m256 vx01234567 = _mm256_loadu_ps(x);
4036     const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4037     x += 16;
4038 
4039     __m256 vacc01234567 = _mm256_mul_ps(vx01234567, vslope);
4040     __m256 vacc89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vslope);
4041 
4042     vacc01234567 = _mm256_blendv_ps(vx01234567, vacc01234567, vx01234567);
4043     vacc89ABCDEF = _mm256_blendv_ps(vx89ABCDEF, vacc89ABCDEF, vx89ABCDEF);
4044 
4045     _mm256_storeu_ps(y, vacc01234567);
4046     _mm256_storeu_ps(y + 8, vacc89ABCDEF);
4047     y += 16;
4048   }
4049   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4050     const __m256 vx = _mm256_loadu_ps(x);
4051     x += 8;
4052     __m256 vacc = _mm256_mul_ps(vx, vslope);
4053     vacc = _mm256_blendv_ps(vx, vacc, vx);
4054     _mm256_storeu_ps(y, vacc);
4055     y += 8;
4056   }
4057   if XNN_UNLIKELY(n != 0) {
4058     assert(n >= 1 * sizeof(float));
4059     assert(n <= 7 * sizeof(float));
4060     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4061 
4062     const __m256 vx = _mm256_maskload_ps(x, vmask);
4063     __m256 vacc = _mm256_mul_ps(vx, vslope);
4064     vacc = _mm256_blendv_ps(vx, vacc, vx);
4065 
4066     __m128 vacc_lo = _mm256_castps256_ps128(vacc);
4067     if (n & (4 * sizeof(float))) {
4068       _mm_storeu_ps(y, vacc_lo);
4069       vacc_lo = _mm256_extractf128_ps(vacc, 1);
4070       y += 4;
4071     }
4072     if (n & (2 * sizeof(float))) {
4073       _mm_storel_pi((__m64*) y, vacc_lo);
4074       vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo);
4075       y += 2;
4076     }
4077     if (n & (1 * sizeof(float))) {
4078       _mm_store_ss(y, vacc_lo);
4079     }
4080   }
4081 }
4082 
4083 void xnn_f32_vrndd_ukernel__avx_x16(
4084     size_t n,
4085     const float* x,
4086     float* y,
4087     const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
4088 {
4089   assert(n != 0);
4090   assert(n % sizeof(float) == 0);
4091 
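  // Round toward negative infinity (floor), suppressing precision exceptions.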
4092   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4093     const __m256 vx01234567 = _mm256_loadu_ps(x);
4094     const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4095     x += 16;
4096 
4097     const __m256 vy01234567 = _mm256_round_ps(vx01234567, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
4098     const __m256 vy89ABCDEF = _mm256_round_ps(vx89ABCDEF, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
4099 
4100     _mm256_storeu_ps(y, vy01234567);
4101     _mm256_storeu_ps(y + 8, vy89ABCDEF);
4102     y += 16;
4103   }
4104   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4105     const __m256 vx = _mm256_loadu_ps(x);
4106     x += 8;
4107 
4108     const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
4109 
4110     _mm256_storeu_ps(y, vy);
4111     y += 8;
4112   }
4113   if XNN_UNLIKELY(n != 0) {
4114     assert(n >= 1 * sizeof(float));
4115     assert(n <= 7 * sizeof(float));
4116     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4117 
4118     const __m256 vx = _mm256_maskload_ps(x, vmask);
4119     const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
4120 
4121     __m128 vy_lo = _mm256_castps256_ps128(vy);
4122     if (n & (4 * sizeof(float))) {
4123       _mm_storeu_ps(y, vy_lo);
4124       vy_lo = _mm256_extractf128_ps(vy, 1);
4125       y += 4;
4126     }
4127     if (n & (2 * sizeof(float))) {
4128       _mm_storel_pi((__m64*) y, vy_lo);
4129       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4130       y += 2;
4131     }
4132     if (n & (1 * sizeof(float))) {
4133       _mm_store_ss(y, vy_lo);
4134     }
4135   }
4136 }
4137 
4138 void xnn_f32_vrndne_ukernel__avx_x16(
4139     size_t n,
4140     const float* x,
4141     float* y,
4142     const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
4143 {
4144   assert(n != 0);
4145   assert(n % sizeof(float) == 0);
4146 
4147   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4148     const __m256 vx01234567 = _mm256_loadu_ps(x);
4149     const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4150     x += 16;
4151 
4152     const __m256 vy01234567 = _mm256_round_ps(vx01234567, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
4153     const __m256 vy89ABCDEF = _mm256_round_ps(vx89ABCDEF, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
4154 
4155     _mm256_storeu_ps(y, vy01234567);
4156     _mm256_storeu_ps(y + 8, vy89ABCDEF);
4157     y += 16;
4158   }
4159   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4160     const __m256 vx = _mm256_loadu_ps(x);
4161     x += 8;
4162 
4163     const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
4164 
4165     _mm256_storeu_ps(y, vy);
4166     y += 8;
4167   }
4168   if XNN_UNLIKELY(n != 0) {
4169     assert(n >= 1 * sizeof(float));
4170     assert(n <= 7 * sizeof(float));
4171     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4172 
4173     const __m256 vx = _mm256_maskload_ps(x, vmask);
4174     const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
4175 
4176     __m128 vy_lo = _mm256_castps256_ps128(vy);
4177     if (n & (4 * sizeof(float))) {
4178       _mm_storeu_ps(y, vy_lo);
4179       vy_lo = _mm256_extractf128_ps(vy, 1);
4180       y += 4;
4181     }
4182     if (n & (2 * sizeof(float))) {
4183       _mm_storel_pi((__m64*) y, vy_lo);
4184       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4185       y += 2;
4186     }
4187     if (n & (1 * sizeof(float))) {
4188       _mm_store_ss(y, vy_lo);
4189     }
4190   }
4191 }
4192 
4193 void xnn_f32_vrndu_ukernel__avx_x16(
4194     size_t n,
4195     const float* x,
4196     float* y,
4197     const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
4198 {
4199   assert(n != 0);
4200   assert(n % sizeof(float) == 0);
4201 
4202   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4203     const __m256 vx01234567 = _mm256_loadu_ps(x);
4204     const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4205     x += 16;
4206 
4207     const __m256 vy01234567 = _mm256_round_ps(vx01234567, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
4208     const __m256 vy89ABCDEF = _mm256_round_ps(vx89ABCDEF, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
4209 
4210     _mm256_storeu_ps(y, vy01234567);
4211     _mm256_storeu_ps(y + 8, vy89ABCDEF);
4212     y += 16;
4213   }
4214   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4215     const __m256 vx = _mm256_loadu_ps(x);
4216     x += 8;
4217 
4218     const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
4219 
4220     _mm256_storeu_ps(y, vy);
4221     y += 8;
4222   }
4223   if XNN_UNLIKELY(n != 0) {
4224     assert(n >= 1 * sizeof(float));
4225     assert(n <= 7 * sizeof(float));
4226     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4227 
4228     const __m256 vx = _mm256_maskload_ps(x, vmask);
4229     const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
4230 
4231     __m128 vy_lo = _mm256_castps256_ps128(vy);
4232     if (n & (4 * sizeof(float))) {
4233       _mm_storeu_ps(y, vy_lo);
4234       vy_lo = _mm256_extractf128_ps(vy, 1);
4235       y += 4;
4236     }
4237     if (n & (2 * sizeof(float))) {
4238       _mm_storel_pi((__m64*) y, vy_lo);
4239       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4240       y += 2;
4241     }
4242     if (n & (1 * sizeof(float))) {
4243       _mm_store_ss(y, vy_lo);
4244     }
4245   }
4246 }
4247 
4248 void xnn_f32_vrndz_ukernel__avx_x16(
4249     size_t n,
4250     const float* x,
4251     float* y,
4252     const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
4253 {
4254   assert(n != 0);
4255   assert(n % sizeof(float) == 0);
4256 
4257   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4258     const __m256 vx01234567 = _mm256_loadu_ps(x);
4259     const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4260     x += 16;
4261 
4262     const __m256 vy01234567 = _mm256_round_ps(vx01234567, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
4263     const __m256 vy89ABCDEF = _mm256_round_ps(vx89ABCDEF, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
4264 
4265     _mm256_storeu_ps(y, vy01234567);
4266     _mm256_storeu_ps(y + 8, vy89ABCDEF);
4267     y += 16;
4268   }
4269   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4270     const __m256 vx = _mm256_loadu_ps(x);
4271     x += 8;
4272 
4273     const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
4274 
4275     _mm256_storeu_ps(y, vy);
4276     y += 8;
4277   }
4278   if XNN_UNLIKELY(n != 0) {
4279     assert(n >= 1 * sizeof(float));
4280     assert(n <= 7 * sizeof(float));
4281     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4282 
4283     const __m256 vx = _mm256_maskload_ps(x, vmask);
4284     const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
4285 
4286     __m128 vy_lo = _mm256_castps256_ps128(vy);
4287     if (n & (4 * sizeof(float))) {
4288       _mm_storeu_ps(y, vy_lo);
4289       vy_lo = _mm256_extractf128_ps(vy, 1);
4290       y += 4;
4291     }
4292     if (n & (2 * sizeof(float))) {
4293       _mm_storel_pi((__m64*) y, vy_lo);
4294       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4295       y += 2;
4296     }
4297     if (n & (1 * sizeof(float))) {
4298       _mm_store_ss(y, vy_lo);
4299     }
4300   }
4301 }
4302 
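// Sigmoid. z = -|x| is formed by OR-ing in the sign bit, f = exp(z) / (1 + exp(z)) is
// evaluated on the well-behaved z <= 0 range, and the final blendv keeps f for negative x
// and 1 - f otherwise. The rr2_p5_nr2 suffix appears to describe the approximation:
// range reduction against a hi/lo split of ln2, a degree-5 polynomial in the reduced
// argument, and two Newton-Raphson refinements of the _mm256_rcp_ps estimate of
// 1/(1 + exp(z)). Arguments below denorm_cutoff flush the result to zero through the
// andnot with the comparison mask.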
4303 void xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_x40(
4304     size_t n,
4305     const float* x,
4306     float* y,
4307     const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)])
4308 {
4309   assert(n % sizeof(float) == 0);
4310 
4311   const __m256 vsign_mask = _mm256_load_ps(params->avx_rr2_p5.sign_mask);
4312   const __m256 vmagic_bias = _mm256_load_ps(params->avx_rr2_p5.magic_bias);
4313   const __m256 vlog2e = _mm256_load_ps(params->avx_rr2_p5.log2e);
4314   const __m256 vminus_ln2_hi = _mm256_load_ps(params->avx_rr2_p5.minus_ln2_hi);
4315   const __m256 vminus_ln2_lo = _mm256_load_ps(params->avx_rr2_p5.minus_ln2_lo);
4316   const __m256 vc5 = _mm256_load_ps(params->avx_rr2_p5.c5);
4317   const __m256 vc4 = _mm256_load_ps(params->avx_rr2_p5.c4);
4318   const __m256 vc3 = _mm256_load_ps(params->avx_rr2_p5.c3);
4319   const __m256 vc2 = _mm256_load_ps(params->avx_rr2_p5.c2);
4320   const __m256 vc1 = _mm256_load_ps(params->avx_rr2_p5.c1);
4321   const __m256 vone = _mm256_load_ps(params->avx_rr2_p5.one);
4322   const __m256 vtwo = _mm256_load_ps(params->avx_rr2_p5.two);
4323   const __m256 vdenorm_cutoff = _mm256_load_ps(params->avx_rr2_p5.denorm_cutoff);
4324 
4325   for (; n >= 40 * sizeof(float); n -= 40 * sizeof(float)) {
4326     const __m256 vx0 = _mm256_loadu_ps(x);
4327     const __m256 vx1 = _mm256_loadu_ps(x + 8);
4328     const __m256 vx2 = _mm256_loadu_ps(x + 16);
4329     const __m256 vx3 = _mm256_loadu_ps(x + 24);
4330     const __m256 vx4 = _mm256_loadu_ps(x + 32);
4331     x += 40;
4332 
4333     const __m256 vz0 = _mm256_or_ps(vx0, vsign_mask);
4334     const __m256 vz1 = _mm256_or_ps(vx1, vsign_mask);
4335     const __m256 vz2 = _mm256_or_ps(vx2, vsign_mask);
4336     const __m256 vz3 = _mm256_or_ps(vx3, vsign_mask);
4337     const __m256 vz4 = _mm256_or_ps(vx4, vsign_mask);
4338 
4339     __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vz0, vlog2e), vmagic_bias);
4340     __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vz1, vlog2e), vmagic_bias);
4341     __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vz2, vlog2e), vmagic_bias);
4342     __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vz3, vlog2e), vmagic_bias);
4343     __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vz4, vlog2e), vmagic_bias);
4344 
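    // AVX1 has no 256-bit integer shift, so the 2**n scale factor is rebuilt from the low
    // mantissa bits of vn one 128-bit half at a time: each half is shifted left by 23 into
    // the exponent field and the halves are re-joined with _mm256_insertf128_ps.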
4345     const __m128 vs_lo0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn0)), 23));
4346     const __m128 vs_hi0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn0, 1)), 23));
4347     const __m256 vs0 = _mm256_insertf128_ps(_mm256_castps128_ps256(vs_lo0), vs_hi0, 1);
4348     const __m128 vs_lo1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn1)), 23));
4349     const __m128 vs_hi1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn1, 1)), 23));
4350     const __m256 vs1 = _mm256_insertf128_ps(_mm256_castps128_ps256(vs_lo1), vs_hi1, 1);
4351     const __m128 vs_lo2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn2)), 23));
4352     const __m128 vs_hi2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn2, 1)), 23));
4353     const __m256 vs2 = _mm256_insertf128_ps(_mm256_castps128_ps256(vs_lo2), vs_hi2, 1);
4354     const __m128 vs_lo3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn3)), 23));
4355     const __m128 vs_hi3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn3, 1)), 23));
4356     const __m256 vs3 = _mm256_insertf128_ps(_mm256_castps128_ps256(vs_lo3), vs_hi3, 1);
4357     const __m128 vs_lo4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn4)), 23));
4358     const __m128 vs_hi4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn4, 1)), 23));
4359     const __m256 vs4 = _mm256_insertf128_ps(_mm256_castps128_ps256(vs_lo4), vs_hi4, 1);
4360 
4361     vn0 = _mm256_sub_ps(vn0, vmagic_bias);
4362     vn1 = _mm256_sub_ps(vn1, vmagic_bias);
4363     vn2 = _mm256_sub_ps(vn2, vmagic_bias);
4364     vn3 = _mm256_sub_ps(vn3, vmagic_bias);
4365     vn4 = _mm256_sub_ps(vn4, vmagic_bias);
4366 
4367     __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vz0);
4368     __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vz1);
4369     __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vz2);
4370     __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vz3);
4371     __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vz4);
4372 
4373     vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0);
4374     vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1);
4375     vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2);
4376     vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3);
4377     vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4);
4378 
4379     __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4);
4380     __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4);
4381     __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4);
4382     __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4);
4383     __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4);
4384 
4385     vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3);
4386     vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3);
4387     vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3);
4388     vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3);
4389     vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3);
4390 
4391     vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2);
4392     vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2);
4393     vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2);
4394     vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2);
4395     vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2);
4396 
4397     vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1);
4398     vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1);
4399     vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1);
4400     vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1);
4401     vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1);
4402 
4403     vt0 = _mm256_mul_ps(vt0, vs0);
4404     vt1 = _mm256_mul_ps(vt1, vs1);
4405     vt2 = _mm256_mul_ps(vt2, vs2);
4406     vt3 = _mm256_mul_ps(vt3, vs3);
4407     vt4 = _mm256_mul_ps(vt4, vs4);
4408 
4409     const __m256 ve0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0);
4410     const __m256 ve1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1);
4411     const __m256 ve2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2);
4412     const __m256 ve3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3);
4413     const __m256 ve4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4);
4414 
4415     const __m256 vd0 = _mm256_add_ps(ve0, vone);
4416     const __m256 vd1 = _mm256_add_ps(ve1, vone);
4417     const __m256 vd2 = _mm256_add_ps(ve2, vone);
4418     const __m256 vd3 = _mm256_add_ps(ve3, vone);
4419     const __m256 vd4 = _mm256_add_ps(ve4, vone);
4420 
4421     __m256 vr0 = _mm256_rcp_ps(vd0);
4422     __m256 vr1 = _mm256_rcp_ps(vd1);
4423     __m256 vr2 = _mm256_rcp_ps(vd2);
4424     __m256 vr3 = _mm256_rcp_ps(vd3);
4425     __m256 vr4 = _mm256_rcp_ps(vd4);
4426 
4427     vr0 = _mm256_mul_ps(vr0, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr0, vd0)));
4428     vr0 = _mm256_mul_ps(vr0, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr0, vd0)));
4429     vr1 = _mm256_mul_ps(vr1, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr1, vd1)));
4430     vr1 = _mm256_mul_ps(vr1, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr1, vd1)));
4431     vr2 = _mm256_mul_ps(vr2, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr2, vd2)));
4432     vr2 = _mm256_mul_ps(vr2, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr2, vd2)));
4433     vr3 = _mm256_mul_ps(vr3, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr3, vd3)));
4434     vr3 = _mm256_mul_ps(vr3, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr3, vd3)));
4435     vr4 = _mm256_mul_ps(vr4, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr4, vd4)));
4436     vr4 = _mm256_mul_ps(vr4, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr4, vd4)));
4437 
4438     __m256 vf0 = _mm256_mul_ps(ve0, vr0);
4439     __m256 vf1 = _mm256_mul_ps(ve1, vr1);
4440     __m256 vf2 = _mm256_mul_ps(ve2, vr2);
4441     __m256 vf3 = _mm256_mul_ps(ve3, vr3);
4442     __m256 vf4 = _mm256_mul_ps(ve4, vr4);
4443 
4444     vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vz0, vdenorm_cutoff, _CMP_LT_OS), vf0);
4445     vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vz1, vdenorm_cutoff, _CMP_LT_OS), vf1);
4446     vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vz2, vdenorm_cutoff, _CMP_LT_OS), vf2);
4447     vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vz3, vdenorm_cutoff, _CMP_LT_OS), vf3);
4448     vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vz4, vdenorm_cutoff, _CMP_LT_OS), vf4);
4449 
4450     vf0 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf0), vf0, vx0);
4451     vf1 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf1), vf1, vx1);
4452     vf2 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf2), vf2, vx2);
4453     vf3 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf3), vf3, vx3);
4454     vf4 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf4), vf4, vx4);
4455 
4456     _mm256_storeu_ps(y, vf0);
4457     _mm256_storeu_ps(y + 8, vf1);
4458     _mm256_storeu_ps(y + 16, vf2);
4459     _mm256_storeu_ps(y + 24, vf3);
4460     _mm256_storeu_ps(y + 32, vf4);
4461     y += 40;
4462   }
4463   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4464     const __m256 vx = _mm256_loadu_ps(x);
4465     x += 8;
4466 
4467     const __m256 vz = _mm256_or_ps(vx, vsign_mask);
4468 
4469     __m256 vn = _mm256_add_ps(_mm256_mul_ps(vz, vlog2e), vmagic_bias);
4470 
4471     const __m128 vs_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn)), 23));
4472     const __m128 vs_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn, 1)), 23));
4473     const __m256 vs = _mm256_insertf128_ps(_mm256_castps128_ps256(vs_lo), vs_hi, 1);
4474 
4475     vn = _mm256_sub_ps(vn, vmagic_bias);
4476 
4477     __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vz);
4478     vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);
4479 
4480     __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4);
4481     vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3);
4482     vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2);
4483     vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1);
4484 
4485     vt = _mm256_mul_ps(vt, vs);
4486     const __m256 ve = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs);
4487 
4488     const __m256 vd = _mm256_add_ps(ve, vone);
4489     __m256 vr = _mm256_rcp_ps(vd);
4490     vr = _mm256_mul_ps(vr, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr, vd)));
4491     vr = _mm256_mul_ps(vr, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr, vd)));
4492     __m256 vf = _mm256_mul_ps(ve, vr);
4493 
4494     vf = _mm256_andnot_ps(_mm256_cmp_ps(vz, vdenorm_cutoff, _CMP_LT_OS), vf);
4495     vf = _mm256_blendv_ps(_mm256_sub_ps(vone, vf), vf, vx);
4496 
4497     _mm256_storeu_ps(y, vf);
4498     y += 8;
4499   }
4500   if XNN_UNLIKELY(n != 0) {
4501     assert(n >= 1 * sizeof(float));
4502     assert(n <= 7 * sizeof(float));
4503     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx_rr2_p5.mask_table[7] - n));
4504 
4505     const __m256 vx = _mm256_maskload_ps(x, vmask);
4506 
4507     const __m256 vz = _mm256_or_ps(vx, vsign_mask);
4508 
4509     __m256 vn = _mm256_add_ps(_mm256_mul_ps(vz, vlog2e), vmagic_bias);
4510     const __m128 vs_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn)), 23));
4511     const __m128 vs_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn, 1)), 23));
4512     const __m256 vs = _mm256_insertf128_ps(_mm256_castps128_ps256(vs_lo), vs_hi, 1);
4513 
4514     vn = _mm256_sub_ps(vn, vmagic_bias);
4515 
4516     __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vz);
4517     vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);
4518 
4519     __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4);
4520     vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3);
4521     vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2);
4522     vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1);
4523 
4524     vt = _mm256_mul_ps(vt, vs);
4525     const __m256 ve = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs);
4526 
4527     const __m256 vd = _mm256_add_ps(ve, vone);
4528     __m256 vr = _mm256_rcp_ps(vd);
4529     vr = _mm256_mul_ps(vr, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr, vd)));
4530     vr = _mm256_mul_ps(vr, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr, vd)));
4531     __m256 vf = _mm256_mul_ps(ve, vr);
4532 
4533     vf = _mm256_andnot_ps(_mm256_cmp_ps(vz, vdenorm_cutoff, _CMP_LT_OS), vf);
4534     vf = _mm256_blendv_ps(_mm256_sub_ps(vone, vf), vf, vx);
4535 
4536     __m128 vf_lo = _mm256_castps256_ps128(vf);
4537     if (n & (4 * sizeof(float))) {
4538       _mm_storeu_ps(y, vf_lo);
4539       vf_lo = _mm256_extractf128_ps(vf, 1);
4540       y += 4;
4541     }
4542     if (n & (2 * sizeof(float))) {
4543       _mm_storel_pi((__m64*) y, vf_lo);
4544       vf_lo = _mm_movehl_ps(vf_lo, vf_lo);
4545       y += 2;
4546     }
4547     if (n & (1 * sizeof(float))) {
4548       _mm_store_ss(y, vf_lo);
4549     }
4550   }
4551 }
4552 
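// Element-wise square root via the native _mm256_sqrt_ps instruction; the _sqrt_ infix in
// the name appears to distinguish this direct variant from reciprocal-square-root plus
// Newton-Raphson variants generated elsewhere.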
4553 void xnn_f32_vsqrt_ukernel__avx_sqrt_x8(
4554     size_t n,
4555     const float* x,
4556     float* y,
4557     const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)])
4558 {
4559   assert(n != 0);
4560   assert(n % sizeof(float) == 0);
4561 
4562   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4563     const __m256 vx = _mm256_loadu_ps(x);
4564     x += 8;
4565     const __m256 vy = _mm256_sqrt_ps(vx);
4566     _mm256_storeu_ps(y, vy);
4567     y += 8;
4568   }
4569   if XNN_UNLIKELY(n != 0) {
4570     assert(n >= 1 * sizeof(float));
4571     assert(n <= 7 * sizeof(float));
4572     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4573 
4574     const __m256 vx = _mm256_maskload_ps(x, vmask);
4575     const __m256 vy = _mm256_sqrt_ps(vx);
4576 
4577     __m128 vy_lo = _mm256_castps256_ps128(vy);
4578     if (n & (4 * sizeof(float))) {
4579       _mm_storeu_ps(y, vy_lo);
4580       vy_lo = _mm256_extractf128_ps(vy, 1);
4581       y += 4;
4582     }
4583     if (n & (2 * sizeof(float))) {
4584       _mm_storel_pi((__m64*) y, vy_lo);
4585       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4586       y += 2;
4587     }
4588     if (n & (1 * sizeof(float))) {
4589       _mm_store_ss(y, vy_lo);
4590     }
4591   }
4592 }
4593 
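// Absolute value: the sign bit is cleared by AND-ing with nonsign_mask (presumably
// 0x7FFFFFFF broadcast to every lane by the params initializer).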
4594 void xnn_f32_vabs_ukernel__avx_x16(
4595     size_t n,
4596     const float* x,
4597     float* y,
4598     const union xnn_f32_abs_params params[restrict XNN_MIN_ELEMENTS(1)])
4599 {
4600   assert(n != 0);
4601   assert(n % sizeof(float) == 0);
4602   assert(x != NULL);
4603   assert(y != NULL);
4604 
4605   const __m256 vnonsign_mask = _mm256_load_ps(params->avx.nonsign_mask);
4606   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4607     const __m256 vx01234567 = _mm256_loadu_ps(x);
4608     const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4609     x += 16;
4610 
4611     const __m256 vy01234567 = _mm256_and_ps(vx01234567, vnonsign_mask);
4612     const __m256 vy89ABCDEF = _mm256_and_ps(vx89ABCDEF, vnonsign_mask);
4613 
4614     _mm256_storeu_ps(y, vy01234567);
4615     _mm256_storeu_ps(y + 8, vy89ABCDEF);
4616     y += 16;
4617   }
4618   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4619     const __m256 vx = _mm256_loadu_ps(x);
4620     x += 8;
4621     const __m256 vy = _mm256_and_ps(vx, vnonsign_mask);
4622     _mm256_storeu_ps(y, vy);
4623     y += 8;
4624   }
4625   if XNN_UNLIKELY(n != 0) {
4626     assert(n >= 1 * sizeof(float));
4627     assert(n <= 7 * sizeof(float));
4628     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4629 
4630     const __m256 vx = _mm256_maskload_ps(x, vmask);
4631     const __m256 vy = _mm256_and_ps(vx, vnonsign_mask);
4632 
4633     __m128 vy_lo = _mm256_castps256_ps128(vy);
4634     if (n & (4 * sizeof(float))) {
4635       _mm_storeu_ps(y, vy_lo);
4636       vy_lo = _mm256_extractf128_ps(vy, 1);
4637       y += 4;
4638     }
4639     if (n & (2 * sizeof(float))) {
4640       _mm_storel_pi((__m64*) y, vy_lo);
4641       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4642       y += 2;
4643     }
4644     if (n & (1 * sizeof(float))) {
4645       _mm_store_ss(y, vy_lo);
4646     }
4647   }
4648 }
4649 
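// Negation: the sign bit is flipped by XOR-ing with sign_mask (presumably 0x80000000 in
// every lane). Note that the mask is read through params->sse.sign_mask while the
// remainder mask below comes from params->avx.mask_table.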
4650 void xnn_f32_vneg_ukernel__avx_x16(
4651     size_t n,
4652     const float* x,
4653     float* y,
4654     const union xnn_f32_neg_params params[restrict XNN_MIN_ELEMENTS(1)])
4655 {
4656   assert(n != 0);
4657   assert(n % sizeof(float) == 0);
4658   assert(x != NULL);
4659   assert(y != NULL);
4660 
4661   const __m256 vsign_mask = _mm256_load_ps(params->sse.sign_mask);
4662   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4663     const __m256 vx01234567 = _mm256_loadu_ps(x);
4664     const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4665     x += 16;
4666 
4667     const __m256 vy01234567 = _mm256_xor_ps(vx01234567, vsign_mask);
4668     const __m256 vy89ABCDEF = _mm256_xor_ps(vx89ABCDEF, vsign_mask);
4669 
4670     _mm256_storeu_ps(y, vy01234567);
4671     _mm256_storeu_ps(y + 8, vy89ABCDEF);
4672     y += 16;
4673   }
4674   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4675     const __m256 vx = _mm256_loadu_ps(x);
4676     x += 8;
4677     const __m256 vy = _mm256_xor_ps(vx, vsign_mask);
4678     _mm256_storeu_ps(y, vy);
4679     y += 8;
4680   }
4681   if XNN_UNLIKELY(n != 0) {
4682     assert(n >= 1 * sizeof(float));
4683     assert(n <= 7 * sizeof(float));
4684     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4685 
4686     const __m256 vx = _mm256_maskload_ps(x, vmask);
4687     const __m256 vy = _mm256_xor_ps(vx, vsign_mask);
4688 
4689     __m128 vy_lo = _mm256_castps256_ps128(vy);
4690     if (n & (4 * sizeof(float))) {
4691       _mm_storeu_ps(y, vy_lo);
4692       vy_lo = _mm256_extractf128_ps(vy, 1);
4693       y += 4;
4694     }
4695     if (n & (2 * sizeof(float))) {
4696       _mm_storel_pi((__m64*) y, vy_lo);
4697       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4698       y += 2;
4699     }
4700     if (n & (1 * sizeof(float))) {
4701       _mm_store_ss(y, vy_lo);
4702     }
4703   }
4704 }
4705 
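// Square: each lane is multiplied by itself; no params constants are needed beyond the
// shared avx.mask_table used for the remainder.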
4706 void xnn_f32_vsqr_ukernel__avx_x16(
4707     size_t n,
4708     const float* x,
4709     float* y,
4710     const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
4711 {
4712   assert(n != 0);
4713   assert(n % sizeof(float) == 0);
4714   assert(x != NULL);
4715   assert(y != NULL);
4716 
4717   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4718     const __m256 vx01234567 = _mm256_loadu_ps(x);
4719     const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4720     x += 16;
4721 
4722     const __m256 vy01234567 = _mm256_mul_ps(vx01234567, vx01234567);
4723     const __m256 vy89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vx89ABCDEF);
4724 
4725     _mm256_storeu_ps(y, vy01234567);
4726     _mm256_storeu_ps(y + 8, vy89ABCDEF);
4727     y += 16;
4728   }
4729   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4730     const __m256 vx = _mm256_loadu_ps(x);
4731     x += 8;
4732     const __m256 vy = _mm256_mul_ps(vx, vx);
4733     _mm256_storeu_ps(y, vy);
4734     y += 8;
4735   }
4736   if XNN_UNLIKELY(n != 0) {
4737     assert(n >= 1 * sizeof(float));
4738     assert(n <= 7 * sizeof(float));
4739     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4740 
4741     const __m256 vx = _mm256_maskload_ps(x, vmask);
4742     const __m256 vy = _mm256_mul_ps(vx, vx);
4743 
4744     __m128 vy_lo = _mm256_castps256_ps128(vy);
4745     if (n & (4 * sizeof(float))) {
4746       _mm_storeu_ps(y, vy_lo);
4747       vy_lo = _mm256_extractf128_ps(vy, 1);
4748       y += 4;
4749     }
4750     if (n & (2 * sizeof(float))) {
4751       _mm_storel_pi((__m64*) y, vy_lo);
4752       vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4753       y += 2;
4754     }
4755     if (n & (1 * sizeof(float))) {
4756       _mm_store_ss(y, vy_lo);
4757     }
4758   }
4759 }
4760 
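// QC8 (per-channel-quantized int8) depthwise convolution with 25 taps, processing up to
// 16 channels per pass; the _fp32 tag appears to refer to float requantization applied
// after accumulation. Strategy suggested by the mul16_add16 suffix: int8 inputs and
// weights are sign-extended to int16, multiplied with _mm_mullo_epi16, the products of
// two consecutive taps are summed in int16, and only then widened into the int32
// accumulators (low half via _mm_cvtepi16_epi32, high half via the unpackhi/srai pair).
// Rows whose input pointer equals `zero` skip the input_offset adjustment so padding
// reads come from the zero buffer.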
4761 void xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16(
4762     size_t channels,
4763     size_t output_width,
4764     const int8_t** input,
4765     const void* weights,
4766     int8_t* output,
4767     size_t input_stride,
4768     size_t output_increment,
4769     size_t input_offset,
4770     const int8_t* zero,
4771     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4772 {
4773   assert(channels != 0);
4774   assert(output_width != 0);
4775 
4776   do {
4777     const int8_t* i0 = input[0];
4778     assert(i0 != NULL);
4779     if XNN_UNPREDICTABLE(i0 != zero) {
4780       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
4781     }
4782     const int8_t* i1 = input[1];
4783     assert(i1 != NULL);
4784     if XNN_UNPREDICTABLE(i1 != zero) {
4785       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
4786     }
4787     const int8_t* i2 = input[2];
4788     assert(i2 != NULL);
4789     if XNN_UNPREDICTABLE(i2 != zero) {
4790       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
4791     }
4792     const int8_t* i3 = input[3];
4793     assert(i3 != NULL);
4794     if XNN_UNPREDICTABLE(i3 != zero) {
4795       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
4796     }
4797     const int8_t* i4 = input[4];
4798     assert(i4 != NULL);
4799     if XNN_UNPREDICTABLE(i4 != zero) {
4800       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
4801     }
4802     const int8_t* i5 = input[5];
4803     assert(i5 != NULL);
4804     if XNN_UNPREDICTABLE(i5 != zero) {
4805       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
4806     }
4807     const int8_t* i6 = input[6];
4808     assert(i6 != NULL);
4809     if XNN_UNPREDICTABLE(i6 != zero) {
4810       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
4811     }
4812     const int8_t* i7 = input[7];
4813     assert(i7 != NULL);
4814     if XNN_UNPREDICTABLE(i7 != zero) {
4815       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
4816     }
4817     const int8_t* i8 = input[8];
4818     assert(i8 != NULL);
4819     if XNN_UNPREDICTABLE(i8 != zero) {
4820       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
4821     }
4822     const int8_t* i9 = input[9];
4823     assert(i9 != NULL);
4824     if XNN_UNPREDICTABLE(i9 != zero) {
4825       i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
4826     }
4827     const int8_t* i10 = input[10];
4828     assert(i10 != NULL);
4829     if XNN_UNPREDICTABLE(i10 != zero) {
4830       i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
4831     }
4832     const int8_t* i11 = input[11];
4833     assert(i11 != NULL);
4834     if XNN_UNPREDICTABLE(i11 != zero) {
4835       i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
4836     }
4837     const int8_t* i12 = input[12];
4838     assert(i12 != NULL);
4839     if XNN_UNPREDICTABLE(i12 != zero) {
4840       i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
4841     }
4842     const int8_t* i13 = input[13];
4843     assert(i13 != NULL);
4844     if XNN_UNPREDICTABLE(i13 != zero) {
4845       i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
4846     }
4847     const int8_t* i14 = input[14];
4848     assert(i14 != NULL);
4849     if XNN_UNPREDICTABLE(i14 != zero) {
4850       i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
4851     }
4852     const int8_t* i15 = input[15];
4853     assert(i15 != NULL);
4854     if XNN_UNPREDICTABLE(i15 != zero) {
4855       i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
4856     }
4857     const int8_t* i16 = input[16];
4858     assert(i16 != NULL);
4859     if XNN_UNPREDICTABLE(i16 != zero) {
4860       i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
4861     }
4862     const int8_t* i17 = input[17];
4863     assert(i17 != NULL);
4864     if XNN_UNPREDICTABLE(i17 != zero) {
4865       i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
4866     }
4867     const int8_t* i18 = input[18];
4868     assert(i18 != NULL);
4869     if XNN_UNPREDICTABLE(i18 != zero) {
4870       i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
4871     }
4872     const int8_t* i19 = input[19];
4873     assert(i19 != NULL);
4874     if XNN_UNPREDICTABLE(i19 != zero) {
4875       i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
4876     }
4877     const int8_t* i20 = input[20];
4878     assert(i20 != NULL);
4879     if XNN_UNPREDICTABLE(i20 != zero) {
4880       i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
4881     }
4882     const int8_t* i21 = input[21];
4883     assert(i21 != NULL);
4884     if XNN_UNPREDICTABLE(i21 != zero) {
4885       i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
4886     }
4887     const int8_t* i22 = input[22];
4888     assert(i22 != NULL);
4889     if XNN_UNPREDICTABLE(i22 != zero) {
4890       i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
4891     }
4892     const int8_t* i23 = input[23];
4893     assert(i23 != NULL);
4894     if XNN_UNPREDICTABLE(i23 != zero) {
4895       i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
4896     }
4897     const int8_t* i24 = input[24];
4898     assert(i24 != NULL);
4899     if XNN_UNPREDICTABLE(i24 != zero) {
4900       i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
4901     }
4902     input = (const int8_t**) ((uintptr_t) input + input_stride);
4903 
4904     size_t c = channels;
4905     const void* w = weights;
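    // Packed weight layout per 16-channel group, inferred from the byte offsets below:
    // 16 int32 bias values first, then 25 taps of 16 int8 kernel weights each, read
    // 8 at a time at offsets 16*sizeof(int32_t) + 16*k and 16*sizeof(int32_t) + 16*k + 8.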
4906     for (; c >= 16; c -= 16) {
4907       __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
4908       __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
4909       __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
4910       __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
4911 
4912 
4913       const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
4914       const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
4915       const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t)));
4916       const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
4917       const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
4918       const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(vi0x89ABCDEF);
4919       const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t)));
4920       const __m128i vxk0x89ABCDEF = _mm_cvtepi8_epi16(vk0x89ABCDEF);
4921       i0 += 16;
4922 
4923 
4924       __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
4925       __m128i vprod89ABCDEF = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
4926 
4927 
4928       const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
4929       const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
4930       const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t)));
4931       const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
4932       const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
4933       const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(vi1x89ABCDEF);
4934       const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t)));
4935       const __m128i vxk1x89ABCDEF = _mm_cvtepi8_epi16(vk1x89ABCDEF);
4936       i1 += 16;
4937 
4938 
4939       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
4940       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF));
4941 
4942       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
4943       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
4944       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
4945       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
4946 
4947       const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
4948       const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
4949       const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t)));
4950       const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
4951       const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
4952       const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(vi2x89ABCDEF);
4953       const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t)));
4954       const __m128i vxk2x89ABCDEF = _mm_cvtepi8_epi16(vk2x89ABCDEF);
4955       i2 += 16;
4956 
4957 
4958       vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
4959       vprod89ABCDEF = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
4960 
4961 
4962       const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
4963       const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
4964       const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t)));
4965       const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
4966       const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
4967       const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(vi3x89ABCDEF);
4968       const __m128i vk3x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t)));
4969       const __m128i vxk3x89ABCDEF = _mm_cvtepi8_epi16(vk3x89ABCDEF);
4970       i3 += 16;
4971 
4972 
4973       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
4974       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF));
4975 
4976       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
4977       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
4978       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
4979       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
4980 
4981       const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
4982       const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
4983       const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t)));
4984       const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
4985       const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
4986       const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(vi4x89ABCDEF);
4987       const __m128i vk4x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t)));
4988       const __m128i vxk4x89ABCDEF = _mm_cvtepi8_epi16(vk4x89ABCDEF);
4989       i4 += 16;
4990 
4991 
4992       vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
4993       vprod89ABCDEF = _mm_mullo_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF);
4994 
4995 
4996       const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
4997       const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
4998       const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t)));
4999       const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
5000       const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
5001       const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(vi5x89ABCDEF);
5002       const __m128i vk5x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t)));
5003       const __m128i vxk5x89ABCDEF = _mm_cvtepi8_epi16(vk5x89ABCDEF);
5004       i5 += 16;
5005 
5006 
5007       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
5008       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF));
5009 
5010       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5011       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5012       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5013       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5014 
5015       const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
5016       const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
5017       const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t)));
5018       const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
5019       const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
5020       const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(vi6x89ABCDEF);
5021       const __m128i vk6x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t)));
5022       const __m128i vxk6x89ABCDEF = _mm_cvtepi8_epi16(vk6x89ABCDEF);
5023       i6 += 16;
5024 
5025 
5026       vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
5027       vprod89ABCDEF = _mm_mullo_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF);
5028 
5029 
5030       const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
5031       const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
5032       const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t)));
5033       const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
5034       const __m128i vi7x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i7 + 8));
5035       const __m128i vxi7x89ABCDEF = _mm_cvtepi8_epi16(vi7x89ABCDEF);
5036       const __m128i vk7x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t)));
5037       const __m128i vxk7x89ABCDEF = _mm_cvtepi8_epi16(vk7x89ABCDEF);
5038       i7 += 16;
5039 
5040 
5041       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
5042       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF));
5043 
5044       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5045       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5046       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5047       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5048 
5049       const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
5050       const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
5051       const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t)));
5052       const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
5053       const __m128i vi8x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i8 + 8));
5054       const __m128i vxi8x89ABCDEF = _mm_cvtepi8_epi16(vi8x89ABCDEF);
5055       const __m128i vk8x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t)));
5056       const __m128i vxk8x89ABCDEF = _mm_cvtepi8_epi16(vk8x89ABCDEF);
5057       i8 += 16;
5058 
5059 
5060       vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
5061       vprod89ABCDEF = _mm_mullo_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF);
5062 
5063 
5064       const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
5065       const __m128i vxi9x01234567 = _mm_cvtepi8_epi16(vi9x01234567);
5066       const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t)));
5067       const __m128i vxk9x01234567 = _mm_cvtepi8_epi16(vk9x01234567);
5068       const __m128i vi9x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i9 + 8));
5069       const __m128i vxi9x89ABCDEF = _mm_cvtepi8_epi16(vi9x89ABCDEF);
5070       const __m128i vk9x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t)));
5071       const __m128i vxk9x89ABCDEF = _mm_cvtepi8_epi16(vk9x89ABCDEF);
5072       i9 += 16;
5073 
5074 
5075       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi9x01234567, vxk9x01234567));
5076       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi9x89ABCDEF, vxk9x89ABCDEF));
5077 
5078       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5079       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5080       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5081       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5082 
5083       const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
5084       const __m128i vxi10x01234567 = _mm_cvtepi8_epi16(vi10x01234567);
5085       const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t)));
5086       const __m128i vxk10x01234567 = _mm_cvtepi8_epi16(vk10x01234567);
5087       const __m128i vi10x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i10 + 8));
5088       const __m128i vxi10x89ABCDEF = _mm_cvtepi8_epi16(vi10x89ABCDEF);
5089       const __m128i vk10x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t)));
5090       const __m128i vxk10x89ABCDEF = _mm_cvtepi8_epi16(vk10x89ABCDEF);
5091       i10 += 16;
5092 
5093 
5094       vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
5095       vprod89ABCDEF = _mm_mullo_epi16(vxi10x89ABCDEF, vxk10x89ABCDEF);
5096 
5097 
5098       const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
5099       const __m128i vxi11x01234567 = _mm_cvtepi8_epi16(vi11x01234567);
5100       const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t)));
5101       const __m128i vxk11x01234567 = _mm_cvtepi8_epi16(vk11x01234567);
5102       const __m128i vi11x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i11 + 8));
5103       const __m128i vxi11x89ABCDEF = _mm_cvtepi8_epi16(vi11x89ABCDEF);
5104       const __m128i vk11x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t)));
5105       const __m128i vxk11x89ABCDEF = _mm_cvtepi8_epi16(vk11x89ABCDEF);
5106       i11 += 16;
5107 
5108 
5109       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi11x01234567, vxk11x01234567));
5110       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi11x89ABCDEF, vxk11x89ABCDEF));
5111 
5112       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5113       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5114       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5115       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5116 
5117       const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
5118       const __m128i vxi12x01234567 = _mm_cvtepi8_epi16(vi12x01234567);
5119       const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t)));
5120       const __m128i vxk12x01234567 = _mm_cvtepi8_epi16(vk12x01234567);
5121       const __m128i vi12x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i12 + 8));
5122       const __m128i vxi12x89ABCDEF = _mm_cvtepi8_epi16(vi12x89ABCDEF);
5123       const __m128i vk12x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t)));
5124       const __m128i vxk12x89ABCDEF = _mm_cvtepi8_epi16(vk12x89ABCDEF);
5125       i12 += 16;
5126 
5127 
5128       vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
5129       vprod89ABCDEF = _mm_mullo_epi16(vxi12x89ABCDEF, vxk12x89ABCDEF);
5130 
5131 
5132       const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
5133       const __m128i vxi13x01234567 = _mm_cvtepi8_epi16(vi13x01234567);
5134       const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t)));
5135       const __m128i vxk13x01234567 = _mm_cvtepi8_epi16(vk13x01234567);
5136       const __m128i vi13x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i13 + 8));
5137       const __m128i vxi13x89ABCDEF = _mm_cvtepi8_epi16(vi13x89ABCDEF);
5138       const __m128i vk13x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t)));
5139       const __m128i vxk13x89ABCDEF = _mm_cvtepi8_epi16(vk13x89ABCDEF);
5140       i13 += 16;
5141 
5142 
5143       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi13x01234567, vxk13x01234567));
5144       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi13x89ABCDEF, vxk13x89ABCDEF));
5145 
5146       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5147       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5148       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5149       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5150 
5151       const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
5152       const __m128i vxi14x01234567 = _mm_cvtepi8_epi16(vi14x01234567);
5153       const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t)));
5154       const __m128i vxk14x01234567 = _mm_cvtepi8_epi16(vk14x01234567);
5155       const __m128i vi14x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i14 + 8));
5156       const __m128i vxi14x89ABCDEF = _mm_cvtepi8_epi16(vi14x89ABCDEF);
5157       const __m128i vk14x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t)));
5158       const __m128i vxk14x89ABCDEF = _mm_cvtepi8_epi16(vk14x89ABCDEF);
5159       i14 += 16;
5160 
5161 
5162       vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
5163       vprod89ABCDEF = _mm_mullo_epi16(vxi14x89ABCDEF, vxk14x89ABCDEF);
5164 
5165 
5166       const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
5167       const __m128i vxi15x01234567 = _mm_cvtepi8_epi16(vi15x01234567);
5168       const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t)));
5169       const __m128i vxk15x01234567 = _mm_cvtepi8_epi16(vk15x01234567);
5170       const __m128i vi15x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i15 + 8));
5171       const __m128i vxi15x89ABCDEF = _mm_cvtepi8_epi16(vi15x89ABCDEF);
5172       const __m128i vk15x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t)));
5173       const __m128i vxk15x89ABCDEF = _mm_cvtepi8_epi16(vk15x89ABCDEF);
5174       i15 += 16;
5175 
5176 
5177       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi15x01234567, vxk15x01234567));
5178       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi15x89ABCDEF, vxk15x89ABCDEF));
5179 
5180       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5181       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5182       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5183       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5184 
5185       const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
5186       const __m128i vxi16x01234567 = _mm_cvtepi8_epi16(vi16x01234567);
5187       const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t)));
5188       const __m128i vxk16x01234567 = _mm_cvtepi8_epi16(vk16x01234567);
5189       const __m128i vi16x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i16 + 8));
5190       const __m128i vxi16x89ABCDEF = _mm_cvtepi8_epi16(vi16x89ABCDEF);
5191       const __m128i vk16x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t)));
5192       const __m128i vxk16x89ABCDEF = _mm_cvtepi8_epi16(vk16x89ABCDEF);
5193       i16 += 16;
5194 
5195 
5196       vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
5197       vprod89ABCDEF = _mm_mullo_epi16(vxi16x89ABCDEF, vxk16x89ABCDEF);
5198 
5199 
5200       const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
5201       const __m128i vxi17x01234567 = _mm_cvtepi8_epi16(vi17x01234567);
5202       const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t)));
5203       const __m128i vxk17x01234567 = _mm_cvtepi8_epi16(vk17x01234567);
5204       const __m128i vi17x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i17 + 8));
5205       const __m128i vxi17x89ABCDEF = _mm_cvtepi8_epi16(vi17x89ABCDEF);
5206       const __m128i vk17x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t)));
5207       const __m128i vxk17x89ABCDEF = _mm_cvtepi8_epi16(vk17x89ABCDEF);
5208       i17 += 16;
5209 
5210 
5211       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi17x01234567, vxk17x01234567));
5212       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi17x89ABCDEF, vxk17x89ABCDEF));
5213 
5214       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5215       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5216       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5217       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5218 
5219       const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
5220       const __m128i vxi18x01234567 = _mm_cvtepi8_epi16(vi18x01234567);
5221       const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t)));
5222       const __m128i vxk18x01234567 = _mm_cvtepi8_epi16(vk18x01234567);
5223       const __m128i vi18x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i18 + 8));
5224       const __m128i vxi18x89ABCDEF = _mm_cvtepi8_epi16(vi18x89ABCDEF);
5225       const __m128i vk18x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t)));
5226       const __m128i vxk18x89ABCDEF = _mm_cvtepi8_epi16(vk18x89ABCDEF);
5227       i18 += 16;
5228 
5229 
5230       vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
5231       vprod89ABCDEF = _mm_mullo_epi16(vxi18x89ABCDEF, vxk18x89ABCDEF);
5232 
5233 
5234       const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
5235       const __m128i vxi19x01234567 = _mm_cvtepi8_epi16(vi19x01234567);
5236       const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t)));
5237       const __m128i vxk19x01234567 = _mm_cvtepi8_epi16(vk19x01234567);
5238       const __m128i vi19x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i19 + 8));
5239       const __m128i vxi19x89ABCDEF = _mm_cvtepi8_epi16(vi19x89ABCDEF);
5240       const __m128i vk19x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t)));
5241       const __m128i vxk19x89ABCDEF = _mm_cvtepi8_epi16(vk19x89ABCDEF);
5242       i19 += 16;
5243 
5244 
5245       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi19x01234567, vxk19x01234567));
5246       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi19x89ABCDEF, vxk19x89ABCDEF));
5247 
5248       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5249       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5250       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5251       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5252 
5253       const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
5254       const __m128i vxi20x01234567 = _mm_cvtepi8_epi16(vi20x01234567);
5255       const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t)));
5256       const __m128i vxk20x01234567 = _mm_cvtepi8_epi16(vk20x01234567);
5257       const __m128i vi20x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i20 + 8));
5258       const __m128i vxi20x89ABCDEF = _mm_cvtepi8_epi16(vi20x89ABCDEF);
5259       const __m128i vk20x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t)));
5260       const __m128i vxk20x89ABCDEF = _mm_cvtepi8_epi16(vk20x89ABCDEF);
5261       i20 += 16;
5262 
5263 
5264       vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
5265       vprod89ABCDEF = _mm_mullo_epi16(vxi20x89ABCDEF, vxk20x89ABCDEF);
5266 
5267 
5268       const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
5269       const __m128i vxi21x01234567 = _mm_cvtepi8_epi16(vi21x01234567);
5270       const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t)));
5271       const __m128i vxk21x01234567 = _mm_cvtepi8_epi16(vk21x01234567);
5272       const __m128i vi21x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i21 + 8));
5273       const __m128i vxi21x89ABCDEF = _mm_cvtepi8_epi16(vi21x89ABCDEF);
5274       const __m128i vk21x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t)));
5275       const __m128i vxk21x89ABCDEF = _mm_cvtepi8_epi16(vk21x89ABCDEF);
5276       i21 += 16;
5277 
5278 
5279       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi21x01234567, vxk21x01234567));
5280       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi21x89ABCDEF, vxk21x89ABCDEF));
5281 
5282       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5283       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5284       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5285       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5286 
5287       const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
5288       const __m128i vxi22x01234567 = _mm_cvtepi8_epi16(vi22x01234567);
5289       const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t)));
5290       const __m128i vxk22x01234567 = _mm_cvtepi8_epi16(vk22x01234567);
5291       const __m128i vi22x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i22 + 8));
5292       const __m128i vxi22x89ABCDEF = _mm_cvtepi8_epi16(vi22x89ABCDEF);
5293       const __m128i vk22x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t)));
5294       const __m128i vxk22x89ABCDEF = _mm_cvtepi8_epi16(vk22x89ABCDEF);
5295       i22 += 16;
5296 
5297 
5298       vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
5299       vprod89ABCDEF = _mm_mullo_epi16(vxi22x89ABCDEF, vxk22x89ABCDEF);
5300 
5301 
5302       const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
5303       const __m128i vxi23x01234567 = _mm_cvtepi8_epi16(vi23x01234567);
5304       const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t)));
5305       const __m128i vxk23x01234567 = _mm_cvtepi8_epi16(vk23x01234567);
5306       const __m128i vi23x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i23 + 8));
5307       const __m128i vxi23x89ABCDEF = _mm_cvtepi8_epi16(vi23x89ABCDEF);
5308       const __m128i vk23x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t)));
5309       const __m128i vxk23x89ABCDEF = _mm_cvtepi8_epi16(vk23x89ABCDEF);
5310       i23 += 16;
5311 
5312 
5313       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi23x01234567, vxk23x01234567));
5314       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi23x89ABCDEF, vxk23x89ABCDEF));
5315 
5316       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5317       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5318       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5319       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5320 
5321       const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
5322       const __m128i vxi24x01234567 = _mm_cvtepi8_epi16(vi24x01234567);
5323       const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t)));
5324       const __m128i vxk24x01234567 = _mm_cvtepi8_epi16(vk24x01234567);
5325       const __m128i vi24x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i24 + 8));
5326       const __m128i vxi24x89ABCDEF = _mm_cvtepi8_epi16(vi24x89ABCDEF);
5327       const __m128i vk24x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t)));
5328       const __m128i vxk24x89ABCDEF = _mm_cvtepi8_epi16(vk24x89ABCDEF);
5329       i24 += 16;
5330 
5331 
5332       vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
5333       vprod89ABCDEF = _mm_mullo_epi16(vxi24x89ABCDEF, vxk24x89ABCDEF);
5334 
5335       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5336       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5337       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5338       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5339 
5340       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
5341 
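      // Requantization: convert the int32 accumulators to float, apply the per-channel
      // scales stored after the 400 int8 kernel taps, clamp against
      // output_max_less_zero_point, and round back to int32 before packing down to int8.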
5342       __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
5343       __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
5344       __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
5345       __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
5346 
5347       const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
5348       const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
5349       const __m128 vscale89AB = _mm_loadu_ps((const float*) w + 8);
5350       const __m128 vscaleCDEF = _mm_loadu_ps((const float*) w + 12);
5351       w = (const void*) ((const float*) w + 16);
5352       vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
5353       vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
5354       vscaled89AB = _mm_mul_ps(vscaled89AB, vscale89AB);
5355       vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscaleCDEF);
5356 
5357       const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
5358       vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
5359       vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
5360       vscaled89AB = _mm_min_ps(vscaled89AB, voutput_max_less_zero_point);
5361       vscaledCDEF = _mm_min_ps(vscaledCDEF, voutput_max_less_zero_point);
5362 
5363       vacc0123 = _mm_cvtps_epi32(vscaled0123);
5364       vacc4567 = _mm_cvtps_epi32(vscaled4567);
5365       vacc89AB = _mm_cvtps_epi32(vscaled89AB);
5366       vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
5367 
5368       const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
5369       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5370       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
5371 
5372 
5373       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
5374 
5375       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
5376       vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
5377 
5378       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
5379       output += 16;
5380     }
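    // Remainder path: the last 1-15 channels are processed 8 at a time, with partial
    // stores for a final group of fewer than 8 channels.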
5381     if XNN_UNLIKELY(c != 0) {
5382       const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
5383       do {
5384         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
5385         __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
5386 
5387 
5388         const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
5389         const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
5390         const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k);
5391         const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
5392         i0 += 8;
5393 
5394 
5395         __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
5396 
5397 
5398         const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
5399         const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
5400         const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 16));
5401         const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
5402         i1 += 8;
5403 
5404 
5405         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
5406 
5407         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5408         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5409 
5410         const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
5411         const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
5412         const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 32));
5413         const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
5414         i2 += 8;
5415 
5416 
5417         vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
5418 
5419 
5420         const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
5421         const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
5422         const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) (k + 48));
5423         const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
5424         i3 += 8;
5425 
5426 
5427         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
5428 
5429         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5430         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5431 
5432         const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
5433         const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
5434         const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) (k + 64));
5435         const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
5436         i4 += 8;
5437 
5438 
5439         vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
5440 
5441 
5442         const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
5443         const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
5444         const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) (k + 80));
5445         const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
5446         i5 += 8;
5447 
5448 
5449         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
5450 
5451         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5452         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5453 
5454         const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
5455         const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
5456         const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) (k + 96));
5457         const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
5458         i6 += 8;
5459 
5460 
5461         vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
5462 
5463 
5464         const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
5465         const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
5466         const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) (k + 112));
5467         const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
5468         i7 += 8;
5469 
5470 
5471         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
5472 
5473         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5474         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5475 
5476         const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
5477         const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
5478         const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) (k + 128));
5479         const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
5480         i8 += 8;
5481 
5482 
5483         vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
5484 
5485 
5486         const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
5487         const __m128i vxi9x01234567 = _mm_cvtepi8_epi16(vi9x01234567);
5488         const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) (k + 144));
5489         const __m128i vxk9x01234567 = _mm_cvtepi8_epi16(vk9x01234567);
5490         i9 += 8;
5491 
5492 
5493         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi9x01234567, vxk9x01234567));
5494 
5495         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5496         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5497 
5498         const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
5499         const __m128i vxi10x01234567 = _mm_cvtepi8_epi16(vi10x01234567);
5500         const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) (k + 160));
5501         const __m128i vxk10x01234567 = _mm_cvtepi8_epi16(vk10x01234567);
5502         i10 += 8;
5503 
5504 
5505         vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
5506 
5507 
5508         const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
5509         const __m128i vxi11x01234567 = _mm_cvtepi8_epi16(vi11x01234567);
5510         const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) (k + 176));
5511         const __m128i vxk11x01234567 = _mm_cvtepi8_epi16(vk11x01234567);
5512         i11 += 8;
5513 
5514 
5515         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi11x01234567, vxk11x01234567));
5516 
5517         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5518         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5519 
5520         const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
5521         const __m128i vxi12x01234567 = _mm_cvtepi8_epi16(vi12x01234567);
5522         const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) (k + 192));
5523         const __m128i vxk12x01234567 = _mm_cvtepi8_epi16(vk12x01234567);
5524         i12 += 8;
5525 
5526 
5527         vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
5528 
5529 
5530         const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
5531         const __m128i vxi13x01234567 = _mm_cvtepi8_epi16(vi13x01234567);
5532         const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) (k + 208));
5533         const __m128i vxk13x01234567 = _mm_cvtepi8_epi16(vk13x01234567);
5534         i13 += 8;
5535 
5536 
5537         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi13x01234567, vxk13x01234567));
5538 
5539         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5540         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5541 
5542         const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
5543         const __m128i vxi14x01234567 = _mm_cvtepi8_epi16(vi14x01234567);
5544         const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) (k + 224));
5545         const __m128i vxk14x01234567 = _mm_cvtepi8_epi16(vk14x01234567);
5546         i14 += 8;
5547 
5548 
5549         vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
5550 
5551 
5552         const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
5553         const __m128i vxi15x01234567 = _mm_cvtepi8_epi16(vi15x01234567);
5554         const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) (k + 240));
5555         const __m128i vxk15x01234567 = _mm_cvtepi8_epi16(vk15x01234567);
5556         i15 += 8;
5557 
5558 
5559         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi15x01234567, vxk15x01234567));
5560 
5561         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5562         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5563 
5564         const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
5565         const __m128i vxi16x01234567 = _mm_cvtepi8_epi16(vi16x01234567);
5566         const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) (k + 256));
5567         const __m128i vxk16x01234567 = _mm_cvtepi8_epi16(vk16x01234567);
5568         i16 += 8;
5569 
5570 
5571         vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
5572 
5573 
5574         const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
5575         const __m128i vxi17x01234567 = _mm_cvtepi8_epi16(vi17x01234567);
5576         const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) (k + 272));
5577         const __m128i vxk17x01234567 = _mm_cvtepi8_epi16(vk17x01234567);
5578         i17 += 8;
5579 
5580 
5581         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi17x01234567, vxk17x01234567));
5582 
5583         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5584         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5585 
5586         const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
5587         const __m128i vxi18x01234567 = _mm_cvtepi8_epi16(vi18x01234567);
5588         const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) (k + 288));
5589         const __m128i vxk18x01234567 = _mm_cvtepi8_epi16(vk18x01234567);
5590         i18 += 8;
5591 
5592 
5593         vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
5594 
5595 
5596         const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
5597         const __m128i vxi19x01234567 = _mm_cvtepi8_epi16(vi19x01234567);
5598         const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) (k + 304));
5599         const __m128i vxk19x01234567 = _mm_cvtepi8_epi16(vk19x01234567);
5600         i19 += 8;
5601 
5602 
5603         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi19x01234567, vxk19x01234567));
5604 
5605         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5606         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5607 
5608         const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
5609         const __m128i vxi20x01234567 = _mm_cvtepi8_epi16(vi20x01234567);
5610         const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) (k + 320));
5611         const __m128i vxk20x01234567 = _mm_cvtepi8_epi16(vk20x01234567);
5612         i20 += 8;
5613 
5614 
5615         vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
5616 
5617 
5618         const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
5619         const __m128i vxi21x01234567 = _mm_cvtepi8_epi16(vi21x01234567);
5620         const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) (k + 336));
5621         const __m128i vxk21x01234567 = _mm_cvtepi8_epi16(vk21x01234567);
5622         i21 += 8;
5623 
5624 
5625         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi21x01234567, vxk21x01234567));
5626 
5627         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5628         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5629 
5630         const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
5631         const __m128i vxi22x01234567 = _mm_cvtepi8_epi16(vi22x01234567);
5632         const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) (k + 352));
5633         const __m128i vxk22x01234567 = _mm_cvtepi8_epi16(vk22x01234567);
5634         i22 += 8;
5635 
5636 
5637         vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
5638 
5639 
5640         const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
5641         const __m128i vxi23x01234567 = _mm_cvtepi8_epi16(vi23x01234567);
5642         const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) (k + 368));
5643         const __m128i vxk23x01234567 = _mm_cvtepi8_epi16(vk23x01234567);
5644         i23 += 8;
5645 
5646 
5647         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi23x01234567, vxk23x01234567));
5648 
5649         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5650         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5651 
5652         const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
5653         const __m128i vxi24x01234567 = _mm_cvtepi8_epi16(vi24x01234567);
5654         const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) (k + 384));
5655         const __m128i vxk24x01234567 = _mm_cvtepi8_epi16(vk24x01234567);
5656         i24 += 8;
5657 
5658 
5659         vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
5660 
5661         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5662         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5663 
5664         k += 8;
5665 
5666         __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
5667         __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
5668 
5669         const __m128 vscale0123 = _mm_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t)));
5670         const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t) + 4 * sizeof(float)));
5671         vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
5672         vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
5673 
5674         const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
5675         vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
5676         vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
5677 
5678         vacc0123 = _mm_cvtps_epi32(vscaled0123);
5679         vacc4567 = _mm_cvtps_epi32(vscaled4567);
5680 
5681         w = (const void*) ((const int32_t*) w + 8);
5682 
5683         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
5684         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5685 
5686 
5687         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5688 
5689         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
5690 
5691         if XNN_LIKELY(c >= 8) {
5692           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5693           output += 8;
5694           c -= 8;
5695         } else {
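          // Fewer than 8 channels remain: emit 4-, 2- and 1-byte stores as needed,
          // shifting the packed result down after each partial store.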
5696           if (c & 4) {
5697             unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
5698             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5699             output += 4;
5700           }
5701           if (c & 2) {
5702             unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
5703             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5704             output += 2;
5705           }
5706           if (c & 1) {
5707             *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
5708             output += 1;
5709           }
5710           c = 0;
5711         }
5712       } while (c != 0);
5713     }
5714 
5715     output = (int8_t*) ((uintptr_t) output + output_increment);
5716   } while (--output_width != 0);
5717 }
5718 
5719 void xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__avx_mul16_add16(
5720     size_t channels,
5721     size_t output_width,
5722     const int8_t** input,
5723     const void* weights,
5724     int8_t* output,
5725     size_t input_stride,
5726     size_t output_increment,
5727     size_t input_offset,
5728     const int8_t* zero,
5729     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5730 {
5731   assert(channels != 0);
5732   assert(output_width != 0);
5733 
5734   do {
5735     const int8_t* i0 = input[0];
5736     assert(i0 != NULL);
5737     if XNN_UNPREDICTABLE(i0 != zero) {
5738       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
5739     }
5740     const int8_t* i1 = input[1];
5741     assert(i1 != NULL);
5742     if XNN_UNPREDICTABLE(i1 != zero) {
5743       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
5744     }
5745     const int8_t* i2 = input[2];
5746     assert(i2 != NULL);
5747     if XNN_UNPREDICTABLE(i2 != zero) {
5748       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
5749     }
5750     input = (const int8_t**) ((uintptr_t) input + input_stride);
5751 
5752     size_t c = channels;
5753     const void* w = weights;
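    // Packed weights per 16-channel group: 16 int32 biases, 3 taps x 16 int8 kernel
    // values (48 bytes), then 16 float per-channel scales.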
5754     for (; c >= 16; c -= 16) {
5755       __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
5756       __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
5757       __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
5758       __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
5759 
5760 
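      // mul16_add16 scheme: int8 inputs and kernel values are sign-extended to int16 and
      // multiplied in 16 bits; products from adjacent taps are summed in 16 bits before
      // being widened into the int32 accumulators.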
5761       const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
5762       const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
5763       const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t)));
5764       const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
5765       const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
5766       const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(vi0x89ABCDEF);
5767       const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t)));
5768       const __m128i vxk0x89ABCDEF = _mm_cvtepi8_epi16(vk0x89ABCDEF);
5769       i0 += 16;
5770 
5771 
5772       __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
5773       __m128i vprod89ABCDEF = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
5774 
5775 
5776       const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
5777       const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
5778       const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t)));
5779       const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
5780       const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
5781       const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(vi1x89ABCDEF);
5782       const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t)));
5783       const __m128i vxk1x89ABCDEF = _mm_cvtepi8_epi16(vk1x89ABCDEF);
5784       i1 += 16;
5785 
5786 
5787       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
5788       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF));
5789 
5790       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5791       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5792       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5793       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5794 
5795       const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
5796       const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
5797       const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t)));
5798       const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
5799       const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
5800       const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(vi2x89ABCDEF);
5801       const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t)));
5802       const __m128i vxk2x89ABCDEF = _mm_cvtepi8_epi16(vk2x89ABCDEF);
5803       i2 += 16;
5804 
5805 
5806       vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
5807       vprod89ABCDEF = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
5808 
5809       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5810       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5811       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5812       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5813 
5814       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t));
5815 
5816       __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
5817       __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
5818       __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
5819       __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
5820 
5821       const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
5822       const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
5823       const __m128 vscale89AB = _mm_loadu_ps((const float*) w + 8);
5824       const __m128 vscaleCDEF = _mm_loadu_ps((const float*) w + 12);
5825       w = (const void*) ((const float*) w + 16);
5826       vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
5827       vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
5828       vscaled89AB = _mm_mul_ps(vscaled89AB, vscale89AB);
5829       vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscaleCDEF);
5830 
5831       const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
5832       vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
5833       vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
5834       vscaled89AB = _mm_min_ps(vscaled89AB, voutput_max_less_zero_point);
5835       vscaledCDEF = _mm_min_ps(vscaledCDEF, voutput_max_less_zero_point);
5836 
5837       vacc0123 = _mm_cvtps_epi32(vscaled0123);
5838       vacc4567 = _mm_cvtps_epi32(vscaled4567);
5839       vacc89AB = _mm_cvtps_epi32(vscaled89AB);
5840       vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
5841 
5842       const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
5843       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5844       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
5845 
5846 
5847       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
5848 
5849       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
5850       vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
5851 
5852       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
5853       output += 16;
5854     }
5855     if XNN_UNLIKELY(c != 0) {
5856       const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
5857       do {
5858         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
5859         __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
5860 
5861 
5862         const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
5863         const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
5864         const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k);
5865         const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
5866         i0 += 8;
5867 
5868 
5869         __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
5870 
5871 
5872         const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
5873         const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
5874         const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 16));
5875         const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
5876         i1 += 8;
5877 
5878 
5879         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
5880 
5881         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5882         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5883 
5884         const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
5885         const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
5886         const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 32));
5887         const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
5888         i2 += 8;
5889 
5890 
5891         vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
5892 
5893         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5894         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5895 
5896         k += 8;
5897 
5898         __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
5899         __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
5900 
5901         const __m128 vscale0123 = _mm_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t)));
5902         const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t) + 4 * sizeof(float)));
5903         vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
5904         vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
5905 
5906         const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
5907         vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
5908         vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
5909 
5910         vacc0123 = _mm_cvtps_epi32(vscaled0123);
5911         vacc4567 = _mm_cvtps_epi32(vscaled4567);
5912 
5913         w = (const void*) ((const int32_t*) w + 8);
5914 
5915         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
5916         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5917 
5918 
5919         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5920 
5921         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
5922 
5923         if XNN_LIKELY(c >= 8) {
5924           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5925           output += 8;
5926           c -= 8;
5927         } else {
5928           if (c & 4) {
5929             unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
5930             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5931             output += 4;
5932           }
5933           if (c & 2) {
5934             unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
5935             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5936             output += 2;
5937           }
5938           if (c & 1) {
5939             *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
5940             output += 1;
5941           }
5942           c = 0;
5943         }
5944       } while (c != 0);
5945     }
5946 
5947     output = (int8_t*) ((uintptr_t) output + output_increment);
5948   } while (--output_width != 0);
5949 }
5950 
5951 void xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16(
5952     size_t channels,
5953     size_t output_width,
5954     const int8_t** input,
5955     const void* weights,
5956     int8_t* output,
5957     size_t input_stride,
5958     size_t output_increment,
5959     size_t input_offset,
5960     const int8_t* zero,
5961     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5962 {
5963   assert(channels != 0);
5964   assert(output_width != 0);
5965 
5966   do {
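    // Gather the 9 input pointers for this output pixel; a pointer equal to `zero`
    // refers to the zero-padding buffer and is not adjusted by input_offset.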
5967     const int8_t* i0 = input[0];
5968     assert(i0 != NULL);
5969     if XNN_UNPREDICTABLE(i0 != zero) {
5970       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
5971     }
5972     const int8_t* i1 = input[1];
5973     assert(i1 != NULL);
5974     if XNN_UNPREDICTABLE(i1 != zero) {
5975       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
5976     }
5977     const int8_t* i2 = input[2];
5978     assert(i2 != NULL);
5979     if XNN_UNPREDICTABLE(i2 != zero) {
5980       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
5981     }
5982     const int8_t* i3 = input[3];
5983     assert(i3 != NULL);
5984     if XNN_UNPREDICTABLE(i3 != zero) {
5985       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
5986     }
5987     const int8_t* i4 = input[4];
5988     assert(i4 != NULL);
5989     if XNN_UNPREDICTABLE(i4 != zero) {
5990       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
5991     }
5992     const int8_t* i5 = input[5];
5993     assert(i5 != NULL);
5994     if XNN_UNPREDICTABLE(i5 != zero) {
5995       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
5996     }
5997     const int8_t* i6 = input[6];
5998     assert(i6 != NULL);
5999     if XNN_UNPREDICTABLE(i6 != zero) {
6000       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
6001     }
6002     const int8_t* i7 = input[7];
6003     assert(i7 != NULL);
6004     if XNN_UNPREDICTABLE(i7 != zero) {
6005       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
6006     }
6007     const int8_t* i8 = input[8];
6008     assert(i8 != NULL);
6009     if XNN_UNPREDICTABLE(i8 != zero) {
6010       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
6011     }
6012     input = (const int8_t**) ((uintptr_t) input + input_stride);
6013 
6014     size_t c = channels;
6015     const void* w = weights;
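    // Packed weights per 16-channel group: 16 int32 biases, 9 taps x 16 int8 kernel
    // values (144 bytes), then 16 float per-channel scales.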
6016     for (; c >= 16; c -= 16) {
6017       __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
6018       __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
6019       __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
6020       __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
6021 
6022 
6023       const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
6024       const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
6025       const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t)));
6026       const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
6027       const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
6028       const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(vi0x89ABCDEF);
6029       const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t)));
6030       const __m128i vxk0x89ABCDEF = _mm_cvtepi8_epi16(vk0x89ABCDEF);
6031       i0 += 16;
6032 
6033 
6034       __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
6035       __m128i vprod89ABCDEF = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
6036 
6037 
6038       const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
6039       const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
6040       const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t)));
6041       const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
6042       const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
6043       const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(vi1x89ABCDEF);
6044       const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t)));
6045       const __m128i vxk1x89ABCDEF = _mm_cvtepi8_epi16(vk1x89ABCDEF);
6046       i1 += 16;
6047 
6048 
6049       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
6050       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF));
6051 
6052       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6053       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6054       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
6055       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
6056 
6057       const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
6058       const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
6059       const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t)));
6060       const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
6061       const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
6062       const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(vi2x89ABCDEF);
6063       const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t)));
6064       const __m128i vxk2x89ABCDEF = _mm_cvtepi8_epi16(vk2x89ABCDEF);
6065       i2 += 16;
6066 
6067 
6068       vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
6069       vprod89ABCDEF = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
6070 
6071 
6072       const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
6073       const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
6074       const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t)));
6075       const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
6076       const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
6077       const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(vi3x89ABCDEF);
6078       const __m128i vk3x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t)));
6079       const __m128i vxk3x89ABCDEF = _mm_cvtepi8_epi16(vk3x89ABCDEF);
6080       i3 += 16;
6081 
6082 
6083       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
6084       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF));
6085 
6086       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6087       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6088       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
6089       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
6090 
6091       const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
6092       const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
6093       const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t)));
6094       const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
6095       const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
6096       const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(vi4x89ABCDEF);
6097       const __m128i vk4x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t)));
6098       const __m128i vxk4x89ABCDEF = _mm_cvtepi8_epi16(vk4x89ABCDEF);
6099       i4 += 16;
6100 
6101 
6102       vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
6103       vprod89ABCDEF = _mm_mullo_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF);
6104 
6105 
6106       const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
6107       const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
6108       const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t)));
6109       const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
6110       const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
6111       const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(vi5x89ABCDEF);
6112       const __m128i vk5x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t)));
6113       const __m128i vxk5x89ABCDEF = _mm_cvtepi8_epi16(vk5x89ABCDEF);
6114       i5 += 16;
6115 
6116 
6117       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
6118       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF));
6119 
6120       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6121       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6122       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
6123       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
6124 
6125       const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
6126       const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
6127       const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t)));
6128       const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
6129       const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
6130       const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(vi6x89ABCDEF);
6131       const __m128i vk6x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t)));
6132       const __m128i vxk6x89ABCDEF = _mm_cvtepi8_epi16(vk6x89ABCDEF);
6133       i6 += 16;
6134 
6135 
6136       vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
6137       vprod89ABCDEF = _mm_mullo_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF);
6138 
6139 
6140       const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
6141       const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
6142       const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t)));
6143       const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
6144       const __m128i vi7x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i7 + 8));
6145       const __m128i vxi7x89ABCDEF = _mm_cvtepi8_epi16(vi7x89ABCDEF);
6146       const __m128i vk7x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t)));
6147       const __m128i vxk7x89ABCDEF = _mm_cvtepi8_epi16(vk7x89ABCDEF);
6148       i7 += 16;
6149 
6150 
6151       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
6152       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF));
6153 
6154       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6155       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6156       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
6157       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
6158 
6159       const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
6160       const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
6161       const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t)));
6162       const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
6163       const __m128i vi8x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i8 + 8));
6164       const __m128i vxi8x89ABCDEF = _mm_cvtepi8_epi16(vi8x89ABCDEF);
6165       const __m128i vk8x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t)));
6166       const __m128i vxk8x89ABCDEF = _mm_cvtepi8_epi16(vk8x89ABCDEF);
6167       i8 += 16;
6168 
6169 
6170       vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
6171       vprod89ABCDEF = _mm_mullo_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF);
6172 
6173       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6174       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6175       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
6176       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
6177 
6178       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
6179 
6180       __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
6181       __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
6182       __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
6183       __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
6184 
6185       const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
6186       const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
6187       const __m128 vscale89AB = _mm_loadu_ps((const float*) w + 8);
6188       const __m128 vscaleCDEF = _mm_loadu_ps((const float*) w + 12);
6189       w = (const void*) ((const float*) w + 16);
6190       vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
6191       vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
6192       vscaled89AB = _mm_mul_ps(vscaled89AB, vscale89AB);
6193       vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscaleCDEF);
6194 
6195       const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
6196       vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
6197       vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
6198       vscaled89AB = _mm_min_ps(vscaled89AB, voutput_max_less_zero_point);
6199       vscaledCDEF = _mm_min_ps(vscaledCDEF, voutput_max_less_zero_point);
6200 
6201       vacc0123 = _mm_cvtps_epi32(vscaled0123);
6202       vacc4567 = _mm_cvtps_epi32(vscaled4567);
6203       vacc89AB = _mm_cvtps_epi32(vscaled89AB);
6204       vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
6205 
6206       const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
6207       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
6208       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
6209 
6210 
6211       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
6212 
6213       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
6214       vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
6215 
6216       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
6217       output += 16;
6218     }
6219     if XNN_UNLIKELY(c != 0) {
6220       const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
6221       do {
6222         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
6223         __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
6224 
6225 
6226         const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
6227         const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
6228         const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k);
6229         const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
6230         i0 += 8;
6231 
6232 
6233         __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
6234 
6235 
6236         const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
6237         const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
6238         const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 16));
6239         const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
6240         i1 += 8;
6241 
6242 
6243         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
6244 
6245         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6246         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6247 
6248         const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
6249         const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
6250         const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 32));
6251         const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
6252         i2 += 8;
6253 
6254 
6255         vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
6256 
6257 
6258         const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
6259         const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
6260         const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) (k + 48));
6261         const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
6262         i3 += 8;
6263 
6264 
6265         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
6266 
6267         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6268         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6269 
6270         const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
6271         const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
6272         const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) (k + 64));
6273         const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
6274         i4 += 8;
6275 
6276 
6277         vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
6278 
6279 
6280         const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
6281         const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
6282         const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) (k + 80));
6283         const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
6284         i5 += 8;
6285 
6286 
6287         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
6288 
6289         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6290         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6291 
6292         const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
6293         const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
6294         const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) (k + 96));
6295         const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
6296         i6 += 8;
6297 
6298 
6299         vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
6300 
6301 
6302         const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
6303         const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
6304         const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) (k + 112));
6305         const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
6306         i7 += 8;
6307 
6308 
6309         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
6310 
6311         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6312         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6313 
6314         const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
6315         const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
6316         const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) (k + 128));
6317         const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
6318         i8 += 8;
6319 
6320 
6321         vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
6322 
6323         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6324         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6325 
6326         k += 8;
6327 
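        // Requantize: convert the int32 accumulators to float, multiply by the
        // per-channel scales stored after the packed weights, clamp to the output
        // max, round back to int32, add the output zero point with saturating
        // 16-bit adds, pack to int8, and clamp to the output min.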
6328         __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
6329         __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
6330 
6331         const __m128 vscale0123 = _mm_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t)));
6332         const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t) + 4 * sizeof(float)));
6333         vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
6334         vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
6335 
6336         const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
6337         vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
6338         vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
6339 
6340         vacc0123 = _mm_cvtps_epi32(vscaled0123);
6341         vacc4567 = _mm_cvtps_epi32(vscaled4567);
6342 
6343         w = (const void*) ((const int32_t*) w + 8);
6344 
6345         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
6346         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
6347 
6348 
6349         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
6350 
6351         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
6352 
6353         if XNN_LIKELY(c >= 8) {
6354           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
6355           output += 8;
6356           c -= 8;
6357         } else {
6358           if (c & 4) {
6359             unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
6360             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
6361             output += 4;
6362           }
6363           if (c & 2) {
6364             unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
6365             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
6366             output += 2;
6367           }
6368           if (c & 1) {
6369             *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
6370             output += 1;
6371           }
6372           c = 0;
6373         }
6374       } while (c != 0);
6375     }
6376 
6377     output = (int8_t*) ((uintptr_t) output + output_increment);
6378   } while (--output_width != 0);
6379 }
6380 
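// QC8 GEMM microkernel: 1 row of A x 4 columns of B, K packed in groups of 8
// (the "1x4c8" in the name), 128-bit weight loads ("ld128"), and fp32
// requantization with per-channel scales stored after the packed weights.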
6381 void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
6382     size_t mr,
6383     size_t nc,
6384     size_t kc,
6385     const int8_t* restrict a,
6386     size_t a_stride,
6387     const void* restrict w,
6388     int8_t* restrict c,
6389     size_t cm_stride,
6390     size_t cn_stride,
6391     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6392 {
6393   assert(mr != 0);
6394   assert(mr <= 1);
6395   assert(nc != 0);
6396   assert(kc != 0);
6397   assert(kc % sizeof(int8_t) == 0);
6398   assert(a != NULL);
6399   assert(w != NULL);
6400   assert(c != NULL);
6401 
6402   kc = round_up_po2(kc, 8);
6403   const int8_t* a0 = a;
6404   int8_t* c0 = c;
6405 
6406   do {
6407     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
6408     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
6409     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
6410     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
6411     w = (const int32_t*) w + 4;
6412 
6413     size_t k = 0;
6414     while (k < kc) {
6415       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
6416       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
6417       a0 += 8;
6418 
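      // Weights are packed 8 bytes (one K group) per output channel: vb01 holds
      // channels 0 and 1. The low half is sign-extended with _mm_cvtepi8_epi16;
      // the high half via unpackhi + arithmetic right shift by 8.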
6419       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
6420       const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
6421       const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
6422 
6423       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
6424       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
6425       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
6426       const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
6427       const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
6428 
6429       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
6430       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
6431 
6432       w = (const void*) ((const int8_t*) w + 32);
6433       k += 8 * sizeof(int8_t);
6434     }
6435 
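    // Each vacc0xN holds four partial sums for output channel N; two rounds of
    // _mm_hadd_epi32 reduce them to one int32 per channel, ordered [c0 c1 c2 c3].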
6436     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
6437     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
6438 
6439     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
6440 
6441     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
6442 
6443     const __m128 vscale0123 = _mm_load_ps((const float*) w);
6444     w = (const void*) ((const float*) w + 4);
6445     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
6446 
6447     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
6448     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
6449 
6450     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
6451 
6452     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
6453     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
6454 
6455 
6456     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
6457 
6458     vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
6459 
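    // Store a full tile of 4 int8 outputs, or write the nc tail with 2- and
    // 1-byte stores, shifting the packed result between stores.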
6460     if (nc >= 4) {
6461       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
6462 
6463       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
6464 
6465       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
6466 
6467       nc -= 4;
6468     } else {
6469       if (nc & 2) {
6470         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
6471         c0 += 2;
6472         vout = _mm_srli_epi32(vout, 16);
6473       }
6474       if (nc & 1) {
6475         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
6476       }
6477 
6478       nc = 0;
6479     }
6480   } while (nc != 0);
6481 }
6482 
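// Two-row variant of the QC8 1x4c8 GEMM above. When mr != 2 the second row
// aliases the first so the extra accumulators and stores stay valid.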
6483 void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
6484     size_t mr,
6485     size_t nc,
6486     size_t kc,
6487     const int8_t* restrict a,
6488     size_t a_stride,
6489     const void* restrict w,
6490     int8_t* restrict c,
6491     size_t cm_stride,
6492     size_t cn_stride,
6493     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6494 {
6495   assert(mr != 0);
6496   assert(mr <= 2);
6497   assert(nc != 0);
6498   assert(kc != 0);
6499   assert(kc % sizeof(int8_t) == 0);
6500   assert(a != NULL);
6501   assert(w != NULL);
6502   assert(c != NULL);
6503 
6504   kc = round_up_po2(kc, 8);
6505   const int8_t* a0 = a;
6506   int8_t* c0 = c;
6507   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
6508   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
6509   if XNN_UNPREDICTABLE(mr != 2) {
6510     a1 = a0;
6511     c1 = c0;
6512   }
6513 
6514   do {
6515     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
6516     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
6517     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
6518     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
6519     __m128i vacc1x0 = vacc0x0;
6520     __m128i vacc1x1 = vacc0x1;
6521     __m128i vacc1x2 = vacc0x2;
6522     __m128i vacc1x3 = vacc0x3;
6523     w = (const int32_t*) w + 4;
6524 
6525     size_t k = 0;
6526     while (k < kc) {
6527       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
6528       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
6529       a0 += 8;
6530       const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
6531       const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
6532       a1 += 8;
6533 
6534       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
6535       const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
6536       const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
6537 
6538       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
6539       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
6540       vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
6541       vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
6542       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
6543       const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
6544       const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
6545 
6546       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
6547       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
6548       vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
6549       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
6550 
6551       w = (const void*) ((const int8_t*) w + 32);
6552       k += 8 * sizeof(int8_t);
6553     }
6554 
6555     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
6556     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
6557     const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
6558     const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
6559 
6560     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
6561     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
6562 
6563     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
6564     __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
6565 
6566     const __m128 vscale0123 = _mm_load_ps((const float*) w);
6567     w = (const void*) ((const float*) w + 4);
6568     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
6569     vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
6570 
6571     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
6572     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
6573     vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
6574 
6575     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
6576     vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
6577 
6578     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
6579     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
6580 
6581 
6582     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
6583 
6584     vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
6585 
6586     if (nc >= 4) {
6587       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
6588       unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
6589 
6590       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
6591       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
6592 
6593       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
6594       a1 = (const int8_t*) ((uintptr_t) a1 - kc);
6595 
6596       nc -= 4;
6597     } else {
6598       if (nc & 2) {
6599         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
6600         c0 += 2;
6601         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
6602         c1 += 2;
6603         vout = _mm_srli_epi32(vout, 16);
6604       }
6605       if (nc & 1) {
6606         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
6607         *c1 = (int8_t) _mm_extract_epi8(vout, 4);
6608       }
6609 
6610       nc = 0;
6611     }
6612   } while (nc != 0);
6613 }
6614 
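// Indirect-GEMM (IGEMM) variant of the QC8 1x4c8 kernel: A is a list of ks row
// pointers that is rewound by ks after each group of output columns; pointers
// equal to `zero` skip the a_offset adjustment so padding taps read zeros.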
6615 void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
6616     size_t mr,
6617     size_t nc,
6618     size_t kc,
6619     size_t ks,
6620     const int8_t** restrict a,
6621     const void* restrict w,
6622     int8_t* restrict c,
6623     size_t cm_stride,
6624     size_t cn_stride,
6625     size_t a_offset,
6626     const int8_t* zero,
6627     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6628 {
6629   assert(mr != 0);
6630   assert(mr <= 1);
6631   assert(nc != 0);
6632   assert(kc != 0);
6633   assert(ks != 0);
6634   assert(ks % (1 * sizeof(void*)) == 0);
6635   assert(a_offset % sizeof(int8_t) == 0);
6636   assert(a != NULL);
6637   assert(w != NULL);
6638   assert(c != NULL);
6639 
6640   kc = round_up_po2(kc, 8);
6641   int8_t* c0 = c;
6642 
6643   do {
6644     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
6645     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
6646     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
6647     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
6648     w = (const int32_t*) w + 4;
6649 
6650     size_t p = ks;
6651     do {
6652       const int8_t* restrict a0 = a[0];
6653       if XNN_UNPREDICTABLE(a0 != zero) {
6654         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
6655       }
6656       a += 1;
6657 
6658       size_t k = 0;
6659       while (k < kc) {
6660         const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
6661         const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
6662         a0 += 8;
6663 
6664         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
6665         const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
6666         const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
6667 
6668         vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
6669         vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
6670         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
6671         const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
6672         const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
6673 
6674         vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
6675         vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
6676 
6677         w = (const void*) ((const int8_t*) w + 32);
6678         k += 8 * sizeof(int8_t);
6679       }
6680       p -= 1 * sizeof(void*);
6681     } while (p != 0);
6682 
6683     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
6684     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
6685 
6686     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
6687 
6688     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
6689 
6690     const __m128 vscale0123 = _mm_load_ps((const float*) w);
6691     w = (const void*) ((const float*) w + 4);
6692     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
6693 
6694     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
6695     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
6696 
6697     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
6698 
6699     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
6700     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
6701 
6702 
6703     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
6704 
6705     vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
6706 
6707     if (nc >= 4) {
6708       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
6709       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
6710 
6711       a = (const int8_t**restrict) ((uintptr_t) a - ks);
6712 
6713       nc -= 4;
6714     } else {
6715       if (nc & 2) {
6716         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
6717         c0 += 2;
6718         vout = _mm_srli_epi32(vout, 16);
6719       }
6720       if (nc & 1) {
6721         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
6722       }
6723 
6724       nc = 0;
6725     }
6726   } while (nc != 0);
6727 }
6728 
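// Two-row QC8 IGEMM variant: c1 aliases c0 when mr != 2, and the indirection
// list advances by two pointers per ks step.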
6729 void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
6730     size_t mr,
6731     size_t nc,
6732     size_t kc,
6733     size_t ks,
6734     const int8_t** restrict a,
6735     const void* restrict w,
6736     int8_t* restrict c,
6737     size_t cm_stride,
6738     size_t cn_stride,
6739     size_t a_offset,
6740     const int8_t* zero,
6741     const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6742 {
6743   assert(mr != 0);
6744   assert(mr <= 2);
6745   assert(nc != 0);
6746   assert(kc != 0);
6747   assert(ks != 0);
6748   assert(ks % (2 * sizeof(void*)) == 0);
6749   assert(a_offset % sizeof(int8_t) == 0);
6750   assert(a != NULL);
6751   assert(w != NULL);
6752   assert(c != NULL);
6753 
6754   kc = round_up_po2(kc, 8);
6755   int8_t* c0 = c;
6756   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
6757   if XNN_UNPREDICTABLE(mr != 2) {
6758     c1 = c0;
6759   }
6760 
6761   do {
6762     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
6763     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
6764     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
6765     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
6766     __m128i vacc1x0 = vacc0x0;
6767     __m128i vacc1x1 = vacc0x1;
6768     __m128i vacc1x2 = vacc0x2;
6769     __m128i vacc1x3 = vacc0x3;
6770     w = (const int32_t*) w + 4;
6771 
6772     size_t p = ks;
6773     do {
6774       const int8_t* restrict a0 = a[0];
6775       if XNN_UNPREDICTABLE(a0 != zero) {
6776         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
6777       }
6778       const int8_t* restrict a1 = a[1];
6779       if XNN_UNPREDICTABLE(a1 != zero) {
6780         a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
6781       }
6782       a += 2;
6783 
6784       size_t k = 0;
6785       while (k < kc) {
6786         const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
6787         const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
6788         a0 += 8;
6789         const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
6790         const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
6791         a1 += 8;
6792 
6793         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
6794         const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
6795         const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
6796 
6797         vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
6798         vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
6799         vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
6800         vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
6801         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
6802         const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
6803         const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
6804 
6805         vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
6806         vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
6807         vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
6808         vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
6809 
6810         w = (const void*) ((const int8_t*) w + 32);
6811         k += 8 * sizeof(int8_t);
6812       }
6813       p -= 2 * sizeof(void*);
6814     } while (p != 0);
6815 
6816     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
6817     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
6818     const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
6819     const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
6820 
6821     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
6822     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
6823 
6824     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
6825     __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
6826 
6827     const __m128 vscale0123 = _mm_load_ps((const float*) w);
6828     w = (const void*) ((const float*) w + 4);
6829     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
6830     vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
6831 
6832     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
6833     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
6834     vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
6835 
6836     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
6837     vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
6838 
6839     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
6840     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
6841 
6842 
6843     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
6844 
6845     vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
6846 
6847     if (nc >= 4) {
6848       unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
6849       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
6850       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
6851       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
6852 
6853       a = (const int8_t**restrict) ((uintptr_t) a - ks);
6854 
6855       nc -= 4;
6856     } else {
6857       if (nc & 2) {
6858         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
6859         c1 += 2;
6860         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
6861         c0 += 2;
6862         vout = _mm_srli_epi32(vout, 16);
6863       }
6864       if (nc & 1) {
6865         *c1 = (int8_t) _mm_extract_epi8(vout, 4);
6866         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
6867       }
6868 
6869       nc = 0;
6870     }
6871   } while (nc != 0);
6872 }
6873 
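// QS8 depthwise convolution, 25 taps, 16 channels per iteration ("up16x25"),
// accumulating pairs of 16-bit products ("mul16_add16") before widening to
// 32 bits. Input rows equal to `zero` skip the input_offset adjustment.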
6874 void xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16(
6875     size_t channels,
6876     size_t output_width,
6877     const int8_t** input,
6878     const void* weights,
6879     int8_t* output,
6880     size_t input_stride,
6881     size_t output_increment,
6882     size_t input_offset,
6883     const int8_t* zero,
6884     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6885 {
6886   assert(channels != 0);
6887   assert(output_width != 0);
6888 
6889   do {
6890     const int8_t* i0 = input[0];
6891     assert(i0 != NULL);
6892     if XNN_UNPREDICTABLE(i0 != zero) {
6893       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
6894     }
6895     const int8_t* i1 = input[1];
6896     assert(i1 != NULL);
6897     if XNN_UNPREDICTABLE(i1 != zero) {
6898       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
6899     }
6900     const int8_t* i2 = input[2];
6901     assert(i2 != NULL);
6902     if XNN_UNPREDICTABLE(i2 != zero) {
6903       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
6904     }
6905     const int8_t* i3 = input[3];
6906     assert(i3 != NULL);
6907     if XNN_UNPREDICTABLE(i3 != zero) {
6908       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
6909     }
6910     const int8_t* i4 = input[4];
6911     assert(i4 != NULL);
6912     if XNN_UNPREDICTABLE(i4 != zero) {
6913       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
6914     }
6915     const int8_t* i5 = input[5];
6916     assert(i5 != NULL);
6917     if XNN_UNPREDICTABLE(i5 != zero) {
6918       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
6919     }
6920     const int8_t* i6 = input[6];
6921     assert(i6 != NULL);
6922     if XNN_UNPREDICTABLE(i6 != zero) {
6923       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
6924     }
6925     const int8_t* i7 = input[7];
6926     assert(i7 != NULL);
6927     if XNN_UNPREDICTABLE(i7 != zero) {
6928       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
6929     }
6930     const int8_t* i8 = input[8];
6931     assert(i8 != NULL);
6932     if XNN_UNPREDICTABLE(i8 != zero) {
6933       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
6934     }
6935     const int8_t* i9 = input[9];
6936     assert(i9 != NULL);
6937     if XNN_UNPREDICTABLE(i9 != zero) {
6938       i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
6939     }
6940     const int8_t* i10 = input[10];
6941     assert(i10 != NULL);
6942     if XNN_UNPREDICTABLE(i10 != zero) {
6943       i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
6944     }
6945     const int8_t* i11 = input[11];
6946     assert(i11 != NULL);
6947     if XNN_UNPREDICTABLE(i11 != zero) {
6948       i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
6949     }
6950     const int8_t* i12 = input[12];
6951     assert(i12 != NULL);
6952     if XNN_UNPREDICTABLE(i12 != zero) {
6953       i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
6954     }
6955     const int8_t* i13 = input[13];
6956     assert(i13 != NULL);
6957     if XNN_UNPREDICTABLE(i13 != zero) {
6958       i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
6959     }
6960     const int8_t* i14 = input[14];
6961     assert(i14 != NULL);
6962     if XNN_UNPREDICTABLE(i14 != zero) {
6963       i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
6964     }
6965     const int8_t* i15 = input[15];
6966     assert(i15 != NULL);
6967     if XNN_UNPREDICTABLE(i15 != zero) {
6968       i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
6969     }
6970     const int8_t* i16 = input[16];
6971     assert(i16 != NULL);
6972     if XNN_UNPREDICTABLE(i16 != zero) {
6973       i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
6974     }
6975     const int8_t* i17 = input[17];
6976     assert(i17 != NULL);
6977     if XNN_UNPREDICTABLE(i17 != zero) {
6978       i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
6979     }
6980     const int8_t* i18 = input[18];
6981     assert(i18 != NULL);
6982     if XNN_UNPREDICTABLE(i18 != zero) {
6983       i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
6984     }
6985     const int8_t* i19 = input[19];
6986     assert(i19 != NULL);
6987     if XNN_UNPREDICTABLE(i19 != zero) {
6988       i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
6989     }
6990     const int8_t* i20 = input[20];
6991     assert(i20 != NULL);
6992     if XNN_UNPREDICTABLE(i20 != zero) {
6993       i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
6994     }
6995     const int8_t* i21 = input[21];
6996     assert(i21 != NULL);
6997     if XNN_UNPREDICTABLE(i21 != zero) {
6998       i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
6999     }
7000     const int8_t* i22 = input[22];
7001     assert(i22 != NULL);
7002     if XNN_UNPREDICTABLE(i22 != zero) {
7003       i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
7004     }
7005     const int8_t* i23 = input[23];
7006     assert(i23 != NULL);
7007     if XNN_UNPREDICTABLE(i23 != zero) {
7008       i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
7009     }
7010     const int8_t* i24 = input[24];
7011     assert(i24 != NULL);
7012     if XNN_UNPREDICTABLE(i24 != zero) {
7013       i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
7014     }
7015     input = (const int8_t**) ((uintptr_t) input + input_stride);
7016 
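    // Main channel loop: 16 channels per iteration. The packed weights start
    // with 16 int32 bias values, followed by 25 taps of 16 int8 weights each.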
7017     size_t c = channels;
7018     const void* w = weights;
7019     for (; c >= 16; c -= 16) {
7020       __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
7021       __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
7022       __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
7023       __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
7024 
7025 
7026       const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
7027       const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
7028       const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t)));
7029       const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
7030       const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
7031       const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(vi0x89ABCDEF);
7032       const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t)));
7033       const __m128i vxk0x89ABCDEF = _mm_cvtepi8_epi16(vk0x89ABCDEF);
7034       i0 += 16;
7035 
7036 
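      // mul16/add16 scheme: inputs and weights are widened to int16 and
      // multiplied with _mm_mullo_epi16; the products of two consecutive taps
      // are summed in 16 bits before being sign-extended into the int32
      // accumulators.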
7037       __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
7038       __m128i vprod89ABCDEF = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
7039 
7040 
7041       const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
7042       const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
7043       const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t)));
7044       const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
7045       const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
7046       const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(vi1x89ABCDEF);
7047       const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t)));
7048       const __m128i vxk1x89ABCDEF = _mm_cvtepi8_epi16(vk1x89ABCDEF);
7049       i1 += 16;
7050 
7051 
7052       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
7053       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF));
7054 
7055       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7056       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7057       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7058       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7059 
7060       const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
7061       const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
7062       const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t)));
7063       const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
7064       const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
7065       const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(vi2x89ABCDEF);
7066       const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t)));
7067       const __m128i vxk2x89ABCDEF = _mm_cvtepi8_epi16(vk2x89ABCDEF);
7068       i2 += 16;
7069 
7070 
7071       vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
7072       vprod89ABCDEF = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
7073 
7074 
7075       const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
7076       const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
7077       const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t)));
7078       const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
7079       const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
7080       const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(vi3x89ABCDEF);
7081       const __m128i vk3x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t)));
7082       const __m128i vxk3x89ABCDEF = _mm_cvtepi8_epi16(vk3x89ABCDEF);
7083       i3 += 16;
7084 
7085 
7086       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
7087       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF));
7088 
7089       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7090       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7091       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7092       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7093 
7094       const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
7095       const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
7096       const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t)));
7097       const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
7098       const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
7099       const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(vi4x89ABCDEF);
7100       const __m128i vk4x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t)));
7101       const __m128i vxk4x89ABCDEF = _mm_cvtepi8_epi16(vk4x89ABCDEF);
7102       i4 += 16;
7103 
7104 
7105       vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
7106       vprod89ABCDEF = _mm_mullo_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF);
7107 
7108 
7109       const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
7110       const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
7111       const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t)));
7112       const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
7113       const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
7114       const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(vi5x89ABCDEF);
7115       const __m128i vk5x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t)));
7116       const __m128i vxk5x89ABCDEF = _mm_cvtepi8_epi16(vk5x89ABCDEF);
7117       i5 += 16;
7118 
7119 
7120       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
7121       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF));
7122 
7123       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7124       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7125       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7126       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7127 
7128       const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
7129       const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
7130       const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t)));
7131       const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
7132       const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
7133       const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(vi6x89ABCDEF);
7134       const __m128i vk6x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t)));
7135       const __m128i vxk6x89ABCDEF = _mm_cvtepi8_epi16(vk6x89ABCDEF);
7136       i6 += 16;
7137 
7138 
7139       vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
7140       vprod89ABCDEF = _mm_mullo_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF);
7141 
7142 
7143       const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
7144       const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
7145       const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t)));
7146       const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
7147       const __m128i vi7x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i7 + 8));
7148       const __m128i vxi7x89ABCDEF = _mm_cvtepi8_epi16(vi7x89ABCDEF);
7149       const __m128i vk7x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t)));
7150       const __m128i vxk7x89ABCDEF = _mm_cvtepi8_epi16(vk7x89ABCDEF);
7151       i7 += 16;
7152 
7153 
7154       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
7155       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF));
7156 
7157       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7158       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7159       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7160       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7161 
7162       const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
7163       const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
7164       const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t)));
7165       const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
7166       const __m128i vi8x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i8 + 8));
7167       const __m128i vxi8x89ABCDEF = _mm_cvtepi8_epi16(vi8x89ABCDEF);
7168       const __m128i vk8x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t)));
7169       const __m128i vxk8x89ABCDEF = _mm_cvtepi8_epi16(vk8x89ABCDEF);
7170       i8 += 16;
7171 
7172 
7173       vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
7174       vprod89ABCDEF = _mm_mullo_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF);
7175 
7176 
7177       const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
7178       const __m128i vxi9x01234567 = _mm_cvtepi8_epi16(vi9x01234567);
7179       const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t)));
7180       const __m128i vxk9x01234567 = _mm_cvtepi8_epi16(vk9x01234567);
7181       const __m128i vi9x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i9 + 8));
7182       const __m128i vxi9x89ABCDEF = _mm_cvtepi8_epi16(vi9x89ABCDEF);
7183       const __m128i vk9x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t)));
7184       const __m128i vxk9x89ABCDEF = _mm_cvtepi8_epi16(vk9x89ABCDEF);
7185       i9 += 16;
7186 
7187 
7188       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi9x01234567, vxk9x01234567));
7189       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi9x89ABCDEF, vxk9x89ABCDEF));
7190 
7191       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7192       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7193       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7194       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7195 
7196       const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
7197       const __m128i vxi10x01234567 = _mm_cvtepi8_epi16(vi10x01234567);
7198       const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t)));
7199       const __m128i vxk10x01234567 = _mm_cvtepi8_epi16(vk10x01234567);
7200       const __m128i vi10x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i10 + 8));
7201       const __m128i vxi10x89ABCDEF = _mm_cvtepi8_epi16(vi10x89ABCDEF);
7202       const __m128i vk10x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t)));
7203       const __m128i vxk10x89ABCDEF = _mm_cvtepi8_epi16(vk10x89ABCDEF);
7204       i10 += 16;
7205 
7206 
7207       vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
7208       vprod89ABCDEF = _mm_mullo_epi16(vxi10x89ABCDEF, vxk10x89ABCDEF);
7209 
7210 
7211       const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
7212       const __m128i vxi11x01234567 = _mm_cvtepi8_epi16(vi11x01234567);
7213       const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t)));
7214       const __m128i vxk11x01234567 = _mm_cvtepi8_epi16(vk11x01234567);
7215       const __m128i vi11x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i11 + 8));
7216       const __m128i vxi11x89ABCDEF = _mm_cvtepi8_epi16(vi11x89ABCDEF);
7217       const __m128i vk11x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t)));
7218       const __m128i vxk11x89ABCDEF = _mm_cvtepi8_epi16(vk11x89ABCDEF);
7219       i11 += 16;
7220 
7221 
7222       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi11x01234567, vxk11x01234567));
7223       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi11x89ABCDEF, vxk11x89ABCDEF));
7224 
7225       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7226       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7227       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7228       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7229 
7230       const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
7231       const __m128i vxi12x01234567 = _mm_cvtepi8_epi16(vi12x01234567);
7232       const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t)));
7233       const __m128i vxk12x01234567 = _mm_cvtepi8_epi16(vk12x01234567);
7234       const __m128i vi12x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i12 + 8));
7235       const __m128i vxi12x89ABCDEF = _mm_cvtepi8_epi16(vi12x89ABCDEF);
7236       const __m128i vk12x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t)));
7237       const __m128i vxk12x89ABCDEF = _mm_cvtepi8_epi16(vk12x89ABCDEF);
7238       i12 += 16;
7239 
7240 
7241       vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
7242       vprod89ABCDEF = _mm_mullo_epi16(vxi12x89ABCDEF, vxk12x89ABCDEF);
7243 
7244 
7245       const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
7246       const __m128i vxi13x01234567 = _mm_cvtepi8_epi16(vi13x01234567);
7247       const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t)));
7248       const __m128i vxk13x01234567 = _mm_cvtepi8_epi16(vk13x01234567);
7249       const __m128i vi13x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i13 + 8));
7250       const __m128i vxi13x89ABCDEF = _mm_cvtepi8_epi16(vi13x89ABCDEF);
7251       const __m128i vk13x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t)));
7252       const __m128i vxk13x89ABCDEF = _mm_cvtepi8_epi16(vk13x89ABCDEF);
7253       i13 += 16;
7254 
7255 
7256       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi13x01234567, vxk13x01234567));
7257       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi13x89ABCDEF, vxk13x89ABCDEF));
7258 
7259       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7260       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7261       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7262       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7263 
7264       const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
7265       const __m128i vxi14x01234567 = _mm_cvtepi8_epi16(vi14x01234567);
7266       const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t)));
7267       const __m128i vxk14x01234567 = _mm_cvtepi8_epi16(vk14x01234567);
7268       const __m128i vi14x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i14 + 8));
7269       const __m128i vxi14x89ABCDEF = _mm_cvtepi8_epi16(vi14x89ABCDEF);
7270       const __m128i vk14x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t)));
7271       const __m128i vxk14x89ABCDEF = _mm_cvtepi8_epi16(vk14x89ABCDEF);
7272       i14 += 16;
7273 
7274 
7275       vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
7276       vprod89ABCDEF = _mm_mullo_epi16(vxi14x89ABCDEF, vxk14x89ABCDEF);
7277 
7278 
7279       const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
7280       const __m128i vxi15x01234567 = _mm_cvtepi8_epi16(vi15x01234567);
7281       const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t)));
7282       const __m128i vxk15x01234567 = _mm_cvtepi8_epi16(vk15x01234567);
7283       const __m128i vi15x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i15 + 8));
7284       const __m128i vxi15x89ABCDEF = _mm_cvtepi8_epi16(vi15x89ABCDEF);
7285       const __m128i vk15x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t)));
7286       const __m128i vxk15x89ABCDEF = _mm_cvtepi8_epi16(vk15x89ABCDEF);
7287       i15 += 16;
7288 
7289 
7290       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi15x01234567, vxk15x01234567));
7291       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi15x89ABCDEF, vxk15x89ABCDEF));
7292 
7293       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7294       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7295       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7296       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7297 
7298       const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
7299       const __m128i vxi16x01234567 = _mm_cvtepi8_epi16(vi16x01234567);
7300       const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t)));
7301       const __m128i vxk16x01234567 = _mm_cvtepi8_epi16(vk16x01234567);
7302       const __m128i vi16x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i16 + 8));
7303       const __m128i vxi16x89ABCDEF = _mm_cvtepi8_epi16(vi16x89ABCDEF);
7304       const __m128i vk16x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t)));
7305       const __m128i vxk16x89ABCDEF = _mm_cvtepi8_epi16(vk16x89ABCDEF);
7306       i16 += 16;
7307 
7308 
7309       vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
7310       vprod89ABCDEF = _mm_mullo_epi16(vxi16x89ABCDEF, vxk16x89ABCDEF);
7311 
7312 
7313       const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
7314       const __m128i vxi17x01234567 = _mm_cvtepi8_epi16(vi17x01234567);
7315       const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t)));
7316       const __m128i vxk17x01234567 = _mm_cvtepi8_epi16(vk17x01234567);
7317       const __m128i vi17x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i17 + 8));
7318       const __m128i vxi17x89ABCDEF = _mm_cvtepi8_epi16(vi17x89ABCDEF);
7319       const __m128i vk17x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t)));
7320       const __m128i vxk17x89ABCDEF = _mm_cvtepi8_epi16(vk17x89ABCDEF);
7321       i17 += 16;
7322 
7323 
7324       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi17x01234567, vxk17x01234567));
7325       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi17x89ABCDEF, vxk17x89ABCDEF));
7326 
7327       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7328       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7329       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7330       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7331 
7332       const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
7333       const __m128i vxi18x01234567 = _mm_cvtepi8_epi16(vi18x01234567);
7334       const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t)));
7335       const __m128i vxk18x01234567 = _mm_cvtepi8_epi16(vk18x01234567);
7336       const __m128i vi18x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i18 + 8));
7337       const __m128i vxi18x89ABCDEF = _mm_cvtepi8_epi16(vi18x89ABCDEF);
7338       const __m128i vk18x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t)));
7339       const __m128i vxk18x89ABCDEF = _mm_cvtepi8_epi16(vk18x89ABCDEF);
7340       i18 += 16;
7341 
7342 
7343       vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
7344       vprod89ABCDEF = _mm_mullo_epi16(vxi18x89ABCDEF, vxk18x89ABCDEF);
7345 
7346 
7347       const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
7348       const __m128i vxi19x01234567 = _mm_cvtepi8_epi16(vi19x01234567);
7349       const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t)));
7350       const __m128i vxk19x01234567 = _mm_cvtepi8_epi16(vk19x01234567);
7351       const __m128i vi19x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i19 + 8));
7352       const __m128i vxi19x89ABCDEF = _mm_cvtepi8_epi16(vi19x89ABCDEF);
7353       const __m128i vk19x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t)));
7354       const __m128i vxk19x89ABCDEF = _mm_cvtepi8_epi16(vk19x89ABCDEF);
7355       i19 += 16;
7356 
7357 
7358       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi19x01234567, vxk19x01234567));
7359       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi19x89ABCDEF, vxk19x89ABCDEF));
7360 
7361       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7362       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7363       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7364       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7365 
7366       const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
7367       const __m128i vxi20x01234567 = _mm_cvtepi8_epi16(vi20x01234567);
7368       const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t)));
7369       const __m128i vxk20x01234567 = _mm_cvtepi8_epi16(vk20x01234567);
7370       const __m128i vi20x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i20 + 8));
7371       const __m128i vxi20x89ABCDEF = _mm_cvtepi8_epi16(vi20x89ABCDEF);
7372       const __m128i vk20x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t)));
7373       const __m128i vxk20x89ABCDEF = _mm_cvtepi8_epi16(vk20x89ABCDEF);
7374       i20 += 16;
7375 
7376 
7377       vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
7378       vprod89ABCDEF = _mm_mullo_epi16(vxi20x89ABCDEF, vxk20x89ABCDEF);
7379 
7380 
7381       const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
7382       const __m128i vxi21x01234567 = _mm_cvtepi8_epi16(vi21x01234567);
7383       const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t)));
7384       const __m128i vxk21x01234567 = _mm_cvtepi8_epi16(vk21x01234567);
7385       const __m128i vi21x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i21 + 8));
7386       const __m128i vxi21x89ABCDEF = _mm_cvtepi8_epi16(vi21x89ABCDEF);
7387       const __m128i vk21x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t)));
7388       const __m128i vxk21x89ABCDEF = _mm_cvtepi8_epi16(vk21x89ABCDEF);
7389       i21 += 16;
7390 
7391 
7392       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi21x01234567, vxk21x01234567));
7393       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi21x89ABCDEF, vxk21x89ABCDEF));
7394 
7395       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7396       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7397       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7398       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7399 
7400       const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
7401       const __m128i vxi22x01234567 = _mm_cvtepi8_epi16(vi22x01234567);
7402       const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t)));
7403       const __m128i vxk22x01234567 = _mm_cvtepi8_epi16(vk22x01234567);
7404       const __m128i vi22x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i22 + 8));
7405       const __m128i vxi22x89ABCDEF = _mm_cvtepi8_epi16(vi22x89ABCDEF);
7406       const __m128i vk22x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t)));
7407       const __m128i vxk22x89ABCDEF = _mm_cvtepi8_epi16(vk22x89ABCDEF);
7408       i22 += 16;
7409 
7410 
7411       vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
7412       vprod89ABCDEF = _mm_mullo_epi16(vxi22x89ABCDEF, vxk22x89ABCDEF);
7413 
7414 
7415       const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
7416       const __m128i vxi23x01234567 = _mm_cvtepi8_epi16(vi23x01234567);
7417       const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t)));
7418       const __m128i vxk23x01234567 = _mm_cvtepi8_epi16(vk23x01234567);
7419       const __m128i vi23x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i23 + 8));
7420       const __m128i vxi23x89ABCDEF = _mm_cvtepi8_epi16(vi23x89ABCDEF);
7421       const __m128i vk23x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t)));
7422       const __m128i vxk23x89ABCDEF = _mm_cvtepi8_epi16(vk23x89ABCDEF);
7423       i23 += 16;
7424 
7425 
7426       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi23x01234567, vxk23x01234567));
7427       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi23x89ABCDEF, vxk23x89ABCDEF));
7428 
7429       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7430       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7431       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7432       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7433 
7434       const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
7435       const __m128i vxi24x01234567 = _mm_cvtepi8_epi16(vi24x01234567);
7436       const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t)));
7437       const __m128i vxk24x01234567 = _mm_cvtepi8_epi16(vk24x01234567);
7438       const __m128i vi24x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i24 + 8));
7439       const __m128i vxi24x89ABCDEF = _mm_cvtepi8_epi16(vi24x89ABCDEF);
7440       const __m128i vk24x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t)));
7441       const __m128i vxk24x89ABCDEF = _mm_cvtepi8_epi16(vk24x89ABCDEF);
7442       i24 += 16;
7443 
7444 
7445       vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
7446       vprod89ABCDEF = _mm_mullo_epi16(vxi24x89ABCDEF, vxk24x89ABCDEF);
7447 
7448       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7449       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7450       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7451       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7452 
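      // Skip past this channel group's packed weights: 16 int32 biases followed by 25 taps x 16 int8 kernel values (400 bytes).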
7453       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
7454 
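      // Requantize: convert the int32 accumulators to float, apply the fp32 scale, clamp to the output max
      // (stored relative to the zero point), round back to int32, add the zero point with saturation,
      // pack to int8, and clamp to the output min.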
7455       __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
7456       __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
7457       __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
7458       __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
7459 
7460       const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
7461       vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
7462       vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
7463       vscaled89AB = _mm_mul_ps(vscaled89AB, vscale);
7464       vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscale);
7465 
7466       const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
7467       vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
7468       vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
7469       vscaled89AB = _mm_min_ps(vscaled89AB, voutput_max_less_zero_point);
7470       vscaledCDEF = _mm_min_ps(vscaledCDEF, voutput_max_less_zero_point);
7471 
7472       vacc0123 = _mm_cvtps_epi32(vscaled0123);
7473       vacc4567 = _mm_cvtps_epi32(vscaled4567);
7474       vacc89AB = _mm_cvtps_epi32(vscaled89AB);
7475       vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
7476 
7477       const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
7478       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7479       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
7480 
7481 
7482       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
7483 
7484       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
7485       vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
7486 
7487       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
7488       output += 16;
7489     }
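    // Remainder: process the last 1-15 channels 8 at a time; per-tap weights are read from k with a 16-byte
    // (channel tile) stride, and results beyond the last channel are dropped by the partial store below.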
7490     if XNN_UNLIKELY(c != 0) {
7491       const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
7492       do {
7493         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
7494         __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
7495 
7496 
7497         const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
7498         const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
7499         const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k);
7500         const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
7501         i0 += 8;
7502 
7503 
7504         __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
7505 
7506 
7507         const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
7508         const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
7509         const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 16));
7510         const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
7511         i1 += 8;
7512 
7513 
7514         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
7515 
7516         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7517         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7518 
7519         const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
7520         const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
7521         const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 32));
7522         const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
7523         i2 += 8;
7524 
7525 
7526         vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
7527 
7528 
7529         const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
7530         const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
7531         const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) (k + 48));
7532         const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
7533         i3 += 8;
7534 
7535 
7536         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
7537 
7538         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7539         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7540 
7541         const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
7542         const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
7543         const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) (k + 64));
7544         const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
7545         i4 += 8;
7546 
7547 
7548         vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
7549 
7550 
7551         const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
7552         const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
7553         const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) (k + 80));
7554         const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
7555         i5 += 8;
7556 
7557 
7558         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
7559 
7560         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7561         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7562 
7563         const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
7564         const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
7565         const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) (k + 96));
7566         const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
7567         i6 += 8;
7568 
7569 
7570         vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
7571 
7572 
7573         const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
7574         const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
7575         const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) (k + 112));
7576         const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
7577         i7 += 8;
7578 
7579 
7580         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
7581 
7582         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7583         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7584 
7585         const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
7586         const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
7587         const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) (k + 128));
7588         const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
7589         i8 += 8;
7590 
7591 
7592         vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
7593 
7594 
7595         const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
7596         const __m128i vxi9x01234567 = _mm_cvtepi8_epi16(vi9x01234567);
7597         const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) (k + 144));
7598         const __m128i vxk9x01234567 = _mm_cvtepi8_epi16(vk9x01234567);
7599         i9 += 8;
7600 
7601 
7602         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi9x01234567, vxk9x01234567));
7603 
7604         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7605         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7606 
7607         const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
7608         const __m128i vxi10x01234567 = _mm_cvtepi8_epi16(vi10x01234567);
7609         const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) (k + 160));
7610         const __m128i vxk10x01234567 = _mm_cvtepi8_epi16(vk10x01234567);
7611         i10 += 8;
7612 
7613 
7614         vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
7615 
7616 
7617         const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
7618         const __m128i vxi11x01234567 = _mm_cvtepi8_epi16(vi11x01234567);
7619         const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) (k + 176));
7620         const __m128i vxk11x01234567 = _mm_cvtepi8_epi16(vk11x01234567);
7621         i11 += 8;
7622 
7623 
7624         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi11x01234567, vxk11x01234567));
7625 
7626         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7627         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7628 
7629         const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
7630         const __m128i vxi12x01234567 = _mm_cvtepi8_epi16(vi12x01234567);
7631         const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) (k + 192));
7632         const __m128i vxk12x01234567 = _mm_cvtepi8_epi16(vk12x01234567);
7633         i12 += 8;
7634 
7635 
7636         vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
7637 
7638 
7639         const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
7640         const __m128i vxi13x01234567 = _mm_cvtepi8_epi16(vi13x01234567);
7641         const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) (k + 208));
7642         const __m128i vxk13x01234567 = _mm_cvtepi8_epi16(vk13x01234567);
7643         i13 += 8;
7644 
7645 
7646         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi13x01234567, vxk13x01234567));
7647 
7648         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7649         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7650 
7651         const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
7652         const __m128i vxi14x01234567 = _mm_cvtepi8_epi16(vi14x01234567);
7653         const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) (k + 224));
7654         const __m128i vxk14x01234567 = _mm_cvtepi8_epi16(vk14x01234567);
7655         i14 += 8;
7656 
7657 
7658         vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
7659 
7660 
7661         const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
7662         const __m128i vxi15x01234567 = _mm_cvtepi8_epi16(vi15x01234567);
7663         const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) (k + 240));
7664         const __m128i vxk15x01234567 = _mm_cvtepi8_epi16(vk15x01234567);
7665         i15 += 8;
7666 
7667 
7668         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi15x01234567, vxk15x01234567));
7669 
7670         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7671         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7672 
7673         const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
7674         const __m128i vxi16x01234567 = _mm_cvtepi8_epi16(vi16x01234567);
7675         const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) (k + 256));
7676         const __m128i vxk16x01234567 = _mm_cvtepi8_epi16(vk16x01234567);
7677         i16 += 8;
7678 
7679 
7680         vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
7681 
7682 
7683         const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
7684         const __m128i vxi17x01234567 = _mm_cvtepi8_epi16(vi17x01234567);
7685         const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) (k + 272));
7686         const __m128i vxk17x01234567 = _mm_cvtepi8_epi16(vk17x01234567);
7687         i17 += 8;
7688 
7689 
7690         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi17x01234567, vxk17x01234567));
7691 
7692         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7693         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7694 
7695         const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
7696         const __m128i vxi18x01234567 = _mm_cvtepi8_epi16(vi18x01234567);
7697         const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) (k + 288));
7698         const __m128i vxk18x01234567 = _mm_cvtepi8_epi16(vk18x01234567);
7699         i18 += 8;
7700 
7701 
7702         vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
7703 
7704 
7705         const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
7706         const __m128i vxi19x01234567 = _mm_cvtepi8_epi16(vi19x01234567);
7707         const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) (k + 304));
7708         const __m128i vxk19x01234567 = _mm_cvtepi8_epi16(vk19x01234567);
7709         i19 += 8;
7710 
7711 
7712         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi19x01234567, vxk19x01234567));
7713 
7714         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7715         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7716 
7717         const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
7718         const __m128i vxi20x01234567 = _mm_cvtepi8_epi16(vi20x01234567);
7719         const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) (k + 320));
7720         const __m128i vxk20x01234567 = _mm_cvtepi8_epi16(vk20x01234567);
7721         i20 += 8;
7722 
7723 
7724         vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
7725 
7726 
7727         const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
7728         const __m128i vxi21x01234567 = _mm_cvtepi8_epi16(vi21x01234567);
7729         const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) (k + 336));
7730         const __m128i vxk21x01234567 = _mm_cvtepi8_epi16(vk21x01234567);
7731         i21 += 8;
7732 
7733 
7734         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi21x01234567, vxk21x01234567));
7735 
7736         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7737         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7738 
7739         const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
7740         const __m128i vxi22x01234567 = _mm_cvtepi8_epi16(vi22x01234567);
7741         const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) (k + 352));
7742         const __m128i vxk22x01234567 = _mm_cvtepi8_epi16(vk22x01234567);
7743         i22 += 8;
7744 
7745 
7746         vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
7747 
7748 
7749         const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
7750         const __m128i vxi23x01234567 = _mm_cvtepi8_epi16(vi23x01234567);
7751         const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) (k + 368));
7752         const __m128i vxk23x01234567 = _mm_cvtepi8_epi16(vk23x01234567);
7753         i23 += 8;
7754 
7755 
7756         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi23x01234567, vxk23x01234567));
7757 
7758         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7759         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7760 
7761         const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
7762         const __m128i vxi24x01234567 = _mm_cvtepi8_epi16(vi24x01234567);
7763         const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) (k + 384));
7764         const __m128i vxk24x01234567 = _mm_cvtepi8_epi16(vk24x01234567);
7765         i24 += 8;
7766 
7767 
7768         vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
7769 
7770         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7771         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7772 
7773         k += 8;
7774 
7775         __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
7776         __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
7777 
7778         const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
7779         vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
7780         vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
7781 
7782         const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
7783         vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
7784         vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
7785 
7786         vacc0123 = _mm_cvtps_epi32(vscaled0123);
7787         vacc4567 = _mm_cvtps_epi32(vscaled4567);
7788 
7789         w = (const void*) ((const int32_t*) w + 8);
7790 
7791         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
7792         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7793 
7794 
7795         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
7796 
7797         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
7798 
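        // Store 8 outputs while at least 8 channels remain; otherwise emit the final 1-7 bytes in
        // 4/2/1-byte pieces, shifting the packed vector right after each piece.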
7799         if XNN_LIKELY(c >= 8) {
7800           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
7801           output += 8;
7802           c -= 8;
7803         } else {
7804           if (c & 4) {
7805             unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
7806             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
7807             output += 4;
7808           }
7809           if (c & 2) {
7810             unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
7811             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
7812             output += 2;
7813           }
7814           if (c & 1) {
7815             *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
7816             output += 1;
7817           }
7818           c = 0;
7819         }
7820       } while (c != 0);
7821     }
7822 
7823     output = (int8_t*) ((uintptr_t) output + output_increment);
7824   } while (--output_width != 0);
7825 }
7826 
7827 void xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16(
7828     size_t channels,
7829     size_t output_width,
7830     const int8_t** input,
7831     const void* weights,
7832     int8_t* output,
7833     size_t input_stride,
7834     size_t output_increment,
7835     size_t input_offset,
7836     const int8_t* zero,
7837     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7838 {
7839   assert(channels != 0);
7840   assert(output_width != 0);
7841 
7842   do {
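    // Gather the 9 input row pointers for this output pixel; rows pointing at the shared `zero` (padding)
    // buffer skip the input_offset adjustment.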
7843     const int8_t* i0 = input[0];
7844     assert(i0 != NULL);
7845     if XNN_UNPREDICTABLE(i0 != zero) {
7846       i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
7847     }
7848     const int8_t* i1 = input[1];
7849     assert(i1 != NULL);
7850     if XNN_UNPREDICTABLE(i1 != zero) {
7851       i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
7852     }
7853     const int8_t* i2 = input[2];
7854     assert(i2 != NULL);
7855     if XNN_UNPREDICTABLE(i2 != zero) {
7856       i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
7857     }
7858     const int8_t* i3 = input[3];
7859     assert(i3 != NULL);
7860     if XNN_UNPREDICTABLE(i3 != zero) {
7861       i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
7862     }
7863     const int8_t* i4 = input[4];
7864     assert(i4 != NULL);
7865     if XNN_UNPREDICTABLE(i4 != zero) {
7866       i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
7867     }
7868     const int8_t* i5 = input[5];
7869     assert(i5 != NULL);
7870     if XNN_UNPREDICTABLE(i5 != zero) {
7871       i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
7872     }
7873     const int8_t* i6 = input[6];
7874     assert(i6 != NULL);
7875     if XNN_UNPREDICTABLE(i6 != zero) {
7876       i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
7877     }
7878     const int8_t* i7 = input[7];
7879     assert(i7 != NULL);
7880     if XNN_UNPREDICTABLE(i7 != zero) {
7881       i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
7882     }
7883     const int8_t* i8 = input[8];
7884     assert(i8 != NULL);
7885     if XNN_UNPREDICTABLE(i8 != zero) {
7886       i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
7887     }
7888     input = (const int8_t**) ((uintptr_t) input + input_stride);
7889 
7890     size_t c = channels;
7891     const void* w = weights;
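    // mul16_add16 scheme: inputs and weights are sign-extended to int16, each tap's products are computed with
    // _mm_mullo_epi16, sums of consecutive tap pairs are formed in 16 bits, and each pair sum is sign-extended
    // and accumulated into the int32 accumulators.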
7892     for (; c >= 16; c -= 16) {
7893       __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
7894       __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
7895       __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
7896       __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
7897 
7898 
7899       const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
7900       const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
7901       const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t)));
7902       const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
7903       const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
7904       const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(vi0x89ABCDEF);
7905       const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t)));
7906       const __m128i vxk0x89ABCDEF = _mm_cvtepi8_epi16(vk0x89ABCDEF);
7907       i0 += 16;
7908 
7909 
7910       __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
7911       __m128i vprod89ABCDEF = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
7912 
7913 
7914       const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
7915       const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
7916       const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t)));
7917       const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
7918       const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
7919       const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(vi1x89ABCDEF);
7920       const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t)));
7921       const __m128i vxk1x89ABCDEF = _mm_cvtepi8_epi16(vk1x89ABCDEF);
7922       i1 += 16;
7923 
7924 
7925       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
7926       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF));
7927 
7928       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7929       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7930       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7931       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7932 
7933       const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
7934       const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
7935       const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t)));
7936       const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
7937       const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
7938       const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(vi2x89ABCDEF);
7939       const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t)));
7940       const __m128i vxk2x89ABCDEF = _mm_cvtepi8_epi16(vk2x89ABCDEF);
7941       i2 += 16;
7942 
7943 
7944       vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
7945       vprod89ABCDEF = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
7946 
7947 
7948       const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
7949       const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
7950       const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t)));
7951       const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
7952       const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
7953       const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(vi3x89ABCDEF);
7954       const __m128i vk3x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t)));
7955       const __m128i vxk3x89ABCDEF = _mm_cvtepi8_epi16(vk3x89ABCDEF);
7956       i3 += 16;
7957 
7958 
7959       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
7960       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF));
7961 
7962       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7963       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7964       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7965       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7966 
7967       const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
7968       const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
7969       const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t)));
7970       const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
7971       const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
7972       const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(vi4x89ABCDEF);
7973       const __m128i vk4x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t)));
7974       const __m128i vxk4x89ABCDEF = _mm_cvtepi8_epi16(vk4x89ABCDEF);
7975       i4 += 16;
7976 
7977 
7978       vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
7979       vprod89ABCDEF = _mm_mullo_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF);
7980 
7981 
7982       const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
7983       const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
7984       const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t)));
7985       const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
7986       const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
7987       const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(vi5x89ABCDEF);
7988       const __m128i vk5x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t)));
7989       const __m128i vxk5x89ABCDEF = _mm_cvtepi8_epi16(vk5x89ABCDEF);
7990       i5 += 16;
7991 
7992 
7993       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
7994       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF));
7995 
7996       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7997       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7998       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7999       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
8000 
8001       const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
8002       const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
8003       const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t)));
8004       const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
8005       const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
8006       const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(vi6x89ABCDEF);
8007       const __m128i vk6x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t)));
8008       const __m128i vxk6x89ABCDEF = _mm_cvtepi8_epi16(vk6x89ABCDEF);
8009       i6 += 16;
8010 
8011 
8012       vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
8013       vprod89ABCDEF = _mm_mullo_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF);
8014 
8015 
8016       const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
8017       const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
8018       const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t)));
8019       const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
8020       const __m128i vi7x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i7 + 8));
8021       const __m128i vxi7x89ABCDEF = _mm_cvtepi8_epi16(vi7x89ABCDEF);
8022       const __m128i vk7x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t)));
8023       const __m128i vxk7x89ABCDEF = _mm_cvtepi8_epi16(vk7x89ABCDEF);
8024       i7 += 16;
8025 
8026 
8027       vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
8028       vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF));
8029 
8030       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
8031       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
8032       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
8033       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
8034 
8035       const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
8036       const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
8037       const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t)));
8038       const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
8039       const __m128i vi8x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i8 + 8));
8040       const __m128i vxi8x89ABCDEF = _mm_cvtepi8_epi16(vi8x89ABCDEF);
8041       const __m128i vk8x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t)));
8042       const __m128i vxk8x89ABCDEF = _mm_cvtepi8_epi16(vk8x89ABCDEF);
8043       i8 += 16;
8044 
8045 
8046       vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
8047       vprod89ABCDEF = _mm_mullo_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF);
8048 
8049       vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
8050       vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
8051       vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
8052       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
8053 
8054       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
8055 
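      // Requantize as in the up16x25 kernel above: fp32 scale, clamp to max, round, add zero point, pack, clamp to min.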
8056       __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
8057       __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
8058       __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
8059       __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
8060 
8061       const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
8062       vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
8063       vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
8064       vscaled89AB = _mm_mul_ps(vscaled89AB, vscale);
8065       vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscale);
8066 
8067       const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
8068       vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
8069       vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
8070       vscaled89AB = _mm_min_ps(vscaled89AB, voutput_max_less_zero_point);
8071       vscaledCDEF = _mm_min_ps(vscaledCDEF, voutput_max_less_zero_point);
8072 
8073       vacc0123 = _mm_cvtps_epi32(vscaled0123);
8074       vacc4567 = _mm_cvtps_epi32(vscaled4567);
8075       vacc89AB = _mm_cvtps_epi32(vscaled89AB);
8076       vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
8077 
8078       const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
8079       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8080       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
8081 
8082 
8083       __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
8084 
8085       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
8086       vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
8087 
8088       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
8089       output += 16;
8090     }
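    // Remainder: the last 1-15 channels use the same mul16/add16 pattern, 8 channels at a time.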
8091     if XNN_UNLIKELY(c != 0) {
8092       const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
8093       do {
8094         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
8095         __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
8096 
8097 
8098         const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
8099         const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
8100         const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k);
8101         const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
8102         i0 += 8;
8103 
8104 
8105         __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
8106 
8107 
8108         const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
8109         const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
8110         const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 16));
8111         const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
8112         i1 += 8;
8113 
8114 
8115         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
8116 
8117         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
8118         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
8119 
8120         const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
8121         const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
8122         const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 32));
8123         const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
8124         i2 += 8;
8125 
8126 
8127         vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
8128 
8129 
8130         const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
8131         const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
8132         const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) (k + 48));
8133         const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
8134         i3 += 8;
8135 
8136 
8137         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
8138 
8139         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
8140         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
8141 
8142         const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
8143         const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
8144         const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) (k + 64));
8145         const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
8146         i4 += 8;
8147 
8148 
8149         vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
8150 
8151 
8152         const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
8153         const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
8154         const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) (k + 80));
8155         const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
8156         i5 += 8;
8157 
8158 
8159         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
8160 
8161         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
8162         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
8163 
8164         const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
8165         const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
8166         const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) (k + 96));
8167         const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
8168         i6 += 8;
8169 
8170 
8171         vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
8172 
8173 
8174         const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
8175         const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
8176         const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) (k + 112));
8177         const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
8178         i7 += 8;
8179 
8180 
8181         vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
8182 
8183         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
8184         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
8185 
8186         const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
8187         const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
8188         const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) (k + 128));
8189         const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
8190         i8 += 8;
8191 
8192 
8193         vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
8194 
8195         vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
8196         vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
8197 
8198         k += 8;
8199 
8200         __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
8201         __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
8202 
8203         const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
8204         vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
8205         vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
8206 
8207         const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
8208         vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
8209         vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
8210 
8211         vacc0123 = _mm_cvtps_epi32(vscaled0123);
8212         vacc4567 = _mm_cvtps_epi32(vscaled4567);
8213 
8214         w = (const void*) ((const int32_t*) w + 8);
8215 
8216         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
8217         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8218 
8219 
8220         __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
8221 
8222         vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
8223 
8224         if XNN_LIKELY(c >= 8) {
8225           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
8226           output += 8;
8227           c -= 8;
8228         } else {
8229           if (c & 4) {
8230             unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
8231             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
8232             output += 4;
8233           }
8234           if (c & 2) {
8235             unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
8236             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
8237             output += 2;
8238           }
8239           if (c & 1) {
8240             *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
8241             output += 1;
8242           }
8243           c = 0;
8244         }
8245       } while (c != 0);
8246     }
8247 
8248     output = (int8_t*) ((uintptr_t) output + output_increment);
8249   } while (--output_width != 0);
8250 }
8251 
8252 void xnn_qs8_f32_vcvt_ukernel__avx_x32(
8253     size_t n,
8254     const int8_t* x,
8255     float* y,
8256     const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8257 {
8258   assert(n != 0);
8259   assert(n % sizeof(int8_t) == 0);
8260   assert(x != NULL);
8261   assert(y != NULL);
8262 
8263   const __m128i vminus_zero_point = _mm_load_si128((const __m128i*) params->avx.minus_zero_point);
8264   const __m256 vscale = _mm256_load_ps(params->avx.scale);
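  // Dequantize y = (x - zero_point) * scale: sign-extend groups of 4 int8 values to int32, add the
  // precomputed negative zero point, convert to float, and multiply by the scale.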
8265   for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
8266     __m128i vx0123 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
8267     __m128i vx4567 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 4)));
8268     __m128i vx89AB = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 8)));
8269     __m128i vxCDEF = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 12)));
8270     __m128i vxGHIJ = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 16)));
8271     __m128i vxKLMN = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 20)));
8272     __m128i vxOPQR = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 24)));
8273     __m128i vxSTUV = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 28)));
8274     x += 32;
8275 
8276     vx0123 = _mm_add_epi32(vx0123, vminus_zero_point);
8277     vx4567 = _mm_add_epi32(vx4567, vminus_zero_point);
8278     vx89AB = _mm_add_epi32(vx89AB, vminus_zero_point);
8279     vxCDEF = _mm_add_epi32(vxCDEF, vminus_zero_point);
8280     vxGHIJ = _mm_add_epi32(vxGHIJ, vminus_zero_point);
8281     vxKLMN = _mm_add_epi32(vxKLMN, vminus_zero_point);
8282     vxOPQR = _mm_add_epi32(vxOPQR, vminus_zero_point);
8283     vxSTUV = _mm_add_epi32(vxSTUV, vminus_zero_point);
8284 
8285     const __m256i vx01234567 = _mm256_insertf128_si256(_mm256_castsi128_si256(vx0123), vx4567, 1);
8286     const __m256i vx89ABCDEF = _mm256_insertf128_si256(_mm256_castsi128_si256(vx89AB), vxCDEF, 1);
8287     const __m256i vxGHIJKLMN = _mm256_insertf128_si256(_mm256_castsi128_si256(vxGHIJ), vxKLMN, 1);
8288     const __m256i vxOPQRSTUV = _mm256_insertf128_si256(_mm256_castsi128_si256(vxOPQR), vxSTUV, 1);
8289 
8290     __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
8291     __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
8292     __m256 vyGHIJKLMN = _mm256_cvtepi32_ps(vxGHIJKLMN);
8293     __m256 vyOPQRSTUV = _mm256_cvtepi32_ps(vxOPQRSTUV);
8294 
8295     vy01234567 = _mm256_mul_ps(vy01234567, vscale);
8296     vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
8297     vyGHIJKLMN = _mm256_mul_ps(vyGHIJKLMN, vscale);
8298     vyOPQRSTUV = _mm256_mul_ps(vyOPQRSTUV, vscale);
8299 
8300     _mm256_storeu_ps(y, vy01234567);
8301     _mm256_storeu_ps(y + 8, vy89ABCDEF);
8302     _mm256_storeu_ps(y + 16, vyGHIJKLMN);
8303     _mm256_storeu_ps(y + 24, vyOPQRSTUV);
8304     y += 32;
8305   }
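  // Process the remaining elements 4 at a time using 128-bit operations and the low half of vscale.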
8306   for (; n >= 4 * sizeof(int8_t); n -= 4 * sizeof(int8_t)) {
8307     __m128i vx = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
8308     vx = _mm_add_epi32(vx, vminus_zero_point);
8309     x += 4;
8310 
8311     __m128 vy = _mm_cvtepi32_ps(vx);
8312     vy = _mm_mul_ps(vy, _mm256_castps256_ps128(vscale));
8313 
8314     _mm_storeu_ps(y, vy);
8315     y += 4;
8316   }
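  // Final 1-3 elements: the 4-byte load may read past the end of x (the kernel is annotated XNN_OOB_READS);
  // stores are split into 2- and 1-element pieces.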
8317   if XNN_UNLIKELY(n != 0) {
8318     assert(n >= 1 * sizeof(int8_t));
8319     assert(n <= 3 * sizeof(int8_t));
8320 
8321     __m128i vx = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
8322     vx = _mm_add_epi32(vx, vminus_zero_point);
8323 
8324     __m128 vy = _mm_cvtepi32_ps(vx);
8325     vy = _mm_mul_ps(vy, _mm256_castps256_ps128(vscale));
8326 
8327     if (n & (2 * sizeof(int8_t))) {
8328       _mm_storel_pi((__m64*) y, vy);
8329       vy = _mm_movehl_ps(vy, vy);
8330       y += 2;
8331     }
8332     if (n & (1 * sizeof(int8_t))) {
8333       _mm_store_ss(y, vy);
8334     }
8335   }
8336 }
8337 
8338 void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
8339     size_t mr,
8340     size_t nc,
8341     size_t kc,
8342     const int8_t* restrict a,
8343     size_t a_stride,
8344     const void* restrict w,
8345     int8_t* restrict c,
8346     size_t cm_stride,
8347     size_t cn_stride,
8348     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8349 {
8350   assert(mr != 0);
8351   assert(mr <= 1);
8352   assert(nc != 0);
8353   assert(kc != 0);
8354   assert(kc % sizeof(int8_t) == 0);
8355   assert(a != NULL);
8356   assert(w != NULL);
8357   assert(c != NULL);
8358 
8359   kc = round_up_po2(kc, 8);
8360   const int8_t* a0 = a;
8361   int8_t* c0 = c;
8362 
8363   do {
8364     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
8365     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
8366     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
8367     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
8368     w = (const int32_t*) w + 4;
8369 
8370     size_t k = 0;
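    // 4c8 packing: each 16-byte weight load holds 8 int8 values for each of two output channels; the low half
    // is sign-extended with _mm_cvtepi8_epi16 and the high half with unpackhi+srai. _mm_madd_epi16 then
    // accumulates pairwise int16 products into 4 int32 partial sums per output channel.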
8371     while (k < kc) {
8372       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
8373       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
8374       a0 += 8;
8375 
8376       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
8377       const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
8378       const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
8379 
8380       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
8381       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
8382       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
8383       const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
8384       const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
8385 
8386       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
8387       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
8388 
8389       w = (const void*) ((const int8_t*) w + 32);
8390       k += 8 * sizeof(int8_t);
8391     }
8392 
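    // Reduce each channel's four partial sums with _mm_hadd_epi32 so every lane of vacc0x0123 holds one
    // complete int32 dot product, then requantize.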
8393     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
8394     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
8395 
8396     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
8397 
8398     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
8399 
8400     const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
8401     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
8402 
8403     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
8404     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
8405 
8406     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
8407 
8408     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
8409     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
8410 
8411 
8412     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
8413 
8414     vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
8415 
8416     if (nc >= 4) {
8417       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
8418 
8419       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
8420 
8421       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
8422 
8423       nc -= 4;
8424     } else {
8425       if (nc & 2) {
8426         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
8427         c0 += 2;
8428         vout = _mm_srli_epi32(vout, 16);
8429       }
8430       if (nc & 1) {
8431         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
8432       }
8433 
8434       nc = 0;
8435     }
8436   } while (nc != 0);
8437 }
8438 
8439 void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
8440     size_t mr,
8441     size_t nc,
8442     size_t kc,
8443     const int8_t* restrict a,
8444     size_t a_stride,
8445     const void* restrict w,
8446     int8_t* restrict c,
8447     size_t cm_stride,
8448     size_t cn_stride,
8449     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8450 {
8451   assert(mr != 0);
8452   assert(mr <= 2);
8453   assert(nc != 0);
8454   assert(kc != 0);
8455   assert(kc % sizeof(int8_t) == 0);
8456   assert(a != NULL);
8457   assert(w != NULL);
8458   assert(c != NULL);
8459 
8460   kc = round_up_po2(kc, 8);
8461   const int8_t* a0 = a;
8462   int8_t* c0 = c;
8463   const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
8464   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
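  // When mr == 1, row 1 aliases row 0 (a1 = a0, c1 = c0) so a single code path serves both cases.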
8465   if XNN_UNPREDICTABLE(mr != 2) {
8466     a1 = a0;
8467     c1 = c0;
8468   }
8469 
8470   do {
8471     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
8472     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
8473     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
8474     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
8475     __m128i vacc1x0 = vacc0x0;
8476     __m128i vacc1x1 = vacc0x1;
8477     __m128i vacc1x2 = vacc0x2;
8478     __m128i vacc1x3 = vacc0x3;
8479     w = (const int32_t*) w + 4;
8480 
8481     size_t k = 0;
8482     while (k < kc) {
8483       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
8484       const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
8485       a0 += 8;
8486       const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
8487       const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
8488       a1 += 8;
8489 
8490       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
8491       const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
8492       const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
8493 
8494       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
8495       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
8496       vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
8497       vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
8498       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
8499       const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
8500       const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
8501 
8502       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
8503       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
8504       vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
8505       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
8506 
8507       w = (const void*) ((const int8_t*) w + 32);
8508       k += 8 * sizeof(int8_t);
8509     }
8510 
8511     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
8512     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
8513     const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
8514     const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
8515 
8516     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
8517     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
8518 
8519     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
8520     __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
8521 
8522     const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
8523     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
8524     vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
8525 
8526     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
8527     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
8528     vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
8529 
8530     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
8531     vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
8532 
8533     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
8534     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
8535 
8536 
8537     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
8538 
8539     vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
8540 
8541     if (nc >= 4) {
8542       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
8543       unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
8544 
8545       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
8546       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
8547 
8548       a0 = (const int8_t*) ((uintptr_t) a0 - kc);
8549       a1 = (const int8_t*) ((uintptr_t) a1 - kc);
8550 
8551       nc -= 4;
8552     } else {
8553       if (nc & 2) {
8554         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
8555         c0 += 2;
8556         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
8557         c1 += 2;
8558         vout = _mm_srli_epi32(vout, 16);
8559       }
8560       if (nc & 1) {
8561         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
8562         *c1 = (int8_t) _mm_extract_epi8(vout, 4);
8563       }
8564 
8565       nc = 0;
8566     }
8567   } while (nc != 0);
8568 }
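/*
 * The 2x4c8 variant above processes two rows of A per pass; when mr == 1,
 * a1/c1 alias a0/c0 so the second row's work is redundant but harmless.  The
 * ld128 weight layout loads two 8-element int8 columns per _mm_load_si128:
 * the low half is sign-extended with _mm_cvtepi8_epi16 and the high half with
 * the unpackhi/srai idiom, e.g.
 *
 *   hi16 = _mm_srai_epi16(_mm_unpackhi_epi8(v, v), 8);  // sign-extend bytes 8..15
 */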
8569 
8570 void xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
8571     size_t mr,
8572     size_t nc,
8573     size_t kc,
8574     size_t ks,
8575     const int8_t** restrict a,
8576     const void* restrict w,
8577     int8_t* restrict c,
8578     size_t cm_stride,
8579     size_t cn_stride,
8580     size_t a_offset,
8581     const int8_t* zero,
8582     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8583 {
8584   assert(mr != 0);
8585   assert(mr <= 1);
8586   assert(nc != 0);
8587   assert(kc != 0);
8588   assert(ks != 0);
8589   assert(ks % (1 * sizeof(void*)) == 0);
8590   assert(a_offset % sizeof(int8_t) == 0);
8591   assert(a != NULL);
8592   assert(w != NULL);
8593   assert(c != NULL);
8594 
8595   kc = round_up_po2(kc, 8);
8596   int8_t* c0 = c;
8597 
8598   do {
8599     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
8600     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
8601     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
8602     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
8603     w = (const int32_t*) w + 4;
8604 
8605     size_t p = ks;
8606     do {
8607       const int8_t* restrict a0 = a[0];
8608       if XNN_UNPREDICTABLE(a0 != zero) {
8609         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
8610       }
8611       a += 1;
8612 
8613       size_t k = 0;
8614       while (k < kc) {
8615         const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
8616         const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
8617         a0 += 8;
8618 
8619         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
8620         const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
8621         const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
8622 
8623         vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
8624         vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
8625         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
8626         const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
8627         const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
8628 
8629         vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
8630         vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
8631 
8632         w = (const void*) ((const int8_t*) w + 32);
8633         k += 8 * sizeof(int8_t);
8634       }
8635       p -= 1 * sizeof(void*);
8636     } while (p != 0);
8637 
8638     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
8639     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
8640 
8641     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
8642 
8643     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
8644 
8645     const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
8646     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
8647 
8648     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
8649     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
8650 
8651     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
8652 
8653     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
8654     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
8655 
8656 
8657     __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
8658 
8659     vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
8660 
8661     if (nc >= 4) {
8662       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
8663       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
8664 
8665       a = (const int8_t**restrict) ((uintptr_t) a - ks);
8666 
8667       nc -= 4;
8668     } else {
8669       if (nc & 2) {
8670         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
8671         c0 += 2;
8672         vout = _mm_srli_epi32(vout, 16);
8673       }
8674       if (nc & 1) {
8675         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
8676       }
8677 
8678       nc = 0;
8679     }
8680   } while (nc != 0);
8681 }
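/*
 * IGEMM variant: instead of a dense A matrix it walks an indirection buffer of
 * ks row pointers per output pixel.  Pointers equal to `zero` reference a
 * shared zero buffer and are not offset by a_offset; after each block of
 * output columns the pointer array is rewound by ks bytes so the same
 * indirection entries are reused for the next column block.
 */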
8682 
8683 void xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
8684     size_t mr,
8685     size_t nc,
8686     size_t kc,
8687     size_t ks,
8688     const int8_t** restrict a,
8689     const void* restrict w,
8690     int8_t* restrict c,
8691     size_t cm_stride,
8692     size_t cn_stride,
8693     size_t a_offset,
8694     const int8_t* zero,
8695     const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8696 {
8697   assert(mr != 0);
8698   assert(mr <= 2);
8699   assert(nc != 0);
8700   assert(kc != 0);
8701   assert(ks != 0);
8702   assert(ks % (2 * sizeof(void*)) == 0);
8703   assert(a_offset % sizeof(int8_t) == 0);
8704   assert(a != NULL);
8705   assert(w != NULL);
8706   assert(c != NULL);
8707 
8708   kc = round_up_po2(kc, 8);
8709   int8_t* c0 = c;
8710   int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
8711   if XNN_UNPREDICTABLE(mr != 2) {
8712     c1 = c0;
8713   }
8714 
8715   do {
8716     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
8717     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
8718     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
8719     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
8720     __m128i vacc1x0 = vacc0x0;
8721     __m128i vacc1x1 = vacc0x1;
8722     __m128i vacc1x2 = vacc0x2;
8723     __m128i vacc1x3 = vacc0x3;
8724     w = (const int32_t*) w + 4;
8725 
8726     size_t p = ks;
8727     do {
8728       const int8_t* restrict a0 = a[0];
8729       if XNN_UNPREDICTABLE(a0 != zero) {
8730         a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
8731       }
8732       const int8_t* restrict a1 = a[1];
8733       if XNN_UNPREDICTABLE(a1 != zero) {
8734         a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
8735       }
8736       a += 2;
8737 
8738       size_t k = 0;
8739       while (k < kc) {
8740         const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
8741         const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
8742         a0 += 8;
8743         const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
8744         const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
8745         a1 += 8;
8746 
8747         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
8748         const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
8749         const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
8750 
8751         vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
8752         vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
8753         vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
8754         vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
8755         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
8756         const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
8757         const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
8758 
8759         vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
8760         vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
8761         vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
8762         vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
8763 
8764         w = (const void*) ((const int8_t*) w + 32);
8765         k += 8 * sizeof(int8_t);
8766       }
8767       p -= 2 * sizeof(void*);
8768     } while (p != 0);
8769 
8770     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
8771     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
8772     const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
8773     const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
8774 
8775     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
8776     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
8777 
8778     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
8779     __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
8780 
8781     const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
8782     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
8783     vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
8784 
8785     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
8786     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
8787     vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
8788 
8789     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
8790     vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
8791 
8792     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
8793     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
8794 
8795 
8796     __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
8797 
8798     vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
8799 
8800     if (nc >= 4) {
8801       unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
8802       c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
8803       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
8804       c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
8805 
8806       a = (const int8_t**restrict) ((uintptr_t) a - ks);
8807 
8808       nc -= 4;
8809     } else {
8810       if (nc & 2) {
8811         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
8812         c1 += 2;
8813         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
8814         c0 += 2;
8815         vout = _mm_srli_epi32(vout, 16);
8816       }
8817       if (nc & 1) {
8818         *c1 = (int8_t) _mm_extract_epi8(vout, 4);
8819         *c0 = (int8_t) _mm_extract_epi8(vout, 0);
8820       }
8821 
8822       nc = 0;
8823     }
8824   } while (nc != 0);
8825 }
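/*
 * Two-row IGEMM: the indirection buffer supplies pairs of row pointers (hence
 * the `ks % (2 * sizeof(void*)) == 0` assertion), and row 1 falls back to
 * row 0's output pointer when mr != 2.  The packed result holds row 0 in
 * 32-bit lane 0 and row 1 in lane 1, which is why the stores extract lanes 0
 * and 1 (and byte offsets 0 and 4 in the sub-4-column tail).
 */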
8826 
8827 void xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x8(
8828     size_t n,
8829     const int8_t* input_a,
8830     const int8_t* input_b,
8831     int8_t* output,
8832     const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8833 {
8834   const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4_mul32.bias);
8835   const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
8836   const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.b_multiplier);
8837   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4_mul32.shift);
8838   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
8839   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
8840   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4_mul32.output_max);
8841 
8842   for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
8843     const __m128i va0123 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a)));
8844     const __m128i vb0123 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_b)));
8845     const __m128i va4567 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a + 4)));
8846     const __m128i vb4567 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_b + 4)));
8847     input_a += 8;
8848     input_b += 8;
8849 
8850     __m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
8851     __m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
8852 
8853     vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
8854     vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
8855 
8856     vacc0123 = _mm_sra_epi32(vacc0123, vshift);
8857     vacc4567 = _mm_sra_epi32(vacc4567, vshift);
8858 
8859     const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8860 
8861     __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
8862 
8863     vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
8864 
8865     vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
8866 
8867     _mm_storel_epi64((__m128i*) output, vout0123456701234567);
8868     output += 8;
8869   }
8870   if XNN_UNLIKELY(n != 0) {
8871     {
8872       const __m128i va0123 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a)));
8873       const __m128i vb0123 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_b)));
8874       const __m128i va4567 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a + 4)));
8875       const __m128i vb4567 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_b + 4)));
8876 
8877       __m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
8878       __m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
8879 
8880       vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
8881       vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
8882 
8883       vacc0123 = _mm_sra_epi32(vacc0123, vshift);
8884       vacc4567 = _mm_sra_epi32(vacc4567, vshift);
8885 
8886       const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8887 
8888       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
8889       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
8890       vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
8891 
8892       if (n & (4 * sizeof(int8_t))) {
8893         unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
8894         vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
8895         output += 4;
8896       }
8897       if (n & (2 * sizeof(int8_t))) {
8898         unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
8899         vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
8900         output += 2;
8901       }
8902       if (n & (1 * sizeof(int8_t))) {
8903         *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
8904       }
8905     }
8906   }
8907 }
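/*
 * Fixed-point elementwise add (mul32_ld32 flavor): both inputs are
 * sign-extended to 32 bits, multiplied by per-input multipliers, summed onto
 * the precomputed bias from the params block, arithmetically shifted right by
 * `shift`, and then requantized through the usual pack/clamp sequence.  A
 * rough scalar sketch (illustrative only):
 *
 *   int32_t acc = bias + (int32_t) a[i] * a_multiplier + (int32_t) b[i] * b_multiplier;
 *   int8_t  out = clamp(sat_i8((acc >> shift) + output_zero_point),
 *                       output_min, output_max);
 */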
8908 
8909 void xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8(
8910     size_t n,
8911     const int8_t* input_a,
8912     const int8_t* input_b,
8913     int8_t* output,
8914     const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8915 {
8916   const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
8917   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4_mul32.shift);
8918   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
8919   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
8920   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4_mul32.output_max);
8921 
8922   __m128i vbias = _mm_cvtsi32_si128(params->sse4_mul32.b_multiplier[0] * (int32_t) *input_b);
8923   vbias = _mm_shuffle_epi32(vbias, _MM_SHUFFLE(0, 0, 0, 0));
8924   vbias = _mm_add_epi32(vbias, _mm_load_si128((const __m128i*) params->sse4_mul32.bias));
8925   for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
8926     const __m128i va0123 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a)));
8927     const __m128i va4567 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a + 4)));
8928     input_a += 8;
8929     input_b += 8;
8930 
8931     __m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
8932     __m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
8933 
8934     vacc0123 = _mm_sra_epi32(vacc0123, vshift);
8935     vacc4567 = _mm_sra_epi32(vacc4567, vshift);
8936 
8937     const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8938 
8939     __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
8940 
8941     vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
8942 
8943     vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
8944 
8945     _mm_storel_epi64((__m128i*) output, vout0123456701234567);
8946     output += 8;
8947   }
8948   if XNN_UNLIKELY(n != 0) {
8949     {
8950       const __m128i va0123 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a)));
8951       const __m128i va4567 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a + 4)));
8952 
8953       __m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
8954       __m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
8955 
8956       vacc0123 = _mm_sra_epi32(vacc0123, vshift);
8957       vacc4567 = _mm_sra_epi32(vacc4567, vshift);
8958 
8959       const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8960 
8961       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
8962       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
8963       vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
8964 
8965       if (n & (4 * sizeof(int8_t))) {
8966         unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
8967         vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
8968         output += 4;
8969       }
8970       if (n & (2 * sizeof(int8_t))) {
8971         unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
8972         vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
8973         output += 2;
8974       }
8975       if (n & (1 * sizeof(int8_t))) {
8976         *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
8977       }
8978     }
8979   }
8980 }
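/*
 * vaddc is the add-with-scalar variant: the single value *input_b is folded
 * into the bias once before the loop (b_multiplier[0] * *input_b, broadcast
 * and added to the params bias), so the inner loops only stream input_a.
 * input_b is still advanced but never dereferenced again.
 */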
8981 
8982 void xnn_qs8_vcvt_ukernel__avx_x32(
8983     size_t n,
8984     const int8_t* x,
8985     int8_t* y,
8986     const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8987 {
8988   assert(n != 0);
8989   assert(n % sizeof(int8_t) == 0);
8990   assert(x != NULL);
8991   assert(y != NULL);
8992 
8993   const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.input_zero_point);
8994   const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->ssse3.multiplier);
8995   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.output_zero_point);
8996   for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
8997     __m128i vacc0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
8998     __m128i vacc1 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 8)));
8999     __m128i vacc2 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 16)));
9000     __m128i vacc3 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 24)));
9001     x += 32;
9002 
9003     vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
9004     vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
9005     vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
9006     vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
9007 
9008     vacc0 = _mm_slli_epi16(vacc0, 7);
9009     vacc1 = _mm_slli_epi16(vacc1, 7);
9010     vacc2 = _mm_slli_epi16(vacc2, 7);
9011     vacc3 = _mm_slli_epi16(vacc3, 7);
9012 
9013     vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier);
9014     vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier);
9015     vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier);
9016     vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier);
9017 
9018     vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
9019     vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
9020     vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
9021     vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
9022 
9023     const __m128i vy0 = _mm_packs_epi16(vacc0, vacc1);
9024     const __m128i vy1 = _mm_packs_epi16(vacc2, vacc3);
9025 
9026     _mm_storeu_si128((__m128i*) y, vy0);
9027     _mm_storeu_si128((__m128i*) (y + 16), vy1);
9028     y += 32;
9029   }
9030   for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
9031     __m128i vacc = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
9032     vacc = _mm_sub_epi16(vinput_zero_point, vacc);
9033     vacc = _mm_slli_epi16(vacc, 7);
9034     vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
9035     vacc = _mm_adds_epi16(vacc, voutput_zero_point);
9036     x += 8;
9037 
9038     const __m128i vy = _mm_packs_epi16(vacc, vacc);
9039     _mm_storel_epi64((__m128i*) y, vy);
9040     y += 8;
9041   }
9042   if XNN_UNLIKELY(n != 0) {
9043     assert(n >= 1 * sizeof(int8_t));
9044     assert(n <= 7 * sizeof(int8_t));
9045 
9046     __m128i vacc = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
9047     vacc = _mm_sub_epi16(vinput_zero_point, vacc);
9048     vacc = _mm_slli_epi16(vacc, 7);
9049     vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
9050     vacc = _mm_adds_epi16(vacc, voutput_zero_point);
9051 
9052     __m128i vy = _mm_packs_epi16(vacc, vacc);
9053     if (n & (4 * sizeof(int8_t))) {
9054       _mm_storeu_si32(y, vy);
9055       vy = _mm_srli_epi64(vy, 32);
9056       y += 4;
9057     }
9058     if (n & (2 * sizeof(int8_t))) {
9059       _mm_storeu_si16(y, vy);
9060       vy = _mm_srli_epi32(vy, 16);
9061       y += 2;
9062     }
9063     if (n & (1 * sizeof(int8_t))) {
9064       *y = (int8_t) _mm_extract_epi8(vy, 0);
9065     }
9066   }
9067 }
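/*
 * Requantizing copy: each int8 element is widened to 16 bits, offset against
 * the input zero point, shifted left by 7 so the value sits in Q15, rescaled
 * with _mm_mulhrs_epi16 (fixed-point multiply with rounding), and re-centered
 * on the output zero point with a saturating add before packing back to int8.
 * The subtraction is written as (input_zero_point - x), so the multiplier in
 * the params block is presumably stored with the matching sign.
 */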
9068 
9069 void xnn_qs8_vlrelu_ukernel__avx_x32(
9070     size_t n,
9071     const int8_t* x,
9072     int8_t* y,
9073     const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
9074 {
9075   assert(n != 0);
9076   assert(n % sizeof(int8_t) == 0);
9077   assert(x != NULL);
9078   assert(y != NULL);
9079 
9080   const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->avx.input_zero_point);
9081   const __m128i vpositive_multiplier = _mm_load_si128((const __m128i*) params->avx.positive_multiplier);
9082   const __m128i vnegative_multiplier = _mm_load_si128((const __m128i*) params->avx.negative_multiplier);
9083   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
9084   for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
9085     __m128i vacc0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
9086     __m128i vacc1 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 8)));
9087     __m128i vacc2 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 16)));
9088     __m128i vacc3 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 24)));
9089     x += 32;
9090 
9091     __m128i vmultiplier0 = _mm_cmpgt_epi16(vacc0, vinput_zero_point);
9092     vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
9093     __m128i vmultiplier1 = _mm_cmpgt_epi16(vacc1, vinput_zero_point);
9094     vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
9095     __m128i vmultiplier2 = _mm_cmpgt_epi16(vacc2, vinput_zero_point);
9096     vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
9097     __m128i vmultiplier3 = _mm_cmpgt_epi16(vacc3, vinput_zero_point);
9098     vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
9099 
9100     vmultiplier0 = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier0);
9101     vacc0 = _mm_slli_epi16(vacc0, 7);
9102     vmultiplier1 = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier1);
9103     vacc1 = _mm_slli_epi16(vacc1, 7);
9104     vmultiplier2 = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier2);
9105     vacc2 = _mm_slli_epi16(vacc2, 7);
9106     vmultiplier3 = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier3);
9107     vacc3 = _mm_slli_epi16(vacc3, 7);
9108 
9109     vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier0);
9110     vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier1);
9111     vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier2);
9112     vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier3);
9113 
9114     vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
9115     vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
9116     vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
9117     vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
9118 
9119     const __m128i vy0 = _mm_packs_epi16(vacc0, vacc1);
9120     const __m128i vy1 = _mm_packs_epi16(vacc2, vacc3);
9121 
9122     _mm_storeu_si128((__m128i*) y, vy0);
9123     _mm_storeu_si128((__m128i*) (y + 16), vy1);
9124     y += 32;
9125   }
9126   for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
9127     __m128i vacc = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
9128     __m128i vmultiplier = _mm_cmpgt_epi16(vacc, vinput_zero_point);
9129     vacc = _mm_sub_epi16(vinput_zero_point, vacc);
9130     vmultiplier = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier);
9131     vacc = _mm_slli_epi16(vacc, 7);
9132     vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
9133     vacc = _mm_adds_epi16(vacc, voutput_zero_point);
9134     x += 8;
9135 
9136     const __m128i vy = _mm_packs_epi16(vacc, vacc);
9137     _mm_storel_epi64((__m128i*) y, vy);
9138     y += 8;
9139   }
9140   if XNN_UNLIKELY(n != 0) {
9141     assert(n >= 1 * sizeof(int8_t));
9142     assert(n <= 7 * sizeof(int8_t));
9143 
9144     __m128i vacc = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
9145     __m128i vmultiplier = _mm_cmpgt_epi16(vacc, vinput_zero_point);
9146     vacc = _mm_sub_epi16(vinput_zero_point, vacc);
9147     vmultiplier = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier);
9148     vacc = _mm_slli_epi16(vacc, 7);
9149     vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
9150     vacc = _mm_adds_epi16(vacc, voutput_zero_point);
9151 
9152     __m128i vy = _mm_packs_epi16(vacc, vacc);
9153     if (n & (4 * sizeof(int8_t))) {
9154       _mm_storeu_si32(y, vy);
9155       vy = _mm_srli_epi64(vy, 32);
9156       y += 4;
9157     }
9158     if (n & (2 * sizeof(int8_t))) {
9159       _mm_storeu_si16(y, vy);
9160       vy = _mm_srli_epi32(vy, 16);
9161       y += 2;
9162     }
9163     if (n & (1 * sizeof(int8_t))) {
9164       *y = (int8_t) _mm_extract_epi8(vy, 0);
9165     }
9166   }
9167 }
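/*
 * Quantized leaky ReLU: each lane is compared against the input zero point and
 * _mm_blendv_epi8 selects either the positive or the negative Q15 slope, after
 * which the same shift-by-7 / _mm_mulhrs_epi16 / add-output-zero-point sequence
 * as the vcvt kernel produces the result.  In effect (up to the sign
 * conventions baked into the multipliers):
 *
 *   slope = (x > input_zero_point) ? positive_multiplier : negative_multiplier;
 *   y     = requantize((x - input_zero_point) * slope) + output_zero_point;
 */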
9168 
9169 void xnn_qs8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16(
9170     size_t n,
9171     const int8_t* input_a,
9172     const int8_t* input_b,
9173     int8_t* output,
9174     const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
9175 
9176 {
9177   const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.a_zero_point);
9178   const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.b_zero_point);
9179   const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
9180   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
9181   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
9182   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse4.output_max);
9183 
9184   for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
9185     const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
9186     const __m128i vb01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
9187     const __m128i va89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
9188     const __m128i vb89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_b + 8)));
9189     input_a += 16;
9190     input_b += 16;
9191 
9192 
9193     const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
9194     const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
9195     const __m128i vxa89ABCDEF = _mm_sub_epi16(va89ABCDEF, va_zero_point);
9196     const __m128i vxb89ABCDEF = _mm_sub_epi16(vb89ABCDEF, vb_zero_point);
9197 
9198     const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
9199     const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
9200     const __m128i vprod89ABCDEFlo = _mm_mullo_epi16(vxa89ABCDEF, vxb89ABCDEF);
9201     const __m128i vprod89ABCDEFhi = _mm_mulhi_epi16(vxa89ABCDEF, vxb89ABCDEF);
9202 
9203     const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
9204     const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
9205     const __m128i vprod89AB = _mm_unpacklo_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
9206     const __m128i vprodCDEF = _mm_unpackhi_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
9207 
9208     __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
9209     __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
9210     __m128 vfpacc89AB = _mm_cvtepi32_ps(vprod89AB);
9211     __m128 vfpaccCDEF = _mm_cvtepi32_ps(vprodCDEF);
9212 
9213     vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
9214     vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
9215     vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
9216     vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
9217 
9218     const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
9219     const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
9220     const __m128i vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
9221     const __m128i vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
9222 
9223     __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
9224     __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
9225 
9226 
9227     __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
9228 
9229     vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
9230 
9231     vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
9232 
9233     _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
9234     output += 16;
9235   }
9236   if XNN_UNLIKELY(n != 0) {
9237     do {
9238       const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
9239       const __m128i vb01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
9240       input_a += 8;
9241       input_b += 8;
9242 
9243 
9244       const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
9245       const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
9246 
9247       const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
9248       const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
9249 
9250       const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
9251       const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
9252 
9253       __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
9254       __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
9255 
9256       vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
9257       vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
9258 
9259       const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
9260       const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
9261 
9262       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
9263 
9264       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
9265       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
9266       vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
9267 
9268       if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
9269         _mm_storel_epi64((__m128i*) output, vout0123456701234567);
9270         output += 8;
9271         n -= 8 * sizeof(int8_t);
9272       } else {
9273         if (n & (4 * sizeof(int8_t))) {
9274           unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
9275           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
9276           output += 4;
9277         }
9278         if (n & (2 * sizeof(int8_t))) {
9279           unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
9280           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
9281           output += 2;
9282         }
9283         if (n & (1 * sizeof(int8_t))) {
9284           *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
9285         }
9286         n = 0;
9287       }
9288     } while (n != 0);
9289   }
9290 }
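/*
 * Elementwise multiply (mul16_ld64 flavor): the zero-point-corrected 16-bit
 * inputs are multiplied with the mullo/mulhi pair and interleaved into full
 * 32-bit products, which are then requantized through the same fp32 scale path
 * as the GEMM epilogues.  Roughly, per element:
 *
 *   int32_t prod = ((int32_t) a[i] - a_zero_point) * ((int32_t) b[i] - b_zero_point);
 *   int8_t  out  = clamp(sat_i8(lrintf((float) prod * scale) + output_zero_point),
 *                        output_min, output_max);
 */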
9291 
9292 void xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16(
9293     size_t n,
9294     const int8_t* input_a,
9295     const int8_t* input_b,
9296     int8_t* output,
9297     const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
9298 
9299 {
9300   const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.a_zero_point);
9301   const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
9302   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
9303   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
9304   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse4.output_max);
9305 
9306   __m128i vxb = _mm_sub_epi16(
9307     _mm_shuffle_epi32(_mm_cvtsi32_si128(UINT32_C(0x00010001) * (uint32_t) (uint16_t) (int16_t) *input_b), 0),
9308     _mm_load_si128((const __m128i*) params->fp32_sse4.b_zero_point));
9309   for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
9310     const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
9311     const __m128i va89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
9312     input_a += 16;
9313 
9314 
9315     const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
9316     const __m128i vxa89ABCDEF = _mm_sub_epi16(va89ABCDEF, va_zero_point);
9317 
9318     const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
9319     const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
9320     const __m128i vprod89ABCDEFlo = _mm_mullo_epi16(vxa89ABCDEF, vxb);
9321     const __m128i vprod89ABCDEFhi = _mm_mulhi_epi16(vxa89ABCDEF, vxb);
9322 
9323     const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
9324     const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
9325     const __m128i vprod89AB = _mm_unpacklo_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
9326     const __m128i vprodCDEF = _mm_unpackhi_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
9327 
9328     __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
9329     __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
9330     __m128 vfpacc89AB = _mm_cvtepi32_ps(vprod89AB);
9331     __m128 vfpaccCDEF = _mm_cvtepi32_ps(vprodCDEF);
9332 
9333     vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
9334     vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
9335     vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
9336     vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
9337 
9338     const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
9339     const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
9340     const __m128i vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
9341     const __m128i vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
9342 
9343     __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
9344     __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
9345 
9346 
9347     __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
9348 
9349     vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
9350 
9351     vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
9352 
9353     _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
9354     output += 16;
9355   }
9356   if XNN_UNLIKELY(n != 0) {
9357     do {
9358       const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
9359       input_a += 8;
9360 
9361 
9362       const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
9363 
9364       const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
9365       const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
9366 
9367       const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
9368       const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
9369 
9370       __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
9371       __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
9372 
9373       vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
9374       vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
9375 
9376       const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
9377       const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
9378 
9379       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
9380 
9381       __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
9382       vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
9383       vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
9384 
9385       if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
9386         _mm_storel_epi64((__m128i*) output, vout0123456701234567);
9387         output += 8;
9388         n -= 8 * sizeof(int8_t);
9389       } else {
9390         if (n & (4 * sizeof(int8_t))) {
9391           unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
9392           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
9393           output += 4;
9394         }
9395         if (n & (2 * sizeof(int8_t))) {
9396           unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
9397           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
9398           output += 2;
9399         }
9400         if (n & (1 * sizeof(int8_t))) {
9401           *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
9402         }
9403         n = 0;
9404       }
9405     } while (n != 0);
9406   }
9407 }
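/*
 * vmulc multiplies every element of input_a by the single value *input_b: the
 * scalar is sign-extended, replicated into every 16-bit lane (the 0x00010001
 * multiply duplicates it within a 32-bit lane before the shuffle broadcasts
 * that lane), and corrected by b_zero_point once, so the loops only stream
 * input_a.  The per-element math is otherwise identical to the vector-vector
 * kernel above.
 */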
9408 
9409 void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16(
9410     size_t channels,
9411     size_t output_width,
9412     const uint8_t** input,
9413     const void* weights,
9414     uint8_t* output,
9415     size_t input_stride,
9416     size_t output_increment,
9417     size_t input_offset,
9418     const uint8_t* zero,
9419     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
9420 {
9421   assert(channels != 0);
9422   assert(output_width != 0);
9423 
9424   do {
9425     const uint8_t* i0 = input[0];
9426     assert(i0 != NULL);
9427     if XNN_UNPREDICTABLE(i0 != zero) {
9428       i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
9429     }
9430     const uint8_t* i1 = input[1];
9431     assert(i1 != NULL);
9432     if XNN_UNPREDICTABLE(i1 != zero) {
9433       i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
9434     }
9435     const uint8_t* i2 = input[2];
9436     assert(i2 != NULL);
9437     if XNN_UNPREDICTABLE(i2 != zero) {
9438       i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
9439     }
9440     const uint8_t* i3 = input[3];
9441     assert(i3 != NULL);
9442     if XNN_UNPREDICTABLE(i3 != zero) {
9443       i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
9444     }
9445     const uint8_t* i4 = input[4];
9446     assert(i4 != NULL);
9447     if XNN_UNPREDICTABLE(i4 != zero) {
9448       i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
9449     }
9450     const uint8_t* i5 = input[5];
9451     assert(i5 != NULL);
9452     if XNN_UNPREDICTABLE(i5 != zero) {
9453       i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
9454     }
9455     const uint8_t* i6 = input[6];
9456     assert(i6 != NULL);
9457     if XNN_UNPREDICTABLE(i6 != zero) {
9458       i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
9459     }
9460     const uint8_t* i7 = input[7];
9461     assert(i7 != NULL);
9462     if XNN_UNPREDICTABLE(i7 != zero) {
9463       i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
9464     }
9465     const uint8_t* i8 = input[8];
9466     assert(i8 != NULL);
9467     if XNN_UNPREDICTABLE(i8 != zero) {
9468       i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
9469     }
9470     const uint8_t* i9 = input[9];
9471     assert(i9 != NULL);
9472     if XNN_UNPREDICTABLE(i9 != zero) {
9473       i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
9474     }
9475     const uint8_t* i10 = input[10];
9476     assert(i10 != NULL);
9477     if XNN_UNPREDICTABLE(i10 != zero) {
9478       i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
9479     }
9480     const uint8_t* i11 = input[11];
9481     assert(i11 != NULL);
9482     if XNN_UNPREDICTABLE(i11 != zero) {
9483       i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
9484     }
9485     const uint8_t* i12 = input[12];
9486     assert(i12 != NULL);
9487     if XNN_UNPREDICTABLE(i12 != zero) {
9488       i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
9489     }
9490     const uint8_t* i13 = input[13];
9491     assert(i13 != NULL);
9492     if XNN_UNPREDICTABLE(i13 != zero) {
9493       i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
9494     }
9495     const uint8_t* i14 = input[14];
9496     assert(i14 != NULL);
9497     if XNN_UNPREDICTABLE(i14 != zero) {
9498       i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
9499     }
9500     const uint8_t* i15 = input[15];
9501     assert(i15 != NULL);
9502     if XNN_UNPREDICTABLE(i15 != zero) {
9503       i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
9504     }
9505     const uint8_t* i16 = input[16];
9506     assert(i16 != NULL);
9507     if XNN_UNPREDICTABLE(i16 != zero) {
9508       i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
9509     }
9510     const uint8_t* i17 = input[17];
9511     assert(i17 != NULL);
9512     if XNN_UNPREDICTABLE(i17 != zero) {
9513       i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
9514     }
9515     const uint8_t* i18 = input[18];
9516     assert(i18 != NULL);
9517     if XNN_UNPREDICTABLE(i18 != zero) {
9518       i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
9519     }
9520     const uint8_t* i19 = input[19];
9521     assert(i19 != NULL);
9522     if XNN_UNPREDICTABLE(i19 != zero) {
9523       i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
9524     }
9525     const uint8_t* i20 = input[20];
9526     assert(i20 != NULL);
9527     if XNN_UNPREDICTABLE(i20 != zero) {
9528       i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
9529     }
9530     const uint8_t* i21 = input[21];
9531     assert(i21 != NULL);
9532     if XNN_UNPREDICTABLE(i21 != zero) {
9533       i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
9534     }
9535     const uint8_t* i22 = input[22];
9536     assert(i22 != NULL);
9537     if XNN_UNPREDICTABLE(i22 != zero) {
9538       i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
9539     }
9540     const uint8_t* i23 = input[23];
9541     assert(i23 != NULL);
9542     if XNN_UNPREDICTABLE(i23 != zero) {
9543       i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
9544     }
9545     const uint8_t* i24 = input[24];
9546     assert(i24 != NULL);
9547     if XNN_UNPREDICTABLE(i24 != zero) {
9548       i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
9549     }
9550     input = (const uint8_t**) ((uintptr_t) input + input_stride);
9551 
9552     size_t c = channels;
9553     const void* w = weights;
9554     const __m128i vk_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
9555     for (; c >= 16; c -= 16) {
9556       __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
9557       __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
9558       __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
9559       __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
9560 
9561 
9562       const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
9563       const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(vi0x01234567);
9564       const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
9565       const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x01234567), vk_zero_point);
9566       const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
9567       const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(vi0x89ABCDEF);
9568       const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
9569       const __m128i vxk0x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x89ABCDEF), vk_zero_point);
9570       i0 += 16;
9571 
9572 
9573       const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
9574       const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
9575       const __m128i vprod0x89ABCDEFlo = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
9576       const __m128i vprod0x89ABCDEFhi = _mm_mulhi_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
9577 
9578       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
9579       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
9580       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod0x89ABCDEFlo, vprod0x89ABCDEFhi));
9581       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod0x89ABCDEFlo, vprod0x89ABCDEFhi));
9582 
9583       const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
9584       const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(vi1x01234567);
9585       const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
9586       const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x01234567), vk_zero_point);
9587       const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
9588       const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(vi1x89ABCDEF);
9589       const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
9590       const __m128i vxk1x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x89ABCDEF), vk_zero_point);
9591       i1 += 16;
9592 
9593 
9594       const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
9595       const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
9596       const __m128i vprod1x89ABCDEFlo = _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF);
9597       const __m128i vprod1x89ABCDEFhi = _mm_mulhi_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF);
9598 
9599       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
9600       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
9601       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod1x89ABCDEFlo, vprod1x89ABCDEFhi));
9602       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod1x89ABCDEFlo, vprod1x89ABCDEFhi));
9603 
9604       const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
9605       const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(vi2x01234567);
9606       const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
9607       const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x01234567), vk_zero_point);
9608       const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
9609       const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(vi2x89ABCDEF);
9610       const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
9611       const __m128i vxk2x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x89ABCDEF), vk_zero_point);
9612       i2 += 16;
9613 
9614 
9615       const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
9616       const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
9617       const __m128i vprod2x89ABCDEFlo = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
9618       const __m128i vprod2x89ABCDEFhi = _mm_mulhi_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
9619 
9620       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
9621       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
9622       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod2x89ABCDEFlo, vprod2x89ABCDEFhi));
9623       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod2x89ABCDEFlo, vprod2x89ABCDEFhi));
9624 
9625       const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
9626       const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(vi3x01234567);
9627       const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
9628       const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x01234567), vk_zero_point);
9629       const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
9630       const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(vi3x89ABCDEF);
9631       const __m128i vk3x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
9632       const __m128i vxk3x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x89ABCDEF), vk_zero_point);
9633       i3 += 16;
9634 
9635 
9636       const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
9637       const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
9638       const __m128i vprod3x89ABCDEFlo = _mm_mullo_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF);
9639       const __m128i vprod3x89ABCDEFhi = _mm_mulhi_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF);
9640 
9641       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
9642       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
9643       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod3x89ABCDEFlo, vprod3x89ABCDEFhi));
9644       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod3x89ABCDEFlo, vprod3x89ABCDEFhi));
9645 
9646       const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
9647       const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(vi4x01234567);
9648       const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
9649       const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x01234567), vk_zero_point);
9650       const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
9651       const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(vi4x89ABCDEF);
9652       const __m128i vk4x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)));
9653       const __m128i vxk4x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x89ABCDEF), vk_zero_point);
9654       i4 += 16;
9655 
9656 
9657       const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
9658       const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
9659       const __m128i vprod4x89ABCDEFlo = _mm_mullo_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF);
9660       const __m128i vprod4x89ABCDEFhi = _mm_mulhi_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF);
9661 
9662       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
9663       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
9664       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod4x89ABCDEFlo, vprod4x89ABCDEFhi));
9665       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod4x89ABCDEFlo, vprod4x89ABCDEFhi));
9666 
9667       const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
9668       const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(vi5x01234567);
9669       const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)));
9670       const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x01234567), vk_zero_point);
9671       const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
9672       const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(vi5x89ABCDEF);
9673       const __m128i vk5x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)));
9674       const __m128i vxk5x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x89ABCDEF), vk_zero_point);
9675       i5 += 16;
9676 
9677 
9678       const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
9679       const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
9680       const __m128i vprod5x89ABCDEFlo = _mm_mullo_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF);
9681       const __m128i vprod5x89ABCDEFhi = _mm_mulhi_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF);
9682 
9683       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
9684       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
9685       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod5x89ABCDEFlo, vprod5x89ABCDEFhi));
9686       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod5x89ABCDEFlo, vprod5x89ABCDEFhi));
9687 
9688       const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
9689       const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(vi6x01234567);
9690       const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)));
9691       const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x01234567), vk_zero_point);
9692       const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
9693       const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(vi6x89ABCDEF);
9694       const __m128i vk6x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)));
9695       const __m128i vxk6x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x89ABCDEF), vk_zero_point);
9696       i6 += 16;
9697 
9698 
9699       const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
9700       const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
9701       const __m128i vprod6x89ABCDEFlo = _mm_mullo_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF);
9702       const __m128i vprod6x89ABCDEFhi = _mm_mulhi_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF);
9703 
9704       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
9705       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
9706       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod6x89ABCDEFlo, vprod6x89ABCDEFhi));
9707       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod6x89ABCDEFlo, vprod6x89ABCDEFhi));
9708 
9709       const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
9710       const __m128i vxi7x01234567 = _mm_cvtepu8_epi16(vi7x01234567);
9711       const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)));
9712       const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x01234567), vk_zero_point);
9713       const __m128i vi7x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i7 + 8));
9714       const __m128i vxi7x89ABCDEF = _mm_cvtepu8_epi16(vi7x89ABCDEF);
9715       const __m128i vk7x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)));
9716       const __m128i vxk7x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x89ABCDEF), vk_zero_point);
9717       i7 += 16;
9718 
9719 
9720       const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
9721       const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
9722       const __m128i vprod7x89ABCDEFlo = _mm_mullo_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF);
9723       const __m128i vprod7x89ABCDEFhi = _mm_mulhi_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF);
9724 
9725       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
9726       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
9727       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod7x89ABCDEFlo, vprod7x89ABCDEFhi));
9728       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod7x89ABCDEFlo, vprod7x89ABCDEFhi));
9729 
9730       const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
9731       const __m128i vxi8x01234567 = _mm_cvtepu8_epi16(vi8x01234567);
9732       const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)));
9733       const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x01234567), vk_zero_point);
9734       const __m128i vi8x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i8 + 8));
9735       const __m128i vxi8x89ABCDEF = _mm_cvtepu8_epi16(vi8x89ABCDEF);
9736       const __m128i vk8x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)));
9737       const __m128i vxk8x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x89ABCDEF), vk_zero_point);
9738       i8 += 16;
9739 
9740 
9741       const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
9742       const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
9743       const __m128i vprod8x89ABCDEFlo = _mm_mullo_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF);
9744       const __m128i vprod8x89ABCDEFhi = _mm_mulhi_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF);
9745 
9746       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
9747       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
9748       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod8x89ABCDEFlo, vprod8x89ABCDEFhi));
9749       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod8x89ABCDEFlo, vprod8x89ABCDEFhi));
9750 
9751       const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
9752       const __m128i vxi9x01234567 = _mm_cvtepu8_epi16(vi9x01234567);
9753       const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t)));
9754       const __m128i vxk9x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk9x01234567), vk_zero_point);
9755       const __m128i vi9x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i9 + 8));
9756       const __m128i vxi9x89ABCDEF = _mm_cvtepu8_epi16(vi9x89ABCDEF);
9757       const __m128i vk9x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(uint8_t)));
9758       const __m128i vxk9x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk9x89ABCDEF), vk_zero_point);
9759       i9 += 16;
9760 
9761 
9762       const __m128i vprod9x01234567lo = _mm_mullo_epi16(vxi9x01234567, vxk9x01234567);
9763       const __m128i vprod9x01234567hi = _mm_mulhi_epi16(vxi9x01234567, vxk9x01234567);
9764       const __m128i vprod9x89ABCDEFlo = _mm_mullo_epi16(vxi9x89ABCDEF, vxk9x89ABCDEF);
9765       const __m128i vprod9x89ABCDEFhi = _mm_mulhi_epi16(vxi9x89ABCDEF, vxk9x89ABCDEF);
9766 
9767       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod9x01234567lo, vprod9x01234567hi));
9768       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod9x01234567lo, vprod9x01234567hi));
9769       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod9x89ABCDEFlo, vprod9x89ABCDEFhi));
9770       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod9x89ABCDEFlo, vprod9x89ABCDEFhi));
9771 
9772       const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
9773       const __m128i vxi10x01234567 = _mm_cvtepu8_epi16(vi10x01234567);
9774       const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(uint8_t)));
9775       const __m128i vxk10x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk10x01234567), vk_zero_point);
9776       const __m128i vi10x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i10 + 8));
9777       const __m128i vxi10x89ABCDEF = _mm_cvtepu8_epi16(vi10x89ABCDEF);
9778       const __m128i vk10x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(uint8_t)));
9779       const __m128i vxk10x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk10x89ABCDEF), vk_zero_point);
9780       i10 += 16;
9781 
9782 
9783       const __m128i vprod10x01234567lo = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
9784       const __m128i vprod10x01234567hi = _mm_mulhi_epi16(vxi10x01234567, vxk10x01234567);
9785       const __m128i vprod10x89ABCDEFlo = _mm_mullo_epi16(vxi10x89ABCDEF, vxk10x89ABCDEF);
9786       const __m128i vprod10x89ABCDEFhi = _mm_mulhi_epi16(vxi10x89ABCDEF, vxk10x89ABCDEF);
9787 
9788       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod10x01234567lo, vprod10x01234567hi));
9789       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod10x01234567lo, vprod10x01234567hi));
9790       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod10x89ABCDEFlo, vprod10x89ABCDEFhi));
9791       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod10x89ABCDEFlo, vprod10x89ABCDEFhi));
9792 
9793       const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
9794       const __m128i vxi11x01234567 = _mm_cvtepu8_epi16(vi11x01234567);
9795       const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(uint8_t)));
9796       const __m128i vxk11x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk11x01234567), vk_zero_point);
9797       const __m128i vi11x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i11 + 8));
9798       const __m128i vxi11x89ABCDEF = _mm_cvtepu8_epi16(vi11x89ABCDEF);
9799       const __m128i vk11x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(uint8_t)));
9800       const __m128i vxk11x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk11x89ABCDEF), vk_zero_point);
9801       i11 += 16;
9802 
9803 
9804       const __m128i vprod11x01234567lo = _mm_mullo_epi16(vxi11x01234567, vxk11x01234567);
9805       const __m128i vprod11x01234567hi = _mm_mulhi_epi16(vxi11x01234567, vxk11x01234567);
9806       const __m128i vprod11x89ABCDEFlo = _mm_mullo_epi16(vxi11x89ABCDEF, vxk11x89ABCDEF);
9807       const __m128i vprod11x89ABCDEFhi = _mm_mulhi_epi16(vxi11x89ABCDEF, vxk11x89ABCDEF);
9808 
9809       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod11x01234567lo, vprod11x01234567hi));
9810       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod11x01234567lo, vprod11x01234567hi));
9811       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod11x89ABCDEFlo, vprod11x89ABCDEFhi));
9812       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod11x89ABCDEFlo, vprod11x89ABCDEFhi));
9813 
9814       const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
9815       const __m128i vxi12x01234567 = _mm_cvtepu8_epi16(vi12x01234567);
9816       const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(uint8_t)));
9817       const __m128i vxk12x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk12x01234567), vk_zero_point);
9818       const __m128i vi12x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i12 + 8));
9819       const __m128i vxi12x89ABCDEF = _mm_cvtepu8_epi16(vi12x89ABCDEF);
9820       const __m128i vk12x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(uint8_t)));
9821       const __m128i vxk12x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk12x89ABCDEF), vk_zero_point);
9822       i12 += 16;
9823 
9824 
9825       const __m128i vprod12x01234567lo = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
9826       const __m128i vprod12x01234567hi = _mm_mulhi_epi16(vxi12x01234567, vxk12x01234567);
9827       const __m128i vprod12x89ABCDEFlo = _mm_mullo_epi16(vxi12x89ABCDEF, vxk12x89ABCDEF);
9828       const __m128i vprod12x89ABCDEFhi = _mm_mulhi_epi16(vxi12x89ABCDEF, vxk12x89ABCDEF);
9829 
9830       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod12x01234567lo, vprod12x01234567hi));
9831       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod12x01234567lo, vprod12x01234567hi));
9832       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod12x89ABCDEFlo, vprod12x89ABCDEFhi));
9833       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod12x89ABCDEFlo, vprod12x89ABCDEFhi));
9834 
9835       const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
9836       const __m128i vxi13x01234567 = _mm_cvtepu8_epi16(vi13x01234567);
9837       const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(uint8_t)));
9838       const __m128i vxk13x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk13x01234567), vk_zero_point);
9839       const __m128i vi13x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i13 + 8));
9840       const __m128i vxi13x89ABCDEF = _mm_cvtepu8_epi16(vi13x89ABCDEF);
9841       const __m128i vk13x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(uint8_t)));
9842       const __m128i vxk13x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk13x89ABCDEF), vk_zero_point);
9843       i13 += 16;
9844 
9845 
9846       const __m128i vprod13x01234567lo = _mm_mullo_epi16(vxi13x01234567, vxk13x01234567);
9847       const __m128i vprod13x01234567hi = _mm_mulhi_epi16(vxi13x01234567, vxk13x01234567);
9848       const __m128i vprod13x89ABCDEFlo = _mm_mullo_epi16(vxi13x89ABCDEF, vxk13x89ABCDEF);
9849       const __m128i vprod13x89ABCDEFhi = _mm_mulhi_epi16(vxi13x89ABCDEF, vxk13x89ABCDEF);
9850 
9851       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod13x01234567lo, vprod13x01234567hi));
9852       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod13x01234567lo, vprod13x01234567hi));
9853       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod13x89ABCDEFlo, vprod13x89ABCDEFhi));
9854       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod13x89ABCDEFlo, vprod13x89ABCDEFhi));
9855 
9856       const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
9857       const __m128i vxi14x01234567 = _mm_cvtepu8_epi16(vi14x01234567);
9858       const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(uint8_t)));
9859       const __m128i vxk14x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk14x01234567), vk_zero_point);
9860       const __m128i vi14x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i14 + 8));
9861       const __m128i vxi14x89ABCDEF = _mm_cvtepu8_epi16(vi14x89ABCDEF);
9862       const __m128i vk14x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(uint8_t)));
9863       const __m128i vxk14x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk14x89ABCDEF), vk_zero_point);
9864       i14 += 16;
9865 
9866 
9867       const __m128i vprod14x01234567lo = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
9868       const __m128i vprod14x01234567hi = _mm_mulhi_epi16(vxi14x01234567, vxk14x01234567);
9869       const __m128i vprod14x89ABCDEFlo = _mm_mullo_epi16(vxi14x89ABCDEF, vxk14x89ABCDEF);
9870       const __m128i vprod14x89ABCDEFhi = _mm_mulhi_epi16(vxi14x89ABCDEF, vxk14x89ABCDEF);
9871 
9872       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod14x01234567lo, vprod14x01234567hi));
9873       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod14x01234567lo, vprod14x01234567hi));
9874       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod14x89ABCDEFlo, vprod14x89ABCDEFhi));
9875       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod14x89ABCDEFlo, vprod14x89ABCDEFhi));
9876 
9877       const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
9878       const __m128i vxi15x01234567 = _mm_cvtepu8_epi16(vi15x01234567);
9879       const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(uint8_t)));
9880       const __m128i vxk15x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk15x01234567), vk_zero_point);
9881       const __m128i vi15x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i15 + 8));
9882       const __m128i vxi15x89ABCDEF = _mm_cvtepu8_epi16(vi15x89ABCDEF);
9883       const __m128i vk15x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(uint8_t)));
9884       const __m128i vxk15x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk15x89ABCDEF), vk_zero_point);
9885       i15 += 16;
9886 
9887 
9888       const __m128i vprod15x01234567lo = _mm_mullo_epi16(vxi15x01234567, vxk15x01234567);
9889       const __m128i vprod15x01234567hi = _mm_mulhi_epi16(vxi15x01234567, vxk15x01234567);
9890       const __m128i vprod15x89ABCDEFlo = _mm_mullo_epi16(vxi15x89ABCDEF, vxk15x89ABCDEF);
9891       const __m128i vprod15x89ABCDEFhi = _mm_mulhi_epi16(vxi15x89ABCDEF, vxk15x89ABCDEF);
9892 
9893       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod15x01234567lo, vprod15x01234567hi));
9894       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod15x01234567lo, vprod15x01234567hi));
9895       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod15x89ABCDEFlo, vprod15x89ABCDEFhi));
9896       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod15x89ABCDEFlo, vprod15x89ABCDEFhi));
9897 
9898       const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
9899       const __m128i vxi16x01234567 = _mm_cvtepu8_epi16(vi16x01234567);
9900       const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(uint8_t)));
9901       const __m128i vxk16x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk16x01234567), vk_zero_point);
9902       const __m128i vi16x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i16 + 8));
9903       const __m128i vxi16x89ABCDEF = _mm_cvtepu8_epi16(vi16x89ABCDEF);
9904       const __m128i vk16x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(uint8_t)));
9905       const __m128i vxk16x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk16x89ABCDEF), vk_zero_point);
9906       i16 += 16;
9907 
9908 
9909       const __m128i vprod16x01234567lo = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
9910       const __m128i vprod16x01234567hi = _mm_mulhi_epi16(vxi16x01234567, vxk16x01234567);
9911       const __m128i vprod16x89ABCDEFlo = _mm_mullo_epi16(vxi16x89ABCDEF, vxk16x89ABCDEF);
9912       const __m128i vprod16x89ABCDEFhi = _mm_mulhi_epi16(vxi16x89ABCDEF, vxk16x89ABCDEF);
9913 
9914       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod16x01234567lo, vprod16x01234567hi));
9915       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod16x01234567lo, vprod16x01234567hi));
9916       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod16x89ABCDEFlo, vprod16x89ABCDEFhi));
9917       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod16x89ABCDEFlo, vprod16x89ABCDEFhi));
9918 
9919       const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
9920       const __m128i vxi17x01234567 = _mm_cvtepu8_epi16(vi17x01234567);
9921       const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(uint8_t)));
9922       const __m128i vxk17x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk17x01234567), vk_zero_point);
9923       const __m128i vi17x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i17 + 8));
9924       const __m128i vxi17x89ABCDEF = _mm_cvtepu8_epi16(vi17x89ABCDEF);
9925       const __m128i vk17x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(uint8_t)));
9926       const __m128i vxk17x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk17x89ABCDEF), vk_zero_point);
9927       i17 += 16;
9928 
9929 
9930       const __m128i vprod17x01234567lo = _mm_mullo_epi16(vxi17x01234567, vxk17x01234567);
9931       const __m128i vprod17x01234567hi = _mm_mulhi_epi16(vxi17x01234567, vxk17x01234567);
9932       const __m128i vprod17x89ABCDEFlo = _mm_mullo_epi16(vxi17x89ABCDEF, vxk17x89ABCDEF);
9933       const __m128i vprod17x89ABCDEFhi = _mm_mulhi_epi16(vxi17x89ABCDEF, vxk17x89ABCDEF);
9934 
9935       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod17x01234567lo, vprod17x01234567hi));
9936       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod17x01234567lo, vprod17x01234567hi));
9937       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod17x89ABCDEFlo, vprod17x89ABCDEFhi));
9938       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod17x89ABCDEFlo, vprod17x89ABCDEFhi));
9939 
9940       const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
9941       const __m128i vxi18x01234567 = _mm_cvtepu8_epi16(vi18x01234567);
9942       const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(uint8_t)));
9943       const __m128i vxk18x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk18x01234567), vk_zero_point);
9944       const __m128i vi18x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i18 + 8));
9945       const __m128i vxi18x89ABCDEF = _mm_cvtepu8_epi16(vi18x89ABCDEF);
9946       const __m128i vk18x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(uint8_t)));
9947       const __m128i vxk18x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk18x89ABCDEF), vk_zero_point);
9948       i18 += 16;
9949 
9950 
9951       const __m128i vprod18x01234567lo = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
9952       const __m128i vprod18x01234567hi = _mm_mulhi_epi16(vxi18x01234567, vxk18x01234567);
9953       const __m128i vprod18x89ABCDEFlo = _mm_mullo_epi16(vxi18x89ABCDEF, vxk18x89ABCDEF);
9954       const __m128i vprod18x89ABCDEFhi = _mm_mulhi_epi16(vxi18x89ABCDEF, vxk18x89ABCDEF);
9955 
9956       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod18x01234567lo, vprod18x01234567hi));
9957       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod18x01234567lo, vprod18x01234567hi));
9958       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod18x89ABCDEFlo, vprod18x89ABCDEFhi));
9959       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod18x89ABCDEFlo, vprod18x89ABCDEFhi));
9960 
9961       const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
9962       const __m128i vxi19x01234567 = _mm_cvtepu8_epi16(vi19x01234567);
9963       const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(uint8_t)));
9964       const __m128i vxk19x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk19x01234567), vk_zero_point);
9965       const __m128i vi19x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i19 + 8));
9966       const __m128i vxi19x89ABCDEF = _mm_cvtepu8_epi16(vi19x89ABCDEF);
9967       const __m128i vk19x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(uint8_t)));
9968       const __m128i vxk19x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk19x89ABCDEF), vk_zero_point);
9969       i19 += 16;
9970 
9971 
9972       const __m128i vprod19x01234567lo = _mm_mullo_epi16(vxi19x01234567, vxk19x01234567);
9973       const __m128i vprod19x01234567hi = _mm_mulhi_epi16(vxi19x01234567, vxk19x01234567);
9974       const __m128i vprod19x89ABCDEFlo = _mm_mullo_epi16(vxi19x89ABCDEF, vxk19x89ABCDEF);
9975       const __m128i vprod19x89ABCDEFhi = _mm_mulhi_epi16(vxi19x89ABCDEF, vxk19x89ABCDEF);
9976 
9977       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod19x01234567lo, vprod19x01234567hi));
9978       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod19x01234567lo, vprod19x01234567hi));
9979       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod19x89ABCDEFlo, vprod19x89ABCDEFhi));
9980       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod19x89ABCDEFlo, vprod19x89ABCDEFhi));
9981 
9982       const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
9983       const __m128i vxi20x01234567 = _mm_cvtepu8_epi16(vi20x01234567);
9984       const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(uint8_t)));
9985       const __m128i vxk20x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk20x01234567), vk_zero_point);
9986       const __m128i vi20x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i20 + 8));
9987       const __m128i vxi20x89ABCDEF = _mm_cvtepu8_epi16(vi20x89ABCDEF);
9988       const __m128i vk20x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(uint8_t)));
9989       const __m128i vxk20x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk20x89ABCDEF), vk_zero_point);
9990       i20 += 16;
9991 
9992 
9993       const __m128i vprod20x01234567lo = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
9994       const __m128i vprod20x01234567hi = _mm_mulhi_epi16(vxi20x01234567, vxk20x01234567);
9995       const __m128i vprod20x89ABCDEFlo = _mm_mullo_epi16(vxi20x89ABCDEF, vxk20x89ABCDEF);
9996       const __m128i vprod20x89ABCDEFhi = _mm_mulhi_epi16(vxi20x89ABCDEF, vxk20x89ABCDEF);
9997 
9998       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod20x01234567lo, vprod20x01234567hi));
9999       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod20x01234567lo, vprod20x01234567hi));
10000       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod20x89ABCDEFlo, vprod20x89ABCDEFhi));
10001       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod20x89ABCDEFlo, vprod20x89ABCDEFhi));
10002 
10003       const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
10004       const __m128i vxi21x01234567 = _mm_cvtepu8_epi16(vi21x01234567);
10005       const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(uint8_t)));
10006       const __m128i vxk21x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk21x01234567), vk_zero_point);
10007       const __m128i vi21x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i21 + 8));
10008       const __m128i vxi21x89ABCDEF = _mm_cvtepu8_epi16(vi21x89ABCDEF);
10009       const __m128i vk21x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(uint8_t)));
10010       const __m128i vxk21x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk21x89ABCDEF), vk_zero_point);
10011       i21 += 16;
10012 
10013 
10014       const __m128i vprod21x01234567lo = _mm_mullo_epi16(vxi21x01234567, vxk21x01234567);
10015       const __m128i vprod21x01234567hi = _mm_mulhi_epi16(vxi21x01234567, vxk21x01234567);
10016       const __m128i vprod21x89ABCDEFlo = _mm_mullo_epi16(vxi21x89ABCDEF, vxk21x89ABCDEF);
10017       const __m128i vprod21x89ABCDEFhi = _mm_mulhi_epi16(vxi21x89ABCDEF, vxk21x89ABCDEF);
10018 
10019       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod21x01234567lo, vprod21x01234567hi));
10020       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod21x01234567lo, vprod21x01234567hi));
10021       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod21x89ABCDEFlo, vprod21x89ABCDEFhi));
10022       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod21x89ABCDEFlo, vprod21x89ABCDEFhi));
10023 
10024       const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
10025       const __m128i vxi22x01234567 = _mm_cvtepu8_epi16(vi22x01234567);
10026       const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(uint8_t)));
10027       const __m128i vxk22x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk22x01234567), vk_zero_point);
10028       const __m128i vi22x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i22 + 8));
10029       const __m128i vxi22x89ABCDEF = _mm_cvtepu8_epi16(vi22x89ABCDEF);
10030       const __m128i vk22x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(uint8_t)));
10031       const __m128i vxk22x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk22x89ABCDEF), vk_zero_point);
10032       i22 += 16;
10033 
10034 
10035       const __m128i vprod22x01234567lo = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
10036       const __m128i vprod22x01234567hi = _mm_mulhi_epi16(vxi22x01234567, vxk22x01234567);
10037       const __m128i vprod22x89ABCDEFlo = _mm_mullo_epi16(vxi22x89ABCDEF, vxk22x89ABCDEF);
10038       const __m128i vprod22x89ABCDEFhi = _mm_mulhi_epi16(vxi22x89ABCDEF, vxk22x89ABCDEF);
10039 
10040       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod22x01234567lo, vprod22x01234567hi));
10041       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod22x01234567lo, vprod22x01234567hi));
10042       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod22x89ABCDEFlo, vprod22x89ABCDEFhi));
10043       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod22x89ABCDEFlo, vprod22x89ABCDEFhi));
10044 
10045       const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
10046       const __m128i vxi23x01234567 = _mm_cvtepu8_epi16(vi23x01234567);
10047       const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(uint8_t)));
10048       const __m128i vxk23x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk23x01234567), vk_zero_point);
10049       const __m128i vi23x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i23 + 8));
10050       const __m128i vxi23x89ABCDEF = _mm_cvtepu8_epi16(vi23x89ABCDEF);
10051       const __m128i vk23x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(uint8_t)));
10052       const __m128i vxk23x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk23x89ABCDEF), vk_zero_point);
10053       i23 += 16;
10054 
10055 
10056       const __m128i vprod23x01234567lo = _mm_mullo_epi16(vxi23x01234567, vxk23x01234567);
10057       const __m128i vprod23x01234567hi = _mm_mulhi_epi16(vxi23x01234567, vxk23x01234567);
10058       const __m128i vprod23x89ABCDEFlo = _mm_mullo_epi16(vxi23x89ABCDEF, vxk23x89ABCDEF);
10059       const __m128i vprod23x89ABCDEFhi = _mm_mulhi_epi16(vxi23x89ABCDEF, vxk23x89ABCDEF);
10060 
10061       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod23x01234567lo, vprod23x01234567hi));
10062       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod23x01234567lo, vprod23x01234567hi));
10063       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod23x89ABCDEFlo, vprod23x89ABCDEFhi));
10064       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod23x89ABCDEFlo, vprod23x89ABCDEFhi));
10065 
10066       const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
10067       const __m128i vxi24x01234567 = _mm_cvtepu8_epi16(vi24x01234567);
10068       const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(uint8_t)));
10069       const __m128i vxk24x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk24x01234567), vk_zero_point);
10070       const __m128i vi24x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i24 + 8));
10071       const __m128i vxi24x89ABCDEF = _mm_cvtepu8_epi16(vi24x89ABCDEF);
10072       const __m128i vk24x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(uint8_t)));
10073       const __m128i vxk24x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk24x89ABCDEF), vk_zero_point);
10074       i24 += 16;
10075 
10076 
10077       const __m128i vprod24x01234567lo = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
10078       const __m128i vprod24x01234567hi = _mm_mulhi_epi16(vxi24x01234567, vxk24x01234567);
10079       const __m128i vprod24x89ABCDEFlo = _mm_mullo_epi16(vxi24x89ABCDEF, vxk24x89ABCDEF);
10080       const __m128i vprod24x89ABCDEFhi = _mm_mulhi_epi16(vxi24x89ABCDEF, vxk24x89ABCDEF);
10081 
10082       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod24x01234567lo, vprod24x01234567hi));
10083       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod24x01234567lo, vprod24x01234567hi));
10084       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod24x89ABCDEFlo, vprod24x89ABCDEFhi));
10085       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod24x89ABCDEFlo, vprod24x89ABCDEFhi));
10086 
10087       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(uint8_t));
10088 
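      // Requantization: convert the int32 accumulators to float, apply the per-tensor
      // scale, clamp against (output_max - output_zero_point), and round back to int32.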
10089       __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
10090       __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
10091       __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
10092       __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
10093 
10094       const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
10095       vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
10096       vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
10097       vscaled89AB = _mm_mul_ps(vscaled89AB, vscale);
10098       vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscale);
10099 
10100       const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
10101       vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
10102       vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
10103       vscaled89AB = _mm_min_ps(vscaled89AB, voutput_max_less_zero_point);
10104       vscaledCDEF = _mm_min_ps(vscaledCDEF, voutput_max_less_zero_point);
10105 
10106       vacc0123 = _mm_cvtps_epi32(vscaled0123);
10107       vacc4567 = _mm_cvtps_epi32(vscaled4567);
10108       vacc89AB = _mm_cvtps_epi32(vscaled89AB);
10109       vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
10110 
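      // Pack to uint8: saturate the int32 accumulators to int16, add the output zero
      // point with saturation, pack to unsigned bytes, clamp against output_min, and
      // store all 16 channels.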
10111       const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
10112       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
10113       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
10114 
10115       __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
10116 
10117       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
10118       vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
10119 
10120       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
10121       output += 16;
10122     }
10123     if XNN_UNLIKELY(c != 0) {
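      // Remainder path: handle the last 1-15 channels in groups of up to 8. The bias for
      // the remaining channels is loaded from w, and k points at the packed kernel bytes
      // just past the 16 int32 bias slots; each tap's weights stay 16 bytes apart.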
10124       const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
10125       do {
10126         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
10127         __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
10128 
10129 
10130         const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
10131         const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(vi0x01234567);
10132         const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k);
10133         const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x01234567), vk_zero_point);
10134         i0 += 8;
10135 
10136 
10137         const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
10138         const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
10139 
10140         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
10141         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
10142 
10143         const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
10144         const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(vi1x01234567);
10145         const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 16));
10146         const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x01234567), vk_zero_point);
10147         i1 += 8;
10148 
10149 
10150         const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
10151         const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
10152 
10153         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
10154         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
10155 
10156         const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
10157         const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(vi2x01234567);
10158         const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 32));
10159         const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x01234567), vk_zero_point);
10160         i2 += 8;
10161 
10162 
10163         const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
10164         const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
10165 
10166         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
10167         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
10168 
10169         const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
10170         const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(vi3x01234567);
10171         const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) (k + 48));
10172         const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x01234567), vk_zero_point);
10173         i3 += 8;
10174 
10175 
10176         const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
10177         const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
10178 
10179         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
10180         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
10181 
10182         const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
10183         const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(vi4x01234567);
10184         const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) (k + 64));
10185         const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x01234567), vk_zero_point);
10186         i4 += 8;
10187 
10188 
10189         const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
10190         const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
10191 
10192         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
10193         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
10194 
10195         const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
10196         const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(vi5x01234567);
10197         const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) (k + 80));
10198         const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x01234567), vk_zero_point);
10199         i5 += 8;
10200 
10201 
10202         const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
10203         const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
10204 
10205         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
10206         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
10207 
10208         const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
10209         const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(vi6x01234567);
10210         const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) (k + 96));
10211         const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x01234567), vk_zero_point);
10212         i6 += 8;
10213 
10214 
10215         const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
10216         const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
10217 
10218         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
10219         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
10220 
10221         const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
10222         const __m128i vxi7x01234567 = _mm_cvtepu8_epi16(vi7x01234567);
10223         const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) (k + 112));
10224         const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x01234567), vk_zero_point);
10225         i7 += 8;
10226 
10227 
10228         const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
10229         const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
10230 
10231         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
10232         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
10233 
10234         const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
10235         const __m128i vxi8x01234567 = _mm_cvtepu8_epi16(vi8x01234567);
10236         const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) (k + 128));
10237         const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x01234567), vk_zero_point);
10238         i8 += 8;
10239 
10240 
10241         const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
10242         const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
10243 
10244         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
10245         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
10246 
10247         const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
10248         const __m128i vxi9x01234567 = _mm_cvtepu8_epi16(vi9x01234567);
10249         const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) (k + 144));
10250         const __m128i vxk9x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk9x01234567), vk_zero_point);
10251         i9 += 8;
10252 
10253 
10254         const __m128i vprod9x01234567lo = _mm_mullo_epi16(vxi9x01234567, vxk9x01234567);
10255         const __m128i vprod9x01234567hi = _mm_mulhi_epi16(vxi9x01234567, vxk9x01234567);
10256 
10257         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod9x01234567lo, vprod9x01234567hi));
10258         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod9x01234567lo, vprod9x01234567hi));
10259 
10260         const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
10261         const __m128i vxi10x01234567 = _mm_cvtepu8_epi16(vi10x01234567);
10262         const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) (k + 160));
10263         const __m128i vxk10x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk10x01234567), vk_zero_point);
10264         i10 += 8;
10265 
10266 
10267         const __m128i vprod10x01234567lo = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
10268         const __m128i vprod10x01234567hi = _mm_mulhi_epi16(vxi10x01234567, vxk10x01234567);
10269 
10270         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod10x01234567lo, vprod10x01234567hi));
10271         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod10x01234567lo, vprod10x01234567hi));
10272 
10273         const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
10274         const __m128i vxi11x01234567 = _mm_cvtepu8_epi16(vi11x01234567);
10275         const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) (k + 176));
10276         const __m128i vxk11x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk11x01234567), vk_zero_point);
10277         i11 += 8;
10278 
10279 
10280         const __m128i vprod11x01234567lo = _mm_mullo_epi16(vxi11x01234567, vxk11x01234567);
10281         const __m128i vprod11x01234567hi = _mm_mulhi_epi16(vxi11x01234567, vxk11x01234567);
10282 
10283         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod11x01234567lo, vprod11x01234567hi));
10284         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod11x01234567lo, vprod11x01234567hi));
10285 
10286         const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
10287         const __m128i vxi12x01234567 = _mm_cvtepu8_epi16(vi12x01234567);
10288         const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) (k + 192));
10289         const __m128i vxk12x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk12x01234567), vk_zero_point);
10290         i12 += 8;
10291 
10292 
10293         const __m128i vprod12x01234567lo = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
10294         const __m128i vprod12x01234567hi = _mm_mulhi_epi16(vxi12x01234567, vxk12x01234567);
10295 
10296         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod12x01234567lo, vprod12x01234567hi));
10297         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod12x01234567lo, vprod12x01234567hi));
10298 
10299         const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
10300         const __m128i vxi13x01234567 = _mm_cvtepu8_epi16(vi13x01234567);
10301         const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) (k + 208));
10302         const __m128i vxk13x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk13x01234567), vk_zero_point);
10303         i13 += 8;
10304 
10305 
10306         const __m128i vprod13x01234567lo = _mm_mullo_epi16(vxi13x01234567, vxk13x01234567);
10307         const __m128i vprod13x01234567hi = _mm_mulhi_epi16(vxi13x01234567, vxk13x01234567);
10308 
10309         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod13x01234567lo, vprod13x01234567hi));
10310         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod13x01234567lo, vprod13x01234567hi));
10311 
10312         const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
10313         const __m128i vxi14x01234567 = _mm_cvtepu8_epi16(vi14x01234567);
10314         const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) (k + 224));
10315         const __m128i vxk14x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk14x01234567), vk_zero_point);
10316         i14 += 8;
10317 
10318 
10319         const __m128i vprod14x01234567lo = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
10320         const __m128i vprod14x01234567hi = _mm_mulhi_epi16(vxi14x01234567, vxk14x01234567);
10321 
10322         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod14x01234567lo, vprod14x01234567hi));
10323         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod14x01234567lo, vprod14x01234567hi));
10324 
10325         const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
10326         const __m128i vxi15x01234567 = _mm_cvtepu8_epi16(vi15x01234567);
10327         const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) (k + 240));
10328         const __m128i vxk15x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk15x01234567), vk_zero_point);
10329         i15 += 8;
10330 
10331 
10332         const __m128i vprod15x01234567lo = _mm_mullo_epi16(vxi15x01234567, vxk15x01234567);
10333         const __m128i vprod15x01234567hi = _mm_mulhi_epi16(vxi15x01234567, vxk15x01234567);
10334 
10335         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod15x01234567lo, vprod15x01234567hi));
10336         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod15x01234567lo, vprod15x01234567hi));
10337 
10338         const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
10339         const __m128i vxi16x01234567 = _mm_cvtepu8_epi16(vi16x01234567);
10340         const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) (k + 256));
10341         const __m128i vxk16x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk16x01234567), vk_zero_point);
10342         i16 += 8;
10343 
10344 
10345         const __m128i vprod16x01234567lo = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
10346         const __m128i vprod16x01234567hi = _mm_mulhi_epi16(vxi16x01234567, vxk16x01234567);
10347 
10348         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod16x01234567lo, vprod16x01234567hi));
10349         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod16x01234567lo, vprod16x01234567hi));
10350 
10351         const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
10352         const __m128i vxi17x01234567 = _mm_cvtepu8_epi16(vi17x01234567);
10353         const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) (k + 272));
10354         const __m128i vxk17x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk17x01234567), vk_zero_point);
10355         i17 += 8;
10356 
10357 
10358         const __m128i vprod17x01234567lo = _mm_mullo_epi16(vxi17x01234567, vxk17x01234567);
10359         const __m128i vprod17x01234567hi = _mm_mulhi_epi16(vxi17x01234567, vxk17x01234567);
10360 
10361         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod17x01234567lo, vprod17x01234567hi));
10362         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod17x01234567lo, vprod17x01234567hi));
10363 
10364         const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
10365         const __m128i vxi18x01234567 = _mm_cvtepu8_epi16(vi18x01234567);
10366         const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) (k + 288));
10367         const __m128i vxk18x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk18x01234567), vk_zero_point);
10368         i18 += 8;
10369 
10370 
10371         const __m128i vprod18x01234567lo = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
10372         const __m128i vprod18x01234567hi = _mm_mulhi_epi16(vxi18x01234567, vxk18x01234567);
10373 
10374         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod18x01234567lo, vprod18x01234567hi));
10375         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod18x01234567lo, vprod18x01234567hi));
10376 
10377         const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
10378         const __m128i vxi19x01234567 = _mm_cvtepu8_epi16(vi19x01234567);
10379         const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) (k + 304));
10380         const __m128i vxk19x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk19x01234567), vk_zero_point);
10381         i19 += 8;
10382 
10383 
10384         const __m128i vprod19x01234567lo = _mm_mullo_epi16(vxi19x01234567, vxk19x01234567);
10385         const __m128i vprod19x01234567hi = _mm_mulhi_epi16(vxi19x01234567, vxk19x01234567);
10386 
10387         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod19x01234567lo, vprod19x01234567hi));
10388         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod19x01234567lo, vprod19x01234567hi));
10389 
10390         const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
10391         const __m128i vxi20x01234567 = _mm_cvtepu8_epi16(vi20x01234567);
10392         const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) (k + 320));
10393         const __m128i vxk20x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk20x01234567), vk_zero_point);
10394         i20 += 8;
10395 
10396 
10397         const __m128i vprod20x01234567lo = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
10398         const __m128i vprod20x01234567hi = _mm_mulhi_epi16(vxi20x01234567, vxk20x01234567);
10399 
10400         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod20x01234567lo, vprod20x01234567hi));
10401         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod20x01234567lo, vprod20x01234567hi));
10402 
10403         const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
10404         const __m128i vxi21x01234567 = _mm_cvtepu8_epi16(vi21x01234567);
10405         const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) (k + 336));
10406         const __m128i vxk21x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk21x01234567), vk_zero_point);
10407         i21 += 8;
10408 
10409 
10410         const __m128i vprod21x01234567lo = _mm_mullo_epi16(vxi21x01234567, vxk21x01234567);
10411         const __m128i vprod21x01234567hi = _mm_mulhi_epi16(vxi21x01234567, vxk21x01234567);
10412 
10413         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod21x01234567lo, vprod21x01234567hi));
10414         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod21x01234567lo, vprod21x01234567hi));
10415 
10416         const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
10417         const __m128i vxi22x01234567 = _mm_cvtepu8_epi16(vi22x01234567);
10418         const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) (k + 352));
10419         const __m128i vxk22x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk22x01234567), vk_zero_point);
10420         i22 += 8;
10421 
10422 
10423         const __m128i vprod22x01234567lo = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
10424         const __m128i vprod22x01234567hi = _mm_mulhi_epi16(vxi22x01234567, vxk22x01234567);
10425 
10426         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod22x01234567lo, vprod22x01234567hi));
10427         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod22x01234567lo, vprod22x01234567hi));
10428 
10429         const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
10430         const __m128i vxi23x01234567 = _mm_cvtepu8_epi16(vi23x01234567);
10431         const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) (k + 368));
10432         const __m128i vxk23x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk23x01234567), vk_zero_point);
10433         i23 += 8;
10434 
10435 
10436         const __m128i vprod23x01234567lo = _mm_mullo_epi16(vxi23x01234567, vxk23x01234567);
10437         const __m128i vprod23x01234567hi = _mm_mulhi_epi16(vxi23x01234567, vxk23x01234567);
10438 
10439         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod23x01234567lo, vprod23x01234567hi));
10440         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod23x01234567lo, vprod23x01234567hi));
10441 
10442         const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
10443         const __m128i vxi24x01234567 = _mm_cvtepu8_epi16(vi24x01234567);
10444         const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) (k + 384));
10445         const __m128i vxk24x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk24x01234567), vk_zero_point);
10446         i24 += 8;
10447 
10448 
10449         const __m128i vprod24x01234567lo = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
10450         const __m128i vprod24x01234567hi = _mm_mulhi_epi16(vxi24x01234567, vxk24x01234567);
10451 
10452         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod24x01234567lo, vprod24x01234567hi));
10453         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod24x01234567lo, vprod24x01234567hi));
10454 
10455         k += 8;
10456 
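        // Requantization (remainder channels): convert the int32 accumulators to float,
        // apply the per-tensor scale, clamp against output_max_less_zero_point, and
        // round back to int32 before packing to uint8 below.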
10457         __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
10458         __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
10459 
10460         const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
10461         vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
10462         vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
10463 
10464         const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
10465         vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
10466         vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
10467 
10468         vacc0123 = _mm_cvtps_epi32(vscaled0123);
10469         vacc4567 = _mm_cvtps_epi32(vscaled4567);
10470 
10471         w = (const void*) ((const int32_t*) w + 8);
10472 
10473         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
10474         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
10475 
10476         __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
10477 
10478         vout0123456701234567 = _mm_max_epu8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
10479 
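        // Store a full group of 8 outputs, or the final partial group of 4/2/1 channels.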
10480         if XNN_LIKELY(c >= 8) {
10481           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
10482           output += 8;
10483           c -= 8;
10484         } else {
10485           if (c & 4) {
10486             unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
10487             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
10488             output += 4;
10489           }
10490           if (c & 2) {
10491             unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
10492             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
10493             output += 2;
10494           }
10495           if (c & 1) {
10496             *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
10497             output += 1;
10498           }
10499           c = 0;
10500         }
10501       } while (c != 0);
10502     }
10503 
10504     output = (uint8_t*) ((uintptr_t) output + output_increment);
10505   } while (--output_width != 0);
10506 }
10507 
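// QU8 depthwise convolution microkernel: 9 kernel taps, 16 channels per main-loop
// iteration (with an 8-channel remainder path), int32 accumulation via 16-bit
// multiplies and fp32 requantization.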
10508 void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16(
10509     size_t channels,
10510     size_t output_width,
10511     const uint8_t** input,
10512     const void* weights,
10513     uint8_t* output,
10514     size_t input_stride,
10515     size_t output_increment,
10516     size_t input_offset,
10517     const uint8_t* zero,
10518     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
10519 {
10520   assert(channels != 0);
10521   assert(output_width != 0);
10522 
10523   do {
10524     const uint8_t* i0 = input[0];
10525     assert(i0 != NULL);
10526     if XNN_UNPREDICTABLE(i0 != zero) {
10527       i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
10528     }
10529     const uint8_t* i1 = input[1];
10530     assert(i1 != NULL);
10531     if XNN_UNPREDICTABLE(i1 != zero) {
10532       i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
10533     }
10534     const uint8_t* i2 = input[2];
10535     assert(i2 != NULL);
10536     if XNN_UNPREDICTABLE(i2 != zero) {
10537       i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
10538     }
10539     const uint8_t* i3 = input[3];
10540     assert(i3 != NULL);
10541     if XNN_UNPREDICTABLE(i3 != zero) {
10542       i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
10543     }
10544     const uint8_t* i4 = input[4];
10545     assert(i4 != NULL);
10546     if XNN_UNPREDICTABLE(i4 != zero) {
10547       i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
10548     }
10549     const uint8_t* i5 = input[5];
10550     assert(i5 != NULL);
10551     if XNN_UNPREDICTABLE(i5 != zero) {
10552       i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
10553     }
10554     const uint8_t* i6 = input[6];
10555     assert(i6 != NULL);
10556     if XNN_UNPREDICTABLE(i6 != zero) {
10557       i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
10558     }
10559     const uint8_t* i7 = input[7];
10560     assert(i7 != NULL);
10561     if XNN_UNPREDICTABLE(i7 != zero) {
10562       i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
10563     }
10564     const uint8_t* i8 = input[8];
10565     assert(i8 != NULL);
10566     if XNN_UNPREDICTABLE(i8 != zero) {
10567       i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
10568     }
10569     input = (const uint8_t**) ((uintptr_t) input + input_stride);
10570 
10571     size_t c = channels;
10572     const void* w = weights;
10573     const __m128i vk_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
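    // Main loop: 16 channels per iteration. Each of the 9 taps contributes two 8-lane
    // 16-bit multiplies (mullo/mulhi pairs) that are widened and accumulated into the
    // four int32 vectors vacc0123 ... vaccCDEF.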
10574     for (; c >= 16; c -= 16) {
10575       __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
10576       __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
10577       __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
10578       __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
10579 
10580 
10581       const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
10582       const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(vi0x01234567);
10583       const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
10584       const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x01234567), vk_zero_point);
10585       const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
10586       const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(vi0x89ABCDEF);
10587       const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
10588       const __m128i vxk0x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x89ABCDEF), vk_zero_point);
10589       i0 += 16;
10590 
10591 
10592       const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
10593       const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
10594       const __m128i vprod0x89ABCDEFlo = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
10595       const __m128i vprod0x89ABCDEFhi = _mm_mulhi_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
10596 
10597       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
10598       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
10599       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod0x89ABCDEFlo, vprod0x89ABCDEFhi));
10600       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod0x89ABCDEFlo, vprod0x89ABCDEFhi));
10601 
10602       const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
10603       const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(vi1x01234567);
10604       const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
10605       const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x01234567), vk_zero_point);
10606       const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
10607       const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(vi1x89ABCDEF);
10608       const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
10609       const __m128i vxk1x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x89ABCDEF), vk_zero_point);
10610       i1 += 16;
10611 
10612 
10613       const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
10614       const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
10615       const __m128i vprod1x89ABCDEFlo = _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF);
10616       const __m128i vprod1x89ABCDEFhi = _mm_mulhi_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF);
10617 
10618       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
10619       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
10620       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod1x89ABCDEFlo, vprod1x89ABCDEFhi));
10621       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod1x89ABCDEFlo, vprod1x89ABCDEFhi));
10622 
10623       const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
10624       const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(vi2x01234567);
10625       const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
10626       const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x01234567), vk_zero_point);
10627       const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
10628       const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(vi2x89ABCDEF);
10629       const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
10630       const __m128i vxk2x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x89ABCDEF), vk_zero_point);
10631       i2 += 16;
10632 
10633 
10634       const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
10635       const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
10636       const __m128i vprod2x89ABCDEFlo = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
10637       const __m128i vprod2x89ABCDEFhi = _mm_mulhi_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
10638 
10639       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
10640       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
10641       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod2x89ABCDEFlo, vprod2x89ABCDEFhi));
10642       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod2x89ABCDEFlo, vprod2x89ABCDEFhi));
10643 
10644       const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
10645       const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(vi3x01234567);
10646       const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
10647       const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x01234567), vk_zero_point);
10648       const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
10649       const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(vi3x89ABCDEF);
10650       const __m128i vk3x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
10651       const __m128i vxk3x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x89ABCDEF), vk_zero_point);
10652       i3 += 16;
10653 
10654 
10655       const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
10656       const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
10657       const __m128i vprod3x89ABCDEFlo = _mm_mullo_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF);
10658       const __m128i vprod3x89ABCDEFhi = _mm_mulhi_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF);
10659 
10660       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
10661       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
10662       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod3x89ABCDEFlo, vprod3x89ABCDEFhi));
10663       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod3x89ABCDEFlo, vprod3x89ABCDEFhi));
10664 
10665       const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
10666       const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(vi4x01234567);
10667       const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
10668       const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x01234567), vk_zero_point);
10669       const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
10670       const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(vi4x89ABCDEF);
10671       const __m128i vk4x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)));
10672       const __m128i vxk4x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x89ABCDEF), vk_zero_point);
10673       i4 += 16;
10674 
10675 
10676       const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
10677       const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
10678       const __m128i vprod4x89ABCDEFlo = _mm_mullo_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF);
10679       const __m128i vprod4x89ABCDEFhi = _mm_mulhi_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF);
10680 
10681       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
10682       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
10683       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod4x89ABCDEFlo, vprod4x89ABCDEFhi));
10684       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod4x89ABCDEFlo, vprod4x89ABCDEFhi));
10685 
10686       const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
10687       const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(vi5x01234567);
10688       const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)));
10689       const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x01234567), vk_zero_point);
10690       const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
10691       const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(vi5x89ABCDEF);
10692       const __m128i vk5x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)));
10693       const __m128i vxk5x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x89ABCDEF), vk_zero_point);
10694       i5 += 16;
10695 
10696 
10697       const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
10698       const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
10699       const __m128i vprod5x89ABCDEFlo = _mm_mullo_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF);
10700       const __m128i vprod5x89ABCDEFhi = _mm_mulhi_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF);
10701 
10702       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
10703       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
10704       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod5x89ABCDEFlo, vprod5x89ABCDEFhi));
10705       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod5x89ABCDEFlo, vprod5x89ABCDEFhi));
10706 
10707       const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
10708       const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(vi6x01234567);
10709       const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)));
10710       const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x01234567), vk_zero_point);
10711       const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
10712       const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(vi6x89ABCDEF);
10713       const __m128i vk6x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)));
10714       const __m128i vxk6x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x89ABCDEF), vk_zero_point);
10715       i6 += 16;
10716 
10717 
10718       const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
10719       const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
10720       const __m128i vprod6x89ABCDEFlo = _mm_mullo_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF);
10721       const __m128i vprod6x89ABCDEFhi = _mm_mulhi_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF);
10722 
10723       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
10724       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
10725       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod6x89ABCDEFlo, vprod6x89ABCDEFhi));
10726       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod6x89ABCDEFlo, vprod6x89ABCDEFhi));
10727 
10728       const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
10729       const __m128i vxi7x01234567 = _mm_cvtepu8_epi16(vi7x01234567);
10730       const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)));
10731       const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x01234567), vk_zero_point);
10732       const __m128i vi7x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i7 + 8));
10733       const __m128i vxi7x89ABCDEF = _mm_cvtepu8_epi16(vi7x89ABCDEF);
10734       const __m128i vk7x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)));
10735       const __m128i vxk7x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x89ABCDEF), vk_zero_point);
10736       i7 += 16;
10737 
10738 
10739       const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
10740       const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
10741       const __m128i vprod7x89ABCDEFlo = _mm_mullo_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF);
10742       const __m128i vprod7x89ABCDEFhi = _mm_mulhi_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF);
10743 
10744       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
10745       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
10746       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod7x89ABCDEFlo, vprod7x89ABCDEFhi));
10747       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod7x89ABCDEFlo, vprod7x89ABCDEFhi));
10748 
10749       const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
10750       const __m128i vxi8x01234567 = _mm_cvtepu8_epi16(vi8x01234567);
10751       const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)));
10752       const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x01234567), vk_zero_point);
10753       const __m128i vi8x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i8 + 8));
10754       const __m128i vxi8x89ABCDEF = _mm_cvtepu8_epi16(vi8x89ABCDEF);
10755       const __m128i vk8x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)));
10756       const __m128i vxk8x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x89ABCDEF), vk_zero_point);
10757       i8 += 16;
10758 
10759 
10760       const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
10761       const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
10762       const __m128i vprod8x89ABCDEFlo = _mm_mullo_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF);
10763       const __m128i vprod8x89ABCDEFhi = _mm_mulhi_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF);
10764 
10765       vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
10766       vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
10767       vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod8x89ABCDEFlo, vprod8x89ABCDEFhi));
10768       vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod8x89ABCDEFlo, vprod8x89ABCDEFhi));
10769 
10770       w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t));
10771 
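      // Requantization: int32 accumulators -> float, multiply by the per-tensor scale,
      // clamp to output_max_less_zero_point, convert back to int32 with rounding, add
      // the output zero point, and pack/saturate to uint8.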
10772       __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
10773       __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
10774       __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
10775       __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
10776 
10777       const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
10778       vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
10779       vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
10780       vscaled89AB = _mm_mul_ps(vscaled89AB, vscale);
10781       vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscale);
10782 
10783       const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
10784       vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
10785       vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
10786       vscaled89AB = _mm_min_ps(vscaled89AB, voutput_max_less_zero_point);
10787       vscaledCDEF = _mm_min_ps(vscaledCDEF, voutput_max_less_zero_point);
10788 
10789       vacc0123 = _mm_cvtps_epi32(vscaled0123);
10790       vacc4567 = _mm_cvtps_epi32(vscaled4567);
10791       vacc89AB = _mm_cvtps_epi32(vscaled89AB);
10792       vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
10793 
10794       const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
10795       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
10796       __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
10797 
10798       __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
10799 
10800       const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
10801       vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
10802 
10803       _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
10804       output += 16;
10805     }
10806     if XNN_UNLIKELY(c != 0) {
10807       const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
10808       do {
10809         __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
10810         __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
10811 
10812 
10813         const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
10814         const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(vi0x01234567);
10815         const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k);
10816         const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x01234567), vk_zero_point);
10817         i0 += 8;
10818 
10819 
10820         const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
10821         const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
10822 
10823         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
10824         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
10825 
10826         const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
10827         const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(vi1x01234567);
10828         const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 16));
10829         const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x01234567), vk_zero_point);
10830         i1 += 8;
10831 
10832 
10833         const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
10834         const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
10835 
10836         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
10837         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
10838 
10839         const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
10840         const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(vi2x01234567);
10841         const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 32));
10842         const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x01234567), vk_zero_point);
10843         i2 += 8;
10844 
10845 
10846         const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
10847         const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
10848 
10849         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
10850         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
10851 
10852         const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
10853         const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(vi3x01234567);
10854         const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) (k + 48));
10855         const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x01234567), vk_zero_point);
10856         i3 += 8;
10857 
10858 
10859         const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
10860         const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
10861 
10862         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
10863         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
10864 
10865         const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
10866         const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(vi4x01234567);
10867         const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) (k + 64));
10868         const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x01234567), vk_zero_point);
10869         i4 += 8;
10870 
10871 
10872         const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
10873         const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
10874 
10875         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
10876         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
10877 
10878         const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
10879         const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(vi5x01234567);
10880         const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) (k + 80));
10881         const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x01234567), vk_zero_point);
10882         i5 += 8;
10883 
10884 
10885         const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
10886         const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
10887 
10888         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
10889         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
10890 
10891         const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
10892         const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(vi6x01234567);
10893         const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) (k + 96));
10894         const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x01234567), vk_zero_point);
10895         i6 += 8;
10896 
10897 
10898         const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
10899         const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
10900 
10901         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
10902         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
10903 
10904         const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
10905         const __m128i vxi7x01234567 = _mm_cvtepu8_epi16(vi7x01234567);
10906         const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) (k + 112));
10907         const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x01234567), vk_zero_point);
10908         i7 += 8;
10909 
10910 
10911         const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
10912         const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
10913 
10914         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
10915         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
10916 
10917         const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
10918         const __m128i vxi8x01234567 = _mm_cvtepu8_epi16(vi8x01234567);
10919         const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) (k + 128));
10920         const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x01234567), vk_zero_point);
10921         i8 += 8;
10922 
10923 
10924         const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
10925         const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
10926 
10927         vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
10928         vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
10929 
10930         k += 8;
10931 
10932         __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
10933         __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
10934 
10935         const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
10936         vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
10937         vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
10938 
10939         const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
10940         vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
10941         vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
10942 
10943         vacc0123 = _mm_cvtps_epi32(vscaled0123);
10944         vacc4567 = _mm_cvtps_epi32(vscaled4567);
10945 
10946         w = (const void*) ((const int32_t*) w + 8);
10947 
10948         const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
10949         __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
10950 
10951         __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
10952 
10953         vout0123456701234567 = _mm_max_epu8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
10954 
10955         if XNN_LIKELY(c >= 8) {
10956           _mm_storel_epi64((__m128i*) output, vout0123456701234567);
10957           output += 8;
10958           c -= 8;
10959         } else {
10960           if (c & 4) {
10961             unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
10962             vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
10963             output += 4;
10964           }
10965           if (c & 2) {
10966             unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
10967             vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
10968             output += 2;
10969           }
10970           if (c & 1) {
10971             *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
10972             output += 1;
10973           }
10974           c = 0;
10975         }
10976       } while (c != 0);
10977     }
10978 
10979     output = (uint8_t*) ((uintptr_t) output + output_increment);
10980   } while (--output_width != 0);
10981 }
10982 
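// QU8 -> F32 dequantization: widens uint8 inputs to int32, adds the negated zero point,
// converts to float, and multiplies by the scale; 32 elements per main-loop iteration,
// with a 4-element loop and a 1-3 element tail.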
10983 void xnn_qu8_f32_vcvt_ukernel__avx_x32(
10984     size_t n,
10985     const uint8_t* x,
10986     float* y,
10987     const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
10988 {
10989   assert(n != 0);
10990   assert(n % sizeof(uint8_t) == 0);
10991   assert(x != NULL);
10992   assert(y != NULL);
10993 
10994   const __m128i vminus_zero_point = _mm_load_si128((const __m128i*) params->avx.minus_zero_point);
10995   const __m256 vscale = _mm256_load_ps(params->avx.scale);
10996   for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
10997     __m128i vx0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
10998     __m128i vx4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 4)));
10999     __m128i vx89AB = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 8)));
11000     __m128i vxCDEF = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 12)));
11001     __m128i vxGHIJ = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 16)));
11002     __m128i vxKLMN = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 20)));
11003     __m128i vxOPQR = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 24)));
11004     __m128i vxSTUV = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 28)));
11005     x += 32;
11006 
11007     vx0123 = _mm_add_epi32(vx0123, vminus_zero_point);
11008     vx4567 = _mm_add_epi32(vx4567, vminus_zero_point);
11009     vx89AB = _mm_add_epi32(vx89AB, vminus_zero_point);
11010     vxCDEF = _mm_add_epi32(vxCDEF, vminus_zero_point);
11011     vxGHIJ = _mm_add_epi32(vxGHIJ, vminus_zero_point);
11012     vxKLMN = _mm_add_epi32(vxKLMN, vminus_zero_point);
11013     vxOPQR = _mm_add_epi32(vxOPQR, vminus_zero_point);
11014     vxSTUV = _mm_add_epi32(vxSTUV, vminus_zero_point);
11015 
11016     const __m256i vx01234567 = _mm256_insertf128_si256(_mm256_castsi128_si256(vx0123), vx4567, 1);
11017     const __m256i vx89ABCDEF = _mm256_insertf128_si256(_mm256_castsi128_si256(vx89AB), vxCDEF, 1);
11018     const __m256i vxGHIJKLMN = _mm256_insertf128_si256(_mm256_castsi128_si256(vxGHIJ), vxKLMN, 1);
11019     const __m256i vxOPQRSTUV = _mm256_insertf128_si256(_mm256_castsi128_si256(vxOPQR), vxSTUV, 1);
11020 
11021     __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
11022     __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
11023     __m256 vyGHIJKLMN = _mm256_cvtepi32_ps(vxGHIJKLMN);
11024     __m256 vyOPQRSTUV = _mm256_cvtepi32_ps(vxOPQRSTUV);
11025 
11026     vy01234567 = _mm256_mul_ps(vy01234567, vscale);
11027     vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
11028     vyGHIJKLMN = _mm256_mul_ps(vyGHIJKLMN, vscale);
11029     vyOPQRSTUV = _mm256_mul_ps(vyOPQRSTUV, vscale);
11030 
11031     _mm256_storeu_ps(y, vy01234567);
11032     _mm256_storeu_ps(y + 8, vy89ABCDEF);
11033     _mm256_storeu_ps(y + 16, vyGHIJKLMN);
11034     _mm256_storeu_ps(y + 24, vyOPQRSTUV);
11035     y += 32;
11036   }
11037   for (; n >= 4 * sizeof(uint8_t); n -= 4 * sizeof(uint8_t)) {
11038     __m128i vx = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
11039     vx = _mm_add_epi32(vx, vminus_zero_point);
11040     x += 4;
11041 
11042     __m128 vy = _mm_cvtepi32_ps(vx);
11043     vy = _mm_mul_ps(vy, _mm256_castps256_ps128(vscale));
11044 
11045     _mm_storeu_ps(y, vy);
11046     y += 4;
11047   }
11048   if XNN_UNLIKELY(n != 0) {
11049     assert(n >= 1 * sizeof(uint8_t));
11050     assert(n <= 3 * sizeof(uint8_t));
11051 
11052     __m128i vx = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
11053     vx = _mm_add_epi32(vx, vminus_zero_point);
11054 
11055     __m128 vy = _mm_cvtepi32_ps(vx);
11056     vy = _mm_mul_ps(vy, _mm256_castps256_ps128(vscale));
11057 
11058     if (n & (2 * sizeof(uint8_t))) {
11059       _mm_storel_pi((__m64*) y, vy);
11060       vy = _mm_movehl_ps(vy, vy);
11061       y += 2;
11062     }
11063     if (n & (1 * sizeof(uint8_t))) {
11064       _mm_store_ss(y, vy);
11065     }
11066   }
11067 }
11068 
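// QU8 GEMM microkernel (1 row x 4 columns, c8 layout): 8 K-elements per step, 128-bit
// loads of packed weights, int16 dot products via _mm_madd_epi16, and fp32
// requantization of the results.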
11069 void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
11070     size_t mr,
11071     size_t nc,
11072     size_t kc,
11073     const uint8_t* restrict a,
11074     size_t a_stride,
11075     const void* restrict w,
11076     uint8_t* restrict c,
11077     size_t cm_stride,
11078     size_t cn_stride,
11079     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11080 {
11081   assert(mr != 0);
11082   assert(mr <= 1);
11083   assert(nc != 0);
11084   assert(kc != 0);
11085   assert(kc % sizeof(uint8_t) == 0);
11086   assert(a != NULL);
11087   assert(w != NULL);
11088   assert(c != NULL);
11089 
11090   kc = round_up_po2(kc, 8);
11091   const uint8_t* a0 = a;
11092   uint8_t* c0 = c;
11093 
11094   do {
11095     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
11096     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
11097     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
11098     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
11099     w = (const int32_t*) w + 4;
11100 
11101     size_t k = 0;
11102     const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
11103     const __m128i vzero = _mm_setzero_si128();
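    // K loop: load 8 uint8 values of A, widen to int16, and multiply-accumulate against
    // the 4 packed, zero-point-adjusted B columns with _mm_madd_epi16.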
11104     while (k < kc) {
11105       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
11106       const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
11107       a0 += 8;
11108 
11109       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
11110       const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb01, vzero), vb_zero_point);
11111       const __m128i vxb1 = _mm_sub_epi16(_mm_unpackhi_epi8(vb01, vzero), vb_zero_point);
11112 
11113       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
11114       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
11115       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
11116       const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb23, vzero), vb_zero_point);
11117       const __m128i vxb3 = _mm_sub_epi16(_mm_unpackhi_epi8(vb23, vzero), vb_zero_point);
11118 
11119       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
11120       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
11121 
11122       w = (const void*) ((const uint8_t*) w + 32);
11123       k += 8 * sizeof(uint8_t);
11124     }
11125 
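    // Reduce the four per-column accumulators: two rounds of pairwise horizontal adds
    // collapse each __m128i to a single int32 sum, one lane per output column.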
11126     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
11127     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
11128 
11129     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
11130 
11131     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
11132 
11133     const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
11134     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
11135 
11136     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
11137     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
11138 
11139     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
11140 
11141     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
11142     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
11143 
11144     __m128i vout = _mm_packus_epi16(vacc00x0123, vacc00x0123);
11145 
11146     vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
11147 
11148     if (nc >= 4) {
11149       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
11150 
11151       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
11152 
11153       a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
11154 
11155       nc -= 4;
11156     } else {
11157       if (nc & 2) {
11158         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
11159         c0 += 2;
11160         vout = _mm_srli_epi32(vout, 16);
11161       }
11162       if (nc & 1) {
11163         *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
11164       }
11165 
11166       nc = 0;
11167     }
11168   } while (nc != 0);
11169 }
11170 
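// QU8 GEMM microkernel (2 rows x 4 columns, c8 layout): same scheme as the 1x4 kernel
// above with a second row of accumulators; row 1 falls back to the row 0 pointers when
// mr != 2.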
11171 void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
11172     size_t mr,
11173     size_t nc,
11174     size_t kc,
11175     const uint8_t* restrict a,
11176     size_t a_stride,
11177     const void* restrict w,
11178     uint8_t* restrict c,
11179     size_t cm_stride,
11180     size_t cn_stride,
11181     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11182 {
11183   assert(mr != 0);
11184   assert(mr <= 2);
11185   assert(nc != 0);
11186   assert(kc != 0);
11187   assert(kc % sizeof(uint8_t) == 0);
11188   assert(a != NULL);
11189   assert(w != NULL);
11190   assert(c != NULL);
11191 
11192   kc = round_up_po2(kc, 8);
11193   const uint8_t* a0 = a;
11194   uint8_t* c0 = c;
11195   const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
11196   uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
11197   if XNN_UNPREDICTABLE(mr != 2) {
11198     a1 = a0;
11199     c1 = c0;
11200   }
11201 
11202   do {
11203     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
11204     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
11205     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
11206     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
11207     __m128i vacc1x0 = vacc0x0;
11208     __m128i vacc1x1 = vacc0x1;
11209     __m128i vacc1x2 = vacc0x2;
11210     __m128i vacc1x3 = vacc0x3;
11211     w = (const int32_t*) w + 4;
11212 
11213     size_t k = 0;
11214     const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
11215     const __m128i vzero = _mm_setzero_si128();
11216     while (k < kc) {
11217       const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
11218       const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
11219       a0 += 8;
11220       const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
11221       const __m128i vxa1 = _mm_cvtepu8_epi16(va1);
11222       a1 += 8;
11223 
11224       const __m128i vb01 = _mm_load_si128((const __m128i*) w);
11225       const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb01, vzero), vb_zero_point);
11226       const __m128i vxb1 = _mm_sub_epi16(_mm_unpackhi_epi8(vb01, vzero), vb_zero_point);
11227 
11228       vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
11229       vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
11230       vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
11231       vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
11232       const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
11233       const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb23, vzero), vb_zero_point);
11234       const __m128i vxb3 = _mm_sub_epi16(_mm_unpackhi_epi8(vb23, vzero), vb_zero_point);
11235 
11236       vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
11237       vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
11238       vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
11239       vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
11240 
11241       w = (const void*) ((const uint8_t*) w + 32);
11242       k += 8 * sizeof(uint8_t);
11243     }
11244 
11245     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
11246     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
11247     const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
11248     const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
11249 
11250     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
11251     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
11252 
11253     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
11254     __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
11255 
11256     const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
11257     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
11258     vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
11259 
11260     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
11261     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
11262     vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
11263 
11264     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
11265     vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
11266 
11267     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
11268     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
11269 
11270     __m128i vout = _mm_packus_epi16(vacc01x0123, vacc01x0123);
11271 
11272     vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
11273 
11274     if (nc >= 4) {
11275       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
11276       unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
11277 
11278       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
11279       c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
11280 
11281       a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
11282       a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
11283 
11284       nc -= 4;
11285     } else {
11286       if (nc & 2) {
11287         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
11288         c0 += 2;
11289         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
11290         c1 += 2;
11291         vout = _mm_srli_epi32(vout, 16);
11292       }
11293       if (nc & 1) {
11294         *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
11295         *c1 = (uint8_t) _mm_extract_epi8(vout, 4);
11296       }
11297 
11298       nc = 0;
11299     }
11300   } while (nc != 0);
11301 }
11302 
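// QU8 IGEMM microkernel (1x4, c8 layout): indirect variant of the GEMM above. A is an
// array of row pointers (indirection buffer); pointers equal to `zero` are not offset,
// so padding rows read from the zero buffer.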
11303 void xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
11304     size_t mr,
11305     size_t nc,
11306     size_t kc,
11307     size_t ks,
11308     const uint8_t** restrict a,
11309     const void* restrict w,
11310     uint8_t* restrict c,
11311     size_t cm_stride,
11312     size_t cn_stride,
11313     size_t a_offset,
11314     const uint8_t* zero,
11315     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11316 {
11317   assert(mr != 0);
11318   assert(mr <= 1);
11319   assert(nc != 0);
11320   assert(kc != 0);
11321   assert(ks != 0);
11322   assert(ks % (1 * sizeof(void*)) == 0);
11323   assert(a_offset % sizeof(uint8_t) == 0);
11324   assert(a != NULL);
11325   assert(w != NULL);
11326   assert(c != NULL);
11327 
11328   kc = round_up_po2(kc, 8);
11329   uint8_t* c0 = c;
11330 
11331   do {
11332     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
11333     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
11334     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
11335     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
11336     w = (const int32_t*) w + 4;
11337 
11338     size_t p = ks;
11339     do {
11340       const uint8_t* restrict a0 = a[0];
11341       if XNN_UNPREDICTABLE(a0 != zero) {
11342         a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
11343       }
11344       a += 1;
11345 
11346       size_t k = 0;
11347       const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
11348       const __m128i vzero = _mm_setzero_si128();
11349       while (k < kc) {
11350         const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
11351         const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
11352         a0 += 8;
11353 
11354         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
11355         const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb01, vzero), vb_zero_point);
11356         const __m128i vxb1 = _mm_sub_epi16(_mm_unpackhi_epi8(vb01, vzero), vb_zero_point);
11357 
11358         vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
11359         vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
11360         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
11361         const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb23, vzero), vb_zero_point);
11362         const __m128i vxb3 = _mm_sub_epi16(_mm_unpackhi_epi8(vb23, vzero), vb_zero_point);
11363 
11364         vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
11365         vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
11366 
11367         w = (const void*) ((const uint8_t*) w + 32);
11368         k += 8 * sizeof(uint8_t);
11369       }
11370       p -= 1 * sizeof(void*);
11371     } while (p != 0);
11372 
11373     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
11374     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
11375 
11376     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
11377 
11378     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
11379 
11380     const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
11381     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
11382 
11383     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
11384     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
11385 
11386     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
11387 
11388     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
11389     __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
11390 
11391     __m128i vout = _mm_packus_epi16(vacc00x0123, vacc00x0123);
11392 
11393     vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
11394 
11395     if (nc >= 4) {
11396       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
11397       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
11398 
11399       a = (const uint8_t**restrict) ((uintptr_t) a - ks);
11400 
11401       nc -= 4;
11402     } else {
11403       if (nc & 2) {
11404         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
11405         c0 += 2;
11406         vout = _mm_srli_epi32(vout, 16);
11407       }
11408       if (nc & 1) {
11409         *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
11410       }
11411 
11412       nc = 0;
11413     }
11414   } while (nc != 0);
11415 }
11416 
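// QU8 IGEMM microkernel (2x4, c8 layout): two-row indirect variant; consumes two row
// pointers per ks step and mirrors the 2x4 GEMM requantization path.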
11417 void xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
11418     size_t mr,
11419     size_t nc,
11420     size_t kc,
11421     size_t ks,
11422     const uint8_t** restrict a,
11423     const void* restrict w,
11424     uint8_t* restrict c,
11425     size_t cm_stride,
11426     size_t cn_stride,
11427     size_t a_offset,
11428     const uint8_t* zero,
11429     const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11430 {
11431   assert(mr != 0);
11432   assert(mr <= 2);
11433   assert(nc != 0);
11434   assert(kc != 0);
11435   assert(ks != 0);
11436   assert(ks % (2 * sizeof(void*)) == 0);
11437   assert(a_offset % sizeof(uint8_t) == 0);
11438   assert(a != NULL);
11439   assert(w != NULL);
11440   assert(c != NULL);
11441 
11442   kc = round_up_po2(kc, 8);
11443   uint8_t* c0 = c;
11444   uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
11445   if XNN_UNPREDICTABLE(mr != 2) {
11446     c1 = c0;
11447   }
11448 
11449   do {
11450     __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
11451     __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
11452     __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
11453     __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
11454     __m128i vacc1x0 = vacc0x0;
11455     __m128i vacc1x1 = vacc0x1;
11456     __m128i vacc1x2 = vacc0x2;
11457     __m128i vacc1x3 = vacc0x3;
11458     w = (const int32_t*) w + 4;
11459 
11460     size_t p = ks;
11461     do {
11462       const uint8_t* restrict a0 = a[0];
11463       if XNN_UNPREDICTABLE(a0 != zero) {
11464         a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
11465       }
11466       const uint8_t* restrict a1 = a[1];
11467       if XNN_UNPREDICTABLE(a1 != zero) {
11468         a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
11469       }
11470       a += 2;
11471 
11472       size_t k = 0;
11473       const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
11474       const __m128i vzero = _mm_setzero_si128();
11475       while (k < kc) {
11476         const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
11477         const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
11478         a0 += 8;
11479         const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
11480         const __m128i vxa1 = _mm_cvtepu8_epi16(va1);
11481         a1 += 8;
11482 
11483         const __m128i vb01 = _mm_load_si128((const __m128i*) w);
11484         const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb01, vzero), vb_zero_point);
11485         const __m128i vxb1 = _mm_sub_epi16(_mm_unpackhi_epi8(vb01, vzero), vb_zero_point);
11486 
11487         vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
11488         vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
11489         vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
11490         vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
11491         const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
11492         const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb23, vzero), vb_zero_point);
11493         const __m128i vxb3 = _mm_sub_epi16(_mm_unpackhi_epi8(vb23, vzero), vb_zero_point);
11494 
11495         vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
11496         vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
11497         vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
11498         vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
11499 
11500         w = (const void*) ((const uint8_t*) w + 32);
11501         k += 8 * sizeof(uint8_t);
11502       }
11503       p -= 2 * sizeof(void*);
11504     } while (p != 0);
11505 
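    // Reduce the per-column accumulators with horizontal adds, then apply fp32
    // requantization: scale the int32 sums, clamp against (output_max - zero_point),
    // round-convert back to int32, add the output zero point with 16-bit saturation,
    // and pack to unsigned 8-bit with a final lower clamp against output_min.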
11506     const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
11507     const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
11508     const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
11509     const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
11510 
11511     __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
11512     __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
11513 
11514     __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
11515     __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
11516 
11517     const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
11518     vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
11519     vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
11520 
11521     const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
11522     vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
11523     vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
11524 
11525     vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
11526     vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
11527 
11528     const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
11529     __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
11530 
11531     __m128i vout = _mm_packus_epi16(vacc01x0123, vacc01x0123);
11532 
11533     vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
11534 
11535     if (nc >= 4) {
11536       unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
11537       c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
11538       unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
11539       c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
11540 
11541       a = (const uint8_t**restrict) ((uintptr_t) a - ks);
11542 
11543       nc -= 4;
11544     } else {
11545       if (nc & 2) {
11546         unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
11547         c1 += 2;
11548         unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
11549         c0 += 2;
11550         vout = _mm_srli_epi32(vout, 16);
11551       }
11552       if (nc & 1) {
11553         *c1 = (uint8_t) _mm_extract_epi8(vout, 4);
11554         *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
11555       }
11556 
11557       nc = 0;
11558     }
11559   } while (nc != 0);
11560 }
11561 
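// QU8 element-wise addition with requantization. Per element this roughly computes
// (scalar sketch of the fixed-point math, ignoring the exact saturation order of
// the packing steps):
//
//   int32_t acc = bias + (int32_t) a * a_multiplier + (int32_t) b * b_multiplier;
//   acc >>= shift;                                     // arithmetic shift right
//   uint8_t out = clamp(acc + output_zero_point, output_min, output_max);
//
// The kernel handles 8 elements per main-loop iteration, widening bytes to 32 bits
// with _mm_cvtepu8_epi32 and using _mm_mullo_epi32 for the multiplies.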
11562 void xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_x8(
11563     size_t n,
11564     const uint8_t* input_a,
11565     const uint8_t* input_b,
11566     uint8_t* output,
11567     const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11568 {
11569   const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4.bias);
11570   const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4.a_multiplier);
11571   const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4.b_multiplier);
11572   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
11573   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
11574   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
11575   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
11576 
11577   for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
11578     const __m128i va0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a)));
11579     const __m128i vb0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_b)));
11580     const __m128i va4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a + 4)));
11581     const __m128i vb4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_b + 4)));
11582     input_a += 8;
11583     input_b += 8;
11584 
11585     __m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
11586     __m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
11587 
11588     vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
11589     vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
11590 
11591     vacc0123 = _mm_sra_epi32(vacc0123, vshift);
11592     vacc4567 = _mm_sra_epi32(vacc4567, vshift);
11593 
11594     const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11595 
11596     __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
11597 
11598     vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
11599 
11600     vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
11601 
11602     _mm_storel_epi64((__m128i*) output, vout0123456701234567);
11603     output += 8;
11604   }
11605   if XNN_UNLIKELY(n != 0) {
11606     {
11607       const __m128i va0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a)));
11608       const __m128i vb0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_b)));
11609       const __m128i va4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a + 4)));
11610       const __m128i vb4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_b + 4)));
11611 
11612       __m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
11613       __m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
11614 
11615       vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
11616       vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
11617 
11618       vacc0123 = _mm_sra_epi32(vacc0123, vshift);
11619       vacc4567 = _mm_sra_epi32(vacc4567, vshift);
11620 
11621       const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11622 
11623       __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
11624       vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
11625       vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
11626 
11627       if (n & (4 * sizeof(uint8_t))) {
11628         unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
11629         vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
11630         output += 4;
11631       }
11632       if (n & (2 * sizeof(uint8_t))) {
11633         unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
11634         vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
11635         output += 2;
11636       }
11637       if (n & (1 * sizeof(uint8_t))) {
11638         *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
11639       }
11640     }
11641   }
11642 }
11643 
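// Variant of the QU8 addition kernel above for a broadcast operand B: the single
// b * b_multiplier term is folded into the bias once, before the loop, so the
// per-element work reduces to acc = (bias + b * b_multiplier) + a * a_multiplier,
// followed by the same shift and clamp as in the non-broadcast kernel.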
11644 void xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8(
11645     size_t n,
11646     const uint8_t* input_a,
11647     const uint8_t* input_b,
11648     uint8_t* output,
11649     const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11650 {
11651   const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4.a_multiplier);
11652   const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
11653   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
11654   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
11655   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
11656 
11657   __m128i vbias = _mm_cvtsi32_si128(params->sse4.b_multiplier[0] * (int32_t) *input_b);
11658   vbias = _mm_shuffle_epi32(vbias, _MM_SHUFFLE(0, 0, 0, 0));
11659   vbias = _mm_add_epi32(vbias, _mm_load_si128((const __m128i*) params->sse4.bias));
11660   for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
11661     const __m128i va0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a)));
11662     const __m128i va4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a + 4)));
11663     input_a += 8;
11664     input_b += 8;
11665 
11666     __m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
11667     __m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
11668 
11669     vacc0123 = _mm_sra_epi32(vacc0123, vshift);
11670     vacc4567 = _mm_sra_epi32(vacc4567, vshift);
11671 
11672     const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11673 
11674     __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
11675 
11676     vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
11677 
11678     vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
11679 
11680     _mm_storel_epi64((__m128i*) output, vout0123456701234567);
11681     output += 8;
11682   }
11683   if XNN_UNLIKELY(n != 0) {
11684     {
11685       const __m128i va0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a)));
11686       const __m128i va4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a + 4)));
11687 
11688       __m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
11689       __m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
11690 
11691       vacc0123 = _mm_sra_epi32(vacc0123, vshift);
11692       vacc4567 = _mm_sra_epi32(vacc4567, vshift);
11693 
11694       const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11695 
11696       __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
11697       vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
11698       vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
11699 
11700       if (n & (4 * sizeof(uint8_t))) {
11701         unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
11702         vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
11703         output += 4;
11704       }
11705       if (n & (2 * sizeof(uint8_t))) {
11706         unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
11707         vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
11708         output += 2;
11709       }
11710       if (n & (1 * sizeof(uint8_t))) {
11711         *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
11712       }
11713     }
11714   }
11715 }
11716 
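// QU8 -> QU8 requantization (rescale to a new zero point and scale). For each
// element the kernel computes, in Q15 fixed point:
//
//   acc = (input_zero_point - x) << 7;
//   acc = _mm_mulhrs_epi16(acc, multiplier);   // (acc * multiplier + 2^14) >> 15
//   y   = clamp_u8(acc saturating-added to output_zero_point);
//
// 32 elements are processed per main-loop iteration, with 8-element and
// sub-8-element tails handled below.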
11717 void xnn_qu8_vcvt_ukernel__avx_x32(
11718     size_t n,
11719     const uint8_t* x,
11720     uint8_t* y,
11721     const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11722 {
11723   assert(n != 0);
11724   assert(n % sizeof(uint8_t) == 0);
11725   assert(x != NULL);
11726   assert(y != NULL);
11727 
11728   const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.input_zero_point);
11729   const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->ssse3.multiplier);
11730   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.output_zero_point);
11731   for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
11732     __m128i vacc0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
11733     __m128i vacc1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 8)));
11734     __m128i vacc2 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 16)));
11735     __m128i vacc3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 24)));
11736     x += 32;
11737 
11738     vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
11739     vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
11740     vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
11741     vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
11742 
11743     vacc0 = _mm_slli_epi16(vacc0, 7);
11744     vacc1 = _mm_slli_epi16(vacc1, 7);
11745     vacc2 = _mm_slli_epi16(vacc2, 7);
11746     vacc3 = _mm_slli_epi16(vacc3, 7);
11747 
11748     vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier);
11749     vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier);
11750     vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier);
11751     vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier);
11752 
11753     vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
11754     vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
11755     vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
11756     vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
11757 
11758     const __m128i vy0 = _mm_packus_epi16(vacc0, vacc1);
11759     const __m128i vy1 = _mm_packus_epi16(vacc2, vacc3);
11760 
11761     _mm_storeu_si128((__m128i*) y, vy0);
11762     _mm_storeu_si128((__m128i*) (y + 16), vy1);
11763     y += 32;
11764   }
11765   for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
11766     __m128i vacc = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
11767     vacc = _mm_sub_epi16(vinput_zero_point, vacc);
11768     vacc = _mm_slli_epi16(vacc, 7);
11769     vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
11770     vacc = _mm_adds_epi16(vacc, voutput_zero_point);
11771     x += 8;
11772 
11773     const __m128i vy = _mm_packus_epi16(vacc, vacc);
11774     _mm_storel_epi64((__m128i*) y, vy);
11775     y += 8;
11776   }
11777   if XNN_UNLIKELY(n != 0) {
11778     assert(n >= 1 * sizeof(uint8_t));
11779     assert(n <= 7 * sizeof(uint8_t));
11780 
11781     __m128i vacc = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
11782     vacc = _mm_sub_epi16(vinput_zero_point, vacc);
11783     vacc = _mm_slli_epi16(vacc, 7);
11784     vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
11785     vacc = _mm_adds_epi16(vacc, voutput_zero_point);
11786 
11787     __m128i vy = _mm_packus_epi16(vacc, vacc);
11788     if (n & (4 * sizeof(uint8_t))) {
11789       _mm_storeu_si32(y, vy);
11790       vy = _mm_srli_epi64(vy, 32);
11791       y += 4;
11792     }
11793     if (n & (2 * sizeof(uint8_t))) {
11794       _mm_storeu_si16(y, vy);
11795       vy = _mm_srli_epi32(vy, 16);
11796       y += 2;
11797     }
11798     if (n & (1 * sizeof(uint8_t))) {
11799       *y = (uint8_t) _mm_extract_epi8(vy, 0);
11800     }
11801   }
11802 }
11803 
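// QU8 leaky ReLU. Elements are compared against input_zero_point and the
// per-element multiplier is selected with _mm_blendv_epi8: positive_multiplier for
// values above the zero point, negative_multiplier otherwise. The selected
// multiplier is then applied with the same Q15 fixed-point rescale
// (_mm_mulhrs_epi16) used by the QU8 convert kernel above.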
11804 void xnn_qu8_vlrelu_ukernel__avx_x32(
11805     size_t n,
11806     const uint8_t* x,
11807     uint8_t* y,
11808     const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11809 {
11810   assert(n != 0);
11811   assert(n % sizeof(uint8_t) == 0);
11812   assert(x != NULL);
11813   assert(y != NULL);
11814 
11815   const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->avx.input_zero_point);
11816   const __m128i vpositive_multiplier = _mm_load_si128((const __m128i*) params->avx.positive_multiplier);
11817   const __m128i vnegative_multiplier = _mm_load_si128((const __m128i*) params->avx.negative_multiplier);
11818   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
11819   for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
11820     __m128i vacc0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
11821     __m128i vacc1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 8)));
11822     __m128i vacc2 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 16)));
11823     __m128i vacc3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 24)));
11824     x += 32;
11825 
11826     __m128i vmultiplier0 = _mm_cmpgt_epi16(vacc0, vinput_zero_point);
11827     vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
11828     __m128i vmultiplier1 = _mm_cmpgt_epi16(vacc1, vinput_zero_point);
11829     vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
11830     __m128i vmultiplier2 = _mm_cmpgt_epi16(vacc2, vinput_zero_point);
11831     vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
11832     __m128i vmultiplier3 = _mm_cmpgt_epi16(vacc3, vinput_zero_point);
11833     vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
11834 
11835     vmultiplier0 = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier0);
11836     vacc0 = _mm_slli_epi16(vacc0, 7);
11837     vmultiplier1 = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier1);
11838     vacc1 = _mm_slli_epi16(vacc1, 7);
11839     vmultiplier2 = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier2);
11840     vacc2 = _mm_slli_epi16(vacc2, 7);
11841     vmultiplier3 = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier3);
11842     vacc3 = _mm_slli_epi16(vacc3, 7);
11843 
11844     vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier0);
11845     vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier1);
11846     vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier2);
11847     vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier3);
11848 
11849     vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
11850     vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
11851     vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
11852     vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
11853 
11854     const __m128i vy0 = _mm_packus_epi16(vacc0, vacc1);
11855     const __m128i vy1 = _mm_packus_epi16(vacc2, vacc3);
11856 
11857     _mm_storeu_si128((__m128i*) y, vy0);
11858     _mm_storeu_si128((__m128i*) (y + 16), vy1);
11859     y += 32;
11860   }
11861   for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
11862     __m128i vacc = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
11863     __m128i vmultiplier = _mm_cmpgt_epi16(vacc, vinput_zero_point);
11864     vacc = _mm_sub_epi16(vinput_zero_point, vacc);
11865     vmultiplier = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier);
11866     vacc = _mm_slli_epi16(vacc, 7);
11867     vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
11868     vacc = _mm_adds_epi16(vacc, voutput_zero_point);
11869     x += 8;
11870 
11871     const __m128i vy = _mm_packus_epi16(vacc, vacc);
11872     _mm_storel_epi64((__m128i*) y, vy);
11873     y += 8;
11874   }
11875   if XNN_UNLIKELY(n != 0) {
11876     assert(n >= 1 * sizeof(uint8_t));
11877     assert(n <= 7 * sizeof(uint8_t));
11878 
11879     __m128i vacc = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
11880     __m128i vmultiplier = _mm_cmpgt_epi16(vacc, vinput_zero_point);
11881     vacc = _mm_sub_epi16(vinput_zero_point, vacc);
11882     vmultiplier = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier);
11883     vacc = _mm_slli_epi16(vacc, 7);
11884     vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
11885     vacc = _mm_adds_epi16(vacc, voutput_zero_point);
11886 
11887     __m128i vy = _mm_packus_epi16(vacc, vacc);
11888     if (n & (4 * sizeof(uint8_t))) {
11889       _mm_storeu_si32(y, vy);
11890       vy = _mm_srli_epi64(vy, 32);
11891       y += 4;
11892     }
11893     if (n & (2 * sizeof(uint8_t))) {
11894       _mm_storeu_si16(y, vy);
11895       vy = _mm_srli_epi32(vy, 16);
11896       y += 2;
11897     }
11898     if (n & (1 * sizeof(uint8_t))) {
11899       *y = (uint8_t) _mm_extract_epi8(vy, 0);
11900     }
11901   }
11902 }
11903 
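// QU8 element-wise multiplication with fp32 requantization. Inputs are widened to
// 16 bits, re-centered by their zero points, and multiplied with the
// _mm_mullo_epi16/_mm_mulhi_epi16 pair; interleaving the low and high halves
// reconstructs the full 32-bit products. The products are then converted to float,
// multiplied by the requantization scale from the params, rounded back to int32,
// and packed to uint8 with the usual zero-point/min/max clamping.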
11904 void xnn_qu8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16(
11905     size_t n,
11906     const uint8_t* input_a,
11907     const uint8_t* input_b,
11908     uint8_t* output,
11909     const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11910 
11911 {
11912   const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.a_zero_point);
11913   const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.b_zero_point);
11914   const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
11915   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
11916   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
11917   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
11918 
11919   for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
11920     const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
11921     const __m128i vb01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
11922     const __m128i va89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
11923     const __m128i vb89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (input_b + 8)));
11924     input_a += 16;
11925     input_b += 16;
11926 
11927 
11928     const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
11929     const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
11930     const __m128i vxa89ABCDEF = _mm_sub_epi16(va89ABCDEF, va_zero_point);
11931     const __m128i vxb89ABCDEF = _mm_sub_epi16(vb89ABCDEF, vb_zero_point);
11932 
11933     const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
11934     const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
11935     const __m128i vprod89ABCDEFlo = _mm_mullo_epi16(vxa89ABCDEF, vxb89ABCDEF);
11936     const __m128i vprod89ABCDEFhi = _mm_mulhi_epi16(vxa89ABCDEF, vxb89ABCDEF);
11937 
11938     const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
11939     const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
11940     const __m128i vprod89AB = _mm_unpacklo_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
11941     const __m128i vprodCDEF = _mm_unpackhi_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
11942 
11943     __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
11944     __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
11945     __m128 vfpacc89AB = _mm_cvtepi32_ps(vprod89AB);
11946     __m128 vfpaccCDEF = _mm_cvtepi32_ps(vprodCDEF);
11947 
11948     vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
11949     vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
11950     vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
11951     vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
11952 
11953     const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
11954     const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
11955     const __m128i vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
11956     const __m128i vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
11957 
11958     __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11959     __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
11960 
11961 
11962     __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
11963 
11964     vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
11965 
11966     vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
11967 
11968     _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
11969     output += 16;
11970   }
11971   if XNN_UNLIKELY(n != 0) {
11972     do {
11973       const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
11974       const __m128i vb01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
11975       input_a += 8;
11976       input_b += 8;
11977 
11978 
11979       const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
11980       const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
11981 
11982       const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
11983       const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
11984 
11985       const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
11986       const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
11987 
11988       __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
11989       __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
11990 
11991       vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
11992       vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
11993 
11994       const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
11995       const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
11996 
11997       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11998 
11999       __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
12000       vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
12001       vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
12002 
12003       if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
12004         _mm_storel_epi64((__m128i*) output, vout0123456701234567);
12005         output += 8;
12006         n -= 8 * sizeof(uint8_t);
12007       } else {
12008         if (n & (4 * sizeof(uint8_t))) {
12009           unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
12010           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
12011           output += 4;
12012         }
12013         if (n & (2 * sizeof(uint8_t))) {
12014           unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
12015           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
12016           output += 2;
12017         }
12018         if (n & (1 * sizeof(uint8_t))) {
12019           *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
12020         }
12021         n = 0;
12022       }
12023     } while (n != 0);
12024   }
12025 }
12026 
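// Broadcast variant of the QU8 multiplication kernel above: operand B is a single
// byte, so (b - b_zero_point) is computed once, splatted across a 16-bit vector
// (vxb), and reused for every block of operand A.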
12027 void xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16(
12028     size_t n,
12029     const uint8_t* input_a,
12030     const uint8_t* input_b,
12031     uint8_t* output,
12032     const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
12033 
12034 {
12035   const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.a_zero_point);
12036   const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
12037   const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
12038   const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
12039   const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
12040 
12041   __m128i vxb = _mm_sub_epi16(
12042     _mm_shuffle_epi32(_mm_cvtsi32_si128(UINT32_C(0x00010001) * (uint32_t) (uint16_t) (int16_t) *input_b), 0),
12043     _mm_load_si128((const __m128i*) params->fp32_sse2.b_zero_point));
12044   for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
12045     const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
12046     const __m128i va89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
12047     input_a += 16;
12048 
12049 
12050     const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
12051     const __m128i vxa89ABCDEF = _mm_sub_epi16(va89ABCDEF, va_zero_point);
12052 
12053     const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
12054     const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
12055     const __m128i vprod89ABCDEFlo = _mm_mullo_epi16(vxa89ABCDEF, vxb);
12056     const __m128i vprod89ABCDEFhi = _mm_mulhi_epi16(vxa89ABCDEF, vxb);
12057 
12058     const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
12059     const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
12060     const __m128i vprod89AB = _mm_unpacklo_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
12061     const __m128i vprodCDEF = _mm_unpackhi_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
12062 
12063     __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
12064     __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
12065     __m128 vfpacc89AB = _mm_cvtepi32_ps(vprod89AB);
12066     __m128 vfpaccCDEF = _mm_cvtepi32_ps(vprodCDEF);
12067 
12068     vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
12069     vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
12070     vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
12071     vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
12072 
12073     const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
12074     const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
12075     const __m128i vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
12076     const __m128i vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
12077 
12078     __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
12079     __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
12080 
12081 
12082     __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
12083 
12084     vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
12085 
12086     vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
12087 
12088     _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
12089     output += 16;
12090   }
12091   if XNN_UNLIKELY(n != 0) {
12092     do {
12093       const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
12094       input_a += 8;
12095 
12096 
12097       const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
12098 
12099       const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
12100       const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
12101 
12102       const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
12103       const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
12104 
12105       __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
12106       __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
12107 
12108       vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
12109       vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
12110 
12111       const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
12112       const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
12113 
12114       __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
12115 
12116       __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
12117       vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
12118       vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
12119 
12120       if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
12121         _mm_storel_epi64((__m128i*) output, vout0123456701234567);
12122         output += 8;
12123         n -= 8 * sizeof(uint8_t);
12124       } else {
12125         if (n & (4 * sizeof(uint8_t))) {
12126           unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
12127           vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
12128           output += 4;
12129         }
12130         if (n & (2 * sizeof(uint8_t))) {
12131           unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
12132           vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
12133           output += 2;
12134         }
12135         if (n & (1 * sizeof(uint8_t))) {
12136           *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
12137         }
12138         n = 0;
12139       }
12140     } while (n != 0);
12141   }
12142 }
12143 
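// 256-entry byte lookup table implemented with 128-bit byte shuffles only. The 16
// rows of the table are pre-combined into XOR deltas (vtable1 = vt0 ^ vt1,
// vtable2 = vt1 ^ vt2, ..., with rows 8-15 additionally folding in rows 0-7), so
// that stepping the indices down by 16 per row and XOR-ing the successive
// _mm_shuffle_epi8 results telescopes to the correct entry for any index 0..255.
// Indices that have left their window go negative, making the shuffle return zero;
// the switch to the saturating _mm_subs_epi8 for the upper eight rows keeps them
// from wrapping back into range.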
12144 void xnn_x8_lut_ukernel__avx_x64(
12145     size_t n,
12146     const uint8_t* x,
12147     uint8_t* y,
12148     const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
12149 {
12150   assert(n != 0);
12151   assert(x != NULL);
12152   assert(y != NULL);
12153 
12154   const __m128i vt0 = _mm_load_si128((const __m128i*) t);
12155   const __m128i vt1 = _mm_load_si128((const __m128i*) (t + 16));
12156   const __m128i vt2 = _mm_load_si128((const __m128i*) (t + 32));
12157   const __m128i vt3 = _mm_load_si128((const __m128i*) (t + 48));
12158   const __m128i vt4 = _mm_load_si128((const __m128i*) (t + 64));
12159   const __m128i vt5 = _mm_load_si128((const __m128i*) (t + 80));
12160   const __m128i vt6 = _mm_load_si128((const __m128i*) (t + 96));
12161   const __m128i vt7 = _mm_load_si128((const __m128i*) (t + 112));
12162   const __m128i vt8 = _mm_load_si128((const __m128i*) (t + 128));
12163   const __m128i vt9 = _mm_load_si128((const __m128i*) (t + 144));
12164   const __m128i vtA = _mm_load_si128((const __m128i*) (t + 160));
12165   const __m128i vtB = _mm_load_si128((const __m128i*) (t + 176));
12166   const __m128i vtC = _mm_load_si128((const __m128i*) (t + 192));
12167   const __m128i vtD = _mm_load_si128((const __m128i*) (t + 208));
12168   const __m128i vtE = _mm_load_si128((const __m128i*) (t + 224));
12169   const __m128i vtF = _mm_load_si128((const __m128i*) (t + 240));
12170 
12171   const __m128i vtable0 = vt0;
12172   const __m128i vtable1 = _mm_xor_si128(vt0, vt1);
12173   const __m128i vtable2 = _mm_xor_si128(vt1, vt2);
12174   const __m128i vtable3 = _mm_xor_si128(vt2, vt3);
12175   const __m128i vtable4 = _mm_xor_si128(vt3, vt4);
12176   const __m128i vtable5 = _mm_xor_si128(vt4, vt5);
12177   const __m128i vtable6 = _mm_xor_si128(vt5, vt6);
12178   const __m128i vtable7 = _mm_xor_si128(vt6, vt7);
12179   const __m128i vtable8 = _mm_xor_si128(_mm_xor_si128(vt7, vt8), vtable0);
12180   const __m128i vtable9 = _mm_xor_si128(_mm_xor_si128(vt8, vt9), vtable1);
12181   const __m128i vtableA = _mm_xor_si128(_mm_xor_si128(vt9, vtA), vtable2);
12182   const __m128i vtableB = _mm_xor_si128(_mm_xor_si128(vtA, vtB), vtable3);
12183   const __m128i vtableC = _mm_xor_si128(_mm_xor_si128(vtB, vtC), vtable4);
12184   const __m128i vtableD = _mm_xor_si128(_mm_xor_si128(vtC, vtD), vtable5);
12185   const __m128i vtableE = _mm_xor_si128(_mm_xor_si128(vtD, vtE), vtable6);
12186   const __m128i vtableF = _mm_xor_si128(_mm_xor_si128(vtE, vtF), vtable7);
12187 
12188   const __m128i voffset = _mm_set1_epi8(16);
12189   for (; n >= 64 * sizeof(uint8_t); n -= 64 * sizeof(uint8_t)) {
12190     __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
12191     __m128i vx1 = _mm_loadu_si128((const __m128i*) (x + 16));
12192     __m128i vx2 = _mm_loadu_si128((const __m128i*) (x + 32));
12193     __m128i vx3 = _mm_loadu_si128((const __m128i*) (x + 48));
12194     x += 64;
12195 
12196     __m128i vy0 = _mm_shuffle_epi8(vtable0, vx0);
12197     __m128i vy1 = _mm_shuffle_epi8(vtable0, vx1);
12198     __m128i vy2 = _mm_shuffle_epi8(vtable0, vx2);
12199     __m128i vy3 = _mm_shuffle_epi8(vtable0, vx3);
12200 
12201     vx0 = _mm_sub_epi8(vx0, voffset);
12202     vx1 = _mm_sub_epi8(vx1, voffset);
12203     vx2 = _mm_sub_epi8(vx2, voffset);
12204     vx3 = _mm_sub_epi8(vx3, voffset);
12205     vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable1, vx0));
12206     vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable1, vx1));
12207     vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable1, vx2));
12208     vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable1, vx3));
12209     vx0 = _mm_sub_epi8(vx0, voffset);
12210     vx1 = _mm_sub_epi8(vx1, voffset);
12211     vx2 = _mm_sub_epi8(vx2, voffset);
12212     vx3 = _mm_sub_epi8(vx3, voffset);
12213     vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable2, vx0));
12214     vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable2, vx1));
12215     vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable2, vx2));
12216     vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable2, vx3));
12217     vx0 = _mm_sub_epi8(vx0, voffset);
12218     vx1 = _mm_sub_epi8(vx1, voffset);
12219     vx2 = _mm_sub_epi8(vx2, voffset);
12220     vx3 = _mm_sub_epi8(vx3, voffset);
12221     vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable3, vx0));
12222     vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable3, vx1));
12223     vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable3, vx2));
12224     vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable3, vx3));
12225     vx0 = _mm_sub_epi8(vx0, voffset);
12226     vx1 = _mm_sub_epi8(vx1, voffset);
12227     vx2 = _mm_sub_epi8(vx2, voffset);
12228     vx3 = _mm_sub_epi8(vx3, voffset);
12229     vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable4, vx0));
12230     vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable4, vx1));
12231     vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable4, vx2));
12232     vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable4, vx3));
12233     vx0 = _mm_sub_epi8(vx0, voffset);
12234     vx1 = _mm_sub_epi8(vx1, voffset);
12235     vx2 = _mm_sub_epi8(vx2, voffset);
12236     vx3 = _mm_sub_epi8(vx3, voffset);
12237     vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable5, vx0));
12238     vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable5, vx1));
12239     vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable5, vx2));
12240     vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable5, vx3));
12241     vx0 = _mm_sub_epi8(vx0, voffset);
12242     vx1 = _mm_sub_epi8(vx1, voffset);
12243     vx2 = _mm_sub_epi8(vx2, voffset);
12244     vx3 = _mm_sub_epi8(vx3, voffset);
12245     vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable6, vx0));
12246     vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable6, vx1));
12247     vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable6, vx2));
12248     vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable6, vx3));
12249     vx0 = _mm_sub_epi8(vx0, voffset);
12250     vx1 = _mm_sub_epi8(vx1, voffset);
12251     vx2 = _mm_sub_epi8(vx2, voffset);
12252     vx3 = _mm_sub_epi8(vx3, voffset);
12253     vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable7, vx0));
12254     vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable7, vx1));
12255     vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable7, vx2));
12256     vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable7, vx3));
12257     vx0 = _mm_sub_epi8(vx0, voffset);
12258     vx1 = _mm_sub_epi8(vx1, voffset);
12259     vx2 = _mm_sub_epi8(vx2, voffset);
12260     vx3 = _mm_sub_epi8(vx3, voffset);
12261     vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable8, vx0));
12262     vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable8, vx1));
12263     vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable8, vx2));
12264     vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable8, vx3));
12265 
12266     vx0 = _mm_subs_epi8(vx0, voffset);
12267     vx1 = _mm_subs_epi8(vx1, voffset);
12268     vx2 = _mm_subs_epi8(vx2, voffset);
12269     vx3 = _mm_subs_epi8(vx3, voffset);
12270     vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable9, vx0));
12271     vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable9, vx1));
12272     vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable9, vx2));
12273     vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable9, vx3));
12274     vx0 = _mm_subs_epi8(vx0, voffset);
12275     vx1 = _mm_subs_epi8(vx1, voffset);
12276     vx2 = _mm_subs_epi8(vx2, voffset);
12277     vx3 = _mm_subs_epi8(vx3, voffset);
12278     vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableA, vx0));
12279     vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableA, vx1));
12280     vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableA, vx2));
12281     vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableA, vx3));
12282     vx0 = _mm_subs_epi8(vx0, voffset);
12283     vx1 = _mm_subs_epi8(vx1, voffset);
12284     vx2 = _mm_subs_epi8(vx2, voffset);
12285     vx3 = _mm_subs_epi8(vx3, voffset);
12286     vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableB, vx0));
12287     vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableB, vx1));
12288     vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableB, vx2));
12289     vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableB, vx3));
12290     vx0 = _mm_subs_epi8(vx0, voffset);
12291     vx1 = _mm_subs_epi8(vx1, voffset);
12292     vx2 = _mm_subs_epi8(vx2, voffset);
12293     vx3 = _mm_subs_epi8(vx3, voffset);
12294     vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableC, vx0));
12295     vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableC, vx1));
12296     vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableC, vx2));
12297     vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableC, vx3));
12298     vx0 = _mm_subs_epi8(vx0, voffset);
12299     vx1 = _mm_subs_epi8(vx1, voffset);
12300     vx2 = _mm_subs_epi8(vx2, voffset);
12301     vx3 = _mm_subs_epi8(vx3, voffset);
12302     vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableD, vx0));
12303     vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableD, vx1));
12304     vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableD, vx2));
12305     vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableD, vx3));
12306     vx0 = _mm_subs_epi8(vx0, voffset);
12307     vx1 = _mm_subs_epi8(vx1, voffset);
12308     vx2 = _mm_subs_epi8(vx2, voffset);
12309     vx3 = _mm_subs_epi8(vx3, voffset);
12310     vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableE, vx0));
12311     vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableE, vx1));
12312     vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableE, vx2));
12313     vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableE, vx3));
12314     vx0 = _mm_subs_epi8(vx0, voffset);
12315     vx1 = _mm_subs_epi8(vx1, voffset);
12316     vx2 = _mm_subs_epi8(vx2, voffset);
12317     vx3 = _mm_subs_epi8(vx3, voffset);
12318     vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableF, vx0));
12319     vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableF, vx1));
12320     vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableF, vx2));
12321     vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableF, vx3));
12322 
12323     _mm_storeu_si128((__m128i*) y, vy0);
12324     _mm_storeu_si128((__m128i*) (y + 16), vy1);
12325     _mm_storeu_si128((__m128i*) (y + 32), vy2);
12326     _mm_storeu_si128((__m128i*) (y + 48), vy3);
12327     y += 64;
12328   }
12329   for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
12330     __m128i vx = _mm_loadu_si128((const __m128i*) x);
12331     x += 16;
12332 
12333     __m128i vy = _mm_shuffle_epi8(vtable0, vx);
12334 
12335     vx = _mm_sub_epi8(vx, voffset);
12336     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
12337     vx = _mm_sub_epi8(vx, voffset);
12338     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
12339     vx = _mm_sub_epi8(vx, voffset);
12340     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
12341     vx = _mm_sub_epi8(vx, voffset);
12342     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
12343     vx = _mm_sub_epi8(vx, voffset);
12344     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
12345     vx = _mm_sub_epi8(vx, voffset);
12346     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
12347     vx = _mm_sub_epi8(vx, voffset);
12348     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
12349     vx = _mm_sub_epi8(vx, voffset);
12350     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));
12351 
12352     vx = _mm_subs_epi8(vx, voffset);
12353     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
12354     vx = _mm_subs_epi8(vx, voffset);
12355     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
12356     vx = _mm_subs_epi8(vx, voffset);
12357     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
12358     vx = _mm_subs_epi8(vx, voffset);
12359     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
12360     vx = _mm_subs_epi8(vx, voffset);
12361     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
12362     vx = _mm_subs_epi8(vx, voffset);
12363     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
12364     vx = _mm_subs_epi8(vx, voffset);
12365     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));
12366 
12367     _mm_storeu_si128((__m128i*) y, vy);
12368     y += 16;
12369   }
12370   if XNN_UNLIKELY(n != 0) {
12371     __m128i vx = _mm_loadu_si128((const __m128i*) x);
12372 
12373     __m128i vy = _mm_shuffle_epi8(vtable0, vx);
12374 
12375     vx = _mm_sub_epi8(vx, voffset);
12376     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
12377     vx = _mm_sub_epi8(vx, voffset);
12378     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
12379     vx = _mm_sub_epi8(vx, voffset);
12380     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
12381     vx = _mm_sub_epi8(vx, voffset);
12382     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
12383     vx = _mm_sub_epi8(vx, voffset);
12384     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
12385     vx = _mm_sub_epi8(vx, voffset);
12386     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
12387     vx = _mm_sub_epi8(vx, voffset);
12388     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
12389     vx = _mm_sub_epi8(vx, voffset);
12390     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));
12391 
12392     vx = _mm_subs_epi8(vx, voffset);
12393     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
12394     vx = _mm_subs_epi8(vx, voffset);
12395     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
12396     vx = _mm_subs_epi8(vx, voffset);
12397     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
12398     vx = _mm_subs_epi8(vx, voffset);
12399     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
12400     vx = _mm_subs_epi8(vx, voffset);
12401     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
12402     vx = _mm_subs_epi8(vx, voffset);
12403     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
12404     vx = _mm_subs_epi8(vx, voffset);
12405     vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));
12406 
12407     if (n & (8 * sizeof(uint8_t))) {
12408       _mm_storel_epi64((__m128i*) y, vy);
12409       vy = _mm_unpackhi_epi64(vy, vy);
12410       y += 8;
12411     }
12412     if (n & (4 * sizeof(uint8_t))) {
12413       _mm_storeu_si32(y, vy);
12414       vy = _mm_srli_epi64(vy, 32);
12415       y += 4;
12416     }
12417     if (n & (2 * sizeof(uint8_t))) {
12418       _mm_storeu_si16(y, vy);
12419       vy = _mm_srli_epi32(vy, 16);
12420       y += 2;
12421     }
12422     if (n & (1 * sizeof(uint8_t))) {
12423       *y = (uint8_t) _mm_extract_epi8(vy, 0);
12424     }
12425   }
12426 }
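
// A minimal, hypothetical call-site sketch for the LUT ukernel above, included only
// to illustrate the calling convention; it is not part of XNNPACK. `n` is a byte
// count, and the 256-entry table must be 16-byte aligned because the kernel reads
// it with _mm_load_si128. XNN_ALIGN is assumed to come from <xnnpack/common.h>;
// the function name and its transform are made up for the example.
static void example_x8_lut_bitwise_not(const uint8_t* input, uint8_t* output, size_t count) {
  XNN_ALIGN(16) uint8_t table[256];
  for (size_t i = 0; i < 256; i++) {
    table[i] = (uint8_t) ~i;  // example transform: bitwise NOT of each byte
  }
  if (count != 0) {  // the kernel asserts n != 0
    xnn_x8_lut_ukernel__avx_x64(count, input, output, table);
  }
}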
12427