// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <immintrin.h>

#include <xnnpack/common.h>
#include <xnnpack/dwconv.h>
#include <xnnpack/gemm.h>
#include <xnnpack/igemm.h>
#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/lut.h>
#include <xnnpack/math.h>
#include <xnnpack/prelu.h>
#include <xnnpack/unaligned.h>
#include <xnnpack/vadd.h>
#include <xnnpack/vbinary.h>
#include <xnnpack/vcvt.h>
#include <xnnpack/vlrelu.h>
#include <xnnpack/vmul.h>
#include <xnnpack/vunary.h>

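// Converts IEEE half-precision (binary16) values to single-precision floats using
// 16-bit integer SSE/AVX operations. The approach, as far as it can be read from the
// code below: the sign bit is split off, the remaining bits are shifted into place
// and rescaled into a normalized float by multiplying with
// params->sse_int16.exp_scale, denormal inputs are instead reconstructed by pairing
// them with a "magic" exponent and subtracting params->sse_int16.magic_bias, the two
// results are blended based on a comparison against denorm_cutoff, and the sign is
// OR-ed back in. The exact constant values live in the params initialization
// elsewhere in XNNPACK, not in this file.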
void xnn_f16_f32_vcvt_ukernel__avx_int16_x16(
    size_t n,
    const void* input,
    float* output,
    const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
32 {
33 assert(n != 0);
34 assert(n % sizeof(uint16_t) == 0);
35 assert(input != NULL);
36 assert(output != NULL);
37
38 const __m128i vsign_mask = _mm_load_si128((const __m128i*) params->sse_int16.sign_mask);
39 const __m128i vexp_offset = _mm_load_si128((const __m128i*) params->sse_int16.exp_offset);
40 const __m128 vexp_scale = _mm_load_ps(params->sse_int16.exp_scale);
41 const __m128i vmagic_mask = _mm_load_si128((const __m128i*) params->sse_int16.magic_mask);
42 const __m128 vmagic_bias = _mm_load_ps(params->sse_int16.magic_bias);
43 const __m128i vdenorm_cutoff = _mm_load_si128((const __m128i*) params->sse_int16.denorm_cutoff);
44
45 const uint16_t* i = (const uint16_t*) input;
46 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
47 const __m128i vh0 = _mm_loadu_si128((const __m128i*) i);
48 const __m128i vh1 = _mm_loadu_si128((const __m128i*) (i + 8));
49 i += 16;
50
51 const __m128i vsign0 = _mm_and_si128(vh0, vsign_mask);
52 const __m128i vsign1 = _mm_and_si128(vh1, vsign_mask);
53
54 const __m128i vnonsign0 = _mm_xor_si128(vh0, vsign0);
55 const __m128i vnonsign1 = _mm_xor_si128(vh1, vsign1);
56
57 const __m128i vprenorm0 = _mm_slli_epi16(vnonsign0, 13);
58 const __m128i vprenorm1 = _mm_add_epi16(_mm_srli_epi16(vnonsign0, 3), vexp_offset);
59 const __m128i vprenorm2 = _mm_slli_epi16(vnonsign1, 13);
60 const __m128i vprenorm3 = _mm_add_epi16(_mm_srli_epi16(vnonsign1, 3), vexp_offset);
61
62 const __m128i vnorm0 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm0, vprenorm1)), vexp_scale));
63 const __m128i vnorm1 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm0, vprenorm1)), vexp_scale));
64 const __m128i vnorm2 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm2, vprenorm3)), vexp_scale));
65 const __m128i vnorm3 = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm2, vprenorm3)), vexp_scale));
66
67 const __m128i vdenorm0 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign0, vmagic_mask)), vmagic_bias));
68 const __m128i vdenorm1 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign0, vmagic_mask)), vmagic_bias));
69 const __m128i vdenorm2 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign1, vmagic_mask)), vmagic_bias));
70 const __m128i vdenorm3 = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign1, vmagic_mask)), vmagic_bias));
71
72 const __m128i vmask0 = _mm_cmpgt_epi16(vnonsign0, vdenorm_cutoff);
73 const __m128i vmask1 = _mm_cmpgt_epi16(vnonsign1, vdenorm_cutoff);
74
75 const __m128i vf0 = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign0),
76 _mm_blendv_epi8(vdenorm0, vnorm0, _mm_cvtepi16_epi32(vmask0)));
77 const __m128i vf1 = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign0),
78 _mm_blendv_epi8(vdenorm1, vnorm1, _mm_unpackhi_epi16(vmask0, vmask0)));
79 const __m128i vf2 = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign1),
80 _mm_blendv_epi8(vdenorm2, vnorm2, _mm_cvtepi16_epi32(vmask1)));
81 const __m128i vf3 = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign1),
82 _mm_blendv_epi8(vdenorm3, vnorm3, _mm_unpackhi_epi16(vmask1, vmask1)));
83
84 _mm_storeu_ps(output, _mm_castsi128_ps(vf0));
85 _mm_storeu_ps(output + 4, _mm_castsi128_ps(vf1));
86 _mm_storeu_ps(output + 8, _mm_castsi128_ps(vf2));
87 _mm_storeu_ps(output + 12, _mm_castsi128_ps(vf3));
88 output += 16;
89 }
90 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
91 const __m128i vh = _mm_loadu_si128((const __m128i*) i);
92 i += 8;
93
94 const __m128i vsign = _mm_and_si128(vh, vsign_mask);
95
96 const __m128i vnonsign = _mm_xor_si128(vh, vsign);
97
98 const __m128i vprenorm_lo = _mm_slli_epi16(vnonsign, 13);
99 const __m128i vprenorm_hi = _mm_add_epi16(_mm_srli_epi16(vnonsign, 3), vexp_offset);
100
101 const __m128i vnorm_lo = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));
102 const __m128i vnorm_hi = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));
103
104 const __m128i vdenorm_lo = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign, vmagic_mask)), vmagic_bias));
105 const __m128i vdenorm_hi = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign, vmagic_mask)), vmagic_bias));
106
107 const __m128i vmask = _mm_cmpgt_epi16(vnonsign, vdenorm_cutoff);
108
109 const __m128i vf_lo = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign),
110 _mm_blendv_epi8(vdenorm_lo, vnorm_lo, _mm_cvtepi16_epi32(vmask)));
111
112 const __m128i vf_hi = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign),
113 _mm_blendv_epi8(vdenorm_hi, vnorm_hi, _mm_unpackhi_epi16(vmask, vmask)));
114
115 _mm_storeu_ps(output, _mm_castsi128_ps(vf_lo));
116 _mm_storeu_ps(output + 4, _mm_castsi128_ps(vf_hi));
117 output += 8;
118 }
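  // Remainder of 1-7 elements: a full 128-bit load is still issued, so up to 14
  // bytes past the last valid input element may be read. This is the reason for the
  // XNN_OOB_READS annotation on the kernel; callers are expected to make such
  // over-reads safe.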
119 if XNN_UNPREDICTABLE(n != 0) {
120 const __m128i vh = _mm_loadu_si128((const __m128i*) i);
121
122 const __m128i vsign = _mm_and_si128(vh, vsign_mask);
123
124 const __m128i vnonsign = _mm_xor_si128(vh, vsign);
125
126 const __m128i vprenorm_lo = _mm_slli_epi16(vnonsign, 13);
127 const __m128i vprenorm_hi = _mm_add_epi16(_mm_srli_epi16(vnonsign, 3), vexp_offset);
128
129 const __m128i vnorm_lo = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));
130 const __m128i vnorm_hi = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vprenorm_lo, vprenorm_hi)), vexp_scale));
131
132 const __m128i vdenorm_lo = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpacklo_epi16(vnonsign, vmagic_mask)), vmagic_bias));
133 const __m128i vdenorm_hi = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(_mm_unpackhi_epi16(vnonsign, vmagic_mask)), vmagic_bias));
134
135 const __m128i vmask = _mm_cmpgt_epi16(vnonsign, vdenorm_cutoff);
136
137 __m128i vf = _mm_or_si128(_mm_unpacklo_epi16(_mm_setzero_si128(), vsign),
138 _mm_blendv_epi8(vdenorm_lo, vnorm_lo, _mm_cvtepi16_epi32(vmask)));
139
140 if (n & (4 * sizeof(uint16_t))) {
141 _mm_storeu_ps(output, _mm_castsi128_ps(vf));
142 output += 4;
143
144 vf = _mm_or_si128(_mm_unpackhi_epi16(_mm_setzero_si128(), vsign),
145 _mm_blendv_epi8(vdenorm_hi, vnorm_hi, _mm_unpackhi_epi16(vmask, vmask)));
146 }
147 if (n & (2 * sizeof(uint16_t))) {
148 _mm_storel_pi((__m64*) output, _mm_castsi128_ps(vf));
149 output += 2;
150
151 vf = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(vf), _mm_castsi128_ps(vf)));
152 }
153 if (n & (1 * sizeof(uint16_t))) {
154 _mm_store_ss(output, _mm_castsi128_ps(vf));
155 }
156 }
157 }
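// For readers who want a reference point: the vector kernel above is a SIMD
// counterpart of the usual scalar bit-twiddling conversion. The sketch below follows
// the style of the scalar FP16 helpers XNNPACK uses elsewhere; the name and the
// fp32_from_bits/fp32_to_bits helpers (bit-level reinterpretation between uint32_t
// and float, e.g. via memcpy) are illustrative assumptions, not part of this file:
//
//   static inline float fp16_to_fp32_value(uint16_t h) {
//     const uint32_t w = (uint32_t) h << 16;
//     const uint32_t sign = w & UINT32_C(0x80000000);
//     const uint32_t two_w = w + w;
//     const uint32_t exp_offset = UINT32_C(0xE0) << 23;
//     const float exp_scale = 0x1.0p-112f;
//     const float normalized_value =
//         fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
//     const uint32_t magic_mask = UINT32_C(126) << 23;
//     const float magic_bias = 0.5f;
//     const float denormalized_value =
//         fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
//     const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
//     const uint32_t result = sign | (two_w < denormalized_cutoff
//         ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
//     return fp32_from_bits(result);
//   }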
158
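// Depthwise convolution ("dwconv") micro-kernel with a 3-tap kernel, processing up
// to 16 channels per iteration (hence "up16x3"). For every output pixel it reads one
// row pointer per tap from `input`, accumulates bias + sum over taps of
// input * weight per channel, clamps the result to [min, max] from the params, and
// advances to the next pixel. Judging from the indexing below, the weights are
// packed in 16-channel tiles: 16 bias values followed by 16 values per tap (w
// advances by 64 floats per full tile), which is why the 8-channel and remainder
// paths still index the taps with a stride of 16 while advancing w by only 8.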
void xnn_f32_dwconv_minmax_ukernel_up16x3__avx(
    size_t channels,
    size_t output_width,
    const float** input,
    const float* weights,
    float* output,
    size_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const float* zero,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
170 {
171 assert(channels != 0);
172 assert(output_width != 0);
173
174 const __m256 vmax = _mm256_load_ps(params->avx.max);
175 const __m256 vmin = _mm256_load_ps(params->avx.min);
176 do {
177 const float* i0 = input[0];
178 assert(i0 != NULL);
179 if XNN_UNPREDICTABLE(i0 != zero) {
180 i0 = (const float*) ((uintptr_t) i0 + input_offset);
181 }
182 const float* i1 = input[1];
183 assert(i1 != NULL);
184 if XNN_UNPREDICTABLE(i1 != zero) {
185 i1 = (const float*) ((uintptr_t) i1 + input_offset);
186 }
187 const float* i2 = input[2];
188 assert(i2 != NULL);
189 if XNN_UNPREDICTABLE(i2 != zero) {
190 i2 = (const float*) ((uintptr_t) i2 + input_offset);
191 }
192 input = (const float**) ((uintptr_t) input + input_stride);
193
194 size_t c = channels;
195 const float* w = weights;
196 for (; c >= 16; c -= 16) {
197 __m256 vacc01234567p0 = _mm256_load_ps(w);
198 __m256 vacc89ABCDEFp0 = _mm256_load_ps(w + 8);
199
200
201 const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
202 const __m256 vi0x89ABCDEF = _mm256_loadu_ps(i0 + 8);
203 i0 += 16;
204
205 const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
206 const __m256 vk0x89ABCDEF = _mm256_load_ps(w + 24);
207 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
208 vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi0x89ABCDEF, vk0x89ABCDEF));
209
210 const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
211 const __m256 vi1x89ABCDEF = _mm256_loadu_ps(i1 + 8);
212 i1 += 16;
213
214 const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
215 const __m256 vk1x89ABCDEF = _mm256_load_ps(w + 40);
216 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
217 vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi1x89ABCDEF, vk1x89ABCDEF));
218
219 const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
220 const __m256 vi2x89ABCDEF = _mm256_loadu_ps(i2 + 8);
221 i2 += 16;
222
223 const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
224 const __m256 vk2x89ABCDEF = _mm256_load_ps(w + 56);
225 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
226 vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi2x89ABCDEF, vk2x89ABCDEF));
227
228 w += 64;
229
230
231 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
232 __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin);
233 vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
234 vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vmax);
235
236 _mm256_storeu_ps(output, vacc01234567);
237 _mm256_storeu_ps(output + 8, vacc89ABCDEF);
238 output += 16;
239 }
240 for (; c >= 8; c -= 8) {
241 __m256 vacc01234567p0 = _mm256_load_ps(w);
242
243 const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
244 i0 += 8;
245
246 const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
247 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
248
249 const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
250 i1 += 8;
251
252 const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
253 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
254
255 const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
256 i2 += 8;
257
258 const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
259 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
260
261 w += 8;
262
263
264 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
265 vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
266
267 _mm256_storeu_ps(output, vacc01234567);
268 output += 8;
269 }
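    // Remainder of 1-7 channels: params->avx.mask_table is laid out so that reading
    // a 256-bit vector starting at element [7 - c] yields c all-ones lanes followed
    // by zero lanes; _mm256_maskload_ps then loads only the first c channels safely.
    // (This describes the usual XNNPACK AVX mask-table convention; the table itself
    // is defined with the params initialization, not in this file.)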
270 if XNN_UNLIKELY(c != 0) {
271 assert(c >= 1);
272 assert(c <= 7);
      const __m256i vmask = _mm256_loadu_si256((const __m256i*) &params->avx.mask_table[7 - c]);
274
275 __m256 vacc01234567p0 = _mm256_load_ps(w);
276
277 const __m256 vi0x01234567 = _mm256_maskload_ps(i0, vmask);
278 const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
279 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
280
281 const __m256 vi1x01234567 = _mm256_maskload_ps(i1, vmask);
282 const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
283 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
284
285 const __m256 vi2x01234567 = _mm256_maskload_ps(i2, vmask);
286 const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
287 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
288
289
290 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
291 vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
292
293 __m128 vacc0123 = _mm256_castps256_ps128(vacc01234567);
294 if (c & 4) {
295 _mm_storeu_ps(output, vacc0123);
296 vacc0123 = _mm256_extractf128_ps(vacc01234567, 1);
297 output += 4;
298 }
299 if (c & 2) {
300 _mm_storel_pi((__m64*) output, vacc0123);
301 vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
302 output += 2;
303 }
304 if (c & 1) {
305 _mm_store_ss(output, vacc0123);
306 output += 1;
307 }
308 }
309
310 output = (float*) ((uintptr_t) output + output_increment);
311 } while (--output_width != 0);
312 }
313
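// Same structure as the up16x3 kernel above, but with a 4-tap kernel: four input row
// pointers per pixel and weight tiles of 16 bias values plus 4 x 16 tap values
// (w advances by 80 floats per full 16-channel tile).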
void xnn_f32_dwconv_minmax_ukernel_up16x4__avx(
    size_t channels,
    size_t output_width,
    const float** input,
    const float* weights,
    float* output,
    size_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const float* zero,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
325 {
326 assert(channels != 0);
327 assert(output_width != 0);
328
329 const __m256 vmax = _mm256_load_ps(params->avx.max);
330 const __m256 vmin = _mm256_load_ps(params->avx.min);
331 do {
332 const float* i0 = input[0];
333 assert(i0 != NULL);
334 if XNN_UNPREDICTABLE(i0 != zero) {
335 i0 = (const float*) ((uintptr_t) i0 + input_offset);
336 }
337 const float* i1 = input[1];
338 assert(i1 != NULL);
339 if XNN_UNPREDICTABLE(i1 != zero) {
340 i1 = (const float*) ((uintptr_t) i1 + input_offset);
341 }
342 const float* i2 = input[2];
343 assert(i2 != NULL);
344 if XNN_UNPREDICTABLE(i2 != zero) {
345 i2 = (const float*) ((uintptr_t) i2 + input_offset);
346 }
347 const float* i3 = input[3];
348 assert(i3 != NULL);
349 if XNN_UNPREDICTABLE(i3 != zero) {
350 i3 = (const float*) ((uintptr_t) i3 + input_offset);
351 }
352 input = (const float**) ((uintptr_t) input + input_stride);
353
354 size_t c = channels;
355 const float* w = weights;
356 for (; c >= 16; c -= 16) {
357 __m256 vacc01234567p0 = _mm256_load_ps(w);
358 __m256 vacc89ABCDEFp0 = _mm256_load_ps(w + 8);
359
360
361 const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
362 const __m256 vi0x89ABCDEF = _mm256_loadu_ps(i0 + 8);
363 i0 += 16;
364
365 const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
366 const __m256 vk0x89ABCDEF = _mm256_load_ps(w + 24);
367 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
368 vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi0x89ABCDEF, vk0x89ABCDEF));
369
370 const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
371 const __m256 vi1x89ABCDEF = _mm256_loadu_ps(i1 + 8);
372 i1 += 16;
373
374 const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
375 const __m256 vk1x89ABCDEF = _mm256_load_ps(w + 40);
376 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
377 vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi1x89ABCDEF, vk1x89ABCDEF));
378
379 const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
380 const __m256 vi2x89ABCDEF = _mm256_loadu_ps(i2 + 8);
381 i2 += 16;
382
383 const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
384 const __m256 vk2x89ABCDEF = _mm256_load_ps(w + 56);
385 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
386 vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi2x89ABCDEF, vk2x89ABCDEF));
387
388 const __m256 vi3x01234567 = _mm256_loadu_ps(i3);
389 const __m256 vi3x89ABCDEF = _mm256_loadu_ps(i3 + 8);
390 i3 += 16;
391
392 const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
393 const __m256 vk3x89ABCDEF = _mm256_load_ps(w + 72);
394 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567));
395 vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi3x89ABCDEF, vk3x89ABCDEF));
396
397 w += 80;
398
399
400 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
401 __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin);
402 vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
403 vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vmax);
404
405 _mm256_storeu_ps(output, vacc01234567);
406 _mm256_storeu_ps(output + 8, vacc89ABCDEF);
407 output += 16;
408 }
409 for (; c >= 8; c -= 8) {
410 __m256 vacc01234567p0 = _mm256_load_ps(w);
411
412 const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
413 i0 += 8;
414
415 const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
416 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
417
418 const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
419 i1 += 8;
420
421 const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
422 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
423
424 const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
425 i2 += 8;
426
427 const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
428 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
429
430 const __m256 vi3x01234567 = _mm256_loadu_ps(i3);
431 i3 += 8;
432
433 const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
434 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567));
435
436 w += 8;
437
438
439 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
440 vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
441
442 _mm256_storeu_ps(output, vacc01234567);
443 output += 8;
444 }
445 if XNN_UNLIKELY(c != 0) {
446 assert(c >= 1);
447 assert(c <= 7);
      const __m256i vmask = _mm256_loadu_si256((const __m256i*) &params->avx.mask_table[7 - c]);
449
450 __m256 vacc01234567p0 = _mm256_load_ps(w);
451
452 const __m256 vi0x01234567 = _mm256_maskload_ps(i0, vmask);
453 const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
454 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
455
456 const __m256 vi1x01234567 = _mm256_maskload_ps(i1, vmask);
457 const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
458 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
459
460 const __m256 vi2x01234567 = _mm256_maskload_ps(i2, vmask);
461 const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
462 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
463
464 const __m256 vi3x01234567 = _mm256_maskload_ps(i3, vmask);
465 const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
466 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567));
467
468
469 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
470 vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
471
472 __m128 vacc0123 = _mm256_castps256_ps128(vacc01234567);
473 if (c & 4) {
474 _mm_storeu_ps(output, vacc0123);
475 vacc0123 = _mm256_extractf128_ps(vacc01234567, 1);
476 output += 4;
477 }
478 if (c & 2) {
479 _mm_storel_pi((__m64*) output, vacc0123);
480 vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
481 output += 2;
482 }
483 if (c & 1) {
484 _mm_store_ss(output, vacc0123);
485 output += 1;
486 }
487 }
488
489 output = (float*) ((uintptr_t) output + output_increment);
490 } while (--output_width != 0);
491 }
492
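// 9-tap variant (typically used for 3x3 depthwise convolutions): nine input row
// pointers per pixel and weight tiles of 16 bias values plus 9 x 16 tap values
// (w advances by 160 floats per full 16-channel tile).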
void xnn_f32_dwconv_minmax_ukernel_up16x9__avx(
    size_t channels,
    size_t output_width,
    const float** input,
    const float* weights,
    float* output,
    size_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const float* zero,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
504 {
505 assert(channels != 0);
506 assert(output_width != 0);
507
508 const __m256 vmax = _mm256_load_ps(params->avx.max);
509 const __m256 vmin = _mm256_load_ps(params->avx.min);
510 do {
511 const float* i0 = input[0];
512 assert(i0 != NULL);
513 if XNN_UNPREDICTABLE(i0 != zero) {
514 i0 = (const float*) ((uintptr_t) i0 + input_offset);
515 }
516 const float* i1 = input[1];
517 assert(i1 != NULL);
518 if XNN_UNPREDICTABLE(i1 != zero) {
519 i1 = (const float*) ((uintptr_t) i1 + input_offset);
520 }
521 const float* i2 = input[2];
522 assert(i2 != NULL);
523 if XNN_UNPREDICTABLE(i2 != zero) {
524 i2 = (const float*) ((uintptr_t) i2 + input_offset);
525 }
526 const float* i3 = input[3];
527 assert(i3 != NULL);
528 if XNN_UNPREDICTABLE(i3 != zero) {
529 i3 = (const float*) ((uintptr_t) i3 + input_offset);
530 }
531 const float* i4 = input[4];
532 assert(i4 != NULL);
533 if XNN_UNPREDICTABLE(i4 != zero) {
534 i4 = (const float*) ((uintptr_t) i4 + input_offset);
535 }
536 const float* i5 = input[5];
537 assert(i5 != NULL);
538 if XNN_UNPREDICTABLE(i5 != zero) {
539 i5 = (const float*) ((uintptr_t) i5 + input_offset);
540 }
541 const float* i6 = input[6];
542 assert(i6 != NULL);
543 if XNN_UNPREDICTABLE(i6 != zero) {
544 i6 = (const float*) ((uintptr_t) i6 + input_offset);
545 }
546 const float* i7 = input[7];
547 assert(i7 != NULL);
548 if XNN_UNPREDICTABLE(i7 != zero) {
549 i7 = (const float*) ((uintptr_t) i7 + input_offset);
550 }
551 const float* i8 = input[8];
552 assert(i8 != NULL);
553 if XNN_UNPREDICTABLE(i8 != zero) {
554 i8 = (const float*) ((uintptr_t) i8 + input_offset);
555 }
556 input = (const float**) ((uintptr_t) input + input_stride);
557
558 size_t c = channels;
559 const float* w = weights;
560 for (; c >= 16; c -= 16) {
561 __m256 vacc01234567p0 = _mm256_load_ps(w);
562 __m256 vacc89ABCDEFp0 = _mm256_load_ps(w + 8);
563
564
565 const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
566 const __m256 vi0x89ABCDEF = _mm256_loadu_ps(i0 + 8);
567 i0 += 16;
568
569 const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
570 const __m256 vk0x89ABCDEF = _mm256_load_ps(w + 24);
571 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
572 vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi0x89ABCDEF, vk0x89ABCDEF));
573
574 const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
575 const __m256 vi1x89ABCDEF = _mm256_loadu_ps(i1 + 8);
576 i1 += 16;
577
578 const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
579 const __m256 vk1x89ABCDEF = _mm256_load_ps(w + 40);
580 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
581 vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi1x89ABCDEF, vk1x89ABCDEF));
582
583 const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
584 const __m256 vi2x89ABCDEF = _mm256_loadu_ps(i2 + 8);
585 i2 += 16;
586
587 const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
588 const __m256 vk2x89ABCDEF = _mm256_load_ps(w + 56);
589 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
590 vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi2x89ABCDEF, vk2x89ABCDEF));
591
592 const __m256 vi3x01234567 = _mm256_loadu_ps(i3);
593 const __m256 vi3x89ABCDEF = _mm256_loadu_ps(i3 + 8);
594 i3 += 16;
595
596 const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
597 const __m256 vk3x89ABCDEF = _mm256_load_ps(w + 72);
598 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567));
599 vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi3x89ABCDEF, vk3x89ABCDEF));
600
601 const __m256 vi4x01234567 = _mm256_loadu_ps(i4);
602 const __m256 vi4x89ABCDEF = _mm256_loadu_ps(i4 + 8);
603 i4 += 16;
604
605 const __m256 vk4x01234567 = _mm256_load_ps(w + 80);
606 const __m256 vk4x89ABCDEF = _mm256_load_ps(w + 88);
607 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));
608 vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi4x89ABCDEF, vk4x89ABCDEF));
609
610 const __m256 vi5x01234567 = _mm256_loadu_ps(i5);
611 const __m256 vi5x89ABCDEF = _mm256_loadu_ps(i5 + 8);
612 i5 += 16;
613
614 const __m256 vk5x01234567 = _mm256_load_ps(w + 96);
615 const __m256 vk5x89ABCDEF = _mm256_load_ps(w + 104);
616 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi5x01234567, vk5x01234567));
617 vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi5x89ABCDEF, vk5x89ABCDEF));
618
619 const __m256 vi6x01234567 = _mm256_loadu_ps(i6);
620 const __m256 vi6x89ABCDEF = _mm256_loadu_ps(i6 + 8);
621 i6 += 16;
622
623 const __m256 vk6x01234567 = _mm256_load_ps(w + 112);
624 const __m256 vk6x89ABCDEF = _mm256_load_ps(w + 120);
625 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi6x01234567, vk6x01234567));
626 vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi6x89ABCDEF, vk6x89ABCDEF));
627
628 const __m256 vi7x01234567 = _mm256_loadu_ps(i7);
629 const __m256 vi7x89ABCDEF = _mm256_loadu_ps(i7 + 8);
630 i7 += 16;
631
632 const __m256 vk7x01234567 = _mm256_load_ps(w + 128);
633 const __m256 vk7x89ABCDEF = _mm256_load_ps(w + 136);
634 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi7x01234567, vk7x01234567));
635 vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi7x89ABCDEF, vk7x89ABCDEF));
636
637 const __m256 vi8x01234567 = _mm256_loadu_ps(i8);
638 const __m256 vi8x89ABCDEF = _mm256_loadu_ps(i8 + 8);
639 i8 += 16;
640
641 const __m256 vk8x01234567 = _mm256_load_ps(w + 144);
642 const __m256 vk8x89ABCDEF = _mm256_load_ps(w + 152);
643 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567));
644 vacc89ABCDEFp0 = _mm256_add_ps(vacc89ABCDEFp0, _mm256_mul_ps(vi8x89ABCDEF, vk8x89ABCDEF));
645
646 w += 160;
647
648
649 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
650 __m256 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEFp0, vmin);
651 vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
652 vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vmax);
653
654 _mm256_storeu_ps(output, vacc01234567);
655 _mm256_storeu_ps(output + 8, vacc89ABCDEF);
656 output += 16;
657 }
658 for (; c >= 8; c -= 8) {
659 __m256 vacc01234567p0 = _mm256_load_ps(w);
660
661 const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
662 i0 += 8;
663
664 const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
665 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
666
667 const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
668 i1 += 8;
669
670 const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
671 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
672
673 const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
674 i2 += 8;
675
676 const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
677 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
678
679 const __m256 vi3x01234567 = _mm256_loadu_ps(i3);
680 i3 += 8;
681
682 const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
683 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567));
684
685 const __m256 vi4x01234567 = _mm256_loadu_ps(i4);
686 i4 += 8;
687
688 const __m256 vk4x01234567 = _mm256_load_ps(w + 80);
689 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));
690
691 const __m256 vi5x01234567 = _mm256_loadu_ps(i5);
692 i5 += 8;
693
694 const __m256 vk5x01234567 = _mm256_load_ps(w + 96);
695 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi5x01234567, vk5x01234567));
696
697 const __m256 vi6x01234567 = _mm256_loadu_ps(i6);
698 i6 += 8;
699
700 const __m256 vk6x01234567 = _mm256_load_ps(w + 112);
701 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi6x01234567, vk6x01234567));
702
703 const __m256 vi7x01234567 = _mm256_loadu_ps(i7);
704 i7 += 8;
705
706 const __m256 vk7x01234567 = _mm256_load_ps(w + 128);
707 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi7x01234567, vk7x01234567));
708
709 const __m256 vi8x01234567 = _mm256_loadu_ps(i8);
710 i8 += 8;
711
712 const __m256 vk8x01234567 = _mm256_load_ps(w + 144);
713 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567));
714
715 w += 8;
716
717
718 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
719 vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
720
721 _mm256_storeu_ps(output, vacc01234567);
722 output += 8;
723 }
724 if XNN_UNLIKELY(c != 0) {
725 assert(c >= 1);
726 assert(c <= 7);
      const __m256i vmask = _mm256_loadu_si256((const __m256i*) &params->avx.mask_table[7 - c]);
728
729 __m256 vacc01234567p0 = _mm256_load_ps(w);
730
731 const __m256 vi0x01234567 = _mm256_maskload_ps(i0, vmask);
732 const __m256 vk0x01234567 = _mm256_load_ps(w + 16);
733 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
734
735 const __m256 vi1x01234567 = _mm256_maskload_ps(i1, vmask);
736 const __m256 vk1x01234567 = _mm256_load_ps(w + 32);
737 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
738
739 const __m256 vi2x01234567 = _mm256_maskload_ps(i2, vmask);
740 const __m256 vk2x01234567 = _mm256_load_ps(w + 48);
741 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
742
743 const __m256 vi3x01234567 = _mm256_maskload_ps(i3, vmask);
744 const __m256 vk3x01234567 = _mm256_load_ps(w + 64);
745 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567));
746
747 const __m256 vi4x01234567 = _mm256_maskload_ps(i4, vmask);
748 const __m256 vk4x01234567 = _mm256_load_ps(w + 80);
749 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));
750
751 const __m256 vi5x01234567 = _mm256_maskload_ps(i5, vmask);
752 const __m256 vk5x01234567 = _mm256_load_ps(w + 96);
753 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi5x01234567, vk5x01234567));
754
755 const __m256 vi6x01234567 = _mm256_maskload_ps(i6, vmask);
756 const __m256 vk6x01234567 = _mm256_load_ps(w + 112);
757 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi6x01234567, vk6x01234567));
758
759 const __m256 vi7x01234567 = _mm256_maskload_ps(i7, vmask);
760 const __m256 vk7x01234567 = _mm256_load_ps(w + 128);
761 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi7x01234567, vk7x01234567));
762
763 const __m256 vi8x01234567 = _mm256_maskload_ps(i8, vmask);
764 const __m256 vk8x01234567 = _mm256_load_ps(w + 144);
765 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567));
766
767
768 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
769 vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
770
771 __m128 vacc0123 = _mm256_castps256_ps128(vacc01234567);
772 if (c & 4) {
773 _mm_storeu_ps(output, vacc0123);
774 vacc0123 = _mm256_extractf128_ps(vacc01234567, 1);
775 output += 4;
776 }
777 if (c & 2) {
778 _mm_storel_pi((__m64*) output, vacc0123);
779 vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
780 output += 2;
781 }
782 if (c & 1) {
783 _mm_store_ss(output, vacc0123);
784 output += 1;
785 }
786 }
787
788 output = (float*) ((uintptr_t) output + output_increment);
789 } while (--output_width != 0);
790 }
791
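// 25-tap variant (typically used for 5x5 depthwise convolutions), processing up to
// 8 channels per iteration: weight tiles of 8 bias values plus 25 x 8 tap values
// (w advances by 208 floats per tile), with the same masked remainder handling as
// the kernels above.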
void xnn_f32_dwconv_minmax_ukernel_up8x25__avx(
    size_t channels,
    size_t output_width,
    const float** input,
    const float* weights,
    float* output,
    size_t input_stride,
    size_t output_increment,
    size_t input_offset,
    const float* zero,
    const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
803 {
804 assert(channels != 0);
805 assert(output_width != 0);
806
807 const __m256 vmax = _mm256_load_ps(params->avx.max);
808 const __m256 vmin = _mm256_load_ps(params->avx.min);
809 do {
810 const float* i0 = input[0];
811 assert(i0 != NULL);
812 if XNN_UNPREDICTABLE(i0 != zero) {
813 i0 = (const float*) ((uintptr_t) i0 + input_offset);
814 }
815 const float* i1 = input[1];
816 assert(i1 != NULL);
817 if XNN_UNPREDICTABLE(i1 != zero) {
818 i1 = (const float*) ((uintptr_t) i1 + input_offset);
819 }
820 const float* i2 = input[2];
821 assert(i2 != NULL);
822 if XNN_UNPREDICTABLE(i2 != zero) {
823 i2 = (const float*) ((uintptr_t) i2 + input_offset);
824 }
825 const float* i3 = input[3];
826 assert(i3 != NULL);
827 if XNN_UNPREDICTABLE(i3 != zero) {
828 i3 = (const float*) ((uintptr_t) i3 + input_offset);
829 }
830 const float* i4 = input[4];
831 assert(i4 != NULL);
832 if XNN_UNPREDICTABLE(i4 != zero) {
833 i4 = (const float*) ((uintptr_t) i4 + input_offset);
834 }
835 const float* i5 = input[5];
836 assert(i5 != NULL);
837 if XNN_UNPREDICTABLE(i5 != zero) {
838 i5 = (const float*) ((uintptr_t) i5 + input_offset);
839 }
840 const float* i6 = input[6];
841 assert(i6 != NULL);
842 if XNN_UNPREDICTABLE(i6 != zero) {
843 i6 = (const float*) ((uintptr_t) i6 + input_offset);
844 }
845 const float* i7 = input[7];
846 assert(i7 != NULL);
847 if XNN_UNPREDICTABLE(i7 != zero) {
848 i7 = (const float*) ((uintptr_t) i7 + input_offset);
849 }
850 const float* i8 = input[8];
851 assert(i8 != NULL);
852 if XNN_UNPREDICTABLE(i8 != zero) {
853 i8 = (const float*) ((uintptr_t) i8 + input_offset);
854 }
855 const float* i9 = input[9];
856 assert(i9 != NULL);
857 if XNN_UNPREDICTABLE(i9 != zero) {
858 i9 = (const float*) ((uintptr_t) i9 + input_offset);
859 }
860 const float* i10 = input[10];
861 assert(i10 != NULL);
862 if XNN_UNPREDICTABLE(i10 != zero) {
863 i10 = (const float*) ((uintptr_t) i10 + input_offset);
864 }
865 const float* i11 = input[11];
866 assert(i11 != NULL);
867 if XNN_UNPREDICTABLE(i11 != zero) {
868 i11 = (const float*) ((uintptr_t) i11 + input_offset);
869 }
870 const float* i12 = input[12];
871 assert(i12 != NULL);
872 if XNN_UNPREDICTABLE(i12 != zero) {
873 i12 = (const float*) ((uintptr_t) i12 + input_offset);
874 }
875 const float* i13 = input[13];
876 assert(i13 != NULL);
877 if XNN_UNPREDICTABLE(i13 != zero) {
878 i13 = (const float*) ((uintptr_t) i13 + input_offset);
879 }
880 const float* i14 = input[14];
881 assert(i14 != NULL);
882 if XNN_UNPREDICTABLE(i14 != zero) {
883 i14 = (const float*) ((uintptr_t) i14 + input_offset);
884 }
885 const float* i15 = input[15];
886 assert(i15 != NULL);
887 if XNN_UNPREDICTABLE(i15 != zero) {
888 i15 = (const float*) ((uintptr_t) i15 + input_offset);
889 }
890 const float* i16 = input[16];
891 assert(i16 != NULL);
892 if XNN_UNPREDICTABLE(i16 != zero) {
893 i16 = (const float*) ((uintptr_t) i16 + input_offset);
894 }
895 const float* i17 = input[17];
896 assert(i17 != NULL);
897 if XNN_UNPREDICTABLE(i17 != zero) {
898 i17 = (const float*) ((uintptr_t) i17 + input_offset);
899 }
900 const float* i18 = input[18];
901 assert(i18 != NULL);
902 if XNN_UNPREDICTABLE(i18 != zero) {
903 i18 = (const float*) ((uintptr_t) i18 + input_offset);
904 }
905 const float* i19 = input[19];
906 assert(i19 != NULL);
907 if XNN_UNPREDICTABLE(i19 != zero) {
908 i19 = (const float*) ((uintptr_t) i19 + input_offset);
909 }
910 const float* i20 = input[20];
911 assert(i20 != NULL);
912 if XNN_UNPREDICTABLE(i20 != zero) {
913 i20 = (const float*) ((uintptr_t) i20 + input_offset);
914 }
915 const float* i21 = input[21];
916 assert(i21 != NULL);
917 if XNN_UNPREDICTABLE(i21 != zero) {
918 i21 = (const float*) ((uintptr_t) i21 + input_offset);
919 }
920 const float* i22 = input[22];
921 assert(i22 != NULL);
922 if XNN_UNPREDICTABLE(i22 != zero) {
923 i22 = (const float*) ((uintptr_t) i22 + input_offset);
924 }
925 const float* i23 = input[23];
926 assert(i23 != NULL);
927 if XNN_UNPREDICTABLE(i23 != zero) {
928 i23 = (const float*) ((uintptr_t) i23 + input_offset);
929 }
930 const float* i24 = input[24];
931 assert(i24 != NULL);
932 if XNN_UNPREDICTABLE(i24 != zero) {
933 i24 = (const float*) ((uintptr_t) i24 + input_offset);
934 }
935 input = (const float**) ((uintptr_t) input + input_stride);
936
937 size_t c = channels;
938 const float* w = weights;
939 for (; c >= 8; c -= 8) {
940 __m256 vacc01234567p0 = _mm256_load_ps(w);
941
942
943 const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
944 i0 += 8;
945
946 const __m256 vk0x01234567 = _mm256_load_ps(w + 8);
947 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
948
949 const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
950 i1 += 8;
951
952 const __m256 vk1x01234567 = _mm256_load_ps(w + 16);
953 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
954
955 const __m256 vi2x01234567 = _mm256_loadu_ps(i2);
956 i2 += 8;
957
958 const __m256 vk2x01234567 = _mm256_load_ps(w + 24);
959 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
960
961 const __m256 vi3x01234567 = _mm256_loadu_ps(i3);
962 i3 += 8;
963
964 const __m256 vk3x01234567 = _mm256_load_ps(w + 32);
965 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567));
966
967 const __m256 vi4x01234567 = _mm256_loadu_ps(i4);
968 i4 += 8;
969
970 const __m256 vk4x01234567 = _mm256_load_ps(w + 40);
971 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));
972
973 const __m256 vi5x01234567 = _mm256_loadu_ps(i5);
974 i5 += 8;
975
976 const __m256 vk5x01234567 = _mm256_load_ps(w + 48);
977 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi5x01234567, vk5x01234567));
978
979 const __m256 vi6x01234567 = _mm256_loadu_ps(i6);
980 i6 += 8;
981
982 const __m256 vk6x01234567 = _mm256_load_ps(w + 56);
983 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi6x01234567, vk6x01234567));
984
985 const __m256 vi7x01234567 = _mm256_loadu_ps(i7);
986 i7 += 8;
987
988 const __m256 vk7x01234567 = _mm256_load_ps(w + 64);
989 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi7x01234567, vk7x01234567));
990
991 const __m256 vi8x01234567 = _mm256_loadu_ps(i8);
992 i8 += 8;
993
994 const __m256 vk8x01234567 = _mm256_load_ps(w + 72);
995 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567));
996
997 const __m256 vi9x01234567 = _mm256_loadu_ps(i9);
998 i9 += 8;
999
1000 const __m256 vk9x01234567 = _mm256_load_ps(w + 80);
1001 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi9x01234567, vk9x01234567));
1002
1003 const __m256 vi10x01234567 = _mm256_loadu_ps(i10);
1004 i10 += 8;
1005
1006 const __m256 vk10x01234567 = _mm256_load_ps(w + 88);
1007 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi10x01234567, vk10x01234567));
1008
1009 const __m256 vi11x01234567 = _mm256_loadu_ps(i11);
1010 i11 += 8;
1011
1012 const __m256 vk11x01234567 = _mm256_load_ps(w + 96);
1013 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi11x01234567, vk11x01234567));
1014
1015 const __m256 vi12x01234567 = _mm256_loadu_ps(i12);
1016 i12 += 8;
1017
1018 const __m256 vk12x01234567 = _mm256_load_ps(w + 104);
1019 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi12x01234567, vk12x01234567));
1020
1021 const __m256 vi13x01234567 = _mm256_loadu_ps(i13);
1022 i13 += 8;
1023
1024 const __m256 vk13x01234567 = _mm256_load_ps(w + 112);
1025 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi13x01234567, vk13x01234567));
1026
1027 const __m256 vi14x01234567 = _mm256_loadu_ps(i14);
1028 i14 += 8;
1029
1030 const __m256 vk14x01234567 = _mm256_load_ps(w + 120);
1031 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi14x01234567, vk14x01234567));
1032
1033 const __m256 vi15x01234567 = _mm256_loadu_ps(i15);
1034 i15 += 8;
1035
1036 const __m256 vk15x01234567 = _mm256_load_ps(w + 128);
1037 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi15x01234567, vk15x01234567));
1038
1039 const __m256 vi16x01234567 = _mm256_loadu_ps(i16);
1040 i16 += 8;
1041
1042 const __m256 vk16x01234567 = _mm256_load_ps(w + 136);
1043 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi16x01234567, vk16x01234567));
1044
1045 const __m256 vi17x01234567 = _mm256_loadu_ps(i17);
1046 i17 += 8;
1047
1048 const __m256 vk17x01234567 = _mm256_load_ps(w + 144);
1049 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi17x01234567, vk17x01234567));
1050
1051 const __m256 vi18x01234567 = _mm256_loadu_ps(i18);
1052 i18 += 8;
1053
1054 const __m256 vk18x01234567 = _mm256_load_ps(w + 152);
1055 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi18x01234567, vk18x01234567));
1056
1057 const __m256 vi19x01234567 = _mm256_loadu_ps(i19);
1058 i19 += 8;
1059
1060 const __m256 vk19x01234567 = _mm256_load_ps(w + 160);
1061 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi19x01234567, vk19x01234567));
1062
1063 const __m256 vi20x01234567 = _mm256_loadu_ps(i20);
1064 i20 += 8;
1065
1066 const __m256 vk20x01234567 = _mm256_load_ps(w + 168);
1067 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi20x01234567, vk20x01234567));
1068
1069 const __m256 vi21x01234567 = _mm256_loadu_ps(i21);
1070 i21 += 8;
1071
1072 const __m256 vk21x01234567 = _mm256_load_ps(w + 176);
1073 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi21x01234567, vk21x01234567));
1074
1075 const __m256 vi22x01234567 = _mm256_loadu_ps(i22);
1076 i22 += 8;
1077
1078 const __m256 vk22x01234567 = _mm256_load_ps(w + 184);
1079 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi22x01234567, vk22x01234567));
1080
1081 const __m256 vi23x01234567 = _mm256_loadu_ps(i23);
1082 i23 += 8;
1083
1084 const __m256 vk23x01234567 = _mm256_load_ps(w + 192);
1085 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi23x01234567, vk23x01234567));
1086
1087 const __m256 vi24x01234567 = _mm256_loadu_ps(i24);
1088 i24 += 8;
1089
1090 const __m256 vk24x01234567 = _mm256_load_ps(w + 200);
1091 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi24x01234567, vk24x01234567));
1092
1093 w += 208;
1094
1095
1096 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
1097 vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
1098
1099 _mm256_storeu_ps(output, vacc01234567);
1100 output += 8;
1101 }
1102 if XNN_UNLIKELY(c != 0) {
1103 assert(c >= 1);
1104 assert(c <= 7);
      const __m256i vmask = _mm256_loadu_si256((const __m256i*) &params->avx.mask_table[7 - c]);
1106
1107 __m256 vacc01234567p0 = _mm256_load_ps(w);
1108
1109 const __m256 vi0x01234567 = _mm256_maskload_ps(i0, vmask);
1110 const __m256 vk0x01234567 = _mm256_load_ps(w + 8);
1111 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi0x01234567, vk0x01234567));
1112
1113 const __m256 vi1x01234567 = _mm256_maskload_ps(i1, vmask);
1114 const __m256 vk1x01234567 = _mm256_load_ps(w + 16);
1115 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi1x01234567, vk1x01234567));
1116
1117 const __m256 vi2x01234567 = _mm256_maskload_ps(i2, vmask);
1118 const __m256 vk2x01234567 = _mm256_load_ps(w + 24);
1119 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi2x01234567, vk2x01234567));
1120
1121 const __m256 vi3x01234567 = _mm256_maskload_ps(i3, vmask);
1122 const __m256 vk3x01234567 = _mm256_load_ps(w + 32);
1123 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi3x01234567, vk3x01234567));
1124
1125 const __m256 vi4x01234567 = _mm256_maskload_ps(i4, vmask);
1126 const __m256 vk4x01234567 = _mm256_load_ps(w + 40);
1127 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi4x01234567, vk4x01234567));
1128
1129 const __m256 vi5x01234567 = _mm256_maskload_ps(i5, vmask);
1130 const __m256 vk5x01234567 = _mm256_load_ps(w + 48);
1131 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi5x01234567, vk5x01234567));
1132
1133 const __m256 vi6x01234567 = _mm256_maskload_ps(i6, vmask);
1134 const __m256 vk6x01234567 = _mm256_load_ps(w + 56);
1135 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi6x01234567, vk6x01234567));
1136
1137 const __m256 vi7x01234567 = _mm256_maskload_ps(i7, vmask);
1138 const __m256 vk7x01234567 = _mm256_load_ps(w + 64);
1139 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi7x01234567, vk7x01234567));
1140
1141 const __m256 vi8x01234567 = _mm256_maskload_ps(i8, vmask);
1142 const __m256 vk8x01234567 = _mm256_load_ps(w + 72);
1143 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi8x01234567, vk8x01234567));
1144
1145 const __m256 vi9x01234567 = _mm256_maskload_ps(i9, vmask);
1146 const __m256 vk9x01234567 = _mm256_load_ps(w + 80);
1147 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi9x01234567, vk9x01234567));
1148
1149 const __m256 vi10x01234567 = _mm256_maskload_ps(i10, vmask);
1150 const __m256 vk10x01234567 = _mm256_load_ps(w + 88);
1151 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi10x01234567, vk10x01234567));
1152
1153 const __m256 vi11x01234567 = _mm256_maskload_ps(i11, vmask);
1154 const __m256 vk11x01234567 = _mm256_load_ps(w + 96);
1155 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi11x01234567, vk11x01234567));
1156
1157 const __m256 vi12x01234567 = _mm256_maskload_ps(i12, vmask);
1158 const __m256 vk12x01234567 = _mm256_load_ps(w + 104);
1159 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi12x01234567, vk12x01234567));
1160
1161 const __m256 vi13x01234567 = _mm256_maskload_ps(i13, vmask);
1162 const __m256 vk13x01234567 = _mm256_load_ps(w + 112);
1163 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi13x01234567, vk13x01234567));
1164
1165 const __m256 vi14x01234567 = _mm256_maskload_ps(i14, vmask);
1166 const __m256 vk14x01234567 = _mm256_load_ps(w + 120);
1167 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi14x01234567, vk14x01234567));
1168
1169 const __m256 vi15x01234567 = _mm256_maskload_ps(i15, vmask);
1170 const __m256 vk15x01234567 = _mm256_load_ps(w + 128);
1171 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi15x01234567, vk15x01234567));
1172
1173 const __m256 vi16x01234567 = _mm256_maskload_ps(i16, vmask);
1174 const __m256 vk16x01234567 = _mm256_load_ps(w + 136);
1175 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi16x01234567, vk16x01234567));
1176
1177 const __m256 vi17x01234567 = _mm256_maskload_ps(i17, vmask);
1178 const __m256 vk17x01234567 = _mm256_load_ps(w + 144);
1179 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi17x01234567, vk17x01234567));
1180
1181 const __m256 vi18x01234567 = _mm256_maskload_ps(i18, vmask);
1182 const __m256 vk18x01234567 = _mm256_load_ps(w + 152);
1183 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi18x01234567, vk18x01234567));
1184
1185 const __m256 vi19x01234567 = _mm256_maskload_ps(i19, vmask);
1186 const __m256 vk19x01234567 = _mm256_load_ps(w + 160);
1187 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi19x01234567, vk19x01234567));
1188
1189 const __m256 vi20x01234567 = _mm256_maskload_ps(i20, vmask);
1190 const __m256 vk20x01234567 = _mm256_load_ps(w + 168);
1191 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi20x01234567, vk20x01234567));
1192
1193 const __m256 vi21x01234567 = _mm256_maskload_ps(i21, vmask);
1194 const __m256 vk21x01234567 = _mm256_load_ps(w + 176);
1195 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi21x01234567, vk21x01234567));
1196
1197 const __m256 vi22x01234567 = _mm256_maskload_ps(i22, vmask);
1198 const __m256 vk22x01234567 = _mm256_load_ps(w + 184);
1199 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi22x01234567, vk22x01234567));
1200
1201 const __m256 vi23x01234567 = _mm256_maskload_ps(i23, vmask);
1202 const __m256 vk23x01234567 = _mm256_load_ps(w + 192);
1203 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi23x01234567, vk23x01234567));
1204
1205 const __m256 vi24x01234567 = _mm256_maskload_ps(i24, vmask);
1206 const __m256 vk24x01234567 = _mm256_load_ps(w + 200);
1207 vacc01234567p0 = _mm256_add_ps(vacc01234567p0, _mm256_mul_ps(vi24x01234567, vk24x01234567));
1208
1209
1210 __m256 vacc01234567 = _mm256_max_ps(vacc01234567p0, vmin);
1211 vacc01234567 = _mm256_min_ps(vacc01234567, vmax);
1212
1213 __m128 vacc0123 = _mm256_castps256_ps128(vacc01234567);
1214 if (c & 4) {
1215 _mm_storeu_ps(output, vacc0123);
1216 vacc0123 = _mm256_extractf128_ps(vacc01234567, 1);
1217 output += 4;
1218 }
1219 if (c & 2) {
1220 _mm_storel_pi((__m64*) output, vacc0123);
1221 vacc0123 = _mm_movehl_ps(vacc0123, vacc0123);
1222 output += 2;
1223 }
1224 if (c & 1) {
1225 _mm_store_ss(output, vacc0123);
1226 output += 1;
1227 }
1228 }
1229
1230 output = (float*) ((uintptr_t) output + output_increment);
1231 } while (--output_width != 0);
1232 }
1233
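// Converts single-precision floats to IEEE half-precision (binary16) values, 24 per
// main-loop iteration. The strategy visible in the code: the absolute value is
// scaled by scale_to_inf and scale_to_zero so that overflow saturates and values
// below the half-precision normal range are rounded via an exponent-bias trick, the
// 16-bit exponent and mantissa fields are then extracted and re-packed with
// _mm_packs_epi32, NaN inputs (detected by comparing against expw_max) are replaced
// with the canonical half-precision NaN from the params, and the packed sign bits
// are OR-ed back in at the end.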
void xnn_f32_f16_vcvt_ukernel__avx_x24(
    size_t n,
    const float* input,
    void* output,
    const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1239 {
1240 assert(n != 0);
1241 assert(n % sizeof(float) == 0);
1242 assert(input != NULL);
1243 assert(output != NULL);
1244
1245 const __m128 vnonsign_mask = _mm_load_ps((const float*) params->sse2.nonsign_mask);
1246 const __m128i vexp_bias = _mm_load_si128((const __m128i*) params->sse2.exp_bias);
1247 const __m128 vscale_to_inf = _mm_load_ps(params->sse2.scale_to_inf);
1248 const __m128i vexpw_max = _mm_load_si128((const __m128i*) params->sse2.expw_max);
1249 const __m128 vscale_to_zero = _mm_load_ps(params->sse2.scale_to_zero);
1250 const __m128i vbias_min = _mm_load_si128((const __m128i*) params->sse2.bias_min);
1251 const __m128i vmanth_mask = _mm_load_si128((const __m128i*) params->sse2.manth_mask);
1252 const __m128i vexph_mask = _mm_load_si128((const __m128i*) params->sse2.exph_mask);
1253 const __m128i vnanh = _mm_load_si128((const __m128i*) params->sse2.nanh);
1254
1255 uint16_t* o = (uint16_t*) output;
1256 for (; n >= 24 * sizeof(float); n -= 24 * sizeof(float)) {
1257 const __m128 vx0 = _mm_loadu_ps(input);
1258 const __m128 vx1 = _mm_loadu_ps(input + 4);
1259 const __m128 vx2 = _mm_loadu_ps(input + 8);
1260 const __m128 vx3 = _mm_loadu_ps(input + 12);
1261 const __m128 vx4 = _mm_loadu_ps(input + 16);
1262 const __m128 vx5 = _mm_loadu_ps(input + 20);
1263 input += 24;
1264
1265 const __m128 vabsx0 = _mm_and_ps(vx0, vnonsign_mask);
1266 const __m128 vabsx1 = _mm_and_ps(vx1, vnonsign_mask);
1267 const __m128 vabsx2 = _mm_and_ps(vx2, vnonsign_mask);
1268 const __m128 vabsx3 = _mm_and_ps(vx3, vnonsign_mask);
1269 const __m128 vabsx4 = _mm_and_ps(vx4, vnonsign_mask);
1270 const __m128 vabsx5 = _mm_and_ps(vx5, vnonsign_mask);
1271
1272 const __m128 vsignx0 = _mm_xor_ps(vx0, vabsx0);
1273 const __m128 vsignx1 = _mm_xor_ps(vx1, vabsx1);
1274 const __m128 vsignx2 = _mm_xor_ps(vx2, vabsx2);
1275 const __m128 vsignx3 = _mm_xor_ps(vx3, vabsx3);
1276 const __m128 vsignx4 = _mm_xor_ps(vx4, vabsx4);
1277 const __m128 vsignx5 = _mm_xor_ps(vx5, vabsx5);
1278
1279 __m128i vbias0 = _mm_add_epi32(_mm_castps_si128(vabsx0), vexp_bias);
1280 __m128i vbias1 = _mm_add_epi32(_mm_castps_si128(vabsx1), vexp_bias);
1281 __m128i vbias2 = _mm_add_epi32(_mm_castps_si128(vabsx2), vexp_bias);
1282 __m128i vbias3 = _mm_add_epi32(_mm_castps_si128(vabsx3), vexp_bias);
1283 __m128i vbias4 = _mm_add_epi32(_mm_castps_si128(vabsx4), vexp_bias);
1284 __m128i vbias5 = _mm_add_epi32(_mm_castps_si128(vabsx5), vexp_bias);
1285
1286 __m128 vf0 = _mm_mul_ps(vabsx0, vscale_to_inf);
1287 __m128 vf1 = _mm_mul_ps(vabsx1, vscale_to_inf);
1288 __m128 vf2 = _mm_mul_ps(vabsx2, vscale_to_inf);
1289 __m128 vf3 = _mm_mul_ps(vabsx3, vscale_to_inf);
1290 __m128 vf4 = _mm_mul_ps(vabsx4, vscale_to_inf);
1291 __m128 vf5 = _mm_mul_ps(vabsx5, vscale_to_inf);
1292
1293 const __m128i vnanmaskw0 = _mm_cmpgt_epi32(_mm_castps_si128(vabsx0), vexpw_max);
1294 const __m128i vnanmaskw1 = _mm_cmpgt_epi32(_mm_castps_si128(vabsx1), vexpw_max);
1295 const __m128i vnanmaskw2 = _mm_cmpgt_epi32(_mm_castps_si128(vabsx2), vexpw_max);
1296 const __m128i vnanmaskw3 = _mm_cmpgt_epi32(_mm_castps_si128(vabsx3), vexpw_max);
1297 const __m128i vnanmaskw4 = _mm_cmpgt_epi32(_mm_castps_si128(vabsx4), vexpw_max);
1298 const __m128i vnanmaskw5 = _mm_cmpgt_epi32(_mm_castps_si128(vabsx5), vexpw_max);
1299
1300 vbias0 = _mm_and_si128(vbias0, vexpw_max);
1301 vbias1 = _mm_and_si128(vbias1, vexpw_max);
1302 vbias2 = _mm_and_si128(vbias2, vexpw_max);
1303 vbias3 = _mm_and_si128(vbias3, vexpw_max);
1304 vbias4 = _mm_and_si128(vbias4, vexpw_max);
1305 vbias5 = _mm_and_si128(vbias5, vexpw_max);
1306
1307 vf0 = _mm_mul_ps(vf0, vscale_to_zero);
1308 vf1 = _mm_mul_ps(vf1, vscale_to_zero);
1309 vf2 = _mm_mul_ps(vf2, vscale_to_zero);
1310 vf3 = _mm_mul_ps(vf3, vscale_to_zero);
1311 vf4 = _mm_mul_ps(vf4, vscale_to_zero);
1312 vf5 = _mm_mul_ps(vf5, vscale_to_zero);
1313
1314 const __m128i vnanmaskh0 = _mm_packs_epi32(vnanmaskw0, vnanmaskw1);
1315 const __m128i vnanmaskh1 = _mm_packs_epi32(vnanmaskw2, vnanmaskw3);
1316 const __m128i vnanmaskh2 = _mm_packs_epi32(vnanmaskw4, vnanmaskw5);
1317
1318 const __m128i vsignh0 = _mm_packs_epi32(_mm_castps_si128(vsignx0), _mm_castps_si128(vsignx1));
1319 const __m128i vsignh1 = _mm_packs_epi32(_mm_castps_si128(vsignx2), _mm_castps_si128(vsignx3));
1320 const __m128i vsignh2 = _mm_packs_epi32(_mm_castps_si128(vsignx4), _mm_castps_si128(vsignx5));
1321
1322 vbias0 = _mm_max_epi16(vbias0, vbias_min);
1323 vbias1 = _mm_max_epi16(vbias1, vbias_min);
1324 vbias2 = _mm_max_epi16(vbias2, vbias_min);
1325 vbias3 = _mm_max_epi16(vbias3, vbias_min);
1326 vbias4 = _mm_max_epi16(vbias4, vbias_min);
1327 vbias5 = _mm_max_epi16(vbias5, vbias_min);
1328
1329
1330 vf0 = _mm_add_ps(vf0, _mm_castsi128_ps(vbias0));
1331 vf1 = _mm_add_ps(vf1, _mm_castsi128_ps(vbias1));
1332 vf2 = _mm_add_ps(vf2, _mm_castsi128_ps(vbias2));
1333 vf3 = _mm_add_ps(vf3, _mm_castsi128_ps(vbias3));
1334 vf4 = _mm_add_ps(vf4, _mm_castsi128_ps(vbias4));
1335 vf5 = _mm_add_ps(vf5, _mm_castsi128_ps(vbias5));
1336
1337
1338 __m128i vexpw0 = _mm_srli_epi32(_mm_castps_si128(vf0), 13);
1339 __m128i vexpw1 = _mm_srli_epi32(_mm_castps_si128(vf1), 13);
1340 __m128i vexpw2 = _mm_srli_epi32(_mm_castps_si128(vf2), 13);
1341 __m128i vexpw3 = _mm_srli_epi32(_mm_castps_si128(vf3), 13);
1342 __m128i vexpw4 = _mm_srli_epi32(_mm_castps_si128(vf4), 13);
1343 __m128i vexpw5 = _mm_srli_epi32(_mm_castps_si128(vf5), 13);
1344
1345 const __m128i vmantw0 = _mm_and_si128(_mm_castps_si128(vf0), vmanth_mask);
1346 const __m128i vmantw1 = _mm_and_si128(_mm_castps_si128(vf1), vmanth_mask);
1347 const __m128i vmantw2 = _mm_and_si128(_mm_castps_si128(vf2), vmanth_mask);
1348 const __m128i vmantw3 = _mm_and_si128(_mm_castps_si128(vf3), vmanth_mask);
1349 const __m128i vmantw4 = _mm_and_si128(_mm_castps_si128(vf4), vmanth_mask);
1350 const __m128i vmantw5 = _mm_and_si128(_mm_castps_si128(vf5), vmanth_mask);
1351
1352 vexpw0 = _mm_and_si128(vexpw0, vexph_mask);
1353 vexpw1 = _mm_and_si128(vexpw1, vexph_mask);
1354 vexpw2 = _mm_and_si128(vexpw2, vexph_mask);
1355 vexpw3 = _mm_and_si128(vexpw3, vexph_mask);
1356 vexpw4 = _mm_and_si128(vexpw4, vexph_mask);
1357 vexpw5 = _mm_and_si128(vexpw5, vexph_mask);
1358
1359 const __m128i vnonsignw0 = _mm_add_epi32(vmantw0, vexpw0);
1360 const __m128i vnonsignw1 = _mm_add_epi32(vmantw1, vexpw1);
1361 const __m128i vnonsignw2 = _mm_add_epi32(vmantw2, vexpw2);
1362 const __m128i vnonsignw3 = _mm_add_epi32(vmantw3, vexpw3);
1363 const __m128i vnonsignw4 = _mm_add_epi32(vmantw4, vexpw4);
1364 const __m128i vnonsignw5 = _mm_add_epi32(vmantw5, vexpw5);
1365
1366 const __m128i vnonsignh0 = _mm_packs_epi32(vnonsignw0, vnonsignw1);
1367 const __m128i vnonsignh1 = _mm_packs_epi32(vnonsignw2, vnonsignw3);
1368 const __m128i vnonsignh2 = _mm_packs_epi32(vnonsignw4, vnonsignw5);
1369
1370 const __m128i vabsh0 = _mm_blendv_epi8(vnonsignh0, vnanh, vnanmaskh0);
1371 const __m128i vabsh1 = _mm_blendv_epi8(vnonsignh1, vnanh, vnanmaskh1);
1372 const __m128i vabsh2 = _mm_blendv_epi8(vnonsignh2, vnanh, vnanmaskh2);
1373
1374 const __m128i vh0 = _mm_or_si128(vabsh0, vsignh0);
1375 const __m128i vh1 = _mm_or_si128(vabsh1, vsignh1);
1376 const __m128i vh2 = _mm_or_si128(vabsh2, vsignh2);
1377
1378 _mm_storeu_si128((__m128i*) o, vh0);
1379 _mm_storeu_si128((__m128i*) (o + 8), vh1);
1380 _mm_storeu_si128((__m128i*) (o + 16), vh2);
1381 o += 24;
1382 }
1383 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
1384 const __m128 vx_lo = _mm_loadu_ps(input);
1385 const __m128 vx_hi = _mm_loadu_ps(input + 4);
1386 input += 8;
1387
1388 const __m128 vabsx_lo = _mm_and_ps(vx_lo, vnonsign_mask);
1389 const __m128 vabsx_hi = _mm_and_ps(vx_hi, vnonsign_mask);
1390
1391 const __m128 vsignx_lo = _mm_xor_ps(vx_lo, vabsx_lo);
1392 const __m128 vsignx_hi = _mm_xor_ps(vx_hi, vabsx_hi);
1393 __m128i vbias_lo = _mm_add_epi32(_mm_castps_si128(vabsx_lo), vexp_bias);
1394 __m128i vbias_hi = _mm_add_epi32(_mm_castps_si128(vabsx_hi), vexp_bias);
1395 __m128 vf_lo = _mm_mul_ps(vabsx_lo, vscale_to_inf);
1396 __m128 vf_hi = _mm_mul_ps(vabsx_hi, vscale_to_inf);
1397 const __m128i vnanmaskw_lo = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_lo), vexpw_max);
1398 const __m128i vnanmaskw_hi = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_hi), vexpw_max);
1399
1400 vbias_lo = _mm_and_si128(vbias_lo, vexpw_max);
1401 vbias_hi = _mm_and_si128(vbias_hi, vexpw_max);
1402 vf_lo = _mm_mul_ps(vf_lo, vscale_to_zero);
1403 vf_hi = _mm_mul_ps(vf_hi, vscale_to_zero);
1404 const __m128i vnanmaskh = _mm_packs_epi32(vnanmaskw_lo, vnanmaskw_hi);
1405 const __m128i vsignh = _mm_packs_epi32(_mm_castps_si128(vsignx_lo), _mm_castps_si128(vsignx_hi));
1406
1407 vbias_lo = _mm_max_epi16(vbias_lo, vbias_min);
1408 vbias_hi = _mm_max_epi16(vbias_hi, vbias_min);
1409
1410 vf_lo = _mm_add_ps(vf_lo, _mm_castsi128_ps(vbias_lo));
1411 vf_hi = _mm_add_ps(vf_hi, _mm_castsi128_ps(vbias_hi));
1412
1413 __m128i vexpw_lo = _mm_srli_epi32(_mm_castps_si128(vf_lo), 13);
1414 __m128i vexpw_hi = _mm_srli_epi32(_mm_castps_si128(vf_hi), 13);
1415 const __m128i vmantw_lo = _mm_and_si128(_mm_castps_si128(vf_lo), vmanth_mask);
1416 const __m128i vmantw_hi = _mm_and_si128(_mm_castps_si128(vf_hi), vmanth_mask);
1417
1418 vexpw_lo = _mm_and_si128(vexpw_lo, vexph_mask);
1419 vexpw_hi = _mm_and_si128(vexpw_hi, vexph_mask);
1420
1421 const __m128i vnonsignw_lo = _mm_add_epi32(vmantw_lo, vexpw_lo);
1422 const __m128i vnonsignw_hi = _mm_add_epi32(vmantw_hi, vexpw_hi);
1423
1424 const __m128i vnonsignh = _mm_packs_epi32(vnonsignw_lo, vnonsignw_hi);
1425
1426 const __m128i vabsh = _mm_blendv_epi8(vnonsignh, vnanh, vnanmaskh);
1427
1428 const __m128i vh = _mm_or_si128(vabsh, vsignh);
1429
1430 _mm_storeu_si128((__m128i*) o, vh);
1431 o += 8;
1432 }
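  // Tail: convert the final 1-7 floats. When fewer than 4 remain, input_hi aliases input, so the high half
  // merely duplicates the low half; the partial stores below write only the valid outputs.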
1433 if XNN_UNPREDICTABLE(n != 0) {
1434 const __m128 vx_lo = _mm_loadu_ps(input);
1435 const float* input_hi = (const float*) ((uintptr_t) input + (n & (4 * sizeof(float))));
1436 const __m128 vx_hi = _mm_loadu_ps(input_hi);
1437
1438 const __m128 vabsx_lo = _mm_and_ps(vx_lo, vnonsign_mask);
1439 const __m128 vabsx_hi = _mm_and_ps(vx_hi, vnonsign_mask);
1440
1441 const __m128 vsignx_lo = _mm_xor_ps(vx_lo, vabsx_lo);
1442 const __m128 vsignx_hi = _mm_xor_ps(vx_hi, vabsx_hi);
1443 __m128i vbias_lo = _mm_add_epi32(_mm_castps_si128(vabsx_lo), vexp_bias);
1444 __m128i vbias_hi = _mm_add_epi32(_mm_castps_si128(vabsx_hi), vexp_bias);
1445 __m128 vf_lo = _mm_mul_ps(vabsx_lo, vscale_to_inf);
1446 __m128 vf_hi = _mm_mul_ps(vabsx_hi, vscale_to_inf);
1447 const __m128i vnanmaskw_lo = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_lo), vexpw_max);
1448 const __m128i vnanmaskw_hi = _mm_cmpgt_epi32(_mm_castps_si128(vabsx_hi), vexpw_max);
1449
1450 vbias_lo = _mm_and_si128(vbias_lo, vexpw_max);
1451 vbias_hi = _mm_and_si128(vbias_hi, vexpw_max);
1452 vf_lo = _mm_mul_ps(vf_lo, vscale_to_zero);
1453 vf_hi = _mm_mul_ps(vf_hi, vscale_to_zero);
1454 const __m128i vnanmaskh = _mm_packs_epi32(vnanmaskw_lo, vnanmaskw_hi);
1455 const __m128i vsignh = _mm_packs_epi32(_mm_castps_si128(vsignx_lo), _mm_castps_si128(vsignx_hi));
1456
1457 vbias_lo = _mm_max_epi16(vbias_lo, vbias_min);
1458 vbias_hi = _mm_max_epi16(vbias_hi, vbias_min);
1459
1460 vf_lo = _mm_add_ps(vf_lo, _mm_castsi128_ps(vbias_lo));
1461 vf_hi = _mm_add_ps(vf_hi, _mm_castsi128_ps(vbias_hi));
1462
1463 __m128i vexpw_lo = _mm_srli_epi32(_mm_castps_si128(vf_lo), 13);
1464 __m128i vexpw_hi = _mm_srli_epi32(_mm_castps_si128(vf_hi), 13);
1465 const __m128i vmantw_lo = _mm_and_si128(_mm_castps_si128(vf_lo), vmanth_mask);
1466 const __m128i vmantw_hi = _mm_and_si128(_mm_castps_si128(vf_hi), vmanth_mask);
1467
1468 vexpw_lo = _mm_and_si128(vexpw_lo, vexph_mask);
1469 vexpw_hi = _mm_and_si128(vexpw_hi, vexph_mask);
1470
1471 const __m128i vnonsignw_lo = _mm_add_epi32(vmantw_lo, vexpw_lo);
1472 const __m128i vnonsignw_hi = _mm_add_epi32(vmantw_hi, vexpw_hi);
1473
1474 const __m128i vnonsignh = _mm_packs_epi32(vnonsignw_lo, vnonsignw_hi);
1475
1476 const __m128i vabsh = _mm_blendv_epi8(vnonsignh, vnanh, vnanmaskh);
1477
1478 __m128i vh = _mm_or_si128(vabsh, vsignh);
1479
1480 if (n & (4 * sizeof(float))) {
1481 _mm_storel_epi64((__m128i*) o, vh);
1482 vh = _mm_unpackhi_epi64(vh, vh);
1483 o += 4;
1484 }
1485 if (n & (2 * sizeof(float))) {
1486 unaligned_store_u32(o, (uint32_t) _mm_cvtsi128_si32(vh));
1487 vh = _mm_srli_epi64(vh, 32);
1488 o += 2;
1489 }
1490 if (n & (1 * sizeof(float))) {
1491 *o = (uint16_t) _mm_extract_epi16(vh, 0);
1492 }
1493 }
1494 }
1495
xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast(size_t mr,size_t nc,size_t kc,const float * restrict a,size_t a_stride,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1496 void xnn_f32_gemm_minmax_ukernel_1x16__avx_broadcast(
1497 size_t mr,
1498 size_t nc,
1499 size_t kc,
1500 const float*restrict a,
1501 size_t a_stride,
1502 const float*restrict w,
1503 float*restrict c,
1504 size_t cm_stride,
1505 size_t cn_stride,
1506 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
1507 {
1508 assert(mr != 0);
1509 assert(mr <= 1);
1510 assert(nc != 0);
1511 assert(kc != 0);
1512 assert(kc % sizeof(float) == 0);
1513 assert(a != NULL);
1514 assert(w != NULL);
1515 assert(c != NULL);
1516
1517 const float* a0 = a;
1518 float* c0 = c;
1519
1520 do {
1521 __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
1522 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8);
1523 w += 16;
1524
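    // Each iteration broadcasts one element of A across all lanes and multiplies it against 16 packed weights.
    // AVX has no FMA, so the multiply and add are issued separately.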
1525 size_t k = kc;
1526 do {
1527 const __m256 va0 = _mm256_broadcast_ss(a0);
1528 a0 += 1;
1529
1530 const __m256 vb01234567 = _mm256_load_ps(w);
1531 const __m256 vb89ABCDEF = _mm256_load_ps(w + 8);
1532 w += 16;
1533
1534 vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
1535 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF));
1536
1537 k -= sizeof(float);
1538 } while (k != 0);
1539
1540 const __m256 vmin = _mm256_load_ps(params->avx.min);
1541 vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
1542 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
1543
1544 const __m256 vmax = _mm256_load_ps(params->avx.max);
1545 vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
1546 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
1547
1548 if XNN_LIKELY(nc >= 16) {
1549 _mm256_storeu_ps(c0, vacc0x01234567);
1550 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF);
1551 c0 = (float*) ((uintptr_t) c0 + cn_stride);
1552
1553 a0 = (const float*) ((uintptr_t) a0 - kc);
1554
1555 nc -= 16;
1556 } else {
1557 if (nc & 8) {
1558 _mm256_storeu_ps(c0, vacc0x01234567);
1559
1560 vacc0x01234567 = vacc0x89ABCDEF;
1561
1562 c0 += 8;
1563 }
1564 __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
1565 if (nc & 4) {
1566 _mm_storeu_ps(c0, vacc0x0123);
1567
1568 vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
1569
1570 c0 += 4;
1571 }
1572 if (nc & 2) {
1573 _mm_storel_pi((__m64*) c0, vacc0x0123);
1574
1575 vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
1576
1577 c0 += 2;
1578 }
1579 if (nc & 1) {
1580 _mm_store_ss(c0, vacc0x0123);
1581 }
1582
1583 nc = 0;
1584 }
1585 } while (nc != 0);
1586 }
1587
xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast(size_t mr,size_t nc,size_t kc,const float * restrict a,size_t a_stride,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1588 void xnn_f32_gemm_minmax_ukernel_5x16__avx_broadcast(
1589 size_t mr,
1590 size_t nc,
1591 size_t kc,
1592 const float*restrict a,
1593 size_t a_stride,
1594 const float*restrict w,
1595 float*restrict c,
1596 size_t cm_stride,
1597 size_t cn_stride,
1598 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
1599 {
1600 assert(mr != 0);
1601 assert(mr <= 5);
1602 assert(nc != 0);
1603 assert(kc != 0);
1604 assert(kc % sizeof(float) == 0);
1605 assert(a != NULL);
1606 assert(w != NULL);
1607 assert(c != NULL);
1608
1609 const float* a0 = a;
1610 float* c0 = c;
1611 const float* a1 = (const float*) ((uintptr_t) a0 + a_stride);
1612 float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
1613 if XNN_UNPREDICTABLE(mr < 2) {
1614 a1 = a0;
1615 c1 = c0;
1616 }
1617 const float* a2 = (const float*) ((uintptr_t) a1 + a_stride);
1618 float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
1619 if XNN_UNPREDICTABLE(mr <= 2) {
1620 a2 = a1;
1621 c2 = c1;
1622 }
1623 const float* a3 = (const float*) ((uintptr_t) a2 + a_stride);
1624 float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
1625 if XNN_UNPREDICTABLE(mr < 4) {
1626 a3 = a2;
1627 c3 = c2;
1628 }
1629 const float* a4 = (const float*) ((uintptr_t) a3 + a_stride);
1630 float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
1631 if XNN_UNPREDICTABLE(mr <= 4) {
1632 a4 = a3;
1633 c4 = c3;
1634 }
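  // Rows beyond mr alias the previous row's A and C pointers: loads stay in bounds, and the redundant
  // stores are overwritten because rows are written from c4 down to c0.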
1635
1636 do {
1637 __m256 vacc0x01234567 = _mm256_load_ps(w + 0);
1638 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8);
1639 __m256 vacc1x01234567 = vacc0x01234567;
1640 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
1641 __m256 vacc2x01234567 = vacc0x01234567;
1642 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
1643 __m256 vacc3x01234567 = vacc0x01234567;
1644 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
1645 __m256 vacc4x01234567 = vacc0x01234567;
1646 __m256 vacc4x89ABCDEF = vacc0x89ABCDEF;
1647 w += 16;
1648
1649 size_t k = kc;
1650 do {
1651 const __m256 va0 = _mm256_broadcast_ss(a0);
1652 a0 += 1;
1653 const __m256 va1 = _mm256_broadcast_ss(a1);
1654 a1 += 1;
1655 const __m256 va2 = _mm256_broadcast_ss(a2);
1656 a2 += 1;
1657 const __m256 va3 = _mm256_broadcast_ss(a3);
1658 a3 += 1;
1659 const __m256 va4 = _mm256_broadcast_ss(a4);
1660 a4 += 1;
1661
1662 const __m256 vb01234567 = _mm256_load_ps(w);
1663 const __m256 vb89ABCDEF = _mm256_load_ps(w + 8);
1664 w += 16;
1665
1666 vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
1667 vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
1668 vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
1669 vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
1670 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
1671 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF));
1672 vacc1x89ABCDEF = _mm256_add_ps(vacc1x89ABCDEF, _mm256_mul_ps(va1, vb89ABCDEF));
1673 vacc2x89ABCDEF = _mm256_add_ps(vacc2x89ABCDEF, _mm256_mul_ps(va2, vb89ABCDEF));
1674 vacc3x89ABCDEF = _mm256_add_ps(vacc3x89ABCDEF, _mm256_mul_ps(va3, vb89ABCDEF));
1675 vacc4x89ABCDEF = _mm256_add_ps(vacc4x89ABCDEF, _mm256_mul_ps(va4, vb89ABCDEF));
1676
1677 k -= sizeof(float);
1678 } while (k != 0);
1679
1680 const __m256 vmin = _mm256_load_ps(params->avx.min);
1681 vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
1682 vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
1683 vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
1684 vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
1685 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
1686 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
1687 vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
1688 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
1689 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
1690 vacc4x89ABCDEF = _mm256_max_ps(vacc4x89ABCDEF, vmin);
1691
1692 const __m256 vmax = _mm256_load_ps(params->avx.max);
1693 vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
1694 vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
1695 vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
1696 vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
1697 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
1698 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
1699 vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
1700 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
1701 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
1702 vacc4x89ABCDEF = _mm256_min_ps(vacc4x89ABCDEF, vmax);
1703
1704 if XNN_LIKELY(nc >= 16) {
1705 _mm256_storeu_ps(c4, vacc4x01234567);
1706 _mm256_storeu_ps(c4 + 8, vacc4x89ABCDEF);
1707 c4 = (float*) ((uintptr_t) c4 + cn_stride);
1708 _mm256_storeu_ps(c3, vacc3x01234567);
1709 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF);
1710 c3 = (float*) ((uintptr_t) c3 + cn_stride);
1711 _mm256_storeu_ps(c2, vacc2x01234567);
1712 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF);
1713 c2 = (float*) ((uintptr_t) c2 + cn_stride);
1714 _mm256_storeu_ps(c1, vacc1x01234567);
1715 _mm256_storeu_ps(c1 + 8, vacc1x89ABCDEF);
1716 c1 = (float*) ((uintptr_t) c1 + cn_stride);
1717 _mm256_storeu_ps(c0, vacc0x01234567);
1718 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF);
1719 c0 = (float*) ((uintptr_t) c0 + cn_stride);
1720
1721 a4 = (const float*) ((uintptr_t) a4 - kc);
1722 a3 = (const float*) ((uintptr_t) a3 - kc);
1723 a2 = (const float*) ((uintptr_t) a2 - kc);
1724 a1 = (const float*) ((uintptr_t) a1 - kc);
1725 a0 = (const float*) ((uintptr_t) a0 - kc);
1726
1727 nc -= 16;
1728 } else {
1729 if (nc & 8) {
1730 _mm256_storeu_ps(c4, vacc4x01234567);
1731 _mm256_storeu_ps(c3, vacc3x01234567);
1732 _mm256_storeu_ps(c2, vacc2x01234567);
1733 _mm256_storeu_ps(c1, vacc1x01234567);
1734 _mm256_storeu_ps(c0, vacc0x01234567);
1735
1736 vacc4x01234567 = vacc4x89ABCDEF;
1737 vacc3x01234567 = vacc3x89ABCDEF;
1738 vacc2x01234567 = vacc2x89ABCDEF;
1739 vacc1x01234567 = vacc1x89ABCDEF;
1740 vacc0x01234567 = vacc0x89ABCDEF;
1741
1742 c4 += 8;
1743 c3 += 8;
1744 c2 += 8;
1745 c1 += 8;
1746 c0 += 8;
1747 }
1748 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
1749 __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
1750 __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
1751 __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
1752 __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
1753 if (nc & 4) {
1754 _mm_storeu_ps(c4, vacc4x0123);
1755 _mm_storeu_ps(c3, vacc3x0123);
1756 _mm_storeu_ps(c2, vacc2x0123);
1757 _mm_storeu_ps(c1, vacc1x0123);
1758 _mm_storeu_ps(c0, vacc0x0123);
1759
1760 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
1761 vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
1762 vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
1763 vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
1764 vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
1765
1766 c4 += 4;
1767 c3 += 4;
1768 c2 += 4;
1769 c1 += 4;
1770 c0 += 4;
1771 }
1772 if (nc & 2) {
1773 _mm_storel_pi((__m64*) c4, vacc4x0123);
1774 _mm_storel_pi((__m64*) c3, vacc3x0123);
1775 _mm_storel_pi((__m64*) c2, vacc2x0123);
1776 _mm_storel_pi((__m64*) c1, vacc1x0123);
1777 _mm_storel_pi((__m64*) c0, vacc0x0123);
1778
1779 vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
1780 vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
1781 vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
1782 vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
1783 vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
1784
1785 c4 += 2;
1786 c3 += 2;
1787 c2 += 2;
1788 c1 += 2;
1789 c0 += 2;
1790 }
1791 if (nc & 1) {
1792 _mm_store_ss(c4, vacc4x0123);
1793 _mm_store_ss(c3, vacc3x0123);
1794 _mm_store_ss(c2, vacc2x0123);
1795 _mm_store_ss(c1, vacc1x0123);
1796 _mm_store_ss(c0, vacc0x0123);
1797 }
1798
1799 nc = 0;
1800 }
1801 } while (nc != 0);
1802 }
1803
xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast(size_t mr,size_t nc,size_t kc,size_t ks,const float ** restrict a,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const float * zero,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1804 void xnn_f32_igemm_minmax_ukernel_1x16__avx_broadcast(
1805 size_t mr,
1806 size_t nc,
1807 size_t kc,
1808 size_t ks,
1809 const float**restrict a,
1810 const float*restrict w,
1811 float*restrict c,
1812 size_t cm_stride,
1813 size_t cn_stride,
1814 size_t a_offset,
1815 const float* zero,
1816 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
1817 {
1818 assert(mr != 0);
1819 assert(mr <= 1);
1820 assert(nc != 0);
1821 assert(kc != 0);
1822 assert(kc % sizeof(float) == 0);
1823 assert(ks != 0);
1824 assert(ks % (1 * sizeof(void*)) == 0);
1825 assert(a_offset % sizeof(float) == 0);
1826 assert(a != NULL);
1827 assert(w != NULL);
1828 assert(c != NULL);
1829
1830 float* c0 = c;
1831
1832 do {
1833 __m256 vacc0x01234567 = _mm256_load_ps(w);
1834 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8);
1835 w += 16;
1836
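    // Indirection: each of the ks pointer entries supplies kc elements of A. Entries equal to `zero`
    // point at the shared zero buffer and are not adjusted by a_offset.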
1837 size_t p = ks;
1838 do {
1839 const float* restrict a0 = a[0];
1840 assert(a0 != NULL);
1841 if XNN_UNPREDICTABLE(a0 != zero) {
1842 a0 = (const float*) ((uintptr_t) a0 + a_offset);
1843 }
1844 a += 1;
1845
1846 size_t k = kc;
1847 do {
1848 const __m256 vb01234567 = _mm256_load_ps(w);
1849 const __m256 vb89ABCDEF = _mm256_load_ps(w + 8);
1850 w += 16;
1851
1852 const __m256 va0 = _mm256_broadcast_ss(a0);
1853 a0 += 1;
1854
1855 vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
1856 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF));
1857 k -= sizeof(float);
1858 } while (k != 0);
1859 p -= 1 * sizeof(void*);
1860 } while (p != 0);
1861
1862 const __m256 vmin = _mm256_load_ps(params->avx.min);
1863 vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
1864 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
1865
1866 const __m256 vmax = _mm256_load_ps(params->avx.max);
1867 vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
1868 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
1869
1870 if XNN_LIKELY(nc >= 16) {
1871 _mm256_storeu_ps(c0, vacc0x01234567);
1872 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF);
1873 c0 = (float*) ((uintptr_t) c0 + cn_stride);
1874
1875 a = (const float**restrict) ((uintptr_t) a - ks);
1876 nc -= 16;
1877 } else {
1878 if (nc & 8) {
1879 _mm256_storeu_ps(c0, vacc0x01234567);
1880
1881 vacc0x01234567 = vacc0x89ABCDEF;
1882
1883 c0 += 8;
1884 }
1885 __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
1886 if (nc & 4) {
1887 _mm_storeu_ps(c0, vacc0x0123);
1888
1889 vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
1890
1891 c0 += 4;
1892 }
1893 if (nc & 2) {
1894 _mm_storel_pi((__m64*) c0, vacc0x0123);
1895
1896 vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
1897
1898 c0 += 2;
1899 }
1900 if (nc & 1) {
1901 _mm_store_ss(c0, vacc0x0123);
1902 }
1903
1904 nc = 0;
1905 }
1906 } while (nc != 0);
1907 }
1908
xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast(size_t mr,size_t nc,size_t kc,size_t ks,const float ** restrict a,const float * restrict w,float * restrict c,size_t cm_stride,size_t cn_stride,size_t a_offset,const float * zero,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1909 void xnn_f32_igemm_minmax_ukernel_5x16__avx_broadcast(
1910 size_t mr,
1911 size_t nc,
1912 size_t kc,
1913 size_t ks,
1914 const float**restrict a,
1915 const float*restrict w,
1916 float*restrict c,
1917 size_t cm_stride,
1918 size_t cn_stride,
1919 size_t a_offset,
1920 const float* zero,
1921 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
1922 {
1923 assert(mr != 0);
1924 assert(mr <= 5);
1925 assert(nc != 0);
1926 assert(kc != 0);
1927 assert(kc % sizeof(float) == 0);
1928 assert(ks != 0);
1929 assert(ks % (5 * sizeof(void*)) == 0);
1930 assert(a_offset % sizeof(float) == 0);
1931 assert(a != NULL);
1932 assert(w != NULL);
1933 assert(c != NULL);
1934
1935 float* c0 = c;
1936 float* c1 = (float*) ((uintptr_t) c0 + cm_stride);
1937 if XNN_UNPREDICTABLE(mr < 2) {
1938 c1 = c0;
1939 }
1940 float* c2 = (float*) ((uintptr_t) c1 + cm_stride);
1941 if XNN_UNPREDICTABLE(mr <= 2) {
1942 c2 = c1;
1943 }
1944 float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
1945 if XNN_UNPREDICTABLE(mr < 4) {
1946 c3 = c2;
1947 }
1948 float* c4 = (float*) ((uintptr_t) c3 + cm_stride);
1949 if XNN_UNPREDICTABLE(mr <= 4) {
1950 c4 = c3;
1951 }
1952
1953 do {
1954 __m256 vacc0x01234567 = _mm256_load_ps(w);
1955 __m256 vacc0x89ABCDEF = _mm256_load_ps(w + 8);
1956 __m256 vacc1x01234567 = vacc0x01234567;
1957 __m256 vacc1x89ABCDEF = vacc0x89ABCDEF;
1958 __m256 vacc2x01234567 = vacc0x01234567;
1959 __m256 vacc2x89ABCDEF = vacc0x89ABCDEF;
1960 __m256 vacc3x01234567 = vacc0x01234567;
1961 __m256 vacc3x89ABCDEF = vacc0x89ABCDEF;
1962 __m256 vacc4x01234567 = vacc0x01234567;
1963 __m256 vacc4x89ABCDEF = vacc0x89ABCDEF;
1964 w += 16;
1965
1966 size_t p = ks;
1967 do {
1968 const float* restrict a0 = a[0];
1969 assert(a0 != NULL);
1970 if XNN_UNPREDICTABLE(a0 != zero) {
1971 a0 = (const float*) ((uintptr_t) a0 + a_offset);
1972 }
1973 const float* restrict a1 = a[1];
1974 assert(a1 != NULL);
1975 if XNN_UNPREDICTABLE(a1 != zero) {
1976 a1 = (const float*) ((uintptr_t) a1 + a_offset);
1977 }
1978 const float* restrict a2 = a[2];
1979 assert(a2 != NULL);
1980 if XNN_UNPREDICTABLE(a2 != zero) {
1981 a2 = (const float*) ((uintptr_t) a2 + a_offset);
1982 }
1983 const float* restrict a3 = a[3];
1984 assert(a3 != NULL);
1985 if XNN_UNPREDICTABLE(a3 != zero) {
1986 a3 = (const float*) ((uintptr_t) a3 + a_offset);
1987 }
1988 const float* restrict a4 = a[4];
1989 assert(a4 != NULL);
1990 if XNN_UNPREDICTABLE(a4 != zero) {
1991 a4 = (const float*) ((uintptr_t) a4 + a_offset);
1992 }
1993 a += 5;
1994
1995 size_t k = kc;
1996 do {
1997 const __m256 vb01234567 = _mm256_load_ps(w);
1998 const __m256 vb89ABCDEF = _mm256_load_ps(w + 8);
1999 w += 16;
2000
2001 const __m256 va0 = _mm256_broadcast_ss(a0);
2002 a0 += 1;
2003 const __m256 va1 = _mm256_broadcast_ss(a1);
2004 a1 += 1;
2005 const __m256 va2 = _mm256_broadcast_ss(a2);
2006 a2 += 1;
2007 const __m256 va3 = _mm256_broadcast_ss(a3);
2008 a3 += 1;
2009 const __m256 va4 = _mm256_broadcast_ss(a4);
2010 a4 += 1;
2011
2012 vacc0x01234567 = _mm256_add_ps(vacc0x01234567, _mm256_mul_ps(va0, vb01234567));
2013 vacc0x89ABCDEF = _mm256_add_ps(vacc0x89ABCDEF, _mm256_mul_ps(va0, vb89ABCDEF));
2014 vacc1x01234567 = _mm256_add_ps(vacc1x01234567, _mm256_mul_ps(va1, vb01234567));
2015 vacc1x89ABCDEF = _mm256_add_ps(vacc1x89ABCDEF, _mm256_mul_ps(va1, vb89ABCDEF));
2016 vacc2x01234567 = _mm256_add_ps(vacc2x01234567, _mm256_mul_ps(va2, vb01234567));
2017 vacc2x89ABCDEF = _mm256_add_ps(vacc2x89ABCDEF, _mm256_mul_ps(va2, vb89ABCDEF));
2018 vacc3x01234567 = _mm256_add_ps(vacc3x01234567, _mm256_mul_ps(va3, vb01234567));
2019 vacc3x89ABCDEF = _mm256_add_ps(vacc3x89ABCDEF, _mm256_mul_ps(va3, vb89ABCDEF));
2020 vacc4x01234567 = _mm256_add_ps(vacc4x01234567, _mm256_mul_ps(va4, vb01234567));
2021 vacc4x89ABCDEF = _mm256_add_ps(vacc4x89ABCDEF, _mm256_mul_ps(va4, vb89ABCDEF));
2022 k -= sizeof(float);
2023 } while (k != 0);
2024 p -= 5 * sizeof(void*);
2025 } while (p != 0);
2026
2027 const __m256 vmin = _mm256_load_ps(params->avx.min);
2028 vacc0x01234567 = _mm256_max_ps(vacc0x01234567, vmin);
2029 vacc1x01234567 = _mm256_max_ps(vacc1x01234567, vmin);
2030 vacc2x01234567 = _mm256_max_ps(vacc2x01234567, vmin);
2031 vacc3x01234567 = _mm256_max_ps(vacc3x01234567, vmin);
2032 vacc4x01234567 = _mm256_max_ps(vacc4x01234567, vmin);
2033 vacc0x89ABCDEF = _mm256_max_ps(vacc0x89ABCDEF, vmin);
2034 vacc1x89ABCDEF = _mm256_max_ps(vacc1x89ABCDEF, vmin);
2035 vacc2x89ABCDEF = _mm256_max_ps(vacc2x89ABCDEF, vmin);
2036 vacc3x89ABCDEF = _mm256_max_ps(vacc3x89ABCDEF, vmin);
2037 vacc4x89ABCDEF = _mm256_max_ps(vacc4x89ABCDEF, vmin);
2038
2039 const __m256 vmax = _mm256_load_ps(params->avx.max);
2040 vacc0x01234567 = _mm256_min_ps(vacc0x01234567, vmax);
2041 vacc1x01234567 = _mm256_min_ps(vacc1x01234567, vmax);
2042 vacc2x01234567 = _mm256_min_ps(vacc2x01234567, vmax);
2043 vacc3x01234567 = _mm256_min_ps(vacc3x01234567, vmax);
2044 vacc4x01234567 = _mm256_min_ps(vacc4x01234567, vmax);
2045 vacc0x89ABCDEF = _mm256_min_ps(vacc0x89ABCDEF, vmax);
2046 vacc1x89ABCDEF = _mm256_min_ps(vacc1x89ABCDEF, vmax);
2047 vacc2x89ABCDEF = _mm256_min_ps(vacc2x89ABCDEF, vmax);
2048 vacc3x89ABCDEF = _mm256_min_ps(vacc3x89ABCDEF, vmax);
2049 vacc4x89ABCDEF = _mm256_min_ps(vacc4x89ABCDEF, vmax);
2050
2051 if XNN_LIKELY(nc >= 16) {
2052 _mm256_storeu_ps(c4, vacc4x01234567);
2053 _mm256_storeu_ps(c4 + 8, vacc4x89ABCDEF);
2054 c4 = (float*) ((uintptr_t) c4 + cn_stride);
2055 _mm256_storeu_ps(c3, vacc3x01234567);
2056 _mm256_storeu_ps(c3 + 8, vacc3x89ABCDEF);
2057 c3 = (float*) ((uintptr_t) c3 + cn_stride);
2058 _mm256_storeu_ps(c2, vacc2x01234567);
2059 _mm256_storeu_ps(c2 + 8, vacc2x89ABCDEF);
2060 c2 = (float*) ((uintptr_t) c2 + cn_stride);
2061 _mm256_storeu_ps(c1, vacc1x01234567);
2062 _mm256_storeu_ps(c1 + 8, vacc1x89ABCDEF);
2063 c1 = (float*) ((uintptr_t) c1 + cn_stride);
2064 _mm256_storeu_ps(c0, vacc0x01234567);
2065 _mm256_storeu_ps(c0 + 8, vacc0x89ABCDEF);
2066 c0 = (float*) ((uintptr_t) c0 + cn_stride);
2067
2068 a = (const float**restrict) ((uintptr_t) a - ks);
2069 nc -= 16;
2070 } else {
2071 if (nc & 8) {
2072 _mm256_storeu_ps(c4, vacc4x01234567);
2073 _mm256_storeu_ps(c3, vacc3x01234567);
2074 _mm256_storeu_ps(c2, vacc2x01234567);
2075 _mm256_storeu_ps(c1, vacc1x01234567);
2076 _mm256_storeu_ps(c0, vacc0x01234567);
2077
2078 vacc4x01234567 = vacc4x89ABCDEF;
2079 vacc3x01234567 = vacc3x89ABCDEF;
2080 vacc2x01234567 = vacc2x89ABCDEF;
2081 vacc1x01234567 = vacc1x89ABCDEF;
2082 vacc0x01234567 = vacc0x89ABCDEF;
2083
2084 c4 += 8;
2085 c3 += 8;
2086 c2 += 8;
2087 c1 += 8;
2088 c0 += 8;
2089 }
2090 __m128 vacc4x0123 = _mm256_castps256_ps128(vacc4x01234567);
2091 __m128 vacc3x0123 = _mm256_castps256_ps128(vacc3x01234567);
2092 __m128 vacc2x0123 = _mm256_castps256_ps128(vacc2x01234567);
2093 __m128 vacc1x0123 = _mm256_castps256_ps128(vacc1x01234567);
2094 __m128 vacc0x0123 = _mm256_castps256_ps128(vacc0x01234567);
2095 if (nc & 4) {
2096 _mm_storeu_ps(c4, vacc4x0123);
2097 _mm_storeu_ps(c3, vacc3x0123);
2098 _mm_storeu_ps(c2, vacc2x0123);
2099 _mm_storeu_ps(c1, vacc1x0123);
2100 _mm_storeu_ps(c0, vacc0x0123);
2101
2102 vacc4x0123 = _mm256_extractf128_ps(vacc4x01234567, 1);
2103 vacc3x0123 = _mm256_extractf128_ps(vacc3x01234567, 1);
2104 vacc2x0123 = _mm256_extractf128_ps(vacc2x01234567, 1);
2105 vacc1x0123 = _mm256_extractf128_ps(vacc1x01234567, 1);
2106 vacc0x0123 = _mm256_extractf128_ps(vacc0x01234567, 1);
2107
2108 c4 += 4;
2109 c3 += 4;
2110 c2 += 4;
2111 c1 += 4;
2112 c0 += 4;
2113 }
2114 if (nc & 2) {
2115 _mm_storel_pi((__m64*) c4, vacc4x0123);
2116 _mm_storel_pi((__m64*) c3, vacc3x0123);
2117 _mm_storel_pi((__m64*) c2, vacc2x0123);
2118 _mm_storel_pi((__m64*) c1, vacc1x0123);
2119 _mm_storel_pi((__m64*) c0, vacc0x0123);
2120
2121 vacc4x0123 = _mm_movehl_ps(vacc4x0123, vacc4x0123);
2122 vacc3x0123 = _mm_movehl_ps(vacc3x0123, vacc3x0123);
2123 vacc2x0123 = _mm_movehl_ps(vacc2x0123, vacc2x0123);
2124 vacc1x0123 = _mm_movehl_ps(vacc1x0123, vacc1x0123);
2125 vacc0x0123 = _mm_movehl_ps(vacc0x0123, vacc0x0123);
2126
2127 c4 += 2;
2128 c3 += 2;
2129 c2 += 2;
2130 c1 += 2;
2131 c0 += 2;
2132 }
2133 if (nc & 1) {
2134 _mm_store_ss(c4, vacc4x0123);
2135 _mm_store_ss(c3, vacc3x0123);
2136 _mm_store_ss(c2, vacc2x0123);
2137 _mm_store_ss(c1, vacc1x0123);
2138 _mm_store_ss(c0, vacc0x0123);
2139 }
2140
2141 nc = 0;
2142 }
2143 } while (nc != 0);
2144 }
2145
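// Seven all-ones lanes followed by seven zero lanes; sliding a window over this table yields a per-lane mask for 1-7 remaining elements.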
2146 static const int32_t mask_table[14] = {-1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0};
2147
xnn_f32_prelu_ukernel__avx_2x16(size_t rows,size_t channels,const float * restrict input,size_t input_stride,const float * restrict weights,float * restrict output,size_t output_stride)2148 void xnn_f32_prelu_ukernel__avx_2x16(
2149 size_t rows,
2150 size_t channels,
2151 const float*restrict input,
2152 size_t input_stride,
2153 const float*restrict weights,
2154 float*restrict output,
2155 size_t output_stride)
2156 {
2157 assert(rows != 0);
2158 assert(channels != 0);
2159 assert(channels % sizeof(float) == 0);
2160
2161 const float* i0 = input;
2162 float* o0 = output;
2163 const float* i1 = (const float*) ((uintptr_t) i0 + input_stride);
2164 float* o1 = (float*) ((uintptr_t) o0 + output_stride);
2165
2166 const size_t input_increment = input_stride * 2 - channels;
2167 const size_t output_increment = output_stride * 2 - channels;
2168
2169 do {
2170 if XNN_UNPREDICTABLE(rows < 2) {
2171 i1 = i0;
2172 o1 = o0;
2173 }
2174
2175 const float* w = weights;
2176 size_t c = channels;
2177 for (; c >= 16 * sizeof(float); c -= 16 * sizeof(float)) {
2178 const __m256 vw01234567 = _mm256_load_ps(w);
2179 const __m256 vw89ABCDEF = _mm256_load_ps(w + 8);
2180 w += 16;
2181
2182 const __m256 vi0x01234567 = _mm256_loadu_ps(i0);
2183 const __m256 vi0x89ABCDEF = _mm256_loadu_ps(i0 + 8);
2184 i0 += 16;
2185 const __m256 vi1x01234567 = _mm256_loadu_ps(i1);
2186 const __m256 vi1x89ABCDEF = _mm256_loadu_ps(i1 + 8);
2187 i1 += 16;
2188
2189 const __m256 vprod0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567);
2190 const __m256 vprod0x89ABCDEF = _mm256_mul_ps(vi0x89ABCDEF, vw89ABCDEF);
2191 const __m256 vprod1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567);
2192 const __m256 vprod1x89ABCDEF = _mm256_mul_ps(vi1x89ABCDEF, vw89ABCDEF);
2193
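      // blendv selects by the sign bit of the input: negative lanes take the scaled product, non-negative lanes pass through unchanged.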
2194 const __m256 vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vprod0x01234567, vi0x01234567);
2195 const __m256 vacc0x89ABCDEF = _mm256_blendv_ps(vi0x89ABCDEF, vprod0x89ABCDEF, vi0x89ABCDEF);
2196 const __m256 vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vprod1x01234567, vi1x01234567);
2197 const __m256 vacc1x89ABCDEF = _mm256_blendv_ps(vi1x89ABCDEF, vprod1x89ABCDEF, vi1x89ABCDEF);
2198
2199 _mm256_storeu_ps(o0, vacc0x01234567);
2200 _mm256_storeu_ps(o0 + 8, vacc0x89ABCDEF);
2201 o0 += 16;
2202 _mm256_storeu_ps(o1, vacc1x01234567);
2203 _mm256_storeu_ps(o1 + 8, vacc1x89ABCDEF);
2204 o1 += 16;
2205 }
2206 for (; c >= 8 * sizeof(float); c -= 8 * sizeof(float)) {
2207 const __m256 vw = _mm256_load_ps(w);
2208 w += 8;
2209
2210 const __m256 vi0 = _mm256_loadu_ps(i0);
2211 i0 += 8;
2212 const __m256 vi1 = _mm256_loadu_ps(i1);
2213 i1 += 8;
2214
2215 const __m256 vprod0 = _mm256_mul_ps(vi0, vw);
2216 const __m256 vprod1 = _mm256_mul_ps(vi1, vw);
2217
2218 const __m256 vacc0 = _mm256_blendv_ps(vi0, vprod0, vi0);
2219 const __m256 vacc1 = _mm256_blendv_ps(vi1, vprod1, vi1);
2220
2221 _mm256_storeu_ps(o0, vacc0);
2222 o0 += 8;
2223 _mm256_storeu_ps(o1, vacc1);
2224 o1 += 8;
2225 }
2226 if XNN_UNLIKELY(c != 0) {
2227 assert(c >= 1 * sizeof(float));
2228 assert(c <= 7 * sizeof(float));
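      // Offset into the mask table so the first (c / sizeof(float)) lanes are all-ones; maskload then touches exactly the remaining channels.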
2229 __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &mask_table[7] - c));
2230
2231 const __m256 vw = _mm256_maskload_ps(w, vmask);
2232
2233 const __m256 vi0 = _mm256_maskload_ps(i0, vmask);
2234 i0 = (const float*) ((uintptr_t) i0 + c);
2235 const __m256 vi1 = _mm256_maskload_ps(i1, vmask);
2236 i1 = (const float*) ((uintptr_t) i1 + c);
2237
2238 const __m256 vprod0 = _mm256_mul_ps(vi0, vw);
2239 const __m256 vprod1 = _mm256_mul_ps(vi1, vw);
2240
2241 __m256 vacc0 = _mm256_blendv_ps(vi0, vprod0, vi0);
2242 __m256 vacc1 = _mm256_blendv_ps(vi1, vprod1, vi1);
2243
2244 __m128 vacc0_lo = _mm256_castps256_ps128(vacc0);
2245 __m128 vacc1_lo = _mm256_castps256_ps128(vacc1);
2246 if (c & (4 * sizeof(float))) {
2247 _mm_storeu_ps(o0, vacc0_lo);
2248 _mm_storeu_ps(o1, vacc1_lo);
2249
2250 vacc0_lo = _mm256_extractf128_ps(vacc0, 1);
2251 vacc1_lo = _mm256_extractf128_ps(vacc1, 1);
2252
2253 o0 += 4;
2254 o1 += 4;
2255 }
2256 if (c & (2 * sizeof(float))) {
2257 _mm_storel_pi((__m64*) o0, vacc0_lo);
2258 _mm_storel_pi((__m64*) o1, vacc1_lo);
2259
2260 vacc0_lo = _mm_movehl_ps(vacc0_lo, vacc0_lo);
2261 vacc1_lo = _mm_movehl_ps(vacc1_lo, vacc1_lo);
2262
2263 o0 += 2;
2264 o1 += 2;
2265 }
2266 if (c & (1 * sizeof(float))) {
2267 _mm_store_ss(o0, vacc0_lo);
2268 _mm_store_ss(o1, vacc1_lo);
2269
2270 o0 += 1;
2271 o1 += 1;
2272 }
2273 }
2274 i0 = (const float*) ((uintptr_t) i0 + input_increment);
2275 o0 = (float*) ((uintptr_t) o0 + output_increment);
2276 i1 = (const float*) ((uintptr_t) i1 + input_increment);
2277 o1 = (float*) ((uintptr_t) o1 + output_increment);
2278 rows = doz(rows, 2);
2279 } while (rows != 0);
2280 }
2281
xnn_f32_qs8_vcvt_ukernel__avx_x32(size_t n,const float * x,int8_t * y,const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])2282 void xnn_f32_qs8_vcvt_ukernel__avx_x32(
2283 size_t n,
2284 const float* x,
2285 int8_t* y,
2286 const union xnn_f32_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
2287 {
2288 assert(n != 0);
2289 assert(n % sizeof(float) == 0);
2290 assert(x != NULL);
2291 assert(y != NULL);
2292
2293 const __m256 vscale = _mm256_load_ps(params->avx.scale);
2294 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
2295 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
2296 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
2297
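  // Per block: scale, clamp against the output max (still in float), convert to int32, pack with signed
  // saturation to int16, add the zero point, pack to int8, and clamp to the output min.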
2298 for (; n >= 32 * sizeof(float); n -= 32 * sizeof(float)) {
2299 __m256 vx01234567 = _mm256_loadu_ps(x);
2300 __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
2301 __m256 vxGHIJKLMN = _mm256_loadu_ps(x + 16);
2302 __m256 vxOPQRSTUV = _mm256_loadu_ps(x + 24);
2303 x += 32;
2304
2305 vx01234567 = _mm256_mul_ps(vx01234567, vscale);
2306 vx89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vscale);
2307 vxGHIJKLMN = _mm256_mul_ps(vxGHIJKLMN, vscale);
2308 vxOPQRSTUV = _mm256_mul_ps(vxOPQRSTUV, vscale);
2309
2310 vx01234567 = _mm256_min_ps(vx01234567, voutput_max_less_zero_point);
2311 vx89ABCDEF = _mm256_min_ps(vx89ABCDEF, voutput_max_less_zero_point);
2312 vxGHIJKLMN = _mm256_min_ps(vxGHIJKLMN, voutput_max_less_zero_point);
2313 vxOPQRSTUV = _mm256_min_ps(vxOPQRSTUV, voutput_max_less_zero_point);
2314
2315 const __m256i vacc01234567 = _mm256_cvtps_epi32(vx01234567);
2316 const __m256i vacc89ABCDEF = _mm256_cvtps_epi32(vx89ABCDEF);
2317 const __m256i vaccGHIJKLMN = _mm256_cvtps_epi32(vxGHIJKLMN);
2318 const __m256i vaccOPQRSTUV = _mm256_cvtps_epi32(vxOPQRSTUV);
2319
2320 __m128i vy01234567 = _mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extractf128_si256(vacc01234567, 1));
2321 __m128i vy89ABCDEF = _mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extractf128_si256(vacc89ABCDEF, 1));
2322 __m128i vyGHIJKLMN = _mm_packs_epi32(_mm256_castsi256_si128(vaccGHIJKLMN), _mm256_extractf128_si256(vaccGHIJKLMN, 1));
2323 __m128i vyOPQRSTUV = _mm_packs_epi32(_mm256_castsi256_si128(vaccOPQRSTUV), _mm256_extractf128_si256(vaccOPQRSTUV, 1));
2324
2325 vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
2326 vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
2327 vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point);
2328 vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point);
2329
2330 __m128i vy0123456789ABCDEF = _mm_packs_epi16(vy01234567, vy89ABCDEF);
2331 __m128i vyGHIJKLMNOPQRSTUV = _mm_packs_epi16(vyGHIJKLMN, vyOPQRSTUV);
2332
2333 vy0123456789ABCDEF = _mm_max_epi8(vy0123456789ABCDEF, voutput_min);
2334 vyGHIJKLMNOPQRSTUV = _mm_max_epi8(vyGHIJKLMNOPQRSTUV, voutput_min);
2335
2336 _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
2337 _mm_storeu_si128((__m128i*) (y + 16), vyGHIJKLMNOPQRSTUV);
2338 y += 32;
2339 }
2340 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2341 __m256 vx = _mm256_loadu_ps(x);
2342 vx = _mm256_mul_ps(vx, vscale);
2343 vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
2344 x += 8;
2345
2346 const __m256i vacc = _mm256_cvtps_epi32(vx);
2347
2348 __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
2349 vy = _mm_adds_epi16(vy, voutput_zero_point);
2350 vy = _mm_packs_epi16(vy, vy);
2351 vy = _mm_max_epi8(vy, voutput_min);
2352
2353 _mm_storel_epi64((__m128i*) y, vy);
2354 y += 8;
2355 }
2356 if XNN_UNLIKELY(n != 0) {
2357 assert(n >= 1 * sizeof(float));
2358 assert(n <= 7 * sizeof(float));
2359     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2360
2361 __m256 vx = _mm256_maskload_ps(x, vmask);
2362 vx = _mm256_mul_ps(vx, vscale);
2363 vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
2364
2365 const __m256i vacc = _mm256_cvtps_epi32(vx);
2366
2367 __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
2368 vy = _mm_adds_epi16(vy, voutput_zero_point);
2369 vy = _mm_packs_epi16(vy, vy);
2370 vy = _mm_max_epi8(vy, voutput_min);
2371
2372 if (n & (4 * sizeof(float))) {
2373 _mm_storeu_si32(y, vy);
2374 y += 4;
2375 vy = _mm_srli_epi64(vy, 32);
2376 }
2377 if (n & (2 * sizeof(float))) {
2378 _mm_storeu_si16(y, vy);
2379 y += 2;
2380 vy = _mm_srli_epi32(vy, 16);
2381 }
2382 if (n & (1 * sizeof(float))) {
2383 *y = (int8_t) _mm_extract_epi8(vy, 0);
2384 }
2385 }
2386 }
2387
xnn_f32_qu8_vcvt_ukernel__avx_x32(size_t n,const float * x,uint8_t * y,const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])2388 void xnn_f32_qu8_vcvt_ukernel__avx_x32(
2389 size_t n,
2390 const float* x,
2391 uint8_t* y,
2392 const union xnn_f32_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
2393 {
2394 assert(n != 0);
2395 assert(n % sizeof(float) == 0);
2396 assert(x != NULL);
2397 assert(y != NULL);
2398
2399 const __m256 vscale = _mm256_load_ps(params->avx.scale);
2400 const __m256 voutput_max_less_zero_point = _mm256_load_ps(params->avx.output_max_less_zero_point);
2401 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
2402 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->avx.output_min);
2403
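  // Same pipeline as the QS8 variant above, except the final pack uses unsigned saturation (_mm_packus_epi16)
  // and the lower clamp uses _mm_max_epu8.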
2404 for (; n >= 32 * sizeof(float); n -= 32 * sizeof(float)) {
2405 __m256 vx01234567 = _mm256_loadu_ps(x);
2406 __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
2407 __m256 vxGHIJKLMN = _mm256_loadu_ps(x + 16);
2408 __m256 vxOPQRSTUV = _mm256_loadu_ps(x + 24);
2409 x += 32;
2410
2411 vx01234567 = _mm256_mul_ps(vx01234567, vscale);
2412 vx89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vscale);
2413 vxGHIJKLMN = _mm256_mul_ps(vxGHIJKLMN, vscale);
2414 vxOPQRSTUV = _mm256_mul_ps(vxOPQRSTUV, vscale);
2415
2416 vx01234567 = _mm256_min_ps(vx01234567, voutput_max_less_zero_point);
2417 vx89ABCDEF = _mm256_min_ps(vx89ABCDEF, voutput_max_less_zero_point);
2418 vxGHIJKLMN = _mm256_min_ps(vxGHIJKLMN, voutput_max_less_zero_point);
2419 vxOPQRSTUV = _mm256_min_ps(vxOPQRSTUV, voutput_max_less_zero_point);
2420
2421 const __m256i vacc01234567 = _mm256_cvtps_epi32(vx01234567);
2422 const __m256i vacc89ABCDEF = _mm256_cvtps_epi32(vx89ABCDEF);
2423 const __m256i vaccGHIJKLMN = _mm256_cvtps_epi32(vxGHIJKLMN);
2424 const __m256i vaccOPQRSTUV = _mm256_cvtps_epi32(vxOPQRSTUV);
2425
2426 __m128i vy01234567 = _mm_packs_epi32(_mm256_castsi256_si128(vacc01234567), _mm256_extractf128_si256(vacc01234567, 1));
2427 __m128i vy89ABCDEF = _mm_packs_epi32(_mm256_castsi256_si128(vacc89ABCDEF), _mm256_extractf128_si256(vacc89ABCDEF, 1));
2428 __m128i vyGHIJKLMN = _mm_packs_epi32(_mm256_castsi256_si128(vaccGHIJKLMN), _mm256_extractf128_si256(vaccGHIJKLMN, 1));
2429 __m128i vyOPQRSTUV = _mm_packs_epi32(_mm256_castsi256_si128(vaccOPQRSTUV), _mm256_extractf128_si256(vaccOPQRSTUV, 1));
2430
2431 vy01234567 = _mm_adds_epi16(vy01234567, voutput_zero_point);
2432 vy89ABCDEF = _mm_adds_epi16(vy89ABCDEF, voutput_zero_point);
2433 vyGHIJKLMN = _mm_adds_epi16(vyGHIJKLMN, voutput_zero_point);
2434 vyOPQRSTUV = _mm_adds_epi16(vyOPQRSTUV, voutput_zero_point);
2435
2436 __m128i vy0123456789ABCDEF = _mm_packus_epi16(vy01234567, vy89ABCDEF);
2437 __m128i vyGHIJKLMNOPQRSTUV = _mm_packus_epi16(vyGHIJKLMN, vyOPQRSTUV);
2438
2439 vy0123456789ABCDEF = _mm_max_epu8(vy0123456789ABCDEF, voutput_min);
2440 vyGHIJKLMNOPQRSTUV = _mm_max_epu8(vyGHIJKLMNOPQRSTUV, voutput_min);
2441
2442 _mm_storeu_si128((__m128i*) y, vy0123456789ABCDEF);
2443 _mm_storeu_si128((__m128i*) (y + 16), vyGHIJKLMNOPQRSTUV);
2444 y += 32;
2445 }
2446 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2447 __m256 vx = _mm256_loadu_ps(x);
2448 vx = _mm256_mul_ps(vx, vscale);
2449 vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
2450 x += 8;
2451
2452 const __m256i vacc = _mm256_cvtps_epi32(vx);
2453
2454 __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
2455 vy = _mm_adds_epi16(vy, voutput_zero_point);
2456 vy = _mm_packus_epi16(vy, vy);
2457 vy = _mm_max_epu8(vy, voutput_min);
2458
2459 _mm_storel_epi64((__m128i*) y, vy);
2460 y += 8;
2461 }
2462 if XNN_UNLIKELY(n != 0) {
2463 assert(n >= 1 * sizeof(float));
2464 assert(n <= 7 * sizeof(float));
2465     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2466
2467 __m256 vx = _mm256_maskload_ps(x, vmask);
2468 vx = _mm256_mul_ps(vx, vscale);
2469 vx = _mm256_min_ps(vx, voutput_max_less_zero_point);
2470
2471 const __m256i vacc = _mm256_cvtps_epi32(vx);
2472
2473 __m128i vy = _mm_packs_epi32(_mm256_castsi256_si128(vacc), _mm256_extractf128_si256(vacc, 1));
2474 vy = _mm_adds_epi16(vy, voutput_zero_point);
2475 vy = _mm_packus_epi16(vy, vy);
2476 vy = _mm_max_epu8(vy, voutput_min);
2477
2478 if (n & (4 * sizeof(float))) {
2479 _mm_storeu_si32(y, vy);
2480 y += 4;
2481 vy = _mm_srli_epi64(vy, 32);
2482 }
2483 if (n & (2 * sizeof(float))) {
2484 _mm_storeu_si16(y, vy);
2485 y += 2;
2486 vy = _mm_srli_epi32(vy, 16);
2487 }
2488 if (n & (1 * sizeof(float))) {
2489 *y = (uint8_t) _mm_extract_epi8(vy, 0);
2490 }
2491 }
2492 }
2493
xnn_f32_vadd_minmax_ukernel__avx_x16(size_t n,const float * a,const float * b,float * y,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2494 void xnn_f32_vadd_minmax_ukernel__avx_x16(
2495 size_t n,
2496 const float* a,
2497 const float* b,
2498 float* y,
2499 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2500 {
2501 assert(n != 0);
2502 assert(n % sizeof(float) == 0);
2503 assert(a != NULL);
2504 assert(b != NULL);
2505 assert(y != NULL);
2506
2507 const __m256 vy_min = _mm256_load_ps(params->avx.min);
2508 const __m256 vy_max = _mm256_load_ps(params->avx.max);
2509
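  // 16-float main loop, 8-float secondary loop, then a masked tail for 1-7 leftover elements.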
2510 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2511 const __m256 va01234567 = _mm256_loadu_ps(a);
2512 const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
2513 a += 16;
2514
2515 const __m256 vb01234567 = _mm256_loadu_ps(b);
2516 const __m256 vb89ABCDEF = _mm256_loadu_ps(b + 8);
2517 b += 16;
2518
2519 __m256 vy01234567 = _mm256_add_ps(va01234567, vb01234567);
2520 __m256 vy89ABCDEF = _mm256_add_ps(va89ABCDEF, vb89ABCDEF);
2521
2522
2523 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2524 vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
2525
2526 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2527 vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
2528
2529 _mm256_storeu_ps(y, vy01234567);
2530 _mm256_storeu_ps(y + 8, vy89ABCDEF);
2531 y += 16;
2532 }
2533 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2534 const __m256 va = _mm256_loadu_ps(a);
2535 a += 8;
2536
2537 const __m256 vb = _mm256_loadu_ps(b);
2538 b += 8;
2539
2540 __m256 vy = _mm256_add_ps(va, vb);
2541 vy = _mm256_max_ps(vy, vy_min);
2542 vy = _mm256_min_ps(vy, vy_max);
2543 _mm256_storeu_ps(y, vy);
2544 y += 8;
2545 }
2546 if XNN_UNLIKELY(n != 0) {
2547 assert(n >= 1 * sizeof(float));
2548 assert(n <= 7 * sizeof(float));
2549     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2550
2551 const __m256 va = _mm256_maskload_ps(a, vmask);
2552 const __m256 vb = _mm256_maskload_ps(b, vmask);
2553
2554 __m256 vy = _mm256_add_ps(va, vb);
2555 vy = _mm256_max_ps(vy, vy_min);
2556 vy = _mm256_min_ps(vy, vy_max);
2557
2558 __m128 vy_lo = _mm256_castps256_ps128(vy);
2559 if (n & (4 * sizeof(float))) {
2560 _mm_storeu_ps(y, vy_lo);
2561 vy_lo = _mm256_extractf128_ps(vy, 1);
2562 y += 4;
2563 }
2564 if (n & (2 * sizeof(float))) {
2565 _mm_storel_pi((__m64*) y, vy_lo);
2566 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
2567 y += 2;
2568 }
2569 if (n & (1 * sizeof(float))) {
2570 _mm_store_ss(y, vy_lo);
2571 }
2572 }
2573 }
2574
xnn_f32_vaddc_minmax_ukernel__avx_x16(size_t n,const float * a,const float * b,float * y,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2575 void xnn_f32_vaddc_minmax_ukernel__avx_x16(
2576 size_t n,
2577 const float* a,
2578 const float* b,
2579 float* y,
2580 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2581 {
2582 assert(n != 0);
2583 assert(n % sizeof(float) == 0);
2584 assert(a != NULL);
2585 assert(b != NULL);
2586 assert(y != NULL);
2587
2588 const __m256 vy_min = _mm256_load_ps(params->avx.min);
2589 const __m256 vy_max = _mm256_load_ps(params->avx.max);
2590
2591 const __m256 vb = _mm256_broadcast_ss(b);
2592 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2593 const __m256 va01234567 = _mm256_loadu_ps(a);
2594 const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
2595 a += 16;
2596
2597 __m256 vy01234567 = _mm256_add_ps(va01234567, vb);
2598 __m256 vy89ABCDEF = _mm256_add_ps(va89ABCDEF, vb);
2599
2600
2601 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2602 vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
2603
2604 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2605 vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
2606
2607 _mm256_storeu_ps(y, vy01234567);
2608 _mm256_storeu_ps(y + 8, vy89ABCDEF);
2609 y += 16;
2610 }
2611 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2612 const __m256 va = _mm256_loadu_ps(a);
2613 a += 8;
2614
2615 __m256 vy = _mm256_add_ps(va, vb);
2616 vy = _mm256_max_ps(vy, vy_min);
2617 vy = _mm256_min_ps(vy, vy_max);
2618 _mm256_storeu_ps(y, vy);
2619 y += 8;
2620 }
2621 if XNN_UNLIKELY(n != 0) {
2622 assert(n >= 1 * sizeof(float));
2623 assert(n <= 7 * sizeof(float));
2624     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2625
2626 const __m256 va = _mm256_maskload_ps(a, vmask);
2627
2628 __m256 vy = _mm256_add_ps(va, vb);
2629 vy = _mm256_max_ps(vy, vy_min);
2630 vy = _mm256_min_ps(vy, vy_max);
2631
2632 __m128 vy_lo = _mm256_castps256_ps128(vy);
2633 if (n & (4 * sizeof(float))) {
2634 _mm_storeu_ps(y, vy_lo);
2635 vy_lo = _mm256_extractf128_ps(vy, 1);
2636 y += 4;
2637 }
2638 if (n & (2 * sizeof(float))) {
2639 _mm_storel_pi((__m64*) y, vy_lo);
2640 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
2641 y += 2;
2642 }
2643 if (n & (1 * sizeof(float))) {
2644 _mm_store_ss(y, vy_lo);
2645 }
2646 }
2647 }
2648
xnn_f32_vdiv_minmax_ukernel__avx_x16(size_t n,const float * a,const float * b,float * y,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2649 void xnn_f32_vdiv_minmax_ukernel__avx_x16(
2650 size_t n,
2651 const float* a,
2652 const float* b,
2653 float* y,
2654 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2655 {
2656 assert(n != 0);
2657 assert(n % sizeof(float) == 0);
2658 assert(a != NULL);
2659 assert(b != NULL);
2660 assert(y != NULL);
2661
2662 const __m256 vy_min = _mm256_load_ps(params->avx.min);
2663 const __m256 vy_max = _mm256_load_ps(params->avx.max);
2664
2665 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2666 const __m256 va01234567 = _mm256_loadu_ps(a);
2667 const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
2668 a += 16;
2669
2670 const __m256 vb01234567 = _mm256_loadu_ps(b);
2671 const __m256 vb89ABCDEF = _mm256_loadu_ps(b + 8);
2672 b += 16;
2673
2674 __m256 vy01234567 = _mm256_div_ps(va01234567, vb01234567);
2675 __m256 vy89ABCDEF = _mm256_div_ps(va89ABCDEF, vb89ABCDEF);
2676
2677
2678 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2679 vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
2680
2681 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2682 vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
2683
2684 _mm256_storeu_ps(y, vy01234567);
2685 _mm256_storeu_ps(y + 8, vy89ABCDEF);
2686 y += 16;
2687 }
2688 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2689 const __m256 va = _mm256_loadu_ps(a);
2690 a += 8;
2691
2692 const __m256 vb = _mm256_loadu_ps(b);
2693 b += 8;
2694
2695 __m256 vy = _mm256_div_ps(va, vb);
2696 vy = _mm256_max_ps(vy, vy_min);
2697 vy = _mm256_min_ps(vy, vy_max);
2698 _mm256_storeu_ps(y, vy);
2699 y += 8;
2700 }
2701 if XNN_UNLIKELY(n != 0) {
2702 assert(n >= 1 * sizeof(float));
2703 assert(n <= 7 * sizeof(float));
2704     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2705
2706 const __m256 va = _mm256_maskload_ps(a, vmask);
2707 const __m256 vb = _mm256_maskload_ps(b, vmask);
2708
2709 __m256 vy = _mm256_div_ps(va, vb);
2710 vy = _mm256_max_ps(vy, vy_min);
2711 vy = _mm256_min_ps(vy, vy_max);
2712
2713 __m128 vy_lo = _mm256_castps256_ps128(vy);
2714 if (n & (4 * sizeof(float))) {
2715 _mm_storeu_ps(y, vy_lo);
2716 vy_lo = _mm256_extractf128_ps(vy, 1);
2717 y += 4;
2718 }
2719 if (n & (2 * sizeof(float))) {
2720 _mm_storel_pi((__m64*) y, vy_lo);
2721 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
2722 y += 2;
2723 }
2724 if (n & (1 * sizeof(float))) {
2725 _mm_store_ss(y, vy_lo);
2726 }
2727 }
2728 }
2729
xnn_f32_vdivc_minmax_ukernel__avx_x16(size_t n,const float * a,const float * b,float * y,const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2730 void xnn_f32_vdivc_minmax_ukernel__avx_x16(
2731 size_t n,
2732 const float* a,
2733 const float* b,
2734 float* y,
2735 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
2736 {
2737 assert(n != 0);
2738 assert(n % sizeof(float) == 0);
2739 assert(a != NULL);
2740 assert(b != NULL);
2741 assert(y != NULL);
2742
2743 const __m256 vy_min = _mm256_load_ps(params->avx.min);
2744 const __m256 vy_max = _mm256_load_ps(params->avx.max);
2745
2746 const __m256 vb = _mm256_broadcast_ss(b);
2747 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2748 const __m256 va01234567 = _mm256_loadu_ps(a);
2749 const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
2750 a += 16;
2751
2752 __m256 vy01234567 = _mm256_div_ps(va01234567, vb);
2753 __m256 vy89ABCDEF = _mm256_div_ps(va89ABCDEF, vb);
2754
2755
2756 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2757 vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
2758
2759 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2760 vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
2761
2762 _mm256_storeu_ps(y, vy01234567);
2763 _mm256_storeu_ps(y + 8, vy89ABCDEF);
2764 y += 16;
2765 }
2766 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2767 const __m256 va = _mm256_loadu_ps(a);
2768 a += 8;
2769
2770 __m256 vy = _mm256_div_ps(va, vb);
2771 vy = _mm256_max_ps(vy, vy_min);
2772 vy = _mm256_min_ps(vy, vy_max);
2773 _mm256_storeu_ps(y, vy);
2774 y += 8;
2775 }
2776 if XNN_UNLIKELY(n != 0) {
2777 assert(n >= 1 * sizeof(float));
2778 assert(n <= 7 * sizeof(float));
2779     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2780
2781 const __m256 va = _mm256_maskload_ps(a, vmask);
2782
2783 __m256 vy = _mm256_div_ps(va, vb);
2784 vy = _mm256_max_ps(vy, vy_min);
2785 vy = _mm256_min_ps(vy, vy_max);
2786
2787 __m128 vy_lo = _mm256_castps256_ps128(vy);
2788 if (n & (4 * sizeof(float))) {
2789 _mm_storeu_ps(y, vy_lo);
2790 vy_lo = _mm256_extractf128_ps(vy, 1);
2791 y += 4;
2792 }
2793 if (n & (2 * sizeof(float))) {
2794 _mm_storel_pi((__m64*) y, vy_lo);
2795 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
2796 y += 2;
2797 }
2798 if (n & (1 * sizeof(float))) {
2799 _mm_store_ss(y, vy_lo);
2800 }
2801 }
2802 }
2803
xnn_f32_vmax_ukernel__avx_x16(size_t n,const float * a,const float * b,float * y,const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS (1)])2804 void xnn_f32_vmax_ukernel__avx_x16(
2805 size_t n,
2806 const float* a,
2807 const float* b,
2808 float* y,
2809 const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
2810 {
2811 assert(n != 0);
2812 assert(n % sizeof(float) == 0);
2813 assert(a != NULL);
2814 assert(b != NULL);
2815 assert(y != NULL);
2816
2817
2818 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2819 const __m256 va01234567 = _mm256_loadu_ps(a);
2820 const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
2821 a += 16;
2822
2823 const __m256 vb01234567 = _mm256_loadu_ps(b);
2824 const __m256 vb89ABCDEF = _mm256_loadu_ps(b + 8);
2825 b += 16;
2826
2827 __m256 vy01234567 = _mm256_max_ps(va01234567, vb01234567);
2828 __m256 vy89ABCDEF = _mm256_max_ps(va89ABCDEF, vb89ABCDEF);
2829
2830
2831
2832 _mm256_storeu_ps(y, vy01234567);
2833 _mm256_storeu_ps(y + 8, vy89ABCDEF);
2834 y += 16;
2835 }
2836 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2837 const __m256 va = _mm256_loadu_ps(a);
2838 a += 8;
2839
2840 const __m256 vb = _mm256_loadu_ps(b);
2841 b += 8;
2842
2843 __m256 vy = _mm256_max_ps(va, vb);
2844 _mm256_storeu_ps(y, vy);
2845 y += 8;
2846 }
2847 if XNN_UNLIKELY(n != 0) {
2848 assert(n >= 1 * sizeof(float));
2849 assert(n <= 7 * sizeof(float));
2850     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2851
2852 const __m256 va = _mm256_maskload_ps(a, vmask);
2853 const __m256 vb = _mm256_maskload_ps(b, vmask);
2854
2855 __m256 vy = _mm256_max_ps(va, vb);
2856
2857 __m128 vy_lo = _mm256_castps256_ps128(vy);
2858 if (n & (4 * sizeof(float))) {
2859 _mm_storeu_ps(y, vy_lo);
2860 vy_lo = _mm256_extractf128_ps(vy, 1);
2861 y += 4;
2862 }
2863 if (n & (2 * sizeof(float))) {
2864 _mm_storel_pi((__m64*) y, vy_lo);
2865 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
2866 y += 2;
2867 }
2868 if (n & (1 * sizeof(float))) {
2869 _mm_store_ss(y, vy_lo);
2870 }
2871 }
2872 }
2873
xnn_f32_vmaxc_ukernel__avx_x16(size_t n,const float * a,const float * b,float * y,const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS (1)])2874 void xnn_f32_vmaxc_ukernel__avx_x16(
2875 size_t n,
2876 const float* a,
2877 const float* b,
2878 float* y,
2879 const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
2880 {
2881 assert(n != 0);
2882 assert(n % sizeof(float) == 0);
2883 assert(a != NULL);
2884 assert(b != NULL);
2885 assert(y != NULL);
2886
2887
2888 const __m256 vb = _mm256_broadcast_ss(b);
2889 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2890 const __m256 va01234567 = _mm256_loadu_ps(a);
2891 const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
2892 a += 16;
2893
2894 __m256 vy01234567 = _mm256_max_ps(va01234567, vb);
2895 __m256 vy89ABCDEF = _mm256_max_ps(va89ABCDEF, vb);
2896
2897
2898
2899 _mm256_storeu_ps(y, vy01234567);
2900 _mm256_storeu_ps(y + 8, vy89ABCDEF);
2901 y += 16;
2902 }
2903 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2904 const __m256 va = _mm256_loadu_ps(a);
2905 a += 8;
2906
2907 __m256 vy = _mm256_max_ps(va, vb);
2908 _mm256_storeu_ps(y, vy);
2909 y += 8;
2910 }
2911 if XNN_UNLIKELY(n != 0) {
2912 assert(n >= 1 * sizeof(float));
2913 assert(n <= 7 * sizeof(float));
2914     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2915
2916 const __m256 va = _mm256_maskload_ps(a, vmask);
2917
2918 __m256 vy = _mm256_max_ps(va, vb);
2919
2920 __m128 vy_lo = _mm256_castps256_ps128(vy);
2921 if (n & (4 * sizeof(float))) {
2922 _mm_storeu_ps(y, vy_lo);
2923 vy_lo = _mm256_extractf128_ps(vy, 1);
2924 y += 4;
2925 }
2926 if (n & (2 * sizeof(float))) {
2927 _mm_storel_pi((__m64*) y, vy_lo);
2928 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
2929 y += 2;
2930 }
2931 if (n & (1 * sizeof(float))) {
2932 _mm_store_ss(y, vy_lo);
2933 }
2934 }
2935 }
2936
xnn_f32_vmin_ukernel__avx_x16(size_t n,const float * a,const float * b,float * y,const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS (1)])2937 void xnn_f32_vmin_ukernel__avx_x16(
2938 size_t n,
2939 const float* a,
2940 const float* b,
2941 float* y,
2942 const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
2943 {
2944 assert(n != 0);
2945 assert(n % sizeof(float) == 0);
2946 assert(a != NULL);
2947 assert(b != NULL);
2948 assert(y != NULL);
2949
2950
2951 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2952 const __m256 va01234567 = _mm256_loadu_ps(a);
2953 const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
2954 a += 16;
2955
2956 const __m256 vb01234567 = _mm256_loadu_ps(b);
2957 const __m256 vb89ABCDEF = _mm256_loadu_ps(b + 8);
2958 b += 16;
2959
2960 __m256 vy01234567 = _mm256_min_ps(va01234567, vb01234567);
2961 __m256 vy89ABCDEF = _mm256_min_ps(va89ABCDEF, vb89ABCDEF);
2962
2963
2964
2965 _mm256_storeu_ps(y, vy01234567);
2966 _mm256_storeu_ps(y + 8, vy89ABCDEF);
2967 y += 16;
2968 }
2969 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2970 const __m256 va = _mm256_loadu_ps(a);
2971 a += 8;
2972
2973 const __m256 vb = _mm256_loadu_ps(b);
2974 b += 8;
2975
2976 __m256 vy = _mm256_min_ps(va, vb);
2977 _mm256_storeu_ps(y, vy);
2978 y += 8;
2979 }
2980 if XNN_UNLIKELY(n != 0) {
2981 assert(n >= 1 * sizeof(float));
2982 assert(n <= 7 * sizeof(float));
2983 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
2984
2985 const __m256 va = _mm256_maskload_ps(a, vmask);
2986 const __m256 vb = _mm256_maskload_ps(b, vmask);
2987
2988 __m256 vy = _mm256_min_ps(va, vb);
2989
2990 __m128 vy_lo = _mm256_castps256_ps128(vy);
2991 if (n & (4 * sizeof(float))) {
2992 _mm_storeu_ps(y, vy_lo);
2993 vy_lo = _mm256_extractf128_ps(vy, 1);
2994 y += 4;
2995 }
2996 if (n & (2 * sizeof(float))) {
2997 _mm_storel_pi((__m64*) y, vy_lo);
2998 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
2999 y += 2;
3000 }
3001 if (n & (1 * sizeof(float))) {
3002 _mm_store_ss(y, vy_lo);
3003 }
3004 }
3005 }
3006
3007 void xnn_f32_vminc_ukernel__avx_x16(
3008 size_t n,
3009 const float* a,
3010 const float* b,
3011 float* y,
3012 const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
3013 {
3014 assert(n != 0);
3015 assert(n % sizeof(float) == 0);
3016 assert(a != NULL);
3017 assert(b != NULL);
3018 assert(y != NULL);
3019
3020
3021 const __m256 vb = _mm256_broadcast_ss(b);
3022 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3023 const __m256 va01234567 = _mm256_loadu_ps(a);
3024 const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3025 a += 16;
3026
3027 __m256 vy01234567 = _mm256_min_ps(va01234567, vb);
3028 __m256 vy89ABCDEF = _mm256_min_ps(va89ABCDEF, vb);
3029
3030
3031
3032 _mm256_storeu_ps(y, vy01234567);
3033 _mm256_storeu_ps(y + 8, vy89ABCDEF);
3034 y += 16;
3035 }
3036 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3037 const __m256 va = _mm256_loadu_ps(a);
3038 a += 8;
3039
3040 __m256 vy = _mm256_min_ps(va, vb);
3041 _mm256_storeu_ps(y, vy);
3042 y += 8;
3043 }
3044 if XNN_UNLIKELY(n != 0) {
3045 assert(n >= 1 * sizeof(float));
3046 assert(n <= 7 * sizeof(float));
3047 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3048
3049 const __m256 va = _mm256_maskload_ps(a, vmask);
3050
3051 __m256 vy = _mm256_min_ps(va, vb);
3052
3053 __m128 vy_lo = _mm256_castps256_ps128(vy);
3054 if (n & (4 * sizeof(float))) {
3055 _mm_storeu_ps(y, vy_lo);
3056 vy_lo = _mm256_extractf128_ps(vy, 1);
3057 y += 4;
3058 }
3059 if (n & (2 * sizeof(float))) {
3060 _mm_storel_pi((__m64*) y, vy_lo);
3061 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3062 y += 2;
3063 }
3064 if (n & (1 * sizeof(float))) {
3065 _mm_store_ss(y, vy_lo);
3066 }
3067 }
3068 }
3069
3070 void xnn_f32_vmul_minmax_ukernel__avx_x16(
3071 size_t n,
3072 const float* a,
3073 const float* b,
3074 float* y,
3075 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3076 {
3077 assert(n != 0);
3078 assert(n % sizeof(float) == 0);
3079 assert(a != NULL);
3080 assert(b != NULL);
3081 assert(y != NULL);
3082
3083 const __m256 vy_min = _mm256_load_ps(params->avx.min);
3084 const __m256 vy_max = _mm256_load_ps(params->avx.max);
3085
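  // minmax variant: each product is clamped into [params->avx.min, params->avx.max]
  // with a max followed by a min before being stored.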
3086 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3087 const __m256 va01234567 = _mm256_loadu_ps(a);
3088 const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3089 a += 16;
3090
3091 const __m256 vb01234567 = _mm256_loadu_ps(b);
3092 const __m256 vb89ABCDEF = _mm256_loadu_ps(b + 8);
3093 b += 16;
3094
3095 __m256 vy01234567 = _mm256_mul_ps(va01234567, vb01234567);
3096 __m256 vy89ABCDEF = _mm256_mul_ps(va89ABCDEF, vb89ABCDEF);
3097
3098
3099 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
3100 vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
3101
3102 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
3103 vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
3104
3105 _mm256_storeu_ps(y, vy01234567);
3106 _mm256_storeu_ps(y + 8, vy89ABCDEF);
3107 y += 16;
3108 }
3109 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3110 const __m256 va = _mm256_loadu_ps(a);
3111 a += 8;
3112
3113 const __m256 vb = _mm256_loadu_ps(b);
3114 b += 8;
3115
3116 __m256 vy = _mm256_mul_ps(va, vb);
3117 vy = _mm256_max_ps(vy, vy_min);
3118 vy = _mm256_min_ps(vy, vy_max);
3119 _mm256_storeu_ps(y, vy);
3120 y += 8;
3121 }
3122 if XNN_UNLIKELY(n != 0) {
3123 assert(n >= 1 * sizeof(float));
3124 assert(n <= 7 * sizeof(float));
3125 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3126
3127 const __m256 va = _mm256_maskload_ps(a, vmask);
3128 const __m256 vb = _mm256_maskload_ps(b, vmask);
3129
3130 __m256 vy = _mm256_mul_ps(va, vb);
3131 vy = _mm256_max_ps(vy, vy_min);
3132 vy = _mm256_min_ps(vy, vy_max);
3133
3134 __m128 vy_lo = _mm256_castps256_ps128(vy);
3135 if (n & (4 * sizeof(float))) {
3136 _mm_storeu_ps(y, vy_lo);
3137 vy_lo = _mm256_extractf128_ps(vy, 1);
3138 y += 4;
3139 }
3140 if (n & (2 * sizeof(float))) {
3141 _mm_storel_pi((__m64*) y, vy_lo);
3142 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3143 y += 2;
3144 }
3145 if (n & (1 * sizeof(float))) {
3146 _mm_store_ss(y, vy_lo);
3147 }
3148 }
3149 }
3150
3151 void xnn_f32_vmulc_minmax_ukernel__avx_x16(
3152 size_t n,
3153 const float* a,
3154 const float* b,
3155 float* y,
3156 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3157 {
3158 assert(n != 0);
3159 assert(n % sizeof(float) == 0);
3160 assert(a != NULL);
3161 assert(b != NULL);
3162 assert(y != NULL);
3163
3164 const __m256 vy_min = _mm256_load_ps(params->avx.min);
3165 const __m256 vy_max = _mm256_load_ps(params->avx.max);
3166
3167 const __m256 vb = _mm256_broadcast_ss(b);
3168 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3169 const __m256 va01234567 = _mm256_loadu_ps(a);
3170 const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3171 a += 16;
3172
3173 __m256 vy01234567 = _mm256_mul_ps(va01234567, vb);
3174 __m256 vy89ABCDEF = _mm256_mul_ps(va89ABCDEF, vb);
3175
3176
3177 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
3178 vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
3179
3180 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
3181 vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
3182
3183 _mm256_storeu_ps(y, vy01234567);
3184 _mm256_storeu_ps(y + 8, vy89ABCDEF);
3185 y += 16;
3186 }
3187 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3188 const __m256 va = _mm256_loadu_ps(a);
3189 a += 8;
3190
3191 __m256 vy = _mm256_mul_ps(va, vb);
3192 vy = _mm256_max_ps(vy, vy_min);
3193 vy = _mm256_min_ps(vy, vy_max);
3194 _mm256_storeu_ps(y, vy);
3195 y += 8;
3196 }
3197 if XNN_UNLIKELY(n != 0) {
3198 assert(n >= 1 * sizeof(float));
3199 assert(n <= 7 * sizeof(float));
3200 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3201
3202 const __m256 va = _mm256_maskload_ps(a, vmask);
3203
3204 __m256 vy = _mm256_mul_ps(va, vb);
3205 vy = _mm256_max_ps(vy, vy_min);
3206 vy = _mm256_min_ps(vy, vy_max);
3207
3208 __m128 vy_lo = _mm256_castps256_ps128(vy);
3209 if (n & (4 * sizeof(float))) {
3210 _mm_storeu_ps(y, vy_lo);
3211 vy_lo = _mm256_extractf128_ps(vy, 1);
3212 y += 4;
3213 }
3214 if (n & (2 * sizeof(float))) {
3215 _mm_storel_pi((__m64*) y, vy_lo);
3216 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3217 y += 2;
3218 }
3219 if (n & (1 * sizeof(float))) {
3220 _mm_store_ss(y, vy_lo);
3221 }
3222 }
3223 }
3224
3225 void xnn_f32_vrdivc_minmax_ukernel__avx_x16(
3226 size_t n,
3227 const float* a,
3228 const float* b,
3229 float* y,
3230 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3231 {
3232 assert(n != 0);
3233 assert(n % sizeof(float) == 0);
3234 assert(a != NULL);
3235 assert(b != NULL);
3236 assert(y != NULL);
3237
3238 const __m256 vy_min = _mm256_load_ps(params->avx.min);
3239 const __m256 vy_max = _mm256_load_ps(params->avx.max);
3240
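  // "rdivc" = reversed division by a constant: the broadcast scalar b is the numerator,
  // so each output is b / a[i], clamped to [min, max].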
3241 const __m256 vb = _mm256_broadcast_ss(b);
3242 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3243 const __m256 va01234567 = _mm256_loadu_ps(a);
3244 const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3245 a += 16;
3246
3247 __m256 vy01234567 = _mm256_div_ps(vb, va01234567);
3248 __m256 vy89ABCDEF = _mm256_div_ps(vb, va89ABCDEF);
3249
3250
3251 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
3252 vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
3253
3254 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
3255 vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
3256
3257 _mm256_storeu_ps(y, vy01234567);
3258 _mm256_storeu_ps(y + 8, vy89ABCDEF);
3259 y += 16;
3260 }
3261 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3262 const __m256 va = _mm256_loadu_ps(a);
3263 a += 8;
3264
3265 __m256 vy = _mm256_div_ps(vb, va);
3266 vy = _mm256_max_ps(vy, vy_min);
3267 vy = _mm256_min_ps(vy, vy_max);
3268 _mm256_storeu_ps(y, vy);
3269 y += 8;
3270 }
3271 if XNN_UNLIKELY(n != 0) {
3272 assert(n >= 1 * sizeof(float));
3273 assert(n <= 7 * sizeof(float));
3274 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3275
3276 const __m256 va = _mm256_maskload_ps(a, vmask);
3277
3278 __m256 vy = _mm256_div_ps(vb, va);
3279 vy = _mm256_max_ps(vy, vy_min);
3280 vy = _mm256_min_ps(vy, vy_max);
3281
3282 __m128 vy_lo = _mm256_castps256_ps128(vy);
3283 if (n & (4 * sizeof(float))) {
3284 _mm_storeu_ps(y, vy_lo);
3285 vy_lo = _mm256_extractf128_ps(vy, 1);
3286 y += 4;
3287 }
3288 if (n & (2 * sizeof(float))) {
3289 _mm_storel_pi((__m64*) y, vy_lo);
3290 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3291 y += 2;
3292 }
3293 if (n & (1 * sizeof(float))) {
3294 _mm_store_ss(y, vy_lo);
3295 }
3296 }
3297 }
3298
3299 void xnn_f32_vrsubc_minmax_ukernel__avx_x16(
3300 size_t n,
3301 const float* a,
3302 const float* b,
3303 float* y,
3304 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3305 {
3306 assert(n != 0);
3307 assert(n % sizeof(float) == 0);
3308 assert(a != NULL);
3309 assert(b != NULL);
3310 assert(y != NULL);
3311
3312 const __m256 vy_min = _mm256_load_ps(params->avx.min);
3313 const __m256 vy_max = _mm256_load_ps(params->avx.max);
3314
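  // "rsubc" = reversed subtraction with a constant: computes b - a[i] with the broadcast
  // scalar b as the minuend, clamped to [min, max].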
3315 const __m256 vb = _mm256_broadcast_ss(b);
3316 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3317 const __m256 va01234567 = _mm256_loadu_ps(a);
3318 const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3319 a += 16;
3320
3321 __m256 vy01234567 = _mm256_sub_ps(vb, va01234567);
3322 __m256 vy89ABCDEF = _mm256_sub_ps(vb, va89ABCDEF);
3323
3324
3325 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
3326 vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
3327
3328 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
3329 vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
3330
3331 _mm256_storeu_ps(y, vy01234567);
3332 _mm256_storeu_ps(y + 8, vy89ABCDEF);
3333 y += 16;
3334 }
3335 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3336 const __m256 va = _mm256_loadu_ps(a);
3337 a += 8;
3338
3339 __m256 vy = _mm256_sub_ps(vb, va);
3340 vy = _mm256_max_ps(vy, vy_min);
3341 vy = _mm256_min_ps(vy, vy_max);
3342 _mm256_storeu_ps(y, vy);
3343 y += 8;
3344 }
3345 if XNN_UNLIKELY(n != 0) {
3346 assert(n >= 1 * sizeof(float));
3347 assert(n <= 7 * sizeof(float));
3348 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3349
3350 const __m256 va = _mm256_maskload_ps(a, vmask);
3351
3352 __m256 vy = _mm256_sub_ps(vb, va);
3353 vy = _mm256_max_ps(vy, vy_min);
3354 vy = _mm256_min_ps(vy, vy_max);
3355
3356 __m128 vy_lo = _mm256_castps256_ps128(vy);
3357 if (n & (4 * sizeof(float))) {
3358 _mm_storeu_ps(y, vy_lo);
3359 vy_lo = _mm256_extractf128_ps(vy, 1);
3360 y += 4;
3361 }
3362 if (n & (2 * sizeof(float))) {
3363 _mm_storel_pi((__m64*) y, vy_lo);
3364 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3365 y += 2;
3366 }
3367 if (n & (1 * sizeof(float))) {
3368 _mm_store_ss(y, vy_lo);
3369 }
3370 }
3371 }
3372
3373 void xnn_f32_vsqrdiff_ukernel__avx_x16(
3374 size_t n,
3375 const float* a,
3376 const float* b,
3377 float* y,
3378 const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
3379 {
3380 assert(n != 0);
3381 assert(n % sizeof(float) == 0);
3382 assert(a != NULL);
3383 assert(b != NULL);
3384 assert(y != NULL);
3385
3386
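  // Squared difference: y[i] = (a[i] - b[i])^2, a subtract followed by a self-multiply;
  // this kernel applies no output clamping.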
3387 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3388 const __m256 va01234567 = _mm256_loadu_ps(a);
3389 const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3390 a += 16;
3391
3392 const __m256 vb01234567 = _mm256_loadu_ps(b);
3393 const __m256 vb89ABCDEF = _mm256_loadu_ps(b + 8);
3394 b += 16;
3395
3396 __m256 vy01234567 = _mm256_sub_ps(va01234567, vb01234567);
3397 __m256 vy89ABCDEF = _mm256_sub_ps(va89ABCDEF, vb89ABCDEF);
3398
3399 vy01234567 = _mm256_mul_ps(vy01234567, vy01234567);
3400 vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vy89ABCDEF);
3401
3402
3403 _mm256_storeu_ps(y, vy01234567);
3404 _mm256_storeu_ps(y + 8, vy89ABCDEF);
3405 y += 16;
3406 }
3407 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3408 const __m256 va = _mm256_loadu_ps(a);
3409 a += 8;
3410
3411 const __m256 vb = _mm256_loadu_ps(b);
3412 b += 8;
3413
3414 __m256 vy = _mm256_sub_ps(va, vb);
3415 vy = _mm256_mul_ps(vy, vy);
3416 _mm256_storeu_ps(y, vy);
3417 y += 8;
3418 }
3419 if XNN_UNLIKELY(n != 0) {
3420 assert(n >= 1 * sizeof(float));
3421 assert(n <= 7 * sizeof(float));
3422 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3423
3424 const __m256 va = _mm256_maskload_ps(a, vmask);
3425 const __m256 vb = _mm256_maskload_ps(b, vmask);
3426
3427 __m256 vy = _mm256_sub_ps(va, vb);
3428 vy = _mm256_mul_ps(vy, vy);
3429
3430 __m128 vy_lo = _mm256_castps256_ps128(vy);
3431 if (n & (4 * sizeof(float))) {
3432 _mm_storeu_ps(y, vy_lo);
3433 vy_lo = _mm256_extractf128_ps(vy, 1);
3434 y += 4;
3435 }
3436 if (n & (2 * sizeof(float))) {
3437 _mm_storel_pi((__m64*) y, vy_lo);
3438 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3439 y += 2;
3440 }
3441 if (n & (1 * sizeof(float))) {
3442 _mm_store_ss(y, vy_lo);
3443 }
3444 }
3445 }
3446
3447 void xnn_f32_vsqrdiffc_ukernel__avx_x16(
3448 size_t n,
3449 const float* a,
3450 const float* b,
3451 float* y,
3452 const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
3453 {
3454 assert(n != 0);
3455 assert(n % sizeof(float) == 0);
3456 assert(a != NULL);
3457 assert(b != NULL);
3458 assert(y != NULL);
3459
3460
3461 const __m256 vb = _mm256_broadcast_ss(b);
3462 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3463 const __m256 va01234567 = _mm256_loadu_ps(a);
3464 const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3465 a += 16;
3466
3467 __m256 vy01234567 = _mm256_sub_ps(va01234567, vb);
3468 __m256 vy89ABCDEF = _mm256_sub_ps(va89ABCDEF, vb);
3469
3470 vy01234567 = _mm256_mul_ps(vy01234567, vy01234567);
3471 vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vy89ABCDEF);
3472
3473
3474 _mm256_storeu_ps(y, vy01234567);
3475 _mm256_storeu_ps(y + 8, vy89ABCDEF);
3476 y += 16;
3477 }
3478 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3479 const __m256 va = _mm256_loadu_ps(a);
3480 a += 8;
3481
3482 __m256 vy = _mm256_sub_ps(va, vb);
3483 vy = _mm256_mul_ps(vy, vy);
3484 _mm256_storeu_ps(y, vy);
3485 y += 8;
3486 }
3487 if XNN_UNLIKELY(n != 0) {
3488 assert(n >= 1 * sizeof(float));
3489 assert(n <= 7 * sizeof(float));
3490 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3491
3492 const __m256 va = _mm256_maskload_ps(a, vmask);
3493
3494 __m256 vy = _mm256_sub_ps(va, vb);
3495 vy = _mm256_mul_ps(vy, vy);
3496
3497 __m128 vy_lo = _mm256_castps256_ps128(vy);
3498 if (n & (4 * sizeof(float))) {
3499 _mm_storeu_ps(y, vy_lo);
3500 vy_lo = _mm256_extractf128_ps(vy, 1);
3501 y += 4;
3502 }
3503 if (n & (2 * sizeof(float))) {
3504 _mm_storel_pi((__m64*) y, vy_lo);
3505 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3506 y += 2;
3507 }
3508 if (n & (1 * sizeof(float))) {
3509 _mm_store_ss(y, vy_lo);
3510 }
3511 }
3512 }
3513
3514 void xnn_f32_vsub_minmax_ukernel__avx_x16(
3515 size_t n,
3516 const float* a,
3517 const float* b,
3518 float* y,
3519 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3520 {
3521 assert(n != 0);
3522 assert(n % sizeof(float) == 0);
3523 assert(a != NULL);
3524 assert(b != NULL);
3525 assert(y != NULL);
3526
3527 const __m256 vy_min = _mm256_load_ps(params->avx.min);
3528 const __m256 vy_max = _mm256_load_ps(params->avx.max);
3529
3530 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3531 const __m256 va01234567 = _mm256_loadu_ps(a);
3532 const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3533 a += 16;
3534
3535 const __m256 vb01234567 = _mm256_loadu_ps(b);
3536 const __m256 vb89ABCDEF = _mm256_loadu_ps(b + 8);
3537 b += 16;
3538
3539 __m256 vy01234567 = _mm256_sub_ps(va01234567, vb01234567);
3540 __m256 vy89ABCDEF = _mm256_sub_ps(va89ABCDEF, vb89ABCDEF);
3541
3542
3543 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
3544 vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
3545
3546 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
3547 vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
3548
3549 _mm256_storeu_ps(y, vy01234567);
3550 _mm256_storeu_ps(y + 8, vy89ABCDEF);
3551 y += 16;
3552 }
3553 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3554 const __m256 va = _mm256_loadu_ps(a);
3555 a += 8;
3556
3557 const __m256 vb = _mm256_loadu_ps(b);
3558 b += 8;
3559
3560 __m256 vy = _mm256_sub_ps(va, vb);
3561 vy = _mm256_max_ps(vy, vy_min);
3562 vy = _mm256_min_ps(vy, vy_max);
3563 _mm256_storeu_ps(y, vy);
3564 y += 8;
3565 }
3566 if XNN_UNLIKELY(n != 0) {
3567 assert(n >= 1 * sizeof(float));
3568 assert(n <= 7 * sizeof(float));
3569 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3570
3571 const __m256 va = _mm256_maskload_ps(a, vmask);
3572 const __m256 vb = _mm256_maskload_ps(b, vmask);
3573
3574 __m256 vy = _mm256_sub_ps(va, vb);
3575 vy = _mm256_max_ps(vy, vy_min);
3576 vy = _mm256_min_ps(vy, vy_max);
3577
3578 __m128 vy_lo = _mm256_castps256_ps128(vy);
3579 if (n & (4 * sizeof(float))) {
3580 _mm_storeu_ps(y, vy_lo);
3581 vy_lo = _mm256_extractf128_ps(vy, 1);
3582 y += 4;
3583 }
3584 if (n & (2 * sizeof(float))) {
3585 _mm_storel_pi((__m64*) y, vy_lo);
3586 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3587 y += 2;
3588 }
3589 if (n & (1 * sizeof(float))) {
3590 _mm_store_ss(y, vy_lo);
3591 }
3592 }
3593 }
3594
3595 void xnn_f32_vsubc_minmax_ukernel__avx_x16(
3596 size_t n,
3597 const float* a,
3598 const float* b,
3599 float* y,
3600 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3601 {
3602 assert(n != 0);
3603 assert(n % sizeof(float) == 0);
3604 assert(a != NULL);
3605 assert(b != NULL);
3606 assert(y != NULL);
3607
3608 const __m256 vy_min = _mm256_load_ps(params->avx.min);
3609 const __m256 vy_max = _mm256_load_ps(params->avx.max);
3610
3611 const __m256 vb = _mm256_broadcast_ss(b);
3612 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3613 const __m256 va01234567 = _mm256_loadu_ps(a);
3614 const __m256 va89ABCDEF = _mm256_loadu_ps(a + 8);
3615 a += 16;
3616
3617 __m256 vy01234567 = _mm256_sub_ps(va01234567, vb);
3618 __m256 vy89ABCDEF = _mm256_sub_ps(va89ABCDEF, vb);
3619
3620
3621 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
3622 vy89ABCDEF = _mm256_max_ps(vy89ABCDEF, vy_min);
3623
3624 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
3625 vy89ABCDEF = _mm256_min_ps(vy89ABCDEF, vy_max);
3626
3627 _mm256_storeu_ps(y, vy01234567);
3628 _mm256_storeu_ps(y + 8, vy89ABCDEF);
3629 y += 16;
3630 }
3631 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3632 const __m256 va = _mm256_loadu_ps(a);
3633 a += 8;
3634
3635 __m256 vy = _mm256_sub_ps(va, vb);
3636 vy = _mm256_max_ps(vy, vy_min);
3637 vy = _mm256_min_ps(vy, vy_max);
3638 _mm256_storeu_ps(y, vy);
3639 y += 8;
3640 }
3641 if XNN_UNLIKELY(n != 0) {
3642 assert(n >= 1 * sizeof(float));
3643 assert(n <= 7 * sizeof(float));
3644 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3645
3646 const __m256 va = _mm256_maskload_ps(a, vmask);
3647
3648 __m256 vy = _mm256_sub_ps(va, vb);
3649 vy = _mm256_max_ps(vy, vy_min);
3650 vy = _mm256_min_ps(vy, vy_max);
3651
3652 __m128 vy_lo = _mm256_castps256_ps128(vy);
3653 if (n & (4 * sizeof(float))) {
3654 _mm_storeu_ps(y, vy_lo);
3655 vy_lo = _mm256_extractf128_ps(vy, 1);
3656 y += 4;
3657 }
3658 if (n & (2 * sizeof(float))) {
3659 _mm_storel_pi((__m64*) y, vy_lo);
3660 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3661 y += 2;
3662 }
3663 if (n & (1 * sizeof(float))) {
3664 _mm_store_ss(y, vy_lo);
3665 }
3666 }
3667 }
3668
3669 void xnn_f32_vclamp_ukernel__avx_x16(
3670 size_t n,
3671 const float* x,
3672 float* y,
3673 const union xnn_f32_minmax_params params[restrict XNN_MIN_ELEMENTS(1)])
3674 {
3675 assert(n != 0);
3676 assert(n % sizeof(float) == 0);
3677 assert(x != NULL);
3678 assert(y != NULL);
3679
3680 const __m256 vy_min = _mm256_load_ps(params->avx.min);
3681 const __m256 vy_max = _mm256_load_ps(params->avx.max);
3682
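  // Clamp: each element is bounded into [params->avx.min, params->avx.max].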
3683 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3684 __m256 vacc01234567 = _mm256_loadu_ps(x);
3685 __m256 vacc89ABCDEF = _mm256_loadu_ps(x + 8);
3686 x += 16;
3687
3688 vacc01234567 = _mm256_max_ps(vacc01234567, vy_min);
3689 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEF, vy_min);
3690
3691 vacc01234567 = _mm256_min_ps(vacc01234567, vy_max);
3692 vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vy_max);
3693
3694 _mm256_storeu_ps(y, vacc01234567);
3695 _mm256_storeu_ps(y + 8, vacc89ABCDEF);
3696 y += 16;
3697 }
3698 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3699 __m256 vacc = _mm256_loadu_ps(x);
3700 x += 8;
3701
3702 vacc = _mm256_max_ps(vacc, vy_min);
3703 vacc = _mm256_min_ps(vacc, vy_max);
3704
3705 _mm256_storeu_ps(y, vacc);
3706 y += 8;
3707 }
3708 if XNN_UNLIKELY(n != 0) {
3709 assert(n >= 1 * sizeof(float));
3710 assert(n <= 7 * sizeof(float));
3711 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3712
3713 __m256 vacc = _mm256_maskload_ps(x, vmask);
3714 vacc = _mm256_max_ps(vacc, vy_min);
3715 vacc = _mm256_min_ps(vacc, vy_max);
3716
3717 __m128 vacc_lo = _mm256_castps256_ps128(vacc);
3718 if (n & (4 * sizeof(float))) {
3719 _mm_storeu_ps(y, vacc_lo);
3720 vacc_lo = _mm256_extractf128_ps(vacc, 1);
3721 y += 4;
3722 }
3723 if (n & (2 * sizeof(float))) {
3724 _mm_storel_pi((__m64*) y, vacc_lo);
3725 vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo);
3726 y += 2;
3727 }
3728 if (n & (1 * sizeof(float))) {
3729 _mm_store_ss(y, vacc_lo);
3730 }
3731 }
3732 }
3733
3734 void xnn_f32_velu_ukernel__avx_rr2_lut4_p4_perm_x32(
3735 size_t n,
3736 const float* x,
3737 float* y,
3738 const union xnn_f32_elu_params params[restrict XNN_MIN_ELEMENTS(1)])
3739 {
3740 assert(n % sizeof(float) == 0);
3741
3742 const __m256 vprescale = _mm256_load_ps(params->avx_rr2_lut4_p4.prescale);
3743 const __m256 valpha = _mm256_load_ps(params->avx_rr2_lut4_p4.alpha);
3744 const __m256 vbeta = _mm256_load_ps(params->avx_rr2_lut4_p4.beta);
3745 const __m256 vsat_cutoff = _mm256_load_ps(params->avx_rr2_lut4_p4.sat_cutoff);
3746 const __m256 vmagic_bias = _mm256_load_ps(params->avx_rr2_lut4_p4.magic_bias);
3747 const __m256 vlog2e = _mm256_load_ps(params->avx_rr2_lut4_p4.log2e);
3748 const __m256 vindex_mask = _mm256_load_ps((const float*) params->avx_rr2_lut4_p4.index_mask);
3749 const __m256 vtable = _mm256_load_ps(params->avx_rr2_lut4_p4.table);
3750 const __m256 vminus_ln2_hi = _mm256_load_ps(params->avx_rr2_lut4_p4.minus_ln2_hi);
3751 const __m256 vminus_ln2_lo = _mm256_load_ps(params->avx_rr2_lut4_p4.minus_ln2_lo);
3752 const __m256 vc4 = _mm256_load_ps(params->avx_rr2_lut4_p4.c4);
3753 const __m256 vc3 = _mm256_load_ps(params->avx_rr2_lut4_p4.c3);
3754 const __m256 vc2 = _mm256_load_ps(params->avx_rr2_lut4_p4.c2);
3755 const __m256 vone = _mm256_load_ps(params->avx_rr2_lut4_p4.one);
3756
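  // ELU sketch: z = max(sat_cutoff, prescale * x) is reduced as exp(z) ~= 2^e * table[i] * p(t):
  // the low 2 bits of the magic-biased n select one of the 4 table entries (permutevar),
  // the remaining bits are shifted by 21 into the float exponent field, and p(t) is a
  // degree-4 polynomial in t, the residual formed with the two-word (-ln2 hi/lo) reduction.
  // The final blendv keeps beta * x for non-negative inputs and alpha * (exp(z) - 1) otherwise.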
3757 for (; n >= 32 * sizeof(float); n -= 32 * sizeof(float)) {
3758 __m256 vx0 = _mm256_loadu_ps(x);
3759 __m256 vx1 = _mm256_loadu_ps(x + 8);
3760 __m256 vx2 = _mm256_loadu_ps(x + 16);
3761 __m256 vx3 = _mm256_loadu_ps(x + 24);
3762 x += 32;
3763
3764 const __m256 vz0 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx0, vprescale));
3765 const __m256 vz1 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx1, vprescale));
3766 const __m256 vz2 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx2, vprescale));
3767 const __m256 vz3 = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx3, vprescale));
3768
3769 __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vz0, vlog2e), vmagic_bias);
3770 __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vz1, vlog2e), vmagic_bias);
3771 __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vz2, vlog2e), vmagic_bias);
3772 __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vz3, vlog2e), vmagic_bias);
3773
3774 __m256 ven0 = _mm256_andnot_ps(vindex_mask, vn0);
3775 const __m256 vl0 = _mm256_permutevar_ps(vtable, _mm256_castps_si256(vn0));
3776 const __m128 ven0_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven0)), 21));
3777 __m256 ven1 = _mm256_andnot_ps(vindex_mask, vn1);
3778 const __m256 vl1 = _mm256_permutevar_ps(vtable, _mm256_castps_si256(vn1));
3779 const __m128 ven1_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven1)), 21));
3780 __m256 ven2 = _mm256_andnot_ps(vindex_mask, vn2);
3781 const __m256 vl2 = _mm256_permutevar_ps(vtable, _mm256_castps_si256(vn2));
3782 const __m128 ven2_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven2)), 21));
3783 __m256 ven3 = _mm256_andnot_ps(vindex_mask, vn3);
3784 const __m256 vl3 = _mm256_permutevar_ps(vtable, _mm256_castps_si256(vn3));
3785 const __m128 ven3_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven3)), 21));
3786
3787 vn0 = _mm256_sub_ps(vn0, vmagic_bias);
3788 const __m128 ven0_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven0, 1)), 21));
3789 vn1 = _mm256_sub_ps(vn1, vmagic_bias);
3790 const __m128 ven1_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven1, 1)), 21));
3791 vn2 = _mm256_sub_ps(vn2, vmagic_bias);
3792 const __m128 ven2_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven2, 1)), 21));
3793 vn3 = _mm256_sub_ps(vn3, vmagic_bias);
3794 const __m128 ven3_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven3, 1)), 21));
3795
3796 __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vz0);
3797 ven0 = _mm256_insertf128_ps(_mm256_castps128_ps256(ven0_lo), ven0_hi, 1);
3798 __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vz1);
3799 ven1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ven1_lo), ven1_hi, 1);
3800 __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vz2);
3801 ven2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ven2_lo), ven2_hi, 1);
3802 __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vz3);
3803 ven3 = _mm256_insertf128_ps(_mm256_castps128_ps256(ven3_lo), ven3_hi, 1);
3804
3805 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0);
3806 __m256 vs0 = _mm256_mul_ps(vl0, ven0);
3807 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1);
3808 __m256 vs1 = _mm256_mul_ps(vl1, ven1);
3809 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2);
3810 __m256 vs2 = _mm256_mul_ps(vl2, ven2);
3811 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3);
3812 __m256 vs3 = _mm256_mul_ps(vl3, ven3);
3813
3814 __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc4, vt0), vc3);
3815 __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc4, vt1), vc3);
3816 __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc4, vt2), vc3);
3817 __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc4, vt3), vc3);
3818
3819 vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2);
3820 vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2);
3821 vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2);
3822 vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2);
3823
3824 vp0 = _mm256_mul_ps(vp0, vt0);
3825 vp1 = _mm256_mul_ps(vp1, vt1);
3826 vp2 = _mm256_mul_ps(vp2, vt2);
3827 vp3 = _mm256_mul_ps(vp3, vt3);
3828
3829 vt0 = _mm256_mul_ps(vt0, vs0);
3830 vs0 = _mm256_sub_ps(vs0, vone);
3831 vt1 = _mm256_mul_ps(vt1, vs1);
3832 vs1 = _mm256_sub_ps(vs1, vone);
3833 vt2 = _mm256_mul_ps(vt2, vs2);
3834 vs2 = _mm256_sub_ps(vs2, vone);
3835 vt3 = _mm256_mul_ps(vt3, vs3);
3836 vs3 = _mm256_sub_ps(vs3, vone);
3837
3838 vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vt0);
3839 vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vt1);
3840 vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vt2);
3841 vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vt3);
3842
3843 const __m256 ve0 = _mm256_mul_ps(_mm256_add_ps(vp0, vs0), valpha);
3844 vx0 = _mm256_mul_ps(vx0, vbeta);
3845 const __m256 ve1 = _mm256_mul_ps(_mm256_add_ps(vp1, vs1), valpha);
3846 vx1 = _mm256_mul_ps(vx1, vbeta);
3847 const __m256 ve2 = _mm256_mul_ps(_mm256_add_ps(vp2, vs2), valpha);
3848 vx2 = _mm256_mul_ps(vx2, vbeta);
3849 const __m256 ve3 = _mm256_mul_ps(_mm256_add_ps(vp3, vs3), valpha);
3850 vx3 = _mm256_mul_ps(vx3, vbeta);
3851
3852 const __m256 vy0 = _mm256_blendv_ps(vx0, ve0, vx0);
3853 const __m256 vy1 = _mm256_blendv_ps(vx1, ve1, vx1);
3854 const __m256 vy2 = _mm256_blendv_ps(vx2, ve2, vx2);
3855 const __m256 vy3 = _mm256_blendv_ps(vx3, ve3, vx3);
3856
3857 _mm256_storeu_ps(y, vy0);
3858 _mm256_storeu_ps(y + 8, vy1);
3859 _mm256_storeu_ps(y + 16, vy2);
3860 _mm256_storeu_ps(y + 24, vy3);
3861 y += 32;
3862 }
3863 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3864 __m256 vx = _mm256_loadu_ps(x);
3865 x += 8;
3866
3867 const __m256 vz = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx, vprescale));
3868
3869 __m256 vn = _mm256_add_ps(_mm256_mul_ps(vz, vlog2e), vmagic_bias);
3870 __m256 ven = _mm256_andnot_ps(vindex_mask, vn);
3871 const __m256 vl = _mm256_permutevar_ps(vtable, _mm256_castps_si256(vn));
3872 const __m128 ven_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven)), 21));
3873 vn = _mm256_sub_ps(vn, vmagic_bias);
3874 const __m128 ven_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven, 1)), 21));
3875
3876 __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vz);
3877 ven = _mm256_insertf128_ps(_mm256_castps128_ps256(ven_lo), ven_hi, 1);
3878 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);
3879 __m256 vs = _mm256_mul_ps(vl, ven);
3880
3881 __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc4, vt), vc3);
3882 vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2);
3883 vp = _mm256_mul_ps(vp, vt);
3884
3885 vt = _mm256_mul_ps(vt, vs);
3886 vs = _mm256_sub_ps(vs, vone);
3887 vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vt);
3888
3889 const __m256 ve = _mm256_mul_ps(_mm256_add_ps(vp, vs), valpha);
3890 vx = _mm256_mul_ps(vx, vbeta);
3891 const __m256 vy = _mm256_blendv_ps(vx, ve, vx);
3892
3893 _mm256_storeu_ps(y, vy);
3894 y += 8;
3895 }
3896 if XNN_UNLIKELY(n != 0) {
3897 assert(n >= 1 * sizeof(float));
3898 assert(n <= 7 * sizeof(float));
3899 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx_rr2_p6.mask_table[7] - n));
3900
3901 __m256 vx = _mm256_maskload_ps(x, vmask);
3902
3903 const __m256 vz = _mm256_max_ps(vsat_cutoff, _mm256_mul_ps(vx, vprescale));
3904
3905 __m256 vn = _mm256_add_ps(_mm256_mul_ps(vz, vlog2e), vmagic_bias);
3906 __m256 ven = _mm256_andnot_ps(vindex_mask, vn);
3907 const __m256 vl = _mm256_permutevar_ps(vtable, _mm256_castps_si256(vn));
3908 const __m128 ven_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(ven)), 21));
3909 vn = _mm256_sub_ps(vn, vmagic_bias);
3910 const __m128 ven_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(ven, 1)), 21));
3911
3912 __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vz);
3913 ven = _mm256_insertf128_ps(_mm256_castps128_ps256(ven_lo), ven_hi, 1);
3914 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);
3915 __m256 vs = _mm256_mul_ps(vl, ven);
3916
3917 __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc4, vt), vc3);
3918 vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2);
3919 vp = _mm256_mul_ps(vp, vt);
3920
3921 vt = _mm256_mul_ps(vt, vs);
3922 vs = _mm256_sub_ps(vs, vone);
3923 vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vt);
3924
3925 const __m256 ve = _mm256_mul_ps(_mm256_add_ps(vp, vs), valpha);
3926 vx = _mm256_mul_ps(vx, vbeta);
3927 const __m256 vy = _mm256_blendv_ps(vx, ve, vx);
3928
3929 __m128 vy_lo = _mm256_castps256_ps128(vy);
3930 if (n & (4 * sizeof(float))) {
3931 _mm_storeu_ps(y, vy_lo);
3932 vy_lo = _mm256_extractf128_ps(vy, 1);
3933 y += 4;
3934 }
3935 if (n & (2 * sizeof(float))) {
3936 _mm_storel_pi((__m64*) y, vy_lo);
3937 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
3938 y += 2;
3939 }
3940 if (n & (1 * sizeof(float))) {
3941 _mm_store_ss(y, vy_lo);
3942 }
3943 }
3944 }
3945
3946 void xnn_f32_vhswish_ukernel__avx_x16(
3947 size_t n,
3948 const float* x,
3949 float* y,
3950 const union xnn_f32_hswish_params params[restrict XNN_MIN_ELEMENTS(1)])
3951 {
3952 assert(n != 0);
3953 assert(n % sizeof(float) == 0);
3954
3955 const __m256 vsixth = _mm256_load_ps(params->avx.sixth);
3956 const __m256 vhalf = _mm256_load_ps(params->avx.half);
3957 const __m256 vone = _mm256_load_ps(params->avx.one);
3958 const __m256 vzero = _mm256_setzero_ps();
3959
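  // hardswish: y = x * clamp(x/6 + 1/2, 0, 1), using the broadcast constants sixth,
  // half, and one from params.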
3960 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
3961 const __m256 vx01234567 = _mm256_loadu_ps(x);
3962 const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
3963 x += 16;
3964
3965 __m256 vacc01234567 = _mm256_mul_ps(vx01234567, vsixth);
3966 __m256 vacc89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vsixth);
3967
3968 vacc01234567 = _mm256_add_ps(vacc01234567, vhalf);
3969 vacc89ABCDEF = _mm256_add_ps(vacc89ABCDEF, vhalf);
3970
3971 vacc01234567 = _mm256_max_ps(vacc01234567, vzero);
3972 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEF, vzero);
3973
3974 vacc01234567 = _mm256_min_ps(vacc01234567, vone);
3975 vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vone);
3976
3977 vacc01234567 = _mm256_mul_ps(vacc01234567, vx01234567);
3978 vacc89ABCDEF = _mm256_mul_ps(vacc89ABCDEF, vx89ABCDEF);
3979
3980 _mm256_storeu_ps(y, vacc01234567);
3981 _mm256_storeu_ps(y + 8, vacc89ABCDEF);
3982 y += 16;
3983 }
3984 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
3985 const __m256 vx = _mm256_loadu_ps(x);
3986 x += 8;
3987 __m256 vacc = _mm256_mul_ps(vx, vsixth);
3988 vacc = _mm256_add_ps(vacc, vhalf);
3989 vacc = _mm256_max_ps(vacc, vzero);
3990 vacc = _mm256_min_ps(vacc, vone);
3991 vacc = _mm256_mul_ps(vacc, vx);
3992 _mm256_storeu_ps(y, vacc);
3993 y += 8;
3994 }
3995 if XNN_UNLIKELY(n != 0) {
3996 assert(n >= 1 * sizeof(float));
3997 assert(n <= 7 * sizeof(float));
3998 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
3999
4000 const __m256 vx = _mm256_maskload_ps(x, vmask);
4001 __m256 vacc = _mm256_mul_ps(vx, vsixth);
4002 vacc = _mm256_add_ps(vacc, vhalf);
4003 vacc = _mm256_max_ps(vacc, vzero);
4004 vacc = _mm256_min_ps(vacc, vone);
4005 vacc = _mm256_mul_ps(vacc, vx);
4006
4007 __m128 vacc_lo = _mm256_castps256_ps128(vacc);
4008 if (n & (4 * sizeof(float))) {
4009 _mm_storeu_ps(y, vacc_lo);
4010 vacc_lo = _mm256_extractf128_ps(vacc, 1);
4011 y += 4;
4012 }
4013 if (n & (2 * sizeof(float))) {
4014 _mm_storel_pi((__m64*) y, vacc_lo);
4015 vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo);
4016 y += 2;
4017 }
4018 if (n & (1 * sizeof(float))) {
4019 _mm_store_ss(y, vacc_lo);
4020 }
4021 }
4022 }
4023
4024 void xnn_f32_vlrelu_ukernel__avx_x16(
4025 size_t n,
4026 const float* x,
4027 float* y,
4028 const union xnn_f32_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)])
4029 {
4030 assert(n != 0);
4031 assert(n % sizeof(float) == 0);
4032
4033 const __m256 vslope = _mm256_load_ps(params->avx.slope);
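  // Leaky ReLU: x * slope is computed for every lane, and blendv with x itself as the
  // mask keeps x * slope only where the sign bit of x is set (negative inputs).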
4034 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4035 const __m256 vx01234567 = _mm256_loadu_ps(x);
4036 const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4037 x += 16;
4038
4039 __m256 vacc01234567 = _mm256_mul_ps(vx01234567, vslope);
4040 __m256 vacc89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vslope);
4041
4042 vacc01234567 = _mm256_blendv_ps(vx01234567, vacc01234567, vx01234567);
4043 vacc89ABCDEF = _mm256_blendv_ps(vx89ABCDEF, vacc89ABCDEF, vx89ABCDEF);
4044
4045 _mm256_storeu_ps(y, vacc01234567);
4046 _mm256_storeu_ps(y + 8, vacc89ABCDEF);
4047 y += 16;
4048 }
4049 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4050 const __m256 vx = _mm256_loadu_ps(x);
4051 x += 8;
4052 __m256 vacc = _mm256_mul_ps(vx, vslope);
4053 vacc = _mm256_blendv_ps(vx, vacc, vx);
4054 _mm256_storeu_ps(y, vacc);
4055 y += 8;
4056 }
4057 if XNN_UNLIKELY(n != 0) {
4058 assert(n >= 1 * sizeof(float));
4059 assert(n <= 7 * sizeof(float));
4060 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4061
4062 const __m256 vx = _mm256_maskload_ps(x, vmask);
4063 __m256 vacc = _mm256_mul_ps(vx, vslope);
4064 vacc = _mm256_blendv_ps(vx, vacc, vx);
4065
4066 __m128 vacc_lo = _mm256_castps256_ps128(vacc);
4067 if (n & (4 * sizeof(float))) {
4068 _mm_storeu_ps(y, vacc_lo);
4069 vacc_lo = _mm256_extractf128_ps(vacc, 1);
4070 y += 4;
4071 }
4072 if (n & (2 * sizeof(float))) {
4073 _mm_storel_pi((__m64*) y, vacc_lo);
4074 vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo);
4075 y += 2;
4076 }
4077 if (n & (1 * sizeof(float))) {
4078 _mm_store_ss(y, vacc_lo);
4079 }
4080 }
4081 }
4082
4083 void xnn_f32_vrndd_ukernel__avx_x16(
4084 size_t n,
4085 const float* x,
4086 float* y,
4087 const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
4088 {
4089 assert(n != 0);
4090 assert(n % sizeof(float) == 0);
4091
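  // Rounding kernels: vrndd/vrndne/vrndu/vrndz below differ only in the mode passed to
  // _mm256_round_ps (toward -inf, nearest-even, toward +inf, toward zero), i.e. floor,
  // round-to-nearest-even, ceil, and truncate.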
4092 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4093 const __m256 vx01234567 = _mm256_loadu_ps(x);
4094 const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4095 x += 16;
4096
4097 const __m256 vy01234567 = _mm256_round_ps(vx01234567, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
4098 const __m256 vy89ABCDEF = _mm256_round_ps(vx89ABCDEF, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
4099
4100 _mm256_storeu_ps(y, vy01234567);
4101 _mm256_storeu_ps(y + 8, vy89ABCDEF);
4102 y += 16;
4103 }
4104 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4105 const __m256 vx = _mm256_loadu_ps(x);
4106 x += 8;
4107
4108 const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
4109
4110 _mm256_storeu_ps(y, vy);
4111 y += 8;
4112 }
4113 if XNN_UNLIKELY(n != 0) {
4114 assert(n >= 1 * sizeof(float));
4115 assert(n <= 7 * sizeof(float));
4116 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4117
4118 const __m256 vx = _mm256_maskload_ps(x, vmask);
4119 const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
4120
4121 __m128 vy_lo = _mm256_castps256_ps128(vy);
4122 if (n & (4 * sizeof(float))) {
4123 _mm_storeu_ps(y, vy_lo);
4124 vy_lo = _mm256_extractf128_ps(vy, 1);
4125 y += 4;
4126 }
4127 if (n & (2 * sizeof(float))) {
4128 _mm_storel_pi((__m64*) y, vy_lo);
4129 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4130 y += 2;
4131 }
4132 if (n & (1 * sizeof(float))) {
4133 _mm_store_ss(y, vy_lo);
4134 }
4135 }
4136 }
4137
4138 void xnn_f32_vrndne_ukernel__avx_x16(
4139 size_t n,
4140 const float* x,
4141 float* y,
4142 const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
4143 {
4144 assert(n != 0);
4145 assert(n % sizeof(float) == 0);
4146
4147 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4148 const __m256 vx01234567 = _mm256_loadu_ps(x);
4149 const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4150 x += 16;
4151
4152 const __m256 vy01234567 = _mm256_round_ps(vx01234567, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
4153 const __m256 vy89ABCDEF = _mm256_round_ps(vx89ABCDEF, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
4154
4155 _mm256_storeu_ps(y, vy01234567);
4156 _mm256_storeu_ps(y + 8, vy89ABCDEF);
4157 y += 16;
4158 }
4159 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4160 const __m256 vx = _mm256_loadu_ps(x);
4161 x += 8;
4162
4163 const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
4164
4165 _mm256_storeu_ps(y, vy);
4166 y += 8;
4167 }
4168 if XNN_UNLIKELY(n != 0) {
4169 assert(n >= 1 * sizeof(float));
4170 assert(n <= 7 * sizeof(float));
4171 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4172
4173 const __m256 vx = _mm256_maskload_ps(x, vmask);
4174 const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
4175
4176 __m128 vy_lo = _mm256_castps256_ps128(vy);
4177 if (n & (4 * sizeof(float))) {
4178 _mm_storeu_ps(y, vy_lo);
4179 vy_lo = _mm256_extractf128_ps(vy, 1);
4180 y += 4;
4181 }
4182 if (n & (2 * sizeof(float))) {
4183 _mm_storel_pi((__m64*) y, vy_lo);
4184 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4185 y += 2;
4186 }
4187 if (n & (1 * sizeof(float))) {
4188 _mm_store_ss(y, vy_lo);
4189 }
4190 }
4191 }
4192
4193 void xnn_f32_vrndu_ukernel__avx_x16(
4194 size_t n,
4195 const float* x,
4196 float* y,
4197 const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
4198 {
4199 assert(n != 0);
4200 assert(n % sizeof(float) == 0);
4201
4202 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4203 const __m256 vx01234567 = _mm256_loadu_ps(x);
4204 const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4205 x += 16;
4206
4207 const __m256 vy01234567 = _mm256_round_ps(vx01234567, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
4208 const __m256 vy89ABCDEF = _mm256_round_ps(vx89ABCDEF, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
4209
4210 _mm256_storeu_ps(y, vy01234567);
4211 _mm256_storeu_ps(y + 8, vy89ABCDEF);
4212 y += 16;
4213 }
4214 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4215 const __m256 vx = _mm256_loadu_ps(x);
4216 x += 8;
4217
4218 const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
4219
4220 _mm256_storeu_ps(y, vy);
4221 y += 8;
4222 }
4223 if XNN_UNLIKELY(n != 0) {
4224 assert(n >= 1 * sizeof(float));
4225 assert(n <= 7 * sizeof(float));
4226 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4227
4228 const __m256 vx = _mm256_maskload_ps(x, vmask);
4229 const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
4230
4231 __m128 vy_lo = _mm256_castps256_ps128(vy);
4232 if (n & (4 * sizeof(float))) {
4233 _mm_storeu_ps(y, vy_lo);
4234 vy_lo = _mm256_extractf128_ps(vy, 1);
4235 y += 4;
4236 }
4237 if (n & (2 * sizeof(float))) {
4238 _mm_storel_pi((__m64*) y, vy_lo);
4239 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4240 y += 2;
4241 }
4242 if (n & (1 * sizeof(float))) {
4243 _mm_store_ss(y, vy_lo);
4244 }
4245 }
4246 }
4247
4248 void xnn_f32_vrndz_ukernel__avx_x16(
4249 size_t n,
4250 const float* x,
4251 float* y,
4252 const union xnn_f32_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
4253 {
4254 assert(n != 0);
4255 assert(n % sizeof(float) == 0);
4256
4257 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4258 const __m256 vx01234567 = _mm256_loadu_ps(x);
4259 const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4260 x += 16;
4261
4262 const __m256 vy01234567 = _mm256_round_ps(vx01234567, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
4263 const __m256 vy89ABCDEF = _mm256_round_ps(vx89ABCDEF, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
4264
4265 _mm256_storeu_ps(y, vy01234567);
4266 _mm256_storeu_ps(y + 8, vy89ABCDEF);
4267 y += 16;
4268 }
4269 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4270 const __m256 vx = _mm256_loadu_ps(x);
4271 x += 8;
4272
4273 const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
4274
4275 _mm256_storeu_ps(y, vy);
4276 y += 8;
4277 }
4278 if XNN_UNLIKELY(n != 0) {
4279 assert(n >= 1 * sizeof(float));
4280 assert(n <= 7 * sizeof(float));
4281 const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4282
4283 const __m256 vx = _mm256_maskload_ps(x, vmask);
4284 const __m256 vy = _mm256_round_ps(vx, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
4285
4286 __m128 vy_lo = _mm256_castps256_ps128(vy);
4287 if (n & (4 * sizeof(float))) {
4288 _mm_storeu_ps(y, vy_lo);
4289 vy_lo = _mm256_extractf128_ps(vy, 1);
4290 y += 4;
4291 }
4292 if (n & (2 * sizeof(float))) {
4293 _mm_storel_pi((__m64*) y, vy_lo);
4294 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4295 y += 2;
4296 }
4297 if (n & (1 * sizeof(float))) {
4298 _mm_store_ss(y, vy_lo);
4299 }
4300 }
4301 }
4302
4303 void xnn_f32_vsigmoid_ukernel__avx_rr2_p5_nr2_x40(
4304 size_t n,
4305 const float* x,
4306 float* y,
4307 const union xnn_f32_sigmoid_params params[restrict XNN_MIN_ELEMENTS(1)])
4308 {
4309 assert(n % sizeof(float) == 0);
4310
4311 const __m256 vsign_mask = _mm256_load_ps(params->avx_rr2_p5.sign_mask);
4312 const __m256 vmagic_bias = _mm256_load_ps(params->avx_rr2_p5.magic_bias);
4313 const __m256 vlog2e = _mm256_load_ps(params->avx_rr2_p5.log2e);
4314 const __m256 vminus_ln2_hi = _mm256_load_ps(params->avx_rr2_p5.minus_ln2_hi);
4315 const __m256 vminus_ln2_lo = _mm256_load_ps(params->avx_rr2_p5.minus_ln2_lo);
4316 const __m256 vc5 = _mm256_load_ps(params->avx_rr2_p5.c5);
4317 const __m256 vc4 = _mm256_load_ps(params->avx_rr2_p5.c4);
4318 const __m256 vc3 = _mm256_load_ps(params->avx_rr2_p5.c3);
4319 const __m256 vc2 = _mm256_load_ps(params->avx_rr2_p5.c2);
4320 const __m256 vc1 = _mm256_load_ps(params->avx_rr2_p5.c1);
4321 const __m256 vone = _mm256_load_ps(params->avx_rr2_p5.one);
4322 const __m256 vtwo = _mm256_load_ps(params->avx_rr2_p5.two);
4323 const __m256 vdenorm_cutoff = _mm256_load_ps(params->avx_rr2_p5.denorm_cutoff);
4324
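  // Sigmoid sketch: z = -|x| (OR-ing in the sign bit), e = exp(z) via the usual reduction
  // (magic-biased n, two-word ln2, degree-5 polynomial), then sigma = e / (e + 1) using a
  // reciprocal estimate refined by Newton-Raphson.  Very negative z (below denorm_cutoff)
  // flushes e to zero, and positive x is handled through sigma(x) = 1 - sigma(-x) in the
  // final blendv.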
4325 for (; n >= 40 * sizeof(float); n -= 40 * sizeof(float)) {
4326 const __m256 vx0 = _mm256_loadu_ps(x);
4327 const __m256 vx1 = _mm256_loadu_ps(x + 8);
4328 const __m256 vx2 = _mm256_loadu_ps(x + 16);
4329 const __m256 vx3 = _mm256_loadu_ps(x + 24);
4330 const __m256 vx4 = _mm256_loadu_ps(x + 32);
4331 x += 40;
4332
4333 const __m256 vz0 = _mm256_or_ps(vx0, vsign_mask);
4334 const __m256 vz1 = _mm256_or_ps(vx1, vsign_mask);
4335 const __m256 vz2 = _mm256_or_ps(vx2, vsign_mask);
4336 const __m256 vz3 = _mm256_or_ps(vx3, vsign_mask);
4337 const __m256 vz4 = _mm256_or_ps(vx4, vsign_mask);
4338
4339 __m256 vn0 = _mm256_add_ps(_mm256_mul_ps(vz0, vlog2e), vmagic_bias);
4340 __m256 vn1 = _mm256_add_ps(_mm256_mul_ps(vz1, vlog2e), vmagic_bias);
4341 __m256 vn2 = _mm256_add_ps(_mm256_mul_ps(vz2, vlog2e), vmagic_bias);
4342 __m256 vn3 = _mm256_add_ps(_mm256_mul_ps(vz3, vlog2e), vmagic_bias);
4343 __m256 vn4 = _mm256_add_ps(_mm256_mul_ps(vz4, vlog2e), vmagic_bias);
4344
4345 const __m128 vs_lo0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn0)), 23));
4346 const __m128 vs_hi0 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn0, 1)), 23));
4347 const __m256 vs0 = _mm256_insertf128_ps(_mm256_castps128_ps256(vs_lo0), vs_hi0, 1);
4348 const __m128 vs_lo1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn1)), 23));
4349 const __m128 vs_hi1 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn1, 1)), 23));
4350 const __m256 vs1 = _mm256_insertf128_ps(_mm256_castps128_ps256(vs_lo1), vs_hi1, 1);
4351 const __m128 vs_lo2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn2)), 23));
4352 const __m128 vs_hi2 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn2, 1)), 23));
4353 const __m256 vs2 = _mm256_insertf128_ps(_mm256_castps128_ps256(vs_lo2), vs_hi2, 1);
4354 const __m128 vs_lo3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn3)), 23));
4355 const __m128 vs_hi3 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn3, 1)), 23));
4356 const __m256 vs3 = _mm256_insertf128_ps(_mm256_castps128_ps256(vs_lo3), vs_hi3, 1);
4357 const __m128 vs_lo4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn4)), 23));
4358 const __m128 vs_hi4 = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn4, 1)), 23));
4359 const __m256 vs4 = _mm256_insertf128_ps(_mm256_castps128_ps256(vs_lo4), vs_hi4, 1);
4360
4361 vn0 = _mm256_sub_ps(vn0, vmagic_bias);
4362 vn1 = _mm256_sub_ps(vn1, vmagic_bias);
4363 vn2 = _mm256_sub_ps(vn2, vmagic_bias);
4364 vn3 = _mm256_sub_ps(vn3, vmagic_bias);
4365 vn4 = _mm256_sub_ps(vn4, vmagic_bias);
4366
4367 __m256 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_hi), vz0);
4368 __m256 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_hi), vz1);
4369 __m256 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_hi), vz2);
4370 __m256 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_hi), vz3);
4371 __m256 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_hi), vz4);
4372
4373 vt0 = _mm256_add_ps(_mm256_mul_ps(vn0, vminus_ln2_lo), vt0);
4374 vt1 = _mm256_add_ps(_mm256_mul_ps(vn1, vminus_ln2_lo), vt1);
4375 vt2 = _mm256_add_ps(_mm256_mul_ps(vn2, vminus_ln2_lo), vt2);
4376 vt3 = _mm256_add_ps(_mm256_mul_ps(vn3, vminus_ln2_lo), vt3);
4377 vt4 = _mm256_add_ps(_mm256_mul_ps(vn4, vminus_ln2_lo), vt4);
4378
4379 __m256 vp0 = _mm256_add_ps(_mm256_mul_ps(vc5, vt0), vc4);
4380 __m256 vp1 = _mm256_add_ps(_mm256_mul_ps(vc5, vt1), vc4);
4381 __m256 vp2 = _mm256_add_ps(_mm256_mul_ps(vc5, vt2), vc4);
4382 __m256 vp3 = _mm256_add_ps(_mm256_mul_ps(vc5, vt3), vc4);
4383 __m256 vp4 = _mm256_add_ps(_mm256_mul_ps(vc5, vt4), vc4);
4384
4385 vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc3);
4386 vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc3);
4387 vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc3);
4388 vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc3);
4389 vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc3);
4390
4391 vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc2);
4392 vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc2);
4393 vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc2);
4394 vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc2);
4395 vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc2);
4396
4397 vp0 = _mm256_add_ps(_mm256_mul_ps(vp0, vt0), vc1);
4398 vp1 = _mm256_add_ps(_mm256_mul_ps(vp1, vt1), vc1);
4399 vp2 = _mm256_add_ps(_mm256_mul_ps(vp2, vt2), vc1);
4400 vp3 = _mm256_add_ps(_mm256_mul_ps(vp3, vt3), vc1);
4401 vp4 = _mm256_add_ps(_mm256_mul_ps(vp4, vt4), vc1);
4402
4403 vt0 = _mm256_mul_ps(vt0, vs0);
4404 vt1 = _mm256_mul_ps(vt1, vs1);
4405 vt2 = _mm256_mul_ps(vt2, vs2);
4406 vt3 = _mm256_mul_ps(vt3, vs3);
4407 vt4 = _mm256_mul_ps(vt4, vs4);
4408
4409 const __m256 ve0 = _mm256_add_ps(_mm256_mul_ps(vt0, vp0), vs0);
4410 const __m256 ve1 = _mm256_add_ps(_mm256_mul_ps(vt1, vp1), vs1);
4411 const __m256 ve2 = _mm256_add_ps(_mm256_mul_ps(vt2, vp2), vs2);
4412 const __m256 ve3 = _mm256_add_ps(_mm256_mul_ps(vt3, vp3), vs3);
4413 const __m256 ve4 = _mm256_add_ps(_mm256_mul_ps(vt4, vp4), vs4);
4414
4415 const __m256 vd0 = _mm256_add_ps(ve0, vone);
4416 const __m256 vd1 = _mm256_add_ps(ve1, vone);
4417 const __m256 vd2 = _mm256_add_ps(ve2, vone);
4418 const __m256 vd3 = _mm256_add_ps(ve3, vone);
4419 const __m256 vd4 = _mm256_add_ps(ve4, vone);
4420
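    // _mm256_rcp_ps provides roughly 11-12 bits of precision; the two Newton-Raphson steps
    // below (r <- r * (2 - r * d)) refine 1/d toward full single precision, hence the
    // "nr2" in the kernel name.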
4421 __m256 vr0 = _mm256_rcp_ps(vd0);
4422 __m256 vr1 = _mm256_rcp_ps(vd1);
4423 __m256 vr2 = _mm256_rcp_ps(vd2);
4424 __m256 vr3 = _mm256_rcp_ps(vd3);
4425 __m256 vr4 = _mm256_rcp_ps(vd4);
4426
4427 vr0 = _mm256_mul_ps(vr0, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr0, vd0)));
4428 vr0 = _mm256_mul_ps(vr0, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr0, vd0)));
4429 vr1 = _mm256_mul_ps(vr1, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr1, vd1)));
4430 vr1 = _mm256_mul_ps(vr1, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr1, vd1)));
4431 vr2 = _mm256_mul_ps(vr2, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr2, vd2)));
4432 vr2 = _mm256_mul_ps(vr2, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr2, vd2)));
4433 vr3 = _mm256_mul_ps(vr3, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr3, vd3)));
4434 vr3 = _mm256_mul_ps(vr3, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr3, vd3)));
4435 vr4 = _mm256_mul_ps(vr4, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr4, vd4)));
4436 vr4 = _mm256_mul_ps(vr4, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr4, vd4)));
4437
4438 __m256 vf0 = _mm256_mul_ps(ve0, vr0);
4439 __m256 vf1 = _mm256_mul_ps(ve1, vr1);
4440 __m256 vf2 = _mm256_mul_ps(ve2, vr2);
4441 __m256 vf3 = _mm256_mul_ps(ve3, vr3);
4442 __m256 vf4 = _mm256_mul_ps(ve4, vr4);
4443
4444 vf0 = _mm256_andnot_ps(_mm256_cmp_ps(vz0, vdenorm_cutoff, _CMP_LT_OS), vf0);
4445 vf1 = _mm256_andnot_ps(_mm256_cmp_ps(vz1, vdenorm_cutoff, _CMP_LT_OS), vf1);
4446 vf2 = _mm256_andnot_ps(_mm256_cmp_ps(vz2, vdenorm_cutoff, _CMP_LT_OS), vf2);
4447 vf3 = _mm256_andnot_ps(_mm256_cmp_ps(vz3, vdenorm_cutoff, _CMP_LT_OS), vf3);
4448 vf4 = _mm256_andnot_ps(_mm256_cmp_ps(vz4, vdenorm_cutoff, _CMP_LT_OS), vf4);
4449
4450 vf0 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf0), vf0, vx0);
4451 vf1 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf1), vf1, vx1);
4452 vf2 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf2), vf2, vx2);
4453 vf3 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf3), vf3, vx3);
4454 vf4 = _mm256_blendv_ps(_mm256_sub_ps(vone, vf4), vf4, vx4);
4455
4456 _mm256_storeu_ps(y, vf0);
4457 _mm256_storeu_ps(y + 8, vf1);
4458 _mm256_storeu_ps(y + 16, vf2);
4459 _mm256_storeu_ps(y + 24, vf3);
4460 _mm256_storeu_ps(y + 32, vf4);
4461 y += 40;
4462 }
4463 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4464 const __m256 vx = _mm256_loadu_ps(x);
4465 x += 8;
4466
4467 const __m256 vz = _mm256_or_ps(vx, vsign_mask);
4468
4469 __m256 vn = _mm256_add_ps(_mm256_mul_ps(vz, vlog2e), vmagic_bias);
4470
4471 const __m128 vs_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn)), 23));
4472 const __m128 vs_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn, 1)), 23));
4473 const __m256 vs = _mm256_insertf128_ps(_mm256_castps128_ps256(vs_lo), vs_hi, 1);
4474
4475 vn = _mm256_sub_ps(vn, vmagic_bias);
4476
4477 __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vz);
4478 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);
4479
4480 __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4);
4481 vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3);
4482 vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2);
4483 vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1);
4484
4485 vt = _mm256_mul_ps(vt, vs);
4486 const __m256 ve = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs);
4487
4488 const __m256 vd = _mm256_add_ps(ve, vone);
4489 __m256 vr = _mm256_rcp_ps(vd);
4490 vr = _mm256_mul_ps(vr, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr, vd)));
4491 vr = _mm256_mul_ps(vr, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr, vd)));
4492 __m256 vf = _mm256_mul_ps(ve, vr);
4493
4494 vf = _mm256_andnot_ps(_mm256_cmp_ps(vz, vdenorm_cutoff, _CMP_LT_OS), vf);
4495 vf = _mm256_blendv_ps(_mm256_sub_ps(vone, vf), vf, vx);
4496
4497 _mm256_storeu_ps(y, vf);
4498 y += 8;
4499 }
4500 if XNN_UNLIKELY(n != 0) {
4501 assert(n >= 1 * sizeof(float));
4502 assert(n <= 7 * sizeof(float));
4503    const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx_rr2_p5.mask_table[7] - n));
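    // mask_table is presumed to hold seven all-ones int32 entries followed by
    // seven zeros; loading 8 entries starting n bytes before entry 7 yields a
    // mask with exactly n / sizeof(float) active lanes for the masked load below.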
4504
4505 const __m256 vx = _mm256_maskload_ps(x, vmask);
4506
4507 const __m256 vz = _mm256_or_ps(vx, vsign_mask);
4508
4509 __m256 vn = _mm256_add_ps(_mm256_mul_ps(vz, vlog2e), vmagic_bias);
4510 const __m128 vs_lo = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_castps256_ps128(vn)), 23));
4511 const __m128 vs_hi = _mm_castsi128_ps(_mm_slli_epi32(_mm_castps_si128(_mm256_extractf128_ps(vn, 1)), 23));
4512 const __m256 vs = _mm256_insertf128_ps(_mm256_castps128_ps256(vs_lo), vs_hi, 1);
4513
4514 vn = _mm256_sub_ps(vn, vmagic_bias);
4515
4516 __m256 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_hi), vz);
4517 vt = _mm256_add_ps(_mm256_mul_ps(vn, vminus_ln2_lo), vt);
4518
4519 __m256 vp = _mm256_add_ps(_mm256_mul_ps(vc5, vt), vc4);
4520 vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc3);
4521 vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc2);
4522 vp = _mm256_add_ps(_mm256_mul_ps(vp, vt), vc1);
4523
4524 vt = _mm256_mul_ps(vt, vs);
4525 const __m256 ve = _mm256_add_ps(_mm256_mul_ps(vt, vp), vs);
4526
4527 const __m256 vd = _mm256_add_ps(ve, vone);
4528 __m256 vr = _mm256_rcp_ps(vd);
4529 vr = _mm256_mul_ps(vr, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr, vd)));
4530 vr = _mm256_mul_ps(vr, _mm256_sub_ps(vtwo, _mm256_mul_ps(vr, vd)));
4531 __m256 vf = _mm256_mul_ps(ve, vr);
4532
4533 vf = _mm256_andnot_ps(_mm256_cmp_ps(vz, vdenorm_cutoff, _CMP_LT_OS), vf);
4534 vf = _mm256_blendv_ps(_mm256_sub_ps(vone, vf), vf, vx);
4535
4536 __m128 vf_lo = _mm256_castps256_ps128(vf);
4537 if (n & (4 * sizeof(float))) {
4538 _mm_storeu_ps(y, vf_lo);
4539 vf_lo = _mm256_extractf128_ps(vf, 1);
4540 y += 4;
4541 }
4542 if (n & (2 * sizeof(float))) {
4543 _mm_storel_pi((__m64*) y, vf_lo);
4544 vf_lo = _mm_movehl_ps(vf_lo, vf_lo);
4545 y += 2;
4546 }
4547 if (n & (1 * sizeof(float))) {
4548 _mm_store_ss(y, vf_lo);
4549 }
4550 }
4551 }
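// Illustrative scalar sketch of the sigmoid evaluation above (assumed names,
// not part of the generated code): with z = -|x|,
//   n = round(z * log2(e))        (magic-bias rounding)
//   s = 2^n                       (exponent field built by the 23-bit shift)
//   t = z + n * (-ln2_hi) + n * (-ln2_lo)
//   e = s + s * t * p(t)          (p is the c5..c1 polynomial)
//   f = e / (e + 1)               (division via rcp + two Newton-Raphson steps)
//   sigmoid(x) = (x < 0) ? f : 1 - f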
4552
4553 void xnn_f32_vsqrt_ukernel__avx_sqrt_x8(
4554 size_t n,
4555 const float* x,
4556 float* y,
4557 const union xnn_f32_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)])
4558 {
4559 assert(n != 0);
4560 assert(n % sizeof(float) == 0);
4561
4562 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4563 const __m256 vx = _mm256_loadu_ps(x);
4564 x += 8;
4565 const __m256 vy = _mm256_sqrt_ps(vx);
4566 _mm256_storeu_ps(y, vy);
4567 y += 8;
4568 }
4569 if XNN_UNLIKELY(n != 0) {
4570 assert(n >= 1 * sizeof(float));
4571 assert(n <= 7 * sizeof(float));
4572    const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4573
4574 const __m256 vx = _mm256_maskload_ps(x, vmask);
4575 const __m256 vy = _mm256_sqrt_ps(vx);
4576
4577 __m128 vy_lo = _mm256_castps256_ps128(vy);
4578 if (n & (4 * sizeof(float))) {
4579 _mm_storeu_ps(y, vy_lo);
4580 vy_lo = _mm256_extractf128_ps(vy, 1);
4581 y += 4;
4582 }
4583 if (n & (2 * sizeof(float))) {
4584 _mm_storel_pi((__m64*) y, vy_lo);
4585 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4586 y += 2;
4587 }
4588 if (n & (1 * sizeof(float))) {
4589 _mm_store_ss(y, vy_lo);
4590 }
4591 }
4592 }
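// The kernel above maps directly to VSQRTPS; an equivalent scalar reference
// (illustrative only) is: for (size_t i = 0; i < n / sizeof(float); i++) y[i] = sqrtf(x[i]);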
4593
4594 void xnn_f32_vabs_ukernel__avx_x16(
4595 size_t n,
4596 const float* x,
4597 float* y,
4598 const union xnn_f32_abs_params params[restrict XNN_MIN_ELEMENTS(1)])
4599 {
4600 assert(n != 0);
4601 assert(n % sizeof(float) == 0);
4602 assert(x != NULL);
4603 assert(y != NULL);
4604
4605 const __m256 vnonsign_mask = _mm256_load_ps(params->avx.nonsign_mask);
4606 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4607 const __m256 vx01234567 = _mm256_loadu_ps(x);
4608 const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4609 x += 16;
4610
4611 const __m256 vy01234567 = _mm256_and_ps(vx01234567, vnonsign_mask);
4612 const __m256 vy89ABCDEF = _mm256_and_ps(vx89ABCDEF, vnonsign_mask);
4613
4614 _mm256_storeu_ps(y, vy01234567);
4615 _mm256_storeu_ps(y + 8, vy89ABCDEF);
4616 y += 16;
4617 }
4618 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4619 const __m256 vx = _mm256_loadu_ps(x);
4620 x += 8;
4621 const __m256 vy = _mm256_and_ps(vx, vnonsign_mask);
4622 _mm256_storeu_ps(y, vy);
4623 y += 8;
4624 }
4625 if XNN_UNLIKELY(n != 0) {
4626 assert(n >= 1 * sizeof(float));
4627 assert(n <= 7 * sizeof(float));
4628    const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4629
4630 const __m256 vx = _mm256_maskload_ps(x, vmask);
4631 const __m256 vy = _mm256_and_ps(vx, vnonsign_mask);
4632
4633 __m128 vy_lo = _mm256_castps256_ps128(vy);
4634 if (n & (4 * sizeof(float))) {
4635 _mm_storeu_ps(y, vy_lo);
4636 vy_lo = _mm256_extractf128_ps(vy, 1);
4637 y += 4;
4638 }
4639 if (n & (2 * sizeof(float))) {
4640 _mm_storel_pi((__m64*) y, vy_lo);
4641 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4642 y += 2;
4643 }
4644 if (n & (1 * sizeof(float))) {
4645 _mm_store_ss(y, vy_lo);
4646 }
4647 }
4648 }
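// Absolute value is taken by clearing the sign bit with the nonsign mask;
// scalar reference (illustrative only): y[i] = fabsf(x[i]).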
4649
4650 void xnn_f32_vneg_ukernel__avx_x16(
4651 size_t n,
4652 const float* x,
4653 float* y,
4654 const union xnn_f32_neg_params params[restrict XNN_MIN_ELEMENTS(1)])
4655 {
4656 assert(n != 0);
4657 assert(n % sizeof(float) == 0);
4658 assert(x != NULL);
4659 assert(y != NULL);
4660
4661 const __m256 vsign_mask = _mm256_load_ps(params->sse.sign_mask);
4662 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4663 const __m256 vx01234567 = _mm256_loadu_ps(x);
4664 const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4665 x += 16;
4666
4667 const __m256 vy01234567 = _mm256_xor_ps(vx01234567, vsign_mask);
4668 const __m256 vy89ABCDEF = _mm256_xor_ps(vx89ABCDEF, vsign_mask);
4669
4670 _mm256_storeu_ps(y, vy01234567);
4671 _mm256_storeu_ps(y + 8, vy89ABCDEF);
4672 y += 16;
4673 }
4674 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4675 const __m256 vx = _mm256_loadu_ps(x);
4676 x += 8;
4677 const __m256 vy = _mm256_xor_ps(vx, vsign_mask);
4678 _mm256_storeu_ps(y, vy);
4679 y += 8;
4680 }
4681 if XNN_UNLIKELY(n != 0) {
4682 assert(n >= 1 * sizeof(float));
4683 assert(n <= 7 * sizeof(float));
4684    const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4685
4686 const __m256 vx = _mm256_maskload_ps(x, vmask);
4687 const __m256 vy = _mm256_xor_ps(vx, vsign_mask);
4688
4689 __m128 vy_lo = _mm256_castps256_ps128(vy);
4690 if (n & (4 * sizeof(float))) {
4691 _mm_storeu_ps(y, vy_lo);
4692 vy_lo = _mm256_extractf128_ps(vy, 1);
4693 y += 4;
4694 }
4695 if (n & (2 * sizeof(float))) {
4696 _mm_storel_pi((__m64*) y, vy_lo);
4697 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4698 y += 2;
4699 }
4700 if (n & (1 * sizeof(float))) {
4701 _mm_store_ss(y, vy_lo);
4702 }
4703 }
4704 }
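// Negation flips the sign bit via XOR with the sign mask;
// scalar reference (illustrative only): y[i] = -x[i].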
4705
4706 void xnn_f32_vsqr_ukernel__avx_x16(
4707 size_t n,
4708 const float* x,
4709 float* y,
4710 const union xnn_f32_default_params params[restrict XNN_MIN_ELEMENTS(1)])
4711 {
4712 assert(n != 0);
4713 assert(n % sizeof(float) == 0);
4714 assert(x != NULL);
4715 assert(y != NULL);
4716
4717 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
4718 const __m256 vx01234567 = _mm256_loadu_ps(x);
4719 const __m256 vx89ABCDEF = _mm256_loadu_ps(x + 8);
4720 x += 16;
4721
4722 const __m256 vy01234567 = _mm256_mul_ps(vx01234567, vx01234567);
4723 const __m256 vy89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vx89ABCDEF);
4724
4725 _mm256_storeu_ps(y, vy01234567);
4726 _mm256_storeu_ps(y + 8, vy89ABCDEF);
4727 y += 16;
4728 }
4729 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
4730 const __m256 vx = _mm256_loadu_ps(x);
4731 x += 8;
4732 const __m256 vy = _mm256_mul_ps(vx, vx);
4733 _mm256_storeu_ps(y, vy);
4734 y += 8;
4735 }
4736 if XNN_UNLIKELY(n != 0) {
4737 assert(n >= 1 * sizeof(float));
4738 assert(n <= 7 * sizeof(float));
4739    const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->avx.mask_table[7] - n));
4740
4741 const __m256 vx = _mm256_maskload_ps(x, vmask);
4742 const __m256 vy = _mm256_mul_ps(vx, vx);
4743
4744 __m128 vy_lo = _mm256_castps256_ps128(vy);
4745 if (n & (4 * sizeof(float))) {
4746 _mm_storeu_ps(y, vy_lo);
4747 vy_lo = _mm256_extractf128_ps(vy, 1);
4748 y += 4;
4749 }
4750 if (n & (2 * sizeof(float))) {
4751 _mm_storel_pi((__m64*) y, vy_lo);
4752 vy_lo = _mm_movehl_ps(vy_lo, vy_lo);
4753 y += 2;
4754 }
4755 if (n & (1 * sizeof(float))) {
4756 _mm_store_ss(y, vy_lo);
4757 }
4758 }
4759 }
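// Squaring is a plain elementwise multiply; scalar reference (illustrative only):
// y[i] = x[i] * x[i].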
4760
4761 void xnn_qc8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16(
4762 size_t channels,
4763 size_t output_width,
4764 const int8_t** input,
4765 const void* weights,
4766 int8_t* output,
4767 size_t input_stride,
4768 size_t output_increment,
4769 size_t input_offset,
4770 const int8_t* zero,
4771 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
4772 {
4773 assert(channels != 0);
4774 assert(output_width != 0);
4775
4776 do {
4777 const int8_t* i0 = input[0];
4778 assert(i0 != NULL);
4779 if XNN_UNPREDICTABLE(i0 != zero) {
4780 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
4781 }
4782 const int8_t* i1 = input[1];
4783 assert(i1 != NULL);
4784 if XNN_UNPREDICTABLE(i1 != zero) {
4785 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
4786 }
4787 const int8_t* i2 = input[2];
4788 assert(i2 != NULL);
4789 if XNN_UNPREDICTABLE(i2 != zero) {
4790 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
4791 }
4792 const int8_t* i3 = input[3];
4793 assert(i3 != NULL);
4794 if XNN_UNPREDICTABLE(i3 != zero) {
4795 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
4796 }
4797 const int8_t* i4 = input[4];
4798 assert(i4 != NULL);
4799 if XNN_UNPREDICTABLE(i4 != zero) {
4800 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
4801 }
4802 const int8_t* i5 = input[5];
4803 assert(i5 != NULL);
4804 if XNN_UNPREDICTABLE(i5 != zero) {
4805 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
4806 }
4807 const int8_t* i6 = input[6];
4808 assert(i6 != NULL);
4809 if XNN_UNPREDICTABLE(i6 != zero) {
4810 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
4811 }
4812 const int8_t* i7 = input[7];
4813 assert(i7 != NULL);
4814 if XNN_UNPREDICTABLE(i7 != zero) {
4815 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
4816 }
4817 const int8_t* i8 = input[8];
4818 assert(i8 != NULL);
4819 if XNN_UNPREDICTABLE(i8 != zero) {
4820 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
4821 }
4822 const int8_t* i9 = input[9];
4823 assert(i9 != NULL);
4824 if XNN_UNPREDICTABLE(i9 != zero) {
4825 i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
4826 }
4827 const int8_t* i10 = input[10];
4828 assert(i10 != NULL);
4829 if XNN_UNPREDICTABLE(i10 != zero) {
4830 i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
4831 }
4832 const int8_t* i11 = input[11];
4833 assert(i11 != NULL);
4834 if XNN_UNPREDICTABLE(i11 != zero) {
4835 i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
4836 }
4837 const int8_t* i12 = input[12];
4838 assert(i12 != NULL);
4839 if XNN_UNPREDICTABLE(i12 != zero) {
4840 i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
4841 }
4842 const int8_t* i13 = input[13];
4843 assert(i13 != NULL);
4844 if XNN_UNPREDICTABLE(i13 != zero) {
4845 i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
4846 }
4847 const int8_t* i14 = input[14];
4848 assert(i14 != NULL);
4849 if XNN_UNPREDICTABLE(i14 != zero) {
4850 i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
4851 }
4852 const int8_t* i15 = input[15];
4853 assert(i15 != NULL);
4854 if XNN_UNPREDICTABLE(i15 != zero) {
4855 i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
4856 }
4857 const int8_t* i16 = input[16];
4858 assert(i16 != NULL);
4859 if XNN_UNPREDICTABLE(i16 != zero) {
4860 i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
4861 }
4862 const int8_t* i17 = input[17];
4863 assert(i17 != NULL);
4864 if XNN_UNPREDICTABLE(i17 != zero) {
4865 i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
4866 }
4867 const int8_t* i18 = input[18];
4868 assert(i18 != NULL);
4869 if XNN_UNPREDICTABLE(i18 != zero) {
4870 i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
4871 }
4872 const int8_t* i19 = input[19];
4873 assert(i19 != NULL);
4874 if XNN_UNPREDICTABLE(i19 != zero) {
4875 i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
4876 }
4877 const int8_t* i20 = input[20];
4878 assert(i20 != NULL);
4879 if XNN_UNPREDICTABLE(i20 != zero) {
4880 i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
4881 }
4882 const int8_t* i21 = input[21];
4883 assert(i21 != NULL);
4884 if XNN_UNPREDICTABLE(i21 != zero) {
4885 i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
4886 }
4887 const int8_t* i22 = input[22];
4888 assert(i22 != NULL);
4889 if XNN_UNPREDICTABLE(i22 != zero) {
4890 i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
4891 }
4892 const int8_t* i23 = input[23];
4893 assert(i23 != NULL);
4894 if XNN_UNPREDICTABLE(i23 != zero) {
4895 i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
4896 }
4897 const int8_t* i24 = input[24];
4898 assert(i24 != NULL);
4899 if XNN_UNPREDICTABLE(i24 != zero) {
4900 i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
4901 }
4902 input = (const int8_t**) ((uintptr_t) input + input_stride);
4903
4904 size_t c = channels;
4905 const void* w = weights;
4906 for (; c >= 16; c -= 16) {
4907 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
4908 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
4909 __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
4910 __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
4911
4912
4913 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
4914 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
4915 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t)));
4916 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
4917 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
4918 const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(vi0x89ABCDEF);
4919 const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t)));
4920 const __m128i vxk0x89ABCDEF = _mm_cvtepi8_epi16(vk0x89ABCDEF);
4921 i0 += 16;
4922
4923
4924 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
4925 __m128i vprod89ABCDEF = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
4926
4927
4928 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
4929 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
4930 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t)));
4931 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
4932 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
4933 const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(vi1x89ABCDEF);
4934 const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t)));
4935 const __m128i vxk1x89ABCDEF = _mm_cvtepi8_epi16(vk1x89ABCDEF);
4936 i1 += 16;
4937
4938
4939 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
4940 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF));
4941
4942 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
4943 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
4944 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
4945 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
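      // mul16_add16 scheme: the products of two consecutive taps are formed and
      // summed in 16 bits, then the packed sums are sign-extended to 32 bits
      // (cvtepi16_epi32 for the low half, unpackhi + arithmetic shift by 16 for
      // the high half) and added to the per-channel accumulators.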
4946
4947 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
4948 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
4949 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t)));
4950 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
4951 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
4952 const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(vi2x89ABCDEF);
4953 const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t)));
4954 const __m128i vxk2x89ABCDEF = _mm_cvtepi8_epi16(vk2x89ABCDEF);
4955 i2 += 16;
4956
4957
4958 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
4959 vprod89ABCDEF = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
4960
4961
4962 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
4963 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
4964 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t)));
4965 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
4966 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
4967 const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(vi3x89ABCDEF);
4968 const __m128i vk3x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t)));
4969 const __m128i vxk3x89ABCDEF = _mm_cvtepi8_epi16(vk3x89ABCDEF);
4970 i3 += 16;
4971
4972
4973 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
4974 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF));
4975
4976 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
4977 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
4978 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
4979 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
4980
4981 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
4982 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
4983 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t)));
4984 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
4985 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
4986 const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(vi4x89ABCDEF);
4987 const __m128i vk4x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t)));
4988 const __m128i vxk4x89ABCDEF = _mm_cvtepi8_epi16(vk4x89ABCDEF);
4989 i4 += 16;
4990
4991
4992 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
4993 vprod89ABCDEF = _mm_mullo_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF);
4994
4995
4996 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
4997 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
4998 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t)));
4999 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
5000 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
5001 const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(vi5x89ABCDEF);
5002 const __m128i vk5x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t)));
5003 const __m128i vxk5x89ABCDEF = _mm_cvtepi8_epi16(vk5x89ABCDEF);
5004 i5 += 16;
5005
5006
5007 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
5008 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF));
5009
5010 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5011 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5012 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5013 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5014
5015 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
5016 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
5017 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t)));
5018 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
5019 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
5020 const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(vi6x89ABCDEF);
5021 const __m128i vk6x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t)));
5022 const __m128i vxk6x89ABCDEF = _mm_cvtepi8_epi16(vk6x89ABCDEF);
5023 i6 += 16;
5024
5025
5026 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
5027 vprod89ABCDEF = _mm_mullo_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF);
5028
5029
5030 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
5031 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
5032 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t)));
5033 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
5034 const __m128i vi7x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i7 + 8));
5035 const __m128i vxi7x89ABCDEF = _mm_cvtepi8_epi16(vi7x89ABCDEF);
5036 const __m128i vk7x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t)));
5037 const __m128i vxk7x89ABCDEF = _mm_cvtepi8_epi16(vk7x89ABCDEF);
5038 i7 += 16;
5039
5040
5041 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
5042 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF));
5043
5044 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5045 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5046 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5047 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5048
5049 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
5050 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
5051 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t)));
5052 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
5053 const __m128i vi8x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i8 + 8));
5054 const __m128i vxi8x89ABCDEF = _mm_cvtepi8_epi16(vi8x89ABCDEF);
5055 const __m128i vk8x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t)));
5056 const __m128i vxk8x89ABCDEF = _mm_cvtepi8_epi16(vk8x89ABCDEF);
5057 i8 += 16;
5058
5059
5060 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
5061 vprod89ABCDEF = _mm_mullo_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF);
5062
5063
5064 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
5065 const __m128i vxi9x01234567 = _mm_cvtepi8_epi16(vi9x01234567);
5066 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t)));
5067 const __m128i vxk9x01234567 = _mm_cvtepi8_epi16(vk9x01234567);
5068 const __m128i vi9x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i9 + 8));
5069 const __m128i vxi9x89ABCDEF = _mm_cvtepi8_epi16(vi9x89ABCDEF);
5070 const __m128i vk9x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t)));
5071 const __m128i vxk9x89ABCDEF = _mm_cvtepi8_epi16(vk9x89ABCDEF);
5072 i9 += 16;
5073
5074
5075 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi9x01234567, vxk9x01234567));
5076 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi9x89ABCDEF, vxk9x89ABCDEF));
5077
5078 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5079 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5080 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5081 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5082
5083 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
5084 const __m128i vxi10x01234567 = _mm_cvtepi8_epi16(vi10x01234567);
5085 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t)));
5086 const __m128i vxk10x01234567 = _mm_cvtepi8_epi16(vk10x01234567);
5087 const __m128i vi10x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i10 + 8));
5088 const __m128i vxi10x89ABCDEF = _mm_cvtepi8_epi16(vi10x89ABCDEF);
5089 const __m128i vk10x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t)));
5090 const __m128i vxk10x89ABCDEF = _mm_cvtepi8_epi16(vk10x89ABCDEF);
5091 i10 += 16;
5092
5093
5094 vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
5095 vprod89ABCDEF = _mm_mullo_epi16(vxi10x89ABCDEF, vxk10x89ABCDEF);
5096
5097
5098 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
5099 const __m128i vxi11x01234567 = _mm_cvtepi8_epi16(vi11x01234567);
5100 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t)));
5101 const __m128i vxk11x01234567 = _mm_cvtepi8_epi16(vk11x01234567);
5102 const __m128i vi11x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i11 + 8));
5103 const __m128i vxi11x89ABCDEF = _mm_cvtepi8_epi16(vi11x89ABCDEF);
5104 const __m128i vk11x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t)));
5105 const __m128i vxk11x89ABCDEF = _mm_cvtepi8_epi16(vk11x89ABCDEF);
5106 i11 += 16;
5107
5108
5109 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi11x01234567, vxk11x01234567));
5110 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi11x89ABCDEF, vxk11x89ABCDEF));
5111
5112 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5113 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5114 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5115 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5116
5117 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
5118 const __m128i vxi12x01234567 = _mm_cvtepi8_epi16(vi12x01234567);
5119 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t)));
5120 const __m128i vxk12x01234567 = _mm_cvtepi8_epi16(vk12x01234567);
5121 const __m128i vi12x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i12 + 8));
5122 const __m128i vxi12x89ABCDEF = _mm_cvtepi8_epi16(vi12x89ABCDEF);
5123 const __m128i vk12x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t)));
5124 const __m128i vxk12x89ABCDEF = _mm_cvtepi8_epi16(vk12x89ABCDEF);
5125 i12 += 16;
5126
5127
5128 vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
5129 vprod89ABCDEF = _mm_mullo_epi16(vxi12x89ABCDEF, vxk12x89ABCDEF);
5130
5131
5132 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
5133 const __m128i vxi13x01234567 = _mm_cvtepi8_epi16(vi13x01234567);
5134 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t)));
5135 const __m128i vxk13x01234567 = _mm_cvtepi8_epi16(vk13x01234567);
5136 const __m128i vi13x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i13 + 8));
5137 const __m128i vxi13x89ABCDEF = _mm_cvtepi8_epi16(vi13x89ABCDEF);
5138 const __m128i vk13x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t)));
5139 const __m128i vxk13x89ABCDEF = _mm_cvtepi8_epi16(vk13x89ABCDEF);
5140 i13 += 16;
5141
5142
5143 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi13x01234567, vxk13x01234567));
5144 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi13x89ABCDEF, vxk13x89ABCDEF));
5145
5146 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5147 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5148 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5149 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5150
5151 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
5152 const __m128i vxi14x01234567 = _mm_cvtepi8_epi16(vi14x01234567);
5153 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t)));
5154 const __m128i vxk14x01234567 = _mm_cvtepi8_epi16(vk14x01234567);
5155 const __m128i vi14x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i14 + 8));
5156 const __m128i vxi14x89ABCDEF = _mm_cvtepi8_epi16(vi14x89ABCDEF);
5157 const __m128i vk14x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t)));
5158 const __m128i vxk14x89ABCDEF = _mm_cvtepi8_epi16(vk14x89ABCDEF);
5159 i14 += 16;
5160
5161
5162 vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
5163 vprod89ABCDEF = _mm_mullo_epi16(vxi14x89ABCDEF, vxk14x89ABCDEF);
5164
5165
5166 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
5167 const __m128i vxi15x01234567 = _mm_cvtepi8_epi16(vi15x01234567);
5168 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t)));
5169 const __m128i vxk15x01234567 = _mm_cvtepi8_epi16(vk15x01234567);
5170 const __m128i vi15x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i15 + 8));
5171 const __m128i vxi15x89ABCDEF = _mm_cvtepi8_epi16(vi15x89ABCDEF);
5172 const __m128i vk15x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t)));
5173 const __m128i vxk15x89ABCDEF = _mm_cvtepi8_epi16(vk15x89ABCDEF);
5174 i15 += 16;
5175
5176
5177 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi15x01234567, vxk15x01234567));
5178 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi15x89ABCDEF, vxk15x89ABCDEF));
5179
5180 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5181 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5182 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5183 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5184
5185 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
5186 const __m128i vxi16x01234567 = _mm_cvtepi8_epi16(vi16x01234567);
5187 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t)));
5188 const __m128i vxk16x01234567 = _mm_cvtepi8_epi16(vk16x01234567);
5189 const __m128i vi16x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i16 + 8));
5190 const __m128i vxi16x89ABCDEF = _mm_cvtepi8_epi16(vi16x89ABCDEF);
5191 const __m128i vk16x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t)));
5192 const __m128i vxk16x89ABCDEF = _mm_cvtepi8_epi16(vk16x89ABCDEF);
5193 i16 += 16;
5194
5195
5196 vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
5197 vprod89ABCDEF = _mm_mullo_epi16(vxi16x89ABCDEF, vxk16x89ABCDEF);
5198
5199
5200 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
5201 const __m128i vxi17x01234567 = _mm_cvtepi8_epi16(vi17x01234567);
5202 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t)));
5203 const __m128i vxk17x01234567 = _mm_cvtepi8_epi16(vk17x01234567);
5204 const __m128i vi17x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i17 + 8));
5205 const __m128i vxi17x89ABCDEF = _mm_cvtepi8_epi16(vi17x89ABCDEF);
5206 const __m128i vk17x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t)));
5207 const __m128i vxk17x89ABCDEF = _mm_cvtepi8_epi16(vk17x89ABCDEF);
5208 i17 += 16;
5209
5210
5211 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi17x01234567, vxk17x01234567));
5212 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi17x89ABCDEF, vxk17x89ABCDEF));
5213
5214 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5215 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5216 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5217 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5218
5219 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
5220 const __m128i vxi18x01234567 = _mm_cvtepi8_epi16(vi18x01234567);
5221 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t)));
5222 const __m128i vxk18x01234567 = _mm_cvtepi8_epi16(vk18x01234567);
5223 const __m128i vi18x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i18 + 8));
5224 const __m128i vxi18x89ABCDEF = _mm_cvtepi8_epi16(vi18x89ABCDEF);
5225 const __m128i vk18x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t)));
5226 const __m128i vxk18x89ABCDEF = _mm_cvtepi8_epi16(vk18x89ABCDEF);
5227 i18 += 16;
5228
5229
5230 vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
5231 vprod89ABCDEF = _mm_mullo_epi16(vxi18x89ABCDEF, vxk18x89ABCDEF);
5232
5233
5234 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
5235 const __m128i vxi19x01234567 = _mm_cvtepi8_epi16(vi19x01234567);
5236 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t)));
5237 const __m128i vxk19x01234567 = _mm_cvtepi8_epi16(vk19x01234567);
5238 const __m128i vi19x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i19 + 8));
5239 const __m128i vxi19x89ABCDEF = _mm_cvtepi8_epi16(vi19x89ABCDEF);
5240 const __m128i vk19x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t)));
5241 const __m128i vxk19x89ABCDEF = _mm_cvtepi8_epi16(vk19x89ABCDEF);
5242 i19 += 16;
5243
5244
5245 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi19x01234567, vxk19x01234567));
5246 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi19x89ABCDEF, vxk19x89ABCDEF));
5247
5248 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5249 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5250 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5251 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5252
5253 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
5254 const __m128i vxi20x01234567 = _mm_cvtepi8_epi16(vi20x01234567);
5255 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t)));
5256 const __m128i vxk20x01234567 = _mm_cvtepi8_epi16(vk20x01234567);
5257 const __m128i vi20x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i20 + 8));
5258 const __m128i vxi20x89ABCDEF = _mm_cvtepi8_epi16(vi20x89ABCDEF);
5259 const __m128i vk20x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t)));
5260 const __m128i vxk20x89ABCDEF = _mm_cvtepi8_epi16(vk20x89ABCDEF);
5261 i20 += 16;
5262
5263
5264 vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
5265 vprod89ABCDEF = _mm_mullo_epi16(vxi20x89ABCDEF, vxk20x89ABCDEF);
5266
5267
5268 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
5269 const __m128i vxi21x01234567 = _mm_cvtepi8_epi16(vi21x01234567);
5270 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t)));
5271 const __m128i vxk21x01234567 = _mm_cvtepi8_epi16(vk21x01234567);
5272 const __m128i vi21x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i21 + 8));
5273 const __m128i vxi21x89ABCDEF = _mm_cvtepi8_epi16(vi21x89ABCDEF);
5274 const __m128i vk21x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t)));
5275 const __m128i vxk21x89ABCDEF = _mm_cvtepi8_epi16(vk21x89ABCDEF);
5276 i21 += 16;
5277
5278
5279 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi21x01234567, vxk21x01234567));
5280 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi21x89ABCDEF, vxk21x89ABCDEF));
5281
5282 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5283 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5284 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5285 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5286
5287 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
5288 const __m128i vxi22x01234567 = _mm_cvtepi8_epi16(vi22x01234567);
5289 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t)));
5290 const __m128i vxk22x01234567 = _mm_cvtepi8_epi16(vk22x01234567);
5291 const __m128i vi22x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i22 + 8));
5292 const __m128i vxi22x89ABCDEF = _mm_cvtepi8_epi16(vi22x89ABCDEF);
5293 const __m128i vk22x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t)));
5294 const __m128i vxk22x89ABCDEF = _mm_cvtepi8_epi16(vk22x89ABCDEF);
5295 i22 += 16;
5296
5297
5298 vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
5299 vprod89ABCDEF = _mm_mullo_epi16(vxi22x89ABCDEF, vxk22x89ABCDEF);
5300
5301
5302 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
5303 const __m128i vxi23x01234567 = _mm_cvtepi8_epi16(vi23x01234567);
5304 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t)));
5305 const __m128i vxk23x01234567 = _mm_cvtepi8_epi16(vk23x01234567);
5306 const __m128i vi23x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i23 + 8));
5307 const __m128i vxi23x89ABCDEF = _mm_cvtepi8_epi16(vi23x89ABCDEF);
5308 const __m128i vk23x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t)));
5309 const __m128i vxk23x89ABCDEF = _mm_cvtepi8_epi16(vk23x89ABCDEF);
5310 i23 += 16;
5311
5312
5313 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi23x01234567, vxk23x01234567));
5314 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi23x89ABCDEF, vxk23x89ABCDEF));
5315
5316 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5317 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5318 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5319 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5320
5321 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
5322 const __m128i vxi24x01234567 = _mm_cvtepi8_epi16(vi24x01234567);
5323 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t)));
5324 const __m128i vxk24x01234567 = _mm_cvtepi8_epi16(vk24x01234567);
5325 const __m128i vi24x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i24 + 8));
5326 const __m128i vxi24x89ABCDEF = _mm_cvtepi8_epi16(vi24x89ABCDEF);
5327 const __m128i vk24x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t)));
5328 const __m128i vxk24x89ABCDEF = _mm_cvtepi8_epi16(vk24x89ABCDEF);
5329 i24 += 16;
5330
5331
5332 vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
5333 vprod89ABCDEF = _mm_mullo_epi16(vxi24x89ABCDEF, vxk24x89ABCDEF);
5334
5335 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5336 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5337 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5338 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5339
5340 w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
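      // Weight layout per 16-channel group (as advanced through here): 16 int32
      // biases, then 25 taps x 16 int8 kernel values, followed by the 16 float
      // per-channel scales read immediately below.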
5341
5342 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
5343 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
5344 __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
5345 __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
5346
5347 const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
5348 const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
5349 const __m128 vscale89AB = _mm_loadu_ps((const float*) w + 8);
5350 const __m128 vscaleCDEF = _mm_loadu_ps((const float*) w + 12);
5351 w = (const void*) ((const float*) w + 16);
5352 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
5353 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
5354 vscaled89AB = _mm_mul_ps(vscaled89AB, vscale89AB);
5355 vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscaleCDEF);
5356
5357 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
5358 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
5359 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
5360 vscaled89AB = _mm_min_ps(vscaled89AB, voutput_max_less_zero_point);
5361 vscaledCDEF = _mm_min_ps(vscaledCDEF, voutput_max_less_zero_point);
5362
5363 vacc0123 = _mm_cvtps_epi32(vscaled0123);
5364 vacc4567 = _mm_cvtps_epi32(vscaled4567);
5365 vacc89AB = _mm_cvtps_epi32(vscaled89AB);
5366 vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
5367
5368 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
5369 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5370 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
5371
5372
5373 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
5374
5375 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
5376 vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
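      // fp32 requantization: the int32 accumulators are converted to float, scaled
      // per channel, clamped to output_max - zero_point, rounded back to int32,
      // offset by the zero point with a saturating 16-bit add, packed to int8 with
      // saturation, and finally clamped to output_min.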
5377
5378 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
5379 output += 16;
5380 }
5381 if XNN_UNLIKELY(c != 0) {
5382 const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
5383 do {
5384 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
5385 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
5386
5387
5388 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
5389 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
5390 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k);
5391 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
5392 i0 += 8;
5393
5394
5395 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
5396
5397
5398 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
5399 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
5400 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 16));
5401 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
5402 i1 += 8;
5403
5404
5405 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
5406
5407 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5408 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5409
5410 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
5411 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
5412 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 32));
5413 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
5414 i2 += 8;
5415
5416
5417 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
5418
5419
5420 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
5421 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
5422 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) (k + 48));
5423 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
5424 i3 += 8;
5425
5426
5427 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
5428
5429 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5430 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5431
5432 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
5433 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
5434 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) (k + 64));
5435 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
5436 i4 += 8;
5437
5438
5439 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
5440
5441
5442 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
5443 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
5444 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) (k + 80));
5445 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
5446 i5 += 8;
5447
5448
5449 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
5450
5451 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5452 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5453
5454 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
5455 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
5456 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) (k + 96));
5457 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
5458 i6 += 8;
5459
5460
5461 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
5462
5463
5464 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
5465 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
5466 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) (k + 112));
5467 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
5468 i7 += 8;
5469
5470
5471 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
5472
5473 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5474 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5475
5476 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
5477 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
5478 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) (k + 128));
5479 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
5480 i8 += 8;
5481
5482
5483 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
5484
5485
5486 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
5487 const __m128i vxi9x01234567 = _mm_cvtepi8_epi16(vi9x01234567);
5488 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) (k + 144));
5489 const __m128i vxk9x01234567 = _mm_cvtepi8_epi16(vk9x01234567);
5490 i9 += 8;
5491
5492
5493 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi9x01234567, vxk9x01234567));
5494
5495 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5496 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5497
5498 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
5499 const __m128i vxi10x01234567 = _mm_cvtepi8_epi16(vi10x01234567);
5500 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) (k + 160));
5501 const __m128i vxk10x01234567 = _mm_cvtepi8_epi16(vk10x01234567);
5502 i10 += 8;
5503
5504
5505 vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
5506
5507
5508 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
5509 const __m128i vxi11x01234567 = _mm_cvtepi8_epi16(vi11x01234567);
5510 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) (k + 176));
5511 const __m128i vxk11x01234567 = _mm_cvtepi8_epi16(vk11x01234567);
5512 i11 += 8;
5513
5514
5515 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi11x01234567, vxk11x01234567));
5516
5517 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5518 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5519
5520 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
5521 const __m128i vxi12x01234567 = _mm_cvtepi8_epi16(vi12x01234567);
5522 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) (k + 192));
5523 const __m128i vxk12x01234567 = _mm_cvtepi8_epi16(vk12x01234567);
5524 i12 += 8;
5525
5526
5527 vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
5528
5529
5530 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
5531 const __m128i vxi13x01234567 = _mm_cvtepi8_epi16(vi13x01234567);
5532 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) (k + 208));
5533 const __m128i vxk13x01234567 = _mm_cvtepi8_epi16(vk13x01234567);
5534 i13 += 8;
5535
5536
5537 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi13x01234567, vxk13x01234567));
5538
5539 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5540 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5541
5542 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
5543 const __m128i vxi14x01234567 = _mm_cvtepi8_epi16(vi14x01234567);
5544 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) (k + 224));
5545 const __m128i vxk14x01234567 = _mm_cvtepi8_epi16(vk14x01234567);
5546 i14 += 8;
5547
5548
5549 vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
5550
5551
5552 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
5553 const __m128i vxi15x01234567 = _mm_cvtepi8_epi16(vi15x01234567);
5554 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) (k + 240));
5555 const __m128i vxk15x01234567 = _mm_cvtepi8_epi16(vk15x01234567);
5556 i15 += 8;
5557
5558
5559 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi15x01234567, vxk15x01234567));
5560
5561 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5562 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5563
5564 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
5565 const __m128i vxi16x01234567 = _mm_cvtepi8_epi16(vi16x01234567);
5566 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) (k + 256));
5567 const __m128i vxk16x01234567 = _mm_cvtepi8_epi16(vk16x01234567);
5568 i16 += 8;
5569
5570
5571 vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
5572
5573
5574 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
5575 const __m128i vxi17x01234567 = _mm_cvtepi8_epi16(vi17x01234567);
5576 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) (k + 272));
5577 const __m128i vxk17x01234567 = _mm_cvtepi8_epi16(vk17x01234567);
5578 i17 += 8;
5579
5580
5581 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi17x01234567, vxk17x01234567));
5582
5583 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5584 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5585
5586 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
5587 const __m128i vxi18x01234567 = _mm_cvtepi8_epi16(vi18x01234567);
5588 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) (k + 288));
5589 const __m128i vxk18x01234567 = _mm_cvtepi8_epi16(vk18x01234567);
5590 i18 += 8;
5591
5592
5593 vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
5594
5595
5596 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
5597 const __m128i vxi19x01234567 = _mm_cvtepi8_epi16(vi19x01234567);
5598 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) (k + 304));
5599 const __m128i vxk19x01234567 = _mm_cvtepi8_epi16(vk19x01234567);
5600 i19 += 8;
5601
5602
5603 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi19x01234567, vxk19x01234567));
5604
5605 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5606 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5607
5608 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
5609 const __m128i vxi20x01234567 = _mm_cvtepi8_epi16(vi20x01234567);
5610 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) (k + 320));
5611 const __m128i vxk20x01234567 = _mm_cvtepi8_epi16(vk20x01234567);
5612 i20 += 8;
5613
5614
5615 vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
5616
5617
5618 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
5619 const __m128i vxi21x01234567 = _mm_cvtepi8_epi16(vi21x01234567);
5620 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) (k + 336));
5621 const __m128i vxk21x01234567 = _mm_cvtepi8_epi16(vk21x01234567);
5622 i21 += 8;
5623
5624
5625 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi21x01234567, vxk21x01234567));
5626
5627 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5628 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5629
5630 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
5631 const __m128i vxi22x01234567 = _mm_cvtepi8_epi16(vi22x01234567);
5632 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) (k + 352));
5633 const __m128i vxk22x01234567 = _mm_cvtepi8_epi16(vk22x01234567);
5634 i22 += 8;
5635
5636
5637 vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
5638
5639
5640 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
5641 const __m128i vxi23x01234567 = _mm_cvtepi8_epi16(vi23x01234567);
5642 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) (k + 368));
5643 const __m128i vxk23x01234567 = _mm_cvtepi8_epi16(vk23x01234567);
5644 i23 += 8;
5645
5646
5647 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi23x01234567, vxk23x01234567));
5648
5649 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5650 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5651
5652 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
5653 const __m128i vxi24x01234567 = _mm_cvtepi8_epi16(vi24x01234567);
5654 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) (k + 384));
5655 const __m128i vxk24x01234567 = _mm_cvtepi8_epi16(vk24x01234567);
5656 i24 += 8;
5657
5658
5659 vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
5660
5661 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5662 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5663
5664 k += 8;
5665
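      // FP32 requantization for the remainder channels: scale the int32 accumulators by the per-channel
      // factors stored right after the 25*16 = 400 packed kernel bytes, then clamp and round back to int32.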
5666 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
5667 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
5668
5669 const __m128 vscale0123 = _mm_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t)));
5670 const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t) + 4 * sizeof(float)));
5671 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
5672 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
5673
5674 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
5675 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
5676 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
5677
5678 vacc0123 = _mm_cvtps_epi32(vscaled0123);
5679 vacc4567 = _mm_cvtps_epi32(vscaled4567);
5680
5681 w = (const void*) ((const int32_t*) w + 8);
5682
5683 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
5684 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5685
5686
5687 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5688
5689 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
5690
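      // Store 8 outputs while at least 8 channels remain; the final 1-7 channels are written in 4-, 2- and 1-byte pieces.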
5691 if XNN_LIKELY(c >= 8) {
5692 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5693 output += 8;
5694 c -= 8;
5695 } else {
5696 if (c & 4) {
5697 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
5698 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5699 output += 4;
5700 }
5701 if (c & 2) {
5702 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
5703 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5704 output += 2;
5705 }
5706 if (c & 1) {
5707 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
5708 output += 1;
5709 }
5710 c = 0;
5711 }
5712 } while (c != 0);
5713 }
5714
5715 output = (int8_t*) ((uintptr_t) output + output_increment);
5716 } while (--output_width != 0);
5717 }
5718
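// QC8 depthwise convolution, 3 taps, 16-channel tile, AVX (128-bit) "mul16_add16" variant:
// inputs and weights are sign-extended to 16 bits, multiplied with _mm_mullo_epi16, and the
// products of two adjacent taps are summed in 16 bits before being widened into the 32-bit accumulators.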
5719 void xnn_qc8_dwconv_minmax_fp32_ukernel_up16x3__avx_mul16_add16(
5720 size_t channels,
5721 size_t output_width,
5722 const int8_t** input,
5723 const void* weights,
5724 int8_t* output,
5725 size_t input_stride,
5726 size_t output_increment,
5727 size_t input_offset,
5728 const int8_t* zero,
5729 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5730 {
5731 assert(channels != 0);
5732 assert(output_width != 0);
5733
5734 do {
5735 const int8_t* i0 = input[0];
5736 assert(i0 != NULL);
5737 if XNN_UNPREDICTABLE(i0 != zero) {
5738 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
5739 }
5740 const int8_t* i1 = input[1];
5741 assert(i1 != NULL);
5742 if XNN_UNPREDICTABLE(i1 != zero) {
5743 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
5744 }
5745 const int8_t* i2 = input[2];
5746 assert(i2 != NULL);
5747 if XNN_UNPREDICTABLE(i2 != zero) {
5748 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
5749 }
5750 input = (const int8_t**) ((uintptr_t) input + input_stride);
5751
5752 size_t c = channels;
5753 const void* w = weights;
5754 for (; c >= 16; c -= 16) {
5755 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
5756 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
5757 __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
5758 __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
5759
5760
5761 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
5762 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
5763 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t)));
5764 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
5765 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
5766 const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(vi0x89ABCDEF);
5767 const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t)));
5768 const __m128i vxk0x89ABCDEF = _mm_cvtepi8_epi16(vk0x89ABCDEF);
5769 i0 += 16;
5770
5771
5772 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
5773 __m128i vprod89ABCDEF = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
5774
5775
5776 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
5777 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
5778 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t)));
5779 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
5780 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
5781 const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(vi1x89ABCDEF);
5782 const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t)));
5783 const __m128i vxk1x89ABCDEF = _mm_cvtepi8_epi16(vk1x89ABCDEF);
5784 i1 += 16;
5785
5786
5787 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
5788 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF));
5789
5790 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5791 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5792 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5793 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5794
5795 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
5796 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
5797 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t)));
5798 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
5799 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
5800 const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(vi2x89ABCDEF);
5801 const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t)));
5802 const __m128i vxk2x89ABCDEF = _mm_cvtepi8_epi16(vk2x89ABCDEF);
5803 i2 += 16;
5804
5805
5806 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
5807 vprod89ABCDEF = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
5808
5809 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5810 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5811 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
5812 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
5813
5814 w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t));
5815
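      // FP32 requantization: convert the accumulators to float, multiply by the per-channel scales that
      // follow the packed weights, clamp to the output maximum, and round back to int32.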
5816 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
5817 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
5818 __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
5819 __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
5820
5821 const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
5822 const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
5823 const __m128 vscale89AB = _mm_loadu_ps((const float*) w + 8);
5824 const __m128 vscaleCDEF = _mm_loadu_ps((const float*) w + 12);
5825 w = (const void*) ((const float*) w + 16);
5826 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
5827 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
5828 vscaled89AB = _mm_mul_ps(vscaled89AB, vscale89AB);
5829 vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscaleCDEF);
5830
5831 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
5832 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
5833 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
5834 vscaled89AB = _mm_min_ps(vscaled89AB, voutput_max_less_zero_point);
5835 vscaledCDEF = _mm_min_ps(vscaledCDEF, voutput_max_less_zero_point);
5836
5837 vacc0123 = _mm_cvtps_epi32(vscaled0123);
5838 vacc4567 = _mm_cvtps_epi32(vscaled4567);
5839 vacc89AB = _mm_cvtps_epi32(vscaled89AB);
5840 vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
5841
5842 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
5843 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5844 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
5845
5846
5847 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
5848
5849 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
5850 vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
5851
5852 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
5853 output += 16;
5854 }
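    // Process the remaining 1-15 channels in groups of up to 8, reading the kernel taps through the k pointer.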
5855 if XNN_UNLIKELY(c != 0) {
5856 const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
5857 do {
5858 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
5859 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
5860
5861
5862 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
5863 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
5864 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k);
5865 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
5866 i0 += 8;
5867
5868
5869 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
5870
5871
5872 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
5873 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
5874 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 16));
5875 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
5876 i1 += 8;
5877
5878
5879 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
5880
5881 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5882 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5883
5884 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
5885 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
5886 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 32));
5887 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
5888 i2 += 8;
5889
5890
5891 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
5892
5893 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
5894 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
5895
5896 k += 8;
5897
5898 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
5899 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
5900
5901 const __m128 vscale0123 = _mm_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t)));
5902 const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t) + 4 * sizeof(float)));
5903 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
5904 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
5905
5906 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
5907 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
5908 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
5909
5910 vacc0123 = _mm_cvtps_epi32(vscaled0123);
5911 vacc4567 = _mm_cvtps_epi32(vscaled4567);
5912
5913 w = (const void*) ((const int32_t*) w + 8);
5914
5915 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
5916 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
5917
5918
5919 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
5920
5921 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
5922
5923 if XNN_LIKELY(c >= 8) {
5924 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
5925 output += 8;
5926 c -= 8;
5927 } else {
5928 if (c & 4) {
5929 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
5930 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
5931 output += 4;
5932 }
5933 if (c & 2) {
5934 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
5935 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
5936 output += 2;
5937 }
5938 if (c & 1) {
5939 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
5940 output += 1;
5941 }
5942 c = 0;
5943 }
5944 } while (c != 0);
5945 }
5946
5947 output = (int8_t*) ((uintptr_t) output + output_increment);
5948 } while (--output_width != 0);
5949 }
5950
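// QC8 depthwise convolution, 9 taps, 16-channel tile; same mul16_add16 accumulation scheme as the 3-tap kernel above.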
5951 void xnn_qc8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16(
5952 size_t channels,
5953 size_t output_width,
5954 const int8_t** input,
5955 const void* weights,
5956 int8_t* output,
5957 size_t input_stride,
5958 size_t output_increment,
5959 size_t input_offset,
5960 const int8_t* zero,
5961 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
5962 {
5963 assert(channels != 0);
5964 assert(output_width != 0);
5965
5966 do {
5967 const int8_t* i0 = input[0];
5968 assert(i0 != NULL);
5969 if XNN_UNPREDICTABLE(i0 != zero) {
5970 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
5971 }
5972 const int8_t* i1 = input[1];
5973 assert(i1 != NULL);
5974 if XNN_UNPREDICTABLE(i1 != zero) {
5975 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
5976 }
5977 const int8_t* i2 = input[2];
5978 assert(i2 != NULL);
5979 if XNN_UNPREDICTABLE(i2 != zero) {
5980 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
5981 }
5982 const int8_t* i3 = input[3];
5983 assert(i3 != NULL);
5984 if XNN_UNPREDICTABLE(i3 != zero) {
5985 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
5986 }
5987 const int8_t* i4 = input[4];
5988 assert(i4 != NULL);
5989 if XNN_UNPREDICTABLE(i4 != zero) {
5990 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
5991 }
5992 const int8_t* i5 = input[5];
5993 assert(i5 != NULL);
5994 if XNN_UNPREDICTABLE(i5 != zero) {
5995 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
5996 }
5997 const int8_t* i6 = input[6];
5998 assert(i6 != NULL);
5999 if XNN_UNPREDICTABLE(i6 != zero) {
6000 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
6001 }
6002 const int8_t* i7 = input[7];
6003 assert(i7 != NULL);
6004 if XNN_UNPREDICTABLE(i7 != zero) {
6005 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
6006 }
6007 const int8_t* i8 = input[8];
6008 assert(i8 != NULL);
6009 if XNN_UNPREDICTABLE(i8 != zero) {
6010 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
6011 }
6012 input = (const int8_t**) ((uintptr_t) input + input_stride);
6013
6014 size_t c = channels;
6015 const void* w = weights;
6016 for (; c >= 16; c -= 16) {
6017 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
6018 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
6019 __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
6020 __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
6021
6022
6023 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
6024 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
6025 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t)));
6026 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
6027 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
6028 const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(vi0x89ABCDEF);
6029 const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t)));
6030 const __m128i vxk0x89ABCDEF = _mm_cvtepi8_epi16(vk0x89ABCDEF);
6031 i0 += 16;
6032
6033
6034 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
6035 __m128i vprod89ABCDEF = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
6036
6037
6038 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
6039 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
6040 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t)));
6041 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
6042 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
6043 const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(vi1x89ABCDEF);
6044 const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t)));
6045 const __m128i vxk1x89ABCDEF = _mm_cvtepi8_epi16(vk1x89ABCDEF);
6046 i1 += 16;
6047
6048
6049 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
6050 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF));
6051
6052 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6053 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6054 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
6055 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
6056
6057 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
6058 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
6059 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t)));
6060 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
6061 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
6062 const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(vi2x89ABCDEF);
6063 const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t)));
6064 const __m128i vxk2x89ABCDEF = _mm_cvtepi8_epi16(vk2x89ABCDEF);
6065 i2 += 16;
6066
6067
6068 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
6069 vprod89ABCDEF = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
6070
6071
6072 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
6073 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
6074 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t)));
6075 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
6076 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
6077 const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(vi3x89ABCDEF);
6078 const __m128i vk3x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t)));
6079 const __m128i vxk3x89ABCDEF = _mm_cvtepi8_epi16(vk3x89ABCDEF);
6080 i3 += 16;
6081
6082
6083 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
6084 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF));
6085
6086 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6087 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6088 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
6089 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
6090
6091 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
6092 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
6093 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t)));
6094 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
6095 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
6096 const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(vi4x89ABCDEF);
6097 const __m128i vk4x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t)));
6098 const __m128i vxk4x89ABCDEF = _mm_cvtepi8_epi16(vk4x89ABCDEF);
6099 i4 += 16;
6100
6101
6102 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
6103 vprod89ABCDEF = _mm_mullo_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF);
6104
6105
6106 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
6107 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
6108 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t)));
6109 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
6110 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
6111 const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(vi5x89ABCDEF);
6112 const __m128i vk5x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t)));
6113 const __m128i vxk5x89ABCDEF = _mm_cvtepi8_epi16(vk5x89ABCDEF);
6114 i5 += 16;
6115
6116
6117 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
6118 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF));
6119
6120 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6121 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6122 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
6123 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
6124
6125 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
6126 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
6127 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t)));
6128 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
6129 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
6130 const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(vi6x89ABCDEF);
6131 const __m128i vk6x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t)));
6132 const __m128i vxk6x89ABCDEF = _mm_cvtepi8_epi16(vk6x89ABCDEF);
6133 i6 += 16;
6134
6135
6136 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
6137 vprod89ABCDEF = _mm_mullo_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF);
6138
6139
6140 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
6141 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
6142 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t)));
6143 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
6144 const __m128i vi7x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i7 + 8));
6145 const __m128i vxi7x89ABCDEF = _mm_cvtepi8_epi16(vi7x89ABCDEF);
6146 const __m128i vk7x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t)));
6147 const __m128i vxk7x89ABCDEF = _mm_cvtepi8_epi16(vk7x89ABCDEF);
6148 i7 += 16;
6149
6150
6151 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
6152 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF));
6153
6154 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6155 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6156 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
6157 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
6158
6159 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
6160 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
6161 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t)));
6162 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
6163 const __m128i vi8x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i8 + 8));
6164 const __m128i vxi8x89ABCDEF = _mm_cvtepi8_epi16(vi8x89ABCDEF);
6165 const __m128i vk8x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t)));
6166 const __m128i vxk8x89ABCDEF = _mm_cvtepi8_epi16(vk8x89ABCDEF);
6167 i8 += 16;
6168
6169
6170 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
6171 vprod89ABCDEF = _mm_mullo_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF);
6172
6173 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6174 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6175 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
6176 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
6177
6178 w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
6179
6180 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
6181 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
6182 __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
6183 __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
6184
6185 const __m128 vscale0123 = _mm_loadu_ps((const float*) w);
6186 const __m128 vscale4567 = _mm_loadu_ps((const float*) w + 4);
6187 const __m128 vscale89AB = _mm_loadu_ps((const float*) w + 8);
6188 const __m128 vscaleCDEF = _mm_loadu_ps((const float*) w + 12);
6189 w = (const void*) ((const float*) w + 16);
6190 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
6191 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
6192 vscaled89AB = _mm_mul_ps(vscaled89AB, vscale89AB);
6193 vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscaleCDEF);
6194
6195 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
6196 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
6197 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
6198 vscaled89AB = _mm_min_ps(vscaled89AB, voutput_max_less_zero_point);
6199 vscaledCDEF = _mm_min_ps(vscaledCDEF, voutput_max_less_zero_point);
6200
6201 vacc0123 = _mm_cvtps_epi32(vscaled0123);
6202 vacc4567 = _mm_cvtps_epi32(vscaled4567);
6203 vacc89AB = _mm_cvtps_epi32(vscaled89AB);
6204 vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
6205
6206 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
6207 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
6208 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
6209
6210
6211 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
6212
6213 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
6214 vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
6215
6216 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
6217 output += 16;
6218 }
6219 if XNN_UNLIKELY(c != 0) {
6220 const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
6221 do {
6222 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
6223 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
6224
6225
6226 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
6227 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
6228 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k);
6229 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
6230 i0 += 8;
6231
6232
6233 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
6234
6235
6236 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
6237 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
6238 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 16));
6239 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
6240 i1 += 8;
6241
6242
6243 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
6244
6245 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6246 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6247
6248 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
6249 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
6250 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 32));
6251 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
6252 i2 += 8;
6253
6254
6255 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
6256
6257
6258 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
6259 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
6260 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) (k + 48));
6261 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
6262 i3 += 8;
6263
6264
6265 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
6266
6267 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6268 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6269
6270 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
6271 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
6272 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) (k + 64));
6273 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
6274 i4 += 8;
6275
6276
6277 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
6278
6279
6280 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
6281 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
6282 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) (k + 80));
6283 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
6284 i5 += 8;
6285
6286
6287 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
6288
6289 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6290 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6291
6292 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
6293 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
6294 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) (k + 96));
6295 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
6296 i6 += 8;
6297
6298
6299 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
6300
6301
6302 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
6303 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
6304 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) (k + 112));
6305 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
6306 i7 += 8;
6307
6308
6309 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
6310
6311 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6312 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6313
6314 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
6315 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
6316 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) (k + 128));
6317 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
6318 i8 += 8;
6319
6320
6321 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
6322
6323 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
6324 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
6325
6326 k += 8;
6327
6328 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
6329 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
6330
6331 const __m128 vscale0123 = _mm_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t)));
6332 const __m128 vscale4567 = _mm_loadu_ps((const float*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t) + 4 * sizeof(float)));
6333 vscaled0123 = _mm_mul_ps(vscaled0123, vscale0123);
6334 vscaled4567 = _mm_mul_ps(vscaled4567, vscale4567);
6335
6336 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
6337 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
6338 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
6339
6340 vacc0123 = _mm_cvtps_epi32(vscaled0123);
6341 vacc4567 = _mm_cvtps_epi32(vscaled4567);
6342
6343 w = (const void*) ((const int32_t*) w + 8);
6344
6345 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
6346 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
6347
6348
6349 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
6350
6351 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
6352
6353 if XNN_LIKELY(c >= 8) {
6354 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
6355 output += 8;
6356 c -= 8;
6357 } else {
6358 if (c & 4) {
6359 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
6360 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
6361 output += 4;
6362 }
6363 if (c & 2) {
6364 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
6365 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
6366 output += 2;
6367 }
6368 if (c & 1) {
6369 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
6370 output += 1;
6371 }
6372 c = 0;
6373 }
6374 } while (c != 0);
6375 }
6376
6377 output = (int8_t*) ((uintptr_t) output + output_increment);
6378 } while (--output_width != 0);
6379 }
6380
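// QC8 GEMM, 1x4 output tile, K unrolled by 8 ("c8"): each _mm_madd_epi16 produces four partial dot products
// for one output column, which are reduced with _mm_hadd_epi32 after the K loop.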
6381 void xnn_qc8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
6382 size_t mr,
6383 size_t nc,
6384 size_t kc,
6385 const int8_t* restrict a,
6386 size_t a_stride,
6387 const void* restrict w,
6388 int8_t* restrict c,
6389 size_t cm_stride,
6390 size_t cn_stride,
6391 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6392 {
6393 assert(mr != 0);
6394 assert(mr <= 1);
6395 assert(nc != 0);
6396 assert(kc != 0);
6397 assert(kc % sizeof(int8_t) == 0);
6398 assert(a != NULL);
6399 assert(w != NULL);
6400 assert(c != NULL);
6401
6402 kc = round_up_po2(kc, 8);
6403 const int8_t* a0 = a;
6404 int8_t* c0 = c;
6405
6406 do {
6407 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
6408 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
6409 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
6410 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
6411 w = (const int32_t*) w + 4;
6412
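    // Accumulate along K in blocks of 8 int8 values; vb01 and vb23 each hold two packed weight columns,
    // sign-extended to 16 bits before _mm_madd_epi16.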
6413 size_t k = 0;
6414 while (k < kc) {
6415 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
6416 const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
6417 a0 += 8;
6418
6419 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
6420 const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
6421 const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
6422
6423 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
6424 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
6425 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
6426 const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
6427 const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
6428
6429 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
6430 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
6431
6432 w = (const void*) ((const int8_t*) w + 32);
6433 k += 8 * sizeof(int8_t);
6434 }
6435
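    // Reduce the per-column partial sums: two rounds of _mm_hadd_epi32 leave one int32 per output column in vacc0x0123.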
6436 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
6437 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
6438
6439 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
6440
6441 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
6442
6443 const __m128 vscale0123 = _mm_load_ps((const float*) w);
6444 w = (const void*) ((const float*) w + 4);
6445 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
6446
6447 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
6448 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
6449
6450 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
6451
6452 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
6453 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
6454
6455
6456 __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
6457
6458 vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
6459
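    // Write a full tile of 4 int8 outputs when possible; otherwise store the last 1-3 columns in 2- and 1-byte pieces.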
6460 if (nc >= 4) {
6461 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
6462
6463 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
6464
6465 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
6466
6467 nc -= 4;
6468 } else {
6469 if (nc & 2) {
6470 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
6471 c0 += 2;
6472 vout = _mm_srli_epi32(vout, 16);
6473 }
6474 if (nc & 1) {
6475 *c0 = (int8_t) _mm_extract_epi8(vout, 0);
6476 }
6477
6478 nc = 0;
6479 }
6480 } while (nc != 0);
6481 }
6482
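// QC8 GEMM, 2x4 output tile; when mr != 2 the second row aliases the first, so the same code path handles mr == 1.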
6483 void xnn_qc8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
6484 size_t mr,
6485 size_t nc,
6486 size_t kc,
6487 const int8_t* restrict a,
6488 size_t a_stride,
6489 const void* restrict w,
6490 int8_t* restrict c,
6491 size_t cm_stride,
6492 size_t cn_stride,
6493 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6494 {
6495 assert(mr != 0);
6496 assert(mr <= 2);
6497 assert(nc != 0);
6498 assert(kc != 0);
6499 assert(kc % sizeof(int8_t) == 0);
6500 assert(a != NULL);
6501 assert(w != NULL);
6502 assert(c != NULL);
6503
6504 kc = round_up_po2(kc, 8);
6505 const int8_t* a0 = a;
6506 int8_t* c0 = c;
6507 const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
6508 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
6509 if XNN_UNPREDICTABLE(mr != 2) {
6510 a1 = a0;
6511 c1 = c0;
6512 }
6513
6514 do {
6515 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
6516 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
6517 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
6518 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
6519 __m128i vacc1x0 = vacc0x0;
6520 __m128i vacc1x1 = vacc0x1;
6521 __m128i vacc1x2 = vacc0x2;
6522 __m128i vacc1x3 = vacc0x3;
6523 w = (const int32_t*) w + 4;
6524
6525 size_t k = 0;
6526 while (k < kc) {
6527 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
6528 const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
6529 a0 += 8;
6530 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
6531 const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
6532 a1 += 8;
6533
6534 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
6535 const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
6536 const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
6537
6538 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
6539 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
6540 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
6541 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
6542 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
6543 const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
6544 const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
6545
6546 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
6547 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
6548 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
6549 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
6550
6551 w = (const void*) ((const int8_t*) w + 32);
6552 k += 8 * sizeof(int8_t);
6553 }
6554
6555 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
6556 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
6557 const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
6558 const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
6559
6560 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
6561 __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
6562
6563 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
6564 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
6565
6566 const __m128 vscale0123 = _mm_load_ps((const float*) w);
6567 w = (const void*) ((const float*) w + 4);
6568 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
6569 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
6570
6571 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
6572 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
6573 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
6574
6575 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
6576 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
6577
6578 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
6579 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
6580
6581
6582 __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
6583
6584 vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
6585
6586 if (nc >= 4) {
6587 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
6588 unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
6589
6590 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
6591 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
6592
6593 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
6594 a1 = (const int8_t*) ((uintptr_t) a1 - kc);
6595
6596 nc -= 4;
6597 } else {
6598 if (nc & 2) {
6599 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
6600 c0 += 2;
6601 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
6602 c1 += 2;
6603 vout = _mm_srli_epi32(vout, 16);
6604 }
6605 if (nc & 1) {
6606 *c0 = (int8_t) _mm_extract_epi8(vout, 0);
6607 *c1 = (int8_t) _mm_extract_epi8(vout, 4);
6608 }
6609
6610 nc = 0;
6611 }
6612 } while (nc != 0);
6613 }
6614
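// QC8 IGEMM (indirect GEMM), 1x4 output tile: A is addressed through an indirection buffer, and pointers equal
// to `zero` select the padding row without applying a_offset.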
6615 void xnn_qc8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
6616 size_t mr,
6617 size_t nc,
6618 size_t kc,
6619 size_t ks,
6620 const int8_t** restrict a,
6621 const void* restrict w,
6622 int8_t* restrict c,
6623 size_t cm_stride,
6624 size_t cn_stride,
6625 size_t a_offset,
6626 const int8_t* zero,
6627 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6628 {
6629 assert(mr != 0);
6630 assert(mr <= 1);
6631 assert(nc != 0);
6632 assert(kc != 0);
6633 assert(ks != 0);
6634 assert(ks % (1 * sizeof(void*)) == 0);
6635 assert(a_offset % sizeof(int8_t) == 0);
6636 assert(a != NULL);
6637 assert(w != NULL);
6638 assert(c != NULL);
6639
6640 kc = round_up_po2(kc, 8);
6641 int8_t* c0 = c;
6642
6643 do {
6644 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
6645 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
6646 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
6647 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
6648 w = (const int32_t*) w + 4;
6649
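    // Walk the ks indirection pointers for this output pixel; each one contributes kc (rounded up to 8) K values.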
6650 size_t p = ks;
6651 do {
6652 const int8_t* restrict a0 = a[0];
6653 if XNN_UNPREDICTABLE(a0 != zero) {
6654 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
6655 }
6656 a += 1;
6657
6658 size_t k = 0;
6659 while (k < kc) {
6660 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
6661 const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
6662 a0 += 8;
6663
6664 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
6665 const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
6666 const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
6667
6668 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
6669 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
6670 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
6671 const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
6672 const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
6673
6674 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
6675 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
6676
6677 w = (const void*) ((const int8_t*) w + 32);
6678 k += 8 * sizeof(int8_t);
6679 }
6680 p -= 1 * sizeof(void*);
6681 } while (p != 0);
6682
6683 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
6684 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
6685
6686 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
6687
6688 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
6689
6690 const __m128 vscale0123 = _mm_load_ps((const float*) w);
6691 w = (const void*) ((const float*) w + 4);
6692 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
6693
6694 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
6695 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
6696
6697 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
6698
6699 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
6700 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
6701
6702
6703 __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
6704
6705 vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
6706
6707 if (nc >= 4) {
6708 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
6709 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
6710
6711 a = (const int8_t**restrict) ((uintptr_t) a - ks);
6712
6713 nc -= 4;
6714 } else {
6715 if (nc & 2) {
6716 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
6717 c0 += 2;
6718 vout = _mm_srli_epi32(vout, 16);
6719 }
6720 if (nc & 1) {
6721 *c0 = (int8_t) _mm_extract_epi8(vout, 0);
6722 }
6723
6724 nc = 0;
6725 }
6726 } while (nc != 0);
6727 }
6728
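// QC8 IGEMM, 2x4 output tile; output row 1 aliases row 0 when mr != 2, mirroring the GEMM variant above.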
6729 void xnn_qc8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
6730 size_t mr,
6731 size_t nc,
6732 size_t kc,
6733 size_t ks,
6734 const int8_t** restrict a,
6735 const void* restrict w,
6736 int8_t* restrict c,
6737 size_t cm_stride,
6738 size_t cn_stride,
6739 size_t a_offset,
6740 const int8_t* zero,
6741 const union xnn_qc8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6742 {
6743 assert(mr != 0);
6744 assert(mr <= 2);
6745 assert(nc != 0);
6746 assert(kc != 0);
6747 assert(ks != 0);
6748 assert(ks % (2 * sizeof(void*)) == 0);
6749 assert(a_offset % sizeof(int8_t) == 0);
6750 assert(a != NULL);
6751 assert(w != NULL);
6752 assert(c != NULL);
6753
6754 kc = round_up_po2(kc, 8);
6755 int8_t* c0 = c;
6756 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
6757 if XNN_UNPREDICTABLE(mr != 2) {
6758 c1 = c0;
6759 }
6760
6761 do {
6762 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
6763 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
6764 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
6765 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
6766 __m128i vacc1x0 = vacc0x0;
6767 __m128i vacc1x1 = vacc0x1;
6768 __m128i vacc1x2 = vacc0x2;
6769 __m128i vacc1x3 = vacc0x3;
6770 w = (const int32_t*) w + 4;
6771
6772 size_t p = ks;
6773 do {
6774 const int8_t* restrict a0 = a[0];
6775 if XNN_UNPREDICTABLE(a0 != zero) {
6776 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
6777 }
6778 const int8_t* restrict a1 = a[1];
6779 if XNN_UNPREDICTABLE(a1 != zero) {
6780 a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
6781 }
6782 a += 2;
6783
6784 size_t k = 0;
6785 while (k < kc) {
6786 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
6787 const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
6788 a0 += 8;
6789 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
6790 const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
6791 a1 += 8;
6792
6793 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
6794 const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
6795 const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
6796
6797 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
6798 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
6799 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
6800 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
6801 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
6802 const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
6803 const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
6804
6805 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
6806 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
6807 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
6808 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
6809
6810 w = (const void*) ((const int8_t*) w + 32);
6811 k += 8 * sizeof(int8_t);
6812 }
6813 p -= 2 * sizeof(void*);
6814 } while (p != 0);
6815
6816 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
6817 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
6818 const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
6819 const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
6820
6821 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
6822 __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
6823
6824 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
6825 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
6826
6827 const __m128 vscale0123 = _mm_load_ps((const float*) w);
6828 w = (const void*) ((const float*) w + 4);
6829 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale0123);
6830 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale0123);
6831
6832 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
6833 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
6834 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
6835
6836 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
6837 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
6838
6839 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
6840 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
6841
6842
6843 __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
6844
6845 vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
6846
6847 if (nc >= 4) {
6848 unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
6849 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
6850 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
6851 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
6852
6853 a = (const int8_t**restrict) ((uintptr_t) a - ks);
6854
6855 nc -= 4;
6856 } else {
6857 if (nc & 2) {
6858 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
6859 c1 += 2;
6860 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
6861 c0 += 2;
6862 vout = _mm_srli_epi32(vout, 16);
6863 }
6864 if (nc & 1) {
6865 *c1 = (int8_t) _mm_extract_epi8(vout, 4);
6866 *c0 = (int8_t) _mm_extract_epi8(vout, 0);
6867 }
6868
6869 nc = 0;
6870 }
6871 } while (nc != 0);
6872 }
6873
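// QS8 depthwise convolution, 25 taps, 16-channel tile; same mul16_add16 scheme as the QC8 kernels above,
// but with per-tensor quantization (QS8) rather than the per-channel scales of the QC8 kernels.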
6874 void xnn_qs8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16_add16(
6875 size_t channels,
6876 size_t output_width,
6877 const int8_t** input,
6878 const void* weights,
6879 int8_t* output,
6880 size_t input_stride,
6881 size_t output_increment,
6882 size_t input_offset,
6883 const int8_t* zero,
6884 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
6885 {
6886 assert(channels != 0);
6887 assert(output_width != 0);
6888
6889 do {
6890 const int8_t* i0 = input[0];
6891 assert(i0 != NULL);
6892 if XNN_UNPREDICTABLE(i0 != zero) {
6893 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
6894 }
6895 const int8_t* i1 = input[1];
6896 assert(i1 != NULL);
6897 if XNN_UNPREDICTABLE(i1 != zero) {
6898 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
6899 }
6900 const int8_t* i2 = input[2];
6901 assert(i2 != NULL);
6902 if XNN_UNPREDICTABLE(i2 != zero) {
6903 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
6904 }
6905 const int8_t* i3 = input[3];
6906 assert(i3 != NULL);
6907 if XNN_UNPREDICTABLE(i3 != zero) {
6908 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
6909 }
6910 const int8_t* i4 = input[4];
6911 assert(i4 != NULL);
6912 if XNN_UNPREDICTABLE(i4 != zero) {
6913 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
6914 }
6915 const int8_t* i5 = input[5];
6916 assert(i5 != NULL);
6917 if XNN_UNPREDICTABLE(i5 != zero) {
6918 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
6919 }
6920 const int8_t* i6 = input[6];
6921 assert(i6 != NULL);
6922 if XNN_UNPREDICTABLE(i6 != zero) {
6923 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
6924 }
6925 const int8_t* i7 = input[7];
6926 assert(i7 != NULL);
6927 if XNN_UNPREDICTABLE(i7 != zero) {
6928 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
6929 }
6930 const int8_t* i8 = input[8];
6931 assert(i8 != NULL);
6932 if XNN_UNPREDICTABLE(i8 != zero) {
6933 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
6934 }
6935 const int8_t* i9 = input[9];
6936 assert(i9 != NULL);
6937 if XNN_UNPREDICTABLE(i9 != zero) {
6938 i9 = (const int8_t*) ((uintptr_t) i9 + input_offset);
6939 }
6940 const int8_t* i10 = input[10];
6941 assert(i10 != NULL);
6942 if XNN_UNPREDICTABLE(i10 != zero) {
6943 i10 = (const int8_t*) ((uintptr_t) i10 + input_offset);
6944 }
6945 const int8_t* i11 = input[11];
6946 assert(i11 != NULL);
6947 if XNN_UNPREDICTABLE(i11 != zero) {
6948 i11 = (const int8_t*) ((uintptr_t) i11 + input_offset);
6949 }
6950 const int8_t* i12 = input[12];
6951 assert(i12 != NULL);
6952 if XNN_UNPREDICTABLE(i12 != zero) {
6953 i12 = (const int8_t*) ((uintptr_t) i12 + input_offset);
6954 }
6955 const int8_t* i13 = input[13];
6956 assert(i13 != NULL);
6957 if XNN_UNPREDICTABLE(i13 != zero) {
6958 i13 = (const int8_t*) ((uintptr_t) i13 + input_offset);
6959 }
6960 const int8_t* i14 = input[14];
6961 assert(i14 != NULL);
6962 if XNN_UNPREDICTABLE(i14 != zero) {
6963 i14 = (const int8_t*) ((uintptr_t) i14 + input_offset);
6964 }
6965 const int8_t* i15 = input[15];
6966 assert(i15 != NULL);
6967 if XNN_UNPREDICTABLE(i15 != zero) {
6968 i15 = (const int8_t*) ((uintptr_t) i15 + input_offset);
6969 }
6970 const int8_t* i16 = input[16];
6971 assert(i16 != NULL);
6972 if XNN_UNPREDICTABLE(i16 != zero) {
6973 i16 = (const int8_t*) ((uintptr_t) i16 + input_offset);
6974 }
6975 const int8_t* i17 = input[17];
6976 assert(i17 != NULL);
6977 if XNN_UNPREDICTABLE(i17 != zero) {
6978 i17 = (const int8_t*) ((uintptr_t) i17 + input_offset);
6979 }
6980 const int8_t* i18 = input[18];
6981 assert(i18 != NULL);
6982 if XNN_UNPREDICTABLE(i18 != zero) {
6983 i18 = (const int8_t*) ((uintptr_t) i18 + input_offset);
6984 }
6985 const int8_t* i19 = input[19];
6986 assert(i19 != NULL);
6987 if XNN_UNPREDICTABLE(i19 != zero) {
6988 i19 = (const int8_t*) ((uintptr_t) i19 + input_offset);
6989 }
6990 const int8_t* i20 = input[20];
6991 assert(i20 != NULL);
6992 if XNN_UNPREDICTABLE(i20 != zero) {
6993 i20 = (const int8_t*) ((uintptr_t) i20 + input_offset);
6994 }
6995 const int8_t* i21 = input[21];
6996 assert(i21 != NULL);
6997 if XNN_UNPREDICTABLE(i21 != zero) {
6998 i21 = (const int8_t*) ((uintptr_t) i21 + input_offset);
6999 }
7000 const int8_t* i22 = input[22];
7001 assert(i22 != NULL);
7002 if XNN_UNPREDICTABLE(i22 != zero) {
7003 i22 = (const int8_t*) ((uintptr_t) i22 + input_offset);
7004 }
7005 const int8_t* i23 = input[23];
7006 assert(i23 != NULL);
7007 if XNN_UNPREDICTABLE(i23 != zero) {
7008 i23 = (const int8_t*) ((uintptr_t) i23 + input_offset);
7009 }
7010 const int8_t* i24 = input[24];
7011 assert(i24 != NULL);
7012 if XNN_UNPREDICTABLE(i24 != zero) {
7013 i24 = (const int8_t*) ((uintptr_t) i24 + input_offset);
7014 }
7015 input = (const int8_t**) ((uintptr_t) input + input_stride);
7016
7017 size_t c = channels;
7018 const void* w = weights;
7019 for (; c >= 16; c -= 16) {
7020 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
7021 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
7022 __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
7023 __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
7024
7025
7026 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
7027 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
7028 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t)));
7029 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
7030 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
7031 const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(vi0x89ABCDEF);
7032 const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t)));
7033 const __m128i vxk0x89ABCDEF = _mm_cvtepi8_epi16(vk0x89ABCDEF);
7034 i0 += 16;
7035
7036
7037 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
7038 __m128i vprod89ABCDEF = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
7039
7040
7041 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
7042 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
7043 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t)));
7044 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
7045 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
7046 const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(vi1x89ABCDEF);
7047 const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t)));
7048 const __m128i vxk1x89ABCDEF = _mm_cvtepi8_epi16(vk1x89ABCDEF);
7049 i1 += 16;
7050
7051
7052 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
7053 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF));
7054
7055 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7056 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7057 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7058 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
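      // Taps are consumed in pairs: two 16-bit products are summed in 16 bits, then the low half
      // is sign-extended with _mm_cvtepi16_epi32 and the high half with an unpack + arithmetic
      // shift before being added to the 32-bit accumulators. The same pattern repeats for every
      // pair of taps below.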
7059
7060 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
7061 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
7062 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t)));
7063 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
7064 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
7065 const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(vi2x89ABCDEF);
7066 const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t)));
7067 const __m128i vxk2x89ABCDEF = _mm_cvtepi8_epi16(vk2x89ABCDEF);
7068 i2 += 16;
7069
7070
7071 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
7072 vprod89ABCDEF = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
7073
7074
7075 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
7076 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
7077 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t)));
7078 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
7079 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
7080 const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(vi3x89ABCDEF);
7081 const __m128i vk3x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t)));
7082 const __m128i vxk3x89ABCDEF = _mm_cvtepi8_epi16(vk3x89ABCDEF);
7083 i3 += 16;
7084
7085
7086 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
7087 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF));
7088
7089 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7090 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7091 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7092 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7093
7094 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
7095 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
7096 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t)));
7097 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
7098 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
7099 const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(vi4x89ABCDEF);
7100 const __m128i vk4x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t)));
7101 const __m128i vxk4x89ABCDEF = _mm_cvtepi8_epi16(vk4x89ABCDEF);
7102 i4 += 16;
7103
7104
7105 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
7106 vprod89ABCDEF = _mm_mullo_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF);
7107
7108
7109 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
7110 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
7111 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t)));
7112 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
7113 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
7114 const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(vi5x89ABCDEF);
7115 const __m128i vk5x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t)));
7116 const __m128i vxk5x89ABCDEF = _mm_cvtepi8_epi16(vk5x89ABCDEF);
7117 i5 += 16;
7118
7119
7120 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
7121 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF));
7122
7123 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7124 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7125 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7126 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7127
7128 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
7129 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
7130 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t)));
7131 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
7132 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
7133 const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(vi6x89ABCDEF);
7134 const __m128i vk6x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t)));
7135 const __m128i vxk6x89ABCDEF = _mm_cvtepi8_epi16(vk6x89ABCDEF);
7136 i6 += 16;
7137
7138
7139 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
7140 vprod89ABCDEF = _mm_mullo_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF);
7141
7142
7143 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
7144 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
7145 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t)));
7146 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
7147 const __m128i vi7x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i7 + 8));
7148 const __m128i vxi7x89ABCDEF = _mm_cvtepi8_epi16(vi7x89ABCDEF);
7149 const __m128i vk7x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t)));
7150 const __m128i vxk7x89ABCDEF = _mm_cvtepi8_epi16(vk7x89ABCDEF);
7151 i7 += 16;
7152
7153
7154 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
7155 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF));
7156
7157 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7158 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7159 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7160 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7161
7162 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
7163 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
7164 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t)));
7165 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
7166 const __m128i vi8x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i8 + 8));
7167 const __m128i vxi8x89ABCDEF = _mm_cvtepi8_epi16(vi8x89ABCDEF);
7168 const __m128i vk8x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t)));
7169 const __m128i vxk8x89ABCDEF = _mm_cvtepi8_epi16(vk8x89ABCDEF);
7170 i8 += 16;
7171
7172
7173 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
7174 vprod89ABCDEF = _mm_mullo_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF);
7175
7176
7177 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
7178 const __m128i vxi9x01234567 = _mm_cvtepi8_epi16(vi9x01234567);
7179 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t)));
7180 const __m128i vxk9x01234567 = _mm_cvtepi8_epi16(vk9x01234567);
7181 const __m128i vi9x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i9 + 8));
7182 const __m128i vxi9x89ABCDEF = _mm_cvtepi8_epi16(vi9x89ABCDEF);
7183 const __m128i vk9x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(int8_t)));
7184 const __m128i vxk9x89ABCDEF = _mm_cvtepi8_epi16(vk9x89ABCDEF);
7185 i9 += 16;
7186
7187
7188 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi9x01234567, vxk9x01234567));
7189 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi9x89ABCDEF, vxk9x89ABCDEF));
7190
7191 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7192 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7193 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7194 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7195
7196 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
7197 const __m128i vxi10x01234567 = _mm_cvtepi8_epi16(vi10x01234567);
7198 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(int8_t)));
7199 const __m128i vxk10x01234567 = _mm_cvtepi8_epi16(vk10x01234567);
7200 const __m128i vi10x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i10 + 8));
7201 const __m128i vxi10x89ABCDEF = _mm_cvtepi8_epi16(vi10x89ABCDEF);
7202 const __m128i vk10x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(int8_t)));
7203 const __m128i vxk10x89ABCDEF = _mm_cvtepi8_epi16(vk10x89ABCDEF);
7204 i10 += 16;
7205
7206
7207 vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
7208 vprod89ABCDEF = _mm_mullo_epi16(vxi10x89ABCDEF, vxk10x89ABCDEF);
7209
7210
7211 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
7212 const __m128i vxi11x01234567 = _mm_cvtepi8_epi16(vi11x01234567);
7213 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(int8_t)));
7214 const __m128i vxk11x01234567 = _mm_cvtepi8_epi16(vk11x01234567);
7215 const __m128i vi11x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i11 + 8));
7216 const __m128i vxi11x89ABCDEF = _mm_cvtepi8_epi16(vi11x89ABCDEF);
7217 const __m128i vk11x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(int8_t)));
7218 const __m128i vxk11x89ABCDEF = _mm_cvtepi8_epi16(vk11x89ABCDEF);
7219 i11 += 16;
7220
7221
7222 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi11x01234567, vxk11x01234567));
7223 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi11x89ABCDEF, vxk11x89ABCDEF));
7224
7225 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7226 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7227 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7228 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7229
7230 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
7231 const __m128i vxi12x01234567 = _mm_cvtepi8_epi16(vi12x01234567);
7232 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(int8_t)));
7233 const __m128i vxk12x01234567 = _mm_cvtepi8_epi16(vk12x01234567);
7234 const __m128i vi12x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i12 + 8));
7235 const __m128i vxi12x89ABCDEF = _mm_cvtepi8_epi16(vi12x89ABCDEF);
7236 const __m128i vk12x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(int8_t)));
7237 const __m128i vxk12x89ABCDEF = _mm_cvtepi8_epi16(vk12x89ABCDEF);
7238 i12 += 16;
7239
7240
7241 vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
7242 vprod89ABCDEF = _mm_mullo_epi16(vxi12x89ABCDEF, vxk12x89ABCDEF);
7243
7244
7245 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
7246 const __m128i vxi13x01234567 = _mm_cvtepi8_epi16(vi13x01234567);
7247 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(int8_t)));
7248 const __m128i vxk13x01234567 = _mm_cvtepi8_epi16(vk13x01234567);
7249 const __m128i vi13x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i13 + 8));
7250 const __m128i vxi13x89ABCDEF = _mm_cvtepi8_epi16(vi13x89ABCDEF);
7251 const __m128i vk13x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(int8_t)));
7252 const __m128i vxk13x89ABCDEF = _mm_cvtepi8_epi16(vk13x89ABCDEF);
7253 i13 += 16;
7254
7255
7256 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi13x01234567, vxk13x01234567));
7257 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi13x89ABCDEF, vxk13x89ABCDEF));
7258
7259 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7260 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7261 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7262 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7263
7264 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
7265 const __m128i vxi14x01234567 = _mm_cvtepi8_epi16(vi14x01234567);
7266 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(int8_t)));
7267 const __m128i vxk14x01234567 = _mm_cvtepi8_epi16(vk14x01234567);
7268 const __m128i vi14x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i14 + 8));
7269 const __m128i vxi14x89ABCDEF = _mm_cvtepi8_epi16(vi14x89ABCDEF);
7270 const __m128i vk14x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(int8_t)));
7271 const __m128i vxk14x89ABCDEF = _mm_cvtepi8_epi16(vk14x89ABCDEF);
7272 i14 += 16;
7273
7274
7275 vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
7276 vprod89ABCDEF = _mm_mullo_epi16(vxi14x89ABCDEF, vxk14x89ABCDEF);
7277
7278
7279 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
7280 const __m128i vxi15x01234567 = _mm_cvtepi8_epi16(vi15x01234567);
7281 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(int8_t)));
7282 const __m128i vxk15x01234567 = _mm_cvtepi8_epi16(vk15x01234567);
7283 const __m128i vi15x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i15 + 8));
7284 const __m128i vxi15x89ABCDEF = _mm_cvtepi8_epi16(vi15x89ABCDEF);
7285 const __m128i vk15x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(int8_t)));
7286 const __m128i vxk15x89ABCDEF = _mm_cvtepi8_epi16(vk15x89ABCDEF);
7287 i15 += 16;
7288
7289
7290 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi15x01234567, vxk15x01234567));
7291 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi15x89ABCDEF, vxk15x89ABCDEF));
7292
7293 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7294 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7295 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7296 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7297
7298 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
7299 const __m128i vxi16x01234567 = _mm_cvtepi8_epi16(vi16x01234567);
7300 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(int8_t)));
7301 const __m128i vxk16x01234567 = _mm_cvtepi8_epi16(vk16x01234567);
7302 const __m128i vi16x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i16 + 8));
7303 const __m128i vxi16x89ABCDEF = _mm_cvtepi8_epi16(vi16x89ABCDEF);
7304 const __m128i vk16x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(int8_t)));
7305 const __m128i vxk16x89ABCDEF = _mm_cvtepi8_epi16(vk16x89ABCDEF);
7306 i16 += 16;
7307
7308
7309 vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
7310 vprod89ABCDEF = _mm_mullo_epi16(vxi16x89ABCDEF, vxk16x89ABCDEF);
7311
7312
7313 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
7314 const __m128i vxi17x01234567 = _mm_cvtepi8_epi16(vi17x01234567);
7315 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(int8_t)));
7316 const __m128i vxk17x01234567 = _mm_cvtepi8_epi16(vk17x01234567);
7317 const __m128i vi17x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i17 + 8));
7318 const __m128i vxi17x89ABCDEF = _mm_cvtepi8_epi16(vi17x89ABCDEF);
7319 const __m128i vk17x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(int8_t)));
7320 const __m128i vxk17x89ABCDEF = _mm_cvtepi8_epi16(vk17x89ABCDEF);
7321 i17 += 16;
7322
7323
7324 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi17x01234567, vxk17x01234567));
7325 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi17x89ABCDEF, vxk17x89ABCDEF));
7326
7327 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7328 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7329 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7330 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7331
7332 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
7333 const __m128i vxi18x01234567 = _mm_cvtepi8_epi16(vi18x01234567);
7334 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(int8_t)));
7335 const __m128i vxk18x01234567 = _mm_cvtepi8_epi16(vk18x01234567);
7336 const __m128i vi18x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i18 + 8));
7337 const __m128i vxi18x89ABCDEF = _mm_cvtepi8_epi16(vi18x89ABCDEF);
7338 const __m128i vk18x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(int8_t)));
7339 const __m128i vxk18x89ABCDEF = _mm_cvtepi8_epi16(vk18x89ABCDEF);
7340 i18 += 16;
7341
7342
7343 vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
7344 vprod89ABCDEF = _mm_mullo_epi16(vxi18x89ABCDEF, vxk18x89ABCDEF);
7345
7346
7347 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
7348 const __m128i vxi19x01234567 = _mm_cvtepi8_epi16(vi19x01234567);
7349 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(int8_t)));
7350 const __m128i vxk19x01234567 = _mm_cvtepi8_epi16(vk19x01234567);
7351 const __m128i vi19x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i19 + 8));
7352 const __m128i vxi19x89ABCDEF = _mm_cvtepi8_epi16(vi19x89ABCDEF);
7353 const __m128i vk19x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(int8_t)));
7354 const __m128i vxk19x89ABCDEF = _mm_cvtepi8_epi16(vk19x89ABCDEF);
7355 i19 += 16;
7356
7357
7358 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi19x01234567, vxk19x01234567));
7359 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi19x89ABCDEF, vxk19x89ABCDEF));
7360
7361 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7362 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7363 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7364 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7365
7366 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
7367 const __m128i vxi20x01234567 = _mm_cvtepi8_epi16(vi20x01234567);
7368 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(int8_t)));
7369 const __m128i vxk20x01234567 = _mm_cvtepi8_epi16(vk20x01234567);
7370 const __m128i vi20x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i20 + 8));
7371 const __m128i vxi20x89ABCDEF = _mm_cvtepi8_epi16(vi20x89ABCDEF);
7372 const __m128i vk20x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(int8_t)));
7373 const __m128i vxk20x89ABCDEF = _mm_cvtepi8_epi16(vk20x89ABCDEF);
7374 i20 += 16;
7375
7376
7377 vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
7378 vprod89ABCDEF = _mm_mullo_epi16(vxi20x89ABCDEF, vxk20x89ABCDEF);
7379
7380
7381 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
7382 const __m128i vxi21x01234567 = _mm_cvtepi8_epi16(vi21x01234567);
7383 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(int8_t)));
7384 const __m128i vxk21x01234567 = _mm_cvtepi8_epi16(vk21x01234567);
7385 const __m128i vi21x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i21 + 8));
7386 const __m128i vxi21x89ABCDEF = _mm_cvtepi8_epi16(vi21x89ABCDEF);
7387 const __m128i vk21x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(int8_t)));
7388 const __m128i vxk21x89ABCDEF = _mm_cvtepi8_epi16(vk21x89ABCDEF);
7389 i21 += 16;
7390
7391
7392 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi21x01234567, vxk21x01234567));
7393 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi21x89ABCDEF, vxk21x89ABCDEF));
7394
7395 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7396 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7397 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7398 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7399
7400 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
7401 const __m128i vxi22x01234567 = _mm_cvtepi8_epi16(vi22x01234567);
7402 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(int8_t)));
7403 const __m128i vxk22x01234567 = _mm_cvtepi8_epi16(vk22x01234567);
7404 const __m128i vi22x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i22 + 8));
7405 const __m128i vxi22x89ABCDEF = _mm_cvtepi8_epi16(vi22x89ABCDEF);
7406 const __m128i vk22x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(int8_t)));
7407 const __m128i vxk22x89ABCDEF = _mm_cvtepi8_epi16(vk22x89ABCDEF);
7408 i22 += 16;
7409
7410
7411 vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
7412 vprod89ABCDEF = _mm_mullo_epi16(vxi22x89ABCDEF, vxk22x89ABCDEF);
7413
7414
7415 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
7416 const __m128i vxi23x01234567 = _mm_cvtepi8_epi16(vi23x01234567);
7417 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(int8_t)));
7418 const __m128i vxk23x01234567 = _mm_cvtepi8_epi16(vk23x01234567);
7419 const __m128i vi23x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i23 + 8));
7420 const __m128i vxi23x89ABCDEF = _mm_cvtepi8_epi16(vi23x89ABCDEF);
7421 const __m128i vk23x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(int8_t)));
7422 const __m128i vxk23x89ABCDEF = _mm_cvtepi8_epi16(vk23x89ABCDEF);
7423 i23 += 16;
7424
7425
7426 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi23x01234567, vxk23x01234567));
7427 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi23x89ABCDEF, vxk23x89ABCDEF));
7428
7429 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7430 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7431 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7432 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7433
7434 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
7435 const __m128i vxi24x01234567 = _mm_cvtepi8_epi16(vi24x01234567);
7436 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(int8_t)));
7437 const __m128i vxk24x01234567 = _mm_cvtepi8_epi16(vk24x01234567);
7438 const __m128i vi24x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i24 + 8));
7439 const __m128i vxi24x89ABCDEF = _mm_cvtepi8_epi16(vi24x89ABCDEF);
7440 const __m128i vk24x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(int8_t)));
7441 const __m128i vxk24x89ABCDEF = _mm_cvtepi8_epi16(vk24x89ABCDEF);
7442 i24 += 16;
7443
7444
7445 vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
7446 vprod89ABCDEF = _mm_mullo_epi16(vxi24x89ABCDEF, vxk24x89ABCDEF);
7447
7448 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7449 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7450 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7451 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7452
7453 w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(int8_t));
7454
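      // fp32 requantization: convert the accumulators to float, multiply by the scale, clamp
      // against (output_max - output_zero_point), and convert back to int32 with rounding.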
7455 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
7456 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
7457 __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
7458 __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
7459
7460 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
7461 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
7462 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
7463 vscaled89AB = _mm_mul_ps(vscaled89AB, vscale);
7464 vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscale);
7465
7466 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
7467 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
7468 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
7469 vscaled89AB = _mm_min_ps(vscaled89AB, voutput_max_less_zero_point);
7470 vscaledCDEF = _mm_min_ps(vscaledCDEF, voutput_max_less_zero_point);
7471
7472 vacc0123 = _mm_cvtps_epi32(vscaled0123);
7473 vacc4567 = _mm_cvtps_epi32(vscaled4567);
7474 vacc89AB = _mm_cvtps_epi32(vscaled89AB);
7475 vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
7476
7477 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
7478 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7479 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
7480
7481
7482 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
7483
7484 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
7485 vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
7486
7487 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
7488 output += 16;
7489 }
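    // Remainder path: handle the last 1-15 channels in groups of up to 8, stepping through the
    // 16-channel kernel-tap rows 8 lanes at a time (consecutive taps are 16 bytes apart in w).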
7490 if XNN_UNLIKELY(c != 0) {
7491 const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
7492 do {
7493 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
7494 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
7495
7496
7497 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
7498 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
7499 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k);
7500 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
7501 i0 += 8;
7502
7503
7504 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
7505
7506
7507 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
7508 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
7509 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 16));
7510 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
7511 i1 += 8;
7512
7513
7514 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
7515
7516 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7517 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7518
7519 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
7520 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
7521 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 32));
7522 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
7523 i2 += 8;
7524
7525
7526 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
7527
7528
7529 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
7530 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
7531 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) (k + 48));
7532 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
7533 i3 += 8;
7534
7535
7536 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
7537
7538 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7539 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7540
7541 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
7542 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
7543 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) (k + 64));
7544 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
7545 i4 += 8;
7546
7547
7548 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
7549
7550
7551 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
7552 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
7553 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) (k + 80));
7554 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
7555 i5 += 8;
7556
7557
7558 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
7559
7560 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7561 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7562
7563 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
7564 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
7565 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) (k + 96));
7566 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
7567 i6 += 8;
7568
7569
7570 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
7571
7572
7573 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
7574 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
7575 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) (k + 112));
7576 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
7577 i7 += 8;
7578
7579
7580 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
7581
7582 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7583 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7584
7585 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
7586 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
7587 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) (k + 128));
7588 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
7589 i8 += 8;
7590
7591
7592 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
7593
7594
7595 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
7596 const __m128i vxi9x01234567 = _mm_cvtepi8_epi16(vi9x01234567);
7597 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) (k + 144));
7598 const __m128i vxk9x01234567 = _mm_cvtepi8_epi16(vk9x01234567);
7599 i9 += 8;
7600
7601
7602 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi9x01234567, vxk9x01234567));
7603
7604 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7605 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7606
7607 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
7608 const __m128i vxi10x01234567 = _mm_cvtepi8_epi16(vi10x01234567);
7609 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) (k + 160));
7610 const __m128i vxk10x01234567 = _mm_cvtepi8_epi16(vk10x01234567);
7611 i10 += 8;
7612
7613
7614 vprod01234567 = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
7615
7616
7617 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
7618 const __m128i vxi11x01234567 = _mm_cvtepi8_epi16(vi11x01234567);
7619 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) (k + 176));
7620 const __m128i vxk11x01234567 = _mm_cvtepi8_epi16(vk11x01234567);
7621 i11 += 8;
7622
7623
7624 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi11x01234567, vxk11x01234567));
7625
7626 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7627 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7628
7629 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
7630 const __m128i vxi12x01234567 = _mm_cvtepi8_epi16(vi12x01234567);
7631 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) (k + 192));
7632 const __m128i vxk12x01234567 = _mm_cvtepi8_epi16(vk12x01234567);
7633 i12 += 8;
7634
7635
7636 vprod01234567 = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
7637
7638
7639 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
7640 const __m128i vxi13x01234567 = _mm_cvtepi8_epi16(vi13x01234567);
7641 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) (k + 208));
7642 const __m128i vxk13x01234567 = _mm_cvtepi8_epi16(vk13x01234567);
7643 i13 += 8;
7644
7645
7646 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi13x01234567, vxk13x01234567));
7647
7648 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7649 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7650
7651 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
7652 const __m128i vxi14x01234567 = _mm_cvtepi8_epi16(vi14x01234567);
7653 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) (k + 224));
7654 const __m128i vxk14x01234567 = _mm_cvtepi8_epi16(vk14x01234567);
7655 i14 += 8;
7656
7657
7658 vprod01234567 = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
7659
7660
7661 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
7662 const __m128i vxi15x01234567 = _mm_cvtepi8_epi16(vi15x01234567);
7663 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) (k + 240));
7664 const __m128i vxk15x01234567 = _mm_cvtepi8_epi16(vk15x01234567);
7665 i15 += 8;
7666
7667
7668 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi15x01234567, vxk15x01234567));
7669
7670 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7671 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7672
7673 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
7674 const __m128i vxi16x01234567 = _mm_cvtepi8_epi16(vi16x01234567);
7675 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) (k + 256));
7676 const __m128i vxk16x01234567 = _mm_cvtepi8_epi16(vk16x01234567);
7677 i16 += 8;
7678
7679
7680 vprod01234567 = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
7681
7682
7683 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
7684 const __m128i vxi17x01234567 = _mm_cvtepi8_epi16(vi17x01234567);
7685 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) (k + 272));
7686 const __m128i vxk17x01234567 = _mm_cvtepi8_epi16(vk17x01234567);
7687 i17 += 8;
7688
7689
7690 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi17x01234567, vxk17x01234567));
7691
7692 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7693 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7694
7695 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
7696 const __m128i vxi18x01234567 = _mm_cvtepi8_epi16(vi18x01234567);
7697 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) (k + 288));
7698 const __m128i vxk18x01234567 = _mm_cvtepi8_epi16(vk18x01234567);
7699 i18 += 8;
7700
7701
7702 vprod01234567 = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
7703
7704
7705 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
7706 const __m128i vxi19x01234567 = _mm_cvtepi8_epi16(vi19x01234567);
7707 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) (k + 304));
7708 const __m128i vxk19x01234567 = _mm_cvtepi8_epi16(vk19x01234567);
7709 i19 += 8;
7710
7711
7712 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi19x01234567, vxk19x01234567));
7713
7714 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7715 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7716
7717 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
7718 const __m128i vxi20x01234567 = _mm_cvtepi8_epi16(vi20x01234567);
7719 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) (k + 320));
7720 const __m128i vxk20x01234567 = _mm_cvtepi8_epi16(vk20x01234567);
7721 i20 += 8;
7722
7723
7724 vprod01234567 = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
7725
7726
7727 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
7728 const __m128i vxi21x01234567 = _mm_cvtepi8_epi16(vi21x01234567);
7729 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) (k + 336));
7730 const __m128i vxk21x01234567 = _mm_cvtepi8_epi16(vk21x01234567);
7731 i21 += 8;
7732
7733
7734 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi21x01234567, vxk21x01234567));
7735
7736 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7737 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7738
7739 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
7740 const __m128i vxi22x01234567 = _mm_cvtepi8_epi16(vi22x01234567);
7741 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) (k + 352));
7742 const __m128i vxk22x01234567 = _mm_cvtepi8_epi16(vk22x01234567);
7743 i22 += 8;
7744
7745
7746 vprod01234567 = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
7747
7748
7749 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
7750 const __m128i vxi23x01234567 = _mm_cvtepi8_epi16(vi23x01234567);
7751 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) (k + 368));
7752 const __m128i vxk23x01234567 = _mm_cvtepi8_epi16(vk23x01234567);
7753 i23 += 8;
7754
7755
7756 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi23x01234567, vxk23x01234567));
7757
7758 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7759 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7760
7761 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
7762 const __m128i vxi24x01234567 = _mm_cvtepi8_epi16(vi24x01234567);
7763 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) (k + 384));
7764 const __m128i vxk24x01234567 = _mm_cvtepi8_epi16(vk24x01234567);
7765 i24 += 8;
7766
7767
7768 vprod01234567 = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
7769
7770 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7771 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7772
7773 k += 8;
7774
7775 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
7776 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
7777
7778 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
7779 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
7780 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
7781
7782 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
7783 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
7784 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
7785
7786 vacc0123 = _mm_cvtps_epi32(vscaled0123);
7787 vacc4567 = _mm_cvtps_epi32(vscaled4567);
7788
7789 w = (const void*) ((const int32_t*) w + 8);
7790
7791 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
7792 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
7793
7794
7795 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
7796
7797 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
7798
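        // Store 8 output channels when available; otherwise emit the final 1-7 bytes with
        // progressively narrower 4-/2-/1-byte stores.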
7799 if XNN_LIKELY(c >= 8) {
7800 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
7801 output += 8;
7802 c -= 8;
7803 } else {
7804 if (c & 4) {
7805 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
7806 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
7807 output += 4;
7808 }
7809 if (c & 2) {
7810 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
7811 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
7812 output += 2;
7813 }
7814 if (c & 1) {
7815 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
7816 output += 1;
7817 }
7818 c = 0;
7819 }
7820 } while (c != 0);
7821 }
7822
7823 output = (int8_t*) ((uintptr_t) output + output_increment);
7824 } while (--output_width != 0);
7825 }
7826
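// Same structure as the 25-tap kernel above, specialized for 9 taps (e.g. a 3x3 depthwise
// convolution), 16 channels per main-loop iteration.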
7827 void xnn_qs8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16_add16(
7828 size_t channels,
7829 size_t output_width,
7830 const int8_t** input,
7831 const void* weights,
7832 int8_t* output,
7833 size_t input_stride,
7834 size_t output_increment,
7835 size_t input_offset,
7836 const int8_t* zero,
7837 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
7838 {
7839 assert(channels != 0);
7840 assert(output_width != 0);
7841
7842 do {
7843 const int8_t* i0 = input[0];
7844 assert(i0 != NULL);
7845 if XNN_UNPREDICTABLE(i0 != zero) {
7846 i0 = (const int8_t*) ((uintptr_t) i0 + input_offset);
7847 }
7848 const int8_t* i1 = input[1];
7849 assert(i1 != NULL);
7850 if XNN_UNPREDICTABLE(i1 != zero) {
7851 i1 = (const int8_t*) ((uintptr_t) i1 + input_offset);
7852 }
7853 const int8_t* i2 = input[2];
7854 assert(i2 != NULL);
7855 if XNN_UNPREDICTABLE(i2 != zero) {
7856 i2 = (const int8_t*) ((uintptr_t) i2 + input_offset);
7857 }
7858 const int8_t* i3 = input[3];
7859 assert(i3 != NULL);
7860 if XNN_UNPREDICTABLE(i3 != zero) {
7861 i3 = (const int8_t*) ((uintptr_t) i3 + input_offset);
7862 }
7863 const int8_t* i4 = input[4];
7864 assert(i4 != NULL);
7865 if XNN_UNPREDICTABLE(i4 != zero) {
7866 i4 = (const int8_t*) ((uintptr_t) i4 + input_offset);
7867 }
7868 const int8_t* i5 = input[5];
7869 assert(i5 != NULL);
7870 if XNN_UNPREDICTABLE(i5 != zero) {
7871 i5 = (const int8_t*) ((uintptr_t) i5 + input_offset);
7872 }
7873 const int8_t* i6 = input[6];
7874 assert(i6 != NULL);
7875 if XNN_UNPREDICTABLE(i6 != zero) {
7876 i6 = (const int8_t*) ((uintptr_t) i6 + input_offset);
7877 }
7878 const int8_t* i7 = input[7];
7879 assert(i7 != NULL);
7880 if XNN_UNPREDICTABLE(i7 != zero) {
7881 i7 = (const int8_t*) ((uintptr_t) i7 + input_offset);
7882 }
7883 const int8_t* i8 = input[8];
7884 assert(i8 != NULL);
7885 if XNN_UNPREDICTABLE(i8 != zero) {
7886 i8 = (const int8_t*) ((uintptr_t) i8 + input_offset);
7887 }
7888 input = (const int8_t**) ((uintptr_t) input + input_stride);
7889
7890 size_t c = channels;
7891 const void* w = weights;
7892 for (; c >= 16; c -= 16) {
7893 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
7894 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
7895 __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
7896 __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
7897
7898
7899 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
7900 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
7901 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(int8_t)));
7902 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
7903 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
7904 const __m128i vxi0x89ABCDEF = _mm_cvtepi8_epi16(vi0x89ABCDEF);
7905 const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(int8_t)));
7906 const __m128i vxk0x89ABCDEF = _mm_cvtepi8_epi16(vk0x89ABCDEF);
7907 i0 += 16;
7908
7909
7910 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
7911 __m128i vprod89ABCDEF = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
7912
7913
7914 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
7915 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
7916 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(int8_t)));
7917 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
7918 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
7919 const __m128i vxi1x89ABCDEF = _mm_cvtepi8_epi16(vi1x89ABCDEF);
7920 const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(int8_t)));
7921 const __m128i vxk1x89ABCDEF = _mm_cvtepi8_epi16(vk1x89ABCDEF);
7922 i1 += 16;
7923
7924
7925 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
7926 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF));
7927
7928 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7929 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7930 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7931 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7932
7933 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
7934 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
7935 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(int8_t)));
7936 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
7937 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
7938 const __m128i vxi2x89ABCDEF = _mm_cvtepi8_epi16(vi2x89ABCDEF);
7939 const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(int8_t)));
7940 const __m128i vxk2x89ABCDEF = _mm_cvtepi8_epi16(vk2x89ABCDEF);
7941 i2 += 16;
7942
7943
7944 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
7945 vprod89ABCDEF = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
7946
7947
7948 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
7949 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
7950 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(int8_t)));
7951 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
7952 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
7953 const __m128i vxi3x89ABCDEF = _mm_cvtepi8_epi16(vi3x89ABCDEF);
7954 const __m128i vk3x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(int8_t)));
7955 const __m128i vxk3x89ABCDEF = _mm_cvtepi8_epi16(vk3x89ABCDEF);
7956 i3 += 16;
7957
7958
7959 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
7960 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF));
7961
7962 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7963 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7964 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7965 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
7966
7967 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
7968 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
7969 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(int8_t)));
7970 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
7971 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
7972 const __m128i vxi4x89ABCDEF = _mm_cvtepi8_epi16(vi4x89ABCDEF);
7973 const __m128i vk4x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(int8_t)));
7974 const __m128i vxk4x89ABCDEF = _mm_cvtepi8_epi16(vk4x89ABCDEF);
7975 i4 += 16;
7976
7977
7978 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
7979 vprod89ABCDEF = _mm_mullo_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF);
7980
7981
7982 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
7983 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
7984 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(int8_t)));
7985 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
7986 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
7987 const __m128i vxi5x89ABCDEF = _mm_cvtepi8_epi16(vi5x89ABCDEF);
7988 const __m128i vk5x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(int8_t)));
7989 const __m128i vxk5x89ABCDEF = _mm_cvtepi8_epi16(vk5x89ABCDEF);
7990 i5 += 16;
7991
7992
7993 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
7994 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF));
7995
7996 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
7997 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
7998 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
7999 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
8000
8001 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
8002 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
8003 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(int8_t)));
8004 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
8005 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
8006 const __m128i vxi6x89ABCDEF = _mm_cvtepi8_epi16(vi6x89ABCDEF);
8007 const __m128i vk6x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(int8_t)));
8008 const __m128i vxk6x89ABCDEF = _mm_cvtepi8_epi16(vk6x89ABCDEF);
8009 i6 += 16;
8010
8011
8012 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
8013 vprod89ABCDEF = _mm_mullo_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF);
8014
8015
8016 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
8017 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
8018 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(int8_t)));
8019 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
8020 const __m128i vi7x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i7 + 8));
8021 const __m128i vxi7x89ABCDEF = _mm_cvtepi8_epi16(vi7x89ABCDEF);
8022 const __m128i vk7x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(int8_t)));
8023 const __m128i vxk7x89ABCDEF = _mm_cvtepi8_epi16(vk7x89ABCDEF);
8024 i7 += 16;
8025
8026
8027 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
8028 vprod89ABCDEF = _mm_add_epi16(vprod89ABCDEF, _mm_mullo_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF));
8029
8030 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
8031 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
8032 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
8033 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
8034
8035 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
8036 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
8037 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(int8_t)));
8038 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
8039 const __m128i vi8x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i8 + 8));
8040 const __m128i vxi8x89ABCDEF = _mm_cvtepi8_epi16(vi8x89ABCDEF);
8041 const __m128i vk8x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(int8_t)));
8042 const __m128i vxk8x89ABCDEF = _mm_cvtepi8_epi16(vk8x89ABCDEF);
8043 i8 += 16;
8044
8045
8046 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
8047 vprod89ABCDEF = _mm_mullo_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF);
8048
8049 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
8050 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
8051 vacc89AB = _mm_add_epi32(vacc89AB, _mm_cvtepi16_epi32(vprod89ABCDEF));
8052 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_srai_epi32(_mm_unpackhi_epi16(vprod89ABCDEF, vprod89ABCDEF), 16));
8053
8054 w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(int8_t));
8055
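    // Requantize the 16 int32 accumulators: convert to float, apply the scale,
    // clamp against the representable maximum, round back to int32, add the
    // output zero point with saturation, and pack down to int8 with a final
    // lower-bound clamp.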
8056 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
8057 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
8058 __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
8059 __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
8060
8061 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
8062 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
8063 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
8064 vscaled89AB = _mm_mul_ps(vscaled89AB, vscale);
8065 vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscale);
8066
8067 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
8068 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
8069 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
8070 vscaled89AB = _mm_min_ps(vscaled89AB, voutput_max_less_zero_point);
8071 vscaledCDEF = _mm_min_ps(vscaledCDEF, voutput_max_less_zero_point);
8072
8073 vacc0123 = _mm_cvtps_epi32(vscaled0123);
8074 vacc4567 = _mm_cvtps_epi32(vscaled4567);
8075 vacc89AB = _mm_cvtps_epi32(vscaled89AB);
8076 vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
8077
8078 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
8079 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8080 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
8081
8082
8083 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
8084
8085 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
8086 vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
8087
8088 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
8089 output += 16;
8090 }
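  // Handle the remaining (c < 16) channels 8 at a time; the kernel taps for
  // each of the 9 input rows are read at fixed 16-byte strides from `k`.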
8091 if XNN_UNLIKELY(c != 0) {
8092 const int8_t* k = (const int8_t*) ((const int32_t*) w + 16);
8093 do {
8094 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
8095 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
8096
8097
8098 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
8099 const __m128i vxi0x01234567 = _mm_cvtepi8_epi16(vi0x01234567);
8100 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k);
8101 const __m128i vxk0x01234567 = _mm_cvtepi8_epi16(vk0x01234567);
8102 i0 += 8;
8103
8104
8105 __m128i vprod01234567 = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
8106
8107
8108 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
8109 const __m128i vxi1x01234567 = _mm_cvtepi8_epi16(vi1x01234567);
8110 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 16));
8111 const __m128i vxk1x01234567 = _mm_cvtepi8_epi16(vk1x01234567);
8112 i1 += 8;
8113
8114
8115 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi1x01234567, vxk1x01234567));
8116
8117 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
8118 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
8119
8120 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
8121 const __m128i vxi2x01234567 = _mm_cvtepi8_epi16(vi2x01234567);
8122 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 32));
8123 const __m128i vxk2x01234567 = _mm_cvtepi8_epi16(vk2x01234567);
8124 i2 += 8;
8125
8126
8127 vprod01234567 = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
8128
8129
8130 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
8131 const __m128i vxi3x01234567 = _mm_cvtepi8_epi16(vi3x01234567);
8132 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) (k + 48));
8133 const __m128i vxk3x01234567 = _mm_cvtepi8_epi16(vk3x01234567);
8134 i3 += 8;
8135
8136
8137 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi3x01234567, vxk3x01234567));
8138
8139 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
8140 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
8141
8142 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
8143 const __m128i vxi4x01234567 = _mm_cvtepi8_epi16(vi4x01234567);
8144 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) (k + 64));
8145 const __m128i vxk4x01234567 = _mm_cvtepi8_epi16(vk4x01234567);
8146 i4 += 8;
8147
8148
8149 vprod01234567 = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
8150
8151
8152 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
8153 const __m128i vxi5x01234567 = _mm_cvtepi8_epi16(vi5x01234567);
8154 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) (k + 80));
8155 const __m128i vxk5x01234567 = _mm_cvtepi8_epi16(vk5x01234567);
8156 i5 += 8;
8157
8158
8159 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi5x01234567, vxk5x01234567));
8160
8161 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
8162 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
8163
8164 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
8165 const __m128i vxi6x01234567 = _mm_cvtepi8_epi16(vi6x01234567);
8166 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) (k + 96));
8167 const __m128i vxk6x01234567 = _mm_cvtepi8_epi16(vk6x01234567);
8168 i6 += 8;
8169
8170
8171 vprod01234567 = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
8172
8173
8174 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
8175 const __m128i vxi7x01234567 = _mm_cvtepi8_epi16(vi7x01234567);
8176 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) (k + 112));
8177 const __m128i vxk7x01234567 = _mm_cvtepi8_epi16(vk7x01234567);
8178 i7 += 8;
8179
8180
8181 vprod01234567 = _mm_add_epi16(vprod01234567, _mm_mullo_epi16(vxi7x01234567, vxk7x01234567));
8182
8183 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
8184 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
8185
8186 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
8187 const __m128i vxi8x01234567 = _mm_cvtepi8_epi16(vi8x01234567);
8188 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) (k + 128));
8189 const __m128i vxk8x01234567 = _mm_cvtepi8_epi16(vk8x01234567);
8190 i8 += 8;
8191
8192
8193 vprod01234567 = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
8194
8195 vacc0123 = _mm_add_epi32(vacc0123, _mm_cvtepi16_epi32(vprod01234567));
8196 vacc4567 = _mm_add_epi32(vacc4567, _mm_srai_epi32(_mm_unpackhi_epi16(vprod01234567, vprod01234567), 16));
8197
8198 k += 8;
8199
8200 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
8201 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
8202
8203 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
8204 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
8205 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
8206
8207 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
8208 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
8209 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
8210
8211 vacc0123 = _mm_cvtps_epi32(vscaled0123);
8212 vacc4567 = _mm_cvtps_epi32(vscaled4567);
8213
8214 w = (const void*) ((const int32_t*) w + 8);
8215
8216 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
8217 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8218
8219
8220 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
8221
8222 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
8223
8224 if XNN_LIKELY(c >= 8) {
8225 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
8226 output += 8;
8227 c -= 8;
8228 } else {
8229 if (c & 4) {
8230 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
8231 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
8232 output += 4;
8233 }
8234 if (c & 2) {
8235 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
8236 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
8237 output += 2;
8238 }
8239 if (c & 1) {
8240 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
8241 output += 1;
8242 }
8243 c = 0;
8244 }
8245 } while (c != 0);
8246 }
8247
8248 output = (int8_t*) ((uintptr_t) output + output_increment);
8249 } while (--output_width != 0);
8250 }
8251
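// Converts signed 8-bit quantized values to 32-bit floats, 32 elements per
// iteration: sign-extend int8 to int32, add the pre-negated zero point from
// params, convert to float, and multiply by the dequantization scale.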
8252 void xnn_qs8_f32_vcvt_ukernel__avx_x32(
8253 size_t n,
8254 const int8_t* x,
8255 float* y,
8256 const union xnn_qs8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8257 {
8258 assert(n != 0);
8259 assert(n % sizeof(int8_t) == 0);
8260 assert(x != NULL);
8261 assert(y != NULL);
8262
8263 const __m128i vminus_zero_point = _mm_load_si128((const __m128i*) params->avx.minus_zero_point);
8264 const __m256 vscale = _mm256_load_ps(params->avx.scale);
8265 for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
8266 __m128i vx0123 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
8267 __m128i vx4567 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 4)));
8268 __m128i vx89AB = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 8)));
8269 __m128i vxCDEF = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 12)));
8270 __m128i vxGHIJ = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 16)));
8271 __m128i vxKLMN = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 20)));
8272 __m128i vxOPQR = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 24)));
8273 __m128i vxSTUV = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 28)));
8274 x += 32;
8275
8276 vx0123 = _mm_add_epi32(vx0123, vminus_zero_point);
8277 vx4567 = _mm_add_epi32(vx4567, vminus_zero_point);
8278 vx89AB = _mm_add_epi32(vx89AB, vminus_zero_point);
8279 vxCDEF = _mm_add_epi32(vxCDEF, vminus_zero_point);
8280 vxGHIJ = _mm_add_epi32(vxGHIJ, vminus_zero_point);
8281 vxKLMN = _mm_add_epi32(vxKLMN, vminus_zero_point);
8282 vxOPQR = _mm_add_epi32(vxOPQR, vminus_zero_point);
8283 vxSTUV = _mm_add_epi32(vxSTUV, vminus_zero_point);
8284
8285 const __m256i vx01234567 = _mm256_insertf128_si256(_mm256_castsi128_si256(vx0123), vx4567, 1);
8286 const __m256i vx89ABCDEF = _mm256_insertf128_si256(_mm256_castsi128_si256(vx89AB), vxCDEF, 1);
8287 const __m256i vxGHIJKLMN = _mm256_insertf128_si256(_mm256_castsi128_si256(vxGHIJ), vxKLMN, 1);
8288 const __m256i vxOPQRSTUV = _mm256_insertf128_si256(_mm256_castsi128_si256(vxOPQR), vxSTUV, 1);
8289
8290 __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
8291 __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
8292 __m256 vyGHIJKLMN = _mm256_cvtepi32_ps(vxGHIJKLMN);
8293 __m256 vyOPQRSTUV = _mm256_cvtepi32_ps(vxOPQRSTUV);
8294
8295 vy01234567 = _mm256_mul_ps(vy01234567, vscale);
8296 vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
8297 vyGHIJKLMN = _mm256_mul_ps(vyGHIJKLMN, vscale);
8298 vyOPQRSTUV = _mm256_mul_ps(vyOPQRSTUV, vscale);
8299
8300 _mm256_storeu_ps(y, vy01234567);
8301 _mm256_storeu_ps(y + 8, vy89ABCDEF);
8302 _mm256_storeu_ps(y + 16, vyGHIJKLMN);
8303 _mm256_storeu_ps(y + 24, vyOPQRSTUV);
8304 y += 32;
8305 }
8306 for (; n >= 4 * sizeof(int8_t); n -= 4 * sizeof(int8_t)) {
8307 __m128i vx = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
8308 vx = _mm_add_epi32(vx, vminus_zero_point);
8309 x += 4;
8310
8311 __m128 vy = _mm_cvtepi32_ps(vx);
8312 vy = _mm_mul_ps(vy, _mm256_castps256_ps128(vscale));
8313
8314 _mm_storeu_ps(y, vy);
8315 y += 4;
8316 }
8317 if XNN_UNLIKELY(n != 0) {
8318 assert(n >= 1 * sizeof(int8_t));
8319 assert(n <= 3 * sizeof(int8_t));
8320
8321 __m128i vx = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
8322 vx = _mm_add_epi32(vx, vminus_zero_point);
8323
8324 __m128 vy = _mm_cvtepi32_ps(vx);
8325 vy = _mm_mul_ps(vy, _mm256_castps256_ps128(vscale));
8326
8327 if (n & (2 * sizeof(int8_t))) {
8328 _mm_storel_pi((__m64*) y, vy);
8329 vy = _mm_movehl_ps(vy, vy);
8330 y += 2;
8331 }
8332 if (n & (1 * sizeof(int8_t))) {
8333 _mm_store_ss(y, vy);
8334 }
8335 }
8336 }
8337
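// QS8 GEMM microkernel (1 row of A x 4 columns of B, 8 bytes of K per step,
// kc rounded up to a multiple of 8) with FP32 requantization: int8 operands
// are widened to int16 and accumulated into per-column int32 accumulators via
// _mm_madd_epi16, then scaled in float, clamped, and packed back to int8.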
8338 void xnn_qs8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
8339 size_t mr,
8340 size_t nc,
8341 size_t kc,
8342 const int8_t* restrict a,
8343 size_t a_stride,
8344 const void* restrict w,
8345 int8_t* restrict c,
8346 size_t cm_stride,
8347 size_t cn_stride,
8348 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8349 {
8350 assert(mr != 0);
8351 assert(mr <= 1);
8352 assert(nc != 0);
8353 assert(kc != 0);
8354 assert(kc % sizeof(int8_t) == 0);
8355 assert(a != NULL);
8356 assert(w != NULL);
8357 assert(c != NULL);
8358
8359 kc = round_up_po2(kc, 8);
8360 const int8_t* a0 = a;
8361 int8_t* c0 = c;
8362
8363 do {
8364 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
8365 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
8366 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
8367 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
8368 w = (const int32_t*) w + 4;
8369
8370 size_t k = 0;
8371 while (k < kc) {
8372 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
8373 const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
8374 a0 += 8;
8375
8376 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
8377 const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
8378 const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
8379
8380 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
8381 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
8382 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
8383 const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
8384 const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
8385
8386 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
8387 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
8388
8389 w = (const void*) ((const int8_t*) w + 32);
8390 k += 8 * sizeof(int8_t);
8391 }
8392
8393 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
8394 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
8395
8396 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
8397
8398 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
8399
8400 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
8401 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
8402
8403 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
8404 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
8405
8406 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
8407
8408 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
8409 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
8410
8411
8412 __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
8413
8414 vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
8415
8416 if (nc >= 4) {
8417 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
8418
8419 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
8420
8421 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
8422
8423 nc -= 4;
8424 } else {
8425 if (nc & 2) {
8426 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
8427 c0 += 2;
8428 vout = _mm_srli_epi32(vout, 16);
8429 }
8430 if (nc & 1) {
8431 *c0 = (int8_t) _mm_extract_epi8(vout, 0);
8432 }
8433
8434 nc = 0;
8435 }
8436 } while (nc != 0);
8437 }
8438
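// Same accumulation and requantization scheme as the 1x4c8 GEMM above, but
// processing 2 rows of A per iteration; row 1 aliases row 0 when mr < 2 so
// out-of-range rows are never read or written.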
8439 void xnn_qs8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
8440 size_t mr,
8441 size_t nc,
8442 size_t kc,
8443 const int8_t* restrict a,
8444 size_t a_stride,
8445 const void* restrict w,
8446 int8_t* restrict c,
8447 size_t cm_stride,
8448 size_t cn_stride,
8449 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8450 {
8451 assert(mr != 0);
8452 assert(mr <= 2);
8453 assert(nc != 0);
8454 assert(kc != 0);
8455 assert(kc % sizeof(int8_t) == 0);
8456 assert(a != NULL);
8457 assert(w != NULL);
8458 assert(c != NULL);
8459
8460 kc = round_up_po2(kc, 8);
8461 const int8_t* a0 = a;
8462 int8_t* c0 = c;
8463 const int8_t* a1 = (const int8_t*) ((uintptr_t) a0 + a_stride);
8464 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
8465 if XNN_UNPREDICTABLE(mr != 2) {
8466 a1 = a0;
8467 c1 = c0;
8468 }
8469
8470 do {
8471 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
8472 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
8473 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
8474 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
8475 __m128i vacc1x0 = vacc0x0;
8476 __m128i vacc1x1 = vacc0x1;
8477 __m128i vacc1x2 = vacc0x2;
8478 __m128i vacc1x3 = vacc0x3;
8479 w = (const int32_t*) w + 4;
8480
8481 size_t k = 0;
8482 while (k < kc) {
8483 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
8484 const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
8485 a0 += 8;
8486 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
8487 const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
8488 a1 += 8;
8489
8490 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
8491 const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
8492 const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
8493
8494 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
8495 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
8496 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
8497 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
8498 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
8499 const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
8500 const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
8501
8502 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
8503 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
8504 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
8505 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
8506
8507 w = (const void*) ((const int8_t*) w + 32);
8508 k += 8 * sizeof(int8_t);
8509 }
8510
8511 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
8512 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
8513 const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
8514 const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
8515
8516 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
8517 __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
8518
8519 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
8520 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
8521
8522 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
8523 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
8524 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
8525
8526 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
8527 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
8528 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
8529
8530 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
8531 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
8532
8533 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
8534 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
8535
8536
8537 __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
8538
8539 vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
8540
8541 if (nc >= 4) {
8542 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
8543 unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
8544
8545 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
8546 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
8547
8548 a0 = (const int8_t*) ((uintptr_t) a0 - kc);
8549 a1 = (const int8_t*) ((uintptr_t) a1 - kc);
8550
8551 nc -= 4;
8552 } else {
8553 if (nc & 2) {
8554 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
8555 c0 += 2;
8556 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
8557 c1 += 2;
8558 vout = _mm_srli_epi32(vout, 16);
8559 }
8560 if (nc & 1) {
8561 *c0 = (int8_t) _mm_extract_epi8(vout, 0);
8562 *c1 = (int8_t) _mm_extract_epi8(vout, 4);
8563 }
8564
8565 nc = 0;
8566 }
8567 } while (nc != 0);
8568 }
8569
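// Indirect GEMM (IGEMM) variant of the 1x4c8 kernel: A is supplied as an
// array of `ks` row pointers; pointers equal to `zero` skip the a_offset
// adjustment and read the shared padding row instead.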
8570 void xnn_qs8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
8571 size_t mr,
8572 size_t nc,
8573 size_t kc,
8574 size_t ks,
8575 const int8_t** restrict a,
8576 const void* restrict w,
8577 int8_t* restrict c,
8578 size_t cm_stride,
8579 size_t cn_stride,
8580 size_t a_offset,
8581 const int8_t* zero,
8582 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8583 {
8584 assert(mr != 0);
8585 assert(mr <= 1);
8586 assert(nc != 0);
8587 assert(kc != 0);
8588 assert(ks != 0);
8589 assert(ks % (1 * sizeof(void*)) == 0);
8590 assert(a_offset % sizeof(int8_t) == 0);
8591 assert(a != NULL);
8592 assert(w != NULL);
8593 assert(c != NULL);
8594
8595 kc = round_up_po2(kc, 8);
8596 int8_t* c0 = c;
8597
8598 do {
8599 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
8600 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
8601 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
8602 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
8603 w = (const int32_t*) w + 4;
8604
8605 size_t p = ks;
8606 do {
8607 const int8_t* restrict a0 = a[0];
8608 if XNN_UNPREDICTABLE(a0 != zero) {
8609 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
8610 }
8611 a += 1;
8612
8613 size_t k = 0;
8614 while (k < kc) {
8615 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
8616 const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
8617 a0 += 8;
8618
8619 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
8620 const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
8621 const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
8622
8623 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
8624 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
8625 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
8626 const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
8627 const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
8628
8629 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
8630 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
8631
8632 w = (const void*) ((const int8_t*) w + 32);
8633 k += 8 * sizeof(int8_t);
8634 }
8635 p -= 1 * sizeof(void*);
8636 } while (p != 0);
8637
8638 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
8639 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
8640
8641 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
8642
8643 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
8644
8645 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
8646 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
8647
8648 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
8649 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
8650
8651 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
8652
8653 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
8654 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
8655
8656
8657 __m128i vout = _mm_packs_epi16(vacc00x0123, vacc00x0123);
8658
8659 vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
8660
8661 if (nc >= 4) {
8662 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
8663 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
8664
8665 a = (const int8_t**restrict) ((uintptr_t) a - ks);
8666
8667 nc -= 4;
8668 } else {
8669 if (nc & 2) {
8670 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
8671 c0 += 2;
8672 vout = _mm_srli_epi32(vout, 16);
8673 }
8674 if (nc & 1) {
8675 *c0 = (int8_t) _mm_extract_epi8(vout, 0);
8676 }
8677
8678 nc = 0;
8679 }
8680 } while (nc != 0);
8681 }
8682
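// 2-row indirect GEMM: identical accumulation scheme to the 2x4c8 GEMM, with
// both A rows taken from the indirection buffer and pointers equal to `zero`
// left unadjusted (padding).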
8683 void xnn_qs8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
8684 size_t mr,
8685 size_t nc,
8686 size_t kc,
8687 size_t ks,
8688 const int8_t** restrict a,
8689 const void* restrict w,
8690 int8_t* restrict c,
8691 size_t cm_stride,
8692 size_t cn_stride,
8693 size_t a_offset,
8694 const int8_t* zero,
8695 const union xnn_qs8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8696 {
8697 assert(mr != 0);
8698 assert(mr <= 2);
8699 assert(nc != 0);
8700 assert(kc != 0);
8701 assert(ks != 0);
8702 assert(ks % (2 * sizeof(void*)) == 0);
8703 assert(a_offset % sizeof(int8_t) == 0);
8704 assert(a != NULL);
8705 assert(w != NULL);
8706 assert(c != NULL);
8707
8708 kc = round_up_po2(kc, 8);
8709 int8_t* c0 = c;
8710 int8_t* c1 = (int8_t*) ((uintptr_t) c0 + cm_stride);
8711 if XNN_UNPREDICTABLE(mr != 2) {
8712 c1 = c0;
8713 }
8714
8715 do {
8716 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
8717 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
8718 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
8719 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
8720 __m128i vacc1x0 = vacc0x0;
8721 __m128i vacc1x1 = vacc0x1;
8722 __m128i vacc1x2 = vacc0x2;
8723 __m128i vacc1x3 = vacc0x3;
8724 w = (const int32_t*) w + 4;
8725
8726 size_t p = ks;
8727 do {
8728 const int8_t* restrict a0 = a[0];
8729 if XNN_UNPREDICTABLE(a0 != zero) {
8730 a0 = (const int8_t*) ((uintptr_t) a0 + a_offset);
8731 }
8732 const int8_t* restrict a1 = a[1];
8733 if XNN_UNPREDICTABLE(a1 != zero) {
8734 a1 = (const int8_t*) ((uintptr_t) a1 + a_offset);
8735 }
8736 a += 2;
8737
8738 size_t k = 0;
8739 while (k < kc) {
8740 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
8741 const __m128i vxa0 = _mm_cvtepi8_epi16(va0);
8742 a0 += 8;
8743 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
8744 const __m128i vxa1 = _mm_cvtepi8_epi16(va1);
8745 a1 += 8;
8746
8747 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
8748 const __m128i vxb0 = _mm_cvtepi8_epi16(vb01);
8749 const __m128i vxb1 = _mm_srai_epi16(_mm_unpackhi_epi8(vb01, vb01), 8);
8750
8751 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
8752 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
8753 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
8754 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
8755 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
8756 const __m128i vxb2 = _mm_cvtepi8_epi16(vb23);
8757 const __m128i vxb3 = _mm_srai_epi16(_mm_unpackhi_epi8(vb23, vb23), 8);
8758
8759 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
8760 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
8761 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
8762 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
8763
8764 w = (const void*) ((const int8_t*) w + 32);
8765 k += 8 * sizeof(int8_t);
8766 }
8767 p -= 2 * sizeof(void*);
8768 } while (p != 0);
8769
8770 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
8771 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
8772 const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
8773 const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
8774
8775 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
8776 __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
8777
8778 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
8779 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
8780
8781 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
8782 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
8783 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
8784
8785 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse4.output_max_less_zero_point);
8786 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
8787 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
8788
8789 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
8790 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
8791
8792 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
8793 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
8794
8795
8796 __m128i vout = _mm_packs_epi16(vacc01x0123, vacc01x0123);
8797
8798 vout = _mm_max_epi8(vout, _mm_load_si128((const __m128i*) params->fp32_sse4.output_min));
8799
8800 if (nc >= 4) {
8801 unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
8802 c1 = (int8_t*) ((uintptr_t) c1 + cn_stride);
8803 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
8804 c0 = (int8_t*) ((uintptr_t) c0 + cn_stride);
8805
8806 a = (const int8_t**restrict) ((uintptr_t) a - ks);
8807
8808 nc -= 4;
8809 } else {
8810 if (nc & 2) {
8811 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
8812 c1 += 2;
8813 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
8814 c0 += 2;
8815 vout = _mm_srli_epi32(vout, 16);
8816 }
8817 if (nc & 1) {
8818 *c1 = (int8_t) _mm_extract_epi8(vout, 4);
8819 *c0 = (int8_t) _mm_extract_epi8(vout, 0);
8820 }
8821
8822 nc = 0;
8823 }
8824 } while (nc != 0);
8825 }
8826
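// Element-wise quantized addition, 8 int8 elements per iteration: widen both
// inputs to int32, multiply by the per-input fixed-point multipliers, add the
// precomputed bias, arithmetic-shift right, then re-quantize with saturating
// packs and min/max clamping.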
8827 void xnn_qs8_vadd_minmax_ukernel__avx_mul32_ld32_x8(
8828 size_t n,
8829 const int8_t* input_a,
8830 const int8_t* input_b,
8831 int8_t* output,
8832 const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8833 {
8834 const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4_mul32.bias);
8835 const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
8836 const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.b_multiplier);
8837 const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4_mul32.shift);
8838 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
8839 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
8840 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4_mul32.output_max);
8841
8842 for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
8843 const __m128i va0123 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a)));
8844 const __m128i vb0123 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_b)));
8845 const __m128i va4567 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a + 4)));
8846 const __m128i vb4567 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_b + 4)));
8847 input_a += 8;
8848 input_b += 8;
8849
8850 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
8851 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
8852
8853 vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
8854 vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
8855
8856 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
8857 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
8858
8859 const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8860
8861 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
8862
8863 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
8864
8865 vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
8866
8867 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
8868 output += 8;
8869 }
8870 if XNN_UNLIKELY(n != 0) {
8871 {
8872 const __m128i va0123 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a)));
8873 const __m128i vb0123 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_b)));
8874 const __m128i va4567 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a + 4)));
8875 const __m128i vb4567 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_b + 4)));
8876
8877 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
8878 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
8879
8880 vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
8881 vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
8882
8883 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
8884 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
8885
8886 const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8887
8888 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
8889 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
8890 vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
8891
8892 if (n & (4 * sizeof(int8_t))) {
8893 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
8894 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
8895 output += 4;
8896 }
8897 if (n & (2 * sizeof(int8_t))) {
8898 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
8899 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
8900 output += 2;
8901 }
8902 if (n & (1 * sizeof(int8_t))) {
8903 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
8904 }
8905 }
8906 }
8907 }
8908
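// Variant of the quantized addition above where operand B is a single scalar:
// its scaled contribution is folded into the bias before the loop, so the
// inner loop only loads and scales operand A.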
8909 void xnn_qs8_vaddc_minmax_ukernel__avx_mul32_ld32_x8(
8910 size_t n,
8911 const int8_t* input_a,
8912 const int8_t* input_b,
8913 int8_t* output,
8914 const union xnn_qs8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8915 {
8916 const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4_mul32.a_multiplier);
8917 const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4_mul32.shift);
8918 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4_mul32.output_zero_point);
8919 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4_mul32.output_min);
8920 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4_mul32.output_max);
8921
8922 __m128i vbias = _mm_cvtsi32_si128(params->sse4_mul32.b_multiplier[0] * (int32_t) *input_b);
8923 vbias = _mm_shuffle_epi32(vbias, _MM_SHUFFLE(0, 0, 0, 0));
8924 vbias = _mm_add_epi32(vbias, _mm_load_si128((const __m128i*) params->sse4_mul32.bias));
8925 for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
8926 const __m128i va0123 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a)));
8927 const __m128i va4567 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a + 4)));
8928 input_a += 8;
8929 input_b += 8;
8930
8931 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
8932 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
8933
8934 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
8935 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
8936
8937 const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8938
8939 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
8940
8941 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
8942
8943 vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
8944
8945 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
8946 output += 8;
8947 }
8948 if XNN_UNLIKELY(n != 0) {
8949 {
8950 const __m128i va0123 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a)));
8951 const __m128i va4567 = _mm_cvtepi8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a + 4)));
8952
8953 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
8954 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
8955
8956 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
8957 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
8958
8959 const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
8960
8961 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
8962 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
8963 vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
8964
8965 if (n & (4 * sizeof(int8_t))) {
8966 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
8967 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
8968 output += 4;
8969 }
8970 if (n & (2 * sizeof(int8_t))) {
8971 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
8972 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
8973 output += 2;
8974 }
8975 if (n & (1 * sizeof(int8_t))) {
8976 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
8977 }
8978 }
8979 }
8980 }
8981
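// Requantization (zero-point/scale change) for int8 tensors, 32 elements per
// iteration: each input is subtracted from the input zero point, shifted left
// by 7 bits, multiplied with the rounding Q15 multiplier (_mm_mulhrs_epi16),
// and offset by the output zero point with saturation.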
8982 void xnn_qs8_vcvt_ukernel__avx_x32(
8983 size_t n,
8984 const int8_t* x,
8985 int8_t* y,
8986 const union xnn_qs8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
8987 {
8988 assert(n != 0);
8989 assert(n % sizeof(int8_t) == 0);
8990 assert(x != NULL);
8991 assert(y != NULL);
8992
8993 const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.input_zero_point);
8994 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->ssse3.multiplier);
8995 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.output_zero_point);
8996 for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
8997 __m128i vacc0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
8998 __m128i vacc1 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 8)));
8999 __m128i vacc2 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 16)));
9000 __m128i vacc3 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 24)));
9001 x += 32;
9002
9003 vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
9004 vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
9005 vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
9006 vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
9007
9008 vacc0 = _mm_slli_epi16(vacc0, 7);
9009 vacc1 = _mm_slli_epi16(vacc1, 7);
9010 vacc2 = _mm_slli_epi16(vacc2, 7);
9011 vacc3 = _mm_slli_epi16(vacc3, 7);
9012
9013 vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier);
9014 vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier);
9015 vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier);
9016 vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier);
9017
9018 vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
9019 vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
9020 vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
9021 vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
9022
9023 const __m128i vy0 = _mm_packs_epi16(vacc0, vacc1);
9024 const __m128i vy1 = _mm_packs_epi16(vacc2, vacc3);
9025
9026 _mm_storeu_si128((__m128i*) y, vy0);
9027 _mm_storeu_si128((__m128i*) (y + 16), vy1);
9028 y += 32;
9029 }
9030 for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
9031 __m128i vacc = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
9032 vacc = _mm_sub_epi16(vinput_zero_point, vacc);
9033 vacc = _mm_slli_epi16(vacc, 7);
9034 vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
9035 vacc = _mm_adds_epi16(vacc, voutput_zero_point);
9036 x += 8;
9037
9038 const __m128i vy = _mm_packs_epi16(vacc, vacc);
9039 _mm_storel_epi64((__m128i*) y, vy);
9040 y += 8;
9041 }
9042 if XNN_UNLIKELY(n != 0) {
9043 assert(n >= 1 * sizeof(int8_t));
9044 assert(n <= 7 * sizeof(int8_t));
9045
9046 __m128i vacc = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
9047 vacc = _mm_sub_epi16(vinput_zero_point, vacc);
9048 vacc = _mm_slli_epi16(vacc, 7);
9049 vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
9050 vacc = _mm_adds_epi16(vacc, voutput_zero_point);
9051
9052 __m128i vy = _mm_packs_epi16(vacc, vacc);
9053 if (n & (4 * sizeof(int8_t))) {
9054 _mm_storeu_si32(y, vy);
9055 vy = _mm_srli_epi64(vy, 32);
9056 y += 4;
9057 }
9058 if (n & (2 * sizeof(int8_t))) {
9059 _mm_storeu_si16(y, vy);
9060 vy = _mm_srli_epi32(vy, 16);
9061 y += 2;
9062 }
9063 if (n & (1 * sizeof(int8_t))) {
9064 *y = (int8_t) _mm_extract_epi8(vy, 0);
9065 }
9066 }
9067 }
9068
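// Quantized LeakyReLU: each lane is compared against the input zero point and
// multiplied by either the positive- or negative-slope fixed-point multiplier
// (selected per lane with _mm_blendv_epi8), using the same mulhrs-based
// requantization as the conversion kernel above.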
9069 void xnn_qs8_vlrelu_ukernel__avx_x32(
9070 size_t n,
9071 const int8_t* x,
9072 int8_t* y,
9073 const union xnn_qs8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
9074 {
9075 assert(n != 0);
9076 assert(n % sizeof(int8_t) == 0);
9077 assert(x != NULL);
9078 assert(y != NULL);
9079
9080 const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->avx.input_zero_point);
9081 const __m128i vpositive_multiplier = _mm_load_si128((const __m128i*) params->avx.positive_multiplier);
9082 const __m128i vnegative_multiplier = _mm_load_si128((const __m128i*) params->avx.negative_multiplier);
9083 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
9084 for (; n >= 32 * sizeof(int8_t); n -= 32 * sizeof(int8_t)) {
9085 __m128i vacc0 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
9086 __m128i vacc1 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 8)));
9087 __m128i vacc2 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 16)));
9088 __m128i vacc3 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (x + 24)));
9089 x += 32;
9090
9091 __m128i vmultiplier0 = _mm_cmpgt_epi16(vacc0, vinput_zero_point);
9092 vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
9093 __m128i vmultiplier1 = _mm_cmpgt_epi16(vacc1, vinput_zero_point);
9094 vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
9095 __m128i vmultiplier2 = _mm_cmpgt_epi16(vacc2, vinput_zero_point);
9096 vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
9097 __m128i vmultiplier3 = _mm_cmpgt_epi16(vacc3, vinput_zero_point);
9098 vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
9099
9100 vmultiplier0 = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier0);
9101 vacc0 = _mm_slli_epi16(vacc0, 7);
9102 vmultiplier1 = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier1);
9103 vacc1 = _mm_slli_epi16(vacc1, 7);
9104 vmultiplier2 = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier2);
9105 vacc2 = _mm_slli_epi16(vacc2, 7);
9106 vmultiplier3 = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier3);
9107 vacc3 = _mm_slli_epi16(vacc3, 7);
9108
9109 vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier0);
9110 vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier1);
9111 vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier2);
9112 vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier3);
9113
9114 vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
9115 vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
9116 vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
9117 vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
9118
9119 const __m128i vy0 = _mm_packs_epi16(vacc0, vacc1);
9120 const __m128i vy1 = _mm_packs_epi16(vacc2, vacc3);
9121
9122 _mm_storeu_si128((__m128i*) y, vy0);
9123 _mm_storeu_si128((__m128i*) (y + 16), vy1);
9124 y += 32;
9125 }
9126 for (; n >= 8 * sizeof(int8_t); n -= 8 * sizeof(int8_t)) {
9127 __m128i vacc = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
9128 __m128i vmultiplier = _mm_cmpgt_epi16(vacc, vinput_zero_point);
9129 vacc = _mm_sub_epi16(vinput_zero_point, vacc);
9130 vmultiplier = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier);
9131 vacc = _mm_slli_epi16(vacc, 7);
9132 vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
9133 vacc = _mm_adds_epi16(vacc, voutput_zero_point);
9134 x += 8;
9135
9136 const __m128i vy = _mm_packs_epi16(vacc, vacc);
9137 _mm_storel_epi64((__m128i*) y, vy);
9138 y += 8;
9139 }
9140 if XNN_UNLIKELY(n != 0) {
9141 assert(n >= 1 * sizeof(int8_t));
9142 assert(n <= 7 * sizeof(int8_t));
9143
9144 __m128i vacc = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) x));
9145 __m128i vmultiplier = _mm_cmpgt_epi16(vacc, vinput_zero_point);
9146 vacc = _mm_sub_epi16(vinput_zero_point, vacc);
9147 vmultiplier = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier);
9148 vacc = _mm_slli_epi16(vacc, 7);
9149 vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
9150 vacc = _mm_adds_epi16(vacc, voutput_zero_point);
9151
9152 __m128i vy = _mm_packs_epi16(vacc, vacc);
9153 if (n & (4 * sizeof(int8_t))) {
9154 _mm_storeu_si32(y, vy);
9155 vy = _mm_srli_epi64(vy, 32);
9156 y += 4;
9157 }
9158 if (n & (2 * sizeof(int8_t))) {
9159 _mm_storeu_si16(y, vy);
9160 vy = _mm_srli_epi32(vy, 16);
9161 y += 2;
9162 }
9163 if (n & (1 * sizeof(int8_t))) {
9164 *y = (int8_t) _mm_extract_epi8(vy, 0);
9165 }
9166 }
9167 }
9168
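// Element-wise quantized multiplication with FP32 requantization, 16 int8
// elements per iteration: inputs are widened to int16, zero points subtracted,
// 32-bit products assembled from mullo/mulhi pairs, scaled in float, and
// converted back to int8 with saturation and min/max clamping.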
9169 void xnn_qs8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16(
9170 size_t n,
9171 const int8_t* input_a,
9172 const int8_t* input_b,
9173 int8_t* output,
9174 const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
9175
9176 {
9177 const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.a_zero_point);
9178 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.b_zero_point);
9179 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
9180 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
9181 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
9182 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse4.output_max);
9183
9184 for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
9185 const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
9186 const __m128i vb01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
9187 const __m128i va89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
9188 const __m128i vb89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_b + 8)));
9189 input_a += 16;
9190 input_b += 16;
9191
9192
9193 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
9194 const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
9195 const __m128i vxa89ABCDEF = _mm_sub_epi16(va89ABCDEF, va_zero_point);
9196 const __m128i vxb89ABCDEF = _mm_sub_epi16(vb89ABCDEF, vb_zero_point);
9197
9198 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
9199 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
9200 const __m128i vprod89ABCDEFlo = _mm_mullo_epi16(vxa89ABCDEF, vxb89ABCDEF);
9201 const __m128i vprod89ABCDEFhi = _mm_mulhi_epi16(vxa89ABCDEF, vxb89ABCDEF);
9202
9203 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
9204 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
9205 const __m128i vprod89AB = _mm_unpacklo_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
9206 const __m128i vprodCDEF = _mm_unpackhi_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
9207
9208 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
9209 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
9210 __m128 vfpacc89AB = _mm_cvtepi32_ps(vprod89AB);
9211 __m128 vfpaccCDEF = _mm_cvtepi32_ps(vprodCDEF);
9212
9213 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
9214 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
9215 vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
9216 vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
9217
9218 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
9219 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
9220 const __m128i vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
9221 const __m128i vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
9222
9223 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
9224 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
9225
9226
9227 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
9228
9229 vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
9230
9231 vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
9232
9233 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
9234 output += 16;
9235 }
9236 if XNN_UNLIKELY(n != 0) {
9237 do {
9238 const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
9239 const __m128i vb01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
9240 input_a += 8;
9241 input_b += 8;
9242
9243
9244 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
9245 const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
9246
9247 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
9248 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
9249
9250 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
9251 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
9252
9253 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
9254 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
9255
9256 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
9257 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
9258
9259 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
9260 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
9261
9262 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
9263
9264 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
9265 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
9266 vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
9267
9268 if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
9269 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
9270 output += 8;
9271 n -= 8 * sizeof(int8_t);
9272 } else {
9273 if (n & (4 * sizeof(int8_t))) {
9274 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
9275 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
9276 output += 4;
9277 }
9278 if (n & (2 * sizeof(int8_t))) {
9279 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
9280 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
9281 output += 2;
9282 }
9283 if (n & (1 * sizeof(int8_t))) {
9284 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
9285 }
9286 n = 0;
9287 }
9288 } while (n != 0);
9289 }
9290 }
9291
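// Multiply-by-scalar variant of the kernel above: operand B is broadcast once
// before the loop (with its zero point subtracted), so the main loop only
// streams operand A.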
9292 void xnn_qs8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16(
9293 size_t n,
9294 const int8_t* input_a,
9295 const int8_t* input_b,
9296 int8_t* output,
9297 const union xnn_qs8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
9298
9299 {
9300 const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.a_zero_point);
9301 const __m128 vscale = _mm_load_ps(params->fp32_sse4.scale);
9302 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse4.output_zero_point);
9303 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse4.output_min);
9304 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse4.output_max);
9305
9306 __m128i vxb = _mm_sub_epi16(
9307 _mm_shuffle_epi32(_mm_cvtsi32_si128(UINT32_C(0x00010001) * (uint32_t) (uint16_t) (int16_t) *input_b), 0),
9308 _mm_load_si128((const __m128i*) params->fp32_sse4.b_zero_point));
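  /* Main loop: load 16 int8 elements of input_a, widened to int16 in two halves
     and offset by the a zero point. */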
9309 for (; n >= 16 * sizeof(int8_t); n -= 16 * sizeof(int8_t)) {
9310 const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
9311 const __m128i va89ABCDEF = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
9312 input_a += 16;
9313
9314
9315 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
9316 const __m128i vxa89ABCDEF = _mm_sub_epi16(va89ABCDEF, va_zero_point);
9317
9318 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
9319 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
9320 const __m128i vprod89ABCDEFlo = _mm_mullo_epi16(vxa89ABCDEF, vxb);
9321 const __m128i vprod89ABCDEFhi = _mm_mulhi_epi16(vxa89ABCDEF, vxb);
9322
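    /* mullo/mulhi give the low and high 16 bits of each 16x16-bit product;
       interleaving them reconstructs the full 32-bit products. */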
9323 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
9324 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
9325 const __m128i vprod89AB = _mm_unpacklo_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
9326 const __m128i vprodCDEF = _mm_unpackhi_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
9327
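    /* fp32 requantization: convert the 32-bit products to float and apply the scale. */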
9328 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
9329 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
9330 __m128 vfpacc89AB = _mm_cvtepi32_ps(vprod89AB);
9331 __m128 vfpaccCDEF = _mm_cvtepi32_ps(vprodCDEF);
9332
9333 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
9334 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
9335 vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
9336 vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
9337
9338 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
9339 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
9340 const __m128i vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
9341 const __m128i vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
9342
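    /* Pack to int16 with a saturating add of the output zero point, then to int8,
       and clamp to the output range. */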
9343 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
9344 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
9345
9346
9347 __m128i vout0123456789ABCDEF = _mm_packs_epi16(vout01234567, vout89ABCDEF);
9348
9349 vout0123456789ABCDEF = _mm_max_epi8(vout0123456789ABCDEF, voutput_min);
9350
9351 vout0123456789ABCDEF = _mm_min_epi8(vout0123456789ABCDEF, voutput_max);
9352
9353 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
9354 output += 16;
9355 }
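  /* Remainder: process the leftover elements in groups of 8; out-of-bounds lanes
     are tolerated (XNN_OOB_READS) and simply not stored. */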
9356 if XNN_UNLIKELY(n != 0) {
9357 do {
9358 const __m128i va01234567 = _mm_cvtepi8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
9359 input_a += 8;
9360
9361
9362 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
9363
9364 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
9365 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
9366
9367 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
9368 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
9369
9370 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
9371 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
9372
9373 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
9374 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
9375
9376 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
9377 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
9378
9379 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
9380
9381 __m128i vout0123456701234567 = _mm_packs_epi16(vout01234567, vout01234567);
9382 vout0123456701234567 = _mm_max_epi8(vout0123456701234567, voutput_min);
9383 vout0123456701234567 = _mm_min_epi8(vout0123456701234567, voutput_max);
9384
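      /* Store a full group of 8 when available; otherwise write the 1-7 remaining
         bytes in 4-, 2- and 1-byte pieces. */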
9385 if XNN_LIKELY(n >= (8 * sizeof(int8_t))) {
9386 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
9387 output += 8;
9388 n -= 8 * sizeof(int8_t);
9389 } else {
9390 if (n & (4 * sizeof(int8_t))) {
9391 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
9392 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
9393 output += 4;
9394 }
9395 if (n & (2 * sizeof(int8_t))) {
9396 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
9397 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
9398 output += 2;
9399 }
9400 if (n & (1 * sizeof(int8_t))) {
9401 *output = (int8_t) _mm_extract_epi8(vout0123456701234567, 0);
9402 }
9403 n = 0;
9404 }
9405 } while (n != 0);
9406 }
9407 }
9408
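/* qu8 depthwise convolution, 25 taps, up to 16 channels per main-loop iteration.
   Accumulation is done in int32 via 16-bit multiplies; requantization uses fp32. */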
9409 void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x25__avx_mul16(
9410 size_t channels,
9411 size_t output_width,
9412 const uint8_t** input,
9413 const void* weights,
9414 uint8_t* output,
9415 size_t input_stride,
9416 size_t output_increment,
9417 size_t input_offset,
9418 const uint8_t* zero,
9419 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
9420 {
9421 assert(channels != 0);
9422 assert(output_width != 0);
9423
9424 do {
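    /* Gather the 25 input row pointers for this output pixel. Rows that point at
       the shared `zero` padding buffer are used as-is; input_offset is applied to
       all other rows. */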
9425 const uint8_t* i0 = input[0];
9426 assert(i0 != NULL);
9427 if XNN_UNPREDICTABLE(i0 != zero) {
9428 i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
9429 }
9430 const uint8_t* i1 = input[1];
9431 assert(i1 != NULL);
9432 if XNN_UNPREDICTABLE(i1 != zero) {
9433 i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
9434 }
9435 const uint8_t* i2 = input[2];
9436 assert(i2 != NULL);
9437 if XNN_UNPREDICTABLE(i2 != zero) {
9438 i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
9439 }
9440 const uint8_t* i3 = input[3];
9441 assert(i3 != NULL);
9442 if XNN_UNPREDICTABLE(i3 != zero) {
9443 i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
9444 }
9445 const uint8_t* i4 = input[4];
9446 assert(i4 != NULL);
9447 if XNN_UNPREDICTABLE(i4 != zero) {
9448 i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
9449 }
9450 const uint8_t* i5 = input[5];
9451 assert(i5 != NULL);
9452 if XNN_UNPREDICTABLE(i5 != zero) {
9453 i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
9454 }
9455 const uint8_t* i6 = input[6];
9456 assert(i6 != NULL);
9457 if XNN_UNPREDICTABLE(i6 != zero) {
9458 i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
9459 }
9460 const uint8_t* i7 = input[7];
9461 assert(i7 != NULL);
9462 if XNN_UNPREDICTABLE(i7 != zero) {
9463 i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
9464 }
9465 const uint8_t* i8 = input[8];
9466 assert(i8 != NULL);
9467 if XNN_UNPREDICTABLE(i8 != zero) {
9468 i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
9469 }
9470 const uint8_t* i9 = input[9];
9471 assert(i9 != NULL);
9472 if XNN_UNPREDICTABLE(i9 != zero) {
9473 i9 = (const uint8_t*) ((uintptr_t) i9 + input_offset);
9474 }
9475 const uint8_t* i10 = input[10];
9476 assert(i10 != NULL);
9477 if XNN_UNPREDICTABLE(i10 != zero) {
9478 i10 = (const uint8_t*) ((uintptr_t) i10 + input_offset);
9479 }
9480 const uint8_t* i11 = input[11];
9481 assert(i11 != NULL);
9482 if XNN_UNPREDICTABLE(i11 != zero) {
9483 i11 = (const uint8_t*) ((uintptr_t) i11 + input_offset);
9484 }
9485 const uint8_t* i12 = input[12];
9486 assert(i12 != NULL);
9487 if XNN_UNPREDICTABLE(i12 != zero) {
9488 i12 = (const uint8_t*) ((uintptr_t) i12 + input_offset);
9489 }
9490 const uint8_t* i13 = input[13];
9491 assert(i13 != NULL);
9492 if XNN_UNPREDICTABLE(i13 != zero) {
9493 i13 = (const uint8_t*) ((uintptr_t) i13 + input_offset);
9494 }
9495 const uint8_t* i14 = input[14];
9496 assert(i14 != NULL);
9497 if XNN_UNPREDICTABLE(i14 != zero) {
9498 i14 = (const uint8_t*) ((uintptr_t) i14 + input_offset);
9499 }
9500 const uint8_t* i15 = input[15];
9501 assert(i15 != NULL);
9502 if XNN_UNPREDICTABLE(i15 != zero) {
9503 i15 = (const uint8_t*) ((uintptr_t) i15 + input_offset);
9504 }
9505 const uint8_t* i16 = input[16];
9506 assert(i16 != NULL);
9507 if XNN_UNPREDICTABLE(i16 != zero) {
9508 i16 = (const uint8_t*) ((uintptr_t) i16 + input_offset);
9509 }
9510 const uint8_t* i17 = input[17];
9511 assert(i17 != NULL);
9512 if XNN_UNPREDICTABLE(i17 != zero) {
9513 i17 = (const uint8_t*) ((uintptr_t) i17 + input_offset);
9514 }
9515 const uint8_t* i18 = input[18];
9516 assert(i18 != NULL);
9517 if XNN_UNPREDICTABLE(i18 != zero) {
9518 i18 = (const uint8_t*) ((uintptr_t) i18 + input_offset);
9519 }
9520 const uint8_t* i19 = input[19];
9521 assert(i19 != NULL);
9522 if XNN_UNPREDICTABLE(i19 != zero) {
9523 i19 = (const uint8_t*) ((uintptr_t) i19 + input_offset);
9524 }
9525 const uint8_t* i20 = input[20];
9526 assert(i20 != NULL);
9527 if XNN_UNPREDICTABLE(i20 != zero) {
9528 i20 = (const uint8_t*) ((uintptr_t) i20 + input_offset);
9529 }
9530 const uint8_t* i21 = input[21];
9531 assert(i21 != NULL);
9532 if XNN_UNPREDICTABLE(i21 != zero) {
9533 i21 = (const uint8_t*) ((uintptr_t) i21 + input_offset);
9534 }
9535 const uint8_t* i22 = input[22];
9536 assert(i22 != NULL);
9537 if XNN_UNPREDICTABLE(i22 != zero) {
9538 i22 = (const uint8_t*) ((uintptr_t) i22 + input_offset);
9539 }
9540 const uint8_t* i23 = input[23];
9541 assert(i23 != NULL);
9542 if XNN_UNPREDICTABLE(i23 != zero) {
9543 i23 = (const uint8_t*) ((uintptr_t) i23 + input_offset);
9544 }
9545 const uint8_t* i24 = input[24];
9546 assert(i24 != NULL);
9547 if XNN_UNPREDICTABLE(i24 != zero) {
9548 i24 = (const uint8_t*) ((uintptr_t) i24 + input_offset);
9549 }
9550 input = (const uint8_t**) ((uintptr_t) input + input_stride);
9551
9552 size_t c = channels;
9553 const void* w = weights;
9554 const __m128i vk_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
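    /* Main loop: 16 channels at a time. The packed weights begin with 16 int32
       bias values, followed by 25 groups of 16 uint8 kernel taps. */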
9555 for (; c >= 16; c -= 16) {
9556 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
9557 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
9558 __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
9559 __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
9560
9561
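      /* Tap 0 of 25: load 16 input and 16 kernel bytes, widen to int16, subtract
         the kernel zero point, and accumulate the 32-bit products. Taps 1-24 below
         repeat the same pattern at successive 16-byte offsets into w. */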
9562 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
9563 const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(vi0x01234567);
9564 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
9565 const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x01234567), vk_zero_point);
9566 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
9567 const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(vi0x89ABCDEF);
9568 const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
9569 const __m128i vxk0x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x89ABCDEF), vk_zero_point);
9570 i0 += 16;
9571
9572
9573 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
9574 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
9575 const __m128i vprod0x89ABCDEFlo = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
9576 const __m128i vprod0x89ABCDEFhi = _mm_mulhi_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
9577
9578 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
9579 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
9580 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod0x89ABCDEFlo, vprod0x89ABCDEFhi));
9581 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod0x89ABCDEFlo, vprod0x89ABCDEFhi));
9582
9583 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
9584 const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(vi1x01234567);
9585 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
9586 const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x01234567), vk_zero_point);
9587 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
9588 const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(vi1x89ABCDEF);
9589 const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
9590 const __m128i vxk1x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x89ABCDEF), vk_zero_point);
9591 i1 += 16;
9592
9593
9594 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
9595 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
9596 const __m128i vprod1x89ABCDEFlo = _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF);
9597 const __m128i vprod1x89ABCDEFhi = _mm_mulhi_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF);
9598
9599 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
9600 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
9601 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod1x89ABCDEFlo, vprod1x89ABCDEFhi));
9602 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod1x89ABCDEFlo, vprod1x89ABCDEFhi));
9603
9604 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
9605 const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(vi2x01234567);
9606 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
9607 const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x01234567), vk_zero_point);
9608 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
9609 const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(vi2x89ABCDEF);
9610 const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
9611 const __m128i vxk2x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x89ABCDEF), vk_zero_point);
9612 i2 += 16;
9613
9614
9615 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
9616 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
9617 const __m128i vprod2x89ABCDEFlo = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
9618 const __m128i vprod2x89ABCDEFhi = _mm_mulhi_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
9619
9620 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
9621 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
9622 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod2x89ABCDEFlo, vprod2x89ABCDEFhi));
9623 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod2x89ABCDEFlo, vprod2x89ABCDEFhi));
9624
9625 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
9626 const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(vi3x01234567);
9627 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
9628 const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x01234567), vk_zero_point);
9629 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
9630 const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(vi3x89ABCDEF);
9631 const __m128i vk3x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
9632 const __m128i vxk3x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x89ABCDEF), vk_zero_point);
9633 i3 += 16;
9634
9635
9636 const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
9637 const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
9638 const __m128i vprod3x89ABCDEFlo = _mm_mullo_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF);
9639 const __m128i vprod3x89ABCDEFhi = _mm_mulhi_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF);
9640
9641 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
9642 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
9643 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod3x89ABCDEFlo, vprod3x89ABCDEFhi));
9644 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod3x89ABCDEFlo, vprod3x89ABCDEFhi));
9645
9646 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
9647 const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(vi4x01234567);
9648 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
9649 const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x01234567), vk_zero_point);
9650 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
9651 const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(vi4x89ABCDEF);
9652 const __m128i vk4x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)));
9653 const __m128i vxk4x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x89ABCDEF), vk_zero_point);
9654 i4 += 16;
9655
9656
9657 const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
9658 const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
9659 const __m128i vprod4x89ABCDEFlo = _mm_mullo_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF);
9660 const __m128i vprod4x89ABCDEFhi = _mm_mulhi_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF);
9661
9662 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
9663 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
9664 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod4x89ABCDEFlo, vprod4x89ABCDEFhi));
9665 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod4x89ABCDEFlo, vprod4x89ABCDEFhi));
9666
9667 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
9668 const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(vi5x01234567);
9669 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)));
9670 const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x01234567), vk_zero_point);
9671 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
9672 const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(vi5x89ABCDEF);
9673 const __m128i vk5x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)));
9674 const __m128i vxk5x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x89ABCDEF), vk_zero_point);
9675 i5 += 16;
9676
9677
9678 const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
9679 const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
9680 const __m128i vprod5x89ABCDEFlo = _mm_mullo_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF);
9681 const __m128i vprod5x89ABCDEFhi = _mm_mulhi_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF);
9682
9683 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
9684 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
9685 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod5x89ABCDEFlo, vprod5x89ABCDEFhi));
9686 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod5x89ABCDEFlo, vprod5x89ABCDEFhi));
9687
9688 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
9689 const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(vi6x01234567);
9690 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)));
9691 const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x01234567), vk_zero_point);
9692 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
9693 const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(vi6x89ABCDEF);
9694 const __m128i vk6x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)));
9695 const __m128i vxk6x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x89ABCDEF), vk_zero_point);
9696 i6 += 16;
9697
9698
9699 const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
9700 const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
9701 const __m128i vprod6x89ABCDEFlo = _mm_mullo_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF);
9702 const __m128i vprod6x89ABCDEFhi = _mm_mulhi_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF);
9703
9704 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
9705 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
9706 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod6x89ABCDEFlo, vprod6x89ABCDEFhi));
9707 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod6x89ABCDEFlo, vprod6x89ABCDEFhi));
9708
9709 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
9710 const __m128i vxi7x01234567 = _mm_cvtepu8_epi16(vi7x01234567);
9711 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)));
9712 const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x01234567), vk_zero_point);
9713 const __m128i vi7x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i7 + 8));
9714 const __m128i vxi7x89ABCDEF = _mm_cvtepu8_epi16(vi7x89ABCDEF);
9715 const __m128i vk7x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)));
9716 const __m128i vxk7x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x89ABCDEF), vk_zero_point);
9717 i7 += 16;
9718
9719
9720 const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
9721 const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
9722 const __m128i vprod7x89ABCDEFlo = _mm_mullo_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF);
9723 const __m128i vprod7x89ABCDEFhi = _mm_mulhi_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF);
9724
9725 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
9726 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
9727 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod7x89ABCDEFlo, vprod7x89ABCDEFhi));
9728 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod7x89ABCDEFlo, vprod7x89ABCDEFhi));
9729
9730 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
9731 const __m128i vxi8x01234567 = _mm_cvtepu8_epi16(vi8x01234567);
9732 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)));
9733 const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x01234567), vk_zero_point);
9734 const __m128i vi8x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i8 + 8));
9735 const __m128i vxi8x89ABCDEF = _mm_cvtepu8_epi16(vi8x89ABCDEF);
9736 const __m128i vk8x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)));
9737 const __m128i vxk8x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x89ABCDEF), vk_zero_point);
9738 i8 += 16;
9739
9740
9741 const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
9742 const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
9743 const __m128i vprod8x89ABCDEFlo = _mm_mullo_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF);
9744 const __m128i vprod8x89ABCDEFhi = _mm_mulhi_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF);
9745
9746 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
9747 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
9748 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod8x89ABCDEFlo, vprod8x89ABCDEFhi));
9749 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod8x89ABCDEFlo, vprod8x89ABCDEFhi));
9750
9751 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
9752 const __m128i vxi9x01234567 = _mm_cvtepu8_epi16(vi9x01234567);
9753 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t)));
9754 const __m128i vxk9x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk9x01234567), vk_zero_point);
9755 const __m128i vi9x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i9 + 8));
9756 const __m128i vxi9x89ABCDEF = _mm_cvtepu8_epi16(vi9x89ABCDEF);
9757 const __m128i vk9x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 152 * sizeof(uint8_t)));
9758 const __m128i vxk9x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk9x89ABCDEF), vk_zero_point);
9759 i9 += 16;
9760
9761
9762 const __m128i vprod9x01234567lo = _mm_mullo_epi16(vxi9x01234567, vxk9x01234567);
9763 const __m128i vprod9x01234567hi = _mm_mulhi_epi16(vxi9x01234567, vxk9x01234567);
9764 const __m128i vprod9x89ABCDEFlo = _mm_mullo_epi16(vxi9x89ABCDEF, vxk9x89ABCDEF);
9765 const __m128i vprod9x89ABCDEFhi = _mm_mulhi_epi16(vxi9x89ABCDEF, vxk9x89ABCDEF);
9766
9767 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod9x01234567lo, vprod9x01234567hi));
9768 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod9x01234567lo, vprod9x01234567hi));
9769 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod9x89ABCDEFlo, vprod9x89ABCDEFhi));
9770 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod9x89ABCDEFlo, vprod9x89ABCDEFhi));
9771
9772 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
9773 const __m128i vxi10x01234567 = _mm_cvtepu8_epi16(vi10x01234567);
9774 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 160 * sizeof(uint8_t)));
9775 const __m128i vxk10x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk10x01234567), vk_zero_point);
9776 const __m128i vi10x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i10 + 8));
9777 const __m128i vxi10x89ABCDEF = _mm_cvtepu8_epi16(vi10x89ABCDEF);
9778 const __m128i vk10x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 168 * sizeof(uint8_t)));
9779 const __m128i vxk10x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk10x89ABCDEF), vk_zero_point);
9780 i10 += 16;
9781
9782
9783 const __m128i vprod10x01234567lo = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
9784 const __m128i vprod10x01234567hi = _mm_mulhi_epi16(vxi10x01234567, vxk10x01234567);
9785 const __m128i vprod10x89ABCDEFlo = _mm_mullo_epi16(vxi10x89ABCDEF, vxk10x89ABCDEF);
9786 const __m128i vprod10x89ABCDEFhi = _mm_mulhi_epi16(vxi10x89ABCDEF, vxk10x89ABCDEF);
9787
9788 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod10x01234567lo, vprod10x01234567hi));
9789 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod10x01234567lo, vprod10x01234567hi));
9790 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod10x89ABCDEFlo, vprod10x89ABCDEFhi));
9791 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod10x89ABCDEFlo, vprod10x89ABCDEFhi));
9792
9793 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
9794 const __m128i vxi11x01234567 = _mm_cvtepu8_epi16(vi11x01234567);
9795 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 176 * sizeof(uint8_t)));
9796 const __m128i vxk11x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk11x01234567), vk_zero_point);
9797 const __m128i vi11x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i11 + 8));
9798 const __m128i vxi11x89ABCDEF = _mm_cvtepu8_epi16(vi11x89ABCDEF);
9799 const __m128i vk11x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 184 * sizeof(uint8_t)));
9800 const __m128i vxk11x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk11x89ABCDEF), vk_zero_point);
9801 i11 += 16;
9802
9803
9804 const __m128i vprod11x01234567lo = _mm_mullo_epi16(vxi11x01234567, vxk11x01234567);
9805 const __m128i vprod11x01234567hi = _mm_mulhi_epi16(vxi11x01234567, vxk11x01234567);
9806 const __m128i vprod11x89ABCDEFlo = _mm_mullo_epi16(vxi11x89ABCDEF, vxk11x89ABCDEF);
9807 const __m128i vprod11x89ABCDEFhi = _mm_mulhi_epi16(vxi11x89ABCDEF, vxk11x89ABCDEF);
9808
9809 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod11x01234567lo, vprod11x01234567hi));
9810 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod11x01234567lo, vprod11x01234567hi));
9811 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod11x89ABCDEFlo, vprod11x89ABCDEFhi));
9812 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod11x89ABCDEFlo, vprod11x89ABCDEFhi));
9813
9814 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
9815 const __m128i vxi12x01234567 = _mm_cvtepu8_epi16(vi12x01234567);
9816 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 192 * sizeof(uint8_t)));
9817 const __m128i vxk12x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk12x01234567), vk_zero_point);
9818 const __m128i vi12x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i12 + 8));
9819 const __m128i vxi12x89ABCDEF = _mm_cvtepu8_epi16(vi12x89ABCDEF);
9820 const __m128i vk12x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 200 * sizeof(uint8_t)));
9821 const __m128i vxk12x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk12x89ABCDEF), vk_zero_point);
9822 i12 += 16;
9823
9824
9825 const __m128i vprod12x01234567lo = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
9826 const __m128i vprod12x01234567hi = _mm_mulhi_epi16(vxi12x01234567, vxk12x01234567);
9827 const __m128i vprod12x89ABCDEFlo = _mm_mullo_epi16(vxi12x89ABCDEF, vxk12x89ABCDEF);
9828 const __m128i vprod12x89ABCDEFhi = _mm_mulhi_epi16(vxi12x89ABCDEF, vxk12x89ABCDEF);
9829
9830 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod12x01234567lo, vprod12x01234567hi));
9831 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod12x01234567lo, vprod12x01234567hi));
9832 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod12x89ABCDEFlo, vprod12x89ABCDEFhi));
9833 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod12x89ABCDEFlo, vprod12x89ABCDEFhi));
9834
9835 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
9836 const __m128i vxi13x01234567 = _mm_cvtepu8_epi16(vi13x01234567);
9837 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 208 * sizeof(uint8_t)));
9838 const __m128i vxk13x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk13x01234567), vk_zero_point);
9839 const __m128i vi13x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i13 + 8));
9840 const __m128i vxi13x89ABCDEF = _mm_cvtepu8_epi16(vi13x89ABCDEF);
9841 const __m128i vk13x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 216 * sizeof(uint8_t)));
9842 const __m128i vxk13x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk13x89ABCDEF), vk_zero_point);
9843 i13 += 16;
9844
9845
9846 const __m128i vprod13x01234567lo = _mm_mullo_epi16(vxi13x01234567, vxk13x01234567);
9847 const __m128i vprod13x01234567hi = _mm_mulhi_epi16(vxi13x01234567, vxk13x01234567);
9848 const __m128i vprod13x89ABCDEFlo = _mm_mullo_epi16(vxi13x89ABCDEF, vxk13x89ABCDEF);
9849 const __m128i vprod13x89ABCDEFhi = _mm_mulhi_epi16(vxi13x89ABCDEF, vxk13x89ABCDEF);
9850
9851 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod13x01234567lo, vprod13x01234567hi));
9852 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod13x01234567lo, vprod13x01234567hi));
9853 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod13x89ABCDEFlo, vprod13x89ABCDEFhi));
9854 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod13x89ABCDEFlo, vprod13x89ABCDEFhi));
9855
9856 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
9857 const __m128i vxi14x01234567 = _mm_cvtepu8_epi16(vi14x01234567);
9858 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 224 * sizeof(uint8_t)));
9859 const __m128i vxk14x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk14x01234567), vk_zero_point);
9860 const __m128i vi14x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i14 + 8));
9861 const __m128i vxi14x89ABCDEF = _mm_cvtepu8_epi16(vi14x89ABCDEF);
9862 const __m128i vk14x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 232 * sizeof(uint8_t)));
9863 const __m128i vxk14x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk14x89ABCDEF), vk_zero_point);
9864 i14 += 16;
9865
9866
9867 const __m128i vprod14x01234567lo = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
9868 const __m128i vprod14x01234567hi = _mm_mulhi_epi16(vxi14x01234567, vxk14x01234567);
9869 const __m128i vprod14x89ABCDEFlo = _mm_mullo_epi16(vxi14x89ABCDEF, vxk14x89ABCDEF);
9870 const __m128i vprod14x89ABCDEFhi = _mm_mulhi_epi16(vxi14x89ABCDEF, vxk14x89ABCDEF);
9871
9872 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod14x01234567lo, vprod14x01234567hi));
9873 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod14x01234567lo, vprod14x01234567hi));
9874 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod14x89ABCDEFlo, vprod14x89ABCDEFhi));
9875 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod14x89ABCDEFlo, vprod14x89ABCDEFhi));
9876
9877 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
9878 const __m128i vxi15x01234567 = _mm_cvtepu8_epi16(vi15x01234567);
9879 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 240 * sizeof(uint8_t)));
9880 const __m128i vxk15x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk15x01234567), vk_zero_point);
9881 const __m128i vi15x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i15 + 8));
9882 const __m128i vxi15x89ABCDEF = _mm_cvtepu8_epi16(vi15x89ABCDEF);
9883 const __m128i vk15x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 248 * sizeof(uint8_t)));
9884 const __m128i vxk15x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk15x89ABCDEF), vk_zero_point);
9885 i15 += 16;
9886
9887
9888 const __m128i vprod15x01234567lo = _mm_mullo_epi16(vxi15x01234567, vxk15x01234567);
9889 const __m128i vprod15x01234567hi = _mm_mulhi_epi16(vxi15x01234567, vxk15x01234567);
9890 const __m128i vprod15x89ABCDEFlo = _mm_mullo_epi16(vxi15x89ABCDEF, vxk15x89ABCDEF);
9891 const __m128i vprod15x89ABCDEFhi = _mm_mulhi_epi16(vxi15x89ABCDEF, vxk15x89ABCDEF);
9892
9893 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod15x01234567lo, vprod15x01234567hi));
9894 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod15x01234567lo, vprod15x01234567hi));
9895 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod15x89ABCDEFlo, vprod15x89ABCDEFhi));
9896 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod15x89ABCDEFlo, vprod15x89ABCDEFhi));
9897
9898 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
9899 const __m128i vxi16x01234567 = _mm_cvtepu8_epi16(vi16x01234567);
9900 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 256 * sizeof(uint8_t)));
9901 const __m128i vxk16x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk16x01234567), vk_zero_point);
9902 const __m128i vi16x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i16 + 8));
9903 const __m128i vxi16x89ABCDEF = _mm_cvtepu8_epi16(vi16x89ABCDEF);
9904 const __m128i vk16x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 264 * sizeof(uint8_t)));
9905 const __m128i vxk16x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk16x89ABCDEF), vk_zero_point);
9906 i16 += 16;
9907
9908
9909 const __m128i vprod16x01234567lo = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
9910 const __m128i vprod16x01234567hi = _mm_mulhi_epi16(vxi16x01234567, vxk16x01234567);
9911 const __m128i vprod16x89ABCDEFlo = _mm_mullo_epi16(vxi16x89ABCDEF, vxk16x89ABCDEF);
9912 const __m128i vprod16x89ABCDEFhi = _mm_mulhi_epi16(vxi16x89ABCDEF, vxk16x89ABCDEF);
9913
9914 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod16x01234567lo, vprod16x01234567hi));
9915 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod16x01234567lo, vprod16x01234567hi));
9916 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod16x89ABCDEFlo, vprod16x89ABCDEFhi));
9917 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod16x89ABCDEFlo, vprod16x89ABCDEFhi));
9918
9919 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
9920 const __m128i vxi17x01234567 = _mm_cvtepu8_epi16(vi17x01234567);
9921 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 272 * sizeof(uint8_t)));
9922 const __m128i vxk17x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk17x01234567), vk_zero_point);
9923 const __m128i vi17x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i17 + 8));
9924 const __m128i vxi17x89ABCDEF = _mm_cvtepu8_epi16(vi17x89ABCDEF);
9925 const __m128i vk17x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 280 * sizeof(uint8_t)));
9926 const __m128i vxk17x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk17x89ABCDEF), vk_zero_point);
9927 i17 += 16;
9928
9929
9930 const __m128i vprod17x01234567lo = _mm_mullo_epi16(vxi17x01234567, vxk17x01234567);
9931 const __m128i vprod17x01234567hi = _mm_mulhi_epi16(vxi17x01234567, vxk17x01234567);
9932 const __m128i vprod17x89ABCDEFlo = _mm_mullo_epi16(vxi17x89ABCDEF, vxk17x89ABCDEF);
9933 const __m128i vprod17x89ABCDEFhi = _mm_mulhi_epi16(vxi17x89ABCDEF, vxk17x89ABCDEF);
9934
9935 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod17x01234567lo, vprod17x01234567hi));
9936 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod17x01234567lo, vprod17x01234567hi));
9937 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod17x89ABCDEFlo, vprod17x89ABCDEFhi));
9938 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod17x89ABCDEFlo, vprod17x89ABCDEFhi));
9939
9940 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
9941 const __m128i vxi18x01234567 = _mm_cvtepu8_epi16(vi18x01234567);
9942 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 288 * sizeof(uint8_t)));
9943 const __m128i vxk18x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk18x01234567), vk_zero_point);
9944 const __m128i vi18x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i18 + 8));
9945 const __m128i vxi18x89ABCDEF = _mm_cvtepu8_epi16(vi18x89ABCDEF);
9946 const __m128i vk18x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 296 * sizeof(uint8_t)));
9947 const __m128i vxk18x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk18x89ABCDEF), vk_zero_point);
9948 i18 += 16;
9949
9950
9951 const __m128i vprod18x01234567lo = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
9952 const __m128i vprod18x01234567hi = _mm_mulhi_epi16(vxi18x01234567, vxk18x01234567);
9953 const __m128i vprod18x89ABCDEFlo = _mm_mullo_epi16(vxi18x89ABCDEF, vxk18x89ABCDEF);
9954 const __m128i vprod18x89ABCDEFhi = _mm_mulhi_epi16(vxi18x89ABCDEF, vxk18x89ABCDEF);
9955
9956 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod18x01234567lo, vprod18x01234567hi));
9957 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod18x01234567lo, vprod18x01234567hi));
9958 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod18x89ABCDEFlo, vprod18x89ABCDEFhi));
9959 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod18x89ABCDEFlo, vprod18x89ABCDEFhi));
9960
9961 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
9962 const __m128i vxi19x01234567 = _mm_cvtepu8_epi16(vi19x01234567);
9963 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 304 * sizeof(uint8_t)));
9964 const __m128i vxk19x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk19x01234567), vk_zero_point);
9965 const __m128i vi19x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i19 + 8));
9966 const __m128i vxi19x89ABCDEF = _mm_cvtepu8_epi16(vi19x89ABCDEF);
9967 const __m128i vk19x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 312 * sizeof(uint8_t)));
9968 const __m128i vxk19x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk19x89ABCDEF), vk_zero_point);
9969 i19 += 16;
9970
9971
9972 const __m128i vprod19x01234567lo = _mm_mullo_epi16(vxi19x01234567, vxk19x01234567);
9973 const __m128i vprod19x01234567hi = _mm_mulhi_epi16(vxi19x01234567, vxk19x01234567);
9974 const __m128i vprod19x89ABCDEFlo = _mm_mullo_epi16(vxi19x89ABCDEF, vxk19x89ABCDEF);
9975 const __m128i vprod19x89ABCDEFhi = _mm_mulhi_epi16(vxi19x89ABCDEF, vxk19x89ABCDEF);
9976
9977 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod19x01234567lo, vprod19x01234567hi));
9978 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod19x01234567lo, vprod19x01234567hi));
9979 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod19x89ABCDEFlo, vprod19x89ABCDEFhi));
9980 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod19x89ABCDEFlo, vprod19x89ABCDEFhi));
9981
9982 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
9983 const __m128i vxi20x01234567 = _mm_cvtepu8_epi16(vi20x01234567);
9984 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 320 * sizeof(uint8_t)));
9985 const __m128i vxk20x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk20x01234567), vk_zero_point);
9986 const __m128i vi20x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i20 + 8));
9987 const __m128i vxi20x89ABCDEF = _mm_cvtepu8_epi16(vi20x89ABCDEF);
9988 const __m128i vk20x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 328 * sizeof(uint8_t)));
9989 const __m128i vxk20x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk20x89ABCDEF), vk_zero_point);
9990 i20 += 16;
9991
9992
9993 const __m128i vprod20x01234567lo = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
9994 const __m128i vprod20x01234567hi = _mm_mulhi_epi16(vxi20x01234567, vxk20x01234567);
9995 const __m128i vprod20x89ABCDEFlo = _mm_mullo_epi16(vxi20x89ABCDEF, vxk20x89ABCDEF);
9996 const __m128i vprod20x89ABCDEFhi = _mm_mulhi_epi16(vxi20x89ABCDEF, vxk20x89ABCDEF);
9997
9998 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod20x01234567lo, vprod20x01234567hi));
9999 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod20x01234567lo, vprod20x01234567hi));
10000 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod20x89ABCDEFlo, vprod20x89ABCDEFhi));
10001 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod20x89ABCDEFlo, vprod20x89ABCDEFhi));
10002
10003 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
10004 const __m128i vxi21x01234567 = _mm_cvtepu8_epi16(vi21x01234567);
10005 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 336 * sizeof(uint8_t)));
10006 const __m128i vxk21x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk21x01234567), vk_zero_point);
10007 const __m128i vi21x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i21 + 8));
10008 const __m128i vxi21x89ABCDEF = _mm_cvtepu8_epi16(vi21x89ABCDEF);
10009 const __m128i vk21x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 344 * sizeof(uint8_t)));
10010 const __m128i vxk21x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk21x89ABCDEF), vk_zero_point);
10011 i21 += 16;
10012
10013
10014 const __m128i vprod21x01234567lo = _mm_mullo_epi16(vxi21x01234567, vxk21x01234567);
10015 const __m128i vprod21x01234567hi = _mm_mulhi_epi16(vxi21x01234567, vxk21x01234567);
10016 const __m128i vprod21x89ABCDEFlo = _mm_mullo_epi16(vxi21x89ABCDEF, vxk21x89ABCDEF);
10017 const __m128i vprod21x89ABCDEFhi = _mm_mulhi_epi16(vxi21x89ABCDEF, vxk21x89ABCDEF);
10018
10019 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod21x01234567lo, vprod21x01234567hi));
10020 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod21x01234567lo, vprod21x01234567hi));
10021 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod21x89ABCDEFlo, vprod21x89ABCDEFhi));
10022 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod21x89ABCDEFlo, vprod21x89ABCDEFhi));
10023
10024 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
10025 const __m128i vxi22x01234567 = _mm_cvtepu8_epi16(vi22x01234567);
10026 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 352 * sizeof(uint8_t)));
10027 const __m128i vxk22x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk22x01234567), vk_zero_point);
10028 const __m128i vi22x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i22 + 8));
10029 const __m128i vxi22x89ABCDEF = _mm_cvtepu8_epi16(vi22x89ABCDEF);
10030 const __m128i vk22x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 360 * sizeof(uint8_t)));
10031 const __m128i vxk22x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk22x89ABCDEF), vk_zero_point);
10032 i22 += 16;
10033
10034
10035 const __m128i vprod22x01234567lo = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
10036 const __m128i vprod22x01234567hi = _mm_mulhi_epi16(vxi22x01234567, vxk22x01234567);
10037 const __m128i vprod22x89ABCDEFlo = _mm_mullo_epi16(vxi22x89ABCDEF, vxk22x89ABCDEF);
10038 const __m128i vprod22x89ABCDEFhi = _mm_mulhi_epi16(vxi22x89ABCDEF, vxk22x89ABCDEF);
10039
10040 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod22x01234567lo, vprod22x01234567hi));
10041 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod22x01234567lo, vprod22x01234567hi));
10042 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod22x89ABCDEFlo, vprod22x89ABCDEFhi));
10043 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod22x89ABCDEFlo, vprod22x89ABCDEFhi));
10044
10045 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
10046 const __m128i vxi23x01234567 = _mm_cvtepu8_epi16(vi23x01234567);
10047 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 368 * sizeof(uint8_t)));
10048 const __m128i vxk23x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk23x01234567), vk_zero_point);
10049 const __m128i vi23x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i23 + 8));
10050 const __m128i vxi23x89ABCDEF = _mm_cvtepu8_epi16(vi23x89ABCDEF);
10051 const __m128i vk23x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 376 * sizeof(uint8_t)));
10052 const __m128i vxk23x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk23x89ABCDEF), vk_zero_point);
10053 i23 += 16;
10054
10055
10056 const __m128i vprod23x01234567lo = _mm_mullo_epi16(vxi23x01234567, vxk23x01234567);
10057 const __m128i vprod23x01234567hi = _mm_mulhi_epi16(vxi23x01234567, vxk23x01234567);
10058 const __m128i vprod23x89ABCDEFlo = _mm_mullo_epi16(vxi23x89ABCDEF, vxk23x89ABCDEF);
10059 const __m128i vprod23x89ABCDEFhi = _mm_mulhi_epi16(vxi23x89ABCDEF, vxk23x89ABCDEF);
10060
10061 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod23x01234567lo, vprod23x01234567hi));
10062 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod23x01234567lo, vprod23x01234567hi));
10063 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod23x89ABCDEFlo, vprod23x89ABCDEFhi));
10064 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod23x89ABCDEFlo, vprod23x89ABCDEFhi));
10065
10066 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
10067 const __m128i vxi24x01234567 = _mm_cvtepu8_epi16(vi24x01234567);
10068 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 384 * sizeof(uint8_t)));
10069 const __m128i vxk24x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk24x01234567), vk_zero_point);
10070 const __m128i vi24x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i24 + 8));
10071 const __m128i vxi24x89ABCDEF = _mm_cvtepu8_epi16(vi24x89ABCDEF);
10072 const __m128i vk24x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 392 * sizeof(uint8_t)));
10073 const __m128i vxk24x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk24x89ABCDEF), vk_zero_point);
10074 i24 += 16;
10075
10076
10077 const __m128i vprod24x01234567lo = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
10078 const __m128i vprod24x01234567hi = _mm_mulhi_epi16(vxi24x01234567, vxk24x01234567);
10079 const __m128i vprod24x89ABCDEFlo = _mm_mullo_epi16(vxi24x89ABCDEF, vxk24x89ABCDEF);
10080 const __m128i vprod24x89ABCDEFhi = _mm_mulhi_epi16(vxi24x89ABCDEF, vxk24x89ABCDEF);
10081
10082 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod24x01234567lo, vprod24x01234567hi));
10083 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod24x01234567lo, vprod24x01234567hi));
10084 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod24x89ABCDEFlo, vprod24x89ABCDEFhi));
10085 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod24x89ABCDEFlo, vprod24x89ABCDEFhi));
10086
10087 w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 400 * sizeof(uint8_t));
10088
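      /* Requantize: convert the accumulators to fp32, scale, clamp against
         output_max - output_zero_point, round back to int32, add the zero point
         while packing to int16, then pack to uint8 and apply output_min. */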
10089 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
10090 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
10091 __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
10092 __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
10093
10094 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
10095 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
10096 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
10097 vscaled89AB = _mm_mul_ps(vscaled89AB, vscale);
10098 vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscale);
10099
10100 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
10101 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
10102 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
10103 vscaled89AB = _mm_min_ps(vscaled89AB, voutput_max_less_zero_point);
10104 vscaledCDEF = _mm_min_ps(vscaledCDEF, voutput_max_less_zero_point);
10105
10106 vacc0123 = _mm_cvtps_epi32(vscaled0123);
10107 vacc4567 = _mm_cvtps_epi32(vscaled4567);
10108 vacc89AB = _mm_cvtps_epi32(vscaled89AB);
10109 vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
10110
10111 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
10112 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
10113 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
10114
10115 __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
10116
10117 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
10118 vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
10119
10120 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
10121 output += 16;
10122 }
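    /* Remainder channels (1-15): processed 8 at a time; k still advances by the
       16-channel tile stride per tap. */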
10123 if XNN_UNLIKELY(c != 0) {
10124 const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
10125 do {
10126 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
10127 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
10128
10129
10130 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
10131 const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(vi0x01234567);
10132 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k);
10133 const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x01234567), vk_zero_point);
10134 i0 += 8;
10135
10136
10137 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
10138 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
10139
10140 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
10141 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
10142
10143 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
10144 const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(vi1x01234567);
10145 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 16));
10146 const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x01234567), vk_zero_point);
10147 i1 += 8;
10148
10149
10150 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
10151 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
10152
10153 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
10154 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
10155
10156 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
10157 const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(vi2x01234567);
10158 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 32));
10159 const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x01234567), vk_zero_point);
10160 i2 += 8;
10161
10162
10163 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
10164 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
10165
10166 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
10167 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
10168
10169 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
10170 const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(vi3x01234567);
10171 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) (k + 48));
10172 const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x01234567), vk_zero_point);
10173 i3 += 8;
10174
10175
10176 const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
10177 const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
10178
10179 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
10180 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
10181
10182 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
10183 const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(vi4x01234567);
10184 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) (k + 64));
10185 const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x01234567), vk_zero_point);
10186 i4 += 8;
10187
10188
10189 const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
10190 const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
10191
10192 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
10193 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
10194
10195 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
10196 const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(vi5x01234567);
10197 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) (k + 80));
10198 const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x01234567), vk_zero_point);
10199 i5 += 8;
10200
10201
10202 const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
10203 const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
10204
10205 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
10206 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
10207
10208 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
10209 const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(vi6x01234567);
10210 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) (k + 96));
10211 const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x01234567), vk_zero_point);
10212 i6 += 8;
10213
10214
10215 const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
10216 const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
10217
10218 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
10219 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
10220
10221 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
10222 const __m128i vxi7x01234567 = _mm_cvtepu8_epi16(vi7x01234567);
10223 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) (k + 112));
10224 const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x01234567), vk_zero_point);
10225 i7 += 8;
10226
10227
10228 const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
10229 const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
10230
10231 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
10232 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
10233
10234 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
10235 const __m128i vxi8x01234567 = _mm_cvtepu8_epi16(vi8x01234567);
10236 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) (k + 128));
10237 const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x01234567), vk_zero_point);
10238 i8 += 8;
10239
10240
10241 const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
10242 const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
10243
10244 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
10245 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
10246
10247 const __m128i vi9x01234567 = _mm_loadl_epi64((const __m128i*) i9);
10248 const __m128i vxi9x01234567 = _mm_cvtepu8_epi16(vi9x01234567);
10249 const __m128i vk9x01234567 = _mm_loadl_epi64((const __m128i*) (k + 144));
10250 const __m128i vxk9x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk9x01234567), vk_zero_point);
10251 i9 += 8;
10252
10253
10254 const __m128i vprod9x01234567lo = _mm_mullo_epi16(vxi9x01234567, vxk9x01234567);
10255 const __m128i vprod9x01234567hi = _mm_mulhi_epi16(vxi9x01234567, vxk9x01234567);
10256
10257 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod9x01234567lo, vprod9x01234567hi));
10258 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod9x01234567lo, vprod9x01234567hi));
10259
10260 const __m128i vi10x01234567 = _mm_loadl_epi64((const __m128i*) i10);
10261 const __m128i vxi10x01234567 = _mm_cvtepu8_epi16(vi10x01234567);
10262 const __m128i vk10x01234567 = _mm_loadl_epi64((const __m128i*) (k + 160));
10263 const __m128i vxk10x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk10x01234567), vk_zero_point);
10264 i10 += 8;
10265
10266
10267 const __m128i vprod10x01234567lo = _mm_mullo_epi16(vxi10x01234567, vxk10x01234567);
10268 const __m128i vprod10x01234567hi = _mm_mulhi_epi16(vxi10x01234567, vxk10x01234567);
10269
10270 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod10x01234567lo, vprod10x01234567hi));
10271 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod10x01234567lo, vprod10x01234567hi));
10272
10273 const __m128i vi11x01234567 = _mm_loadl_epi64((const __m128i*) i11);
10274 const __m128i vxi11x01234567 = _mm_cvtepu8_epi16(vi11x01234567);
10275 const __m128i vk11x01234567 = _mm_loadl_epi64((const __m128i*) (k + 176));
10276 const __m128i vxk11x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk11x01234567), vk_zero_point);
10277 i11 += 8;
10278
10279
10280 const __m128i vprod11x01234567lo = _mm_mullo_epi16(vxi11x01234567, vxk11x01234567);
10281 const __m128i vprod11x01234567hi = _mm_mulhi_epi16(vxi11x01234567, vxk11x01234567);
10282
10283 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod11x01234567lo, vprod11x01234567hi));
10284 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod11x01234567lo, vprod11x01234567hi));
10285
10286 const __m128i vi12x01234567 = _mm_loadl_epi64((const __m128i*) i12);
10287 const __m128i vxi12x01234567 = _mm_cvtepu8_epi16(vi12x01234567);
10288 const __m128i vk12x01234567 = _mm_loadl_epi64((const __m128i*) (k + 192));
10289 const __m128i vxk12x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk12x01234567), vk_zero_point);
10290 i12 += 8;
10291
10292
10293 const __m128i vprod12x01234567lo = _mm_mullo_epi16(vxi12x01234567, vxk12x01234567);
10294 const __m128i vprod12x01234567hi = _mm_mulhi_epi16(vxi12x01234567, vxk12x01234567);
10295
10296 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod12x01234567lo, vprod12x01234567hi));
10297 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod12x01234567lo, vprod12x01234567hi));
10298
10299 const __m128i vi13x01234567 = _mm_loadl_epi64((const __m128i*) i13);
10300 const __m128i vxi13x01234567 = _mm_cvtepu8_epi16(vi13x01234567);
10301 const __m128i vk13x01234567 = _mm_loadl_epi64((const __m128i*) (k + 208));
10302 const __m128i vxk13x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk13x01234567), vk_zero_point);
10303 i13 += 8;
10304
10305
10306 const __m128i vprod13x01234567lo = _mm_mullo_epi16(vxi13x01234567, vxk13x01234567);
10307 const __m128i vprod13x01234567hi = _mm_mulhi_epi16(vxi13x01234567, vxk13x01234567);
10308
10309 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod13x01234567lo, vprod13x01234567hi));
10310 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod13x01234567lo, vprod13x01234567hi));
10311
10312 const __m128i vi14x01234567 = _mm_loadl_epi64((const __m128i*) i14);
10313 const __m128i vxi14x01234567 = _mm_cvtepu8_epi16(vi14x01234567);
10314 const __m128i vk14x01234567 = _mm_loadl_epi64((const __m128i*) (k + 224));
10315 const __m128i vxk14x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk14x01234567), vk_zero_point);
10316 i14 += 8;
10317
10318
10319 const __m128i vprod14x01234567lo = _mm_mullo_epi16(vxi14x01234567, vxk14x01234567);
10320 const __m128i vprod14x01234567hi = _mm_mulhi_epi16(vxi14x01234567, vxk14x01234567);
10321
10322 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod14x01234567lo, vprod14x01234567hi));
10323 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod14x01234567lo, vprod14x01234567hi));
10324
10325 const __m128i vi15x01234567 = _mm_loadl_epi64((const __m128i*) i15);
10326 const __m128i vxi15x01234567 = _mm_cvtepu8_epi16(vi15x01234567);
10327 const __m128i vk15x01234567 = _mm_loadl_epi64((const __m128i*) (k + 240));
10328 const __m128i vxk15x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk15x01234567), vk_zero_point);
10329 i15 += 8;
10330
10331
10332 const __m128i vprod15x01234567lo = _mm_mullo_epi16(vxi15x01234567, vxk15x01234567);
10333 const __m128i vprod15x01234567hi = _mm_mulhi_epi16(vxi15x01234567, vxk15x01234567);
10334
10335 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod15x01234567lo, vprod15x01234567hi));
10336 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod15x01234567lo, vprod15x01234567hi));
10337
10338 const __m128i vi16x01234567 = _mm_loadl_epi64((const __m128i*) i16);
10339 const __m128i vxi16x01234567 = _mm_cvtepu8_epi16(vi16x01234567);
10340 const __m128i vk16x01234567 = _mm_loadl_epi64((const __m128i*) (k + 256));
10341 const __m128i vxk16x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk16x01234567), vk_zero_point);
10342 i16 += 8;
10343
10344
10345 const __m128i vprod16x01234567lo = _mm_mullo_epi16(vxi16x01234567, vxk16x01234567);
10346 const __m128i vprod16x01234567hi = _mm_mulhi_epi16(vxi16x01234567, vxk16x01234567);
10347
10348 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod16x01234567lo, vprod16x01234567hi));
10349 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod16x01234567lo, vprod16x01234567hi));
10350
10351 const __m128i vi17x01234567 = _mm_loadl_epi64((const __m128i*) i17);
10352 const __m128i vxi17x01234567 = _mm_cvtepu8_epi16(vi17x01234567);
10353 const __m128i vk17x01234567 = _mm_loadl_epi64((const __m128i*) (k + 272));
10354 const __m128i vxk17x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk17x01234567), vk_zero_point);
10355 i17 += 8;
10356
10357
10358 const __m128i vprod17x01234567lo = _mm_mullo_epi16(vxi17x01234567, vxk17x01234567);
10359 const __m128i vprod17x01234567hi = _mm_mulhi_epi16(vxi17x01234567, vxk17x01234567);
10360
10361 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod17x01234567lo, vprod17x01234567hi));
10362 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod17x01234567lo, vprod17x01234567hi));
10363
10364 const __m128i vi18x01234567 = _mm_loadl_epi64((const __m128i*) i18);
10365 const __m128i vxi18x01234567 = _mm_cvtepu8_epi16(vi18x01234567);
10366 const __m128i vk18x01234567 = _mm_loadl_epi64((const __m128i*) (k + 288));
10367 const __m128i vxk18x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk18x01234567), vk_zero_point);
10368 i18 += 8;
10369
10370
10371 const __m128i vprod18x01234567lo = _mm_mullo_epi16(vxi18x01234567, vxk18x01234567);
10372 const __m128i vprod18x01234567hi = _mm_mulhi_epi16(vxi18x01234567, vxk18x01234567);
10373
10374 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod18x01234567lo, vprod18x01234567hi));
10375 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod18x01234567lo, vprod18x01234567hi));
10376
10377 const __m128i vi19x01234567 = _mm_loadl_epi64((const __m128i*) i19);
10378 const __m128i vxi19x01234567 = _mm_cvtepu8_epi16(vi19x01234567);
10379 const __m128i vk19x01234567 = _mm_loadl_epi64((const __m128i*) (k + 304));
10380 const __m128i vxk19x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk19x01234567), vk_zero_point);
10381 i19 += 8;
10382
10383
10384 const __m128i vprod19x01234567lo = _mm_mullo_epi16(vxi19x01234567, vxk19x01234567);
10385 const __m128i vprod19x01234567hi = _mm_mulhi_epi16(vxi19x01234567, vxk19x01234567);
10386
10387 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod19x01234567lo, vprod19x01234567hi));
10388 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod19x01234567lo, vprod19x01234567hi));
10389
10390 const __m128i vi20x01234567 = _mm_loadl_epi64((const __m128i*) i20);
10391 const __m128i vxi20x01234567 = _mm_cvtepu8_epi16(vi20x01234567);
10392 const __m128i vk20x01234567 = _mm_loadl_epi64((const __m128i*) (k + 320));
10393 const __m128i vxk20x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk20x01234567), vk_zero_point);
10394 i20 += 8;
10395
10396
10397 const __m128i vprod20x01234567lo = _mm_mullo_epi16(vxi20x01234567, vxk20x01234567);
10398 const __m128i vprod20x01234567hi = _mm_mulhi_epi16(vxi20x01234567, vxk20x01234567);
10399
10400 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod20x01234567lo, vprod20x01234567hi));
10401 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod20x01234567lo, vprod20x01234567hi));
10402
10403 const __m128i vi21x01234567 = _mm_loadl_epi64((const __m128i*) i21);
10404 const __m128i vxi21x01234567 = _mm_cvtepu8_epi16(vi21x01234567);
10405 const __m128i vk21x01234567 = _mm_loadl_epi64((const __m128i*) (k + 336));
10406 const __m128i vxk21x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk21x01234567), vk_zero_point);
10407 i21 += 8;
10408
10409
10410 const __m128i vprod21x01234567lo = _mm_mullo_epi16(vxi21x01234567, vxk21x01234567);
10411 const __m128i vprod21x01234567hi = _mm_mulhi_epi16(vxi21x01234567, vxk21x01234567);
10412
10413 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod21x01234567lo, vprod21x01234567hi));
10414 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod21x01234567lo, vprod21x01234567hi));
10415
10416 const __m128i vi22x01234567 = _mm_loadl_epi64((const __m128i*) i22);
10417 const __m128i vxi22x01234567 = _mm_cvtepu8_epi16(vi22x01234567);
10418 const __m128i vk22x01234567 = _mm_loadl_epi64((const __m128i*) (k + 352));
10419 const __m128i vxk22x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk22x01234567), vk_zero_point);
10420 i22 += 8;
10421
10422
10423 const __m128i vprod22x01234567lo = _mm_mullo_epi16(vxi22x01234567, vxk22x01234567);
10424 const __m128i vprod22x01234567hi = _mm_mulhi_epi16(vxi22x01234567, vxk22x01234567);
10425
10426 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod22x01234567lo, vprod22x01234567hi));
10427 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod22x01234567lo, vprod22x01234567hi));
10428
10429 const __m128i vi23x01234567 = _mm_loadl_epi64((const __m128i*) i23);
10430 const __m128i vxi23x01234567 = _mm_cvtepu8_epi16(vi23x01234567);
10431 const __m128i vk23x01234567 = _mm_loadl_epi64((const __m128i*) (k + 368));
10432 const __m128i vxk23x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk23x01234567), vk_zero_point);
10433 i23 += 8;
10434
10435
10436 const __m128i vprod23x01234567lo = _mm_mullo_epi16(vxi23x01234567, vxk23x01234567);
10437 const __m128i vprod23x01234567hi = _mm_mulhi_epi16(vxi23x01234567, vxk23x01234567);
10438
10439 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod23x01234567lo, vprod23x01234567hi));
10440 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod23x01234567lo, vprod23x01234567hi));
10441
10442 const __m128i vi24x01234567 = _mm_loadl_epi64((const __m128i*) i24);
10443 const __m128i vxi24x01234567 = _mm_cvtepu8_epi16(vi24x01234567);
10444 const __m128i vk24x01234567 = _mm_loadl_epi64((const __m128i*) (k + 384));
10445 const __m128i vxk24x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk24x01234567), vk_zero_point);
10446 i24 += 8;
10447
10448
10449 const __m128i vprod24x01234567lo = _mm_mullo_epi16(vxi24x01234567, vxk24x01234567);
10450 const __m128i vprod24x01234567hi = _mm_mulhi_epi16(vxi24x01234567, vxk24x01234567);
10451
10452 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod24x01234567lo, vprod24x01234567hi));
10453 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod24x01234567lo, vprod24x01234567hi));
10454
10455 k += 8;
10456
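      // Requantize the 8-channel tail: convert the int32 accumulators to float, apply the scale,
      // clamp against (output_max - output_zero_point), and round back to int32 before packing.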
10457 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
10458 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
10459
10460 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
10461 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
10462 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
10463
10464 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
10465 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
10466 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
10467
10468 vacc0123 = _mm_cvtps_epi32(vscaled0123);
10469 vacc4567 = _mm_cvtps_epi32(vscaled4567);
10470
10471 w = (const void*) ((const int32_t*) w + 8);
10472
10473 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
10474 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
10475
10476 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
10477
10478 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
10479
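      // Store the requantized bytes: a full 8-byte store while at least 8 channels remain,
      // otherwise 4/2/1-byte partial stores, shifting the vector to expose the next lanes.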
10480 if XNN_LIKELY(c >= 8) {
10481 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
10482 output += 8;
10483 c -= 8;
10484 } else {
10485 if (c & 4) {
10486 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
10487 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
10488 output += 4;
10489 }
10490 if (c & 2) {
10491 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
10492 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
10493 output += 2;
10494 }
10495 if (c & 1) {
10496 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
10497 output += 1;
10498 }
10499 c = 0;
10500 }
10501 } while (c != 0);
10502 }
10503
10504 output = (uint8_t*) ((uintptr_t) output + output_increment);
10505 } while (--output_width != 0);
10506 }
10507
10508 void xnn_qu8_dwconv_minmax_fp32_ukernel_up16x9__avx_mul16(
10509 size_t channels,
10510 size_t output_width,
10511 const uint8_t** input,
10512 const void* weights,
10513 uint8_t* output,
10514 size_t input_stride,
10515 size_t output_increment,
10516 size_t input_offset,
10517 const uint8_t* zero,
10518 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
10519 {
10520 assert(channels != 0);
10521 assert(output_width != 0);
10522
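  // Each iteration of the outer loop computes one output pixel from 9 input rows taken from the
  // indirection buffer; rows that point at the zero (padding) buffer do not get input_offset applied.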
10523 do {
10524 const uint8_t* i0 = input[0];
10525 assert(i0 != NULL);
10526 if XNN_UNPREDICTABLE(i0 != zero) {
10527 i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
10528 }
10529 const uint8_t* i1 = input[1];
10530 assert(i1 != NULL);
10531 if XNN_UNPREDICTABLE(i1 != zero) {
10532 i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
10533 }
10534 const uint8_t* i2 = input[2];
10535 assert(i2 != NULL);
10536 if XNN_UNPREDICTABLE(i2 != zero) {
10537 i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
10538 }
10539 const uint8_t* i3 = input[3];
10540 assert(i3 != NULL);
10541 if XNN_UNPREDICTABLE(i3 != zero) {
10542 i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
10543 }
10544 const uint8_t* i4 = input[4];
10545 assert(i4 != NULL);
10546 if XNN_UNPREDICTABLE(i4 != zero) {
10547 i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
10548 }
10549 const uint8_t* i5 = input[5];
10550 assert(i5 != NULL);
10551 if XNN_UNPREDICTABLE(i5 != zero) {
10552 i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
10553 }
10554 const uint8_t* i6 = input[6];
10555 assert(i6 != NULL);
10556 if XNN_UNPREDICTABLE(i6 != zero) {
10557 i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
10558 }
10559 const uint8_t* i7 = input[7];
10560 assert(i7 != NULL);
10561 if XNN_UNPREDICTABLE(i7 != zero) {
10562 i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
10563 }
10564 const uint8_t* i8 = input[8];
10565 assert(i8 != NULL);
10566 if XNN_UNPREDICTABLE(i8 != zero) {
10567 i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
10568 }
10569 input = (const uint8_t**) ((uintptr_t) input + input_stride);
10570
10571 size_t c = channels;
10572 const void* w = weights;
10573 const __m128i vk_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
10574 for (; c >= 16; c -= 16) {
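      // Main loop: 16 channels per iteration. Load 16 int32 biases, then for each of the 9 taps
      // widen the uint8 inputs and the zero-point-adjusted weights to int16 and accumulate the
      // 32-bit products via mullo/mulhi + unpack.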
10575 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
10576 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
10577 __m128i vacc89AB = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 8));
10578 __m128i vaccCDEF = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 12));
10579
10580
10581 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
10582 const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(vi0x01234567);
10583 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 0 * sizeof(uint8_t)));
10584 const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x01234567), vk_zero_point);
10585 const __m128i vi0x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i0 + 8));
10586 const __m128i vxi0x89ABCDEF = _mm_cvtepu8_epi16(vi0x89ABCDEF);
10587 const __m128i vk0x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 8 * sizeof(uint8_t)));
10588 const __m128i vxk0x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x89ABCDEF), vk_zero_point);
10589 i0 += 16;
10590
10591
10592 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
10593 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
10594 const __m128i vprod0x89ABCDEFlo = _mm_mullo_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
10595 const __m128i vprod0x89ABCDEFhi = _mm_mulhi_epi16(vxi0x89ABCDEF, vxk0x89ABCDEF);
10596
10597 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
10598 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
10599 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod0x89ABCDEFlo, vprod0x89ABCDEFhi));
10600 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod0x89ABCDEFlo, vprod0x89ABCDEFhi));
10601
10602 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
10603 const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(vi1x01234567);
10604 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 16 * sizeof(uint8_t)));
10605 const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x01234567), vk_zero_point);
10606 const __m128i vi1x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i1 + 8));
10607 const __m128i vxi1x89ABCDEF = _mm_cvtepu8_epi16(vi1x89ABCDEF);
10608 const __m128i vk1x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 24 * sizeof(uint8_t)));
10609 const __m128i vxk1x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x89ABCDEF), vk_zero_point);
10610 i1 += 16;
10611
10612
10613 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
10614 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
10615 const __m128i vprod1x89ABCDEFlo = _mm_mullo_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF);
10616 const __m128i vprod1x89ABCDEFhi = _mm_mulhi_epi16(vxi1x89ABCDEF, vxk1x89ABCDEF);
10617
10618 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
10619 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
10620 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod1x89ABCDEFlo, vprod1x89ABCDEFhi));
10621 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod1x89ABCDEFlo, vprod1x89ABCDEFhi));
10622
10623 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
10624 const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(vi2x01234567);
10625 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 32 * sizeof(uint8_t)));
10626 const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x01234567), vk_zero_point);
10627 const __m128i vi2x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i2 + 8));
10628 const __m128i vxi2x89ABCDEF = _mm_cvtepu8_epi16(vi2x89ABCDEF);
10629 const __m128i vk2x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 40 * sizeof(uint8_t)));
10630 const __m128i vxk2x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x89ABCDEF), vk_zero_point);
10631 i2 += 16;
10632
10633
10634 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
10635 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
10636 const __m128i vprod2x89ABCDEFlo = _mm_mullo_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
10637 const __m128i vprod2x89ABCDEFhi = _mm_mulhi_epi16(vxi2x89ABCDEF, vxk2x89ABCDEF);
10638
10639 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
10640 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
10641 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod2x89ABCDEFlo, vprod2x89ABCDEFhi));
10642 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod2x89ABCDEFlo, vprod2x89ABCDEFhi));
10643
10644 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
10645 const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(vi3x01234567);
10646 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 48 * sizeof(uint8_t)));
10647 const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x01234567), vk_zero_point);
10648 const __m128i vi3x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i3 + 8));
10649 const __m128i vxi3x89ABCDEF = _mm_cvtepu8_epi16(vi3x89ABCDEF);
10650 const __m128i vk3x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 56 * sizeof(uint8_t)));
10651 const __m128i vxk3x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x89ABCDEF), vk_zero_point);
10652 i3 += 16;
10653
10654
10655 const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
10656 const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
10657 const __m128i vprod3x89ABCDEFlo = _mm_mullo_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF);
10658 const __m128i vprod3x89ABCDEFhi = _mm_mulhi_epi16(vxi3x89ABCDEF, vxk3x89ABCDEF);
10659
10660 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
10661 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
10662 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod3x89ABCDEFlo, vprod3x89ABCDEFhi));
10663 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod3x89ABCDEFlo, vprod3x89ABCDEFhi));
10664
10665 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
10666 const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(vi4x01234567);
10667 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 64 * sizeof(uint8_t)));
10668 const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x01234567), vk_zero_point);
10669 const __m128i vi4x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i4 + 8));
10670 const __m128i vxi4x89ABCDEF = _mm_cvtepu8_epi16(vi4x89ABCDEF);
10671 const __m128i vk4x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 72 * sizeof(uint8_t)));
10672 const __m128i vxk4x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x89ABCDEF), vk_zero_point);
10673 i4 += 16;
10674
10675
10676 const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
10677 const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
10678 const __m128i vprod4x89ABCDEFlo = _mm_mullo_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF);
10679 const __m128i vprod4x89ABCDEFhi = _mm_mulhi_epi16(vxi4x89ABCDEF, vxk4x89ABCDEF);
10680
10681 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
10682 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
10683 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod4x89ABCDEFlo, vprod4x89ABCDEFhi));
10684 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod4x89ABCDEFlo, vprod4x89ABCDEFhi));
10685
10686 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
10687 const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(vi5x01234567);
10688 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 80 * sizeof(uint8_t)));
10689 const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x01234567), vk_zero_point);
10690 const __m128i vi5x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i5 + 8));
10691 const __m128i vxi5x89ABCDEF = _mm_cvtepu8_epi16(vi5x89ABCDEF);
10692 const __m128i vk5x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 88 * sizeof(uint8_t)));
10693 const __m128i vxk5x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x89ABCDEF), vk_zero_point);
10694 i5 += 16;
10695
10696
10697 const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
10698 const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
10699 const __m128i vprod5x89ABCDEFlo = _mm_mullo_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF);
10700 const __m128i vprod5x89ABCDEFhi = _mm_mulhi_epi16(vxi5x89ABCDEF, vxk5x89ABCDEF);
10701
10702 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
10703 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
10704 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod5x89ABCDEFlo, vprod5x89ABCDEFhi));
10705 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod5x89ABCDEFlo, vprod5x89ABCDEFhi));
10706
10707 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
10708 const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(vi6x01234567);
10709 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 96 * sizeof(uint8_t)));
10710 const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x01234567), vk_zero_point);
10711 const __m128i vi6x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i6 + 8));
10712 const __m128i vxi6x89ABCDEF = _mm_cvtepu8_epi16(vi6x89ABCDEF);
10713 const __m128i vk6x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 104 * sizeof(uint8_t)));
10714 const __m128i vxk6x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x89ABCDEF), vk_zero_point);
10715 i6 += 16;
10716
10717
10718 const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
10719 const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
10720 const __m128i vprod6x89ABCDEFlo = _mm_mullo_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF);
10721 const __m128i vprod6x89ABCDEFhi = _mm_mulhi_epi16(vxi6x89ABCDEF, vxk6x89ABCDEF);
10722
10723 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
10724 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
10725 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod6x89ABCDEFlo, vprod6x89ABCDEFhi));
10726 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod6x89ABCDEFlo, vprod6x89ABCDEFhi));
10727
10728 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
10729 const __m128i vxi7x01234567 = _mm_cvtepu8_epi16(vi7x01234567);
10730 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 112 * sizeof(uint8_t)));
10731 const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x01234567), vk_zero_point);
10732 const __m128i vi7x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i7 + 8));
10733 const __m128i vxi7x89ABCDEF = _mm_cvtepu8_epi16(vi7x89ABCDEF);
10734 const __m128i vk7x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 120 * sizeof(uint8_t)));
10735 const __m128i vxk7x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x89ABCDEF), vk_zero_point);
10736 i7 += 16;
10737
10738
10739 const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
10740 const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
10741 const __m128i vprod7x89ABCDEFlo = _mm_mullo_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF);
10742 const __m128i vprod7x89ABCDEFhi = _mm_mulhi_epi16(vxi7x89ABCDEF, vxk7x89ABCDEF);
10743
10744 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
10745 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
10746 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod7x89ABCDEFlo, vprod7x89ABCDEFhi));
10747 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod7x89ABCDEFlo, vprod7x89ABCDEFhi));
10748
10749 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
10750 const __m128i vxi8x01234567 = _mm_cvtepu8_epi16(vi8x01234567);
10751 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 128 * sizeof(uint8_t)));
10752 const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x01234567), vk_zero_point);
10753 const __m128i vi8x89ABCDEF = _mm_loadl_epi64((const __m128i*) (i8 + 8));
10754 const __m128i vxi8x89ABCDEF = _mm_cvtepu8_epi16(vi8x89ABCDEF);
10755 const __m128i vk8x89ABCDEF = _mm_loadl_epi64((const __m128i*) ((uintptr_t) w + 16 * sizeof(int32_t) + 136 * sizeof(uint8_t)));
10756 const __m128i vxk8x89ABCDEF = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x89ABCDEF), vk_zero_point);
10757 i8 += 16;
10758
10759
10760 const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
10761 const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
10762 const __m128i vprod8x89ABCDEFlo = _mm_mullo_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF);
10763 const __m128i vprod8x89ABCDEFhi = _mm_mulhi_epi16(vxi8x89ABCDEF, vxk8x89ABCDEF);
10764
10765 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
10766 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
10767 vacc89AB = _mm_add_epi32(vacc89AB, _mm_unpacklo_epi16(vprod8x89ABCDEFlo, vprod8x89ABCDEFhi));
10768 vaccCDEF = _mm_add_epi32(vaccCDEF, _mm_unpackhi_epi16(vprod8x89ABCDEFlo, vprod8x89ABCDEFhi));
10769
10770 w = (const void*) ((uintptr_t) w + 16 * sizeof(int32_t) + 144 * sizeof(uint8_t));
10771
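      // Requantize 16 channels: int32 -> float, scale, clamp at (output_max - output_zero_point),
      // round back to int32, pack to int16 and add the output zero point, pack to uint8, clamp at output_min.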
10772 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
10773 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
10774 __m128 vscaled89AB = _mm_cvtepi32_ps(vacc89AB);
10775 __m128 vscaledCDEF = _mm_cvtepi32_ps(vaccCDEF);
10776
10777 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
10778 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
10779 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
10780 vscaled89AB = _mm_mul_ps(vscaled89AB, vscale);
10781 vscaledCDEF = _mm_mul_ps(vscaledCDEF, vscale);
10782
10783 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
10784 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
10785 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
10786 vscaled89AB = _mm_min_ps(vscaled89AB, voutput_max_less_zero_point);
10787 vscaledCDEF = _mm_min_ps(vscaledCDEF, voutput_max_less_zero_point);
10788
10789 vacc0123 = _mm_cvtps_epi32(vscaled0123);
10790 vacc4567 = _mm_cvtps_epi32(vscaled4567);
10791 vacc89AB = _mm_cvtps_epi32(vscaled89AB);
10792 vaccCDEF = _mm_cvtps_epi32(vscaledCDEF);
10793
10794 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
10795 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
10796 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
10797
10798 __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
10799
10800 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
10801 vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
10802
10803 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
10804 output += 16;
10805 }
10806 if XNN_UNLIKELY(c != 0) {
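      // Remainder path: handle the last 1-15 channels 8 at a time. The tail weights start right
      // after the 16 int32 biases and keep the 16-channel stride between taps.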
10807 const uint8_t* k = (const uint8_t*) ((const int32_t*) w + 16);
10808 do {
10809 __m128i vacc0123 = _mm_loadu_si128((const __m128i*) w);
10810 __m128i vacc4567 = _mm_loadu_si128((const __m128i*) ((const int32_t*) w + 4));
10811
10812
10813 const __m128i vi0x01234567 = _mm_loadl_epi64((const __m128i*) i0);
10814 const __m128i vxi0x01234567 = _mm_cvtepu8_epi16(vi0x01234567);
10815 const __m128i vk0x01234567 = _mm_loadl_epi64((const __m128i*) k);
10816 const __m128i vxk0x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk0x01234567), vk_zero_point);
10817 i0 += 8;
10818
10819
10820 const __m128i vprod0x01234567lo = _mm_mullo_epi16(vxi0x01234567, vxk0x01234567);
10821 const __m128i vprod0x01234567hi = _mm_mulhi_epi16(vxi0x01234567, vxk0x01234567);
10822
10823 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod0x01234567lo, vprod0x01234567hi));
10824 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod0x01234567lo, vprod0x01234567hi));
10825
10826 const __m128i vi1x01234567 = _mm_loadl_epi64((const __m128i*) i1);
10827 const __m128i vxi1x01234567 = _mm_cvtepu8_epi16(vi1x01234567);
10828 const __m128i vk1x01234567 = _mm_loadl_epi64((const __m128i*) (k + 16));
10829 const __m128i vxk1x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk1x01234567), vk_zero_point);
10830 i1 += 8;
10831
10832
10833 const __m128i vprod1x01234567lo = _mm_mullo_epi16(vxi1x01234567, vxk1x01234567);
10834 const __m128i vprod1x01234567hi = _mm_mulhi_epi16(vxi1x01234567, vxk1x01234567);
10835
10836 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod1x01234567lo, vprod1x01234567hi));
10837 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod1x01234567lo, vprod1x01234567hi));
10838
10839 const __m128i vi2x01234567 = _mm_loadl_epi64((const __m128i*) i2);
10840 const __m128i vxi2x01234567 = _mm_cvtepu8_epi16(vi2x01234567);
10841 const __m128i vk2x01234567 = _mm_loadl_epi64((const __m128i*) (k + 32));
10842 const __m128i vxk2x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk2x01234567), vk_zero_point);
10843 i2 += 8;
10844
10845
10846 const __m128i vprod2x01234567lo = _mm_mullo_epi16(vxi2x01234567, vxk2x01234567);
10847 const __m128i vprod2x01234567hi = _mm_mulhi_epi16(vxi2x01234567, vxk2x01234567);
10848
10849 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod2x01234567lo, vprod2x01234567hi));
10850 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod2x01234567lo, vprod2x01234567hi));
10851
10852 const __m128i vi3x01234567 = _mm_loadl_epi64((const __m128i*) i3);
10853 const __m128i vxi3x01234567 = _mm_cvtepu8_epi16(vi3x01234567);
10854 const __m128i vk3x01234567 = _mm_loadl_epi64((const __m128i*) (k + 48));
10855 const __m128i vxk3x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk3x01234567), vk_zero_point);
10856 i3 += 8;
10857
10858
10859 const __m128i vprod3x01234567lo = _mm_mullo_epi16(vxi3x01234567, vxk3x01234567);
10860 const __m128i vprod3x01234567hi = _mm_mulhi_epi16(vxi3x01234567, vxk3x01234567);
10861
10862 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod3x01234567lo, vprod3x01234567hi));
10863 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod3x01234567lo, vprod3x01234567hi));
10864
10865 const __m128i vi4x01234567 = _mm_loadl_epi64((const __m128i*) i4);
10866 const __m128i vxi4x01234567 = _mm_cvtepu8_epi16(vi4x01234567);
10867 const __m128i vk4x01234567 = _mm_loadl_epi64((const __m128i*) (k + 64));
10868 const __m128i vxk4x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk4x01234567), vk_zero_point);
10869 i4 += 8;
10870
10871
10872 const __m128i vprod4x01234567lo = _mm_mullo_epi16(vxi4x01234567, vxk4x01234567);
10873 const __m128i vprod4x01234567hi = _mm_mulhi_epi16(vxi4x01234567, vxk4x01234567);
10874
10875 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod4x01234567lo, vprod4x01234567hi));
10876 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod4x01234567lo, vprod4x01234567hi));
10877
10878 const __m128i vi5x01234567 = _mm_loadl_epi64((const __m128i*) i5);
10879 const __m128i vxi5x01234567 = _mm_cvtepu8_epi16(vi5x01234567);
10880 const __m128i vk5x01234567 = _mm_loadl_epi64((const __m128i*) (k + 80));
10881 const __m128i vxk5x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk5x01234567), vk_zero_point);
10882 i5 += 8;
10883
10884
10885 const __m128i vprod5x01234567lo = _mm_mullo_epi16(vxi5x01234567, vxk5x01234567);
10886 const __m128i vprod5x01234567hi = _mm_mulhi_epi16(vxi5x01234567, vxk5x01234567);
10887
10888 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod5x01234567lo, vprod5x01234567hi));
10889 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod5x01234567lo, vprod5x01234567hi));
10890
10891 const __m128i vi6x01234567 = _mm_loadl_epi64((const __m128i*) i6);
10892 const __m128i vxi6x01234567 = _mm_cvtepu8_epi16(vi6x01234567);
10893 const __m128i vk6x01234567 = _mm_loadl_epi64((const __m128i*) (k + 96));
10894 const __m128i vxk6x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk6x01234567), vk_zero_point);
10895 i6 += 8;
10896
10897
10898 const __m128i vprod6x01234567lo = _mm_mullo_epi16(vxi6x01234567, vxk6x01234567);
10899 const __m128i vprod6x01234567hi = _mm_mulhi_epi16(vxi6x01234567, vxk6x01234567);
10900
10901 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod6x01234567lo, vprod6x01234567hi));
10902 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod6x01234567lo, vprod6x01234567hi));
10903
10904 const __m128i vi7x01234567 = _mm_loadl_epi64((const __m128i*) i7);
10905 const __m128i vxi7x01234567 = _mm_cvtepu8_epi16(vi7x01234567);
10906 const __m128i vk7x01234567 = _mm_loadl_epi64((const __m128i*) (k + 112));
10907 const __m128i vxk7x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk7x01234567), vk_zero_point);
10908 i7 += 8;
10909
10910
10911 const __m128i vprod7x01234567lo = _mm_mullo_epi16(vxi7x01234567, vxk7x01234567);
10912 const __m128i vprod7x01234567hi = _mm_mulhi_epi16(vxi7x01234567, vxk7x01234567);
10913
10914 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod7x01234567lo, vprod7x01234567hi));
10915 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod7x01234567lo, vprod7x01234567hi));
10916
10917 const __m128i vi8x01234567 = _mm_loadl_epi64((const __m128i*) i8);
10918 const __m128i vxi8x01234567 = _mm_cvtepu8_epi16(vi8x01234567);
10919 const __m128i vk8x01234567 = _mm_loadl_epi64((const __m128i*) (k + 128));
10920 const __m128i vxk8x01234567 = _mm_sub_epi16(_mm_cvtepu8_epi16(vk8x01234567), vk_zero_point);
10921 i8 += 8;
10922
10923
10924 const __m128i vprod8x01234567lo = _mm_mullo_epi16(vxi8x01234567, vxk8x01234567);
10925 const __m128i vprod8x01234567hi = _mm_mulhi_epi16(vxi8x01234567, vxk8x01234567);
10926
10927 vacc0123 = _mm_add_epi32(vacc0123, _mm_unpacklo_epi16(vprod8x01234567lo, vprod8x01234567hi));
10928 vacc4567 = _mm_add_epi32(vacc4567, _mm_unpackhi_epi16(vprod8x01234567lo, vprod8x01234567hi));
10929
10930 k += 8;
10931
10932 __m128 vscaled0123 = _mm_cvtepi32_ps(vacc0123);
10933 __m128 vscaled4567 = _mm_cvtepi32_ps(vacc4567);
10934
10935 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
10936 vscaled0123 = _mm_mul_ps(vscaled0123, vscale);
10937 vscaled4567 = _mm_mul_ps(vscaled4567, vscale);
10938
10939 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
10940 vscaled0123 = _mm_min_ps(vscaled0123, voutput_max_less_zero_point);
10941 vscaled4567 = _mm_min_ps(vscaled4567, voutput_max_less_zero_point);
10942
10943 vacc0123 = _mm_cvtps_epi32(vscaled0123);
10944 vacc4567 = _mm_cvtps_epi32(vscaled4567);
10945
10946 w = (const void*) ((const int32_t*) w + 8);
10947
10948 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
10949 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
10950
10951 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
10952
10953 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
10954
10955 if XNN_LIKELY(c >= 8) {
10956 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
10957 output += 8;
10958 c -= 8;
10959 } else {
10960 if (c & 4) {
10961 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
10962 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
10963 output += 4;
10964 }
10965 if (c & 2) {
10966 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
10967 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
10968 output += 2;
10969 }
10970 if (c & 1) {
10971 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
10972 output += 1;
10973 }
10974 c = 0;
10975 }
10976 } while (c != 0);
10977 }
10978
10979 output = (uint8_t*) ((uintptr_t) output + output_increment);
10980 } while (--output_width != 0);
10981 }
10982
10983 void xnn_qu8_f32_vcvt_ukernel__avx_x32(
10984 size_t n,
10985 const uint8_t* x,
10986 float* y,
10987 const union xnn_qu8_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
10988 {
10989 assert(n != 0);
10990 assert(n % sizeof(uint8_t) == 0);
10991 assert(x != NULL);
10992 assert(y != NULL);
10993
10994 const __m128i vminus_zero_point = _mm_load_si128((const __m128i*) params->avx.minus_zero_point);
10995 const __m256 vscale = _mm256_load_ps(params->avx.scale);
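  // Dequantize y = (x - zero_point) * scale; minus_zero_point holds the negated zero point,
  // and the main loop converts 32 uint8 elements per iteration.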
10996 for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
10997 __m128i vx0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
10998 __m128i vx4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 4)));
10999 __m128i vx89AB = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 8)));
11000 __m128i vxCDEF = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 12)));
11001 __m128i vxGHIJ = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 16)));
11002 __m128i vxKLMN = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 20)));
11003 __m128i vxOPQR = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 24)));
11004 __m128i vxSTUV = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x + 28)));
11005 x += 32;
11006
11007 vx0123 = _mm_add_epi32(vx0123, vminus_zero_point);
11008 vx4567 = _mm_add_epi32(vx4567, vminus_zero_point);
11009 vx89AB = _mm_add_epi32(vx89AB, vminus_zero_point);
11010 vxCDEF = _mm_add_epi32(vxCDEF, vminus_zero_point);
11011 vxGHIJ = _mm_add_epi32(vxGHIJ, vminus_zero_point);
11012 vxKLMN = _mm_add_epi32(vxKLMN, vminus_zero_point);
11013 vxOPQR = _mm_add_epi32(vxOPQR, vminus_zero_point);
11014 vxSTUV = _mm_add_epi32(vxSTUV, vminus_zero_point);
11015
11016 const __m256i vx01234567 = _mm256_insertf128_si256(_mm256_castsi128_si256(vx0123), vx4567, 1);
11017 const __m256i vx89ABCDEF = _mm256_insertf128_si256(_mm256_castsi128_si256(vx89AB), vxCDEF, 1);
11018 const __m256i vxGHIJKLMN = _mm256_insertf128_si256(_mm256_castsi128_si256(vxGHIJ), vxKLMN, 1);
11019 const __m256i vxOPQRSTUV = _mm256_insertf128_si256(_mm256_castsi128_si256(vxOPQR), vxSTUV, 1);
11020
11021 __m256 vy01234567 = _mm256_cvtepi32_ps(vx01234567);
11022 __m256 vy89ABCDEF = _mm256_cvtepi32_ps(vx89ABCDEF);
11023 __m256 vyGHIJKLMN = _mm256_cvtepi32_ps(vxGHIJKLMN);
11024 __m256 vyOPQRSTUV = _mm256_cvtepi32_ps(vxOPQRSTUV);
11025
11026 vy01234567 = _mm256_mul_ps(vy01234567, vscale);
11027 vy89ABCDEF = _mm256_mul_ps(vy89ABCDEF, vscale);
11028 vyGHIJKLMN = _mm256_mul_ps(vyGHIJKLMN, vscale);
11029 vyOPQRSTUV = _mm256_mul_ps(vyOPQRSTUV, vscale);
11030
11031 _mm256_storeu_ps(y, vy01234567);
11032 _mm256_storeu_ps(y + 8, vy89ABCDEF);
11033 _mm256_storeu_ps(y + 16, vyGHIJKLMN);
11034 _mm256_storeu_ps(y + 24, vyOPQRSTUV);
11035 y += 32;
11036 }
11037 for (; n >= 4 * sizeof(uint8_t); n -= 4 * sizeof(uint8_t)) {
11038 __m128i vx = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
11039 vx = _mm_add_epi32(vx, vminus_zero_point);
11040 x += 4;
11041
11042 __m128 vy = _mm_cvtepi32_ps(vx);
11043 vy = _mm_mul_ps(vy, _mm256_castps256_ps128(vscale));
11044
11045 _mm_storeu_ps(y, vy);
11046 y += 4;
11047 }
11048 if XNN_UNLIKELY(n != 0) {
11049 assert(n >= 1 * sizeof(uint8_t));
11050 assert(n <= 3 * sizeof(uint8_t));
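    // The last 1-3 bytes are read with a full 4-byte load (permitted by XNN_OOB_READS);
    // only the valid lanes are stored below.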
11051
11052 __m128i vx = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(x)));
11053 vx = _mm_add_epi32(vx, vminus_zero_point);
11054
11055 __m128 vy = _mm_cvtepi32_ps(vx);
11056 vy = _mm_mul_ps(vy, _mm256_castps256_ps128(vscale));
11057
11058 if (n & (2 * sizeof(uint8_t))) {
11059 _mm_storel_pi((__m64*) y, vy);
11060 vy = _mm_movehl_ps(vy, vy);
11061 y += 2;
11062 }
11063 if (n & (1 * sizeof(uint8_t))) {
11064 _mm_store_ss(y, vy);
11065 }
11066 }
11067 }
11068
11069 void xnn_qu8_gemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
11070 size_t mr,
11071 size_t nc,
11072 size_t kc,
11073 const uint8_t* restrict a,
11074 size_t a_stride,
11075 const void* restrict w,
11076 uint8_t* restrict c,
11077 size_t cm_stride,
11078 size_t cn_stride,
11079 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11080 {
11081 assert(mr != 0);
11082 assert(mr <= 1);
11083 assert(nc != 0);
11084 assert(kc != 0);
11085 assert(kc % sizeof(uint8_t) == 0);
11086 assert(a != NULL);
11087 assert(w != NULL);
11088 assert(c != NULL);
11089
11090 kc = round_up_po2(kc, 8);
11091 const uint8_t* a0 = a;
11092 uint8_t* c0 = c;
11093
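  // 1x4 output tile, 8 elements of K per step: activations are zero-extended to int16, weights are
  // adjusted by the kernel zero point, and _mm_madd_epi16 accumulates pairwise products into one
  // int32 accumulator per output column.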
11094 do {
11095 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
11096 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
11097 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
11098 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
11099 w = (const int32_t*) w + 4;
11100
11101 size_t k = 0;
11102 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
11103 const __m128i vzero = _mm_setzero_si128();
11104 while (k < kc) {
11105 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
11106 const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
11107 a0 += 8;
11108
11109 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
11110 const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb01, vzero), vb_zero_point);
11111 const __m128i vxb1 = _mm_sub_epi16(_mm_unpackhi_epi8(vb01, vzero), vb_zero_point);
11112
11113 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
11114 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
11115 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
11116 const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb23, vzero), vb_zero_point);
11117 const __m128i vxb3 = _mm_sub_epi16(_mm_unpackhi_epi8(vb23, vzero), vb_zero_point);
11118
11119 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
11120 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
11121
11122 w = (const void*) ((const uint8_t*) w + 32);
11123 k += 8 * sizeof(uint8_t);
11124 }
11125
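    // Reduce the per-column accumulators with horizontal adds, then requantize via the fp32 path.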
11126 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
11127 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
11128
11129 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
11130
11131 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
11132
11133 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
11134 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
11135
11136 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
11137 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
11138
11139 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
11140
11141 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
11142 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
11143
11144 __m128i vout = _mm_packus_epi16(vacc00x0123, vacc00x0123);
11145
11146 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
11147
11148 if (nc >= 4) {
11149 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
11150
11151 c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
11152
11153 a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
11154
11155 nc -= 4;
11156 } else {
11157 if (nc & 2) {
11158 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
11159 c0 += 2;
11160 vout = _mm_srli_epi32(vout, 16);
11161 }
11162 if (nc & 1) {
11163 *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
11164 }
11165
11166 nc = 0;
11167 }
11168 } while (nc != 0);
11169 }
11170
11171 void xnn_qu8_gemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
11172 size_t mr,
11173 size_t nc,
11174 size_t kc,
11175 const uint8_t* restrict a,
11176 size_t a_stride,
11177 const void* restrict w,
11178 uint8_t* restrict c,
11179 size_t cm_stride,
11180 size_t cn_stride,
11181 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11182 {
11183 assert(mr != 0);
11184 assert(mr <= 2);
11185 assert(nc != 0);
11186 assert(kc != 0);
11187 assert(kc % sizeof(uint8_t) == 0);
11188 assert(a != NULL);
11189 assert(w != NULL);
11190 assert(c != NULL);
11191
11192 kc = round_up_po2(kc, 8);
11193 const uint8_t* a0 = a;
11194 uint8_t* c0 = c;
11195 const uint8_t* a1 = (const uint8_t*) ((uintptr_t) a0 + a_stride);
11196 uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
11197 if XNN_UNPREDICTABLE(mr != 2) {
11198 a1 = a0;
11199 c1 = c0;
11200 }
11201
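  // 2x4 output tile; when mr == 1, row 1 aliases row 0, so the duplicate computation simply
  // rewrites the same output.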
11202 do {
11203 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
11204 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
11205 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
11206 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
11207 __m128i vacc1x0 = vacc0x0;
11208 __m128i vacc1x1 = vacc0x1;
11209 __m128i vacc1x2 = vacc0x2;
11210 __m128i vacc1x3 = vacc0x3;
11211 w = (const int32_t*) w + 4;
11212
11213 size_t k = 0;
11214 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
11215 const __m128i vzero = _mm_setzero_si128();
11216 while (k < kc) {
11217 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
11218 const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
11219 a0 += 8;
11220 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
11221 const __m128i vxa1 = _mm_cvtepu8_epi16(va1);
11222 a1 += 8;
11223
11224 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
11225 const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb01, vzero), vb_zero_point);
11226 const __m128i vxb1 = _mm_sub_epi16(_mm_unpackhi_epi8(vb01, vzero), vb_zero_point);
11227
11228 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
11229 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
11230 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
11231 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
11232 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
11233 const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb23, vzero), vb_zero_point);
11234 const __m128i vxb3 = _mm_sub_epi16(_mm_unpackhi_epi8(vb23, vzero), vb_zero_point);
11235
11236 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
11237 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
11238 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
11239 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
11240
11241 w = (const void*) ((const uint8_t*) w + 32);
11242 k += 8 * sizeof(uint8_t);
11243 }
11244
11245 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
11246 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
11247 const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
11248 const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
11249
11250 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
11251 __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
11252
11253 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
11254 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
11255
11256 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
11257 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
11258 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
11259
11260 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
11261 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
11262 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
11263
11264 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
11265 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
11266
11267 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
11268 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
11269
11270 __m128i vout = _mm_packus_epi16(vacc01x0123, vacc01x0123);
11271
11272 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
11273
11274 if (nc >= 4) {
11275 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
11276 unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
11277
11278 c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
11279 c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
11280
11281 a0 = (const uint8_t*) ((uintptr_t) a0 - kc);
11282 a1 = (const uint8_t*) ((uintptr_t) a1 - kc);
11283
11284 nc -= 4;
11285 } else {
11286 if (nc & 2) {
11287 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
11288 c0 += 2;
11289 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
11290 c1 += 2;
11291 vout = _mm_srli_epi32(vout, 16);
11292 }
11293 if (nc & 1) {
11294 *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
11295 *c1 = (uint8_t) _mm_extract_epi8(vout, 4);
11296 }
11297
11298 nc = 0;
11299 }
11300 } while (nc != 0);
11301 }
11302
11303 void xnn_qu8_igemm_minmax_fp32_ukernel_1x4c8__avx_ld128(
11304 size_t mr,
11305 size_t nc,
11306 size_t kc,
11307 size_t ks,
11308 const uint8_t** restrict a,
11309 const void* restrict w,
11310 uint8_t* restrict c,
11311 size_t cm_stride,
11312 size_t cn_stride,
11313 size_t a_offset,
11314 const uint8_t* zero,
11315 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11316 {
11317 assert(mr != 0);
11318 assert(mr <= 1);
11319 assert(nc != 0);
11320 assert(kc != 0);
11321 assert(ks != 0);
11322 assert(ks % (1 * sizeof(void*)) == 0);
11323 assert(a_offset % sizeof(uint8_t) == 0);
11324 assert(a != NULL);
11325 assert(w != NULL);
11326 assert(c != NULL);
11327
11328 kc = round_up_po2(kc, 8);
11329 uint8_t* c0 = c;
11330
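  // Indirect GEMM: a[] is the indirection buffer, consuming ks bytes of input-row pointers per
  // output tile; rows equal to `zero` reference the padding buffer and skip a_offset.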
11331 do {
11332 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
11333 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
11334 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
11335 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
11336 w = (const int32_t*) w + 4;
11337
11338 size_t p = ks;
11339 do {
11340 const uint8_t* restrict a0 = a[0];
11341 if XNN_UNPREDICTABLE(a0 != zero) {
11342 a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
11343 }
11344 a += 1;
11345
11346 size_t k = 0;
11347 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
11348 const __m128i vzero = _mm_setzero_si128();
11349 while (k < kc) {
11350 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
11351 const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
11352 a0 += 8;
11353
11354 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
11355 const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb01, vzero), vb_zero_point);
11356 const __m128i vxb1 = _mm_sub_epi16(_mm_unpackhi_epi8(vb01, vzero), vb_zero_point);
11357
11358 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
11359 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
11360 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
11361 const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb23, vzero), vb_zero_point);
11362 const __m128i vxb3 = _mm_sub_epi16(_mm_unpackhi_epi8(vb23, vzero), vb_zero_point);
11363
11364 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
11365 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
11366
11367 w = (const void*) ((const uint8_t*) w + 32);
11368 k += 8 * sizeof(uint8_t);
11369 }
11370 p -= 1 * sizeof(void*);
11371 } while (p != 0);
11372
11373 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
11374 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
11375
11376 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
11377
11378 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
11379
11380 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
11381 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
11382
11383 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
11384 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
11385
11386 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
11387
11388 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
11389 __m128i vacc00x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc0x0123), voutput_zero_point);
11390
11391 __m128i vout = _mm_packus_epi16(vacc00x0123, vacc00x0123);
11392
11393 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
11394
11395 if (nc >= 4) {
11396 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
11397 c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
11398
11399 a = (const uint8_t**restrict) ((uintptr_t) a - ks);
11400
11401 nc -= 4;
11402 } else {
11403 if (nc & 2) {
11404 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
11405 c0 += 2;
11406 vout = _mm_srli_epi32(vout, 16);
11407 }
11408 if (nc & 1) {
11409 *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
11410 }
11411
11412 nc = 0;
11413 }
11414 } while (nc != 0);
11415 }
11416
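// Note on the indirect GEMM (IGEMM) kernels in this file: `a` is an array of
// `ks / sizeof(void*)` row pointers per output pixel rather than a dense
// matrix. A pointer equal to `zero` is used as-is (it points at a shared
// zero/padding buffer), every other pointer is displaced by `a_offset`, and
// once a 4-column strip of output has been written, `a` is rewound by `ks`
// bytes so the same pointer set is replayed for the next strip.
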
11417 void xnn_qu8_igemm_minmax_fp32_ukernel_2x4c8__avx_ld128(
11418 size_t mr,
11419 size_t nc,
11420 size_t kc,
11421 size_t ks,
11422 const uint8_t** restrict a,
11423 const void* restrict w,
11424 uint8_t* restrict c,
11425 size_t cm_stride,
11426 size_t cn_stride,
11427 size_t a_offset,
11428 const uint8_t* zero,
11429 const union xnn_qu8_conv_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11430 {
11431 assert(mr != 0);
11432 assert(mr <= 2);
11433 assert(nc != 0);
11434 assert(kc != 0);
11435 assert(ks != 0);
11436 assert(ks % (2 * sizeof(void*)) == 0);
11437 assert(a_offset % sizeof(uint8_t) == 0);
11438 assert(a != NULL);
11439 assert(w != NULL);
11440 assert(c != NULL);
11441
11442 kc = round_up_po2(kc, 8);
11443 uint8_t* c0 = c;
11444 uint8_t* c1 = (uint8_t*) ((uintptr_t) c0 + cm_stride);
11445 if XNN_UNPREDICTABLE(mr != 2) {
11446 c1 = c0;
11447 }
11448
11449 do {
11450 __m128i vacc0x0 = _mm_cvtsi32_si128(((const int*) w)[0]);
11451 __m128i vacc0x1 = _mm_cvtsi32_si128(((const int*) w)[1]);
11452 __m128i vacc0x2 = _mm_cvtsi32_si128(((const int*) w)[2]);
11453 __m128i vacc0x3 = _mm_cvtsi32_si128(((const int*) w)[3]);
11454 __m128i vacc1x0 = vacc0x0;
11455 __m128i vacc1x1 = vacc0x1;
11456 __m128i vacc1x2 = vacc0x2;
11457 __m128i vacc1x3 = vacc0x3;
11458 w = (const int32_t*) w + 4;
11459
11460 size_t p = ks;
11461 do {
11462 const uint8_t* restrict a0 = a[0];
11463 if XNN_UNPREDICTABLE(a0 != zero) {
11464 a0 = (const uint8_t*) ((uintptr_t) a0 + a_offset);
11465 }
11466 const uint8_t* restrict a1 = a[1];
11467 if XNN_UNPREDICTABLE(a1 != zero) {
11468 a1 = (const uint8_t*) ((uintptr_t) a1 + a_offset);
11469 }
11470 a += 2;
11471
11472 size_t k = 0;
11473 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.kernel_zero_point);
11474 const __m128i vzero = _mm_setzero_si128();
11475 while (k < kc) {
11476 const __m128i va0 = _mm_loadl_epi64((const __m128i*) a0);
11477 const __m128i vxa0 = _mm_cvtepu8_epi16(va0);
11478 a0 += 8;
11479 const __m128i va1 = _mm_loadl_epi64((const __m128i*) a1);
11480 const __m128i vxa1 = _mm_cvtepu8_epi16(va1);
11481 a1 += 8;
11482
11483 const __m128i vb01 = _mm_load_si128((const __m128i*) w);
11484 const __m128i vxb0 = _mm_sub_epi16(_mm_unpacklo_epi8(vb01, vzero), vb_zero_point);
11485 const __m128i vxb1 = _mm_sub_epi16(_mm_unpackhi_epi8(vb01, vzero), vb_zero_point);
11486
11487 vacc0x0 = _mm_add_epi32(vacc0x0, _mm_madd_epi16(vxa0, vxb0));
11488 vacc0x1 = _mm_add_epi32(vacc0x1, _mm_madd_epi16(vxa0, vxb1));
11489 vacc1x0 = _mm_add_epi32(vacc1x0, _mm_madd_epi16(vxa1, vxb0));
11490 vacc1x1 = _mm_add_epi32(vacc1x1, _mm_madd_epi16(vxa1, vxb1));
11491 const __m128i vb23 = _mm_load_si128((const __m128i*) ((const uint8_t*) w + 16));
11492 const __m128i vxb2 = _mm_sub_epi16(_mm_unpacklo_epi8(vb23, vzero), vb_zero_point);
11493 const __m128i vxb3 = _mm_sub_epi16(_mm_unpackhi_epi8(vb23, vzero), vb_zero_point);
11494
11495 vacc0x2 = _mm_add_epi32(vacc0x2, _mm_madd_epi16(vxa0, vxb2));
11496 vacc0x3 = _mm_add_epi32(vacc0x3, _mm_madd_epi16(vxa0, vxb3));
11497 vacc1x2 = _mm_add_epi32(vacc1x2, _mm_madd_epi16(vxa1, vxb2));
11498 vacc1x3 = _mm_add_epi32(vacc1x3, _mm_madd_epi16(vxa1, vxb3));
11499
11500 w = (const void*) ((const uint8_t*) w + 32);
11501 k += 8 * sizeof(uint8_t);
11502 }
11503 p -= 2 * sizeof(void*);
11504 } while (p != 0);
11505
11506 const __m128i vacc0x01 = _mm_hadd_epi32(vacc0x0, vacc0x1);
11507 const __m128i vacc0x23 = _mm_hadd_epi32(vacc0x2, vacc0x3);
11508 const __m128i vacc1x01 = _mm_hadd_epi32(vacc1x0, vacc1x1);
11509 const __m128i vacc1x23 = _mm_hadd_epi32(vacc1x2, vacc1x3);
11510
11511 __m128i vacc0x0123 = _mm_hadd_epi32(vacc0x01, vacc0x23);
11512 __m128i vacc1x0123 = _mm_hadd_epi32(vacc1x01, vacc1x23);
11513
11514 __m128 vscaled0x0123 = _mm_cvtepi32_ps(vacc0x0123);
11515 __m128 vscaled1x0123 = _mm_cvtepi32_ps(vacc1x0123);
11516
11517 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
11518 vscaled0x0123 = _mm_mul_ps(vscaled0x0123, vscale);
11519 vscaled1x0123 = _mm_mul_ps(vscaled1x0123, vscale);
11520
11521 const __m128 voutput_max_less_zero_point = _mm_load_ps(params->fp32_sse2.output_max_less_zero_point);
11522 vscaled0x0123 = _mm_min_ps(vscaled0x0123, voutput_max_less_zero_point);
11523 vscaled1x0123 = _mm_min_ps(vscaled1x0123, voutput_max_less_zero_point);
11524
11525 vacc0x0123 = _mm_cvtps_epi32(vscaled0x0123);
11526 vacc1x0123 = _mm_cvtps_epi32(vscaled1x0123);
11527
11528 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
11529 __m128i vacc01x0123 = _mm_adds_epi16(_mm_packs_epi32(vacc0x0123, vacc1x0123), voutput_zero_point);
11530
11531 __m128i vout = _mm_packus_epi16(vacc01x0123, vacc01x0123);
11532
11533 vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) params->fp32_sse2.output_min));
11534
11535 if (nc >= 4) {
11536 unaligned_store_u32(c1, (uint32_t) _mm_extract_epi32(vout, 1));
11537 c1 = (uint8_t*) ((uintptr_t) c1 + cn_stride);
11538 unaligned_store_u32(c0, (uint32_t) _mm_cvtsi128_si32(vout));
11539 c0 = (uint8_t*) ((uintptr_t) c0 + cn_stride);
11540
11541 a = (const uint8_t**restrict) ((uintptr_t) a - ks);
11542
11543 nc -= 4;
11544 } else {
11545 if (nc & 2) {
11546 unaligned_store_u16(c1, (uint16_t) _mm_extract_epi16(vout, 2));
11547 c1 += 2;
11548 unaligned_store_u16(c0, (uint16_t) _mm_extract_epi16(vout, 0));
11549 c0 += 2;
11550 vout = _mm_srli_epi32(vout, 16);
11551 }
11552 if (nc & 1) {
11553 *c1 = (uint8_t) _mm_extract_epi8(vout, 4);
11554 *c0 = (uint8_t) _mm_extract_epi8(vout, 0);
11555 }
11556
11557 nc = 0;
11558 }
11559 } while (nc != 0);
11560 }
11561
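// The 2x4c8 IGEMM above mirrors the 1x4c8 kernel: the per-row accumulators
// all start from the same packed bias (vacc1x* are copies of vacc0x*), kc is
// rounded up so eight K elements are consumed per iteration, and
// _mm_madd_epi16 followed by the two _mm_hadd_epi32 passes reduces the eight
// 16-bit products per output element to a single 32-bit sum.
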
11562 void xnn_qu8_vadd_minmax_ukernel__avx_mul32_ld32_x8(
11563 size_t n,
11564 const uint8_t* input_a,
11565 const uint8_t* input_b,
11566 uint8_t* output,
11567 const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11568 {
11569 const __m128i vbias = _mm_load_si128((const __m128i*) params->sse4.bias);
11570 const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4.a_multiplier);
11571 const __m128i vb_multiplier = _mm_load_si128((const __m128i*) params->sse4.b_multiplier);
11572 const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
11573 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
11574 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
11575 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
11576
11577 for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
11578 const __m128i va0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a)));
11579 const __m128i vb0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_b)));
11580 const __m128i va4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a + 4)));
11581 const __m128i vb4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_b + 4)));
11582 input_a += 8;
11583 input_b += 8;
11584
11585 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
11586 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
11587
11588 vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
11589 vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
11590
11591 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
11592 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
11593
11594 const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11595
11596 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
11597
11598 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
11599
11600 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
11601
11602 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
11603 output += 8;
11604 }
11605 if XNN_UNLIKELY(n != 0) {
11606 {
11607 const __m128i va0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a)));
11608 const __m128i vb0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_b)));
11609 const __m128i va4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a + 4)));
11610 const __m128i vb4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_b + 4)));
11611
11612 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
11613 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
11614
11615 vacc0123 = _mm_add_epi32(vacc0123, _mm_mullo_epi32(vb0123, vb_multiplier));
11616 vacc4567 = _mm_add_epi32(vacc4567, _mm_mullo_epi32(vb4567, vb_multiplier));
11617
11618 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
11619 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
11620
11621 const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11622
11623 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
11624 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
11625 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
11626
11627 if (n & (4 * sizeof(uint8_t))) {
11628 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
11629 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
11630 output += 4;
11631 }
11632 if (n & (2 * sizeof(uint8_t))) {
11633 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
11634 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
11635 output += 2;
11636 }
11637 if (n & (1 * sizeof(uint8_t))) {
11638 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
11639 }
11640 }
11641 }
11642 }
11643
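// The VADD/VADDC kernels above and below rescale with precomputed integer
// multipliers and an arithmetic right shift instead of float math. Ignoring
// the 16-bit saturation in _mm_adds_epi16/_mm_packus_epi16, each element is
// approximately
//   out = clamp(((bias + a*a_multiplier + b*b_multiplier) >> shift)
//               + output_zero_point, output_min, output_max)
// where bias, the multipliers and the shift come prepared in params->sse4
// (the bias presumably folds the operands' zero-point terms and a rounding
// constant, which is why no explicit rounding step appears in the loop).
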
11644 void xnn_qu8_vaddc_minmax_ukernel__avx_mul32_ld32_x8(
11645 size_t n,
11646 const uint8_t* input_a,
11647 const uint8_t* input_b,
11648 uint8_t* output,
11649 const union xnn_qu8_add_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11650 {
11651 const __m128i va_multiplier = _mm_load_si128((const __m128i*) params->sse4.a_multiplier);
11652 const __m128i vshift = _mm_load_si128((const __m128i*) params->sse4.shift);
11653 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->sse4.output_zero_point);
11654 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->sse4.output_min);
11655 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->sse4.output_max);
11656
11657 __m128i vbias = _mm_cvtsi32_si128(params->sse4.b_multiplier[0] * (int32_t) *input_b);
11658 vbias = _mm_shuffle_epi32(vbias, _MM_SHUFFLE(0, 0, 0, 0));
11659 vbias = _mm_add_epi32(vbias, _mm_load_si128((const __m128i*) params->sse4.bias));
11660 for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
11661 const __m128i va0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a)));
11662 const __m128i va4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a + 4)));
11663 input_a += 8;
11664 input_b += 8;
11665
11666 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
11667 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
11668
11669 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
11670 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
11671
11672 const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11673
11674 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
11675
11676 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
11677
11678 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
11679
11680 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
11681 output += 8;
11682 }
11683 if XNN_UNLIKELY(n != 0) {
11684 {
11685 const __m128i va0123 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a)));
11686 const __m128i va4567 = _mm_cvtepu8_epi32(_mm_cvtsi32_si128((int) unaligned_load_s32(input_a + 4)));
11687
11688 __m128i vacc0123 = _mm_add_epi32(vbias, _mm_mullo_epi32(va0123, va_multiplier));
11689 __m128i vacc4567 = _mm_add_epi32(vbias, _mm_mullo_epi32(va4567, va_multiplier));
11690
11691 vacc0123 = _mm_sra_epi32(vacc0123, vshift);
11692 vacc4567 = _mm_sra_epi32(vacc4567, vshift);
11693
11694 const __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11695
11696 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
11697 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
11698 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
11699
11700 if (n & (4 * sizeof(uint8_t))) {
11701 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
11702 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
11703 output += 4;
11704 }
11705 if (n & (2 * sizeof(uint8_t))) {
11706 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
11707 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
11708 output += 2;
11709 }
11710 if (n & (1 * sizeof(uint8_t))) {
11711 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
11712 }
11713 }
11714 }
11715 }
11716
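// In the VADDC ("add a broadcast constant") kernel above, the scalar operand
// is folded into the bias once before the loop: vbias ends up holding
// bias + (*input_b) * b_multiplier in every lane, so the inner loop only has
// to multiply the `a` input. `input_b` is still advanced by the generated
// code, but it is never read again after the first element.
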
11717 void xnn_qu8_vcvt_ukernel__avx_x32(
11718 size_t n,
11719 const uint8_t* x,
11720 uint8_t* y,
11721 const union xnn_qu8_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11722 {
11723 assert(n != 0);
11724 assert(n % sizeof(uint8_t) == 0);
11725 assert(x != NULL);
11726 assert(y != NULL);
11727
11728 const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.input_zero_point);
11729 const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->ssse3.multiplier);
11730 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->ssse3.output_zero_point);
11731 for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
11732 __m128i vacc0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
11733 __m128i vacc1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 8)));
11734 __m128i vacc2 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 16)));
11735 __m128i vacc3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 24)));
11736 x += 32;
11737
11738 vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
11739 vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
11740 vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
11741 vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
11742
11743 vacc0 = _mm_slli_epi16(vacc0, 7);
11744 vacc1 = _mm_slli_epi16(vacc1, 7);
11745 vacc2 = _mm_slli_epi16(vacc2, 7);
11746 vacc3 = _mm_slli_epi16(vacc3, 7);
11747
11748 vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier);
11749 vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier);
11750 vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier);
11751 vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier);
11752
11753 vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
11754 vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
11755 vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
11756 vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
11757
11758 const __m128i vy0 = _mm_packus_epi16(vacc0, vacc1);
11759 const __m128i vy1 = _mm_packus_epi16(vacc2, vacc3);
11760
11761 _mm_storeu_si128((__m128i*) y, vy0);
11762 _mm_storeu_si128((__m128i*) (y + 16), vy1);
11763 y += 32;
11764 }
11765 for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
11766 __m128i vacc = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
11767 vacc = _mm_sub_epi16(vinput_zero_point, vacc);
11768 vacc = _mm_slli_epi16(vacc, 7);
11769 vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
11770 vacc = _mm_adds_epi16(vacc, voutput_zero_point);
11771 x += 8;
11772
11773 const __m128i vy = _mm_packus_epi16(vacc, vacc);
11774 _mm_storel_epi64((__m128i*) y, vy);
11775 y += 8;
11776 }
11777 if XNN_UNLIKELY(n != 0) {
11778 assert(n >= 1 * sizeof(uint8_t));
11779 assert(n <= 7 * sizeof(uint8_t));
11780
11781 __m128i vacc = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
11782 vacc = _mm_sub_epi16(vinput_zero_point, vacc);
11783 vacc = _mm_slli_epi16(vacc, 7);
11784 vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
11785 vacc = _mm_adds_epi16(vacc, voutput_zero_point);
11786
11787 __m128i vy = _mm_packus_epi16(vacc, vacc);
11788 if (n & (4 * sizeof(uint8_t))) {
11789 _mm_storeu_si32(y, vy);
11790 vy = _mm_srli_epi64(vy, 32);
11791 y += 4;
11792 }
11793 if (n & (2 * sizeof(uint8_t))) {
11794 _mm_storeu_si16(y, vy);
11795 vy = _mm_srli_epi32(vy, 16);
11796 y += 2;
11797 }
11798 if (n & (1 * sizeof(uint8_t))) {
11799 *y = (uint8_t) _mm_extract_epi8(vy, 0);
11800 }
11801 }
11802 }
11803
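// The QU8 convert kernel above rescales entirely in 16-bit lanes: the
// zero-point difference (input_zero_point - x) is shifted left by 7 so that
// _mm_mulhrs_epi16, which computes (a*b + 0x4000) >> 15, yields
// ((input_zero_point - x) * multiplier) >> 8 with rounding. The saturating
// add of the output zero point and _mm_packus_epi16 then clamp the result to
// the uint8 range.
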
11804 void xnn_qu8_vlrelu_ukernel__avx_x32(
11805 size_t n,
11806 const uint8_t* x,
11807 uint8_t* y,
11808 const union xnn_qu8_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11809 {
11810 assert(n != 0);
11811 assert(n % sizeof(uint8_t) == 0);
11812 assert(x != NULL);
11813 assert(y != NULL);
11814
11815 const __m128i vinput_zero_point = _mm_load_si128((const __m128i*) params->avx.input_zero_point);
11816 const __m128i vpositive_multiplier = _mm_load_si128((const __m128i*) params->avx.positive_multiplier);
11817 const __m128i vnegative_multiplier = _mm_load_si128((const __m128i*) params->avx.negative_multiplier);
11818 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->avx.output_zero_point);
11819 for (; n >= 32 * sizeof(uint8_t); n -= 32 * sizeof(uint8_t)) {
11820 __m128i vacc0 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
11821 __m128i vacc1 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 8)));
11822 __m128i vacc2 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 16)));
11823 __m128i vacc3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (x + 24)));
11824 x += 32;
11825
11826 __m128i vmultiplier0 = _mm_cmpgt_epi16(vacc0, vinput_zero_point);
11827 vacc0 = _mm_sub_epi16(vinput_zero_point, vacc0);
11828 __m128i vmultiplier1 = _mm_cmpgt_epi16(vacc1, vinput_zero_point);
11829 vacc1 = _mm_sub_epi16(vinput_zero_point, vacc1);
11830 __m128i vmultiplier2 = _mm_cmpgt_epi16(vacc2, vinput_zero_point);
11831 vacc2 = _mm_sub_epi16(vinput_zero_point, vacc2);
11832 __m128i vmultiplier3 = _mm_cmpgt_epi16(vacc3, vinput_zero_point);
11833 vacc3 = _mm_sub_epi16(vinput_zero_point, vacc3);
11834
11835 vmultiplier0 = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier0);
11836 vacc0 = _mm_slli_epi16(vacc0, 7);
11837 vmultiplier1 = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier1);
11838 vacc1 = _mm_slli_epi16(vacc1, 7);
11839 vmultiplier2 = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier2);
11840 vacc2 = _mm_slli_epi16(vacc2, 7);
11841 vmultiplier3 = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier3);
11842 vacc3 = _mm_slli_epi16(vacc3, 7);
11843
11844 vacc0 = _mm_mulhrs_epi16(vacc0, vmultiplier0);
11845 vacc1 = _mm_mulhrs_epi16(vacc1, vmultiplier1);
11846 vacc2 = _mm_mulhrs_epi16(vacc2, vmultiplier2);
11847 vacc3 = _mm_mulhrs_epi16(vacc3, vmultiplier3);
11848
11849 vacc0 = _mm_adds_epi16(vacc0, voutput_zero_point);
11850 vacc1 = _mm_adds_epi16(vacc1, voutput_zero_point);
11851 vacc2 = _mm_adds_epi16(vacc2, voutput_zero_point);
11852 vacc3 = _mm_adds_epi16(vacc3, voutput_zero_point);
11853
11854 const __m128i vy0 = _mm_packus_epi16(vacc0, vacc1);
11855 const __m128i vy1 = _mm_packus_epi16(vacc2, vacc3);
11856
11857 _mm_storeu_si128((__m128i*) y, vy0);
11858 _mm_storeu_si128((__m128i*) (y + 16), vy1);
11859 y += 32;
11860 }
11861 for (; n >= 8 * sizeof(uint8_t); n -= 8 * sizeof(uint8_t)) {
11862 __m128i vacc = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
11863 __m128i vmultiplier = _mm_cmpgt_epi16(vacc, vinput_zero_point);
11864 vacc = _mm_sub_epi16(vinput_zero_point, vacc);
11865 vmultiplier = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier);
11866 vacc = _mm_slli_epi16(vacc, 7);
11867 vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
11868 vacc = _mm_adds_epi16(vacc, voutput_zero_point);
11869 x += 8;
11870
11871 const __m128i vy = _mm_packus_epi16(vacc, vacc);
11872 _mm_storel_epi64((__m128i*) y, vy);
11873 y += 8;
11874 }
11875 if XNN_UNLIKELY(n != 0) {
11876 assert(n >= 1 * sizeof(uint8_t));
11877 assert(n <= 7 * sizeof(uint8_t));
11878
11879 __m128i vacc = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) x));
11880 __m128i vmultiplier = _mm_cmpgt_epi16(vacc, vinput_zero_point);
11881 vacc = _mm_sub_epi16(vinput_zero_point, vacc);
11882 vmultiplier = _mm_blendv_epi8(vnegative_multiplier, vpositive_multiplier, vmultiplier);
11883 vacc = _mm_slli_epi16(vacc, 7);
11884 vacc = _mm_mulhrs_epi16(vacc, vmultiplier);
11885 vacc = _mm_adds_epi16(vacc, voutput_zero_point);
11886
11887 __m128i vy = _mm_packus_epi16(vacc, vacc);
11888 if (n & (4 * sizeof(uint8_t))) {
11889 _mm_storeu_si32(y, vy);
11890 vy = _mm_srli_epi64(vy, 32);
11891 y += 4;
11892 }
11893 if (n & (2 * sizeof(uint8_t))) {
11894 _mm_storeu_si16(y, vy);
11895 vy = _mm_srli_epi32(vy, 16);
11896 y += 2;
11897 }
11898 if (n & (1 * sizeof(uint8_t))) {
11899 *y = (uint8_t) _mm_extract_epi8(vy, 0);
11900 }
11901 }
11902 }
11903
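// The QU8 leaky-ReLU kernel above selects one of two multipliers per element
// with _mm_blendv_epi8: lanes whose input lies above the input zero point use
// positive_multiplier, the remaining lanes use negative_multiplier. The
// rescale itself is the same shift-by-7 / _mm_mulhrs_epi16 fixed-point
// pattern as the convert kernel above.
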
11904 void xnn_qu8_vmul_minmax_fp32_ukernel__avx_mul16_ld64_x16(
11905 size_t n,
11906 const uint8_t* input_a,
11907 const uint8_t* input_b,
11908 uint8_t* output,
11909 const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
11910
11911 {
11912 const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.a_zero_point);
11913 const __m128i vb_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.b_zero_point);
11914 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
11915 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
11916 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
11917 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
11918
11919 for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
11920 const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
11921 const __m128i vb01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
11922 const __m128i va89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
11923 const __m128i vb89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (input_b + 8)));
11924 input_a += 16;
11925 input_b += 16;
11926
11927
11928 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
11929 const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
11930 const __m128i vxa89ABCDEF = _mm_sub_epi16(va89ABCDEF, va_zero_point);
11931 const __m128i vxb89ABCDEF = _mm_sub_epi16(vb89ABCDEF, vb_zero_point);
11932
11933 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
11934 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
11935 const __m128i vprod89ABCDEFlo = _mm_mullo_epi16(vxa89ABCDEF, vxb89ABCDEF);
11936 const __m128i vprod89ABCDEFhi = _mm_mulhi_epi16(vxa89ABCDEF, vxb89ABCDEF);
11937
11938 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
11939 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
11940 const __m128i vprod89AB = _mm_unpacklo_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
11941 const __m128i vprodCDEF = _mm_unpackhi_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
11942
11943 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
11944 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
11945 __m128 vfpacc89AB = _mm_cvtepi32_ps(vprod89AB);
11946 __m128 vfpaccCDEF = _mm_cvtepi32_ps(vprodCDEF);
11947
11948 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
11949 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
11950 vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
11951 vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
11952
11953 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
11954 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
11955 const __m128i vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
11956 const __m128i vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
11957
11958 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11959 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
11960
11961
11962 __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
11963
11964 vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
11965
11966 vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
11967
11968 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
11969 output += 16;
11970 }
11971 if XNN_UNLIKELY(n != 0) {
11972 do {
11973 const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
11974 const __m128i vb01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_b));
11975 input_a += 8;
11976 input_b += 8;
11977
11978
11979 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
11980 const __m128i vxb01234567 = _mm_sub_epi16(vb01234567, vb_zero_point);
11981
11982 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb01234567);
11983 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb01234567);
11984
11985 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
11986 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
11987
11988 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
11989 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
11990
11991 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
11992 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
11993
11994 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
11995 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
11996
11997 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
11998
11999 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
12000 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
12001 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
12002
12003 if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
12004 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
12005 output += 8;
12006 n -= 8 * sizeof(uint8_t);
12007 } else {
12008 if (n & (4 * sizeof(uint8_t))) {
12009 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
12010 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
12011 output += 4;
12012 }
12013 if (n & (2 * sizeof(uint8_t))) {
12014 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
12015 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
12016 output += 2;
12017 }
12018 if (n & (1 * sizeof(uint8_t))) {
12019 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
12020 }
12021 n = 0;
12022 }
12023 } while (n != 0);
12024 }
12025 }
12026
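// The QU8 multiply kernel above widens its products without a dedicated
// widening multiply: _mm_mullo_epi16 and _mm_mulhi_epi16 produce the low and
// high halves of each signed 16-bit product, and the epi16 unpacks interleave
// them back into int32 lanes before the same FP32 scale, round and clamp
// sequence used by the GEMM kernels.
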
12027 void xnn_qu8_vmulc_minmax_fp32_ukernel__avx_mul16_ld64_x16(
12028 size_t n,
12029 const uint8_t* input_a,
12030 const uint8_t* input_b,
12031 uint8_t* output,
12032 const union xnn_qu8_mul_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
12033
12034 {
12035 const __m128i va_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.a_zero_point);
12036 const __m128 vscale = _mm_load_ps(params->fp32_sse2.scale);
12037 const __m128i voutput_zero_point = _mm_load_si128((const __m128i*) params->fp32_sse2.output_zero_point);
12038 const __m128i voutput_min = _mm_load_si128((const __m128i*) params->fp32_sse2.output_min);
12039 const __m128i voutput_max = _mm_load_si128((const __m128i*) params->fp32_sse2.output_max);
12040
12041 __m128i vxb = _mm_sub_epi16(
12042 _mm_shuffle_epi32(_mm_cvtsi32_si128(UINT32_C(0x00010001) * (uint32_t) (uint16_t) (int16_t) *input_b), 0),
12043 _mm_load_si128((const __m128i*) params->fp32_sse2.b_zero_point));
12044 for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
12045 const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
12046 const __m128i va89ABCDEF = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) (input_a + 8)));
12047 input_a += 16;
12048
12049
12050 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
12051 const __m128i vxa89ABCDEF = _mm_sub_epi16(va89ABCDEF, va_zero_point);
12052
12053 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
12054 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
12055 const __m128i vprod89ABCDEFlo = _mm_mullo_epi16(vxa89ABCDEF, vxb);
12056 const __m128i vprod89ABCDEFhi = _mm_mulhi_epi16(vxa89ABCDEF, vxb);
12057
12058 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
12059 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
12060 const __m128i vprod89AB = _mm_unpacklo_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
12061 const __m128i vprodCDEF = _mm_unpackhi_epi16(vprod89ABCDEFlo, vprod89ABCDEFhi);
12062
12063 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
12064 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
12065 __m128 vfpacc89AB = _mm_cvtepi32_ps(vprod89AB);
12066 __m128 vfpaccCDEF = _mm_cvtepi32_ps(vprodCDEF);
12067
12068 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
12069 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
12070 vfpacc89AB = _mm_mul_ps(vfpacc89AB, vscale);
12071 vfpaccCDEF = _mm_mul_ps(vfpaccCDEF, vscale);
12072
12073 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
12074 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
12075 const __m128i vacc89AB = _mm_cvtps_epi32(vfpacc89AB);
12076 const __m128i vaccCDEF = _mm_cvtps_epi32(vfpaccCDEF);
12077
12078 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
12079 __m128i vout89ABCDEF = _mm_adds_epi16(_mm_packs_epi32(vacc89AB, vaccCDEF), voutput_zero_point);
12080
12081
12082 __m128i vout0123456789ABCDEF = _mm_packus_epi16(vout01234567, vout89ABCDEF);
12083
12084 vout0123456789ABCDEF = _mm_max_epu8(vout0123456789ABCDEF, voutput_min);
12085
12086 vout0123456789ABCDEF = _mm_min_epu8(vout0123456789ABCDEF, voutput_max);
12087
12088 _mm_storeu_si128((__m128i*) output, vout0123456789ABCDEF);
12089 output += 16;
12090 }
12091 if XNN_UNLIKELY(n != 0) {
12092 do {
12093 const __m128i va01234567 = _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i*) input_a));
12094 input_a += 8;
12095
12096
12097 const __m128i vxa01234567 = _mm_sub_epi16(va01234567, va_zero_point);
12098
12099 const __m128i vprod01234567lo = _mm_mullo_epi16(vxa01234567, vxb);
12100 const __m128i vprod01234567hi = _mm_mulhi_epi16(vxa01234567, vxb);
12101
12102 const __m128i vprod0123 = _mm_unpacklo_epi16(vprod01234567lo, vprod01234567hi);
12103 const __m128i vprod4567 = _mm_unpackhi_epi16(vprod01234567lo, vprod01234567hi);
12104
12105 __m128 vfpacc0123 = _mm_cvtepi32_ps(vprod0123);
12106 __m128 vfpacc4567 = _mm_cvtepi32_ps(vprod4567);
12107
12108 vfpacc0123 = _mm_mul_ps(vfpacc0123, vscale);
12109 vfpacc4567 = _mm_mul_ps(vfpacc4567, vscale);
12110
12111 const __m128i vacc0123 = _mm_cvtps_epi32(vfpacc0123);
12112 const __m128i vacc4567 = _mm_cvtps_epi32(vfpacc4567);
12113
12114 __m128i vout01234567 = _mm_adds_epi16(_mm_packs_epi32(vacc0123, vacc4567), voutput_zero_point);
12115
12116 __m128i vout0123456701234567 = _mm_packus_epi16(vout01234567, vout01234567);
12117 vout0123456701234567 = _mm_max_epu8(vout0123456701234567, voutput_min);
12118 vout0123456701234567 = _mm_min_epu8(vout0123456701234567, voutput_max);
12119
12120 if XNN_LIKELY(n >= (8 * sizeof(uint8_t))) {
12121 _mm_storel_epi64((__m128i*) output, vout0123456701234567);
12122 output += 8;
12123 n -= 8 * sizeof(uint8_t);
12124 } else {
12125 if (n & (4 * sizeof(uint8_t))) {
12126 unaligned_store_u32(output, (uint32_t) _mm_cvtsi128_si32(vout0123456701234567));
12127 vout0123456701234567 = _mm_srli_epi64(vout0123456701234567, 32);
12128 output += 4;
12129 }
12130 if (n & (2 * sizeof(uint8_t))) {
12131 unaligned_store_u16(output, (uint16_t) _mm_extract_epi16(vout0123456701234567, 0));
12132 vout0123456701234567 = _mm_srli_epi32(vout0123456701234567, 16);
12133 output += 2;
12134 }
12135 if (n & (1 * sizeof(uint8_t))) {
12136 *output = (uint8_t) _mm_extract_epi8(vout0123456701234567, 0);
12137 }
12138 n = 0;
12139 }
12140 } while (n != 0);
12141 }
12142 }
12143
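// VMULC prepares its broadcast operand once: multiplying the scalar uint8
// value by 0x00010001 replicates it into both 16-bit halves of a 32-bit lane,
// _mm_shuffle_epi32(..., 0) spreads that lane across the register, and the
// b zero point is subtracted up front so the loop reuses the same vxb vector
// for every batch of eight elements.
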
12144 void xnn_x8_lut_ukernel__avx_x64(
12145 size_t n,
12146 const uint8_t* x,
12147 uint8_t* y,
12148 const uint8_t t[restrict XNN_MIN_ELEMENTS(256)])
12149 {
12150 assert(n != 0);
12151 assert(x != NULL);
12152 assert(y != NULL);
12153
12154 const __m128i vt0 = _mm_load_si128((const __m128i*) t);
12155 const __m128i vt1 = _mm_load_si128((const __m128i*) (t + 16));
12156 const __m128i vt2 = _mm_load_si128((const __m128i*) (t + 32));
12157 const __m128i vt3 = _mm_load_si128((const __m128i*) (t + 48));
12158 const __m128i vt4 = _mm_load_si128((const __m128i*) (t + 64));
12159 const __m128i vt5 = _mm_load_si128((const __m128i*) (t + 80));
12160 const __m128i vt6 = _mm_load_si128((const __m128i*) (t + 96));
12161 const __m128i vt7 = _mm_load_si128((const __m128i*) (t + 112));
12162 const __m128i vt8 = _mm_load_si128((const __m128i*) (t + 128));
12163 const __m128i vt9 = _mm_load_si128((const __m128i*) (t + 144));
12164 const __m128i vtA = _mm_load_si128((const __m128i*) (t + 160));
12165 const __m128i vtB = _mm_load_si128((const __m128i*) (t + 176));
12166 const __m128i vtC = _mm_load_si128((const __m128i*) (t + 192));
12167 const __m128i vtD = _mm_load_si128((const __m128i*) (t + 208));
12168 const __m128i vtE = _mm_load_si128((const __m128i*) (t + 224));
12169 const __m128i vtF = _mm_load_si128((const __m128i*) (t + 240));
12170
12171 const __m128i vtable0 = vt0;
12172 const __m128i vtable1 = _mm_xor_si128(vt0, vt1);
12173 const __m128i vtable2 = _mm_xor_si128(vt1, vt2);
12174 const __m128i vtable3 = _mm_xor_si128(vt2, vt3);
12175 const __m128i vtable4 = _mm_xor_si128(vt3, vt4);
12176 const __m128i vtable5 = _mm_xor_si128(vt4, vt5);
12177 const __m128i vtable6 = _mm_xor_si128(vt5, vt6);
12178 const __m128i vtable7 = _mm_xor_si128(vt6, vt7);
12179 const __m128i vtable8 = _mm_xor_si128(_mm_xor_si128(vt7, vt8), vtable0);
12180 const __m128i vtable9 = _mm_xor_si128(_mm_xor_si128(vt8, vt9), vtable1);
12181 const __m128i vtableA = _mm_xor_si128(_mm_xor_si128(vt9, vtA), vtable2);
12182 const __m128i vtableB = _mm_xor_si128(_mm_xor_si128(vtA, vtB), vtable3);
12183 const __m128i vtableC = _mm_xor_si128(_mm_xor_si128(vtB, vtC), vtable4);
12184 const __m128i vtableD = _mm_xor_si128(_mm_xor_si128(vtC, vtD), vtable5);
12185 const __m128i vtableE = _mm_xor_si128(_mm_xor_si128(vtD, vtE), vtable6);
12186 const __m128i vtableF = _mm_xor_si128(_mm_xor_si128(vtE, vtF), vtable7);
12187
12188 const __m128i voffset = _mm_set1_epi8(16);
12189 for (; n >= 64 * sizeof(uint8_t); n -= 64 * sizeof(uint8_t)) {
12190 __m128i vx0 = _mm_loadu_si128((const __m128i*) x);
12191 __m128i vx1 = _mm_loadu_si128((const __m128i*) (x + 16));
12192 __m128i vx2 = _mm_loadu_si128((const __m128i*) (x + 32));
12193 __m128i vx3 = _mm_loadu_si128((const __m128i*) (x + 48));
12194 x += 64;
12195
12196 __m128i vy0 = _mm_shuffle_epi8(vtable0, vx0);
12197 __m128i vy1 = _mm_shuffle_epi8(vtable0, vx1);
12198 __m128i vy2 = _mm_shuffle_epi8(vtable0, vx2);
12199 __m128i vy3 = _mm_shuffle_epi8(vtable0, vx3);
12200
12201 vx0 = _mm_sub_epi8(vx0, voffset);
12202 vx1 = _mm_sub_epi8(vx1, voffset);
12203 vx2 = _mm_sub_epi8(vx2, voffset);
12204 vx3 = _mm_sub_epi8(vx3, voffset);
12205 vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable1, vx0));
12206 vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable1, vx1));
12207 vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable1, vx2));
12208 vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable1, vx3));
12209 vx0 = _mm_sub_epi8(vx0, voffset);
12210 vx1 = _mm_sub_epi8(vx1, voffset);
12211 vx2 = _mm_sub_epi8(vx2, voffset);
12212 vx3 = _mm_sub_epi8(vx3, voffset);
12213 vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable2, vx0));
12214 vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable2, vx1));
12215 vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable2, vx2));
12216 vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable2, vx3));
12217 vx0 = _mm_sub_epi8(vx0, voffset);
12218 vx1 = _mm_sub_epi8(vx1, voffset);
12219 vx2 = _mm_sub_epi8(vx2, voffset);
12220 vx3 = _mm_sub_epi8(vx3, voffset);
12221 vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable3, vx0));
12222 vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable3, vx1));
12223 vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable3, vx2));
12224 vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable3, vx3));
12225 vx0 = _mm_sub_epi8(vx0, voffset);
12226 vx1 = _mm_sub_epi8(vx1, voffset);
12227 vx2 = _mm_sub_epi8(vx2, voffset);
12228 vx3 = _mm_sub_epi8(vx3, voffset);
12229 vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable4, vx0));
12230 vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable4, vx1));
12231 vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable4, vx2));
12232 vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable4, vx3));
12233 vx0 = _mm_sub_epi8(vx0, voffset);
12234 vx1 = _mm_sub_epi8(vx1, voffset);
12235 vx2 = _mm_sub_epi8(vx2, voffset);
12236 vx3 = _mm_sub_epi8(vx3, voffset);
12237 vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable5, vx0));
12238 vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable5, vx1));
12239 vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable5, vx2));
12240 vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable5, vx3));
12241 vx0 = _mm_sub_epi8(vx0, voffset);
12242 vx1 = _mm_sub_epi8(vx1, voffset);
12243 vx2 = _mm_sub_epi8(vx2, voffset);
12244 vx3 = _mm_sub_epi8(vx3, voffset);
12245 vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable6, vx0));
12246 vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable6, vx1));
12247 vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable6, vx2));
12248 vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable6, vx3));
12249 vx0 = _mm_sub_epi8(vx0, voffset);
12250 vx1 = _mm_sub_epi8(vx1, voffset);
12251 vx2 = _mm_sub_epi8(vx2, voffset);
12252 vx3 = _mm_sub_epi8(vx3, voffset);
12253 vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable7, vx0));
12254 vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable7, vx1));
12255 vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable7, vx2));
12256 vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable7, vx3));
12257 vx0 = _mm_sub_epi8(vx0, voffset);
12258 vx1 = _mm_sub_epi8(vx1, voffset);
12259 vx2 = _mm_sub_epi8(vx2, voffset);
12260 vx3 = _mm_sub_epi8(vx3, voffset);
12261 vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable8, vx0));
12262 vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable8, vx1));
12263 vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable8, vx2));
12264 vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable8, vx3));
12265
12266 vx0 = _mm_subs_epi8(vx0, voffset);
12267 vx1 = _mm_subs_epi8(vx1, voffset);
12268 vx2 = _mm_subs_epi8(vx2, voffset);
12269 vx3 = _mm_subs_epi8(vx3, voffset);
12270 vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtable9, vx0));
12271 vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtable9, vx1));
12272 vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtable9, vx2));
12273 vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtable9, vx3));
12274 vx0 = _mm_subs_epi8(vx0, voffset);
12275 vx1 = _mm_subs_epi8(vx1, voffset);
12276 vx2 = _mm_subs_epi8(vx2, voffset);
12277 vx3 = _mm_subs_epi8(vx3, voffset);
12278 vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableA, vx0));
12279 vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableA, vx1));
12280 vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableA, vx2));
12281 vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableA, vx3));
12282 vx0 = _mm_subs_epi8(vx0, voffset);
12283 vx1 = _mm_subs_epi8(vx1, voffset);
12284 vx2 = _mm_subs_epi8(vx2, voffset);
12285 vx3 = _mm_subs_epi8(vx3, voffset);
12286 vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableB, vx0));
12287 vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableB, vx1));
12288 vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableB, vx2));
12289 vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableB, vx3));
12290 vx0 = _mm_subs_epi8(vx0, voffset);
12291 vx1 = _mm_subs_epi8(vx1, voffset);
12292 vx2 = _mm_subs_epi8(vx2, voffset);
12293 vx3 = _mm_subs_epi8(vx3, voffset);
12294 vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableC, vx0));
12295 vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableC, vx1));
12296 vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableC, vx2));
12297 vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableC, vx3));
12298 vx0 = _mm_subs_epi8(vx0, voffset);
12299 vx1 = _mm_subs_epi8(vx1, voffset);
12300 vx2 = _mm_subs_epi8(vx2, voffset);
12301 vx3 = _mm_subs_epi8(vx3, voffset);
12302 vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableD, vx0));
12303 vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableD, vx1));
12304 vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableD, vx2));
12305 vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableD, vx3));
12306 vx0 = _mm_subs_epi8(vx0, voffset);
12307 vx1 = _mm_subs_epi8(vx1, voffset);
12308 vx2 = _mm_subs_epi8(vx2, voffset);
12309 vx3 = _mm_subs_epi8(vx3, voffset);
12310 vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableE, vx0));
12311 vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableE, vx1));
12312 vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableE, vx2));
12313 vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableE, vx3));
12314 vx0 = _mm_subs_epi8(vx0, voffset);
12315 vx1 = _mm_subs_epi8(vx1, voffset);
12316 vx2 = _mm_subs_epi8(vx2, voffset);
12317 vx3 = _mm_subs_epi8(vx3, voffset);
12318 vy0 = _mm_xor_si128(vy0, _mm_shuffle_epi8(vtableF, vx0));
12319 vy1 = _mm_xor_si128(vy1, _mm_shuffle_epi8(vtableF, vx1));
12320 vy2 = _mm_xor_si128(vy2, _mm_shuffle_epi8(vtableF, vx2));
12321 vy3 = _mm_xor_si128(vy3, _mm_shuffle_epi8(vtableF, vx3));
12322
12323 _mm_storeu_si128((__m128i*) y, vy0);
12324 _mm_storeu_si128((__m128i*) (y + 16), vy1);
12325 _mm_storeu_si128((__m128i*) (y + 32), vy2);
12326 _mm_storeu_si128((__m128i*) (y + 48), vy3);
12327 y += 64;
12328 }
12329 for (; n >= 16 * sizeof(uint8_t); n -= 16 * sizeof(uint8_t)) {
12330 __m128i vx = _mm_loadu_si128((const __m128i*) x);
12331 x += 16;
12332
12333 __m128i vy = _mm_shuffle_epi8(vtable0, vx);
12334
12335 vx = _mm_sub_epi8(vx, voffset);
12336 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
12337 vx = _mm_sub_epi8(vx, voffset);
12338 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
12339 vx = _mm_sub_epi8(vx, voffset);
12340 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
12341 vx = _mm_sub_epi8(vx, voffset);
12342 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
12343 vx = _mm_sub_epi8(vx, voffset);
12344 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
12345 vx = _mm_sub_epi8(vx, voffset);
12346 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
12347 vx = _mm_sub_epi8(vx, voffset);
12348 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
12349 vx = _mm_sub_epi8(vx, voffset);
12350 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));
12351
12352 vx = _mm_subs_epi8(vx, voffset);
12353 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
12354 vx = _mm_subs_epi8(vx, voffset);
12355 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
12356 vx = _mm_subs_epi8(vx, voffset);
12357 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
12358 vx = _mm_subs_epi8(vx, voffset);
12359 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
12360 vx = _mm_subs_epi8(vx, voffset);
12361 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
12362 vx = _mm_subs_epi8(vx, voffset);
12363 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
12364 vx = _mm_subs_epi8(vx, voffset);
12365 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));
12366
12367 _mm_storeu_si128((__m128i*) y, vy);
12368 y += 16;
12369 }
12370 if XNN_UNLIKELY(n != 0) {
12371 __m128i vx = _mm_loadu_si128((const __m128i*) x);
12372
12373 __m128i vy = _mm_shuffle_epi8(vtable0, vx);
12374
12375 vx = _mm_sub_epi8(vx, voffset);
12376 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable1, vx));
12377 vx = _mm_sub_epi8(vx, voffset);
12378 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable2, vx));
12379 vx = _mm_sub_epi8(vx, voffset);
12380 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable3, vx));
12381 vx = _mm_sub_epi8(vx, voffset);
12382 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable4, vx));
12383 vx = _mm_sub_epi8(vx, voffset);
12384 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable5, vx));
12385 vx = _mm_sub_epi8(vx, voffset);
12386 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable6, vx));
12387 vx = _mm_sub_epi8(vx, voffset);
12388 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable7, vx));
12389 vx = _mm_sub_epi8(vx, voffset);
12390 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable8, vx));
12391
12392 vx = _mm_subs_epi8(vx, voffset);
12393 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtable9, vx));
12394 vx = _mm_subs_epi8(vx, voffset);
12395 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableA, vx));
12396 vx = _mm_subs_epi8(vx, voffset);
12397 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableB, vx));
12398 vx = _mm_subs_epi8(vx, voffset);
12399 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableC, vx));
12400 vx = _mm_subs_epi8(vx, voffset);
12401 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableD, vx));
12402 vx = _mm_subs_epi8(vx, voffset);
12403 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableE, vx));
12404 vx = _mm_subs_epi8(vx, voffset);
12405 vy = _mm_xor_si128(vy, _mm_shuffle_epi8(vtableF, vx));
12406
12407 if (n & (8 * sizeof(uint8_t))) {
12408 _mm_storel_epi64((__m128i*) y, vy);
12409 vy = _mm_unpackhi_epi64(vy, vy);
12410 y += 8;
12411 }
12412 if (n & (4 * sizeof(uint8_t))) {
12413 _mm_storeu_si32(y, vy);
12414 vy = _mm_srli_epi64(vy, 32);
12415 y += 4;
12416 }
12417 if (n & (2 * sizeof(uint8_t))) {
12418 _mm_storeu_si16(y, vy);
12419 vy = _mm_srli_epi32(vy, 16);
12420 y += 2;
12421 }
12422 if (n & (1 * sizeof(uint8_t))) {
12423 *y = (uint8_t) _mm_extract_epi8(vy, 0);
12424 }
12425 }
12426 }
12427
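// Note on the 256-entry LUT kernel above: the table is split into sixteen
// 16-byte chunks, and each vtableK holds the XOR of neighbouring chunks (the
// chunks from index 8 onward are additionally folded with vtable0..vtable7).
// _mm_shuffle_epi8 returns zero for any index byte whose high bit is set, so
// as the index vector is repeatedly decremented by 16, each input byte selects
// a short run of consecutive vtableK entries whose XOR telescopes back to
// t[x]. Switching from _mm_sub_epi8 to the saturating _mm_subs_epi8 for the
// second half keeps already-consumed indices pinned below zero instead of
// wrapping back into the selectable range.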