1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <assert.h>
7
8 #include <immintrin.h>
9
10 #include <xnnpack/avgpool.h>
11 #include <xnnpack/common.h>
12 #include <xnnpack/gavgpool.h>
13 #include <xnnpack/intrinsics-polyfill.h>
14 #include <xnnpack/math.h>
15 #include <xnnpack/maxpool.h>
16 #include <xnnpack/prelu.h>
17 #include <xnnpack/rmax.h>
18 #include <xnnpack/vbinary.h>
19 #include <xnnpack/vcvt.h>
20 #include <xnnpack/vunary.h>
21
22
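// All kernels below use the F16C extension: FP16 values are widened to FP32 with
// _mm256_cvtph_ps, arithmetic is performed in FP32, and results are narrowed back
// with _mm256_cvtps_ph. Intermediate sums are re-rounded to FP16 after each step so
// that results match native FP16 accumulation.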
23 void xnn_f16_avgpool_minmax_ukernel_9p8x__f16c_c8(
24 size_t output_pixels,
25 size_t kernel_elements,
26 size_t channels,
27 const void** input,
28 size_t input_offset,
29 const void* zero,
30 void* buffer,
31 void* output,
32 size_t input_increment,
33 size_t output_increment,
34 const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
35 {
36 assert(output_pixels != 0);
37 assert(kernel_elements > 9);
38 assert(channels != 0);
39
40 const __m256 vscale = _mm256_load_ps(params->avx.scale);
41 const __m256 vmin = _mm256_load_ps(params->avx.min);
42 const __m256 vmax = _mm256_load_ps(params->avx.max);
43
44 uint16_t* o = (uint16_t*) output;
45 do {
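// First pass: sum the first 9 pooling elements of this output pixel into the
// FP16 scratch buffer, 8 channels per iteration.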
46 {
47 const uint16_t* i0 = *input++;
48 assert(i0 != NULL);
49 if XNN_UNPREDICTABLE(i0 != zero) {
50 i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
51 }
52 const uint16_t* i1 = *input++;
53 assert(i1 != NULL);
54 if XNN_UNPREDICTABLE(i1 != zero) {
55 i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
56 }
57 const uint16_t* i2 = *input++;
58 assert(i2 != NULL);
59 if XNN_UNPREDICTABLE(i2 != zero) {
60 i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
61 }
62 const uint16_t* i3 = *input++;
63 assert(i3 != NULL);
64 if XNN_UNPREDICTABLE(i3 != zero) {
65 i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
66 }
67 const uint16_t* i4 = *input++;
68 assert(i4 != NULL);
69 if XNN_UNPREDICTABLE(i4 != zero) {
70 i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
71 }
72 const uint16_t* i5 = *input++;
73 assert(i5 != NULL);
74 if XNN_UNPREDICTABLE(i5 != zero) {
75 i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
76 }
77 const uint16_t* i6 = *input++;
78 assert(i6 != NULL);
79 if XNN_UNPREDICTABLE(i6 != zero) {
80 i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
81 }
82 const uint16_t* i7 = *input++;
83 assert(i7 != NULL);
84 if XNN_UNPREDICTABLE(i7 != zero) {
85 i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
86 }
87 const uint16_t* i8 = *input++;
88 assert(i8 != NULL);
89 if XNN_UNPREDICTABLE(i8 != zero) {
90 i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
91 }
92
93 uint16_t* b = (uint16_t*) buffer;
94 for (size_t c = 0; c < channels; c += 8) {
95 const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
96 i0 += 8;
97 const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
98 i1 += 8;
99 const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
100 i2 += 8;
101 const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
102 i3 += 8;
103 const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
104 i4 += 8;
105 const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
106 i5 += 8;
107 const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
108 i6 += 8;
109 const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
110 i7 += 8;
111 const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
112 i8 += 8;
113
114 const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
115 const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
116 const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
117 const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
118 const __m256 vsum018 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vi8), _MM_FROUND_NO_EXC));
119 const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
120 const __m256 vsum01678 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum018, vsum67), _MM_FROUND_NO_EXC));
121 const __m128i vsum = _mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum01678), _MM_FROUND_NO_EXC);
122
123 _mm_storeu_si128((__m128i*) b, vsum);
124 b += 8;
125 }
126 }
127
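// Intermediate passes: accumulate 8 more pooling elements per pass into the buffer.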
128 size_t k = kernel_elements;
129 for (k -= 9; k > 8; k -= 8) {
130 const uint16_t* i0 = (const uint16_t*) *input++;
131 assert(i0 != NULL);
132 if XNN_UNPREDICTABLE(i0 != zero) {
133 i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
134 }
135 const uint16_t* i1 = (const uint16_t*) *input++;
136 assert(i1 != NULL);
137 if XNN_UNPREDICTABLE(i1 != zero) {
138 i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
139 }
140 const uint16_t* i2 = (const uint16_t*) *input++;
141 assert(i2 != NULL);
142 if XNN_UNPREDICTABLE(i2 != zero) {
143 i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
144 }
145 const uint16_t* i3 = (const uint16_t*) *input++;
146 assert(i3 != NULL);
147 if XNN_UNPREDICTABLE(i3 != zero) {
148 i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
149 }
150 const uint16_t* i4 = (const uint16_t*) *input++;
151 assert(i4 != NULL);
152 if XNN_UNPREDICTABLE(i4 != zero) {
153 i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
154 }
155 const uint16_t* i5 = (const uint16_t*) *input++;
156 assert(i5 != NULL);
157 if XNN_UNPREDICTABLE(i5 != zero) {
158 i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
159 }
160 const uint16_t* i6 = (const uint16_t*) *input++;
161 assert(i6 != NULL);
162 if XNN_UNPREDICTABLE(i6 != zero) {
163 i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
164 }
165 const uint16_t* i7 = (const uint16_t*) *input++;
166 assert(i7 != NULL);
167 if XNN_UNPREDICTABLE(i7 != zero) {
168 i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
169 }
170
171 uint16_t* b = (uint16_t*) buffer;
172 for (size_t c = 0; c < channels; c += 8) {
173 const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
174 i0 += 8;
175 const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
176 i1 += 8;
177 const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
178 i2 += 8;
179 const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
180 i3 += 8;
181 const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
182 i4 += 8;
183 const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
184 i5 += 8;
185 const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
186 i6 += 8;
187 const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
188 i7 += 8;
189 const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
190
191 const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
192 const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
193 const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
194 const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
195 const __m256 vsum01a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vacc), _MM_FROUND_NO_EXC));
196 const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
197 const __m256 vsum0167a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01a, vsum67), _MM_FROUND_NO_EXC));
198 const __m128i vsum = _mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum0167a), _MM_FROUND_NO_EXC);
199
200 _mm_storeu_si128((__m128i*) b, vsum);
201 b += 8;
202 }
203 }
204
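// Last pass: add the remaining 1-8 pooling elements to the buffered sums, then
// apply the averaging scale and clamp to [min, max].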
205 assert(k >= 1);
206 {
207 const uint16_t* i0 = (const uint16_t*) input[0];
208 assert(i0 != NULL);
209 const uint16_t* i1 = (const uint16_t*) input[1];
210 const uint16_t* i2 = (const uint16_t*) input[2];
211 const uint16_t* i3 = (const uint16_t*) input[3];
212 const uint16_t* i4 = (const uint16_t*) input[4];
213 const uint16_t* i5 = (const uint16_t*) input[5];
214 const uint16_t* i6 = (const uint16_t*) input[6];
215 const uint16_t* i7 = (const uint16_t*) input[7];
216 input = (const void**) ((uintptr_t) input + input_increment);
217 if (k < 2) {
218 i1 = (const uint16_t*) zero;
219 }
220 assert(i1 != NULL);
221 if (k <= 2) {
222 i2 = (const uint16_t*) zero;
223 }
224 assert(i2 != NULL);
225 if (k < 4) {
226 i3 = (const uint16_t*) zero;
227 }
228 assert(i3 != NULL);
229 if (k <= 4) {
230 i4 = (const uint16_t*) zero;
231 }
232 assert(i4 != NULL);
233 if (k < 6) {
234 i5 = (const uint16_t*) zero;
235 }
236 assert(i5 != NULL);
237 if (k <= 6) {
238 i6 = (const uint16_t*) zero;
239 }
240 assert(i6 != NULL);
241 if (k < 8) {
242 i7 = (const uint16_t*) zero;
243 }
244 assert(i7 != NULL);
245 if XNN_UNPREDICTABLE(i0 != zero) {
246 i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
247 }
248 if XNN_UNPREDICTABLE(i1 != zero) {
249 i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
250 }
251 if XNN_UNPREDICTABLE(i2 != zero) {
252 i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
253 }
254 if XNN_UNPREDICTABLE(i3 != zero) {
255 i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
256 }
257 if XNN_UNPREDICTABLE(i4 != zero) {
258 i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
259 }
260 if XNN_UNPREDICTABLE(i5 != zero) {
261 i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
262 }
263 if XNN_UNPREDICTABLE(i6 != zero) {
264 i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
265 }
266 if XNN_UNPREDICTABLE(i7 != zero) {
267 i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
268 }
269
270 size_t c = channels;
271 uint16_t* b = (uint16_t*) buffer;
272 while (c >= 8) {
273 const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
274 i0 += 8;
275 const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
276 i1 += 8;
277 const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
278 i2 += 8;
279 const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
280 i3 += 8;
281 const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
282 i4 += 8;
283 const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
284 i5 += 8;
285 const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
286 i6 += 8;
287 const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
288 i7 += 8;
289 const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
290 b += 8;
291
292 const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
293 const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
294 const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
295 const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
296 const __m256 vsum01a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vacc), _MM_FROUND_NO_EXC));
297 const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
298 const __m256 vsum0167a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01a, vsum67), _MM_FROUND_NO_EXC));
299 const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum0167a), _MM_FROUND_NO_EXC));
300
301 __m256 vout = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vsum, vscale), _MM_FROUND_NO_EXC));
302 vout = _mm256_max_ps(vout, vmin);
303 vout = _mm256_min_ps(vout, vmax);
304
305 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
306 o += 8;
307
308 c -= 8;
309 }
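// Channel tail: write the remaining 1-7 FP16 outputs in 4/2/1-element chunks.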
310 if (c != 0) {
311 const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
312 const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
313 const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
314 const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
315 const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
316 const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
317 const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
318 const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
319 const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
320
321 const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
322 const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
323 const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
324 const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
325 const __m256 vsum01a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vacc), _MM_FROUND_NO_EXC));
326 const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
327 const __m256 vsum0167a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01a, vsum67), _MM_FROUND_NO_EXC));
328 const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum0167a), _MM_FROUND_NO_EXC));
329
330 __m256 vout = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vsum, vscale), _MM_FROUND_NO_EXC));
331 vout = _mm256_max_ps(vout, vmin);
332 vout = _mm256_min_ps(vout, vmax);
333
334 __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
335 if (c & 4) {
336 _mm_storel_epi64((__m128i*) o, vh);
337 vh = _mm_unpackhi_epi64(vh, vh);
338 o += 4;
339 }
340 if (c & 2) {
341 _mm_storeu_si32(o, vh);
342 vh = _mm_srli_epi64(vh, 32);
343 o += 2;
344 }
345 if (c & 1) {
346 *o = (uint16_t) _mm_extract_epi16(vh, 0);
347 o += 1;
348 }
349 }
350 }
351 o = (uint16_t*) ((uintptr_t) o + output_increment);
352 } while (--output_pixels != 0);
353 }
354
355 void xnn_f16_avgpool_minmax_ukernel_9x__f16c_c8(
356 size_t output_pixels,
357 size_t kernel_elements,
358 size_t channels,
359 const void** input,
360 size_t input_offset,
361 const void* zero,
362 void* output,
363 size_t input_increment,
364 size_t output_increment,
365 const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
366 {
367 assert(output_pixels != 0);
368 assert(kernel_elements != 0);
369 assert(kernel_elements <= 9);
370 assert(channels != 0);
371
372 const __m256 vscale = _mm256_load_ps(params->avx.scale);
373 const __m256 vmin = _mm256_load_ps(params->avx.min);
374 const __m256 vmax = _mm256_load_ps(params->avx.max);
375
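// Unipass kernel: kernel_elements <= 9, so each output pixel is averaged directly
// without a scratch buffer; unused element pointers are redirected to the zero buffer.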
376 uint16_t* o = (uint16_t*) output;
377 do {
378 const uint16_t* i0 = (const uint16_t*) input[0];
379 assert(i0 != NULL);
380 const uint16_t* i1 = (const uint16_t*) input[1];
381 const uint16_t* i2 = (const uint16_t*) input[2];
382 const uint16_t* i3 = (const uint16_t*) input[3];
383 const uint16_t* i4 = (const uint16_t*) input[4];
384 const uint16_t* i5 = (const uint16_t*) input[5];
385 const uint16_t* i6 = (const uint16_t*) input[6];
386 const uint16_t* i7 = (const uint16_t*) input[7];
387 const uint16_t* i8 = (const uint16_t*) input[8];
388 input = (const void**) ((uintptr_t) input + input_increment);
389 if (kernel_elements < 2) {
390 i1 = (const uint16_t*) zero;
391 }
392 assert(i1 != NULL);
393 if (kernel_elements <= 2) {
394 i2 = (const uint16_t*) zero;
395 }
396 assert(i2 != NULL);
397 if (kernel_elements < 4) {
398 i3 = (const uint16_t*) zero;
399 }
400 assert(i3 != NULL);
401 if (kernel_elements <= 4) {
402 i4 = (const uint16_t*) zero;
403 }
404 assert(i4 != NULL);
405 if (kernel_elements < 6) {
406 i5 = (const uint16_t*) zero;
407 }
408 assert(i5 != NULL);
409 if (kernel_elements <= 6) {
410 i6 = (const uint16_t*) zero;
411 }
412 assert(i6 != NULL);
413 if (kernel_elements < 8) {
414 i7 = (const uint16_t*) zero;
415 }
416 assert(i7 != NULL);
417 if (kernel_elements <= 8) {
418 i8 = (const uint16_t*) zero;
419 }
420 assert(i8 != NULL);
421 if XNN_UNPREDICTABLE(i0 != zero) {
422 i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
423 }
424 if XNN_UNPREDICTABLE(i1 != zero) {
425 i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
426 }
427 if XNN_UNPREDICTABLE(i2 != zero) {
428 i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
429 }
430 if XNN_UNPREDICTABLE(i3 != zero) {
431 i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
432 }
433 if XNN_UNPREDICTABLE(i4 != zero) {
434 i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
435 }
436 if XNN_UNPREDICTABLE(i5 != zero) {
437 i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
438 }
439 if XNN_UNPREDICTABLE(i6 != zero) {
440 i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
441 }
442 if XNN_UNPREDICTABLE(i7 != zero) {
443 i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
444 }
445 if XNN_UNPREDICTABLE(i8 != zero) {
446 i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
447 }
448
449 size_t c = channels;
450 while (c >= 8) {
451 const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
452 i0 += 8;
453 const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
454 i1 += 8;
455 const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
456 i2 += 8;
457 const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
458 i3 += 8;
459 const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
460 i4 += 8;
461 const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
462 i5 += 8;
463 const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
464 i6 += 8;
465 const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
466 i7 += 8;
467 const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
468 i8 += 8;
469
470 const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
471 const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
472 const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
473 const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
474 const __m256 vsum018 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vi8), _MM_FROUND_NO_EXC));
475 const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
476 const __m256 vsum01678 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum018, vsum67), _MM_FROUND_NO_EXC));
477 const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum01678), _MM_FROUND_NO_EXC));
478
479 __m256 vout = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vsum, vscale), _MM_FROUND_NO_EXC));
480 vout = _mm256_max_ps(vout, vmin);
481 vout = _mm256_min_ps(vout, vmax);
482
483 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
484 o += 8;
485
486 c -= 8;
487 }
488 if (c != 0) {
489 const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
490 const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
491 const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
492 const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
493 const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
494 const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
495 const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
496 const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
497 const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
498
499 const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
500 const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
501 const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
502 const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
503 const __m256 vsum018 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vi8), _MM_FROUND_NO_EXC));
504 const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
505 const __m256 vsum01678 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum018, vsum67), _MM_FROUND_NO_EXC));
506 const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum01678), _MM_FROUND_NO_EXC));
507
508 __m256 vout = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vsum, vscale), _MM_FROUND_NO_EXC));
509 vout = _mm256_max_ps(vout, vmin);
510 vout = _mm256_min_ps(vout, vmax);
511
512 __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
513 if (c & 4) {
514 _mm_storel_epi64((__m128i*) o, vh);
515 vh = _mm_unpackhi_epi64(vh, vh);
516 o += 4;
517 }
518 if (c & 2) {
519 _mm_storeu_si32(o, vh);
520 vh = _mm_srli_epi64(vh, 32);
521 o += 2;
522 }
523 if (c & 1) {
524 *o = (uint16_t) _mm_extract_epi16(vh, 0);
525 o += 1;
526 }
527 }
528 o = (uint16_t*) ((uintptr_t) o + output_increment);
529 } while (--output_pixels != 0);
530 }
531
532 void xnn_f16_f32_vcvt_ukernel__f16c_x16(
533 size_t n,
534 const void* input,
535 float* output,
536 const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
537 {
538 assert(n != 0);
539 assert(n % sizeof(uint16_t) == 0);
540 assert(input != NULL);
541 assert(output != NULL);
542
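// Convert 16 FP16 values per iteration, then 8, then handle a 1-7 element tail.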
543 const uint16_t* i = (const uint16_t*) input;
544 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
545 const __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
546 const __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
547 i += 16;
548
549 _mm256_storeu_ps(output, vacc0);
550 _mm256_storeu_ps(output + 8, vacc1);
551 output += 16;
552 }
553 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
554 const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
555 i += 8;
556
557 _mm256_storeu_ps(output, vacc);
558 output += 8;
559 }
560 if XNN_UNLIKELY(n != 0) {
561 assert(n >= 1 * sizeof(uint16_t));
562 assert(n <= 7 * sizeof(uint16_t));
563 const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
564
565 __m128 vacc_lo = _mm256_castps256_ps128(vacc);
566 if (n & (4 * sizeof(uint16_t))) {
567 _mm_storeu_ps(output, vacc_lo);
568 vacc_lo = _mm256_extractf128_ps(vacc, 1);
569 output += 4;
570 }
571 if (n & (2 * sizeof(uint16_t))) {
572 _mm_storel_pi((__m64*) output, vacc_lo);
573 vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo);
574 output += 2;
575 }
576 if (n & (1 * sizeof(uint16_t))) {
577 _mm_store_ss(output, vacc_lo);
578 }
579 }
580 }
581
582 void xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8(
583 size_t rows,
584 size_t channels,
585 const void* input,
586 size_t input_stride,
587 const void* zero,
588 void* buffer,
589 void* output,
590 const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
591 {
592 assert(rows > 7);
593 assert(channels != 0);
594
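// Multipass global average pooling: 7 input rows are summed into the FP16 buffer per
// pass; the final 1-7 rows are added, scaled, and clamped below.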
595 const uint16_t* i0 = input;
596 const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
597 const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
598 const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
599 const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
600 const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
601 const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
602 const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t);
603
604 uint16_t* b = buffer;
605 size_t c = channels;
606 for (; c != 0; c = doz(c, 8)) {
607 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
608 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
609
610 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
611 __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
612
613 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
614 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
615 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
616 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
617 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
618 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
619 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
620 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
621 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
622
623 _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
624 }
625
626 for (rows -= 7; rows > 7; rows -= 7) {
627 i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
628 i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
629 i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
630 i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
631 i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
632 i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
633 i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
634
635 uint16_t* b = buffer;
636 size_t c = channels;
637 for (; c != 0; c = doz(c, 8)) {
638 __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) b);
639
640 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
641
642 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
643 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
644 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
645 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
646 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
647 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
648 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
649 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
650 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
651 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
652 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
653 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
654 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
655
656 _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
657 }
658 }
659
660 i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
661 i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
662 if XNN_UNPREDICTABLE(rows < 2) {
663 i1 = (const uint16_t*) zero;
664 }
665 i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
666 if XNN_UNPREDICTABLE(rows <= 2) {
667 i2 = (const uint16_t*) zero;
668 }
669 i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
670 if XNN_UNPREDICTABLE(rows < 4) {
671 i3 = (const uint16_t*) zero;
672 }
673 i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
674 if XNN_UNPREDICTABLE(rows <= 4) {
675 i4 = (const uint16_t*) zero;
676 }
677 i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
678 if XNN_UNPREDICTABLE(rows < 6) {
679 i5 = (const uint16_t*) zero;
680 }
681 i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
682 if XNN_UNPREDICTABLE(rows <= 6) {
683 i6 = (const uint16_t*) zero;
684 }
685 uint16_t* o = (uint16_t*) output;
686
687 const __m256 vscale = _mm256_load_ps(params->avx.scale);
688 const __m256 vmin = _mm256_load_ps(params->avx.min);
689 const __m256 vmax = _mm256_load_ps(params->avx.max);
690 for (; channels >= 8; channels -= 8) {
691 __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
692
693 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
694
695 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
696 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
697 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
698 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
699 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
700 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
701 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
702 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
703 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
704 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
705 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
706 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
707 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
708
709 vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
710
711 __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
712
713 vout01234567 = _mm256_min_ps(vout01234567, vmax);
714
715 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC));
716 o += 8;
717 }
718 if XNN_UNLIKELY(channels != 0) {
719 {
720 __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
721
722 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
723 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
724 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
725 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
726 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
727 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
728 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
729 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
730 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
731 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
732 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
733 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
734 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
735 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
736
737 vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
738 __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
739 vout01234567 = _mm256_min_ps(vout01234567, vmax);
740
741 __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC);
742 if (channels & 4) {
743 _mm_storel_epi64((__m128i*) o, vh01234567);
744 o += 4;
745 vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
746 }
747 if (channels & 2) {
748 _mm_storeu_si32(o, vh01234567);
749 o += 2;
750 vh01234567 = _mm_srli_epi64(vh01234567, 32);
751 }
752 if (channels & 1) {
753 *o = (uint16_t) _mm_extract_epi16(vh01234567, 0);
754 }
755 }
756 }
757 }
758
759 void xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8(
760 size_t rows,
761 size_t channels,
762 const void* input,
763 size_t input_stride,
764 const void* zero,
765 void* output,
766 const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
767 {
768 assert(rows != 0);
769 assert(rows <= 7);
770 assert(channels != 0);
771
772 const uint16_t* i0 = input;
773 const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
774 if XNN_UNPREDICTABLE(rows < 2) {
775 i1 = (const uint16_t*) zero;
776 }
777 const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
778 if XNN_UNPREDICTABLE(rows <= 2) {
779 i2 = (const uint16_t*) zero;
780 }
781 const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
782 if XNN_UNPREDICTABLE(rows < 4) {
783 i3 = (const uint16_t*) zero;
784 }
785 const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
786 if XNN_UNPREDICTABLE(rows <= 4) {
787 i4 = (const uint16_t*) zero;
788 }
789 const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
790 if XNN_UNPREDICTABLE(rows < 6) {
791 i5 = (const uint16_t*) zero;
792 }
793 const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
794 if XNN_UNPREDICTABLE(rows <= 6) {
795 i6 = (const uint16_t*) zero;
796 }
797 uint16_t* o = (uint16_t*) output;
798
799 const __m256 vscale = _mm256_load_ps(params->avx.scale);
800 const __m256 vmin = _mm256_load_ps(params->avx.min);
801 const __m256 vmax = _mm256_load_ps(params->avx.max);
802 for (; channels >= 8; channels -= 8) {
803 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
804 i0 += 8;
805 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
806 i1 += 8;
807
808 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
809 __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
810 i2 += 8;
811
812 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
813 i3 += 8;
814 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
815 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
816 i4 += 8;
817 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
818 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
819 i5 += 8;
820 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
821 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
822 i6 += 8;
823 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
824 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
825
826 vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
827
828 __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
829
830 vout01234567 = _mm256_min_ps(vout01234567, vmax);
831
832 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC));
833 o += 8;
834 }
835 if XNN_UNLIKELY(channels != 0) {
836 {
837 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
838 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
839
840 const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
841 __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
842
843 const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
844 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
845 const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
846 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
847 const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
848 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
849 const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
850 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
851 vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
852
853 vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
854 __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
855 vout01234567 = _mm256_min_ps(vout01234567, vmax);
856
857 __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC);
858 if (channels & 4) {
859 _mm_storel_epi64((__m128i*) o, vh01234567);
860 o += 4;
861 vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
862 }
863 if (channels & 2) {
864 _mm_storeu_si32(o, vh01234567);
865 o += 2;
866 vh01234567 = _mm_srli_epi64(vh01234567, 32);
867 }
868 if (channels & 1) {
869 *o = (uint16_t) _mm_extract_epi16(vh01234567, 0);
870 }
871 }
872 }
873 }
874
875 void xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8(
876 size_t output_pixels,
877 size_t kernel_elements,
878 size_t channels,
879 const void** input,
880 size_t input_offset,
881 void* output,
882 size_t input_increment,
883 size_t output_increment,
884 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
885 {
886 assert(output_pixels != 0);
887 assert(kernel_elements != 0);
888 assert(channels != 0);
889
890 const __m256 voutput_min = _mm256_load_ps(params->avx.min);
891 const __m256 voutput_max = _mm256_load_ps(params->avx.max);
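// The first pass takes the maximum over the first 9 pooling elements; subsequent
// passes fold 8 more elements into the partial maxima already stored in the output row.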
892 do {
893 uint16_t* o = output;
894 {
895 const uint16_t* i0 = *input++;
896 const uint16_t* i1 = *input++;
897 const uint16_t* i2 = *input++;
898 const uint16_t* i3 = *input++;
899 const uint16_t* i4 = *input++;
900 const uint16_t* i5 = *input++;
901 const uint16_t* i6 = *input++;
902 const uint16_t* i7 = *input++;
903 const uint16_t* i8 = *input++;
904 i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
905 i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
906 i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
907 i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
908 i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
909 i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
910 i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
911 i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
912 i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
913 if (kernel_elements < 2) {
914 i1 = i0;
915 }
916 if (kernel_elements <= 2) {
917 i2 = i0;
918 }
919 if (kernel_elements < 4) {
920 i3 = i0;
921 }
922 if (kernel_elements <= 4) {
923 i4 = i0;
924 }
925 if (kernel_elements < 6) {
926 i5 = i0;
927 }
928 if (kernel_elements <= 6) {
929 i6 = i0;
930 }
931 if (kernel_elements < 8) {
932 i7 = i0;
933 }
934 if (kernel_elements <= 8) {
935 i8 = i0;
936 }
937
938 size_t c = channels;
939 for (; c >= 8; c -= 8) {
940 const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
941 i0 += 8;
942 const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
943 i1 += 8;
944 const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
945 i2 += 8;
946 const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
947 i3 += 8;
948 const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
949 i4 += 8;
950 const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
951 i5 += 8;
952 const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
953 i6 += 8;
954 const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
955 i7 += 8;
956 const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
957 i8 += 8;
958
959 const __m256 vmax018 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vi8);
960 const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
961 const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
962 const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
963
964 const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
965 const __m256 vmax01678 = _mm256_max_ps(vmax018, vmax67);
966 const __m256 vmax = _mm256_max_ps(vmax2345, vmax01678);
967 const __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
968
969 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
970 o += 8;
971 }
972 if (c != 0) {
973 const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
974 i0 += 8;
975 const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
976 i1 += 8;
977 const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
978 i2 += 8;
979 const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
980 i3 += 8;
981 const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
982 i4 += 8;
983 const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
984 i5 += 8;
985 const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
986 i6 += 8;
987 const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
988 i7 += 8;
989 const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
990 i8 += 8;
991
992 const __m256 vmax018 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vi8);
993 const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
994 const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
995 const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
996
997 const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
998 const __m256 vmax01678 = _mm256_max_ps(vmax018, vmax67);
999 const __m256 vmax = _mm256_max_ps(vmax2345, vmax01678);
1000 __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
1001
1002 __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
1003 if (c & 4) {
1004 _mm_storel_epi64((__m128i*) o, vh);
1005 vh = _mm_unpackhi_epi64(vh, vh);
1006 o += 4;
1007 }
1008 if (c & 2) {
1009 _mm_storeu_si32(o, vh);
1010 vh = _mm_srli_epi64(vh, 32);
1011 o += 2;
1012 }
1013 if (c & 1) {
1014 *o = (uint16_t) _mm_extract_epi16(vh, 0);
1015 o += 1;
1016 }
1017 }
1018 }
1019
1020 for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
1021 const uint16_t* i0 = *input++;
1022 const uint16_t* i1 = *input++;
1023 const uint16_t* i2 = *input++;
1024 const uint16_t* i3 = *input++;
1025 const uint16_t* i4 = *input++;
1026 const uint16_t* i5 = *input++;
1027 const uint16_t* i6 = *input++;
1028 const uint16_t* i7 = *input++;
1029 i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
1030 i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
1031 i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
1032 i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
1033 i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
1034 i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
1035 i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
1036 i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
1037 if (k < 2) {
1038 i1 = i0;
1039 }
1040 if (k <= 2) {
1041 i2 = i0;
1042 }
1043 if (k < 4) {
1044 i3 = i0;
1045 }
1046 if (k <= 4) {
1047 i4 = i0;
1048 }
1049 if (k < 6) {
1050 i5 = i0;
1051 }
1052 if (k <= 6) {
1053 i6 = i0;
1054 }
1055 if (k < 8) {
1056 i7 = i0;
1057 }
1058
1059 o = output;
1060 size_t c = channels;
1061 for (; c >= 8; c -= 8) {
1062 const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1063 i0 += 8;
1064 const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1065 i1 += 8;
1066 const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
1067 i2 += 8;
1068 const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
1069 i3 += 8;
1070 const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
1071 i4 += 8;
1072 const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
1073 i5 += 8;
1074 const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
1075 i6 += 8;
1076 const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
1077 i7 += 8;
1078 const __m256 vo = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o));
1079
1080 const __m256 vmax01 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vo);
1081 const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
1082 const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
1083 const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
1084
1085 const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
1086 const __m256 vmax0167 = _mm256_max_ps(vmax01, vmax67);
1087 const __m256 vmax = _mm256_max_ps(vmax2345, vmax0167);
1088 const __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
1089
1090 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
1091 o += 8;
1092 }
1093 if (c != 0) {
1094 const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1095 const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1096 const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
1097 const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
1098 const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
1099 const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
1100 const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
1101 const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
1102 const __m256 vo = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o));
1103
1104 const __m256 vmax01 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vo);
1105 const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
1106 const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
1107 const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
1108
1109 const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
1110 const __m256 vmax0167 = _mm256_max_ps(vmax01, vmax67);
1111 const __m256 vmax = _mm256_max_ps(vmax2345, vmax0167);
1112 __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
1113
1114 __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
1115 if (c & 4) {
1116 _mm_storel_epi64((__m128i*) o, vh);
1117 vh = _mm_unpackhi_epi64(vh, vh);
1118 o += 4;
1119 }
1120 if (c & 2) {
1121 _mm_storeu_si32(o, vh);
1122 vh = _mm_srli_epi64(vh, 32);
1123 o += 2;
1124 }
1125 if (c & 1) {
1126 *o = (uint16_t) _mm_extract_epi16(vh, 0);
1127 o += 1;
1128 }
1129 }
1130 }
1131 input = (const void**) ((uintptr_t) input + input_increment);
1132 output = (uint16_t*) ((uintptr_t) o + output_increment);
1133 } while (--output_pixels != 0);
1134 }
1135
1136 void xnn_f16_prelu_ukernel__f16c_2x16(
1137 size_t rows,
1138 size_t channels,
1139 const void* restrict input,
1140 size_t input_stride,
1141 const void* restrict weights,
1142 void* restrict output,
1143 size_t output_stride) XNN_OOB_READS
1144 {
1145 assert(rows != 0);
1146 assert(channels != 0);
1147 assert(channels % sizeof(uint16_t) == 0);
1148
1149 const uint16_t* i0 = (const uint16_t*) input;
1150 uint16_t* o0 = (uint16_t*) output;
1151 const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
1152 uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride);
1153
1154 const size_t input_increment = input_stride * 2 - channels;
1155 const size_t output_increment = output_stride * 2 - channels;
1156
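// Two rows are processed per iteration; negative inputs are multiplied by the
// per-channel slope via a sign-bit blend (_mm256_blendv_ps with the input as mask),
// non-negative inputs pass through unchanged.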
1157 do {
1158 if XNN_UNPREDICTABLE(rows < 2) {
1159 i1 = i0;
1160 o1 = o0;
1161 }
1162
1163 const uint16_t* w = (const uint16_t*) weights;
1164 size_t c = channels;
1165 for (; c >= 16 * sizeof(uint16_t); c -= 16 * sizeof(uint16_t)) {
1166 const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
1167 const __m256 vw89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 8)));
1168 w += 16;
1169
1170 const __m256 vi0x001234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1171 const __m256 vi0x089ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8)));
1172 i0 += 16;
1173 const __m256 vi1x001234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1174 const __m256 vi1x089ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8)));
1175 i1 += 16;
1176
1177 __m256 vacc0x001234567 = _mm256_mul_ps(vi0x001234567, vw01234567);
1178 __m256 vacc0x089ABCDEF = _mm256_mul_ps(vi0x089ABCDEF, vw89ABCDEF);
1179 __m256 vacc1x001234567 = _mm256_mul_ps(vi1x001234567, vw01234567);
1180 __m256 vacc1x089ABCDEF = _mm256_mul_ps(vi1x089ABCDEF, vw89ABCDEF);
1181
1182 vacc0x001234567 = _mm256_blendv_ps(vi0x001234567, vacc0x001234567, vi0x001234567);
1183 vacc0x089ABCDEF = _mm256_blendv_ps(vi0x089ABCDEF, vacc0x089ABCDEF, vi0x089ABCDEF);
1184 vacc1x001234567 = _mm256_blendv_ps(vi1x001234567, vacc1x001234567, vi1x001234567);
1185 vacc1x089ABCDEF = _mm256_blendv_ps(vi1x089ABCDEF, vacc1x089ABCDEF, vi1x089ABCDEF);
1186
1188 _mm_storeu_si128((__m128i*) (o0 + 0), _mm256_cvtps_ph(vacc0x001234567, _MM_FROUND_NO_EXC));
1189 _mm_storeu_si128((__m128i*) (o0 + 8), _mm256_cvtps_ph(vacc0x089ABCDEF, _MM_FROUND_NO_EXC));
1190 o0 += 16;
1192 _mm_storeu_si128((__m128i*) (o1 + 0), _mm256_cvtps_ph(vacc1x001234567, _MM_FROUND_NO_EXC));
1193 _mm_storeu_si128((__m128i*) (o1 + 8), _mm256_cvtps_ph(vacc1x089ABCDEF, _MM_FROUND_NO_EXC));
1194 o1 += 16;
1195 }
1196 for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) {
1197 const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
1198 w += 8;
1199
1200 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1201 i0 += 8;
1202 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1203 i1 += 8;
1204
1205 __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567);
1206 __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567);
1207
1208 vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567);
1209 vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567);
1210
1211 _mm_storeu_si128((__m128i*) o0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
1212 o0 += 8;
1213 _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC));
1214 o1 += 8;
1215 }
1216 if XNN_UNLIKELY(c != 0) {
1217 const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
1218
1219 const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1220 i0 = (const uint16_t*) ((uintptr_t) i0 + c);
1221 const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1222 i1 = (const uint16_t*) ((uintptr_t) i1 + c);
1223
1224 __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567);
1225 __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567);
1226
1227 vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567);
1228 vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567);
1229
1230 __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
1231 __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC);
1232 if (c & (4 * sizeof(uint16_t))) {
1233 _mm_storel_epi64((__m128i*) o0, vh0x01234567);
1234 _mm_storel_epi64((__m128i*) o1, vh1x01234567);
1235
1236 vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
1237 vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567);
1238
1239 o0 += 4;
1240 o1 += 4;
1241 }
1242 if (c & (2 * sizeof(uint16_t))) {
1243 _mm_storeu_si32(o0, vh0x01234567);
1244 _mm_storeu_si32(o1, vh1x01234567);
1245
1246 vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
1247 vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32);
1248
1249 o0 += 2;
1250 o1 += 2;
1251 }
1252 if (c & (1 * sizeof(uint16_t))) {
1253 *o0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
1254 *o1 = (uint16_t) _mm_extract_epi16(vh1x01234567, 0);
1255
1256 o0 += 1;
1257 o1 += 1;
1258 }
1259 }
1260 i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
1261 o0 = (uint16_t*) ((uintptr_t) o0 + output_increment);
1262 i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
1263 o1 = (uint16_t*) ((uintptr_t) o1 + output_increment);
1264 rows = doz(rows, 2);
1265 } while (rows != 0);
1266 }
1267
1268 void xnn_f16_rmax_ukernel__f16c(
1269 size_t batch,
1270 const void* input,
1271 void* output) XNN_OOB_READS
1272 {
1273 assert(batch != 0);
1274 assert(batch % sizeof(uint16_t) == 0);
1275
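// Broadcast the first FP16 element into every lane of the running maximum so the
// reduction is valid for any batch size.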
1276 const uint16_t* i = (const uint16_t*) input;
1277 __m128i vmax_init = _mm_shufflelo_epi16(_mm_loadl_epi64((const __m128i*) i), _MM_SHUFFLE(0, 0, 0, 0));
1278 vmax_init = _mm_unpacklo_epi64(vmax_init, vmax_init);
1279 __m256 vmax0 = _mm256_cvtph_ps(vmax_init);
1280 __m256 vmax1 = vmax0;
1281 __m256 vmax2 = vmax0;
1282 __m256 vmax3 = vmax0;
1283 for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) {
1284 const __m256 vx0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1285 const __m256 vx1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
1286 const __m256 vx2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 16)));
1287 const __m256 vx3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 24)));
1288 i += 32;
1289
1290 vmax0 = _mm256_max_ps(vmax0, vx0);
1291 vmax1 = _mm256_max_ps(vmax1, vx1);
1292 vmax2 = _mm256_max_ps(vmax2, vx2);
1293 vmax3 = _mm256_max_ps(vmax3, vx3);
1294 }
1295 __m256 vmax = _mm256_max_ps(_mm256_max_ps(vmax0, vmax1), _mm256_max_ps(vmax2, vmax3));
1296 for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) {
1297 const __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1298 i += 8;
1299 vmax = _mm256_max_ps(vmax, vx);
1300 }
1301 __m128 vmax_lo = _mm_max_ps(_mm256_castps256_ps128(vmax), _mm256_extractf128_ps(vmax, 1));
1302 if XNN_UNLIKELY(batch != 0) {
1303 const __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1304 __m128 vx_lo = _mm256_castps256_ps128(vx);
1305 if (batch & (4 * sizeof(uint16_t))) {
1306 vmax_lo = _mm_max_ps(vmax_lo, vx_lo);
1307 vx_lo = _mm256_extractf128_ps(vx, 1);
1308 }
1309 if (batch & (2 * sizeof(uint16_t))) {
1310 vmax_lo = _mm_blend_ps(_mm_max_ps(vmax_lo, vx_lo), vmax_lo, 0xC);
1311 vx_lo = _mm_movehl_ps(vx_lo, vx_lo);
1312 }
1313 if (batch & (1 * sizeof(uint16_t))) {
1314 vmax_lo = _mm_max_ss(vmax_lo, vx_lo);
1315 }
1316 }
1317 vmax_lo = _mm_max_ps(vmax_lo, _mm_movehl_ps(vmax_lo, vmax_lo));
1318 vmax_lo = _mm_max_ss(vmax_lo, _mm_movehdup_ps(vmax_lo));
1319 *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(_mm_cvtps_ph(vmax_lo, _MM_FROUND_NO_EXC), 0);
1320 }
1321
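/*
 * Element-wise f16 addition with output clamping: y = min(max(a + b, min), max).
 * n counts bytes (a multiple of sizeof(uint16_t)). Operands are widened to f32
 * with F16C; the intermediate cvtps_ph/cvtph_ps round trip re-quantizes the sum
 * to f16 precision before the clamp is applied.
 */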
1322 void xnn_f16_vadd_minmax_ukernel__f16c_x16(
1323 size_t n,
1324 const void* restrict a_ptr,
1325 const void* restrict b_ptr,
1326 void* restrict y_ptr,
1327 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1328 {
1329 assert(n != 0);
1330 assert(n % sizeof(uint16_t) == 0);
1331 assert(a_ptr != NULL);
1332 assert(b_ptr != NULL);
1333 assert(y_ptr != NULL);
1334
1335 const uint16_t* a = (const uint16_t*) a_ptr;
1336 const uint16_t* b = (const uint16_t*) b_ptr;
1337 uint16_t* y = (uint16_t*) y_ptr;
1338
1339 const __m256 vy_min = _mm256_load_ps(params->avx.min);
1340 const __m256 vy_max = _mm256_load_ps(params->avx.max);
1341
1342 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1343 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1344 const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1345 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1346 const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
1347 a += 16;
1348 b += 16;
1349
1350 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
1351 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
1352
1353
1354 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
1355 vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
1356
1357 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
1358 vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
1359
1360 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1361 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1362 y += 16;
1363 }
1364 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1365 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1366 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1367 a += 8;
1368 b += 8;
1369
1370 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
1371
1372 vy = _mm256_max_ps(vy, vy_min);
1373 vy = _mm256_min_ps(vy, vy_max);
1374
1375 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1376 y += 8;
1377 }
1378 if XNN_UNLIKELY(n != 0) {
1379 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1380 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1381
1382 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
1383
1384 vy = _mm256_max_ps(vy, vy_min);
1385 vy = _mm256_min_ps(vy, vy_max);
1386
1387 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1388 if (n & (4 * sizeof(uint16_t))) {
1389 _mm_storel_epi64((__m128i*) y, vh);
1390 vh = _mm_unpackhi_epi64(vh, vh);
1391 y += 4;
1392 }
1393 if (n & (2 * sizeof(uint16_t))) {
1394 _mm_storeu_si32(y, vh);
1395 vh = _mm_srli_epi64(vh, 32);
1396 y += 2;
1397 }
1398 if (n & (1 * sizeof(uint16_t))) {
1399 *y = (uint16_t) _mm_extract_epi16(vh, 0);
1400 }
1401 }
1402 }
1403
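/*
 * Like the vadd kernel above, but the second operand is a single f16 value
 * (*b) broadcast to all lanes, so only the a stream is read per iteration.
 */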
1404 void xnn_f16_vaddc_minmax_ukernel__f16c_x16(
1405 size_t n,
1406 const void* restrict a_ptr,
1407 const void* restrict b_ptr,
1408 void* restrict y_ptr,
1409 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1410 {
1411 assert(n != 0);
1412 assert(n % sizeof(uint16_t) == 0);
1413 assert(a_ptr != NULL);
1414 assert(b_ptr != NULL);
1415 assert(y_ptr != NULL);
1416
1417 const uint16_t* a = (const uint16_t*) a_ptr;
1418 const uint16_t* b = (const uint16_t*) b_ptr;
1419 uint16_t* y = (uint16_t*) y_ptr;
1420
1421 const __m256 vy_min = _mm256_load_ps(params->avx.min);
1422 const __m256 vy_max = _mm256_load_ps(params->avx.max);
1423
1424 const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1425 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1426 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1427 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1428 a += 16;
1429
1430 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va01234567, vb), _MM_FROUND_NO_EXC));
1431 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
1432
1433
1434 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
1435 vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
1436
1437 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
1438 vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
1439
1440 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1441 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1442 y += 16;
1443 }
1444 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1445 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1446 a += 8;
1447
1448 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
1449
1450 vy = _mm256_max_ps(vy, vy_min);
1451 vy = _mm256_min_ps(vy, vy_max);
1452
1453 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1454 y += 8;
1455 }
1456 if XNN_UNLIKELY(n != 0) {
1457 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1458
1459 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
1460
1461 vy = _mm256_max_ps(vy, vy_min);
1462 vy = _mm256_min_ps(vy, vy_max);
1463
1464 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1465 if (n & (4 * sizeof(uint16_t))) {
1466 _mm_storel_epi64((__m128i*) y, vh);
1467 vh = _mm_unpackhi_epi64(vh, vh);
1468 y += 4;
1469 }
1470 if (n & (2 * sizeof(uint16_t))) {
1471 _mm_storeu_si32(y, vh);
1472 vh = _mm_srli_epi64(vh, 32);
1473 y += 2;
1474 }
1475 if (n & (1 * sizeof(uint16_t))) {
1476 *y = (uint16_t) _mm_extract_epi16(vh, 0);
1477 }
1478 }
1479 }
1480
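/*
 * Element-wise f16 division with output clamping: y = min(max(a / b, min), max),
 * processing 8 elements per iteration.
 */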
1481 void xnn_f16_vdiv_minmax_ukernel__f16c_x8(
1482 size_t n,
1483 const void* restrict a_ptr,
1484 const void* restrict b_ptr,
1485 void* restrict y_ptr,
1486 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1487 {
1488 assert(n != 0);
1489 assert(n % sizeof(uint16_t) == 0);
1490 assert(a_ptr != NULL);
1491 assert(b_ptr != NULL);
1492 assert(y_ptr != NULL);
1493
1494 const uint16_t* a = (const uint16_t*) a_ptr;
1495 const uint16_t* b = (const uint16_t*) b_ptr;
1496 uint16_t* y = (uint16_t*) y_ptr;
1497
1498 const __m256 vy_min = _mm256_load_ps(params->avx.min);
1499 const __m256 vy_max = _mm256_load_ps(params->avx.max);
1500
1501 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1502 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1503 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1504 a += 8;
1505 b += 8;
1506
1507 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_NO_EXC));
1508
1509 vy = _mm256_max_ps(vy, vy_min);
1510 vy = _mm256_min_ps(vy, vy_max);
1511
1512 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1513 y += 8;
1514 }
1515 if XNN_UNLIKELY(n != 0) {
1516 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1517 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1518
1519 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_NO_EXC));
1520
1521 vy = _mm256_max_ps(vy, vy_min);
1522 vy = _mm256_min_ps(vy, vy_max);
1523
1524 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1525 if (n & (4 * sizeof(uint16_t))) {
1526 _mm_storel_epi64((__m128i*) y, vh);
1527 vh = _mm_unpackhi_epi64(vh, vh);
1528 y += 4;
1529 }
1530 if (n & (2 * sizeof(uint16_t))) {
1531 _mm_storeu_si32(y, vh);
1532 vh = _mm_srli_epi64(vh, 32);
1533 y += 2;
1534 }
1535 if (n & (1 * sizeof(uint16_t))) {
1536 *y = (uint16_t) _mm_extract_epi16(vh, 0);
1537 }
1538 }
1539 }
1540
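/*
 * Division by a broadcast f16 scalar: y = min(max(a / *b, min), max).
 */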
1541 void xnn_f16_vdivc_minmax_ukernel__f16c_x8(
1542 size_t n,
1543 const void* restrict a_ptr,
1544 const void* restrict b_ptr,
1545 void* restrict y_ptr,
1546 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1547 {
1548 assert(n != 0);
1549 assert(n % sizeof(uint16_t) == 0);
1550 assert(a_ptr != NULL);
1551 assert(b_ptr != NULL);
1552 assert(y_ptr != NULL);
1553
1554 const uint16_t* a = (const uint16_t*) a_ptr;
1555 const uint16_t* b = (const uint16_t*) b_ptr;
1556 uint16_t* y = (uint16_t*) y_ptr;
1557
1558 const __m256 vy_min = _mm256_load_ps(params->avx.min);
1559 const __m256 vy_max = _mm256_load_ps(params->avx.max);
1560
1561 const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1562 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1563 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1564 a += 8;
1565
1566 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_NO_EXC));
1567
1568 vy = _mm256_max_ps(vy, vy_min);
1569 vy = _mm256_min_ps(vy, vy_max);
1570
1571 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1572 y += 8;
1573 }
1574 if XNN_UNLIKELY(n != 0) {
1575 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1576
1577 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_NO_EXC));
1578
1579 vy = _mm256_max_ps(vy, vy_min);
1580 vy = _mm256_min_ps(vy, vy_max);
1581
1582 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1583 if (n & (4 * sizeof(uint16_t))) {
1584 _mm_storel_epi64((__m128i*) y, vh);
1585 vh = _mm_unpackhi_epi64(vh, vh);
1586 y += 4;
1587 }
1588 if (n & (2 * sizeof(uint16_t))) {
1589 _mm_storeu_si32(y, vh);
1590 vh = _mm_srli_epi64(vh, 32);
1591 y += 2;
1592 }
1593 if (n & (1 * sizeof(uint16_t))) {
1594 *y = (uint16_t) _mm_extract_epi16(vh, 0);
1595 }
1596 }
1597 }
1598
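/*
 * Element-wise maximum of two f16 vectors: y = max(a, b). No min/max clamping
 * parameters are used by this kernel.
 */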
1599 void xnn_f16_vmax_ukernel__f16c_x16(
1600 size_t n,
1601 const void* restrict a_ptr,
1602 const void* restrict b_ptr,
1603 void* restrict y_ptr,
1604 const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1605 {
1606 assert(n != 0);
1607 assert(n % sizeof(uint16_t) == 0);
1608 assert(a_ptr != NULL);
1609 assert(b_ptr != NULL);
1610 assert(y_ptr != NULL);
1611
1612 const uint16_t* a = (const uint16_t*) a_ptr;
1613 const uint16_t* b = (const uint16_t*) b_ptr;
1614 uint16_t* y = (uint16_t*) y_ptr;
1615
1616
1617 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1618 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1619 const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1620 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1621 const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
1622 a += 16;
1623 b += 16;
1624
1625 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
1626 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
1627
1628
1629
1630 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1631 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1632 y += 16;
1633 }
1634 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1635 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1636 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1637 a += 8;
1638 b += 8;
1639
1640 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_NO_EXC));
1641
1642
1643 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1644 y += 8;
1645 }
1646 if XNN_UNLIKELY(n != 0) {
1647 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1648 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1649
1650 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_NO_EXC));
1651
1652
1653 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1654 if (n & (4 * sizeof(uint16_t))) {
1655 _mm_storel_epi64((__m128i*) y, vh);
1656 vh = _mm_unpackhi_epi64(vh, vh);
1657 y += 4;
1658 }
1659 if (n & (2 * sizeof(uint16_t))) {
1660 _mm_storeu_si32(y, vh);
1661 vh = _mm_srli_epi64(vh, 32);
1662 y += 2;
1663 }
1664 if (n & (1 * sizeof(uint16_t))) {
1665 *y = (uint16_t) _mm_extract_epi16(vh, 0);
1666 }
1667 }
1668 }
1669
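/*
 * Element-wise maximum against a broadcast f16 scalar: y = max(a, *b).
 */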
1670 void xnn_f16_vmaxc_ukernel__f16c_x16(
1671 size_t n,
1672 const void* restrict a_ptr,
1673 const void* restrict b_ptr,
1674 void* restrict y_ptr,
1675 const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1676 {
1677 assert(n != 0);
1678 assert(n % sizeof(uint16_t) == 0);
1679 assert(a_ptr != NULL);
1680 assert(b_ptr != NULL);
1681 assert(y_ptr != NULL);
1682
1683 const uint16_t* a = (const uint16_t*) a_ptr;
1684 const uint16_t* b = (const uint16_t*) b_ptr;
1685 uint16_t* y = (uint16_t*) y_ptr;
1686
1687
1688 const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1689 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1690 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1691 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1692 a += 16;
1693
1694 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va01234567, vb), _MM_FROUND_NO_EXC));
1695 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
1696
1697
1698
1699 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1700 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1701 y += 16;
1702 }
1703 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1704 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1705 a += 8;
1706
1707 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_NO_EXC));
1708
1709
1710 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1711 y += 8;
1712 }
1713 if XNN_UNLIKELY(n != 0) {
1714 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1715
1716 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_NO_EXC));
1717
1718
1719 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1720 if (n & (4 * sizeof(uint16_t))) {
1721 _mm_storel_epi64((__m128i*) y, vh);
1722 vh = _mm_unpackhi_epi64(vh, vh);
1723 y += 4;
1724 }
1725 if (n & (2 * sizeof(uint16_t))) {
1726 _mm_storeu_si32(y, vh);
1727 vh = _mm_srli_epi64(vh, 32);
1728 y += 2;
1729 }
1730 if (n & (1 * sizeof(uint16_t))) {
1731 *y = (uint16_t) _mm_extract_epi16(vh, 0);
1732 }
1733 }
1734 }
1735
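/*
 * Element-wise minimum of two f16 vectors: y = min(a, b).
 */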
1736 void xnn_f16_vmin_ukernel__f16c_x16(
1737 size_t n,
1738 const void* restrict a_ptr,
1739 const void* restrict b_ptr,
1740 void* restrict y_ptr,
1741 const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1742 {
1743 assert(n != 0);
1744 assert(n % sizeof(uint16_t) == 0);
1745 assert(a_ptr != NULL);
1746 assert(b_ptr != NULL);
1747 assert(y_ptr != NULL);
1748
1749 const uint16_t* a = (const uint16_t*) a_ptr;
1750 const uint16_t* b = (const uint16_t*) b_ptr;
1751 uint16_t* y = (uint16_t*) y_ptr;
1752
1753
1754 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1755 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1756 const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1757 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1758 const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
1759 a += 16;
1760 b += 16;
1761
1762 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
1763 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
1764
1765
1766
1767 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1768 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1769 y += 16;
1770 }
1771 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1772 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1773 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1774 a += 8;
1775 b += 8;
1776
1777 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_NO_EXC));
1778
1779
1780 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1781 y += 8;
1782 }
1783 if XNN_UNLIKELY(n != 0) {
1784 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1785 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1786
1787 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_NO_EXC));
1788
1789
1790 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1791 if (n & (4 * sizeof(uint16_t))) {
1792 _mm_storel_epi64((__m128i*) y, vh);
1793 vh = _mm_unpackhi_epi64(vh, vh);
1794 y += 4;
1795 }
1796 if (n & (2 * sizeof(uint16_t))) {
1797 _mm_storeu_si32(y, vh);
1798 vh = _mm_srli_epi64(vh, 32);
1799 y += 2;
1800 }
1801 if (n & (1 * sizeof(uint16_t))) {
1802 *y = (uint16_t) _mm_extract_epi16(vh, 0);
1803 }
1804 }
1805 }
1806
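/*
 * Element-wise minimum against a broadcast f16 scalar: y = min(a, *b).
 */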
1807 void xnn_f16_vminc_ukernel__f16c_x16(
1808 size_t n,
1809 const void* restrict a_ptr,
1810 const void* restrict b_ptr,
1811 void* restrict y_ptr,
1812 const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1813 {
1814 assert(n != 0);
1815 assert(n % sizeof(uint16_t) == 0);
1816 assert(a_ptr != NULL);
1817 assert(b_ptr != NULL);
1818 assert(y_ptr != NULL);
1819
1820 const uint16_t* a = (const uint16_t*) a_ptr;
1821 const uint16_t* b = (const uint16_t*) b_ptr;
1822 uint16_t* y = (uint16_t*) y_ptr;
1823
1824
1825 const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1826 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1827 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1828 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1829 a += 16;
1830
1831 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va01234567, vb), _MM_FROUND_NO_EXC));
1832 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
1833
1834
1835
1836 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1837 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1838 y += 16;
1839 }
1840 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1841 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1842 a += 8;
1843
1844 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_NO_EXC));
1845
1846
1847 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1848 y += 8;
1849 }
1850 if XNN_UNLIKELY(n != 0) {
1851 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1852
1853 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_NO_EXC));
1854
1855
1856 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1857 if (n & (4 * sizeof(uint16_t))) {
1858 _mm_storel_epi64((__m128i*) y, vh);
1859 vh = _mm_unpackhi_epi64(vh, vh);
1860 y += 4;
1861 }
1862 if (n & (2 * sizeof(uint16_t))) {
1863 _mm_storeu_si32(y, vh);
1864 vh = _mm_srli_epi64(vh, 32);
1865 y += 2;
1866 }
1867 if (n & (1 * sizeof(uint16_t))) {
1868 *y = (uint16_t) _mm_extract_epi16(vh, 0);
1869 }
1870 }
1871 }
1872
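/*
 * Element-wise f16 multiplication with output clamping:
 * y = min(max(a * b, min), max).
 */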
1873 void xnn_f16_vmul_minmax_ukernel__f16c_x16(
1874 size_t n,
1875 const void* restrict a_ptr,
1876 const void* restrict b_ptr,
1877 void* restrict y_ptr,
1878 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1879 {
1880 assert(n != 0);
1881 assert(n % sizeof(uint16_t) == 0);
1882 assert(a_ptr != NULL);
1883 assert(b_ptr != NULL);
1884 assert(y_ptr != NULL);
1885
1886 const uint16_t* a = (const uint16_t*) a_ptr;
1887 const uint16_t* b = (const uint16_t*) b_ptr;
1888 uint16_t* y = (uint16_t*) y_ptr;
1889
1890 const __m256 vy_min = _mm256_load_ps(params->avx.min);
1891 const __m256 vy_max = _mm256_load_ps(params->avx.max);
1892
1893 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1894 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1895 const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1896 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1897 const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
1898 a += 16;
1899 b += 16;
1900
1901 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
1902 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
1903
1904
1905 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
1906 vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
1907
1908 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
1909 vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
1910
1911 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1912 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1913 y += 16;
1914 }
1915 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1916 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1917 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1918 a += 8;
1919 b += 8;
1920
1921 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
1922
1923 vy = _mm256_max_ps(vy, vy_min);
1924 vy = _mm256_min_ps(vy, vy_max);
1925
1926 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1927 y += 8;
1928 }
1929 if XNN_UNLIKELY(n != 0) {
1930 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1931 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1932
1933 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
1934
1935 vy = _mm256_max_ps(vy, vy_min);
1936 vy = _mm256_min_ps(vy, vy_max);
1937
1938 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1939 if (n & (4 * sizeof(uint16_t))) {
1940 _mm_storel_epi64((__m128i*) y, vh);
1941 vh = _mm_unpackhi_epi64(vh, vh);
1942 y += 4;
1943 }
1944 if (n & (2 * sizeof(uint16_t))) {
1945 _mm_storeu_si32(y, vh);
1946 vh = _mm_srli_epi64(vh, 32);
1947 y += 2;
1948 }
1949 if (n & (1 * sizeof(uint16_t))) {
1950 *y = (uint16_t) _mm_extract_epi16(vh, 0);
1951 }
1952 }
1953 }
1954
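/*
 * Multiplication by a broadcast f16 scalar: y = min(max(a * *b, min), max).
 */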
1955 void xnn_f16_vmulc_minmax_ukernel__f16c_x16(
1956 size_t n,
1957 const void* restrict a_ptr,
1958 const void* restrict b_ptr,
1959 void* restrict y_ptr,
1960 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1961 {
1962 assert(n != 0);
1963 assert(n % sizeof(uint16_t) == 0);
1964 assert(a_ptr != NULL);
1965 assert(b_ptr != NULL);
1966 assert(y_ptr != NULL);
1967
1968 const uint16_t* a = (const uint16_t*) a_ptr;
1969 const uint16_t* b = (const uint16_t*) b_ptr;
1970 uint16_t* y = (uint16_t*) y_ptr;
1971
1972 const __m256 vy_min = _mm256_load_ps(params->avx.min);
1973 const __m256 vy_max = _mm256_load_ps(params->avx.max);
1974
1975 const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1976 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1977 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1978 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1979 a += 16;
1980
1981 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va01234567, vb), _MM_FROUND_NO_EXC));
1982 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
1983
1984
1985 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
1986 vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
1987
1988 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
1989 vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
1990
1991 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1992 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1993 y += 16;
1994 }
1995 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1996 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1997 a += 8;
1998
1999 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
2000
2001 vy = _mm256_max_ps(vy, vy_min);
2002 vy = _mm256_min_ps(vy, vy_max);
2003
2004 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2005 y += 8;
2006 }
2007 if XNN_UNLIKELY(n != 0) {
2008 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2009
2010 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
2011
2012 vy = _mm256_max_ps(vy, vy_min);
2013 vy = _mm256_min_ps(vy, vy_max);
2014
2015 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2016 if (n & (4 * sizeof(uint16_t))) {
2017 _mm_storel_epi64((__m128i*) y, vh);
2018 vh = _mm_unpackhi_epi64(vh, vh);
2019 y += 4;
2020 }
2021 if (n & (2 * sizeof(uint16_t))) {
2022 _mm_storeu_si32(y, vh);
2023 vh = _mm_srli_epi64(vh, 32);
2024 y += 2;
2025 }
2026 if (n & (1 * sizeof(uint16_t))) {
2027 *y = (uint16_t) _mm_extract_epi16(vh, 0);
2028 }
2029 }
2030 }
2031
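/*
 * Reversed division by a broadcast f16 scalar: y = min(max(*b / a, min), max).
 */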
2032 void xnn_f16_vrdivc_minmax_ukernel__f16c_x8(
2033 size_t n,
2034 const void* restrict a_ptr,
2035 const void* restrict b_ptr,
2036 void* restrict y_ptr,
2037 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2038 {
2039 assert(n != 0);
2040 assert(n % sizeof(uint16_t) == 0);
2041 assert(a_ptr != NULL);
2042 assert(b_ptr != NULL);
2043 assert(y_ptr != NULL);
2044
2045 const uint16_t* a = (const uint16_t*) a_ptr;
2046 const uint16_t* b = (const uint16_t*) b_ptr;
2047 uint16_t* y = (uint16_t*) y_ptr;
2048
2049 const __m256 vy_min = _mm256_load_ps(params->avx.min);
2050 const __m256 vy_max = _mm256_load_ps(params->avx.max);
2051
2052 const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
2053 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2054 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2055 a += 8;
2056
2057 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(vb, va), _MM_FROUND_NO_EXC));
2058
2059 vy = _mm256_max_ps(vy, vy_min);
2060 vy = _mm256_min_ps(vy, vy_max);
2061
2062 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2063 y += 8;
2064 }
2065 if XNN_UNLIKELY(n != 0) {
2066 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2067
2068 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(vb, va), _MM_FROUND_NO_EXC));
2069
2070 vy = _mm256_max_ps(vy, vy_min);
2071 vy = _mm256_min_ps(vy, vy_max);
2072
2073 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2074 if (n & (4 * sizeof(uint16_t))) {
2075 _mm_storel_epi64((__m128i*) y, vh);
2076 vh = _mm_unpackhi_epi64(vh, vh);
2077 y += 4;
2078 }
2079 if (n & (2 * sizeof(uint16_t))) {
2080 _mm_storeu_si32(y, vh);
2081 vh = _mm_srli_epi64(vh, 32);
2082 y += 2;
2083 }
2084 if (n & (1 * sizeof(uint16_t))) {
2085 *y = (uint16_t) _mm_extract_epi16(vh, 0);
2086 }
2087 }
2088 }
2089
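/*
 * Reversed subtraction with a broadcast f16 scalar: y = min(max(*b - a, min), max).
 */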
2090 void xnn_f16_vrsubc_minmax_ukernel__f16c_x16(
2091 size_t n,
2092 const void* restrict a_ptr,
2093 const void* restrict b_ptr,
2094 void* restrict y_ptr,
2095 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2096 {
2097 assert(n != 0);
2098 assert(n % sizeof(uint16_t) == 0);
2099 assert(a_ptr != NULL);
2100 assert(b_ptr != NULL);
2101 assert(y_ptr != NULL);
2102
2103 const uint16_t* a = (const uint16_t*) a_ptr;
2104 const uint16_t* b = (const uint16_t*) b_ptr;
2105 uint16_t* y = (uint16_t*) y_ptr;
2106
2107 const __m256 vy_min = _mm256_load_ps(params->avx.min);
2108 const __m256 vy_max = _mm256_load_ps(params->avx.max);
2109
2110 const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
2111 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2112 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2113 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
2114 a += 16;
2115
2116 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va01234567), _MM_FROUND_NO_EXC));
2117 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va456789AB), _MM_FROUND_NO_EXC));
2118
2119
2120 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2121 vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
2122
2123 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2124 vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
2125
2126 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
2127 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
2128 y += 16;
2129 }
2130 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2131 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2132 a += 8;
2133
2134 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va), _MM_FROUND_NO_EXC));
2135
2136 vy = _mm256_max_ps(vy, vy_min);
2137 vy = _mm256_min_ps(vy, vy_max);
2138
2139 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2140 y += 8;
2141 }
2142 if XNN_UNLIKELY(n != 0) {
2143 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2144
2145 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va), _MM_FROUND_NO_EXC));
2146
2147 vy = _mm256_max_ps(vy, vy_min);
2148 vy = _mm256_min_ps(vy, vy_max);
2149
2150 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2151 if (n & (4 * sizeof(uint16_t))) {
2152 _mm_storel_epi64((__m128i*) y, vh);
2153 vh = _mm_unpackhi_epi64(vh, vh);
2154 y += 4;
2155 }
2156 if (n & (2 * sizeof(uint16_t))) {
2157 _mm_storeu_si32(y, vh);
2158 vh = _mm_srli_epi64(vh, 32);
2159 y += 2;
2160 }
2161 if (n & (1 * sizeof(uint16_t))) {
2162 *y = (uint16_t) _mm_extract_epi16(vh, 0);
2163 }
2164 }
2165 }
2166
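/*
 * Squared difference: y = (a - b)^2. Both the difference and its square are
 * re-rounded to f16 precision before the result is stored.
 */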
2167 void xnn_f16_vsqrdiff_ukernel__f16c_x16(
2168 size_t n,
2169 const void* restrict a_ptr,
2170 const void* restrict b_ptr,
2171 void* restrict y_ptr,
2172 const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2173 {
2174 assert(n != 0);
2175 assert(n % sizeof(uint16_t) == 0);
2176 assert(a_ptr != NULL);
2177 assert(b_ptr != NULL);
2178 assert(y_ptr != NULL);
2179
2180 const uint16_t* a = (const uint16_t*) a_ptr;
2181 const uint16_t* b = (const uint16_t*) b_ptr;
2182 uint16_t* y = (uint16_t*) y_ptr;
2183
2184
2185 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2186 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2187 const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2188 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
2189 const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
2190 a += 16;
2191 b += 16;
2192
2193 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
2194 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
2195
2196 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy01234567, vy01234567), _MM_FROUND_NO_EXC));
2197 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy456789AB, vy456789AB), _MM_FROUND_NO_EXC));
2198
2199
2200 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
2201 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
2202 y += 16;
2203 }
2204 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2205 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2206 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2207 a += 8;
2208 b += 8;
2209
2210 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2211 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_NO_EXC));
2212
2213
2214 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2215 y += 8;
2216 }
2217 if XNN_UNLIKELY(n != 0) {
2218 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2219 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2220
2221 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2222 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_NO_EXC));
2223
2224
2225 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2226 if (n & (4 * sizeof(uint16_t))) {
2227 _mm_storel_epi64((__m128i*) y, vh);
2228 vh = _mm_unpackhi_epi64(vh, vh);
2229 y += 4;
2230 }
2231 if (n & (2 * sizeof(uint16_t))) {
2232 _mm_storeu_si32(y, vh);
2233 vh = _mm_srli_epi64(vh, 32);
2234 y += 2;
2235 }
2236 if (n & (1 * sizeof(uint16_t))) {
2237 *y = (uint16_t) _mm_extract_epi16(vh, 0);
2238 }
2239 }
2240 }
2241
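/*
 * Squared difference against a broadcast f16 scalar: y = (a - *b)^2.
 */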
2242 void xnn_f16_vsqrdiffc_ukernel__f16c_x16(
2243 size_t n,
2244 const void* restrict a_ptr,
2245 const void* restrict b_ptr,
2246 void* restrict y_ptr,
2247 const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2248 {
2249 assert(n != 0);
2250 assert(n % sizeof(uint16_t) == 0);
2251 assert(a_ptr != NULL);
2252 assert(b_ptr != NULL);
2253 assert(y_ptr != NULL);
2254
2255 const uint16_t* a = (const uint16_t*) a_ptr;
2256 const uint16_t* b = (const uint16_t*) b_ptr;
2257 uint16_t* y = (uint16_t*) y_ptr;
2258
2259
2260 const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
2261 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2262 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2263 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
2264 a += 16;
2265
2266 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va01234567, vb), _MM_FROUND_NO_EXC));
2267 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
2268
2269 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy01234567, vy01234567), _MM_FROUND_NO_EXC));
2270 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy456789AB, vy456789AB), _MM_FROUND_NO_EXC));
2271
2272
2273 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
2274 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
2275 y += 16;
2276 }
2277 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2278 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2279 a += 8;
2280
2281 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2282 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_NO_EXC));
2283
2284
2285 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2286 y += 8;
2287 }
2288 if XNN_UNLIKELY(n != 0) {
2289 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2290
2291 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2292 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_NO_EXC));
2293
2294
2295 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2296 if (n & (4 * sizeof(uint16_t))) {
2297 _mm_storel_epi64((__m128i*) y, vh);
2298 vh = _mm_unpackhi_epi64(vh, vh);
2299 y += 4;
2300 }
2301 if (n & (2 * sizeof(uint16_t))) {
2302 _mm_storeu_si32(y, vh);
2303 vh = _mm_srli_epi64(vh, 32);
2304 y += 2;
2305 }
2306 if (n & (1 * sizeof(uint16_t))) {
2307 *y = (uint16_t) _mm_extract_epi16(vh, 0);
2308 }
2309 }
2310 }
2311
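/*
 * Element-wise f16 subtraction with output clamping: y = min(max(a - b, min), max).
 */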
2312 void xnn_f16_vsub_minmax_ukernel__f16c_x16(
2313 size_t n,
2314 const void* restrict a_ptr,
2315 const void* restrict b_ptr,
2316 void* restrict y_ptr,
2317 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2318 {
2319 assert(n != 0);
2320 assert(n % sizeof(uint16_t) == 0);
2321 assert(a_ptr != NULL);
2322 assert(b_ptr != NULL);
2323 assert(y_ptr != NULL);
2324
2325 const uint16_t* a = (const uint16_t*) a_ptr;
2326 const uint16_t* b = (const uint16_t*) b_ptr;
2327 uint16_t* y = (uint16_t*) y_ptr;
2328
2329 const __m256 vy_min = _mm256_load_ps(params->avx.min);
2330 const __m256 vy_max = _mm256_load_ps(params->avx.max);
2331
2332 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2333 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2334 const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2335 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
2336 const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
2337 a += 16;
2338 b += 16;
2339
2340 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
2341 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
2342
2343
2344 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2345 vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
2346
2347 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2348 vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
2349
2350 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
2351 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
2352 y += 16;
2353 }
2354 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2355 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2356 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2357 a += 8;
2358 b += 8;
2359
2360 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2361
2362 vy = _mm256_max_ps(vy, vy_min);
2363 vy = _mm256_min_ps(vy, vy_max);
2364
2365 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2366 y += 8;
2367 }
2368 if XNN_UNLIKELY(n != 0) {
2369 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2370 const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2371
2372 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2373
2374 vy = _mm256_max_ps(vy, vy_min);
2375 vy = _mm256_min_ps(vy, vy_max);
2376
2377 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2378 if (n & (4 * sizeof(uint16_t))) {
2379 _mm_storel_epi64((__m128i*) y, vh);
2380 vh = _mm_unpackhi_epi64(vh, vh);
2381 y += 4;
2382 }
2383 if (n & (2 * sizeof(uint16_t))) {
2384 _mm_storeu_si32(y, vh);
2385 vh = _mm_srli_epi64(vh, 32);
2386 y += 2;
2387 }
2388 if (n & (1 * sizeof(uint16_t))) {
2389 *y = (uint16_t) _mm_extract_epi16(vh, 0);
2390 }
2391 }
2392 }
2393
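/*
 * Subtraction of a broadcast f16 scalar: y = min(max(a - *b, min), max).
 */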
2394 void xnn_f16_vsubc_minmax_ukernel__f16c_x16(
2395 size_t n,
2396 const void* restrict a_ptr,
2397 const void* restrict b_ptr,
2398 void* restrict y_ptr,
2399 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2400 {
2401 assert(n != 0);
2402 assert(n % sizeof(uint16_t) == 0);
2403 assert(a_ptr != NULL);
2404 assert(b_ptr != NULL);
2405 assert(y_ptr != NULL);
2406
2407 const uint16_t* a = (const uint16_t*) a_ptr;
2408 const uint16_t* b = (const uint16_t*) b_ptr;
2409 uint16_t* y = (uint16_t*) y_ptr;
2410
2411 const __m256 vy_min = _mm256_load_ps(params->avx.min);
2412 const __m256 vy_max = _mm256_load_ps(params->avx.max);
2413
2414 const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
2415 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2416 const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2417 const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
2418 a += 16;
2419
2420 __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va01234567, vb), _MM_FROUND_NO_EXC));
2421 __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
2422
2423
2424 vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2425 vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
2426
2427 vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2428 vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
2429
2430 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
2431 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
2432 y += 16;
2433 }
2434 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2435 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2436 a += 8;
2437
2438 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2439
2440 vy = _mm256_max_ps(vy, vy_min);
2441 vy = _mm256_min_ps(vy, vy_max);
2442
2443 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2444 y += 8;
2445 }
2446 if XNN_UNLIKELY(n != 0) {
2447 const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2448
2449 __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2450
2451 vy = _mm256_max_ps(vy, vy_min);
2452 vy = _mm256_min_ps(vy, vy_max);
2453
2454 __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2455 if (n & (4 * sizeof(uint16_t))) {
2456 _mm_storel_epi64((__m128i*) y, vh);
2457 vh = _mm_unpackhi_epi64(vh, vh);
2458 y += 4;
2459 }
2460 if (n & (2 * sizeof(uint16_t))) {
2461 _mm_storeu_si32(y, vh);
2462 vh = _mm_srli_epi64(vh, 32);
2463 y += 2;
2464 }
2465 if (n & (1 * sizeof(uint16_t))) {
2466 *y = (uint16_t) _mm_extract_epi16(vh, 0);
2467 }
2468 }
2469 }
2470
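/*
 * Clamp each f16 element to the [min, max] range from params, computed in f32
 * after F16C widening.
 */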
2471 void xnn_f16_vclamp_ukernel__f16c_x16(
2472 size_t n,
2473 const void* restrict x_ptr,
2474 void* restrict y_ptr,
2475 const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2476 {
2477 assert(n != 0);
2478 assert(n % sizeof(uint16_t) == 0);
2479 assert(x_ptr != NULL);
2480 assert(y_ptr != NULL);
2481
2482 const uint16_t* x = (const uint16_t*) x_ptr;
2483 uint16_t* y = (uint16_t*) y_ptr;
2484
2485 const __m256 vy_min = _mm256_load_ps(params->avx.min);
2486 const __m256 vy_max = _mm256_load_ps(params->avx.max);
2487
2488 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2489 __m256 vacc01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2490 __m256 vacc89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (x + 8)));
2491 x += 16;
2492
2493 vacc01234567 = _mm256_max_ps(vacc01234567, vy_min);
2494 vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEF, vy_min);
2495
2496 vacc01234567 = _mm256_min_ps(vacc01234567, vy_max);
2497 vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vy_max);
2498
2499 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
2500 _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vacc89ABCDEF, _MM_FROUND_NO_EXC));
2501 y += 16;
2502 }
2503 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2504 __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2505 x += 8;
2506 vacc = _mm256_max_ps(vacc, vy_min);
2507 vacc = _mm256_min_ps(vacc, vy_max);
2508 _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2509 y += 8;
2510 }
2511 if XNN_UNLIKELY(n != 0) {
2512 __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2513 vacc = _mm256_max_ps(vacc, vy_min);
2514 vacc = _mm256_min_ps(vacc, vy_max);
2515
2516 __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2517 if (n & (4 * sizeof(uint16_t))) {
2518 _mm_storel_epi64((__m128i*) y, vh);
2519 vh = _mm_unpackhi_epi64(vh, vh);
2520 y += 4;
2521 }
2522 if (n & (2 * sizeof(uint16_t))) {
2523 _mm_storeu_si32(y, vh);
2524 vh = _mm_srli_epi64(vh, 32);
2525 y += 2;
2526 }
2527 if (n & (1 * sizeof(uint16_t))) {
2528         *y = (uint16_t) _mm_extract_epi16(vh, 0);
2529 }
2530 }
2531 }
2532
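/*
 * HardSwish: y = x * max(min(x + 3, 6), 0) / 6. The x/6 factor is computed in
 * f32, while (x + 3) is clamped to [0, 6] directly on its f16 bit pattern with
 * signed 16-bit min/max -- valid because non-negative f16 values order the same
 * as their bit patterns, and negative values are replaced by +0.
 */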
2533 void xnn_f16_vhswish_ukernel__f16c_x16(
2534 size_t n,
2535 const void* restrict x_ptr,
2536 void* restrict y_ptr,
2537 const union xnn_f16_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2538 {
2539 assert(n != 0);
2540 assert(n % sizeof(uint16_t) == 0);
2541
2542 const uint16_t* x = (const uint16_t*) x_ptr;
2543 uint16_t* y = (uint16_t*) y_ptr;
2544
2545 const __m256 vsixth = _mm256_load_ps(params->avx.sixth);
2546 const __m256 vthree = _mm256_load_ps(params->avx.three);
2547 const __m128i vsix = _mm_load_si128((const __m128i*) params->avx.six);
2548 const __m128i vzero = _mm_setzero_si128();
2549
2550 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2551 __m256 vx01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2552 __m256 vx89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (x + 8)));
2553 x += 16;
2554
2555 __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vx01234567, vthree), _MM_FROUND_NO_EXC);
2556 vx01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx01234567, vsixth), _MM_FROUND_NO_EXC));
2557 __m128i vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(vx89ABCDEF, vthree), _MM_FROUND_NO_EXC);
2558 vx89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx89ABCDEF, vsixth), _MM_FROUND_NO_EXC));
2559
2560 vacc01234567 = _mm_max_epi16(vacc01234567, vzero);
2561 vacc89ABCDEF = _mm_max_epi16(vacc89ABCDEF, vzero);
2562
2563 vacc01234567 = _mm_min_epi16(vacc01234567, vsix);
2564 vacc89ABCDEF = _mm_min_epi16(vacc89ABCDEF, vsix);
2565
2566 vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vx01234567), _MM_FROUND_NO_EXC);
2567 vacc89ABCDEF = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc89ABCDEF), vx89ABCDEF), _MM_FROUND_NO_EXC);
2568
2569 _mm_storeu_si128((__m128i*) y, vacc01234567);
2570 _mm_storeu_si128((__m128i*) (y + 8), vacc89ABCDEF);
2571 y += 16;
2572 }
2573 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2574 __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2575 x += 8;
2576 __m128i vacc = _mm256_cvtps_ph(_mm256_add_ps(vx, vthree), _MM_FROUND_NO_EXC);
2577 vx = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx, vsixth), _MM_FROUND_NO_EXC));
2578 vacc = _mm_max_epi16(vacc, vzero);
2579 vacc = _mm_min_epi16(vacc, vsix);
2580 vacc = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc), vx), _MM_FROUND_NO_EXC);
2581 _mm_storeu_si128((__m128i*) y, vacc);
2582 y += 8;
2583 }
2584 if XNN_UNLIKELY(n != 0) {
2585 __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2586 __m128i vacc = _mm256_cvtps_ph(_mm256_add_ps(vx, vthree), _MM_FROUND_NO_EXC);
2587 vx = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx, vsixth), _MM_FROUND_NO_EXC));
2588 vacc = _mm_max_epi16(vacc, vzero);
2589 vacc = _mm_min_epi16(vacc, vsix);
2590 vacc = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc), vx), _MM_FROUND_NO_EXC);
2591
2592 if (n & (4 * sizeof(uint16_t))) {
2593 _mm_storel_epi64((__m128i*) y, vacc);
2594 vacc = _mm_unpackhi_epi64(vacc, vacc);
2595 y += 4;
2596 }
2597 if (n & (2 * sizeof(uint16_t))) {
2598 _mm_storeu_si32(y, vacc);
2599 vacc = _mm_srli_epi64(vacc, 32);
2600 y += 2;
2601 }
2602 if (n & (1 * sizeof(uint16_t))) {
2603 *y = (uint16_t) _mm_extract_epi16(vacc, 0);
2604 }
2605 }
2606 }
2607
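/*
 * LeakyReLU: y = x for x >= 0, y = slope * x otherwise. _mm256_blendv_ps picks
 * the scaled value wherever the sign bit of x is set.
 */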
2608 void xnn_f16_vlrelu_ukernel__f16c_x16(
2609 size_t batch,
2610 const void* input,
2611 void* output,
2612 const union xnn_f16_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2613 {
2614 assert(batch != 0);
2615 assert(batch % sizeof(uint16_t) == 0);
2616
2617 const __m256 vslope = _mm256_load_ps(params->avx.slope);
2618 const uint16_t* i = (const uint16_t*) input;
2619 uint16_t* o = (uint16_t*) output;
2620 for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
2621 const __m256 vx01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2622 const __m256 vx89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2623 i += 16;
2624
2625 __m256 vacc01234567 = _mm256_mul_ps(vx01234567, vslope);
2626 __m256 vacc89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vslope);
2627
2628 vacc01234567 = _mm256_blendv_ps(vx01234567, vacc01234567, vx01234567);
2629 vacc89ABCDEF = _mm256_blendv_ps(vx89ABCDEF, vacc89ABCDEF, vx89ABCDEF);
2630
2631 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
2632 _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc89ABCDEF, _MM_FROUND_NO_EXC));
2633 o += 16;
2634 }
2635 for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) {
2636 const __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2637 i += 8;
2638
2639 __m256 vacc = _mm256_mul_ps(vx, vslope);
2640 vacc = _mm256_blendv_ps(vx, vacc, vx);
2641
2642 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2643 o += 8;
2644 }
2645 if XNN_UNLIKELY(batch != 0) {
2646 const __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2647
2648 __m256 vacc = _mm256_mul_ps(vx, vslope);
2649 vacc = _mm256_blendv_ps(vx, vacc, vx);
2650
2651 __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2652 if (batch & (4 * sizeof(uint16_t))) {
2653 _mm_storel_epi64((__m128i*) o, vh);
2654 vh = _mm_unpackhi_epi64(vh, vh);
2655 o += 4;
2656 }
2657 if (batch & (2 * sizeof(uint16_t))) {
2658 _mm_storeu_si32(o, vh);
2659 vh = _mm_srli_epi64(vh, 32);
2660 o += 2;
2661 }
2662 if (batch & (1 * sizeof(uint16_t))) {
2663       *o = (uint16_t) _mm_extract_epi16(vh, 0);
2664 }
2665 }
2666 }
2667
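/*
 * Round each f16 element toward negative infinity (floor), via f32
 * _mm256_round_ps.
 */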
2668 void xnn_f16_vrndd_ukernel__f16c_x16(
2669 size_t n,
2670 const void* input,
2671 void* output,
2672 const union xnn_f16_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
2673 {
2674 assert(n != 0);
2675 assert(n % sizeof(uint16_t) == 0);
2676
2677 const uint16_t* i = (const uint16_t*) input;
2678 uint16_t* o = (uint16_t*) output;
2679 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2680 __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2681 __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2682 i += 16;
2683
2684 vacc0 = _mm256_round_ps(vacc0, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
2685 vacc1 = _mm256_round_ps(vacc1, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
2686
2687 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
2688 _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
2689 o += 16;
2690 }
2691 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2692 __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2693 i += 8;
2694
2695 vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
2696
2697 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2698 o += 8;
2699 }
2700 if XNN_UNLIKELY(n != 0) {
2701 assert(n >= 1 * sizeof(uint16_t));
2702 assert(n <= 7 * sizeof(uint16_t));
2703 __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2704 vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
2705 __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2706 if (n & (4 * sizeof(uint16_t))) {
2707 _mm_storel_epi64((__m128i*) o, vh);
2708 vh = _mm_unpackhi_epi64(vh, vh);
2709 o += 4;
2710 }
2711 if (n & (2 * sizeof(uint16_t))) {
2712 _mm_storeu_si32(o, vh);
2713 vh = _mm_srli_epi64(vh, 32);
2714 o += 2;
2715 }
2716 if (n & (1 * sizeof(uint16_t))) {
2717 *o = (uint16_t) _mm_extract_epi16(vh, 0);
2718 }
2719 }
2720 }
2721
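/*
 * Round each f16 element to the nearest integer, ties to even.
 */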
2722 void xnn_f16_vrndne_ukernel__f16c_x16(
2723 size_t n,
2724 const void* input,
2725 void* output,
2726 const union xnn_f16_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
2727 {
2728 assert(n != 0);
2729 assert(n % sizeof(uint16_t) == 0);
2730
2731 const uint16_t* i = (const uint16_t*) input;
2732 uint16_t* o = (uint16_t*) output;
2733 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2734 __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2735 __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2736 i += 16;
2737
2738 vacc0 = _mm256_round_ps(vacc0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
2739 vacc1 = _mm256_round_ps(vacc1, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
2740
2741 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
2742 _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
2743 o += 16;
2744 }
2745 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2746 __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2747 i += 8;
2748
2749 vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
2750
2751 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2752 o += 8;
2753 }
2754 if XNN_UNLIKELY(n != 0) {
2755 assert(n >= 1 * sizeof(uint16_t));
2756 assert(n <= 7 * sizeof(uint16_t));
2757 __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2758 vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
2759 __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2760 if (n & (4 * sizeof(uint16_t))) {
2761 _mm_storel_epi64((__m128i*) o, vh);
2762 vh = _mm_unpackhi_epi64(vh, vh);
2763 o += 4;
2764 }
2765 if (n & (2 * sizeof(uint16_t))) {
2766 _mm_storeu_si32(o, vh);
2767 vh = _mm_srli_epi64(vh, 32);
2768 o += 2;
2769 }
2770 if (n & (1 * sizeof(uint16_t))) {
2771 *o = (uint16_t) _mm_extract_epi16(vh, 0);
2772 }
2773 }
2774 }
2775
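// Rounds each f16 element toward positive infinity (ceil) using
// _MM_FROUND_TO_POS_INF; same structure as the kernels above.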
2776 void xnn_f16_vrndu_ukernel__f16c_x16(
2777 size_t n,
2778 const void* input,
2779 void* output,
2780 const union xnn_f16_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
2781 {
2782 assert(n != 0);
2783 assert(n % sizeof(uint16_t) == 0);
2784
2785 const uint16_t* i = (const uint16_t*) input;
2786 uint16_t* o = (uint16_t*) output;
2787 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2788 __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2789 __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2790 i += 16;
2791
2792 vacc0 = _mm256_round_ps(vacc0, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
2793 vacc1 = _mm256_round_ps(vacc1, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
2794
2795 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
2796 _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
2797 o += 16;
2798 }
2799 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2800 __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2801 i += 8;
2802
2803 vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
2804
2805 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2806 o += 8;
2807 }
2808 if XNN_UNLIKELY(n != 0) {
2809 assert(n >= 1 * sizeof(uint16_t));
2810 assert(n <= 7 * sizeof(uint16_t));
2811 __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2812 vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
2813 __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2814 if (n & (4 * sizeof(uint16_t))) {
2815 _mm_storel_epi64((__m128i*) o, vh);
2816 vh = _mm_unpackhi_epi64(vh, vh);
2817 o += 4;
2818 }
2819 if (n & (2 * sizeof(uint16_t))) {
2820 _mm_storeu_si32(o, vh);
2821 vh = _mm_srli_epi64(vh, 32);
2822 o += 2;
2823 }
2824 if (n & (1 * sizeof(uint16_t))) {
2825 *o = (uint16_t) _mm_extract_epi16(vh, 0);
2826 }
2827 }
2828 }
2829
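// Rounds each f16 element toward zero (truncation) using _MM_FROUND_TO_ZERO;
// same structure as the kernels above.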
2830 void xnn_f16_vrndz_ukernel__f16c_x16(
2831 size_t n,
2832 const void* input,
2833 void* output,
2834 const union xnn_f16_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
2835 {
2836 assert(n != 0);
2837 assert(n % sizeof(uint16_t) == 0);
2838
2839 const uint16_t* i = (const uint16_t*) input;
2840 uint16_t* o = (uint16_t*) output;
2841 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2842 __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2843 __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2844 i += 16;
2845
2846 vacc0 = _mm256_round_ps(vacc0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2847 vacc1 = _mm256_round_ps(vacc1, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2848
2849 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
2850 _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
2851 o += 16;
2852 }
2853 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2854 __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2855 i += 8;
2856
2857 vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2858
2859 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2860 o += 8;
2861 }
2862 if XNN_UNLIKELY(n != 0) {
2863 assert(n >= 1 * sizeof(uint16_t));
2864 assert(n <= 7 * sizeof(uint16_t));
2865 __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2866 vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2867 __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2868 if (n & (4 * sizeof(uint16_t))) {
2869 _mm_storel_epi64((__m128i*) o, vh);
2870 vh = _mm_unpackhi_epi64(vh, vh);
2871 o += 4;
2872 }
2873 if (n & (2 * sizeof(uint16_t))) {
2874 _mm_storeu_si32(o, vh);
2875 vh = _mm_srli_epi64(vh, 32);
2876 o += 2;
2877 }
2878 if (n & (1 * sizeof(uint16_t))) {
2879 *o = (uint16_t) _mm_extract_epi16(vh, 0);
2880 }
2881 }
2882 }
2883
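// Computes the square root of each f16 element by widening to f32 and using
// _mm256_sqrt_ps, 8 elements per iteration.  The 1-7 element tail reloads a
// full 8-lane vector, which is why the kernel is annotated XNN_OOB_READS.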
2884 void xnn_f16_vsqrt_ukernel__f16c_sqrt_x8(
2885 size_t n,
2886 const void* input,
2887 void* output,
2888 const union xnn_f16_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2889 {
2890 assert(n != 0);
2891 assert(n % sizeof(uint16_t) == 0);
2892 assert(input != NULL);
2893 assert(output != NULL);
2894
2895 const uint16_t* i = (const uint16_t*) input;
2896 uint16_t* o = (uint16_t*) output;
2897 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2898 __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2899 i += 8;
2900 vacc = _mm256_sqrt_ps(vacc);
2901 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2902 o += 8;
2903 }
2904 if XNN_UNLIKELY(n != 0) {
2905 __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2906 vacc = _mm256_sqrt_ps(vacc);
2907 __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2908 if (n & (4 * sizeof(uint16_t))) {
2909 _mm_storel_epi64((__m128i*) o, vh);
2910 o += 4;
2911 vh = _mm_unpackhi_epi64(vh, vh);
2912 }
2913 if (n & (2 * sizeof(uint16_t))) {
2914 _mm_storeu_si32(o, vh);
2915 o += 2;
2916 vh = _mm_srli_epi64(vh, 32);
2917 }
2918 if (n & (1 * sizeof(uint16_t))) {
2919 *o = (uint16_t) _mm_extract_epi16(vh, 0);
2920 }
2921 }
2922 }
2923
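// Squares each f16 element (x * x) in f32 precision, 16 elements per main
// loop iteration; the tail reloads a full vector, hence XNN_OOB_READS.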
2924 void xnn_f16_vsqr_ukernel__f16c_x16(
2925 size_t n,
2926 const void* input,
2927 void* output,
2928 const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2929 {
2930 assert(n != 0);
2931 assert(n % sizeof(uint16_t) == 0);
2932 assert(input != NULL);
2933 assert(output != NULL);
2934
2935 const uint16_t* i = (const uint16_t*) input;
2936 uint16_t* o = (uint16_t*) output;
2937 for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2938 __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2939 __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2940 i += 16;
2941
2942 vacc0 = _mm256_mul_ps(vacc0, vacc0);
2943 vacc1 = _mm256_mul_ps(vacc1, vacc1);
2944
2945 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
2946 _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
2947 o += 16;
2948 }
2949 for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2950 __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2951 i += 8;
2952 vacc = _mm256_mul_ps(vacc, vacc);
2953 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2954 o += 8;
2955 }
2956 if XNN_UNLIKELY(n != 0) {
2957 __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2958 vacc = _mm256_mul_ps(vacc, vacc);
2959 __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2960 if (n & (4 * sizeof(uint16_t))) {
2961 _mm_storel_epi64((__m128i*) o, vh);
2962 o += 4;
2963 vh = _mm_unpackhi_epi64(vh, vh);
2964 }
2965 if (n & (2 * sizeof(uint16_t))) {
2966 _mm_storeu_si32(o, vh);
2967 o += 2;
2968 vh = _mm_srli_epi64(vh, 32);
2969 }
2970 if (n & (1 * sizeof(uint16_t))) {
2971 *o = (uint16_t) _mm_extract_epi16(vh, 0);
2972 }
2973 }
2974 }
2975
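// Converts f32 elements to IEEE f16 with _mm256_cvtps_ph (VCVTPS2PH), 16
// elements per main loop iteration.  The 1-7 element tail uses a masked load
// driven by params->f16c.mask_table so that no out-of-bounds floats are read.
//
// Usage sketch (illustrative only: `batch_size`, `src`, and `dst` are
// hypothetical caller-side names, and params is assumed to be initialized by
// the matching xnn_init_f32_f16_cvt_*_params helper so that f16c.mask_table
// is populated):
//
//   union xnn_f32_f16_cvt_params params;
//   // ... initialize params (fills f16c.mask_table) ...
//   xnn_f32_f16_vcvt_ukernel__f16c_x16(
//       batch_size * sizeof(float),  // byte count, must be a multiple of 4
//       src, dst, &params);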
2976 void xnn_f32_f16_vcvt_ukernel__f16c_x16(
2977 size_t n,
2978 const float* input,
2979 void* output,
2980 const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
2981 {
2982 assert(n != 0);
2983 assert(n % sizeof(float) == 0);
2984 assert(input != NULL);
2985 assert(output != NULL);
2986
2987 uint16_t* o = (uint16_t*) output;
2988 for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2989 const __m256 vf0 = _mm256_loadu_ps(input);
2990 const __m256 vf1 = _mm256_loadu_ps(input + 8);
2991 input += 16;
2992
2993 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vf0, _MM_FROUND_NO_EXC));
2994 _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vf1, _MM_FROUND_NO_EXC));
2995 o += 16;
2996 }
2997 for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2998 const __m256 vf = _mm256_loadu_ps(input);
2999 input += 8;
3000
3001 _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vf, _MM_FROUND_NO_EXC));
3002 o += 8;
3003 }
3004 if XNN_UNLIKELY(n != 0) {
3005 assert(n >= 1 * sizeof(float));
3006 assert(n <= 7 * sizeof(float));
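    // mask_table is expected to hold seven all-ones 32-bit entries followed
    // by zeros: loading 32 bytes starting n bytes before its 8th entry yields
    // -1 lanes for exactly the n / sizeof(float) remaining elements and 0
    // lanes elsewhere.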
3007     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->f16c.mask_table[7] - n));
3008
3009 const __m256 vf = _mm256_maskload_ps(input, vmask);
3010
3011 __m128 vf_lo = _mm256_castps256_ps128(vf);
3012 if (n & (4 * sizeof(float))) {
3013 _mm_storel_epi64((__m128i*) o, _mm_cvtps_ph(vf_lo, _MM_FROUND_NO_EXC));
3014 vf_lo = _mm256_extractf128_ps(vf, 1);
3015 o += 4;
3016 }
3017 __m128i vh = _mm_cvtps_ph(vf_lo, _MM_FROUND_NO_EXC);
3018 if (n & (2 * sizeof(float))) {
3019 _mm_storeu_si32(o, vh);
3020 vh = _mm_srli_epi64(vh, 32);
3021 o += 2;
3022 }
3023 if (n & (1 * sizeof(float))) {
3024       *((uint16_t*) o) = (uint16_t) _mm_extract_epi16(vh, 0);
3025 }
3026 }
3027 }
3028