1 // Copyright 2021 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
6 #include <assert.h>
7 
8 #include <immintrin.h>
9 
10 #include <xnnpack/avgpool.h>
11 #include <xnnpack/common.h>
12 #include <xnnpack/gavgpool.h>
13 #include <xnnpack/intrinsics-polyfill.h>
14 #include <xnnpack/math.h>
15 #include <xnnpack/maxpool.h>
16 #include <xnnpack/prelu.h>
17 #include <xnnpack/rmax.h>
18 #include <xnnpack/vbinary.h>
19 #include <xnnpack/vcvt.h>
20 #include <xnnpack/vunary.h>
21 
22 
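// f16 average pooling (minmax) micro-kernel: a 9-element first pass followed by
// 8-element incremental passes, 8 channels per vector step. The first 9 taps are
// summed into `buffer` (one fp16 partial sum per channel), each subsequent group
// of 8 taps is accumulated into `buffer`, and the final group of up to 8 taps is
// added, scaled by params->avx.scale, clamped to [min, max], and stored. Every
// intermediate sum is rounded back to fp16 via _mm256_cvtps_ph/_mm256_cvtph_ps.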
23 void xnn_f16_avgpool_minmax_ukernel_9p8x__f16c_c8(
24     size_t output_pixels,
25     size_t kernel_elements,
26     size_t channels,
27     const void** input,
28     size_t input_offset,
29     const void* zero,
30     void* buffer,
31     void* output,
32     size_t input_increment,
33     size_t output_increment,
34     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
35 {
36   assert(output_pixels != 0);
37   assert(kernel_elements > 9);
38   assert(channels != 0);
39 
40   const __m256 vscale = _mm256_load_ps(params->avx.scale);
41   const __m256 vmin = _mm256_load_ps(params->avx.min);
42   const __m256 vmax = _mm256_load_ps(params->avx.max);
43 
44   uint16_t* o = (uint16_t*) output;
45   do {
46     {
47       const uint16_t* i0 = *input++;
48       assert(i0 != NULL);
49       if XNN_UNPREDICTABLE(i0 != zero) {
50         i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
51       }
52       const uint16_t* i1 = *input++;
53       assert(i1 != NULL);
54       if XNN_UNPREDICTABLE(i1 != zero) {
55         i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
56       }
57       const uint16_t* i2 = *input++;
58       assert(i2 != NULL);
59       if XNN_UNPREDICTABLE(i2 != zero) {
60         i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
61       }
62       const uint16_t* i3 = *input++;
63       assert(i3 != NULL);
64       if XNN_UNPREDICTABLE(i3 != zero) {
65         i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
66       }
67       const uint16_t* i4 = *input++;
68       assert(i4 != NULL);
69       if XNN_UNPREDICTABLE(i4 != zero) {
70         i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
71       }
72       const uint16_t* i5 = *input++;
73       assert(i5 != NULL);
74       if XNN_UNPREDICTABLE(i5 != zero) {
75         i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
76       }
77       const uint16_t* i6 = *input++;
78       assert(i6 != NULL);
79       if XNN_UNPREDICTABLE(i6 != zero) {
80         i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
81       }
82       const uint16_t* i7 = *input++;
83       assert(i7 != NULL);
84       if XNN_UNPREDICTABLE(i7 != zero) {
85         i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
86       }
87       const uint16_t* i8 = *input++;
88       assert(i8 != NULL);
89       if XNN_UNPREDICTABLE(i8 != zero) {
90         i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
91       }
92 
93       uint16_t* b = (uint16_t*) buffer;
94       for (size_t c = 0; c < channels; c += 8) {
95         const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
96         i0 += 8;
97         const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
98         i1 += 8;
99         const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
100         i2 += 8;
101         const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
102         i3 += 8;
103         const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
104         i4 += 8;
105         const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
106         i5 += 8;
107         const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
108         i6 += 8;
109         const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
110         i7 += 8;
111         const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
112         i8 += 8;
113 
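        // Pairwise addition tree; each partial sum is rounded back to fp16
        // (_mm256_cvtps_ph / _mm256_cvtph_ps) so the accumulator keeps fp16 precision,
        // matching the fp16 partial sums stored in `buffer`.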
114         const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
115         const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
116         const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
117         const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
118         const __m256 vsum018 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vi8), _MM_FROUND_NO_EXC));
119         const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
120         const __m256 vsum01678 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum018, vsum67), _MM_FROUND_NO_EXC));
121         const __m128i vsum = _mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum01678), _MM_FROUND_NO_EXC);
122 
123         _mm_storeu_si128((__m128i*) b, vsum);
124         b += 8;
125       }
126     }
127 
128     size_t k = kernel_elements;
129     for (k -= 9; k > 8; k -= 8) {
130       const uint16_t* i0 = (const uint16_t*) *input++;
131       assert(i0 != NULL);
132       if XNN_UNPREDICTABLE(i0 != zero) {
133         i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
134       }
135       const uint16_t* i1 = (const uint16_t*) *input++;
136       assert(i1 != NULL);
137       if XNN_UNPREDICTABLE(i1 != zero) {
138         i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
139       }
140       const uint16_t* i2 = (const uint16_t*) *input++;
141       assert(i2 != NULL);
142       if XNN_UNPREDICTABLE(i2 != zero) {
143         i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
144       }
145       const uint16_t* i3 = (const uint16_t*) *input++;
146       assert(i3 != NULL);
147       if XNN_UNPREDICTABLE(i3 != zero) {
148         i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
149       }
150       const uint16_t* i4 = (const uint16_t*) *input++;
151       assert(i4 != NULL);
152       if XNN_UNPREDICTABLE(i4 != zero) {
153         i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
154       }
155       const uint16_t* i5 = (const uint16_t*) *input++;
156       assert(i5 != NULL);
157       if XNN_UNPREDICTABLE(i5 != zero) {
158         i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
159       }
160       const uint16_t* i6 = (const uint16_t*) *input++;
161       assert(i6 != NULL);
162       if XNN_UNPREDICTABLE(i6 != zero) {
163         i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
164       }
165       const uint16_t* i7 = (const uint16_t*) *input++;
166       assert(i7 != NULL);
167       if XNN_UNPREDICTABLE(i7 != zero) {
168         i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
169       }
170 
171       uint16_t* b = (uint16_t*) buffer;
172       for (size_t c = 0; c < channels; c += 8) {
173         const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
174         i0 += 8;
175         const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
176         i1 += 8;
177         const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
178         i2 += 8;
179         const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
180         i3 += 8;
181         const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
182         i4 += 8;
183         const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
184         i5 += 8;
185         const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
186         i6 += 8;
187         const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
188         i7 += 8;
189         const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
190 
191         const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
192         const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
193         const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
194         const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
195         const __m256 vsum01a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vacc), _MM_FROUND_NO_EXC));
196         const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
197         const __m256 vsum0167a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01a, vsum67), _MM_FROUND_NO_EXC));
198         const __m128i vsum = _mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum0167a), _MM_FROUND_NO_EXC);
199 
200         _mm_storeu_si128((__m128i*) b, vsum);
201         b += 8;
202       }
203     }
204 
205     assert(k >= 1);
206     {
207       const uint16_t* i0 = (const uint16_t*) input[0];
208       assert(i0 != NULL);
209       const uint16_t* i1 = (const uint16_t*) input[1];
210       const uint16_t* i2 = (const uint16_t*) input[2];
211       const uint16_t* i3 = (const uint16_t*) input[3];
212       const uint16_t* i4 = (const uint16_t*) input[4];
213       const uint16_t* i5 = (const uint16_t*) input[5];
214       const uint16_t* i6 = (const uint16_t*) input[6];
215       const uint16_t* i7 = (const uint16_t*) input[7];
216       input = (const void**) ((uintptr_t) input + input_increment);
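      // Point taps beyond the k remaining elements at the zero buffer so they
      // contribute 0.0f to the sum.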
217       if (k < 2) {
218         i1 = (const uint16_t*) zero;
219       }
220       assert(i1 != NULL);
221       if (k <= 2) {
222         i2 = (const uint16_t*) zero;
223       }
224       assert(i2 != NULL);
225       if (k < 4) {
226         i3 = (const uint16_t*) zero;
227       }
228       assert(i3 != NULL);
229       if (k <= 4) {
230         i4 = (const uint16_t*) zero;
231       }
232       assert(i4 != NULL);
233       if (k < 6) {
234         i5 = (const uint16_t*) zero;
235       }
236       assert(i5 != NULL);
237       if (k <= 6) {
238         i6 = (const uint16_t*) zero;
239       }
240       assert(i6 != NULL);
241       if (k < 8) {
242         i7 = (const uint16_t*) zero;
243       }
244       assert(i7 != NULL);
245       if XNN_UNPREDICTABLE(i0 != zero) {
246         i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
247       }
248       if XNN_UNPREDICTABLE(i1 != zero) {
249         i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
250       }
251       if XNN_UNPREDICTABLE(i2 != zero) {
252         i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
253       }
254       if XNN_UNPREDICTABLE(i3 != zero) {
255         i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
256       }
257       if XNN_UNPREDICTABLE(i4 != zero) {
258         i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
259       }
260       if XNN_UNPREDICTABLE(i5 != zero) {
261         i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
262       }
263       if XNN_UNPREDICTABLE(i6 != zero) {
264         i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
265       }
266       if XNN_UNPREDICTABLE(i7 != zero) {
267         i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
268       }
269 
270       size_t c = channels;
271       uint16_t* b = (uint16_t*) buffer;
272       while (c >= 8) {
273         const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
274         i0 += 8;
275         const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
276         i1 += 8;
277         const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
278         i2 += 8;
279         const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
280         i3 += 8;
281         const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
282         i4 += 8;
283         const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
284         i5 += 8;
285         const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
286         i6 += 8;
287         const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
288         i7 += 8;
289         const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
290         b += 8;
291 
292         const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
293         const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
294         const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
295         const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
296         const __m256 vsum01a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vacc), _MM_FROUND_NO_EXC));
297         const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
298         const __m256 vsum0167a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01a, vsum67), _MM_FROUND_NO_EXC));
299         const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum0167a), _MM_FROUND_NO_EXC));
300 
301         __m256 vout = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vsum, vscale), _MM_FROUND_NO_EXC));
302         vout = _mm256_max_ps(vout, vmin);
303         vout = _mm256_min_ps(vout, vmax);
304 
305         _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
306         o += 8;
307 
308         c -= 8;
309       }
310       if (c != 0) {
311         const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
312         const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
313         const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
314         const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
315         const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
316         const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
317         const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
318         const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
319         const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
320 
321         const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
322         const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
323         const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
324         const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
325         const __m256 vsum01a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vacc), _MM_FROUND_NO_EXC));
326         const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
327         const __m256 vsum0167a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01a, vsum67), _MM_FROUND_NO_EXC));
328         const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum0167a), _MM_FROUND_NO_EXC));
329 
330         __m256 vout = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vsum, vscale), _MM_FROUND_NO_EXC));
331         vout = _mm256_max_ps(vout, vmin);
332         vout = _mm256_min_ps(vout, vmax);
333 
334         __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
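        // Store the remaining 1..7 fp16 values: 4, then 2, then 1, shifting the
        // consumed lanes out of the low half of vh after each partial store.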
335         if (c & 4) {
336           _mm_storel_epi64((__m128i*) o, vh);
337           vh = _mm_unpackhi_epi64(vh, vh);
338           o += 4;
339         }
340         if (c & 2) {
341           _mm_storeu_si32(o, vh);
342           vh = _mm_srli_epi64(vh, 32);
343           o += 2;
344         }
345         if (c & 1) {
346           *o = (uint16_t) _mm_extract_epi16(vh, 0);
347           o += 1;
348         }
349       }
350     }
351     o = (uint16_t*) ((uintptr_t) o + output_increment);
352   } while (--output_pixels != 0);
353 }
354 
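// Single-pass variant of the f16 average pooling kernel, valid for
// kernel_elements <= 9: all taps are summed directly (no scratch buffer),
// then scaled, clamped, and converted back to fp16.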
355 void xnn_f16_avgpool_minmax_ukernel_9x__f16c_c8(
356     size_t output_pixels,
357     size_t kernel_elements,
358     size_t channels,
359     const void** input,
360     size_t input_offset,
361     const void* zero,
362     void* output,
363     size_t input_increment,
364     size_t output_increment,
365     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
366 {
367   assert(output_pixels != 0);
368   assert(kernel_elements != 0);
369   assert(kernel_elements <= 9);
370   assert(channels != 0);
371 
372   const __m256 vscale = _mm256_load_ps(params->avx.scale);
373   const __m256 vmin = _mm256_load_ps(params->avx.min);
374   const __m256 vmax = _mm256_load_ps(params->avx.max);
375 
376   uint16_t* o = (uint16_t*) output;
377   do {
378     const uint16_t* i0 = (const uint16_t*) input[0];
379     assert(i0 != NULL);
380     const uint16_t* i1 = (const uint16_t*) input[1];
381     const uint16_t* i2 = (const uint16_t*) input[2];
382     const uint16_t* i3 = (const uint16_t*) input[3];
383     const uint16_t* i4 = (const uint16_t*) input[4];
384     const uint16_t* i5 = (const uint16_t*) input[5];
385     const uint16_t* i6 = (const uint16_t*) input[6];
386     const uint16_t* i7 = (const uint16_t*) input[7];
387     const uint16_t* i8 = (const uint16_t*) input[8];
388     input = (const void**) ((uintptr_t) input + input_increment);
389     if (kernel_elements < 2) {
390       i1 = (const uint16_t*) zero;
391     }
392     assert(i1 != NULL);
393     if (kernel_elements <= 2) {
394       i2 = (const uint16_t*) zero;
395     }
396     assert(i2 != NULL);
397     if (kernel_elements < 4) {
398       i3 = (const uint16_t*) zero;
399     }
400     assert(i3 != NULL);
401     if (kernel_elements <= 4) {
402       i4 = (const uint16_t*) zero;
403     }
404     assert(i4 != NULL);
405     if (kernel_elements < 6) {
406       i5 = (const uint16_t*) zero;
407     }
408     assert(i5 != NULL);
409     if (kernel_elements <= 6) {
410       i6 = (const uint16_t*) zero;
411     }
412     assert(i6 != NULL);
413     if (kernel_elements < 8) {
414       i7 = (const uint16_t*) zero;
415     }
416     assert(i7 != NULL);
417     if (kernel_elements <= 8) {
418       i8 = (const uint16_t*) zero;
419     }
420     assert(i8 != NULL);
421     if XNN_UNPREDICTABLE(i0 != zero) {
422       i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
423     }
424     if XNN_UNPREDICTABLE(i1 != zero) {
425       i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
426     }
427     if XNN_UNPREDICTABLE(i2 != zero) {
428       i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
429     }
430     if XNN_UNPREDICTABLE(i3 != zero) {
431       i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
432     }
433     if XNN_UNPREDICTABLE(i4 != zero) {
434       i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
435     }
436     if XNN_UNPREDICTABLE(i5 != zero) {
437       i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
438     }
439     if XNN_UNPREDICTABLE(i6 != zero) {
440       i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
441     }
442     if XNN_UNPREDICTABLE(i7 != zero) {
443       i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
444     }
445     if XNN_UNPREDICTABLE(i8 != zero) {
446       i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
447     }
448 
449     size_t c = channels;
450     while (c >= 8) {
451       const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
452       i0 += 8;
453       const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
454       i1 += 8;
455       const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
456       i2 += 8;
457       const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
458       i3 += 8;
459       const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
460       i4 += 8;
461       const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
462       i5 += 8;
463       const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
464       i6 += 8;
465       const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
466       i7 += 8;
467       const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
468       i8 += 8;
469 
470       const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
471       const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
472       const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
473       const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
474       const __m256 vsum018 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vi8), _MM_FROUND_NO_EXC));
475       const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
476       const __m256 vsum01678 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum018, vsum67), _MM_FROUND_NO_EXC));
477       const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum01678), _MM_FROUND_NO_EXC));
478 
479       __m256 vout = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vsum, vscale), _MM_FROUND_NO_EXC));
480       vout = _mm256_max_ps(vout, vmin);
481       vout = _mm256_min_ps(vout, vmax);
482 
483       _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
484       o += 8;
485 
486       c -= 8;
487     }
488     if (c != 0) {
489       const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
490       const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
491       const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
492       const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
493       const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
494       const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
495       const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
496       const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
497       const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
498 
499       const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
500       const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
501       const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
502       const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
503       const __m256 vsum018 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vi8), _MM_FROUND_NO_EXC));
504       const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
505       const __m256 vsum01678 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum018, vsum67), _MM_FROUND_NO_EXC));
506       const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum01678), _MM_FROUND_NO_EXC));
507 
508       __m256 vout = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vsum, vscale), _MM_FROUND_NO_EXC));
509       vout = _mm256_max_ps(vout, vmin);
510       vout = _mm256_min_ps(vout, vmax);
511 
512       __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
513       if (c & 4) {
514         _mm_storel_epi64((__m128i*) o, vh);
515         vh = _mm_unpackhi_epi64(vh, vh);
516         o += 4;
517       }
518       if (c & 2) {
519         _mm_storeu_si32(o, vh);
520         vh = _mm_srli_epi64(vh, 32);
521         o += 2;
522       }
523       if (c & 1) {
524         *o = (uint16_t) _mm_extract_epi16(vh, 0);
525         o += 1;
526       }
527     }
528     o = (uint16_t*) ((uintptr_t) o + output_increment);
529   } while (--output_pixels != 0);
530 }
531 
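// Converts packed IEEE fp16 values to fp32 with VCVTPH2PS, 16 elements per main-loop
// iteration, then 8, then a 1..7 element tail. `n` is a byte count (a multiple of
// sizeof(uint16_t)); the params argument is not referenced by this variant.
//
// Illustrative call only (the buffers and sizes here are an assumption, not part of this file):
//   uint16_t src[32];  // fp16 bit patterns
//   float dst[32];
//   union xnn_f16_f32_cvt_params cvt_params;  // unused by this variant, but the signature expects at least one element
//   xnn_f16_f32_vcvt_ukernel__f16c_x16(32 * sizeof(uint16_t), src, dst, &cvt_params);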
532 void xnn_f16_f32_vcvt_ukernel__f16c_x16(
533     size_t n,
534     const void* input,
535     float* output,
536     const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
537 {
538   assert(n != 0);
539   assert(n % sizeof(uint16_t) == 0);
540   assert(input != NULL);
541   assert(output != NULL);
542 
543   const uint16_t* i = (const uint16_t*) input;
544   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
545     const __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
546     const __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
547     i += 16;
548 
549     _mm256_storeu_ps(output, vacc0);
550     _mm256_storeu_ps(output + 8, vacc1);
551     output += 16;
552   }
553   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
554     const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
555     i += 8;
556 
557     _mm256_storeu_ps(output, vacc);
558     output += 8;
559   }
560   if XNN_UNLIKELY(n != 0) {
561     assert(n >= 1 * sizeof(uint16_t));
562     assert(n <= 7 * sizeof(uint16_t));
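    // A full 8-element vector is loaded even for a 1..7 element tail; reading past
    // the end of `input` is tolerated here (the kernel is declared XNN_OOB_READS).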
563     const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
564 
565     __m128 vacc_lo = _mm256_castps256_ps128(vacc);
566     if (n & (4 * sizeof(uint16_t))) {
567       _mm_storeu_ps(output, vacc_lo);
568       vacc_lo = _mm256_extractf128_ps(vacc, 1);
569       output += 4;
570     }
571     if (n & (2 * sizeof(uint16_t))) {
572       _mm_storel_pi((__m64*) output, vacc_lo);
573       vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo);
574       output += 2;
575     }
576     if (n & (1 * sizeof(uint16_t))) {
577       _mm_store_ss(output, vacc_lo);
578     }
579   }
580 }
581 
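// f16 global average pooling (minmax) for rows > 7: the first 7 rows are summed into
// `buffer` (one fp16 accumulator per 8-channel group, rounded after every addition),
// each following block of 7 rows is accumulated into `buffer`, and the final block of
// up to 7 rows is added, scaled, clamped, and written to `output`. Rows missing from
// the final block are redirected to the `zero` buffer.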
582 void xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8(
583     size_t rows,
584     size_t channels,
585     const void* input,
586     size_t input_stride,
587     const void* zero,
588     void* buffer,
589     void* output,
590     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
591 {
592   assert(rows > 7);
593   assert(channels != 0);
594 
595   const uint16_t* i0 = input;
596   const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
597   const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
598   const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
599   const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
600   const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
601   const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
602   const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t);
603 
604   uint16_t* b = buffer;
605   size_t c = channels;
606   for (; c != 0; c = doz(c, 8)) {
607     const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
608     const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
609 
610     const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
611     __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
612 
613     const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
614     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
615     const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
616     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
617     const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
618     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
619     const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
620     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
621     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
622 
623     _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
624   }
625 
626   for (rows -= 7; rows > 7; rows -= 7) {
627     i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
628     i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
629     i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
630     i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
631     i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
632     i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
633     i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
634 
635     uint16_t* b = buffer;
636     size_t c = channels;
637     for (; c != 0; c = doz(c, 8)) {
638       __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) b);
639 
640       const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
641 
642       const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
643       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
644       const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
645       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
646       const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
647       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
648       const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
649       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
650       const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
651       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
652       const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
653       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
654       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
655 
656       _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
657     }
658   }
659 
660   i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
661   i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
662   if XNN_UNPREDICTABLE(rows < 2) {
663     i1 = (const uint16_t*) zero;
664   }
665   i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
666   if XNN_UNPREDICTABLE(rows <= 2) {
667     i2 = (const uint16_t*) zero;
668   }
669   i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
670   if XNN_UNPREDICTABLE(rows < 4) {
671     i3 = (const uint16_t*) zero;
672   }
673   i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
674   if XNN_UNPREDICTABLE(rows <= 4) {
675     i4 = (const uint16_t*) zero;
676   }
677   i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
678   if XNN_UNPREDICTABLE(rows < 6) {
679     i5 = (const uint16_t*) zero;
680   }
681   i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
682   if XNN_UNPREDICTABLE(rows <= 6) {
683     i6 = (const uint16_t*) zero;
684   }
685   uint16_t* o = (uint16_t*) output;
686 
687   const __m256 vscale = _mm256_load_ps(params->avx.scale);
688   const __m256 vmin = _mm256_load_ps(params->avx.min);
689   const __m256 vmax = _mm256_load_ps(params->avx.max);
690   for (; channels >= 8; channels -= 8) {
691     __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
692 
693     const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
694 
695     const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
696     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
697     const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
698     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
699     const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
700     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
701     const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
702     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
703     const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
704     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
705     const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
706     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
707     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
708 
709     vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
710 
711     __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
712 
713     vout01234567 = _mm256_min_ps(vout01234567, vmax);
714 
715     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC));
716     o += 8;
717   }
718   if XNN_UNLIKELY(channels != 0) {
719     {
720       __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
721 
722       const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
723       const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
724       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
725       const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
726       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
727       const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
728       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
729       const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
730       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
731       const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
732       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
733       const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
734       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
735       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
736 
737       vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
738       __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
739       vout01234567 = _mm256_min_ps(vout01234567, vmax);
740 
741       __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC);
742       if (channels & 4) {
743         _mm_storel_epi64((__m128i*) o, vh01234567);
744         o += 4;
745         vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
746       }
747       if (channels & 2) {
748         _mm_storeu_si32(o, vh01234567);
749         o += 2;
750         vh01234567 = _mm_srli_epi64(vh01234567, 32);
751       }
752       if (channels & 1) {
753         *o = (uint16_t) _mm_extract_epi16(vh01234567, 0);
754       }
755     }
756   }
757 }
758 
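// Single-pass f16 global average pooling for 1..7 rows: missing rows are redirected to
// the `zero` buffer, the 7 row pointers are summed 8 channels at a time, and the result
// is scaled by params->avx.scale and clamped to [min, max].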
759 void xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8(
760     size_t rows,
761     size_t channels,
762     const void* input,
763     size_t input_stride,
764     const void* zero,
765     void* output,
766     const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
767 {
768   assert(rows != 0);
769   assert(rows <= 7);
770   assert(channels != 0);
771 
772   const uint16_t* i0 = input;
773   const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
774   if XNN_UNPREDICTABLE(rows < 2) {
775     i1 = (const uint16_t*) zero;
776   }
777   const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
778   if XNN_UNPREDICTABLE(rows <= 2) {
779     i2 = (const uint16_t*) zero;
780   }
781   const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
782   if XNN_UNPREDICTABLE(rows < 4) {
783     i3 = (const uint16_t*) zero;
784   }
785   const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
786   if XNN_UNPREDICTABLE(rows <= 4) {
787     i4 = (const uint16_t*) zero;
788   }
789   const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
790   if XNN_UNPREDICTABLE(rows < 6) {
791     i5 = (const uint16_t*) zero;
792   }
793   const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
794   if XNN_UNPREDICTABLE(rows <= 6) {
795     i6 = (const uint16_t*) zero;
796   }
797   uint16_t* o = (uint16_t*) output;
798 
799   const __m256 vscale = _mm256_load_ps(params->avx.scale);
800   const __m256 vmin = _mm256_load_ps(params->avx.min);
801   const __m256 vmax = _mm256_load_ps(params->avx.max);
802   for (; channels >= 8; channels -= 8) {
803     const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
804     i0 += 8;
805     const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
806     i1 += 8;
807 
808     const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
809     __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
810     i2 += 8;
811 
812     const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
813     i3 += 8;
814     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
815     const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
816     i4 += 8;
817     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
818     const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
819     i5 += 8;
820     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
821     const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
822     i6 += 8;
823     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
824     vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
825 
826     vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
827 
828     __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
829 
830     vout01234567 = _mm256_min_ps(vout01234567, vmax);
831 
832     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC));
833     o += 8;
834   }
835   if XNN_UNLIKELY(channels != 0) {
836     {
837       const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
838       const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
839 
840       const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
841       __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
842 
843       const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
844       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
845       const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
846       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
847       const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
848       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
849       const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
850       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
851       vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
852 
853       vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
854       __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
855       vout01234567 = _mm256_min_ps(vout01234567, vmax);
856 
857       __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC);
858       if (channels & 4) {
859         _mm_storel_epi64((__m128i*) o, vh01234567);
860         o += 4;
861         vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
862       }
863       if (channels & 2) {
864         _mm_storeu_si32(o, vh01234567);
865         o += 2;
866         vh01234567 = _mm_srli_epi64(vh01234567, 32);
867       }
868       if (channels & 1) {
869         *o = (uint16_t) _mm_extract_epi16(vh01234567, 0);
870       }
871     }
872   }
873 }
874 
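// f16 max pooling (minmax) micro-kernel: the first pass reduces up to 9 taps, and each
// following pass folds up to 8 more taps into the partially reduced output. Taps beyond
// kernel_elements are aliased to i0, which leaves the running maximum unchanged.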
875 void xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8(
876     size_t output_pixels,
877     size_t kernel_elements,
878     size_t channels,
879     const void** input,
880     size_t input_offset,
881     void* output,
882     size_t input_increment,
883     size_t output_increment,
884     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
885 {
886   assert(output_pixels != 0);
887   assert(kernel_elements != 0);
888   assert(channels != 0);
889 
890   const __m256 voutput_min = _mm256_load_ps(params->avx.min);
891   const __m256 voutput_max = _mm256_load_ps(params->avx.max);
892   do {
893     uint16_t* o = output;
894     {
895       const uint16_t* i0 = *input++;
896       const uint16_t* i1 = *input++;
897       const uint16_t* i2 = *input++;
898       const uint16_t* i3 = *input++;
899       const uint16_t* i4 = *input++;
900       const uint16_t* i5 = *input++;
901       const uint16_t* i6 = *input++;
902       const uint16_t* i7 = *input++;
903       const uint16_t* i8 = *input++;
904       i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
905       i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
906       i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
907       i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
908       i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
909       i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
910       i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
911       i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
912       i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
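      // Alias the unused tap pointers to i0 when kernel_elements < 9; taking the
      // maximum with a duplicated row leaves the result unchanged.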
913       if (kernel_elements < 2) {
914         i1 = i0;
915       }
916       if (kernel_elements <= 2) {
917         i2 = i0;
918       }
919       if (kernel_elements < 4) {
920         i3 = i0;
921       }
922       if (kernel_elements <= 4) {
923         i4 = i0;
924       }
925       if (kernel_elements < 6) {
926         i5 = i0;
927       }
928       if (kernel_elements <= 6) {
929         i6 = i0;
930       }
931       if (kernel_elements < 8) {
932         i7 = i0;
933       }
934       if (kernel_elements <= 8) {
935         i8 = i0;
936       }
937 
938       size_t c = channels;
939       for (; c >= 8; c -= 8) {
940         const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
941         i0 += 8;
942         const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
943         i1 += 8;
944         const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
945         i2 += 8;
946         const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
947         i3 += 8;
948         const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
949         i4 += 8;
950         const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
951         i5 += 8;
952         const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
953         i6 += 8;
954         const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
955         i7 += 8;
956         const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
957         i8 += 8;
958 
959         const __m256 vmax018 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vi8);
960         const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
961         const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
962         const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
963 
964         const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
965         const __m256 vmax01678 = _mm256_max_ps(vmax018, vmax67);
966         const __m256 vmax = _mm256_max_ps(vmax2345, vmax01678);
967         const __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
968 
969         _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
970         o += 8;
971       }
972       if (c != 0) {
973         const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
974         i0 += 8;
975         const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
976         i1 += 8;
977         const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
978         i2 += 8;
979         const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
980         i3 += 8;
981         const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
982         i4 += 8;
983         const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
984         i5 += 8;
985         const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
986         i6 += 8;
987         const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
988         i7 += 8;
989         const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
990         i8 += 8;
991 
992         const __m256 vmax018 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vi8);
993         const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
994         const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
995         const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
996 
997         const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
998         const __m256 vmax01678 = _mm256_max_ps(vmax018, vmax67);
999         const __m256 vmax = _mm256_max_ps(vmax2345, vmax01678);
1000         __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
1001 
1002         __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
1003         if (c & 4) {
1004           _mm_storel_epi64((__m128i*) o, vh);
1005           vh = _mm_unpackhi_epi64(vh, vh);
1006           o += 4;
1007         }
1008         if (c & 2) {
1009           _mm_storeu_si32(o, vh);
1010           vh = _mm_srli_epi64(vh, 32);
1011           o += 2;
1012         }
1013         if (c & 1) {
1014           *o = _mm_extract_epi16(vh, 0);
1015           o += 1;
1016         }
1017       }
1018     }
1019 
1020     for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
1021       const uint16_t* i0 = *input++;
1022       const uint16_t* i1 = *input++;
1023       const uint16_t* i2 = *input++;
1024       const uint16_t* i3 = *input++;
1025       const uint16_t* i4 = *input++;
1026       const uint16_t* i5 = *input++;
1027       const uint16_t* i6 = *input++;
1028       const uint16_t* i7 = *input++;
1029       i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
1030       i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
1031       i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
1032       i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
1033       i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
1034       i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
1035       i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
1036       i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
1037       if (k < 2) {
1038         i1 = i0;
1039       }
1040       if (k <= 2) {
1041         i2 = i0;
1042       }
1043       if (k < 4) {
1044         i3 = i0;
1045       }
1046       if (k <= 4) {
1047         i4 = i0;
1048       }
1049       if (k < 6) {
1050         i5 = i0;
1051       }
1052       if (k <= 6) {
1053         i6 = i0;
1054       }
1055       if (k < 8) {
1056         i7 = i0;
1057       }
1058 
1059       o = output;
1060       size_t c = channels;
1061       for (; c >= 8; c -= 8) {
1062         const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1063         i0 += 8;
1064         const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1065         i1 += 8;
1066         const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
1067         i2 += 8;
1068         const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
1069         i3 += 8;
1070         const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
1071         i4 += 8;
1072         const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
1073         i5 += 8;
1074         const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
1075         i6 += 8;
1076         const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
1077         i7 += 8;
1078         const __m256 vo = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o));
1079 
1080         const __m256 vmax01 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vo);
1081         const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
1082         const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
1083         const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
1084 
1085         const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
1086         const __m256 vmax0167 = _mm256_max_ps(vmax01, vmax67);
1087         const __m256 vmax = _mm256_max_ps(vmax2345, vmax0167);
1088         const __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
1089 
1090         _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
1091         o += 8;
1092       }
1093       if (c != 0) {
1094         const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1095         const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1096         const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
1097         const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
1098         const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
1099         const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
1100         const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
1101         const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
1102         const __m256 vo = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o));
1103 
1104         const __m256 vmax01 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vo);
1105         const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
1106         const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
1107         const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
1108 
1109         const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
1110         const __m256 vmax0167 = _mm256_max_ps(vmax01, vmax67);
1111         const __m256 vmax = _mm256_max_ps(vmax2345, vmax0167);
1112         __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
1113 
1114         __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
1115         if (c & 4) {
1116           _mm_storel_epi64((__m128i*) o, vh);
1117           vh = _mm_unpackhi_epi64(vh, vh);
1118           o += 4;
1119         }
1120         if (c & 2) {
1121           _mm_storeu_si32(o, vh);
1122           vh = _mm_srli_epi64(vh, 32);
1123           o += 2;
1124         }
1125         if (c & 1) {
1126           *o = _mm_extract_epi16(vh, 0);
1127           o += 1;
1128         }
1129       }
1130     }
1131     input = (const void**) ((uintptr_t) input + input_increment);
1132     output = (uint16_t*) ((uintptr_t) o + output_increment);
1133   } while (--output_pixels != 0);
1134 }
1135 
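// f16 PReLU micro-kernel processing 2 rows x 16 channels per iteration: negative inputs
// are multiplied by the per-channel weight, non-negative inputs pass through unchanged
// (selected with _mm256_blendv_ps on the sign bit of the input).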
1136 void xnn_f16_prelu_ukernel__f16c_2x16(
1137     size_t rows,
1138     size_t channels,
1139     const void* restrict input,
1140     size_t input_stride,
1141     const void* restrict weights,
1142     void* restrict output,
1143     size_t output_stride) XNN_OOB_READS
1144 {
1145   assert(rows != 0);
1146   assert(channels != 0);
1147   assert(channels % sizeof(uint16_t) == 0);
1148 
1149   const uint16_t* i0 = (const uint16_t*) input;
1150   uint16_t* o0 = (uint16_t*) output;
1151   const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
1152   uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride);
1153 
1154   const size_t input_increment = input_stride * 2 - channels;
1155   const size_t output_increment = output_stride * 2 - channels;
1156 
1157   do {
1158     if XNN_UNPREDICTABLE(rows < 2) {
1159       i1 = i0;
1160       o1 = o0;
1161     }
1162 
1163     const uint16_t* w = (const uint16_t*) weights;
1164     size_t c = channels;
1165     for (; c >= 16 * sizeof(uint16_t); c -= 16 * sizeof(uint16_t)) {
1166       const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
1167       const __m256 vw89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 8)));
1168       w += 16;
1169 
1170       const __m256 vi0x001234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1171       const __m256 vi0x089ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8)));
1172       i0 += 16;
1173       const __m256 vi1x001234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1174       const __m256 vi1x089ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8)));
1175       i1 += 16;
1176 
1177       __m256 vacc0x001234567 = _mm256_mul_ps(vi0x001234567, vw01234567);
1178       __m256 vacc0x089ABCDEF = _mm256_mul_ps(vi0x089ABCDEF, vw89ABCDEF);
1179       __m256 vacc1x001234567 = _mm256_mul_ps(vi1x001234567, vw01234567);
1180       __m256 vacc1x089ABCDEF = _mm256_mul_ps(vi1x089ABCDEF, vw89ABCDEF);
1181 
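           // PReLU: _mm256_blendv_ps selects x*w for lanes whose sign bit is set
           // (x < 0) and passes x through unchanged otherwise.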
1182       vacc0x001234567 = _mm256_blendv_ps(vi0x001234567, vacc0x001234567, vi0x001234567);
1183       vacc0x089ABCDEF = _mm256_blendv_ps(vi0x089ABCDEF, vacc0x089ABCDEF, vi0x089ABCDEF);
1184       vacc1x001234567 = _mm256_blendv_ps(vi1x001234567, vacc1x001234567, vi1x001234567);
1185       vacc1x089ABCDEF = _mm256_blendv_ps(vi1x089ABCDEF, vacc1x089ABCDEF, vi1x089ABCDEF);
1186 
1187       _mm_storeu_si128((__m128i*) o0, _mm256_cvtps_ph(vacc0x001234567, _MM_FROUND_NO_EXC));
1189       _mm_storeu_si128((__m128i*) (o0 + 8), _mm256_cvtps_ph(vacc0x089ABCDEF, _MM_FROUND_NO_EXC));
1190       o0 += 16;
1191       _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1x001234567, _MM_FROUND_NO_EXC));
1193       _mm_storeu_si128((__m128i*) (o1 + 8), _mm256_cvtps_ph(vacc1x089ABCDEF, _MM_FROUND_NO_EXC));
1194       o1 += 16;
1195     }
1196     for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) {
1197       const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
1198       w += 8;
1199 
1200       const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1201       i0 += 8;
1202       const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1203       i1 += 8;
1204 
1205       __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567);
1206       __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567);
1207 
1208       vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567);
1209       vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567);
1210 
1211       _mm_storeu_si128((__m128i*) o0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
1212       o0 += 8;
1213       _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC));
1214       o1 += 8;
1215     }
1216     if XNN_UNLIKELY(c != 0) {
1217       const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
1218 
1219       const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1220       i0 = (const uint16_t*) ((uintptr_t) i0 + c);
1221       const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1222       i1 = (const uint16_t*) ((uintptr_t) i1 + c);
1223 
1224       __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567);
1225       __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567);
1226 
1227       vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567);
1228       vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567);
1229 
1230       __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
1231       __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC);
1232       if (c & (4 * sizeof(uint16_t))) {
1233         _mm_storel_epi64((__m128i*) o0, vh0x01234567);
1234         _mm_storel_epi64((__m128i*) o1, vh1x01234567);
1235 
1236         vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
1237         vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567);
1238 
1239         o0 += 4;
1240         o1 += 4;
1241       }
1242       if (c & (2 * sizeof(uint16_t))) {
1243         _mm_storeu_si32(o0, vh0x01234567);
1244         _mm_storeu_si32(o1, vh1x01234567);
1245 
1246         vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
1247         vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32);
1248 
1249         o0 += 2;
1250         o1 += 2;
1251       }
1252       if (c & (1 * sizeof(uint16_t))) {
1253         *o0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
1254         *o1 = (uint16_t) _mm_extract_epi16(vh1x01234567, 0);
1255 
1256         o0 += 1;
1257         o1 += 1;
1258       }
1259     }
1260     i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
1261     o0 = (uint16_t*) ((uintptr_t) o0 + output_increment);
1262     i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
1263     o1 = (uint16_t*) ((uintptr_t) o1 + output_increment);
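         // doz() is a saturating ("difference or zero") subtract; with an odd row
         // count the last pass runs with i1/o1 aliased to i0/o0.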
1264     rows = doz(rows, 2);
1265   } while (rows != 0);
1266 }
1267 
1268 void xnn_f16_rmax_ukernel__f16c(
1269     size_t batch,
1270     const void* input,
1271     void* output) XNN_OOB_READS
1272 {
1273   assert(batch != 0);
1274   assert(batch % sizeof(uint16_t) == 0);
1275 
1276   const uint16_t* i = (const uint16_t*) input;
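       // Seed every accumulator with a broadcast of the first element so the
       // running maximum is valid even when all inputs are negative.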
1277   __m128i vmax_init = _mm_shufflelo_epi16(_mm_loadl_epi64((const __m128i*) i), _MM_SHUFFLE(0, 0, 0, 0));
1278   vmax_init = _mm_unpacklo_epi64(vmax_init, vmax_init);
1279   __m256 vmax0 = _mm256_cvtph_ps(vmax_init);
1280   __m256 vmax1 = vmax0;
1281   __m256 vmax2 = vmax0;
1282   __m256 vmax3 = vmax0;
1283   for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) {
1284     const __m256 vx0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1285     const __m256 vx1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
1286     const __m256 vx2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 16)));
1287     const __m256 vx3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 24)));
1288     i += 32;
1289 
1290     vmax0 = _mm256_max_ps(vmax0, vx0);
1291     vmax1 = _mm256_max_ps(vmax1, vx1);
1292     vmax2 = _mm256_max_ps(vmax2, vx2);
1293     vmax3 = _mm256_max_ps(vmax3, vx3);
1294   }
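       // Combine the four partial maxima, then finish any remaining full groups
       // of 8 elements with a single accumulator.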
1295   __m256 vmax = _mm256_max_ps(_mm256_max_ps(vmax0, vmax1), _mm256_max_ps(vmax2, vmax3));
1296   for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) {
1297     const __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1298     i += 8;
1299     vmax = _mm256_max_ps(vmax, vx);
1300   }
1301   __m128 vmax_lo = _mm_max_ps(_mm256_castps256_ps128(vmax), _mm256_extractf128_ps(vmax, 1));
1302   if XNN_UNLIKELY(batch != 0) {
1303     const __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1304     __m128 vx_lo = _mm256_castps256_ps128(vx);
1305     if (batch & (4 * sizeof(uint16_t))) {
1306       vmax_lo = _mm_max_ps(vmax_lo, vx_lo);
1307       vx_lo = _mm256_extractf128_ps(vx, 1);
1308     }
1309     if (batch & (2 * sizeof(uint16_t))) {
1310       vmax_lo = _mm_blend_ps(_mm_max_ps(vmax_lo, vx_lo), vmax_lo, 0xC);
1311       vx_lo = _mm_movehl_ps(vx_lo, vx_lo);
1312     }
1313     if (batch & (1 * sizeof(uint16_t))) {
1314       vmax_lo = _mm_max_ss(vmax_lo, vx_lo);
1315     }
1316   }
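       // Horizontal reduction of the remaining 4 fp32 lanes; the scalar maximum
       // is converted back to fp16 for the output.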
1317   vmax_lo = _mm_max_ps(vmax_lo, _mm_movehl_ps(vmax_lo, vmax_lo));
1318   vmax_lo = _mm_max_ss(vmax_lo, _mm_movehdup_ps(vmax_lo));
1319   *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(_mm_cvtps_ph(vmax_lo, _MM_FROUND_NO_EXC), 0);
1320 }
1321 
1322 void xnn_f16_vadd_minmax_ukernel__f16c_x16(
1323     size_t n,
1324     const void* restrict a_ptr,
1325     const void* restrict b_ptr,
1326     void* restrict y_ptr,
1327     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1328 {
1329   assert(n != 0);
1330   assert(n % sizeof(uint16_t) == 0);
1331   assert(a_ptr != NULL);
1332   assert(b_ptr != NULL);
1333   assert(y_ptr != NULL);
1334 
1335   const uint16_t* a = (const uint16_t*) a_ptr;
1336   const uint16_t* b = (const uint16_t*) b_ptr;
1337   uint16_t* y = (uint16_t*) y_ptr;
1338 
1339   const __m256 vy_min = _mm256_load_ps(params->avx.min);
1340   const __m256 vy_max = _mm256_load_ps(params->avx.max);
1341 
1342   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1343     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1344     const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1345     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1346     const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
1347     a += 16;
1348     b += 16;
1349 
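         // Add in fp32, then round through fp16 and back so the result matches a
         // native half-precision addition before the min/max clamps.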
1350     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
1351     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
1352 
1353 
1354     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
1355     vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
1356 
1357     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
1358     vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
1359 
1360     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1361     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1362     y += 16;
1363   }
1364   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1365     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1366     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1367     a += 8;
1368     b += 8;
1369 
1370     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
1371 
1372     vy = _mm256_max_ps(vy, vy_min);
1373     vy = _mm256_min_ps(vy, vy_max);
1374 
1375     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1376     y += 8;
1377   }
1378   if XNN_UNLIKELY(n != 0) {
1379     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1380     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1381 
1382     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
1383 
1384     vy = _mm256_max_ps(vy, vy_min);
1385     vy = _mm256_min_ps(vy, vy_max);
1386 
1387     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1388     if (n & (4 * sizeof(uint16_t))) {
1389       _mm_storel_epi64((__m128i*) y, vh);
1390       vh = _mm_unpackhi_epi64(vh, vh);
1391       y += 4;
1392     }
1393     if (n & (2 * sizeof(uint16_t))) {
1394       _mm_storeu_si32(y, vh);
1395       vh = _mm_srli_epi64(vh, 32);
1396       y += 2;
1397     }
1398     if (n & (1 * sizeof(uint16_t))) {
1399       *y = (uint16_t) _mm_extract_epi16(vh, 0);
1400     }
1401   }
1402 }
1403 
1404 void xnn_f16_vaddc_minmax_ukernel__f16c_x16(
1405     size_t n,
1406     const void* restrict a_ptr,
1407     const void* restrict b_ptr,
1408     void* restrict y_ptr,
1409     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1410 {
1411   assert(n != 0);
1412   assert(n % sizeof(uint16_t) == 0);
1413   assert(a_ptr != NULL);
1414   assert(b_ptr != NULL);
1415   assert(y_ptr != NULL);
1416 
1417   const uint16_t* a = (const uint16_t*) a_ptr;
1418   const uint16_t* b = (const uint16_t*) b_ptr;
1419   uint16_t* y = (uint16_t*) y_ptr;
1420 
1421   const __m256 vy_min = _mm256_load_ps(params->avx.min);
1422   const __m256 vy_max = _mm256_load_ps(params->avx.max);
1423 
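       // The second operand is a single fp16 scalar: broadcast it once and reuse
       // it for every element of a.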
1424   const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1425   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1426     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1427     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1428     a += 16;
1429 
1430     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va01234567, vb), _MM_FROUND_NO_EXC));
1431     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
1432 
1433 
1434     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
1435     vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
1436 
1437     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
1438     vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
1439 
1440     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1441     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1442     y += 16;
1443   }
1444   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1445     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1446     a += 8;
1447 
1448     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
1449 
1450     vy = _mm256_max_ps(vy, vy_min);
1451     vy = _mm256_min_ps(vy, vy_max);
1452 
1453     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1454     y += 8;
1455   }
1456   if XNN_UNLIKELY(n != 0) {
1457     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1458 
1459     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
1460 
1461     vy = _mm256_max_ps(vy, vy_min);
1462     vy = _mm256_min_ps(vy, vy_max);
1463 
1464     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1465     if (n & (4 * sizeof(uint16_t))) {
1466       _mm_storel_epi64((__m128i*) y, vh);
1467       vh = _mm_unpackhi_epi64(vh, vh);
1468       y += 4;
1469     }
1470     if (n & (2 * sizeof(uint16_t))) {
1471       _mm_storeu_si32(y, vh);
1472       vh = _mm_srli_epi64(vh, 32);
1473       y += 2;
1474     }
1475     if (n & (1 * sizeof(uint16_t))) {
1476       *y = (uint16_t) _mm_extract_epi16(vh, 0);
1477     }
1478   }
1479 }
1480 
1481 void xnn_f16_vdiv_minmax_ukernel__f16c_x8(
1482     size_t n,
1483     const void* restrict a_ptr,
1484     const void* restrict b_ptr,
1485     void* restrict y_ptr,
1486     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1487 {
1488   assert(n != 0);
1489   assert(n % sizeof(uint16_t) == 0);
1490   assert(a_ptr != NULL);
1491   assert(b_ptr != NULL);
1492   assert(y_ptr != NULL);
1493 
1494   const uint16_t* a = (const uint16_t*) a_ptr;
1495   const uint16_t* b = (const uint16_t*) b_ptr;
1496   uint16_t* y = (uint16_t*) y_ptr;
1497 
1498   const __m256 vy_min = _mm256_load_ps(params->avx.min);
1499   const __m256 vy_max = _mm256_load_ps(params->avx.max);
1500 
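       // x8 tile: one 8-element vector per iteration; divide in fp32, round
       // through fp16, then clamp to [min, max].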
1501   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1502     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1503     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1504     a += 8;
1505     b += 8;
1506 
1507     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_NO_EXC));
1508 
1509     vy = _mm256_max_ps(vy, vy_min);
1510     vy = _mm256_min_ps(vy, vy_max);
1511 
1512     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1513     y += 8;
1514   }
1515   if XNN_UNLIKELY(n != 0) {
1516     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1517     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1518 
1519     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_NO_EXC));
1520 
1521     vy = _mm256_max_ps(vy, vy_min);
1522     vy = _mm256_min_ps(vy, vy_max);
1523 
1524     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1525     if (n & (4 * sizeof(uint16_t))) {
1526       _mm_storel_epi64((__m128i*) y, vh);
1527       vh = _mm_unpackhi_epi64(vh, vh);
1528       y += 4;
1529     }
1530     if (n & (2 * sizeof(uint16_t))) {
1531       _mm_storeu_si32(y, vh);
1532       vh = _mm_srli_epi64(vh, 32);
1533       y += 2;
1534     }
1535     if (n & (1 * sizeof(uint16_t))) {
1536       *y = (uint16_t) _mm_extract_epi16(vh, 0);
1537     }
1538   }
1539 }
1540 
1541 void xnn_f16_vdivc_minmax_ukernel__f16c_x8(
1542     size_t n,
1543     const void* restrict a_ptr,
1544     const void* restrict b_ptr,
1545     void* restrict y_ptr,
1546     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1547 {
1548   assert(n != 0);
1549   assert(n % sizeof(uint16_t) == 0);
1550   assert(a_ptr != NULL);
1551   assert(b_ptr != NULL);
1552   assert(y_ptr != NULL);
1553 
1554   const uint16_t* a = (const uint16_t*) a_ptr;
1555   const uint16_t* b = (const uint16_t*) b_ptr;
1556   uint16_t* y = (uint16_t*) y_ptr;
1557 
1558   const __m256 vy_min = _mm256_load_ps(params->avx.min);
1559   const __m256 vy_max = _mm256_load_ps(params->avx.max);
1560 
1561   const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1562   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1563     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1564     a += 8;
1565 
1566     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_NO_EXC));
1567 
1568     vy = _mm256_max_ps(vy, vy_min);
1569     vy = _mm256_min_ps(vy, vy_max);
1570 
1571     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1572     y += 8;
1573   }
1574   if XNN_UNLIKELY(n != 0) {
1575     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1576 
1577     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_NO_EXC));
1578 
1579     vy = _mm256_max_ps(vy, vy_min);
1580     vy = _mm256_min_ps(vy, vy_max);
1581 
1582     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1583     if (n & (4 * sizeof(uint16_t))) {
1584       _mm_storel_epi64((__m128i*) y, vh);
1585       vh = _mm_unpackhi_epi64(vh, vh);
1586       y += 4;
1587     }
1588     if (n & (2 * sizeof(uint16_t))) {
1589       _mm_storeu_si32(y, vh);
1590       vh = _mm_srli_epi64(vh, 32);
1591       y += 2;
1592     }
1593     if (n & (1 * sizeof(uint16_t))) {
1594       *y = (uint16_t) _mm_extract_epi16(vh, 0);
1595     }
1596   }
1597 }
1598 
1599 void xnn_f16_vmax_ukernel__f16c_x16(
1600     size_t n,
1601     const void* restrict a_ptr,
1602     const void* restrict b_ptr,
1603     void* restrict y_ptr,
1604     const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1605 {
1606   assert(n != 0);
1607   assert(n % sizeof(uint16_t) == 0);
1608   assert(a_ptr != NULL);
1609   assert(b_ptr != NULL);
1610   assert(y_ptr != NULL);
1611 
1612   const uint16_t* a = (const uint16_t*) a_ptr;
1613   const uint16_t* b = (const uint16_t*) b_ptr;
1614   uint16_t* y = (uint16_t*) y_ptr;
1615 
1616 
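       // vmax takes xnn_f16_default_params: no output clamping, only the fp16
       // round-trip after the elementwise maximum.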
1617   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1618     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1619     const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1620     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1621     const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
1622     a += 16;
1623     b += 16;
1624 
1625     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
1626     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
1627 
1628 
1629 
1630     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1631     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1632     y += 16;
1633   }
1634   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1635     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1636     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1637     a += 8;
1638     b += 8;
1639 
1640     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_NO_EXC));
1641 
1642 
1643     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1644     y += 8;
1645   }
1646   if XNN_UNLIKELY(n != 0) {
1647     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1648     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1649 
1650     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_NO_EXC));
1651 
1652 
1653     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1654     if (n & (4 * sizeof(uint16_t))) {
1655       _mm_storel_epi64((__m128i*) y, vh);
1656       vh = _mm_unpackhi_epi64(vh, vh);
1657       y += 4;
1658     }
1659     if (n & (2 * sizeof(uint16_t))) {
1660       _mm_storeu_si32(y, vh);
1661       vh = _mm_srli_epi64(vh, 32);
1662       y += 2;
1663     }
1664     if (n & (1 * sizeof(uint16_t))) {
1665       *y = (uint16_t) _mm_extract_epi16(vh, 0);
1666     }
1667   }
1668 }
1669 
1670 void xnn_f16_vmaxc_ukernel__f16c_x16(
1671     size_t n,
1672     const void* restrict a_ptr,
1673     const void* restrict b_ptr,
1674     void* restrict y_ptr,
1675     const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1676 {
1677   assert(n != 0);
1678   assert(n % sizeof(uint16_t) == 0);
1679   assert(a_ptr != NULL);
1680   assert(b_ptr != NULL);
1681   assert(y_ptr != NULL);
1682 
1683   const uint16_t* a = (const uint16_t*) a_ptr;
1684   const uint16_t* b = (const uint16_t*) b_ptr;
1685   uint16_t* y = (uint16_t*) y_ptr;
1686 
1687 
1688   const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1689   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1690     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1691     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1692     a += 16;
1693 
1694     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va01234567, vb), _MM_FROUND_NO_EXC));
1695     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
1696 
1697 
1698 
1699     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1700     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1701     y += 16;
1702   }
1703   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1704     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1705     a += 8;
1706 
1707     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_NO_EXC));
1708 
1709 
1710     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1711     y += 8;
1712   }
1713   if XNN_UNLIKELY(n != 0) {
1714     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1715 
1716     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_NO_EXC));
1717 
1718 
1719     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1720     if (n & (4 * sizeof(uint16_t))) {
1721       _mm_storel_epi64((__m128i*) y, vh);
1722       vh = _mm_unpackhi_epi64(vh, vh);
1723       y += 4;
1724     }
1725     if (n & (2 * sizeof(uint16_t))) {
1726       _mm_storeu_si32(y, vh);
1727       vh = _mm_srli_epi64(vh, 32);
1728       y += 2;
1729     }
1730     if (n & (1 * sizeof(uint16_t))) {
1731       *y = (uint16_t) _mm_extract_epi16(vh, 0);
1732     }
1733   }
1734 }
1735 
1736 void xnn_f16_vmin_ukernel__f16c_x16(
1737     size_t n,
1738     const void* restrict a_ptr,
1739     const void* restrict b_ptr,
1740     void* restrict y_ptr,
1741     const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1742 {
1743   assert(n != 0);
1744   assert(n % sizeof(uint16_t) == 0);
1745   assert(a_ptr != NULL);
1746   assert(b_ptr != NULL);
1747   assert(y_ptr != NULL);
1748 
1749   const uint16_t* a = (const uint16_t*) a_ptr;
1750   const uint16_t* b = (const uint16_t*) b_ptr;
1751   uint16_t* y = (uint16_t*) y_ptr;
1752 
1753 
1754   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1755     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1756     const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1757     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1758     const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
1759     a += 16;
1760     b += 16;
1761 
1762     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
1763     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
1764 
1765 
1766 
1767     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1768     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1769     y += 16;
1770   }
1771   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1772     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1773     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1774     a += 8;
1775     b += 8;
1776 
1777     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_NO_EXC));
1778 
1779 
1780     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1781     y += 8;
1782   }
1783   if XNN_UNLIKELY(n != 0) {
1784     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1785     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1786 
1787     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_NO_EXC));
1788 
1789 
1790     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1791     if (n & (4 * sizeof(uint16_t))) {
1792       _mm_storel_epi64((__m128i*) y, vh);
1793       vh = _mm_unpackhi_epi64(vh, vh);
1794       y += 4;
1795     }
1796     if (n & (2 * sizeof(uint16_t))) {
1797       _mm_storeu_si32(y, vh);
1798       vh = _mm_srli_epi64(vh, 32);
1799       y += 2;
1800     }
1801     if (n & (1 * sizeof(uint16_t))) {
1802       *y = (uint16_t) _mm_extract_epi16(vh, 0);
1803     }
1804   }
1805 }
1806 
1807 void xnn_f16_vminc_ukernel__f16c_x16(
1808     size_t n,
1809     const void* restrict a_ptr,
1810     const void* restrict b_ptr,
1811     void* restrict y_ptr,
1812     const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1813 {
1814   assert(n != 0);
1815   assert(n % sizeof(uint16_t) == 0);
1816   assert(a_ptr != NULL);
1817   assert(b_ptr != NULL);
1818   assert(y_ptr != NULL);
1819 
1820   const uint16_t* a = (const uint16_t*) a_ptr;
1821   const uint16_t* b = (const uint16_t*) b_ptr;
1822   uint16_t* y = (uint16_t*) y_ptr;
1823 
1824 
1825   const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1826   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1827     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1828     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1829     a += 16;
1830 
1831     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va01234567, vb), _MM_FROUND_NO_EXC));
1832     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
1833 
1834 
1835 
1836     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1837     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1838     y += 16;
1839   }
1840   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1841     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1842     a += 8;
1843 
1844     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_NO_EXC));
1845 
1846 
1847     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1848     y += 8;
1849   }
1850   if XNN_UNLIKELY(n != 0) {
1851     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1852 
1853     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_NO_EXC));
1854 
1855 
1856     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1857     if (n & (4 * sizeof(uint16_t))) {
1858       _mm_storel_epi64((__m128i*) y, vh);
1859       vh = _mm_unpackhi_epi64(vh, vh);
1860       y += 4;
1861     }
1862     if (n & (2 * sizeof(uint16_t))) {
1863       _mm_storeu_si32(y, vh);
1864       vh = _mm_srli_epi64(vh, 32);
1865       y += 2;
1866     }
1867     if (n & (1 * sizeof(uint16_t))) {
1868       *y = (uint16_t) _mm_extract_epi16(vh, 0);
1869     }
1870   }
1871 }
1872 
1873 void xnn_f16_vmul_minmax_ukernel__f16c_x16(
1874     size_t n,
1875     const void* restrict a_ptr,
1876     const void* restrict b_ptr,
1877     void* restrict y_ptr,
1878     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1879 {
1880   assert(n != 0);
1881   assert(n % sizeof(uint16_t) == 0);
1882   assert(a_ptr != NULL);
1883   assert(b_ptr != NULL);
1884   assert(y_ptr != NULL);
1885 
1886   const uint16_t* a = (const uint16_t*) a_ptr;
1887   const uint16_t* b = (const uint16_t*) b_ptr;
1888   uint16_t* y = (uint16_t*) y_ptr;
1889 
1890   const __m256 vy_min = _mm256_load_ps(params->avx.min);
1891   const __m256 vy_max = _mm256_load_ps(params->avx.max);
1892 
1893   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1894     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1895     const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1896     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1897     const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
1898     a += 16;
1899     b += 16;
1900 
1901     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
1902     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
1903 
1904 
1905     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
1906     vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
1907 
1908     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
1909     vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
1910 
1911     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1912     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1913     y += 16;
1914   }
1915   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1916     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1917     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1918     a += 8;
1919     b += 8;
1920 
1921     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
1922 
1923     vy = _mm256_max_ps(vy, vy_min);
1924     vy = _mm256_min_ps(vy, vy_max);
1925 
1926     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1927     y += 8;
1928   }
1929   if XNN_UNLIKELY(n != 0) {
1930     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1931     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1932 
1933     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
1934 
1935     vy = _mm256_max_ps(vy, vy_min);
1936     vy = _mm256_min_ps(vy, vy_max);
1937 
1938     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1939     if (n & (4 * sizeof(uint16_t))) {
1940       _mm_storel_epi64((__m128i*) y, vh);
1941       vh = _mm_unpackhi_epi64(vh, vh);
1942       y += 4;
1943     }
1944     if (n & (2 * sizeof(uint16_t))) {
1945       _mm_storeu_si32(y, vh);
1946       vh = _mm_srli_epi64(vh, 32);
1947       y += 2;
1948     }
1949     if (n & (1 * sizeof(uint16_t))) {
1950       *y = (uint16_t) _mm_extract_epi16(vh, 0);
1951     }
1952   }
1953 }
1954 
1955 void xnn_f16_vmulc_minmax_ukernel__f16c_x16(
1956     size_t n,
1957     const void* restrict a_ptr,
1958     const void* restrict b_ptr,
1959     void* restrict y_ptr,
1960     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1961 {
1962   assert(n != 0);
1963   assert(n % sizeof(uint16_t) == 0);
1964   assert(a_ptr != NULL);
1965   assert(b_ptr != NULL);
1966   assert(y_ptr != NULL);
1967 
1968   const uint16_t* a = (const uint16_t*) a_ptr;
1969   const uint16_t* b = (const uint16_t*) b_ptr;
1970   uint16_t* y = (uint16_t*) y_ptr;
1971 
1972   const __m256 vy_min = _mm256_load_ps(params->avx.min);
1973   const __m256 vy_max = _mm256_load_ps(params->avx.max);
1974 
1975   const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1976   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1977     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1978     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1979     a += 16;
1980 
1981     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va01234567, vb), _MM_FROUND_NO_EXC));
1982     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
1983 
1984 
1985     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
1986     vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
1987 
1988     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
1989     vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
1990 
1991     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1992     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1993     y += 16;
1994   }
1995   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1996     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1997     a += 8;
1998 
1999     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
2000 
2001     vy = _mm256_max_ps(vy, vy_min);
2002     vy = _mm256_min_ps(vy, vy_max);
2003 
2004     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2005     y += 8;
2006   }
2007   if XNN_UNLIKELY(n != 0) {
2008     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2009 
2010     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
2011 
2012     vy = _mm256_max_ps(vy, vy_min);
2013     vy = _mm256_min_ps(vy, vy_max);
2014 
2015     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2016     if (n & (4 * sizeof(uint16_t))) {
2017       _mm_storel_epi64((__m128i*) y, vh);
2018       vh = _mm_unpackhi_epi64(vh, vh);
2019       y += 4;
2020     }
2021     if (n & (2 * sizeof(uint16_t))) {
2022       _mm_storeu_si32(y, vh);
2023       vh = _mm_srli_epi64(vh, 32);
2024       y += 2;
2025     }
2026     if (n & (1 * sizeof(uint16_t))) {
2027       *y = (uint16_t) _mm_extract_epi16(vh, 0);
2028     }
2029   }
2030 }
2031 
2032 void xnn_f16_vrdivc_minmax_ukernel__f16c_x8(
2033     size_t n,
2034     const void* restrict a_ptr,
2035     const void* restrict b_ptr,
2036     void* restrict y_ptr,
2037     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2038 {
2039   assert(n != 0);
2040   assert(n % sizeof(uint16_t) == 0);
2041   assert(a_ptr != NULL);
2042   assert(b_ptr != NULL);
2043   assert(y_ptr != NULL);
2044 
2045   const uint16_t* a = (const uint16_t*) a_ptr;
2046   const uint16_t* b = (const uint16_t*) b_ptr;
2047   uint16_t* y = (uint16_t*) y_ptr;
2048 
2049   const __m256 vy_min = _mm256_load_ps(params->avx.min);
2050   const __m256 vy_max = _mm256_load_ps(params->avx.max);
2051 
2052   const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
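       // Reverse divide-by-constant: the broadcast scalar b is the dividend, so
       // each output is b / a[i].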
2053   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2054     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2055     a += 8;
2056 
2057     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(vb, va), _MM_FROUND_NO_EXC));
2058 
2059     vy = _mm256_max_ps(vy, vy_min);
2060     vy = _mm256_min_ps(vy, vy_max);
2061 
2062     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2063     y += 8;
2064   }
2065   if XNN_UNLIKELY(n != 0) {
2066     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2067 
2068     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(vb, va), _MM_FROUND_NO_EXC));
2069 
2070     vy = _mm256_max_ps(vy, vy_min);
2071     vy = _mm256_min_ps(vy, vy_max);
2072 
2073     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2074     if (n & (4 * sizeof(uint16_t))) {
2075       _mm_storel_epi64((__m128i*) y, vh);
2076       vh = _mm_unpackhi_epi64(vh, vh);
2077       y += 4;
2078     }
2079     if (n & (2 * sizeof(uint16_t))) {
2080       _mm_storeu_si32(y, vh);
2081       vh = _mm_srli_epi64(vh, 32);
2082       y += 2;
2083     }
2084     if (n & (1 * sizeof(uint16_t))) {
2085       *y = (uint16_t) _mm_extract_epi16(vh, 0);
2086     }
2087   }
2088 }
2089 
2090 void xnn_f16_vrsubc_minmax_ukernel__f16c_x16(
2091     size_t n,
2092     const void* restrict a_ptr,
2093     const void* restrict b_ptr,
2094     void* restrict y_ptr,
2095     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2096 {
2097   assert(n != 0);
2098   assert(n % sizeof(uint16_t) == 0);
2099   assert(a_ptr != NULL);
2100   assert(b_ptr != NULL);
2101   assert(y_ptr != NULL);
2102 
2103   const uint16_t* a = (const uint16_t*) a_ptr;
2104   const uint16_t* b = (const uint16_t*) b_ptr;
2105   uint16_t* y = (uint16_t*) y_ptr;
2106 
2107   const __m256 vy_min = _mm256_load_ps(params->avx.min);
2108   const __m256 vy_max = _mm256_load_ps(params->avx.max);
2109 
2110   const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
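       // Reverse subtract-constant: computes b - a[i] with the broadcast scalar b.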
2111   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2112     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2113     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
2114     a += 16;
2115 
2116     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va01234567), _MM_FROUND_NO_EXC));
2117     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va456789AB), _MM_FROUND_NO_EXC));
2118 
2119 
2120     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2121     vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
2122 
2123     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2124     vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
2125 
2126     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
2127     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
2128     y += 16;
2129   }
2130   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2131     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2132     a += 8;
2133 
2134     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va), _MM_FROUND_NO_EXC));
2135 
2136     vy = _mm256_max_ps(vy, vy_min);
2137     vy = _mm256_min_ps(vy, vy_max);
2138 
2139     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2140     y += 8;
2141   }
2142   if XNN_UNLIKELY(n != 0) {
2143     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2144 
2145     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va), _MM_FROUND_NO_EXC));
2146 
2147     vy = _mm256_max_ps(vy, vy_min);
2148     vy = _mm256_min_ps(vy, vy_max);
2149 
2150     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2151     if (n & (4 * sizeof(uint16_t))) {
2152       _mm_storel_epi64((__m128i*) y, vh);
2153       vh = _mm_unpackhi_epi64(vh, vh);
2154       y += 4;
2155     }
2156     if (n & (2 * sizeof(uint16_t))) {
2157       _mm_storeu_si32(y, vh);
2158       vh = _mm_srli_epi64(vh, 32);
2159       y += 2;
2160     }
2161     if (n & (1 * sizeof(uint16_t))) {
2162       *y = (uint16_t) _mm_extract_epi16(vh, 0);
2163     }
2164   }
2165 }
2166 
2167 void xnn_f16_vsqrdiff_ukernel__f16c_x16(
2168     size_t n,
2169     const void* restrict a_ptr,
2170     const void* restrict b_ptr,
2171     void* restrict y_ptr,
2172     const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2173 {
2174   assert(n != 0);
2175   assert(n % sizeof(uint16_t) == 0);
2176   assert(a_ptr != NULL);
2177   assert(b_ptr != NULL);
2178   assert(y_ptr != NULL);
2179 
2180   const uint16_t* a = (const uint16_t*) a_ptr;
2181   const uint16_t* b = (const uint16_t*) b_ptr;
2182   uint16_t* y = (uint16_t*) y_ptr;
2183 
2184 
2185   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2186     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2187     const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2188     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
2189     const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
2190     a += 16;
2191     b += 16;
2192 
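         // Squared difference: (a - b)^2, rounded to fp16 after the subtraction
         // and again after the squaring to mimic native half-precision arithmetic.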
2193     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
2194     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
2195 
2196     vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy01234567, vy01234567), _MM_FROUND_NO_EXC));
2197     vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy456789AB, vy456789AB), _MM_FROUND_NO_EXC));
2198 
2199 
2200     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
2201     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
2202     y += 16;
2203   }
2204   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2205     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2206     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2207     a += 8;
2208     b += 8;
2209 
2210     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2211     vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_NO_EXC));
2212 
2213 
2214     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2215     y += 8;
2216   }
2217   if XNN_UNLIKELY(n != 0) {
2218     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2219     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2220 
2221     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2222     vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_NO_EXC));
2223 
2224 
2225     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2226     if (n & (4 * sizeof(uint16_t))) {
2227       _mm_storel_epi64((__m128i*) y, vh);
2228       vh = _mm_unpackhi_epi64(vh, vh);
2229       y += 4;
2230     }
2231     if (n & (2 * sizeof(uint16_t))) {
2232       _mm_storeu_si32(y, vh);
2233       vh = _mm_srli_epi64(vh, 32);
2234       y += 2;
2235     }
2236     if (n & (1 * sizeof(uint16_t))) {
2237       *y = (uint16_t) _mm_extract_epi16(vh, 0);
2238     }
2239   }
2240 }
2241 
2242 void xnn_f16_vsqrdiffc_ukernel__f16c_x16(
2243     size_t n,
2244     const void* restrict a_ptr,
2245     const void* restrict b_ptr,
2246     void* restrict y_ptr,
2247     const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2248 {
2249   assert(n != 0);
2250   assert(n % sizeof(uint16_t) == 0);
2251   assert(a_ptr != NULL);
2252   assert(b_ptr != NULL);
2253   assert(y_ptr != NULL);
2254 
2255   const uint16_t* a = (const uint16_t*) a_ptr;
2256   const uint16_t* b = (const uint16_t*) b_ptr;
2257   uint16_t* y = (uint16_t*) y_ptr;
2258 
2259 
2260   const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
2261   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2262     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2263     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
2264     a += 16;
2265 
2266     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va01234567, vb), _MM_FROUND_NO_EXC));
2267     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
2268 
2269     vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy01234567, vy01234567), _MM_FROUND_NO_EXC));
2270     vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy456789AB, vy456789AB), _MM_FROUND_NO_EXC));
2271 
2272 
2273     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
2274     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
2275     y += 16;
2276   }
2277   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2278     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2279     a += 8;
2280 
2281     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2282     vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_NO_EXC));
2283 
2284 
2285     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2286     y += 8;
2287   }
2288   if XNN_UNLIKELY(n != 0) {
2289     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2290 
2291     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2292     vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_NO_EXC));
2293 
2294 
2295     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2296     if (n & (4 * sizeof(uint16_t))) {
2297       _mm_storel_epi64((__m128i*) y, vh);
2298       vh = _mm_unpackhi_epi64(vh, vh);
2299       y += 4;
2300     }
2301     if (n & (2 * sizeof(uint16_t))) {
2302       _mm_storeu_si32(y, vh);
2303       vh = _mm_srli_epi64(vh, 32);
2304       y += 2;
2305     }
2306     if (n & (1 * sizeof(uint16_t))) {
2307       *y = (uint16_t) _mm_extract_epi16(vh, 0);
2308     }
2309   }
2310 }
2311 
2312 void xnn_f16_vsub_minmax_ukernel__f16c_x16(
2313     size_t n,
2314     const void* restrict a_ptr,
2315     const void* restrict b_ptr,
2316     void* restrict y_ptr,
2317     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2318 {
2319   assert(n != 0);
2320   assert(n % sizeof(uint16_t) == 0);
2321   assert(a_ptr != NULL);
2322   assert(b_ptr != NULL);
2323   assert(y_ptr != NULL);
2324 
2325   const uint16_t* a = (const uint16_t*) a_ptr;
2326   const uint16_t* b = (const uint16_t*) b_ptr;
2327   uint16_t* y = (uint16_t*) y_ptr;
2328 
2329   const __m256 vy_min = _mm256_load_ps(params->avx.min);
2330   const __m256 vy_max = _mm256_load_ps(params->avx.max);
2331 
2332   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2333     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2334     const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2335     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
2336     const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
2337     a += 16;
2338     b += 16;
2339 
2340     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
2341     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
2342 
2343 
2344     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2345     vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
2346 
2347     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2348     vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
2349 
2350     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
2351     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
2352     y += 16;
2353   }
2354   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2355     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2356     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2357     a += 8;
2358     b += 8;
2359 
2360     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2361 
2362     vy = _mm256_max_ps(vy, vy_min);
2363     vy = _mm256_min_ps(vy, vy_max);
2364 
2365     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2366     y += 8;
2367   }
2368   if XNN_UNLIKELY(n != 0) {
2369     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2370     const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2371 
2372     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2373 
2374     vy = _mm256_max_ps(vy, vy_min);
2375     vy = _mm256_min_ps(vy, vy_max);
2376 
2377     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2378     if (n & (4 * sizeof(uint16_t))) {
2379       _mm_storel_epi64((__m128i*) y, vh);
2380       vh = _mm_unpackhi_epi64(vh, vh);
2381       y += 4;
2382     }
2383     if (n & (2 * sizeof(uint16_t))) {
2384       _mm_storeu_si32(y, vh);
2385       vh = _mm_srli_epi64(vh, 32);
2386       y += 2;
2387     }
2388     if (n & (1 * sizeof(uint16_t))) {
2389       *y = (uint16_t) _mm_extract_epi16(vh, 0);
2390     }
2391   }
2392 }
2393 
2394 void xnn_f16_vsubc_minmax_ukernel__f16c_x16(
2395     size_t n,
2396     const void* restrict a_ptr,
2397     const void* restrict b_ptr,
2398     void* restrict y_ptr,
2399     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2400 {
2401   assert(n != 0);
2402   assert(n % sizeof(uint16_t) == 0);
2403   assert(a_ptr != NULL);
2404   assert(b_ptr != NULL);
2405   assert(y_ptr != NULL);
2406 
2407   const uint16_t* a = (const uint16_t*) a_ptr;
2408   const uint16_t* b = (const uint16_t*) b_ptr;
2409   uint16_t* y = (uint16_t*) y_ptr;
2410 
2411   const __m256 vy_min = _mm256_load_ps(params->avx.min);
2412   const __m256 vy_max = _mm256_load_ps(params->avx.max);
2413 
2414   const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
2415   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2416     const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2417     const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
2418     a += 16;
2419 
2420     __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va01234567, vb), _MM_FROUND_NO_EXC));
2421     __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
2422 
2423 
2424     vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2425     vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
2426 
2427     vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2428     vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
2429 
2430     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
2431     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
2432     y += 16;
2433   }
2434   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2435     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2436     a += 8;
2437 
2438     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2439 
2440     vy = _mm256_max_ps(vy, vy_min);
2441     vy = _mm256_min_ps(vy, vy_max);
2442 
2443     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2444     y += 8;
2445   }
2446   if XNN_UNLIKELY(n != 0) {
2447     const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2448 
2449     __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2450 
2451     vy = _mm256_max_ps(vy, vy_min);
2452     vy = _mm256_min_ps(vy, vy_max);
2453 
2454     __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2455     if (n & (4 * sizeof(uint16_t))) {
2456       _mm_storel_epi64((__m128i*) y, vh);
2457       vh = _mm_unpackhi_epi64(vh, vh);
2458       y += 4;
2459     }
2460     if (n & (2 * sizeof(uint16_t))) {
2461       _mm_storeu_si32(y, vh);
2462       vh = _mm_srli_epi64(vh, 32);
2463       y += 2;
2464     }
2465     if (n & (1 * sizeof(uint16_t))) {
2466       *y = (uint16_t) _mm_extract_epi16(vh, 0);
2467     }
2468   }
2469 }
2470 
2471 void xnn_f16_vclamp_ukernel__f16c_x16(
2472     size_t n,
2473     const void* restrict x_ptr,
2474     void* restrict y_ptr,
2475     const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2476 {
2477   assert(n != 0);
2478   assert(n % sizeof(uint16_t) == 0);
2479   assert(x_ptr != NULL);
2480   assert(y_ptr != NULL);
2481 
2482   const uint16_t* x = (const uint16_t*) x_ptr;
2483   uint16_t* y = (uint16_t*) y_ptr;
2484 
2485   const __m256 vy_min = _mm256_load_ps(params->avx.min);
2486   const __m256 vy_max = _mm256_load_ps(params->avx.max);
2487 
2488   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2489     __m256 vacc01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2490     __m256 vacc89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (x + 8)));
2491     x += 16;
2492 
2493     vacc01234567 = _mm256_max_ps(vacc01234567, vy_min);
2494     vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEF, vy_min);
2495 
2496     vacc01234567 = _mm256_min_ps(vacc01234567, vy_max);
2497     vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vy_max);
2498 
2499     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
2500     _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vacc89ABCDEF, _MM_FROUND_NO_EXC));
2501     y += 16;
2502   }
2503   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2504     __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2505     x += 8;
2506     vacc = _mm256_max_ps(vacc, vy_min);
2507     vacc = _mm256_min_ps(vacc, vy_max);
2508     _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2509     y += 8;
2510   }
2511   if XNN_UNLIKELY(n != 0) {
2512     __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2513     vacc = _mm256_max_ps(vacc, vy_min);
2514     vacc = _mm256_min_ps(vacc, vy_max);
2515 
2516     __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2517     if (n & (4 * sizeof(uint16_t))) {
2518       _mm_storel_epi64((__m128i*) y, vh);
2519       vh = _mm_unpackhi_epi64(vh, vh);
2520       y += 4;
2521     }
2522     if (n & (2 * sizeof(uint16_t))) {
2523       _mm_storeu_si32(y, vh);
2524       vh = _mm_srli_epi64(vh, 32);
2525       y += 2;
2526     }
2527     if (n & (1 * sizeof(uint16_t))) {
2528       *y = (uint16_t) _mm_extract_epi16(vh, 0);
2529     }
2530   }
2531 }
2532 
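// HardSwish on FP16: y = x * min(max(x + 3, 0), 6) / 6, computed in FP32 with intermediates rounded
// back to FP16. The [0, 6] clamp is done with integer min/max on the FP16 bit patterns, which is
// order-preserving here (negative inputs have the sign bit set and compare below zero as int16).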
2533 void xnn_f16_vhswish_ukernel__f16c_x16(
2534     size_t n,
2535     const void* restrict x_ptr,
2536     void* restrict y_ptr,
2537     const union xnn_f16_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2538 {
2539   assert(n != 0);
2540   assert(n % sizeof(uint16_t) == 0);
2541 
2542   const uint16_t* x = (const uint16_t*) x_ptr;
2543   uint16_t* y = (uint16_t*) y_ptr;
2544 
2545   const __m256 vsixth = _mm256_load_ps(params->avx.sixth);
2546   const __m256 vthree = _mm256_load_ps(params->avx.three);
2547   const __m128i vsix = _mm_load_si128((const __m128i*) params->avx.six);
2548   const __m128i vzero = _mm_setzero_si128();
2549 
2550   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2551     __m256 vx01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2552     __m256 vx89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (x + 8)));
2553     x += 16;
2554 
2555     __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vx01234567, vthree), _MM_FROUND_NO_EXC);
2556     vx01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx01234567, vsixth), _MM_FROUND_NO_EXC));
2557     __m128i vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(vx89ABCDEF, vthree), _MM_FROUND_NO_EXC);
2558     vx89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx89ABCDEF, vsixth), _MM_FROUND_NO_EXC));
2559 
2560     vacc01234567 = _mm_max_epi16(vacc01234567, vzero);
2561     vacc89ABCDEF = _mm_max_epi16(vacc89ABCDEF, vzero);
2562 
2563     vacc01234567 = _mm_min_epi16(vacc01234567, vsix);
2564     vacc89ABCDEF = _mm_min_epi16(vacc89ABCDEF, vsix);
2565 
2566     vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vx01234567), _MM_FROUND_NO_EXC);
2567     vacc89ABCDEF = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc89ABCDEF), vx89ABCDEF), _MM_FROUND_NO_EXC);
2568 
2569     _mm_storeu_si128((__m128i*) y, vacc01234567);
2570     _mm_storeu_si128((__m128i*) (y + 8), vacc89ABCDEF);
2571     y += 16;
2572   }
2573   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2574     __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2575     x += 8;
2576     __m128i vacc = _mm256_cvtps_ph(_mm256_add_ps(vx, vthree), _MM_FROUND_NO_EXC);
2577     vx = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx, vsixth), _MM_FROUND_NO_EXC));
2578     vacc = _mm_max_epi16(vacc, vzero);
2579     vacc = _mm_min_epi16(vacc, vsix);
2580     vacc = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc), vx), _MM_FROUND_NO_EXC);
2581     _mm_storeu_si128((__m128i*) y, vacc);
2582     y += 8;
2583   }
2584   if XNN_UNLIKELY(n != 0) {
2585     __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2586     __m128i vacc = _mm256_cvtps_ph(_mm256_add_ps(vx, vthree), _MM_FROUND_NO_EXC);
2587     vx = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx, vsixth), _MM_FROUND_NO_EXC));
2588     vacc = _mm_max_epi16(vacc, vzero);
2589     vacc = _mm_min_epi16(vacc, vsix);
2590     vacc = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc), vx), _MM_FROUND_NO_EXC);
2591 
2592     if (n & (4 * sizeof(uint16_t))) {
2593       _mm_storel_epi64((__m128i*) y, vacc);
2594       vacc = _mm_unpackhi_epi64(vacc, vacc);
2595       y += 4;
2596     }
2597     if (n & (2 * sizeof(uint16_t))) {
2598       _mm_storeu_si32(y, vacc);
2599       vacc = _mm_srli_epi64(vacc, 32);
2600       y += 2;
2601     }
2602     if (n & (1 * sizeof(uint16_t))) {
2603       *y = (uint16_t) _mm_extract_epi16(vacc, 0);
2604     }
2605   }
2606 }
2607 
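// Leaky ReLU: y = x for x >= 0, y = slope * x otherwise. _mm256_blendv_ps selects on the sign bit
// of x, so the scaled value is picked only for negative inputs.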
2608 void xnn_f16_vlrelu_ukernel__f16c_x16(
2609     size_t batch,
2610     const void* input,
2611     void* output,
2612     const union xnn_f16_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2613 {
2614   assert(batch != 0);
2615   assert(batch % sizeof(uint16_t) == 0);
2616 
2617   const __m256 vslope = _mm256_load_ps(params->avx.slope);
2618   const uint16_t* i = (const uint16_t*) input;
2619   uint16_t* o = (uint16_t*) output;
2620   for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
2621     const __m256 vx01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2622     const __m256 vx89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2623     i += 16;
2624 
2625     __m256 vacc01234567 = _mm256_mul_ps(vx01234567, vslope);
2626     __m256 vacc89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vslope);
2627 
2628     vacc01234567 = _mm256_blendv_ps(vx01234567, vacc01234567, vx01234567);
2629     vacc89ABCDEF = _mm256_blendv_ps(vx89ABCDEF, vacc89ABCDEF, vx89ABCDEF);
2630 
2631     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
2632     _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc89ABCDEF, _MM_FROUND_NO_EXC));
2633     o += 16;
2634   }
2635   for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) {
2636     const __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2637     i += 8;
2638 
2639     __m256 vacc = _mm256_mul_ps(vx, vslope);
2640     vacc = _mm256_blendv_ps(vx, vacc, vx);
2641 
2642     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2643     o += 8;
2644   }
2645   if XNN_UNLIKELY(batch != 0) {
2646     const __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2647 
2648     __m256 vacc = _mm256_mul_ps(vx, vslope);
2649     vacc = _mm256_blendv_ps(vx, vacc, vx);
2650 
2651     __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2652     if (batch & (4 * sizeof(uint16_t))) {
2653       _mm_storel_epi64((__m128i*) o, vh);
2654       vh = _mm_unpackhi_epi64(vh, vh);
2655       o += 4;
2656     }
2657     if (batch & (2 * sizeof(uint16_t))) {
2658       _mm_storeu_si32(o, vh);
2659       vh = _mm_srli_epi64(vh, 32);
2660       o += 2;
2661     }
2662     if (batch & (1 * sizeof(uint16_t))) {
2663       *o = (uint16_t) _mm_extract_epi16(vh, 0);
2664     }
2665   }
2666 }
2667 
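// Round each FP16 element toward negative infinity (floor).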
2668 void xnn_f16_vrndd_ukernel__f16c_x16(
2669     size_t n,
2670     const void* input,
2671     void* output,
2672     const union xnn_f16_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
2673 {
2674   assert(n != 0);
2675   assert(n % sizeof(uint16_t) == 0);
2676 
2677   const uint16_t* i = (const uint16_t*) input;
2678   uint16_t* o = (uint16_t*) output;
2679   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2680     __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2681     __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2682     i += 16;
2683 
2684     vacc0 = _mm256_round_ps(vacc0, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
2685     vacc1 = _mm256_round_ps(vacc1, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
2686 
2687     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
2688     _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
2689     o += 16;
2690   }
2691   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2692     __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2693     i += 8;
2694 
2695     vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
2696 
2697     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2698     o += 8;
2699   }
2700   if XNN_UNLIKELY(n != 0) {
2701     assert(n >= 1 * sizeof(uint16_t));
2702     assert(n <= 7 * sizeof(uint16_t));
2703     __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2704     vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
2705     __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2706     if (n & (4 * sizeof(uint16_t))) {
2707       _mm_storel_epi64((__m128i*) o, vh);
2708       vh = _mm_unpackhi_epi64(vh, vh);
2709       o += 4;
2710     }
2711     if (n & (2 * sizeof(uint16_t))) {
2712       _mm_storeu_si32(o, vh);
2713       vh = _mm_srli_epi64(vh, 32);
2714       o += 2;
2715     }
2716     if (n & (1 * sizeof(uint16_t))) {
2717       *o = (uint16_t) _mm_extract_epi16(vh, 0);
2718     }
2719   }
2720 }
2721 
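// Round each FP16 element to the nearest integer, with ties rounded to even.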
2722 void xnn_f16_vrndne_ukernel__f16c_x16(
2723     size_t n,
2724     const void* input,
2725     void* output,
2726     const union xnn_f16_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
2727 {
2728   assert(n != 0);
2729   assert(n % sizeof(uint16_t) == 0);
2730 
2731   const uint16_t* i = (const uint16_t*) input;
2732   uint16_t* o = (uint16_t*) output;
2733   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2734     __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2735     __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2736     i += 16;
2737 
2738     vacc0 = _mm256_round_ps(vacc0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
2739     vacc1 = _mm256_round_ps(vacc1, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
2740 
2741     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
2742     _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
2743     o += 16;
2744   }
2745   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2746     __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2747     i += 8;
2748 
2749     vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
2750 
2751     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2752     o += 8;
2753   }
2754   if XNN_UNLIKELY(n != 0) {
2755     assert(n >= 1 * sizeof(uint16_t));
2756     assert(n <= 7 * sizeof(uint16_t));
2757     __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2758     vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
2759     __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2760     if (n & (4 * sizeof(uint16_t))) {
2761       _mm_storel_epi64((__m128i*) o, vh);
2762       vh = _mm_unpackhi_epi64(vh, vh);
2763       o += 4;
2764     }
2765     if (n & (2 * sizeof(uint16_t))) {
2766       _mm_storeu_si32(o, vh);
2767       vh = _mm_srli_epi64(vh, 32);
2768       o += 2;
2769     }
2770     if (n & (1 * sizeof(uint16_t))) {
2771       *o = (uint16_t) _mm_extract_epi16(vh, 0);
2772     }
2773   }
2774 }
2775 
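// Round each FP16 element toward positive infinity (ceiling).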
2776 void xnn_f16_vrndu_ukernel__f16c_x16(
2777     size_t n,
2778     const void* input,
2779     void* output,
2780     const union xnn_f16_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
2781 {
2782   assert(n != 0);
2783   assert(n % sizeof(uint16_t) == 0);
2784 
2785   const uint16_t* i = (const uint16_t*) input;
2786   uint16_t* o = (uint16_t*) output;
2787   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2788     __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2789     __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2790     i += 16;
2791 
2792     vacc0 = _mm256_round_ps(vacc0, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
2793     vacc1 = _mm256_round_ps(vacc1, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
2794 
2795     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
2796     _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
2797     o += 16;
2798   }
2799   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2800     __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2801     i += 8;
2802 
2803     vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
2804 
2805     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2806     o += 8;
2807   }
2808   if XNN_UNLIKELY(n != 0) {
2809     assert(n >= 1 * sizeof(uint16_t));
2810     assert(n <= 7 * sizeof(uint16_t));
2811     __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2812     vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
2813     __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2814     if (n & (4 * sizeof(uint16_t))) {
2815       _mm_storel_epi64((__m128i*) o, vh);
2816       vh = _mm_unpackhi_epi64(vh, vh);
2817       o += 4;
2818     }
2819     if (n & (2 * sizeof(uint16_t))) {
2820       _mm_storeu_si32(o, vh);
2821       vh = _mm_srli_epi64(vh, 32);
2822       o += 2;
2823     }
2824     if (n & (1 * sizeof(uint16_t))) {
2825       *o = (uint16_t) _mm_extract_epi16(vh, 0);
2826     }
2827   }
2828 }
2829 
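// Round each FP16 element toward zero (truncation).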
2830 void xnn_f16_vrndz_ukernel__f16c_x16(
2831     size_t n,
2832     const void* input,
2833     void* output,
2834     const union xnn_f16_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
2835 {
2836   assert(n != 0);
2837   assert(n % sizeof(uint16_t) == 0);
2838 
2839   const uint16_t* i = (const uint16_t*) input;
2840   uint16_t* o = (uint16_t*) output;
2841   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2842     __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2843     __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2844     i += 16;
2845 
2846     vacc0 = _mm256_round_ps(vacc0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2847     vacc1 = _mm256_round_ps(vacc1, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2848 
2849     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
2850     _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
2851     o += 16;
2852   }
2853   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2854     __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2855     i += 8;
2856 
2857     vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2858 
2859     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2860     o += 8;
2861   }
2862   if XNN_UNLIKELY(n != 0) {
2863     assert(n >= 1 * sizeof(uint16_t));
2864     assert(n <= 7 * sizeof(uint16_t));
2865     __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2866     vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2867     __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2868     if (n & (4 * sizeof(uint16_t))) {
2869       _mm_storel_epi64((__m128i*) o, vh);
2870       vh = _mm_unpackhi_epi64(vh, vh);
2871       o += 4;
2872     }
2873     if (n & (2 * sizeof(uint16_t))) {
2874       _mm_storeu_si32(o, vh);
2875       vh = _mm_srli_epi64(vh, 32);
2876       o += 2;
2877     }
2878     if (n & (1 * sizeof(uint16_t))) {
2879       *o = (uint16_t) _mm_extract_epi16(vh, 0);
2880     }
2881   }
2882 }
2883 
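// Square root of each FP16 element: widen to FP32, take the FP32 square root, and round back to FP16.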
2884 void xnn_f16_vsqrt_ukernel__f16c_sqrt_x8(
2885     size_t n,
2886     const void* input,
2887     void* output,
2888     const union xnn_f16_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2889 {
2890   assert(n != 0);
2891   assert(n % sizeof(uint16_t) == 0);
2892   assert(input != NULL);
2893   assert(output != NULL);
2894 
2895   const uint16_t* i = (const uint16_t*) input;
2896   uint16_t* o = (uint16_t*) output;
2897   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2898     __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2899     i += 8;
2900     vacc = _mm256_sqrt_ps(vacc);
2901     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2902     o += 8;
2903   }
2904   if XNN_UNLIKELY(n != 0) {
2905     __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2906     vacc = _mm256_sqrt_ps(vacc);
2907     __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2908     if (n & (4 * sizeof(uint16_t))) {
2909       _mm_storel_epi64((__m128i*) o, vh);
2910       o += 4;
2911       vh = _mm_unpackhi_epi64(vh, vh);
2912     }
2913     if (n & (2 * sizeof(uint16_t))) {
2914       _mm_storeu_si32(o, vh);
2915       o += 2;
2916       vh = _mm_srli_epi64(vh, 32);
2917     }
2918     if (n & (1 * sizeof(uint16_t))) {
2919       *o = (uint16_t) _mm_extract_epi16(vh, 0);
2920     }
2921   }
2922 }
2923 
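// Square each FP16 element (x * x) in FP32 and round the product back to FP16.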
2924 void xnn_f16_vsqr_ukernel__f16c_x16(
2925     size_t n,
2926     const void* input,
2927     void* output,
2928     const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2929 {
2930   assert(n != 0);
2931   assert(n % sizeof(uint16_t) == 0);
2932   assert(input != NULL);
2933   assert(output != NULL);
2934 
2935   const uint16_t* i = (const uint16_t*) input;
2936   uint16_t* o = (uint16_t*) output;
2937   for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2938     __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2939     __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2940     i += 16;
2941 
2942     vacc0 = _mm256_mul_ps(vacc0, vacc0);
2943     vacc1 = _mm256_mul_ps(vacc1, vacc1);
2944 
2945     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
2946     _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
2947     o += 16;
2948   }
2949   for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2950     __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2951     i += 8;
2952     vacc = _mm256_mul_ps(vacc, vacc);
2953     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2954     o += 8;
2955   }
2956   if XNN_UNLIKELY(n != 0) {
2957     __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2958     vacc = _mm256_mul_ps(vacc, vacc);
2959     __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2960     if (n & (4 * sizeof(uint16_t))) {
2961       _mm_storel_epi64((__m128i*) o, vh);
2962       o += 4;
2963       vh = _mm_unpackhi_epi64(vh, vh);
2964     }
2965     if (n & (2 * sizeof(uint16_t))) {
2966       _mm_storeu_si32(o, vh);
2967       o += 2;
2968       vh = _mm_srli_epi64(vh, 32);
2969     }
2970     if (n & (1 * sizeof(uint16_t))) {
2971       *o = (uint16_t) _mm_extract_epi16(vh, 0);
2972     }
2973   }
2974 }
2975 
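// Convert FP32 elements to FP16. The sub-vector tail uses a mask table so the final partial load
// stays within the input buffer.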
2976 void xnn_f32_f16_vcvt_ukernel__f16c_x16(
2977     size_t n,
2978     const float* input,
2979     void* output,
2980     const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
2981 {
2982   assert(n != 0);
2983   assert(n % sizeof(float) == 0);
2984   assert(input != NULL);
2985   assert(output != NULL);
2986 
2987   uint16_t* o = (uint16_t*) output;
2988   for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2989     const __m256 vf0 = _mm256_loadu_ps(input);
2990     const __m256 vf1 = _mm256_loadu_ps(input + 8);
2991     input += 16;
2992 
2993     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vf0, _MM_FROUND_NO_EXC));
2994     _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vf1, _MM_FROUND_NO_EXC));
2995     o += 16;
2996   }
2997   for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2998     const __m256 vf = _mm256_loadu_ps(input);
2999     input += 8;
3000 
3001     _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vf, _MM_FROUND_NO_EXC));
3002     o += 8;
3003   }
3004   if XNN_UNLIKELY(n != 0) {
3005     assert(n >= 1 * sizeof(float));
3006     assert(n <= 7 * sizeof(float));
3007     const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) &params->f16c.mask_table[7] - n));
3008 
3009     const __m256 vf = _mm256_maskload_ps(input, vmask);
3010 
3011     __m128 vf_lo = _mm256_castps256_ps128(vf);
3012     if (n & (4 * sizeof(float))) {
3013       _mm_storel_epi64((__m128i*) o, _mm_cvtps_ph(vf_lo, _MM_FROUND_NO_EXC));
3014       vf_lo = _mm256_extractf128_ps(vf, 1);
3015       o += 4;
3016     }
3017     __m128i vh = _mm_cvtps_ph(vf_lo, _MM_FROUND_NO_EXC);
3018     if (n & (2 * sizeof(float))) {
3019       _mm_storeu_si32(o, vh);
3020       vh = _mm_srli_epi64(vh, 32);
3021       o += 2;
3022     }
3023     if (n & (1 * sizeof(float))) {
3024       *o = (uint16_t) _mm_extract_epi16(vh, 0);
3025     }
3026   }
3027 }
3028