// Copyright 2021 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
5*4bdc9457SAndroid Build Coastguard Worker
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <immintrin.h>

#include <xnnpack/avgpool.h>
#include <xnnpack/common.h>
#include <xnnpack/gavgpool.h>
#include <xnnpack/intrinsics-polyfill.h>
#include <xnnpack/math.h>
#include <xnnpack/maxpool.h>
#include <xnnpack/prelu.h>
#include <xnnpack/rmax.h>
#include <xnnpack/vbinary.h>
#include <xnnpack/vcvt.h>
#include <xnnpack/vunary.h>
21*4bdc9457SAndroid Build Coastguard Worker
22*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_avgpool_minmax_ukernel_9p8x__f16c_c8(size_t output_pixels,size_t kernel_elements,size_t channels,const void ** input,size_t input_offset,const void * zero,void * buffer,void * output,size_t input_increment,size_t output_increment,const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS (1)])23*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_avgpool_minmax_ukernel_9p8x__f16c_c8(
24*4bdc9457SAndroid Build Coastguard Worker size_t output_pixels,
25*4bdc9457SAndroid Build Coastguard Worker size_t kernel_elements,
26*4bdc9457SAndroid Build Coastguard Worker size_t channels,
27*4bdc9457SAndroid Build Coastguard Worker const void** input,
28*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
29*4bdc9457SAndroid Build Coastguard Worker const void* zero,
30*4bdc9457SAndroid Build Coastguard Worker void* buffer,
31*4bdc9457SAndroid Build Coastguard Worker void* output,
32*4bdc9457SAndroid Build Coastguard Worker size_t input_increment,
33*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
34*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
35*4bdc9457SAndroid Build Coastguard Worker {
36*4bdc9457SAndroid Build Coastguard Worker assert(output_pixels != 0);
37*4bdc9457SAndroid Build Coastguard Worker assert(kernel_elements > 9);
38*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
39*4bdc9457SAndroid Build Coastguard Worker
40*4bdc9457SAndroid Build Coastguard Worker const __m256 vscale = _mm256_load_ps(params->avx.scale);
41*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
42*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
43*4bdc9457SAndroid Build Coastguard Worker
44*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
45*4bdc9457SAndroid Build Coastguard Worker do {
46*4bdc9457SAndroid Build Coastguard Worker {
47*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i0 = *input++;
48*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
49*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
50*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
51*4bdc9457SAndroid Build Coastguard Worker }
52*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i1 = *input++;
53*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
54*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
55*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
56*4bdc9457SAndroid Build Coastguard Worker }
57*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i2 = *input++;
58*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
59*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
60*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
61*4bdc9457SAndroid Build Coastguard Worker }
62*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i3 = *input++;
63*4bdc9457SAndroid Build Coastguard Worker assert(i3 != NULL);
64*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i3 != zero) {
65*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
66*4bdc9457SAndroid Build Coastguard Worker }
67*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i4 = *input++;
68*4bdc9457SAndroid Build Coastguard Worker assert(i4 != NULL);
69*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i4 != zero) {
70*4bdc9457SAndroid Build Coastguard Worker i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
71*4bdc9457SAndroid Build Coastguard Worker }
72*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i5 = *input++;
73*4bdc9457SAndroid Build Coastguard Worker assert(i5 != NULL);
74*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i5 != zero) {
75*4bdc9457SAndroid Build Coastguard Worker i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
76*4bdc9457SAndroid Build Coastguard Worker }
77*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i6 = *input++;
78*4bdc9457SAndroid Build Coastguard Worker assert(i6 != NULL);
79*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i6 != zero) {
80*4bdc9457SAndroid Build Coastguard Worker i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
81*4bdc9457SAndroid Build Coastguard Worker }
82*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i7 = *input++;
83*4bdc9457SAndroid Build Coastguard Worker assert(i7 != NULL);
84*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i7 != zero) {
85*4bdc9457SAndroid Build Coastguard Worker i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
86*4bdc9457SAndroid Build Coastguard Worker }
87*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i8 = *input++;
88*4bdc9457SAndroid Build Coastguard Worker assert(i8 != NULL);
89*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i8 != zero) {
90*4bdc9457SAndroid Build Coastguard Worker i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
91*4bdc9457SAndroid Build Coastguard Worker }
92*4bdc9457SAndroid Build Coastguard Worker
93*4bdc9457SAndroid Build Coastguard Worker uint16_t* b = (uint16_t*) buffer;
94*4bdc9457SAndroid Build Coastguard Worker for (size_t c = 0; c < channels; c += 8) {
95*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
96*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
97*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
98*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
99*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
100*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
101*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
102*4bdc9457SAndroid Build Coastguard Worker i3 += 8;
103*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
104*4bdc9457SAndroid Build Coastguard Worker i4 += 8;
105*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
106*4bdc9457SAndroid Build Coastguard Worker i5 += 8;
107*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
108*4bdc9457SAndroid Build Coastguard Worker i6 += 8;
109*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
110*4bdc9457SAndroid Build Coastguard Worker i7 += 8;
111*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
112*4bdc9457SAndroid Build Coastguard Worker i8 += 8;
113*4bdc9457SAndroid Build Coastguard Worker
114*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
115*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
116*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
117*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
118*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum018 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vi8), _MM_FROUND_NO_EXC));
119*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
120*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum01678 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum018, vsum67), _MM_FROUND_NO_EXC));
121*4bdc9457SAndroid Build Coastguard Worker const __m128i vsum = _mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum01678), _MM_FROUND_NO_EXC);
122*4bdc9457SAndroid Build Coastguard Worker
123*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) b, vsum);
124*4bdc9457SAndroid Build Coastguard Worker b += 8;
125*4bdc9457SAndroid Build Coastguard Worker }
126*4bdc9457SAndroid Build Coastguard Worker }
127*4bdc9457SAndroid Build Coastguard Worker
128*4bdc9457SAndroid Build Coastguard Worker size_t k = kernel_elements;
129*4bdc9457SAndroid Build Coastguard Worker for (k -= 9; k > 8; k -= 8) {
130*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i0 = (const uint16_t*) *input++;
131*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
132*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
133*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
134*4bdc9457SAndroid Build Coastguard Worker }
135*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i1 = (const uint16_t*) *input++;
136*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
137*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
138*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
139*4bdc9457SAndroid Build Coastguard Worker }
140*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i2 = (const uint16_t*) *input++;
141*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
142*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
143*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
144*4bdc9457SAndroid Build Coastguard Worker }
145*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i3 = (const uint16_t*) *input++;
146*4bdc9457SAndroid Build Coastguard Worker assert(i3 != NULL);
147*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i3 != zero) {
148*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
149*4bdc9457SAndroid Build Coastguard Worker }
150*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i4 = (const uint16_t*) *input++;
151*4bdc9457SAndroid Build Coastguard Worker assert(i4 != NULL);
152*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i4 != zero) {
153*4bdc9457SAndroid Build Coastguard Worker i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
154*4bdc9457SAndroid Build Coastguard Worker }
155*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i5 = (const uint16_t*) *input++;
156*4bdc9457SAndroid Build Coastguard Worker assert(i5 != NULL);
157*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i5 != zero) {
158*4bdc9457SAndroid Build Coastguard Worker i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
159*4bdc9457SAndroid Build Coastguard Worker }
160*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i6 = (const uint16_t*) *input++;
161*4bdc9457SAndroid Build Coastguard Worker assert(i6 != NULL);
162*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i6 != zero) {
163*4bdc9457SAndroid Build Coastguard Worker i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
164*4bdc9457SAndroid Build Coastguard Worker }
165*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i7 = (const uint16_t*) *input++;
166*4bdc9457SAndroid Build Coastguard Worker assert(i7 != NULL);
167*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i7 != zero) {
168*4bdc9457SAndroid Build Coastguard Worker i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
169*4bdc9457SAndroid Build Coastguard Worker }
170*4bdc9457SAndroid Build Coastguard Worker
171*4bdc9457SAndroid Build Coastguard Worker uint16_t* b = (uint16_t*) buffer;
172*4bdc9457SAndroid Build Coastguard Worker for (size_t c = 0; c < channels; c += 8) {
173*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
174*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
175*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
176*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
177*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
178*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
179*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
180*4bdc9457SAndroid Build Coastguard Worker i3 += 8;
181*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
182*4bdc9457SAndroid Build Coastguard Worker i4 += 8;
183*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
184*4bdc9457SAndroid Build Coastguard Worker i5 += 8;
185*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
186*4bdc9457SAndroid Build Coastguard Worker i6 += 8;
187*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
188*4bdc9457SAndroid Build Coastguard Worker i7 += 8;
189*4bdc9457SAndroid Build Coastguard Worker const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
190*4bdc9457SAndroid Build Coastguard Worker
191*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
192*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
193*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
194*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
195*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum01a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vacc), _MM_FROUND_NO_EXC));
196*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
197*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum0167a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01a, vsum67), _MM_FROUND_NO_EXC));
198*4bdc9457SAndroid Build Coastguard Worker const __m128i vsum = _mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum0167a), _MM_FROUND_NO_EXC);
199*4bdc9457SAndroid Build Coastguard Worker
200*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) b, vsum);
201*4bdc9457SAndroid Build Coastguard Worker b += 8;
202*4bdc9457SAndroid Build Coastguard Worker }
203*4bdc9457SAndroid Build Coastguard Worker }
204*4bdc9457SAndroid Build Coastguard Worker
205*4bdc9457SAndroid Build Coastguard Worker assert(k >= 1);
206*4bdc9457SAndroid Build Coastguard Worker {
207*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i0 = (const uint16_t*) input[0];
208*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
209*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i1 = (const uint16_t*) input[1];
210*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i2 = (const uint16_t*) input[2];
211*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i3 = (const uint16_t*) input[3];
212*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i4 = (const uint16_t*) input[4];
213*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i5 = (const uint16_t*) input[5];
214*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i6 = (const uint16_t*) input[6];
215*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i7 = (const uint16_t*) input[7];
216*4bdc9457SAndroid Build Coastguard Worker input = (const void**) ((uintptr_t) input + input_increment);
217*4bdc9457SAndroid Build Coastguard Worker if (k < 2) {
218*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) zero;
219*4bdc9457SAndroid Build Coastguard Worker }
220*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
221*4bdc9457SAndroid Build Coastguard Worker if (k <= 2) {
222*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint16_t*) zero;
223*4bdc9457SAndroid Build Coastguard Worker }
224*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
225*4bdc9457SAndroid Build Coastguard Worker if (k < 4) {
226*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint16_t*) zero;
227*4bdc9457SAndroid Build Coastguard Worker }
228*4bdc9457SAndroid Build Coastguard Worker assert(i3 != NULL);
229*4bdc9457SAndroid Build Coastguard Worker if (k <= 4) {
230*4bdc9457SAndroid Build Coastguard Worker i4 = (const uint16_t*) zero;
231*4bdc9457SAndroid Build Coastguard Worker }
232*4bdc9457SAndroid Build Coastguard Worker assert(i4 != NULL);
233*4bdc9457SAndroid Build Coastguard Worker if (k < 6) {
234*4bdc9457SAndroid Build Coastguard Worker i5 = (const uint16_t*) zero;
235*4bdc9457SAndroid Build Coastguard Worker }
236*4bdc9457SAndroid Build Coastguard Worker assert(i5 != NULL);
237*4bdc9457SAndroid Build Coastguard Worker if (k <= 6) {
238*4bdc9457SAndroid Build Coastguard Worker i6 = (const uint16_t*) zero;
239*4bdc9457SAndroid Build Coastguard Worker }
240*4bdc9457SAndroid Build Coastguard Worker assert(i6 != NULL);
241*4bdc9457SAndroid Build Coastguard Worker if (k < 8) {
242*4bdc9457SAndroid Build Coastguard Worker i7 = (const uint16_t*) zero;
243*4bdc9457SAndroid Build Coastguard Worker }
244*4bdc9457SAndroid Build Coastguard Worker assert(i7 != NULL);
245*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
246*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
247*4bdc9457SAndroid Build Coastguard Worker }
248*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
249*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
250*4bdc9457SAndroid Build Coastguard Worker }
251*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
252*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
253*4bdc9457SAndroid Build Coastguard Worker }
254*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i3 != zero) {
255*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
256*4bdc9457SAndroid Build Coastguard Worker }
257*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i4 != zero) {
258*4bdc9457SAndroid Build Coastguard Worker i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
259*4bdc9457SAndroid Build Coastguard Worker }
260*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i5 != zero) {
261*4bdc9457SAndroid Build Coastguard Worker i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
262*4bdc9457SAndroid Build Coastguard Worker }
263*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i6 != zero) {
264*4bdc9457SAndroid Build Coastguard Worker i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
265*4bdc9457SAndroid Build Coastguard Worker }
266*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i7 != zero) {
267*4bdc9457SAndroid Build Coastguard Worker i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
268*4bdc9457SAndroid Build Coastguard Worker }
269*4bdc9457SAndroid Build Coastguard Worker
270*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
271*4bdc9457SAndroid Build Coastguard Worker uint16_t* b = (uint16_t*) buffer;
272*4bdc9457SAndroid Build Coastguard Worker while (c >= 8) {
273*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
274*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
275*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
276*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
277*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
278*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
279*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
280*4bdc9457SAndroid Build Coastguard Worker i3 += 8;
281*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
282*4bdc9457SAndroid Build Coastguard Worker i4 += 8;
283*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
284*4bdc9457SAndroid Build Coastguard Worker i5 += 8;
285*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
286*4bdc9457SAndroid Build Coastguard Worker i6 += 8;
287*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
288*4bdc9457SAndroid Build Coastguard Worker i7 += 8;
289*4bdc9457SAndroid Build Coastguard Worker const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
290*4bdc9457SAndroid Build Coastguard Worker b += 8;
291*4bdc9457SAndroid Build Coastguard Worker
292*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
293*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
294*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
295*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
296*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum01a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vacc), _MM_FROUND_NO_EXC));
297*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
298*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum0167a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01a, vsum67), _MM_FROUND_NO_EXC));
299*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum0167a), _MM_FROUND_NO_EXC));
300*4bdc9457SAndroid Build Coastguard Worker
301*4bdc9457SAndroid Build Coastguard Worker __m256 vout = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vsum, vscale), _MM_FROUND_NO_EXC));
302*4bdc9457SAndroid Build Coastguard Worker vout = _mm256_max_ps(vout, vmin);
303*4bdc9457SAndroid Build Coastguard Worker vout = _mm256_min_ps(vout, vmax);
304*4bdc9457SAndroid Build Coastguard Worker
305*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
306*4bdc9457SAndroid Build Coastguard Worker o += 8;
307*4bdc9457SAndroid Build Coastguard Worker
308*4bdc9457SAndroid Build Coastguard Worker c -= 8;
309*4bdc9457SAndroid Build Coastguard Worker }
310*4bdc9457SAndroid Build Coastguard Worker if (c != 0) {
311*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
312*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
313*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
314*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
315*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
316*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
317*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
318*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
319*4bdc9457SAndroid Build Coastguard Worker const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
320*4bdc9457SAndroid Build Coastguard Worker
321*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
322*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
323*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
324*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
325*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum01a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vacc), _MM_FROUND_NO_EXC));
326*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
327*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum0167a = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01a, vsum67), _MM_FROUND_NO_EXC));
328*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum0167a), _MM_FROUND_NO_EXC));
329*4bdc9457SAndroid Build Coastguard Worker
330*4bdc9457SAndroid Build Coastguard Worker __m256 vout = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vsum, vscale), _MM_FROUND_NO_EXC));
331*4bdc9457SAndroid Build Coastguard Worker vout = _mm256_max_ps(vout, vmin);
332*4bdc9457SAndroid Build Coastguard Worker vout = _mm256_min_ps(vout, vmax);
333*4bdc9457SAndroid Build Coastguard Worker
334*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
335*4bdc9457SAndroid Build Coastguard Worker if (c & 4) {
336*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh);
337*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
338*4bdc9457SAndroid Build Coastguard Worker o += 4;
339*4bdc9457SAndroid Build Coastguard Worker }
340*4bdc9457SAndroid Build Coastguard Worker if (c & 2) {
341*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh);
342*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
343*4bdc9457SAndroid Build Coastguard Worker o += 2;
344*4bdc9457SAndroid Build Coastguard Worker }
345*4bdc9457SAndroid Build Coastguard Worker if (c & 1) {
346*4bdc9457SAndroid Build Coastguard Worker *o = (uint16_t) _mm_extract_epi16(vh, 0);
347*4bdc9457SAndroid Build Coastguard Worker o += 1;
348*4bdc9457SAndroid Build Coastguard Worker }
349*4bdc9457SAndroid Build Coastguard Worker }
350*4bdc9457SAndroid Build Coastguard Worker }
351*4bdc9457SAndroid Build Coastguard Worker o = (uint16_t*) ((uintptr_t) o + output_increment);
352*4bdc9457SAndroid Build Coastguard Worker } while (--output_pixels != 0);
353*4bdc9457SAndroid Build Coastguard Worker }
354*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_avgpool_minmax_ukernel_9x__f16c_c8(size_t output_pixels,size_t kernel_elements,size_t channels,const void ** input,size_t input_offset,const void * zero,void * output,size_t input_increment,size_t output_increment,const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS (1)])355*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_avgpool_minmax_ukernel_9x__f16c_c8(
356*4bdc9457SAndroid Build Coastguard Worker size_t output_pixels,
357*4bdc9457SAndroid Build Coastguard Worker size_t kernel_elements,
358*4bdc9457SAndroid Build Coastguard Worker size_t channels,
359*4bdc9457SAndroid Build Coastguard Worker const void** input,
360*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
361*4bdc9457SAndroid Build Coastguard Worker const void* zero,
362*4bdc9457SAndroid Build Coastguard Worker void* output,
363*4bdc9457SAndroid Build Coastguard Worker size_t input_increment,
364*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
365*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
366*4bdc9457SAndroid Build Coastguard Worker {
367*4bdc9457SAndroid Build Coastguard Worker assert(output_pixels != 0);
368*4bdc9457SAndroid Build Coastguard Worker assert(kernel_elements != 0);
369*4bdc9457SAndroid Build Coastguard Worker assert(kernel_elements <= 9);
370*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
371*4bdc9457SAndroid Build Coastguard Worker
372*4bdc9457SAndroid Build Coastguard Worker const __m256 vscale = _mm256_load_ps(params->avx.scale);
373*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
374*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
375*4bdc9457SAndroid Build Coastguard Worker
376*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
377*4bdc9457SAndroid Build Coastguard Worker do {
378*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i0 = (const uint16_t*) input[0];
379*4bdc9457SAndroid Build Coastguard Worker assert(i0 != NULL);
380*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i1 = (const uint16_t*) input[1];
381*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i2 = (const uint16_t*) input[2];
382*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i3 = (const uint16_t*) input[3];
383*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i4 = (const uint16_t*) input[4];
384*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i5 = (const uint16_t*) input[5];
385*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i6 = (const uint16_t*) input[6];
386*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i7 = (const uint16_t*) input[7];
387*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i8 = (const uint16_t*) input[8];
388*4bdc9457SAndroid Build Coastguard Worker input = (const void**) ((uintptr_t) input + input_increment);
389*4bdc9457SAndroid Build Coastguard Worker if (kernel_elements < 2) {
390*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) zero;
391*4bdc9457SAndroid Build Coastguard Worker }
392*4bdc9457SAndroid Build Coastguard Worker assert(i1 != NULL);
393*4bdc9457SAndroid Build Coastguard Worker if (kernel_elements <= 2) {
394*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint16_t*) zero;
395*4bdc9457SAndroid Build Coastguard Worker }
396*4bdc9457SAndroid Build Coastguard Worker assert(i2 != NULL);
397*4bdc9457SAndroid Build Coastguard Worker if (kernel_elements < 4) {
398*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint16_t*) zero;
399*4bdc9457SAndroid Build Coastguard Worker }
400*4bdc9457SAndroid Build Coastguard Worker assert(i3 != NULL);
401*4bdc9457SAndroid Build Coastguard Worker if (kernel_elements <= 4) {
402*4bdc9457SAndroid Build Coastguard Worker i4 = (const uint16_t*) zero;
403*4bdc9457SAndroid Build Coastguard Worker }
404*4bdc9457SAndroid Build Coastguard Worker assert(i4 != NULL);
405*4bdc9457SAndroid Build Coastguard Worker if (kernel_elements < 6) {
406*4bdc9457SAndroid Build Coastguard Worker i5 = (const uint16_t*) zero;
407*4bdc9457SAndroid Build Coastguard Worker }
408*4bdc9457SAndroid Build Coastguard Worker assert(i5 != NULL);
409*4bdc9457SAndroid Build Coastguard Worker if (kernel_elements <= 6) {
410*4bdc9457SAndroid Build Coastguard Worker i6 = (const uint16_t*) zero;
411*4bdc9457SAndroid Build Coastguard Worker }
412*4bdc9457SAndroid Build Coastguard Worker assert(i6 != NULL);
413*4bdc9457SAndroid Build Coastguard Worker if (kernel_elements < 8) {
414*4bdc9457SAndroid Build Coastguard Worker i7 = (const uint16_t*) zero;
415*4bdc9457SAndroid Build Coastguard Worker }
416*4bdc9457SAndroid Build Coastguard Worker assert(i7 != NULL);
417*4bdc9457SAndroid Build Coastguard Worker if (kernel_elements <= 8) {
418*4bdc9457SAndroid Build Coastguard Worker i8 = (const uint16_t*) zero;
419*4bdc9457SAndroid Build Coastguard Worker }
420*4bdc9457SAndroid Build Coastguard Worker assert(i8 != NULL);
421*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i0 != zero) {
422*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
423*4bdc9457SAndroid Build Coastguard Worker }
424*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i1 != zero) {
425*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
426*4bdc9457SAndroid Build Coastguard Worker }
427*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i2 != zero) {
428*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
429*4bdc9457SAndroid Build Coastguard Worker }
430*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i3 != zero) {
431*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
432*4bdc9457SAndroid Build Coastguard Worker }
433*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i4 != zero) {
434*4bdc9457SAndroid Build Coastguard Worker i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
435*4bdc9457SAndroid Build Coastguard Worker }
436*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i5 != zero) {
437*4bdc9457SAndroid Build Coastguard Worker i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
438*4bdc9457SAndroid Build Coastguard Worker }
439*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i6 != zero) {
440*4bdc9457SAndroid Build Coastguard Worker i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
441*4bdc9457SAndroid Build Coastguard Worker }
442*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i7 != zero) {
443*4bdc9457SAndroid Build Coastguard Worker i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
444*4bdc9457SAndroid Build Coastguard Worker }
445*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(i8 != zero) {
446*4bdc9457SAndroid Build Coastguard Worker i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
447*4bdc9457SAndroid Build Coastguard Worker }
448*4bdc9457SAndroid Build Coastguard Worker
449*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
450*4bdc9457SAndroid Build Coastguard Worker while (c >= 8) {
451*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
452*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
453*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
454*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
455*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
456*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
457*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
458*4bdc9457SAndroid Build Coastguard Worker i3 += 8;
459*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
460*4bdc9457SAndroid Build Coastguard Worker i4 += 8;
461*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
462*4bdc9457SAndroid Build Coastguard Worker i5 += 8;
463*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
464*4bdc9457SAndroid Build Coastguard Worker i6 += 8;
465*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
466*4bdc9457SAndroid Build Coastguard Worker i7 += 8;
467*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
468*4bdc9457SAndroid Build Coastguard Worker i8 += 8;
469*4bdc9457SAndroid Build Coastguard Worker
470*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
471*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
472*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
473*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
474*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum018 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vi8), _MM_FROUND_NO_EXC));
475*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
476*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum01678 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum018, vsum67), _MM_FROUND_NO_EXC));
477*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum01678), _MM_FROUND_NO_EXC));
478*4bdc9457SAndroid Build Coastguard Worker
479*4bdc9457SAndroid Build Coastguard Worker __m256 vout = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vsum, vscale), _MM_FROUND_NO_EXC));
480*4bdc9457SAndroid Build Coastguard Worker vout = _mm256_max_ps(vout, vmin);
481*4bdc9457SAndroid Build Coastguard Worker vout = _mm256_min_ps(vout, vmax);
482*4bdc9457SAndroid Build Coastguard Worker
483*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
484*4bdc9457SAndroid Build Coastguard Worker o += 8;
485*4bdc9457SAndroid Build Coastguard Worker
486*4bdc9457SAndroid Build Coastguard Worker c -= 8;
487*4bdc9457SAndroid Build Coastguard Worker }
488*4bdc9457SAndroid Build Coastguard Worker if (c != 0) {
489*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
490*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
491*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
492*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
493*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
494*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
495*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
496*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
497*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
498*4bdc9457SAndroid Build Coastguard Worker
499*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum01 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi0, vi1), _MM_FROUND_NO_EXC));
500*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum23 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi2, vi3), _MM_FROUND_NO_EXC));
501*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum45 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi4, vi5), _MM_FROUND_NO_EXC));
502*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum67 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vi6, vi7), _MM_FROUND_NO_EXC));
503*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum018 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum01, vi8), _MM_FROUND_NO_EXC));
504*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum2345 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum23, vsum45), _MM_FROUND_NO_EXC));
505*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum01678 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum018, vsum67), _MM_FROUND_NO_EXC));
506*4bdc9457SAndroid Build Coastguard Worker const __m256 vsum = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(vsum2345, vsum01678), _MM_FROUND_NO_EXC));
507*4bdc9457SAndroid Build Coastguard Worker
508*4bdc9457SAndroid Build Coastguard Worker __m256 vout = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vsum, vscale), _MM_FROUND_NO_EXC));
509*4bdc9457SAndroid Build Coastguard Worker vout = _mm256_max_ps(vout, vmin);
510*4bdc9457SAndroid Build Coastguard Worker vout = _mm256_min_ps(vout, vmax);
511*4bdc9457SAndroid Build Coastguard Worker
512*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
513*4bdc9457SAndroid Build Coastguard Worker if (c & 4) {
514*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh);
515*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
516*4bdc9457SAndroid Build Coastguard Worker o += 4;
517*4bdc9457SAndroid Build Coastguard Worker }
518*4bdc9457SAndroid Build Coastguard Worker if (c & 2) {
519*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh);
520*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
521*4bdc9457SAndroid Build Coastguard Worker o += 2;
522*4bdc9457SAndroid Build Coastguard Worker }
523*4bdc9457SAndroid Build Coastguard Worker if (c & 1) {
524*4bdc9457SAndroid Build Coastguard Worker *o = (uint16_t) _mm_extract_epi16(vh, 0);
525*4bdc9457SAndroid Build Coastguard Worker o += 1;
526*4bdc9457SAndroid Build Coastguard Worker }
527*4bdc9457SAndroid Build Coastguard Worker }
528*4bdc9457SAndroid Build Coastguard Worker o = (uint16_t*) ((uintptr_t) o + output_increment);
529*4bdc9457SAndroid Build Coastguard Worker } while (--output_pixels != 0);
530*4bdc9457SAndroid Build Coastguard Worker }
531*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_f32_vcvt_ukernel__f16c_x16(size_t n,const void * input,float * output,const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])532*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_f32_vcvt_ukernel__f16c_x16(
533*4bdc9457SAndroid Build Coastguard Worker size_t n,
534*4bdc9457SAndroid Build Coastguard Worker const void* input,
535*4bdc9457SAndroid Build Coastguard Worker float* output,
536*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_f32_cvt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
537*4bdc9457SAndroid Build Coastguard Worker {
538*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
539*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
540*4bdc9457SAndroid Build Coastguard Worker assert(input != NULL);
541*4bdc9457SAndroid Build Coastguard Worker assert(output != NULL);
542*4bdc9457SAndroid Build Coastguard Worker
543*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i = (const uint16_t*) input;
544*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
545*4bdc9457SAndroid Build Coastguard Worker const __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
546*4bdc9457SAndroid Build Coastguard Worker const __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
547*4bdc9457SAndroid Build Coastguard Worker i += 16;
548*4bdc9457SAndroid Build Coastguard Worker
549*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(output, vacc0);
550*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(output + 8, vacc1);
551*4bdc9457SAndroid Build Coastguard Worker output += 16;
552*4bdc9457SAndroid Build Coastguard Worker }
553*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
554*4bdc9457SAndroid Build Coastguard Worker const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
555*4bdc9457SAndroid Build Coastguard Worker i += 8;
556*4bdc9457SAndroid Build Coastguard Worker
557*4bdc9457SAndroid Build Coastguard Worker _mm256_storeu_ps(output, vacc);
558*4bdc9457SAndroid Build Coastguard Worker output += 8;
559*4bdc9457SAndroid Build Coastguard Worker }
560*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
561*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(uint16_t));
562*4bdc9457SAndroid Build Coastguard Worker assert(n <= 7 * sizeof(uint16_t));
563*4bdc9457SAndroid Build Coastguard Worker const __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
564*4bdc9457SAndroid Build Coastguard Worker
565*4bdc9457SAndroid Build Coastguard Worker __m128 vacc_lo = _mm256_castps256_ps128(vacc);
566*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
567*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_ps(output, vacc_lo);
568*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm256_extractf128_ps(vacc, 1);
569*4bdc9457SAndroid Build Coastguard Worker output += 4;
570*4bdc9457SAndroid Build Coastguard Worker }
571*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
572*4bdc9457SAndroid Build Coastguard Worker _mm_storel_pi((__m64*) output, vacc_lo);
573*4bdc9457SAndroid Build Coastguard Worker vacc_lo = _mm_movehl_ps(vacc_lo, vacc_lo);
574*4bdc9457SAndroid Build Coastguard Worker output += 2;
575*4bdc9457SAndroid Build Coastguard Worker }
576*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
577*4bdc9457SAndroid Build Coastguard Worker _mm_store_ss(output, vacc_lo);
578*4bdc9457SAndroid Build Coastguard Worker }
579*4bdc9457SAndroid Build Coastguard Worker }
580*4bdc9457SAndroid Build Coastguard Worker }
581*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8(size_t rows,size_t channels,const void * input,size_t input_stride,const void * zero,void * buffer,void * output,const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS (1)])582*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_gavgpool_minmax_ukernel_7p7x__f16c_c8(
583*4bdc9457SAndroid Build Coastguard Worker size_t rows,
584*4bdc9457SAndroid Build Coastguard Worker size_t channels,
585*4bdc9457SAndroid Build Coastguard Worker const void* input,
586*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
587*4bdc9457SAndroid Build Coastguard Worker const void* zero,
588*4bdc9457SAndroid Build Coastguard Worker void* buffer,
589*4bdc9457SAndroid Build Coastguard Worker void* output,
590*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
591*4bdc9457SAndroid Build Coastguard Worker {
592*4bdc9457SAndroid Build Coastguard Worker assert(rows > 7);
593*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
594*4bdc9457SAndroid Build Coastguard Worker
595*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i0 = input;
596*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
597*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
598*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
599*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
600*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
601*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
602*4bdc9457SAndroid Build Coastguard Worker const size_t input_increment = 7 * input_stride - round_up_po2(channels, 8) * sizeof(uint16_t);
603*4bdc9457SAndroid Build Coastguard Worker
604*4bdc9457SAndroid Build Coastguard Worker uint16_t* b = buffer;
605*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
606*4bdc9457SAndroid Build Coastguard Worker for (; c != 0; c = doz(c, 8)) {
607*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
608*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
609*4bdc9457SAndroid Build Coastguard Worker
610*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
611*4bdc9457SAndroid Build Coastguard Worker __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
612*4bdc9457SAndroid Build Coastguard Worker
613*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
614*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
615*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
616*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
617*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
618*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
619*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
620*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
621*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
622*4bdc9457SAndroid Build Coastguard Worker
623*4bdc9457SAndroid Build Coastguard Worker _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
624*4bdc9457SAndroid Build Coastguard Worker }
625*4bdc9457SAndroid Build Coastguard Worker
626*4bdc9457SAndroid Build Coastguard Worker for (rows -= 7; rows > 7; rows -= 7) {
627*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
628*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
629*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
630*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
631*4bdc9457SAndroid Build Coastguard Worker i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
632*4bdc9457SAndroid Build Coastguard Worker i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
633*4bdc9457SAndroid Build Coastguard Worker i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
634*4bdc9457SAndroid Build Coastguard Worker
635*4bdc9457SAndroid Build Coastguard Worker uint16_t* b = buffer;
636*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
637*4bdc9457SAndroid Build Coastguard Worker for (; c != 0; c = doz(c, 8)) {
638*4bdc9457SAndroid Build Coastguard Worker __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) b);
639*4bdc9457SAndroid Build Coastguard Worker
640*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
641*4bdc9457SAndroid Build Coastguard Worker
642*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
643*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
644*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
645*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
646*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
647*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
648*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
649*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
650*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
651*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
652*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
653*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
654*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
655*4bdc9457SAndroid Build Coastguard Worker
656*4bdc9457SAndroid Build Coastguard Worker _mm_store_si128((__m128i*) b, vacc01234567); b += 8;
657*4bdc9457SAndroid Build Coastguard Worker }
658*4bdc9457SAndroid Build Coastguard Worker }
659*4bdc9457SAndroid Build Coastguard Worker
660*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
661*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
662*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(rows < 2) {
663*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) zero;
664*4bdc9457SAndroid Build Coastguard Worker }
665*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint16_t*) ((uintptr_t) i2 + input_increment);
666*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(rows <= 2) {
667*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint16_t*) zero;
668*4bdc9457SAndroid Build Coastguard Worker }
669*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint16_t*) ((uintptr_t) i3 + input_increment);
670*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(rows < 4) {
671*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint16_t*) zero;
672*4bdc9457SAndroid Build Coastguard Worker }
673*4bdc9457SAndroid Build Coastguard Worker i4 = (const uint16_t*) ((uintptr_t) i4 + input_increment);
674*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(rows <= 4) {
675*4bdc9457SAndroid Build Coastguard Worker i4 = (const uint16_t*) zero;
676*4bdc9457SAndroid Build Coastguard Worker }
677*4bdc9457SAndroid Build Coastguard Worker i5 = (const uint16_t*) ((uintptr_t) i5 + input_increment);
678*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(rows < 6) {
679*4bdc9457SAndroid Build Coastguard Worker i5 = (const uint16_t*) zero;
680*4bdc9457SAndroid Build Coastguard Worker }
681*4bdc9457SAndroid Build Coastguard Worker i6 = (const uint16_t*) ((uintptr_t) i6 + input_increment);
682*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(rows <= 6) {
683*4bdc9457SAndroid Build Coastguard Worker i6 = (const uint16_t*) zero;
684*4bdc9457SAndroid Build Coastguard Worker }
685*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
686*4bdc9457SAndroid Build Coastguard Worker
687*4bdc9457SAndroid Build Coastguard Worker const __m256 vscale = _mm256_load_ps(params->avx.scale);
688*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
689*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
690*4bdc9457SAndroid Build Coastguard Worker for (; channels >= 8; channels -= 8) {
691*4bdc9457SAndroid Build Coastguard Worker __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
692*4bdc9457SAndroid Build Coastguard Worker
693*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
694*4bdc9457SAndroid Build Coastguard Worker
695*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
696*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
697*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
698*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
699*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
700*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
701*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
702*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
703*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
704*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
705*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
706*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
707*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
708*4bdc9457SAndroid Build Coastguard Worker
709*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
710*4bdc9457SAndroid Build Coastguard Worker
711*4bdc9457SAndroid Build Coastguard Worker __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
712*4bdc9457SAndroid Build Coastguard Worker
713*4bdc9457SAndroid Build Coastguard Worker vout01234567 = _mm256_min_ps(vout01234567, vmax);
714*4bdc9457SAndroid Build Coastguard Worker
715*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC));
716*4bdc9457SAndroid Build Coastguard Worker o += 8;
717*4bdc9457SAndroid Build Coastguard Worker }
718*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(channels != 0) {
719*4bdc9457SAndroid Build Coastguard Worker {
720*4bdc9457SAndroid Build Coastguard Worker __m128i vacc01234567 = _mm_loadu_si128((const __m128i*) buffer); buffer = (uint16_t*) buffer + 8;
721*4bdc9457SAndroid Build Coastguard Worker
722*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0)); i0 += 8;
723*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1)); i1 += 8;
724*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi0x01234567), _MM_FROUND_NO_EXC);
725*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2)); i2 += 8;
726*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi1x01234567), _MM_FROUND_NO_EXC);
727*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3)); i3 += 8;
728*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
729*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4)); i4 += 8;
730*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
731*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5)); i5 += 8;
732*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
733*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6)); i6 += 8;
734*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
735*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
736*4bdc9457SAndroid Build Coastguard Worker
737*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
738*4bdc9457SAndroid Build Coastguard Worker __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
739*4bdc9457SAndroid Build Coastguard Worker vout01234567 = _mm256_min_ps(vout01234567, vmax);
740*4bdc9457SAndroid Build Coastguard Worker
741*4bdc9457SAndroid Build Coastguard Worker __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC);
742*4bdc9457SAndroid Build Coastguard Worker if (channels & 4) {
743*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh01234567);
744*4bdc9457SAndroid Build Coastguard Worker o += 4;
745*4bdc9457SAndroid Build Coastguard Worker vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
746*4bdc9457SAndroid Build Coastguard Worker }
747*4bdc9457SAndroid Build Coastguard Worker if (channels & 2) {
748*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh01234567);
749*4bdc9457SAndroid Build Coastguard Worker o += 2;
750*4bdc9457SAndroid Build Coastguard Worker vh01234567 = _mm_srli_epi64(vh01234567, 32);
751*4bdc9457SAndroid Build Coastguard Worker }
752*4bdc9457SAndroid Build Coastguard Worker if (channels & 1) {
753*4bdc9457SAndroid Build Coastguard Worker *o = (uint16_t) _mm_extract_epi16(vh01234567, 0);
754*4bdc9457SAndroid Build Coastguard Worker }
755*4bdc9457SAndroid Build Coastguard Worker }
756*4bdc9457SAndroid Build Coastguard Worker }
757*4bdc9457SAndroid Build Coastguard Worker }
758*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8(size_t rows,size_t channels,const void * input,size_t input_stride,const void * zero,void * output,const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS (1)])759*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_gavgpool_minmax_ukernel_7x__f16c_c8(
760*4bdc9457SAndroid Build Coastguard Worker size_t rows,
761*4bdc9457SAndroid Build Coastguard Worker size_t channels,
762*4bdc9457SAndroid Build Coastguard Worker const void* input,
763*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
764*4bdc9457SAndroid Build Coastguard Worker const void* zero,
765*4bdc9457SAndroid Build Coastguard Worker void* output,
766*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_scaleminmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
767*4bdc9457SAndroid Build Coastguard Worker {
768*4bdc9457SAndroid Build Coastguard Worker assert(rows != 0);
769*4bdc9457SAndroid Build Coastguard Worker assert(rows <= 7);
770*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
771*4bdc9457SAndroid Build Coastguard Worker
772*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i0 = input;
773*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
774*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(rows < 2) {
775*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) zero;
776*4bdc9457SAndroid Build Coastguard Worker }
777*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i2 = (const uint16_t*) ((uintptr_t) i1 + input_stride);
778*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(rows <= 2) {
779*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint16_t*) zero;
780*4bdc9457SAndroid Build Coastguard Worker }
781*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i3 = (const uint16_t*) ((uintptr_t) i2 + input_stride);
782*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(rows < 4) {
783*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint16_t*) zero;
784*4bdc9457SAndroid Build Coastguard Worker }
785*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i4 = (const uint16_t*) ((uintptr_t) i3 + input_stride);
786*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(rows <= 4) {
787*4bdc9457SAndroid Build Coastguard Worker i4 = (const uint16_t*) zero;
788*4bdc9457SAndroid Build Coastguard Worker }
789*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i5 = (const uint16_t*) ((uintptr_t) i4 + input_stride);
790*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(rows < 6) {
791*4bdc9457SAndroid Build Coastguard Worker i5 = (const uint16_t*) zero;
792*4bdc9457SAndroid Build Coastguard Worker }
793*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i6 = (const uint16_t*) ((uintptr_t) i5 + input_stride);
794*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(rows <= 6) {
795*4bdc9457SAndroid Build Coastguard Worker i6 = (const uint16_t*) zero;
796*4bdc9457SAndroid Build Coastguard Worker }
797*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
798*4bdc9457SAndroid Build Coastguard Worker
799*4bdc9457SAndroid Build Coastguard Worker const __m256 vscale = _mm256_load_ps(params->avx.scale);
800*4bdc9457SAndroid Build Coastguard Worker const __m256 vmin = _mm256_load_ps(params->avx.min);
801*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_load_ps(params->avx.max);
802*4bdc9457SAndroid Build Coastguard Worker for (; channels >= 8; channels -= 8) {
803*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
804*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
805*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
806*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
807*4bdc9457SAndroid Build Coastguard Worker
808*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
809*4bdc9457SAndroid Build Coastguard Worker __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
810*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
811*4bdc9457SAndroid Build Coastguard Worker
812*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
813*4bdc9457SAndroid Build Coastguard Worker i3 += 8;
814*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
815*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
816*4bdc9457SAndroid Build Coastguard Worker i4 += 8;
817*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
818*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
819*4bdc9457SAndroid Build Coastguard Worker i5 += 8;
820*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
821*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
822*4bdc9457SAndroid Build Coastguard Worker i6 += 8;
823*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
824*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
825*4bdc9457SAndroid Build Coastguard Worker
826*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
827*4bdc9457SAndroid Build Coastguard Worker
828*4bdc9457SAndroid Build Coastguard Worker __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
829*4bdc9457SAndroid Build Coastguard Worker
830*4bdc9457SAndroid Build Coastguard Worker vout01234567 = _mm256_min_ps(vout01234567, vmax);
831*4bdc9457SAndroid Build Coastguard Worker
832*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC));
833*4bdc9457SAndroid Build Coastguard Worker o += 8;
834*4bdc9457SAndroid Build Coastguard Worker }
835*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(channels != 0) {
836*4bdc9457SAndroid Build Coastguard Worker {
837*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
838*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
839*4bdc9457SAndroid Build Coastguard Worker
840*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
841*4bdc9457SAndroid Build Coastguard Worker __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vi0x01234567, vi1x01234567), _MM_FROUND_NO_EXC);
842*4bdc9457SAndroid Build Coastguard Worker
843*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
844*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi2x01234567), _MM_FROUND_NO_EXC);
845*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
846*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi3x01234567), _MM_FROUND_NO_EXC);
847*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
848*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi4x01234567), _MM_FROUND_NO_EXC);
849*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
850*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi5x01234567), _MM_FROUND_NO_EXC);
851*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(_mm256_cvtph_ps(vacc01234567), vi6x01234567), _MM_FROUND_NO_EXC);
852*4bdc9457SAndroid Build Coastguard Worker
853*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vscale), _MM_FROUND_NO_EXC);
854*4bdc9457SAndroid Build Coastguard Worker __m256 vout01234567 = _mm256_max_ps(_mm256_cvtph_ps(vacc01234567), vmin);
855*4bdc9457SAndroid Build Coastguard Worker vout01234567 = _mm256_min_ps(vout01234567, vmax);
856*4bdc9457SAndroid Build Coastguard Worker
857*4bdc9457SAndroid Build Coastguard Worker __m128i vh01234567 = _mm256_cvtps_ph(vout01234567, _MM_FROUND_NO_EXC);
858*4bdc9457SAndroid Build Coastguard Worker if (channels & 4) {
859*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh01234567);
860*4bdc9457SAndroid Build Coastguard Worker o += 4;
861*4bdc9457SAndroid Build Coastguard Worker vh01234567 = _mm_unpackhi_epi64(vh01234567, vh01234567);
862*4bdc9457SAndroid Build Coastguard Worker }
863*4bdc9457SAndroid Build Coastguard Worker if (channels & 2) {
864*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh01234567);
865*4bdc9457SAndroid Build Coastguard Worker o += 2;
866*4bdc9457SAndroid Build Coastguard Worker vh01234567 = _mm_srli_epi64(vh01234567, 32);
867*4bdc9457SAndroid Build Coastguard Worker }
868*4bdc9457SAndroid Build Coastguard Worker if (channels & 1) {
869*4bdc9457SAndroid Build Coastguard Worker *o = (uint16_t) _mm_extract_epi16(vh01234567, 0);
870*4bdc9457SAndroid Build Coastguard Worker }
871*4bdc9457SAndroid Build Coastguard Worker }
872*4bdc9457SAndroid Build Coastguard Worker }
873*4bdc9457SAndroid Build Coastguard Worker }
874*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8(size_t output_pixels,size_t kernel_elements,size_t channels,const void ** input,size_t input_offset,void * output,size_t input_increment,size_t output_increment,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])875*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_maxpool_minmax_ukernel_9p8x__f16c_c8(
876*4bdc9457SAndroid Build Coastguard Worker size_t output_pixels,
877*4bdc9457SAndroid Build Coastguard Worker size_t kernel_elements,
878*4bdc9457SAndroid Build Coastguard Worker size_t channels,
879*4bdc9457SAndroid Build Coastguard Worker const void** input,
880*4bdc9457SAndroid Build Coastguard Worker size_t input_offset,
881*4bdc9457SAndroid Build Coastguard Worker void* output,
882*4bdc9457SAndroid Build Coastguard Worker size_t input_increment,
883*4bdc9457SAndroid Build Coastguard Worker size_t output_increment,
884*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
885*4bdc9457SAndroid Build Coastguard Worker {
886*4bdc9457SAndroid Build Coastguard Worker assert(output_pixels != 0);
887*4bdc9457SAndroid Build Coastguard Worker assert(kernel_elements != 0);
888*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
889*4bdc9457SAndroid Build Coastguard Worker
890*4bdc9457SAndroid Build Coastguard Worker const __m256 voutput_min = _mm256_load_ps(params->avx.min);
891*4bdc9457SAndroid Build Coastguard Worker const __m256 voutput_max = _mm256_load_ps(params->avx.max);
892*4bdc9457SAndroid Build Coastguard Worker do {
893*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = output;
894*4bdc9457SAndroid Build Coastguard Worker {
895*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i0 = *input++;
896*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i1 = *input++;
897*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i2 = *input++;
898*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i3 = *input++;
899*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i4 = *input++;
900*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i5 = *input++;
901*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i6 = *input++;
902*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i7 = *input++;
903*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i8 = *input++;
904*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
905*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
906*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
907*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
908*4bdc9457SAndroid Build Coastguard Worker i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
909*4bdc9457SAndroid Build Coastguard Worker i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
910*4bdc9457SAndroid Build Coastguard Worker i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
911*4bdc9457SAndroid Build Coastguard Worker i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
912*4bdc9457SAndroid Build Coastguard Worker i8 = (const uint16_t*) ((uintptr_t) i8 + input_offset);
913*4bdc9457SAndroid Build Coastguard Worker if (kernel_elements < 2) {
914*4bdc9457SAndroid Build Coastguard Worker i1 = i0;
915*4bdc9457SAndroid Build Coastguard Worker }
916*4bdc9457SAndroid Build Coastguard Worker if (kernel_elements <= 2) {
917*4bdc9457SAndroid Build Coastguard Worker i2 = i0;
918*4bdc9457SAndroid Build Coastguard Worker }
919*4bdc9457SAndroid Build Coastguard Worker if (kernel_elements < 4) {
920*4bdc9457SAndroid Build Coastguard Worker i3 = i0;
921*4bdc9457SAndroid Build Coastguard Worker }
922*4bdc9457SAndroid Build Coastguard Worker if (kernel_elements <= 4) {
923*4bdc9457SAndroid Build Coastguard Worker i4 = i0;
924*4bdc9457SAndroid Build Coastguard Worker }
925*4bdc9457SAndroid Build Coastguard Worker if (kernel_elements < 6) {
926*4bdc9457SAndroid Build Coastguard Worker i5 = i0;
927*4bdc9457SAndroid Build Coastguard Worker }
928*4bdc9457SAndroid Build Coastguard Worker if (kernel_elements <= 6) {
929*4bdc9457SAndroid Build Coastguard Worker i6 = i0;
930*4bdc9457SAndroid Build Coastguard Worker }
931*4bdc9457SAndroid Build Coastguard Worker if (kernel_elements < 8) {
932*4bdc9457SAndroid Build Coastguard Worker i7 = i0;
933*4bdc9457SAndroid Build Coastguard Worker }
934*4bdc9457SAndroid Build Coastguard Worker if (kernel_elements <= 8) {
935*4bdc9457SAndroid Build Coastguard Worker i8 = i0;
936*4bdc9457SAndroid Build Coastguard Worker }
937*4bdc9457SAndroid Build Coastguard Worker
938*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
939*4bdc9457SAndroid Build Coastguard Worker for (; c >= 8; c -= 8) {
940*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
941*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
942*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
943*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
944*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
945*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
946*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
947*4bdc9457SAndroid Build Coastguard Worker i3 += 8;
948*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
949*4bdc9457SAndroid Build Coastguard Worker i4 += 8;
950*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
951*4bdc9457SAndroid Build Coastguard Worker i5 += 8;
952*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
953*4bdc9457SAndroid Build Coastguard Worker i6 += 8;
954*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
955*4bdc9457SAndroid Build Coastguard Worker i7 += 8;
956*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
957*4bdc9457SAndroid Build Coastguard Worker i8 += 8;
958*4bdc9457SAndroid Build Coastguard Worker
959*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax018 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vi8);
960*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
961*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
962*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
963*4bdc9457SAndroid Build Coastguard Worker
964*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
965*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax01678 = _mm256_max_ps(vmax018, vmax67);
966*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_max_ps(vmax2345, vmax01678);
967*4bdc9457SAndroid Build Coastguard Worker const __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
968*4bdc9457SAndroid Build Coastguard Worker
969*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
970*4bdc9457SAndroid Build Coastguard Worker o += 8;
971*4bdc9457SAndroid Build Coastguard Worker }
972*4bdc9457SAndroid Build Coastguard Worker if (c != 0) {
973*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
974*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
975*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
976*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
977*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
978*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
979*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
980*4bdc9457SAndroid Build Coastguard Worker i3 += 8;
981*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
982*4bdc9457SAndroid Build Coastguard Worker i4 += 8;
983*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
984*4bdc9457SAndroid Build Coastguard Worker i5 += 8;
985*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
986*4bdc9457SAndroid Build Coastguard Worker i6 += 8;
987*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
988*4bdc9457SAndroid Build Coastguard Worker i7 += 8;
989*4bdc9457SAndroid Build Coastguard Worker const __m256 vi8 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i8));
990*4bdc9457SAndroid Build Coastguard Worker i8 += 8;
991*4bdc9457SAndroid Build Coastguard Worker
992*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax018 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vi8);
993*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
994*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
995*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
996*4bdc9457SAndroid Build Coastguard Worker
997*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
998*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax01678 = _mm256_max_ps(vmax018, vmax67);
999*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_max_ps(vmax2345, vmax01678);
1000*4bdc9457SAndroid Build Coastguard Worker __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
1001*4bdc9457SAndroid Build Coastguard Worker
1002*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
1003*4bdc9457SAndroid Build Coastguard Worker if (c & 4) {
1004*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh);
1005*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
1006*4bdc9457SAndroid Build Coastguard Worker o += 4;
1007*4bdc9457SAndroid Build Coastguard Worker }
1008*4bdc9457SAndroid Build Coastguard Worker if (c & 2) {
1009*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh);
1010*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
1011*4bdc9457SAndroid Build Coastguard Worker o += 2;
1012*4bdc9457SAndroid Build Coastguard Worker }
1013*4bdc9457SAndroid Build Coastguard Worker if (c & 1) {
1014*4bdc9457SAndroid Build Coastguard Worker *o = _mm_extract_epi16(vh, 0);
1015*4bdc9457SAndroid Build Coastguard Worker o += 1;
1016*4bdc9457SAndroid Build Coastguard Worker }
1017*4bdc9457SAndroid Build Coastguard Worker }
1018*4bdc9457SAndroid Build Coastguard Worker }
1019*4bdc9457SAndroid Build Coastguard Worker
1020*4bdc9457SAndroid Build Coastguard Worker for (ptrdiff_t k = (ptrdiff_t) kernel_elements - 9; k > 0; k -= 8) {
1021*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i0 = *input++;
1022*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i1 = *input++;
1023*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i2 = *input++;
1024*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i3 = *input++;
1025*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i4 = *input++;
1026*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i5 = *input++;
1027*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i6 = *input++;
1028*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i7 = *input++;
1029*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint16_t*) ((uintptr_t) i0 + input_offset);
1030*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) ((uintptr_t) i1 + input_offset);
1031*4bdc9457SAndroid Build Coastguard Worker i2 = (const uint16_t*) ((uintptr_t) i2 + input_offset);
1032*4bdc9457SAndroid Build Coastguard Worker i3 = (const uint16_t*) ((uintptr_t) i3 + input_offset);
1033*4bdc9457SAndroid Build Coastguard Worker i4 = (const uint16_t*) ((uintptr_t) i4 + input_offset);
1034*4bdc9457SAndroid Build Coastguard Worker i5 = (const uint16_t*) ((uintptr_t) i5 + input_offset);
1035*4bdc9457SAndroid Build Coastguard Worker i6 = (const uint16_t*) ((uintptr_t) i6 + input_offset);
1036*4bdc9457SAndroid Build Coastguard Worker i7 = (const uint16_t*) ((uintptr_t) i7 + input_offset);
1037*4bdc9457SAndroid Build Coastguard Worker if (k < 2) {
1038*4bdc9457SAndroid Build Coastguard Worker i1 = i0;
1039*4bdc9457SAndroid Build Coastguard Worker }
1040*4bdc9457SAndroid Build Coastguard Worker if (k <= 2) {
1041*4bdc9457SAndroid Build Coastguard Worker i2 = i0;
1042*4bdc9457SAndroid Build Coastguard Worker }
1043*4bdc9457SAndroid Build Coastguard Worker if (k < 4) {
1044*4bdc9457SAndroid Build Coastguard Worker i3 = i0;
1045*4bdc9457SAndroid Build Coastguard Worker }
1046*4bdc9457SAndroid Build Coastguard Worker if (k <= 4) {
1047*4bdc9457SAndroid Build Coastguard Worker i4 = i0;
1048*4bdc9457SAndroid Build Coastguard Worker }
1049*4bdc9457SAndroid Build Coastguard Worker if (k < 6) {
1050*4bdc9457SAndroid Build Coastguard Worker i5 = i0;
1051*4bdc9457SAndroid Build Coastguard Worker }
1052*4bdc9457SAndroid Build Coastguard Worker if (k <= 6) {
1053*4bdc9457SAndroid Build Coastguard Worker i6 = i0;
1054*4bdc9457SAndroid Build Coastguard Worker }
1055*4bdc9457SAndroid Build Coastguard Worker if (k < 8) {
1056*4bdc9457SAndroid Build Coastguard Worker i7 = i0;
1057*4bdc9457SAndroid Build Coastguard Worker }
1058*4bdc9457SAndroid Build Coastguard Worker
1059*4bdc9457SAndroid Build Coastguard Worker o = output;
1060*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
1061*4bdc9457SAndroid Build Coastguard Worker for (; c >= 8; c -= 8) {
1062*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1063*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
1064*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1065*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
1066*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
1067*4bdc9457SAndroid Build Coastguard Worker i2 += 8;
1068*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
1069*4bdc9457SAndroid Build Coastguard Worker i3 += 8;
1070*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
1071*4bdc9457SAndroid Build Coastguard Worker i4 += 8;
1072*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
1073*4bdc9457SAndroid Build Coastguard Worker i5 += 8;
1074*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
1075*4bdc9457SAndroid Build Coastguard Worker i6 += 8;
1076*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
1077*4bdc9457SAndroid Build Coastguard Worker i7 += 8;
1078*4bdc9457SAndroid Build Coastguard Worker const __m256 vo = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o));
1079*4bdc9457SAndroid Build Coastguard Worker
1080*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax01 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vo);
1081*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
1082*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
1083*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
1084*4bdc9457SAndroid Build Coastguard Worker
1085*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
1086*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax0167 = _mm256_max_ps(vmax01, vmax67);
1087*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_max_ps(vmax2345, vmax0167);
1088*4bdc9457SAndroid Build Coastguard Worker const __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
1089*4bdc9457SAndroid Build Coastguard Worker
1090*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC));
1091*4bdc9457SAndroid Build Coastguard Worker o += 8;
1092*4bdc9457SAndroid Build Coastguard Worker }
1093*4bdc9457SAndroid Build Coastguard Worker if (c != 0) {
1094*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1095*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1096*4bdc9457SAndroid Build Coastguard Worker const __m256 vi2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i2));
1097*4bdc9457SAndroid Build Coastguard Worker const __m256 vi3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i3));
1098*4bdc9457SAndroid Build Coastguard Worker const __m256 vi4 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i4));
1099*4bdc9457SAndroid Build Coastguard Worker const __m256 vi5 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i5));
1100*4bdc9457SAndroid Build Coastguard Worker const __m256 vi6 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i6));
1101*4bdc9457SAndroid Build Coastguard Worker const __m256 vi7 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i7));
1102*4bdc9457SAndroid Build Coastguard Worker const __m256 vo = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) o));
1103*4bdc9457SAndroid Build Coastguard Worker
1104*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax01 = _mm256_max_ps(_mm256_max_ps(vi0, vi1), vo);
1105*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax23 = _mm256_max_ps(vi2, vi3);
1106*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax45 = _mm256_max_ps(vi4, vi5);
1107*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax67 = _mm256_max_ps(vi6, vi7);
1108*4bdc9457SAndroid Build Coastguard Worker
1109*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax2345 = _mm256_max_ps(vmax23, vmax45);
1110*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax0167 = _mm256_max_ps(vmax01, vmax67);
1111*4bdc9457SAndroid Build Coastguard Worker const __m256 vmax = _mm256_max_ps(vmax2345, vmax0167);
1112*4bdc9457SAndroid Build Coastguard Worker __m256 vout = _mm256_max_ps(_mm256_min_ps(vmax, voutput_max), voutput_min);
1113*4bdc9457SAndroid Build Coastguard Worker
1114*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vout, _MM_FROUND_NO_EXC);
1115*4bdc9457SAndroid Build Coastguard Worker if (c & 4) {
1116*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh);
1117*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
1118*4bdc9457SAndroid Build Coastguard Worker o += 4;
1119*4bdc9457SAndroid Build Coastguard Worker }
1120*4bdc9457SAndroid Build Coastguard Worker if (c & 2) {
1121*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh);
1122*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
1123*4bdc9457SAndroid Build Coastguard Worker o += 2;
1124*4bdc9457SAndroid Build Coastguard Worker }
1125*4bdc9457SAndroid Build Coastguard Worker if (c & 1) {
1126*4bdc9457SAndroid Build Coastguard Worker *o = _mm_extract_epi16(vh, 0);
1127*4bdc9457SAndroid Build Coastguard Worker o += 1;
1128*4bdc9457SAndroid Build Coastguard Worker }
1129*4bdc9457SAndroid Build Coastguard Worker }
1130*4bdc9457SAndroid Build Coastguard Worker }
1131*4bdc9457SAndroid Build Coastguard Worker input = (const void**) ((uintptr_t) input + input_increment);
1132*4bdc9457SAndroid Build Coastguard Worker output = (uint16_t*) ((uintptr_t) o + output_increment);
1133*4bdc9457SAndroid Build Coastguard Worker } while (--output_pixels != 0);
1134*4bdc9457SAndroid Build Coastguard Worker }
1135*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_prelu_ukernel__f16c_2x16(size_t rows,size_t channels,const void * restrict input,size_t input_stride,const void * restrict weights,void * restrict output,size_t output_stride)1136*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_prelu_ukernel__f16c_2x16(
1137*4bdc9457SAndroid Build Coastguard Worker size_t rows,
1138*4bdc9457SAndroid Build Coastguard Worker size_t channels,
1139*4bdc9457SAndroid Build Coastguard Worker const void* restrict input,
1140*4bdc9457SAndroid Build Coastguard Worker size_t input_stride,
1141*4bdc9457SAndroid Build Coastguard Worker const void* restrict weights,
1142*4bdc9457SAndroid Build Coastguard Worker void* restrict output,
1143*4bdc9457SAndroid Build Coastguard Worker size_t output_stride) XNN_OOB_READS
1144*4bdc9457SAndroid Build Coastguard Worker {
1145*4bdc9457SAndroid Build Coastguard Worker assert(rows != 0);
1146*4bdc9457SAndroid Build Coastguard Worker assert(channels != 0);
1147*4bdc9457SAndroid Build Coastguard Worker assert(channels % sizeof(uint16_t) == 0);
1148*4bdc9457SAndroid Build Coastguard Worker
1149*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i0 = (const uint16_t*) input;
1150*4bdc9457SAndroid Build Coastguard Worker uint16_t* o0 = (uint16_t*) output;
1151*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i1 = (const uint16_t*) ((uintptr_t) i0 + input_stride);
1152*4bdc9457SAndroid Build Coastguard Worker uint16_t* o1 = (uint16_t*) ((uintptr_t) o0 + output_stride);
1153*4bdc9457SAndroid Build Coastguard Worker
1154*4bdc9457SAndroid Build Coastguard Worker const size_t input_increment = input_stride * 2 - channels;
1155*4bdc9457SAndroid Build Coastguard Worker const size_t output_increment = output_stride * 2 - channels;
1156*4bdc9457SAndroid Build Coastguard Worker
1157*4bdc9457SAndroid Build Coastguard Worker do {
1158*4bdc9457SAndroid Build Coastguard Worker if XNN_UNPREDICTABLE(rows < 2) {
1159*4bdc9457SAndroid Build Coastguard Worker i1 = i0;
1160*4bdc9457SAndroid Build Coastguard Worker o1 = o0;
1161*4bdc9457SAndroid Build Coastguard Worker }
1162*4bdc9457SAndroid Build Coastguard Worker
1163*4bdc9457SAndroid Build Coastguard Worker const uint16_t* w = (const uint16_t*) weights;
1164*4bdc9457SAndroid Build Coastguard Worker size_t c = channels;
1165*4bdc9457SAndroid Build Coastguard Worker for (; c >= 16 * sizeof(uint16_t); c -= 16 * sizeof(uint16_t)) {
1166*4bdc9457SAndroid Build Coastguard Worker const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
1167*4bdc9457SAndroid Build Coastguard Worker const __m256 vw89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (w + 8)));
1168*4bdc9457SAndroid Build Coastguard Worker w += 16;
1169*4bdc9457SAndroid Build Coastguard Worker
1170*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x001234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1171*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x089ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i0 + 8)));
1172*4bdc9457SAndroid Build Coastguard Worker i0 += 16;
1173*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x001234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1174*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x089ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i1 + 8)));
1175*4bdc9457SAndroid Build Coastguard Worker i1 += 16;
1176*4bdc9457SAndroid Build Coastguard Worker
1177*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x001234567 = _mm256_mul_ps(vi0x001234567, vw01234567);
1178*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x089ABCDEF = _mm256_mul_ps(vi0x089ABCDEF, vw89ABCDEF);
1179*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1x001234567 = _mm256_mul_ps(vi1x001234567, vw01234567);
1180*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1x089ABCDEF = _mm256_mul_ps(vi1x089ABCDEF, vw89ABCDEF);
1181*4bdc9457SAndroid Build Coastguard Worker
1182*4bdc9457SAndroid Build Coastguard Worker vacc0x001234567 = _mm256_blendv_ps(vi0x001234567, vacc0x001234567, vi0x001234567);
1183*4bdc9457SAndroid Build Coastguard Worker vacc0x089ABCDEF = _mm256_blendv_ps(vi0x089ABCDEF, vacc0x089ABCDEF, vi0x089ABCDEF);
1184*4bdc9457SAndroid Build Coastguard Worker vacc1x001234567 = _mm256_blendv_ps(vi1x001234567, vacc1x001234567, vi1x001234567);
1185*4bdc9457SAndroid Build Coastguard Worker vacc1x089ABCDEF = _mm256_blendv_ps(vi1x089ABCDEF, vacc1x089ABCDEF, vi1x089ABCDEF);
1186*4bdc9457SAndroid Build Coastguard Worker
1187*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o0, _mm256_cvtps_ph(vacc0x089ABCDEF, _MM_FROUND_NO_EXC));
1188*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (o0 + 0), _mm256_cvtps_ph(vacc0x001234567, _MM_FROUND_NO_EXC));
1189*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (o0 + 8), _mm256_cvtps_ph(vacc0x089ABCDEF, _MM_FROUND_NO_EXC));
1190*4bdc9457SAndroid Build Coastguard Worker o0 += 16;
1191*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1x089ABCDEF, _MM_FROUND_NO_EXC));
1192*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (o1 + 0), _mm256_cvtps_ph(vacc1x001234567, _MM_FROUND_NO_EXC));
1193*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (o1 + 8), _mm256_cvtps_ph(vacc1x089ABCDEF, _MM_FROUND_NO_EXC));
1194*4bdc9457SAndroid Build Coastguard Worker o1 += 16;
1195*4bdc9457SAndroid Build Coastguard Worker }
1196*4bdc9457SAndroid Build Coastguard Worker for (; c >= 8 * sizeof(uint16_t); c -= 8 * sizeof(uint16_t)) {
1197*4bdc9457SAndroid Build Coastguard Worker const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
1198*4bdc9457SAndroid Build Coastguard Worker w += 8;
1199*4bdc9457SAndroid Build Coastguard Worker
1200*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1201*4bdc9457SAndroid Build Coastguard Worker i0 += 8;
1202*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1203*4bdc9457SAndroid Build Coastguard Worker i1 += 8;
1204*4bdc9457SAndroid Build Coastguard Worker
1205*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567);
1206*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567);
1207*4bdc9457SAndroid Build Coastguard Worker
1208*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567);
1209*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567);
1210*4bdc9457SAndroid Build Coastguard Worker
1211*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o0, _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC));
1212*4bdc9457SAndroid Build Coastguard Worker o0 += 8;
1213*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o1, _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC));
1214*4bdc9457SAndroid Build Coastguard Worker o1 += 8;
1215*4bdc9457SAndroid Build Coastguard Worker }
1216*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(c != 0) {
1217*4bdc9457SAndroid Build Coastguard Worker const __m256 vw01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) w));
1218*4bdc9457SAndroid Build Coastguard Worker
1219*4bdc9457SAndroid Build Coastguard Worker const __m256 vi0x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i0));
1220*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint16_t*) ((uintptr_t) i0 + c);
1221*4bdc9457SAndroid Build Coastguard Worker const __m256 vi1x01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i1));
1222*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) ((uintptr_t) i1 + c);
1223*4bdc9457SAndroid Build Coastguard Worker
1224*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0x01234567 = _mm256_mul_ps(vi0x01234567, vw01234567);
1225*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1x01234567 = _mm256_mul_ps(vi1x01234567, vw01234567);
1226*4bdc9457SAndroid Build Coastguard Worker
1227*4bdc9457SAndroid Build Coastguard Worker vacc0x01234567 = _mm256_blendv_ps(vi0x01234567, vacc0x01234567, vi0x01234567);
1228*4bdc9457SAndroid Build Coastguard Worker vacc1x01234567 = _mm256_blendv_ps(vi1x01234567, vacc1x01234567, vi1x01234567);
1229*4bdc9457SAndroid Build Coastguard Worker
1230*4bdc9457SAndroid Build Coastguard Worker __m128i vh0x01234567 = _mm256_cvtps_ph(vacc0x01234567, _MM_FROUND_NO_EXC);
1231*4bdc9457SAndroid Build Coastguard Worker __m128i vh1x01234567 = _mm256_cvtps_ph(vacc1x01234567, _MM_FROUND_NO_EXC);
1232*4bdc9457SAndroid Build Coastguard Worker if (c & (4 * sizeof(uint16_t))) {
1233*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o0, vh0x01234567);
1234*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o1, vh1x01234567);
1235*4bdc9457SAndroid Build Coastguard Worker
1236*4bdc9457SAndroid Build Coastguard Worker vh0x01234567 = _mm_unpackhi_epi64(vh0x01234567, vh0x01234567);
1237*4bdc9457SAndroid Build Coastguard Worker vh1x01234567 = _mm_unpackhi_epi64(vh1x01234567, vh1x01234567);
1238*4bdc9457SAndroid Build Coastguard Worker
1239*4bdc9457SAndroid Build Coastguard Worker o0 += 4;
1240*4bdc9457SAndroid Build Coastguard Worker o1 += 4;
1241*4bdc9457SAndroid Build Coastguard Worker }
1242*4bdc9457SAndroid Build Coastguard Worker if (c & (2 * sizeof(uint16_t))) {
1243*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o0, vh0x01234567);
1244*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o1, vh1x01234567);
1245*4bdc9457SAndroid Build Coastguard Worker
1246*4bdc9457SAndroid Build Coastguard Worker vh0x01234567 = _mm_srli_epi64(vh0x01234567, 32);
1247*4bdc9457SAndroid Build Coastguard Worker vh1x01234567 = _mm_srli_epi64(vh1x01234567, 32);
1248*4bdc9457SAndroid Build Coastguard Worker
1249*4bdc9457SAndroid Build Coastguard Worker o0 += 2;
1250*4bdc9457SAndroid Build Coastguard Worker o1 += 2;
1251*4bdc9457SAndroid Build Coastguard Worker }
1252*4bdc9457SAndroid Build Coastguard Worker if (c & (1 * sizeof(uint16_t))) {
1253*4bdc9457SAndroid Build Coastguard Worker *o0 = (uint16_t) _mm_extract_epi16(vh0x01234567, 0);
1254*4bdc9457SAndroid Build Coastguard Worker *o1 = (uint16_t) _mm_extract_epi16(vh1x01234567, 0);
1255*4bdc9457SAndroid Build Coastguard Worker
1256*4bdc9457SAndroid Build Coastguard Worker o0 += 1;
1257*4bdc9457SAndroid Build Coastguard Worker o1 += 1;
1258*4bdc9457SAndroid Build Coastguard Worker }
1259*4bdc9457SAndroid Build Coastguard Worker }
1260*4bdc9457SAndroid Build Coastguard Worker i0 = (const uint16_t*) ((uintptr_t) i0 + input_increment);
1261*4bdc9457SAndroid Build Coastguard Worker o0 = (uint16_t*) ((uintptr_t) o0 + output_increment);
1262*4bdc9457SAndroid Build Coastguard Worker i1 = (const uint16_t*) ((uintptr_t) i1 + input_increment);
1263*4bdc9457SAndroid Build Coastguard Worker o1 = (uint16_t*) ((uintptr_t) o1 + output_increment);
1264*4bdc9457SAndroid Build Coastguard Worker rows = doz(rows, 2);
1265*4bdc9457SAndroid Build Coastguard Worker } while (rows != 0);
1266*4bdc9457SAndroid Build Coastguard Worker }
1267*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_rmax_ukernel__f16c(size_t batch,const void * input,void * output)1268*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_rmax_ukernel__f16c(
1269*4bdc9457SAndroid Build Coastguard Worker size_t batch,
1270*4bdc9457SAndroid Build Coastguard Worker const void* input,
1271*4bdc9457SAndroid Build Coastguard Worker void* output) XNN_OOB_READS
1272*4bdc9457SAndroid Build Coastguard Worker {
1273*4bdc9457SAndroid Build Coastguard Worker assert(batch != 0);
1274*4bdc9457SAndroid Build Coastguard Worker assert(batch % sizeof(uint16_t) == 0);
1275*4bdc9457SAndroid Build Coastguard Worker
1276*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i = (const uint16_t*) input;
1277*4bdc9457SAndroid Build Coastguard Worker __m128i vmax_init = _mm_shufflelo_epi16(_mm_loadl_epi64((const __m128i*) i), _MM_SHUFFLE(0, 0, 0, 0));
1278*4bdc9457SAndroid Build Coastguard Worker vmax_init = _mm_unpacklo_epi64(vmax_init, vmax_init);
1279*4bdc9457SAndroid Build Coastguard Worker __m256 vmax0 = _mm256_cvtph_ps(vmax_init);
1280*4bdc9457SAndroid Build Coastguard Worker __m256 vmax1 = vmax0;
1281*4bdc9457SAndroid Build Coastguard Worker __m256 vmax2 = vmax0;
1282*4bdc9457SAndroid Build Coastguard Worker __m256 vmax3 = vmax0;
1283*4bdc9457SAndroid Build Coastguard Worker for (; batch >= 32 * sizeof(uint16_t); batch -= 32 * sizeof(uint16_t)) {
1284*4bdc9457SAndroid Build Coastguard Worker const __m256 vx0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1285*4bdc9457SAndroid Build Coastguard Worker const __m256 vx1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
1286*4bdc9457SAndroid Build Coastguard Worker const __m256 vx2 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 16)));
1287*4bdc9457SAndroid Build Coastguard Worker const __m256 vx3 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 24)));
1288*4bdc9457SAndroid Build Coastguard Worker i += 32;
1289*4bdc9457SAndroid Build Coastguard Worker
1290*4bdc9457SAndroid Build Coastguard Worker vmax0 = _mm256_max_ps(vmax0, vx0);
1291*4bdc9457SAndroid Build Coastguard Worker vmax1 = _mm256_max_ps(vmax1, vx1);
1292*4bdc9457SAndroid Build Coastguard Worker vmax2 = _mm256_max_ps(vmax2, vx2);
1293*4bdc9457SAndroid Build Coastguard Worker vmax3 = _mm256_max_ps(vmax3, vx3);
1294*4bdc9457SAndroid Build Coastguard Worker }
1295*4bdc9457SAndroid Build Coastguard Worker __m256 vmax = _mm256_max_ps(_mm256_max_ps(vmax0, vmax1), _mm256_max_ps(vmax2, vmax3));
1296*4bdc9457SAndroid Build Coastguard Worker for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) {
1297*4bdc9457SAndroid Build Coastguard Worker const __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1298*4bdc9457SAndroid Build Coastguard Worker i += 8;
1299*4bdc9457SAndroid Build Coastguard Worker vmax = _mm256_max_ps(vmax, vx);
1300*4bdc9457SAndroid Build Coastguard Worker }
1301*4bdc9457SAndroid Build Coastguard Worker __m128 vmax_lo = _mm_max_ps(_mm256_castps256_ps128(vmax), _mm256_extractf128_ps(vmax, 1));
1302*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(batch != 0) {
1303*4bdc9457SAndroid Build Coastguard Worker const __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
1304*4bdc9457SAndroid Build Coastguard Worker __m128 vx_lo = _mm256_castps256_ps128(vx);
1305*4bdc9457SAndroid Build Coastguard Worker if (batch & (4 * sizeof(uint16_t))) {
1306*4bdc9457SAndroid Build Coastguard Worker vmax_lo = _mm_max_ps(vmax_lo, vx_lo);
1307*4bdc9457SAndroid Build Coastguard Worker vx_lo = _mm256_extractf128_ps(vx, 1);
1308*4bdc9457SAndroid Build Coastguard Worker }
1309*4bdc9457SAndroid Build Coastguard Worker if (batch & (2 * sizeof(uint16_t))) {
1310*4bdc9457SAndroid Build Coastguard Worker vmax_lo = _mm_blend_ps(_mm_max_ps(vmax_lo, vx_lo), vmax_lo, 0xC);
1311*4bdc9457SAndroid Build Coastguard Worker vx_lo = _mm_movehl_ps(vx_lo, vx_lo);
1312*4bdc9457SAndroid Build Coastguard Worker }
1313*4bdc9457SAndroid Build Coastguard Worker if (batch & (1 * sizeof(uint16_t))) {
1314*4bdc9457SAndroid Build Coastguard Worker vmax_lo = _mm_max_ss(vmax_lo, vx_lo);
1315*4bdc9457SAndroid Build Coastguard Worker }
1316*4bdc9457SAndroid Build Coastguard Worker }
1317*4bdc9457SAndroid Build Coastguard Worker vmax_lo = _mm_max_ps(vmax_lo, _mm_movehl_ps(vmax_lo, vmax_lo));
1318*4bdc9457SAndroid Build Coastguard Worker vmax_lo = _mm_max_ss(vmax_lo, _mm_movehdup_ps(vmax_lo));
1319*4bdc9457SAndroid Build Coastguard Worker *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(_mm_cvtps_ph(vmax_lo, _MM_FROUND_NO_EXC), 0);
1320*4bdc9457SAndroid Build Coastguard Worker }
1321*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vadd_minmax_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1322*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vadd_minmax_ukernel__f16c_x16(
1323*4bdc9457SAndroid Build Coastguard Worker size_t n,
1324*4bdc9457SAndroid Build Coastguard Worker const void* restrict a_ptr,
1325*4bdc9457SAndroid Build Coastguard Worker const void* restrict b_ptr,
1326*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
1327*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1328*4bdc9457SAndroid Build Coastguard Worker {
1329*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
1330*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
1331*4bdc9457SAndroid Build Coastguard Worker assert(a_ptr != NULL);
1332*4bdc9457SAndroid Build Coastguard Worker assert(b_ptr != NULL);
1333*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
1334*4bdc9457SAndroid Build Coastguard Worker
1335*4bdc9457SAndroid Build Coastguard Worker const uint16_t* a = (const uint16_t*) a_ptr;
1336*4bdc9457SAndroid Build Coastguard Worker const uint16_t* b = (const uint16_t*) b_ptr;
1337*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
1338*4bdc9457SAndroid Build Coastguard Worker
1339*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_min = _mm256_load_ps(params->avx.min);
1340*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_max = _mm256_load_ps(params->avx.max);
1341*4bdc9457SAndroid Build Coastguard Worker
1342*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1343*4bdc9457SAndroid Build Coastguard Worker const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1344*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1345*4bdc9457SAndroid Build Coastguard Worker const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1346*4bdc9457SAndroid Build Coastguard Worker const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
1347*4bdc9457SAndroid Build Coastguard Worker a += 16;
1348*4bdc9457SAndroid Build Coastguard Worker b += 16;
1349*4bdc9457SAndroid Build Coastguard Worker
1350*4bdc9457SAndroid Build Coastguard Worker __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
1351*4bdc9457SAndroid Build Coastguard Worker __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
1352*4bdc9457SAndroid Build Coastguard Worker
1353*4bdc9457SAndroid Build Coastguard Worker
1354*4bdc9457SAndroid Build Coastguard Worker vy01234567 = _mm256_max_ps(vy01234567, vy_min);
1355*4bdc9457SAndroid Build Coastguard Worker vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
1356*4bdc9457SAndroid Build Coastguard Worker
1357*4bdc9457SAndroid Build Coastguard Worker vy01234567 = _mm256_min_ps(vy01234567, vy_max);
1358*4bdc9457SAndroid Build Coastguard Worker vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
1359*4bdc9457SAndroid Build Coastguard Worker
1360*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1361*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1362*4bdc9457SAndroid Build Coastguard Worker y += 16;
1363*4bdc9457SAndroid Build Coastguard Worker }
1364*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1365*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1366*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1367*4bdc9457SAndroid Build Coastguard Worker a += 8;
1368*4bdc9457SAndroid Build Coastguard Worker b += 8;
1369*4bdc9457SAndroid Build Coastguard Worker
1370*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
1371*4bdc9457SAndroid Build Coastguard Worker
1372*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
1373*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
1374*4bdc9457SAndroid Build Coastguard Worker
1375*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1376*4bdc9457SAndroid Build Coastguard Worker y += 8;
1377*4bdc9457SAndroid Build Coastguard Worker }
1378*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
1379*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1380*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1381*4bdc9457SAndroid Build Coastguard Worker
1382*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
1383*4bdc9457SAndroid Build Coastguard Worker
1384*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
1385*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
1386*4bdc9457SAndroid Build Coastguard Worker
1387*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1388*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
1389*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
1390*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
1391*4bdc9457SAndroid Build Coastguard Worker y += 4;
1392*4bdc9457SAndroid Build Coastguard Worker }
1393*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
1394*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
1395*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
1396*4bdc9457SAndroid Build Coastguard Worker y += 2;
1397*4bdc9457SAndroid Build Coastguard Worker }
1398*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
1399*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vh, 0);
1400*4bdc9457SAndroid Build Coastguard Worker }
1401*4bdc9457SAndroid Build Coastguard Worker }
1402*4bdc9457SAndroid Build Coastguard Worker }
1403*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vaddc_minmax_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1404*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vaddc_minmax_ukernel__f16c_x16(
1405*4bdc9457SAndroid Build Coastguard Worker size_t n,
1406*4bdc9457SAndroid Build Coastguard Worker const void* restrict a_ptr,
1407*4bdc9457SAndroid Build Coastguard Worker const void* restrict b_ptr,
1408*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
1409*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1410*4bdc9457SAndroid Build Coastguard Worker {
1411*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
1412*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
1413*4bdc9457SAndroid Build Coastguard Worker assert(a_ptr != NULL);
1414*4bdc9457SAndroid Build Coastguard Worker assert(b_ptr != NULL);
1415*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
1416*4bdc9457SAndroid Build Coastguard Worker
1417*4bdc9457SAndroid Build Coastguard Worker const uint16_t* a = (const uint16_t*) a_ptr;
1418*4bdc9457SAndroid Build Coastguard Worker const uint16_t* b = (const uint16_t*) b_ptr;
1419*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
1420*4bdc9457SAndroid Build Coastguard Worker
1421*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_min = _mm256_load_ps(params->avx.min);
1422*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_max = _mm256_load_ps(params->avx.max);
1423*4bdc9457SAndroid Build Coastguard Worker
1424*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1425*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1426*4bdc9457SAndroid Build Coastguard Worker const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1427*4bdc9457SAndroid Build Coastguard Worker const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1428*4bdc9457SAndroid Build Coastguard Worker a += 16;
1429*4bdc9457SAndroid Build Coastguard Worker
1430*4bdc9457SAndroid Build Coastguard Worker __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va01234567, vb), _MM_FROUND_NO_EXC));
1431*4bdc9457SAndroid Build Coastguard Worker __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
1432*4bdc9457SAndroid Build Coastguard Worker
1433*4bdc9457SAndroid Build Coastguard Worker
1434*4bdc9457SAndroid Build Coastguard Worker vy01234567 = _mm256_max_ps(vy01234567, vy_min);
1435*4bdc9457SAndroid Build Coastguard Worker vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
1436*4bdc9457SAndroid Build Coastguard Worker
1437*4bdc9457SAndroid Build Coastguard Worker vy01234567 = _mm256_min_ps(vy01234567, vy_max);
1438*4bdc9457SAndroid Build Coastguard Worker vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
1439*4bdc9457SAndroid Build Coastguard Worker
1440*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1441*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1442*4bdc9457SAndroid Build Coastguard Worker y += 16;
1443*4bdc9457SAndroid Build Coastguard Worker }
1444*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1445*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1446*4bdc9457SAndroid Build Coastguard Worker a += 8;
1447*4bdc9457SAndroid Build Coastguard Worker
1448*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
1449*4bdc9457SAndroid Build Coastguard Worker
1450*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
1451*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
1452*4bdc9457SAndroid Build Coastguard Worker
1453*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1454*4bdc9457SAndroid Build Coastguard Worker y += 8;
1455*4bdc9457SAndroid Build Coastguard Worker }
1456*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
1457*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1458*4bdc9457SAndroid Build Coastguard Worker
1459*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_add_ps(va, vb), _MM_FROUND_NO_EXC));
1460*4bdc9457SAndroid Build Coastguard Worker
1461*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
1462*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
1463*4bdc9457SAndroid Build Coastguard Worker
1464*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1465*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
1466*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
1467*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
1468*4bdc9457SAndroid Build Coastguard Worker y += 4;
1469*4bdc9457SAndroid Build Coastguard Worker }
1470*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
1471*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
1472*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
1473*4bdc9457SAndroid Build Coastguard Worker y += 2;
1474*4bdc9457SAndroid Build Coastguard Worker }
1475*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
1476*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vh, 0);
1477*4bdc9457SAndroid Build Coastguard Worker }
1478*4bdc9457SAndroid Build Coastguard Worker }
1479*4bdc9457SAndroid Build Coastguard Worker }
1480*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vdiv_minmax_ukernel__f16c_x8(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1481*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vdiv_minmax_ukernel__f16c_x8(
1482*4bdc9457SAndroid Build Coastguard Worker size_t n,
1483*4bdc9457SAndroid Build Coastguard Worker const void* restrict a_ptr,
1484*4bdc9457SAndroid Build Coastguard Worker const void* restrict b_ptr,
1485*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
1486*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1487*4bdc9457SAndroid Build Coastguard Worker {
1488*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
1489*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
1490*4bdc9457SAndroid Build Coastguard Worker assert(a_ptr != NULL);
1491*4bdc9457SAndroid Build Coastguard Worker assert(b_ptr != NULL);
1492*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
1493*4bdc9457SAndroid Build Coastguard Worker
1494*4bdc9457SAndroid Build Coastguard Worker const uint16_t* a = (const uint16_t*) a_ptr;
1495*4bdc9457SAndroid Build Coastguard Worker const uint16_t* b = (const uint16_t*) b_ptr;
1496*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
1497*4bdc9457SAndroid Build Coastguard Worker
1498*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_min = _mm256_load_ps(params->avx.min);
1499*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_max = _mm256_load_ps(params->avx.max);
1500*4bdc9457SAndroid Build Coastguard Worker
1501*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1502*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1503*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1504*4bdc9457SAndroid Build Coastguard Worker a += 8;
1505*4bdc9457SAndroid Build Coastguard Worker b += 8;
1506*4bdc9457SAndroid Build Coastguard Worker
1507*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_NO_EXC));
1508*4bdc9457SAndroid Build Coastguard Worker
1509*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
1510*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
1511*4bdc9457SAndroid Build Coastguard Worker
1512*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1513*4bdc9457SAndroid Build Coastguard Worker y += 8;
1514*4bdc9457SAndroid Build Coastguard Worker }
1515*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
1516*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1517*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1518*4bdc9457SAndroid Build Coastguard Worker
1519*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_NO_EXC));
1520*4bdc9457SAndroid Build Coastguard Worker
1521*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
1522*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
1523*4bdc9457SAndroid Build Coastguard Worker
1524*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1525*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
1526*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
1527*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
1528*4bdc9457SAndroid Build Coastguard Worker y += 4;
1529*4bdc9457SAndroid Build Coastguard Worker }
1530*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
1531*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
1532*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
1533*4bdc9457SAndroid Build Coastguard Worker y += 2;
1534*4bdc9457SAndroid Build Coastguard Worker }
1535*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
1536*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vh, 0);
1537*4bdc9457SAndroid Build Coastguard Worker }
1538*4bdc9457SAndroid Build Coastguard Worker }
1539*4bdc9457SAndroid Build Coastguard Worker }
1540*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vdivc_minmax_ukernel__f16c_x8(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1541*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vdivc_minmax_ukernel__f16c_x8(
1542*4bdc9457SAndroid Build Coastguard Worker size_t n,
1543*4bdc9457SAndroid Build Coastguard Worker const void* restrict a_ptr,
1544*4bdc9457SAndroid Build Coastguard Worker const void* restrict b_ptr,
1545*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
1546*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1547*4bdc9457SAndroid Build Coastguard Worker {
1548*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
1549*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
1550*4bdc9457SAndroid Build Coastguard Worker assert(a_ptr != NULL);
1551*4bdc9457SAndroid Build Coastguard Worker assert(b_ptr != NULL);
1552*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
1553*4bdc9457SAndroid Build Coastguard Worker
1554*4bdc9457SAndroid Build Coastguard Worker const uint16_t* a = (const uint16_t*) a_ptr;
1555*4bdc9457SAndroid Build Coastguard Worker const uint16_t* b = (const uint16_t*) b_ptr;
1556*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
1557*4bdc9457SAndroid Build Coastguard Worker
1558*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_min = _mm256_load_ps(params->avx.min);
1559*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_max = _mm256_load_ps(params->avx.max);
1560*4bdc9457SAndroid Build Coastguard Worker
1561*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1562*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1563*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1564*4bdc9457SAndroid Build Coastguard Worker a += 8;
1565*4bdc9457SAndroid Build Coastguard Worker
1566*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_NO_EXC));
1567*4bdc9457SAndroid Build Coastguard Worker
1568*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
1569*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
1570*4bdc9457SAndroid Build Coastguard Worker
1571*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1572*4bdc9457SAndroid Build Coastguard Worker y += 8;
1573*4bdc9457SAndroid Build Coastguard Worker }
1574*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
1575*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1576*4bdc9457SAndroid Build Coastguard Worker
1577*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(va, vb), _MM_FROUND_NO_EXC));
1578*4bdc9457SAndroid Build Coastguard Worker
1579*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
1580*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
1581*4bdc9457SAndroid Build Coastguard Worker
1582*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1583*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
1584*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
1585*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
1586*4bdc9457SAndroid Build Coastguard Worker y += 4;
1587*4bdc9457SAndroid Build Coastguard Worker }
1588*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
1589*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
1590*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
1591*4bdc9457SAndroid Build Coastguard Worker y += 2;
1592*4bdc9457SAndroid Build Coastguard Worker }
1593*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
1594*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vh, 0);
1595*4bdc9457SAndroid Build Coastguard Worker }
1596*4bdc9457SAndroid Build Coastguard Worker }
1597*4bdc9457SAndroid Build Coastguard Worker }
1598*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vmax_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS (1)])1599*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vmax_ukernel__f16c_x16(
1600*4bdc9457SAndroid Build Coastguard Worker size_t n,
1601*4bdc9457SAndroid Build Coastguard Worker const void* restrict a_ptr,
1602*4bdc9457SAndroid Build Coastguard Worker const void* restrict b_ptr,
1603*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
1604*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1605*4bdc9457SAndroid Build Coastguard Worker {
1606*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
1607*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
1608*4bdc9457SAndroid Build Coastguard Worker assert(a_ptr != NULL);
1609*4bdc9457SAndroid Build Coastguard Worker assert(b_ptr != NULL);
1610*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
1611*4bdc9457SAndroid Build Coastguard Worker
1612*4bdc9457SAndroid Build Coastguard Worker const uint16_t* a = (const uint16_t*) a_ptr;
1613*4bdc9457SAndroid Build Coastguard Worker const uint16_t* b = (const uint16_t*) b_ptr;
1614*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
1615*4bdc9457SAndroid Build Coastguard Worker
1616*4bdc9457SAndroid Build Coastguard Worker
1617*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1618*4bdc9457SAndroid Build Coastguard Worker const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1619*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1620*4bdc9457SAndroid Build Coastguard Worker const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1621*4bdc9457SAndroid Build Coastguard Worker const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
1622*4bdc9457SAndroid Build Coastguard Worker a += 16;
1623*4bdc9457SAndroid Build Coastguard Worker b += 16;
1624*4bdc9457SAndroid Build Coastguard Worker
1625*4bdc9457SAndroid Build Coastguard Worker __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
1626*4bdc9457SAndroid Build Coastguard Worker __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
1627*4bdc9457SAndroid Build Coastguard Worker
1628*4bdc9457SAndroid Build Coastguard Worker
1629*4bdc9457SAndroid Build Coastguard Worker
1630*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1631*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1632*4bdc9457SAndroid Build Coastguard Worker y += 16;
1633*4bdc9457SAndroid Build Coastguard Worker }
1634*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1635*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1636*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1637*4bdc9457SAndroid Build Coastguard Worker a += 8;
1638*4bdc9457SAndroid Build Coastguard Worker b += 8;
1639*4bdc9457SAndroid Build Coastguard Worker
1640*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_NO_EXC));
1641*4bdc9457SAndroid Build Coastguard Worker
1642*4bdc9457SAndroid Build Coastguard Worker
1643*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1644*4bdc9457SAndroid Build Coastguard Worker y += 8;
1645*4bdc9457SAndroid Build Coastguard Worker }
1646*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
1647*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1648*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1649*4bdc9457SAndroid Build Coastguard Worker
1650*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_NO_EXC));
1651*4bdc9457SAndroid Build Coastguard Worker
1652*4bdc9457SAndroid Build Coastguard Worker
1653*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1654*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
1655*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
1656*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
1657*4bdc9457SAndroid Build Coastguard Worker y += 4;
1658*4bdc9457SAndroid Build Coastguard Worker }
1659*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
1660*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
1661*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
1662*4bdc9457SAndroid Build Coastguard Worker y += 2;
1663*4bdc9457SAndroid Build Coastguard Worker }
1664*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
1665*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vh, 0);
1666*4bdc9457SAndroid Build Coastguard Worker }
1667*4bdc9457SAndroid Build Coastguard Worker }
1668*4bdc9457SAndroid Build Coastguard Worker }
1669*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vmaxc_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS (1)])1670*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vmaxc_ukernel__f16c_x16(
1671*4bdc9457SAndroid Build Coastguard Worker size_t n,
1672*4bdc9457SAndroid Build Coastguard Worker const void* restrict a_ptr,
1673*4bdc9457SAndroid Build Coastguard Worker const void* restrict b_ptr,
1674*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
1675*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1676*4bdc9457SAndroid Build Coastguard Worker {
1677*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
1678*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
1679*4bdc9457SAndroid Build Coastguard Worker assert(a_ptr != NULL);
1680*4bdc9457SAndroid Build Coastguard Worker assert(b_ptr != NULL);
1681*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
1682*4bdc9457SAndroid Build Coastguard Worker
1683*4bdc9457SAndroid Build Coastguard Worker const uint16_t* a = (const uint16_t*) a_ptr;
1684*4bdc9457SAndroid Build Coastguard Worker const uint16_t* b = (const uint16_t*) b_ptr;
1685*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
1686*4bdc9457SAndroid Build Coastguard Worker
1687*4bdc9457SAndroid Build Coastguard Worker
1688*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1689*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1690*4bdc9457SAndroid Build Coastguard Worker const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1691*4bdc9457SAndroid Build Coastguard Worker const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1692*4bdc9457SAndroid Build Coastguard Worker a += 16;
1693*4bdc9457SAndroid Build Coastguard Worker
1694*4bdc9457SAndroid Build Coastguard Worker __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va01234567, vb), _MM_FROUND_NO_EXC));
1695*4bdc9457SAndroid Build Coastguard Worker __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
1696*4bdc9457SAndroid Build Coastguard Worker
1697*4bdc9457SAndroid Build Coastguard Worker
1698*4bdc9457SAndroid Build Coastguard Worker
1699*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1700*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1701*4bdc9457SAndroid Build Coastguard Worker y += 16;
1702*4bdc9457SAndroid Build Coastguard Worker }
1703*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1704*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1705*4bdc9457SAndroid Build Coastguard Worker a += 8;
1706*4bdc9457SAndroid Build Coastguard Worker
1707*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_NO_EXC));
1708*4bdc9457SAndroid Build Coastguard Worker
1709*4bdc9457SAndroid Build Coastguard Worker
1710*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1711*4bdc9457SAndroid Build Coastguard Worker y += 8;
1712*4bdc9457SAndroid Build Coastguard Worker }
1713*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
1714*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1715*4bdc9457SAndroid Build Coastguard Worker
1716*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_max_ps(va, vb), _MM_FROUND_NO_EXC));
1717*4bdc9457SAndroid Build Coastguard Worker
1718*4bdc9457SAndroid Build Coastguard Worker
1719*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1720*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
1721*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
1722*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
1723*4bdc9457SAndroid Build Coastguard Worker y += 4;
1724*4bdc9457SAndroid Build Coastguard Worker }
1725*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
1726*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
1727*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
1728*4bdc9457SAndroid Build Coastguard Worker y += 2;
1729*4bdc9457SAndroid Build Coastguard Worker }
1730*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
1731*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vh, 0);
1732*4bdc9457SAndroid Build Coastguard Worker }
1733*4bdc9457SAndroid Build Coastguard Worker }
1734*4bdc9457SAndroid Build Coastguard Worker }
1735*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vmin_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS (1)])1736*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vmin_ukernel__f16c_x16(
1737*4bdc9457SAndroid Build Coastguard Worker size_t n,
1738*4bdc9457SAndroid Build Coastguard Worker const void* restrict a_ptr,
1739*4bdc9457SAndroid Build Coastguard Worker const void* restrict b_ptr,
1740*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
1741*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1742*4bdc9457SAndroid Build Coastguard Worker {
1743*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
1744*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
1745*4bdc9457SAndroid Build Coastguard Worker assert(a_ptr != NULL);
1746*4bdc9457SAndroid Build Coastguard Worker assert(b_ptr != NULL);
1747*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
1748*4bdc9457SAndroid Build Coastguard Worker
1749*4bdc9457SAndroid Build Coastguard Worker const uint16_t* a = (const uint16_t*) a_ptr;
1750*4bdc9457SAndroid Build Coastguard Worker const uint16_t* b = (const uint16_t*) b_ptr;
1751*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
1752*4bdc9457SAndroid Build Coastguard Worker
1753*4bdc9457SAndroid Build Coastguard Worker
1754*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1755*4bdc9457SAndroid Build Coastguard Worker const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1756*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1757*4bdc9457SAndroid Build Coastguard Worker const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1758*4bdc9457SAndroid Build Coastguard Worker const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
1759*4bdc9457SAndroid Build Coastguard Worker a += 16;
1760*4bdc9457SAndroid Build Coastguard Worker b += 16;
1761*4bdc9457SAndroid Build Coastguard Worker
1762*4bdc9457SAndroid Build Coastguard Worker __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
1763*4bdc9457SAndroid Build Coastguard Worker __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
1764*4bdc9457SAndroid Build Coastguard Worker
1765*4bdc9457SAndroid Build Coastguard Worker
1766*4bdc9457SAndroid Build Coastguard Worker
1767*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1768*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1769*4bdc9457SAndroid Build Coastguard Worker y += 16;
1770*4bdc9457SAndroid Build Coastguard Worker }
1771*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1772*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1773*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1774*4bdc9457SAndroid Build Coastguard Worker a += 8;
1775*4bdc9457SAndroid Build Coastguard Worker b += 8;
1776*4bdc9457SAndroid Build Coastguard Worker
1777*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_NO_EXC));
1778*4bdc9457SAndroid Build Coastguard Worker
1779*4bdc9457SAndroid Build Coastguard Worker
1780*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1781*4bdc9457SAndroid Build Coastguard Worker y += 8;
1782*4bdc9457SAndroid Build Coastguard Worker }
1783*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
1784*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1785*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1786*4bdc9457SAndroid Build Coastguard Worker
1787*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_NO_EXC));
1788*4bdc9457SAndroid Build Coastguard Worker
1789*4bdc9457SAndroid Build Coastguard Worker
1790*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1791*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
1792*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
1793*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
1794*4bdc9457SAndroid Build Coastguard Worker y += 4;
1795*4bdc9457SAndroid Build Coastguard Worker }
1796*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
1797*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
1798*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
1799*4bdc9457SAndroid Build Coastguard Worker y += 2;
1800*4bdc9457SAndroid Build Coastguard Worker }
1801*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
1802*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vh, 0);
1803*4bdc9457SAndroid Build Coastguard Worker }
1804*4bdc9457SAndroid Build Coastguard Worker }
1805*4bdc9457SAndroid Build Coastguard Worker }
1806*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vminc_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS (1)])1807*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vminc_ukernel__f16c_x16(
1808*4bdc9457SAndroid Build Coastguard Worker size_t n,
1809*4bdc9457SAndroid Build Coastguard Worker const void* restrict a_ptr,
1810*4bdc9457SAndroid Build Coastguard Worker const void* restrict b_ptr,
1811*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
1812*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1813*4bdc9457SAndroid Build Coastguard Worker {
1814*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
1815*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
1816*4bdc9457SAndroid Build Coastguard Worker assert(a_ptr != NULL);
1817*4bdc9457SAndroid Build Coastguard Worker assert(b_ptr != NULL);
1818*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
1819*4bdc9457SAndroid Build Coastguard Worker
1820*4bdc9457SAndroid Build Coastguard Worker const uint16_t* a = (const uint16_t*) a_ptr;
1821*4bdc9457SAndroid Build Coastguard Worker const uint16_t* b = (const uint16_t*) b_ptr;
1822*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
1823*4bdc9457SAndroid Build Coastguard Worker
1824*4bdc9457SAndroid Build Coastguard Worker
1825*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1826*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1827*4bdc9457SAndroid Build Coastguard Worker const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1828*4bdc9457SAndroid Build Coastguard Worker const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1829*4bdc9457SAndroid Build Coastguard Worker a += 16;
1830*4bdc9457SAndroid Build Coastguard Worker
1831*4bdc9457SAndroid Build Coastguard Worker __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va01234567, vb), _MM_FROUND_NO_EXC));
1832*4bdc9457SAndroid Build Coastguard Worker __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
1833*4bdc9457SAndroid Build Coastguard Worker
1834*4bdc9457SAndroid Build Coastguard Worker
1835*4bdc9457SAndroid Build Coastguard Worker
1836*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1837*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1838*4bdc9457SAndroid Build Coastguard Worker y += 16;
1839*4bdc9457SAndroid Build Coastguard Worker }
1840*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1841*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1842*4bdc9457SAndroid Build Coastguard Worker a += 8;
1843*4bdc9457SAndroid Build Coastguard Worker
1844*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_NO_EXC));
1845*4bdc9457SAndroid Build Coastguard Worker
1846*4bdc9457SAndroid Build Coastguard Worker
1847*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1848*4bdc9457SAndroid Build Coastguard Worker y += 8;
1849*4bdc9457SAndroid Build Coastguard Worker }
1850*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
1851*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1852*4bdc9457SAndroid Build Coastguard Worker
1853*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_min_ps(va, vb), _MM_FROUND_NO_EXC));
1854*4bdc9457SAndroid Build Coastguard Worker
1855*4bdc9457SAndroid Build Coastguard Worker
1856*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1857*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
1858*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
1859*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
1860*4bdc9457SAndroid Build Coastguard Worker y += 4;
1861*4bdc9457SAndroid Build Coastguard Worker }
1862*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
1863*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
1864*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
1865*4bdc9457SAndroid Build Coastguard Worker y += 2;
1866*4bdc9457SAndroid Build Coastguard Worker }
1867*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
1868*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vh, 0);
1869*4bdc9457SAndroid Build Coastguard Worker }
1870*4bdc9457SAndroid Build Coastguard Worker }
1871*4bdc9457SAndroid Build Coastguard Worker }
1872*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vmul_minmax_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1873*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vmul_minmax_ukernel__f16c_x16(
1874*4bdc9457SAndroid Build Coastguard Worker size_t n,
1875*4bdc9457SAndroid Build Coastguard Worker const void* restrict a_ptr,
1876*4bdc9457SAndroid Build Coastguard Worker const void* restrict b_ptr,
1877*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
1878*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1879*4bdc9457SAndroid Build Coastguard Worker {
1880*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
1881*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
1882*4bdc9457SAndroid Build Coastguard Worker assert(a_ptr != NULL);
1883*4bdc9457SAndroid Build Coastguard Worker assert(b_ptr != NULL);
1884*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
1885*4bdc9457SAndroid Build Coastguard Worker
1886*4bdc9457SAndroid Build Coastguard Worker const uint16_t* a = (const uint16_t*) a_ptr;
1887*4bdc9457SAndroid Build Coastguard Worker const uint16_t* b = (const uint16_t*) b_ptr;
1888*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
1889*4bdc9457SAndroid Build Coastguard Worker
1890*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_min = _mm256_load_ps(params->avx.min);
1891*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_max = _mm256_load_ps(params->avx.max);
1892*4bdc9457SAndroid Build Coastguard Worker
1893*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1894*4bdc9457SAndroid Build Coastguard Worker const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1895*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1896*4bdc9457SAndroid Build Coastguard Worker const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1897*4bdc9457SAndroid Build Coastguard Worker const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
1898*4bdc9457SAndroid Build Coastguard Worker a += 16;
1899*4bdc9457SAndroid Build Coastguard Worker b += 16;
1900*4bdc9457SAndroid Build Coastguard Worker
1901*4bdc9457SAndroid Build Coastguard Worker __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
1902*4bdc9457SAndroid Build Coastguard Worker __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
1903*4bdc9457SAndroid Build Coastguard Worker
1904*4bdc9457SAndroid Build Coastguard Worker
1905*4bdc9457SAndroid Build Coastguard Worker vy01234567 = _mm256_max_ps(vy01234567, vy_min);
1906*4bdc9457SAndroid Build Coastguard Worker vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
1907*4bdc9457SAndroid Build Coastguard Worker
1908*4bdc9457SAndroid Build Coastguard Worker vy01234567 = _mm256_min_ps(vy01234567, vy_max);
1909*4bdc9457SAndroid Build Coastguard Worker vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
1910*4bdc9457SAndroid Build Coastguard Worker
1911*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1912*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1913*4bdc9457SAndroid Build Coastguard Worker y += 16;
1914*4bdc9457SAndroid Build Coastguard Worker }
1915*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1916*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1917*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1918*4bdc9457SAndroid Build Coastguard Worker a += 8;
1919*4bdc9457SAndroid Build Coastguard Worker b += 8;
1920*4bdc9457SAndroid Build Coastguard Worker
1921*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
1922*4bdc9457SAndroid Build Coastguard Worker
1923*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
1924*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
1925*4bdc9457SAndroid Build Coastguard Worker
1926*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
1927*4bdc9457SAndroid Build Coastguard Worker y += 8;
1928*4bdc9457SAndroid Build Coastguard Worker }
1929*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
1930*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1931*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
1932*4bdc9457SAndroid Build Coastguard Worker
1933*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
1934*4bdc9457SAndroid Build Coastguard Worker
1935*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
1936*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
1937*4bdc9457SAndroid Build Coastguard Worker
1938*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
1939*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
1940*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
1941*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
1942*4bdc9457SAndroid Build Coastguard Worker y += 4;
1943*4bdc9457SAndroid Build Coastguard Worker }
1944*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
1945*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
1946*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
1947*4bdc9457SAndroid Build Coastguard Worker y += 2;
1948*4bdc9457SAndroid Build Coastguard Worker }
1949*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
1950*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vh, 0);
1951*4bdc9457SAndroid Build Coastguard Worker }
1952*4bdc9457SAndroid Build Coastguard Worker }
1953*4bdc9457SAndroid Build Coastguard Worker }
1954*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vmulc_minmax_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])1955*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vmulc_minmax_ukernel__f16c_x16(
1956*4bdc9457SAndroid Build Coastguard Worker size_t n,
1957*4bdc9457SAndroid Build Coastguard Worker const void* restrict a_ptr,
1958*4bdc9457SAndroid Build Coastguard Worker const void* restrict b_ptr,
1959*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
1960*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
1961*4bdc9457SAndroid Build Coastguard Worker {
1962*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
1963*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
1964*4bdc9457SAndroid Build Coastguard Worker assert(a_ptr != NULL);
1965*4bdc9457SAndroid Build Coastguard Worker assert(b_ptr != NULL);
1966*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
1967*4bdc9457SAndroid Build Coastguard Worker
1968*4bdc9457SAndroid Build Coastguard Worker const uint16_t* a = (const uint16_t*) a_ptr;
1969*4bdc9457SAndroid Build Coastguard Worker const uint16_t* b = (const uint16_t*) b_ptr;
1970*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
1971*4bdc9457SAndroid Build Coastguard Worker
1972*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_min = _mm256_load_ps(params->avx.min);
1973*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_max = _mm256_load_ps(params->avx.max);
1974*4bdc9457SAndroid Build Coastguard Worker
1975*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
1976*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
1977*4bdc9457SAndroid Build Coastguard Worker const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1978*4bdc9457SAndroid Build Coastguard Worker const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
1979*4bdc9457SAndroid Build Coastguard Worker a += 16;
1980*4bdc9457SAndroid Build Coastguard Worker
1981*4bdc9457SAndroid Build Coastguard Worker __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va01234567, vb), _MM_FROUND_NO_EXC));
1982*4bdc9457SAndroid Build Coastguard Worker __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
1983*4bdc9457SAndroid Build Coastguard Worker
1984*4bdc9457SAndroid Build Coastguard Worker
1985*4bdc9457SAndroid Build Coastguard Worker vy01234567 = _mm256_max_ps(vy01234567, vy_min);
1986*4bdc9457SAndroid Build Coastguard Worker vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
1987*4bdc9457SAndroid Build Coastguard Worker
1988*4bdc9457SAndroid Build Coastguard Worker vy01234567 = _mm256_min_ps(vy01234567, vy_max);
1989*4bdc9457SAndroid Build Coastguard Worker vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
1990*4bdc9457SAndroid Build Coastguard Worker
1991*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
1992*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
1993*4bdc9457SAndroid Build Coastguard Worker y += 16;
1994*4bdc9457SAndroid Build Coastguard Worker }
1995*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
1996*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
1997*4bdc9457SAndroid Build Coastguard Worker a += 8;
1998*4bdc9457SAndroid Build Coastguard Worker
1999*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
2000*4bdc9457SAndroid Build Coastguard Worker
2001*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
2002*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
2003*4bdc9457SAndroid Build Coastguard Worker
2004*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2005*4bdc9457SAndroid Build Coastguard Worker y += 8;
2006*4bdc9457SAndroid Build Coastguard Worker }
2007*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
2008*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2009*4bdc9457SAndroid Build Coastguard Worker
2010*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(va, vb), _MM_FROUND_NO_EXC));
2011*4bdc9457SAndroid Build Coastguard Worker
2012*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
2013*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
2014*4bdc9457SAndroid Build Coastguard Worker
2015*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2016*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
2017*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
2018*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
2019*4bdc9457SAndroid Build Coastguard Worker y += 4;
2020*4bdc9457SAndroid Build Coastguard Worker }
2021*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
2022*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
2023*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
2024*4bdc9457SAndroid Build Coastguard Worker y += 2;
2025*4bdc9457SAndroid Build Coastguard Worker }
2026*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
2027*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vh, 0);
2028*4bdc9457SAndroid Build Coastguard Worker }
2029*4bdc9457SAndroid Build Coastguard Worker }
2030*4bdc9457SAndroid Build Coastguard Worker }
2031*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vrdivc_minmax_ukernel__f16c_x8(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2032*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vrdivc_minmax_ukernel__f16c_x8(
2033*4bdc9457SAndroid Build Coastguard Worker size_t n,
2034*4bdc9457SAndroid Build Coastguard Worker const void* restrict a_ptr,
2035*4bdc9457SAndroid Build Coastguard Worker const void* restrict b_ptr,
2036*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
2037*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2038*4bdc9457SAndroid Build Coastguard Worker {
2039*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
2040*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
2041*4bdc9457SAndroid Build Coastguard Worker assert(a_ptr != NULL);
2042*4bdc9457SAndroid Build Coastguard Worker assert(b_ptr != NULL);
2043*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
2044*4bdc9457SAndroid Build Coastguard Worker
2045*4bdc9457SAndroid Build Coastguard Worker const uint16_t* a = (const uint16_t*) a_ptr;
2046*4bdc9457SAndroid Build Coastguard Worker const uint16_t* b = (const uint16_t*) b_ptr;
2047*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
2048*4bdc9457SAndroid Build Coastguard Worker
2049*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_min = _mm256_load_ps(params->avx.min);
2050*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_max = _mm256_load_ps(params->avx.max);
2051*4bdc9457SAndroid Build Coastguard Worker
2052*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
2053*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2054*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2055*4bdc9457SAndroid Build Coastguard Worker a += 8;
2056*4bdc9457SAndroid Build Coastguard Worker
2057*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(vb, va), _MM_FROUND_NO_EXC));
2058*4bdc9457SAndroid Build Coastguard Worker
2059*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
2060*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
2061*4bdc9457SAndroid Build Coastguard Worker
2062*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2063*4bdc9457SAndroid Build Coastguard Worker y += 8;
2064*4bdc9457SAndroid Build Coastguard Worker }
2065*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
2066*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2067*4bdc9457SAndroid Build Coastguard Worker
2068*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_div_ps(vb, va), _MM_FROUND_NO_EXC));
2069*4bdc9457SAndroid Build Coastguard Worker
2070*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
2071*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
2072*4bdc9457SAndroid Build Coastguard Worker
2073*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2074*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
2075*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
2076*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
2077*4bdc9457SAndroid Build Coastguard Worker y += 4;
2078*4bdc9457SAndroid Build Coastguard Worker }
2079*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
2080*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
2081*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
2082*4bdc9457SAndroid Build Coastguard Worker y += 2;
2083*4bdc9457SAndroid Build Coastguard Worker }
2084*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
2085*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vh, 0);
2086*4bdc9457SAndroid Build Coastguard Worker }
2087*4bdc9457SAndroid Build Coastguard Worker }
2088*4bdc9457SAndroid Build Coastguard Worker }
2089*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vrsubc_minmax_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2090*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vrsubc_minmax_ukernel__f16c_x16(
2091*4bdc9457SAndroid Build Coastguard Worker size_t n,
2092*4bdc9457SAndroid Build Coastguard Worker const void* restrict a_ptr,
2093*4bdc9457SAndroid Build Coastguard Worker const void* restrict b_ptr,
2094*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
2095*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2096*4bdc9457SAndroid Build Coastguard Worker {
2097*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
2098*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
2099*4bdc9457SAndroid Build Coastguard Worker assert(a_ptr != NULL);
2100*4bdc9457SAndroid Build Coastguard Worker assert(b_ptr != NULL);
2101*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
2102*4bdc9457SAndroid Build Coastguard Worker
2103*4bdc9457SAndroid Build Coastguard Worker const uint16_t* a = (const uint16_t*) a_ptr;
2104*4bdc9457SAndroid Build Coastguard Worker const uint16_t* b = (const uint16_t*) b_ptr;
2105*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
2106*4bdc9457SAndroid Build Coastguard Worker
2107*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_min = _mm256_load_ps(params->avx.min);
2108*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_max = _mm256_load_ps(params->avx.max);
2109*4bdc9457SAndroid Build Coastguard Worker
2110*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
2111*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2112*4bdc9457SAndroid Build Coastguard Worker const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2113*4bdc9457SAndroid Build Coastguard Worker const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
2114*4bdc9457SAndroid Build Coastguard Worker a += 16;
2115*4bdc9457SAndroid Build Coastguard Worker
2116*4bdc9457SAndroid Build Coastguard Worker __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va01234567), _MM_FROUND_NO_EXC));
2117*4bdc9457SAndroid Build Coastguard Worker __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va456789AB), _MM_FROUND_NO_EXC));
2118*4bdc9457SAndroid Build Coastguard Worker
2119*4bdc9457SAndroid Build Coastguard Worker
2120*4bdc9457SAndroid Build Coastguard Worker vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2121*4bdc9457SAndroid Build Coastguard Worker vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
2122*4bdc9457SAndroid Build Coastguard Worker
2123*4bdc9457SAndroid Build Coastguard Worker vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2124*4bdc9457SAndroid Build Coastguard Worker vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
2125*4bdc9457SAndroid Build Coastguard Worker
2126*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
2127*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
2128*4bdc9457SAndroid Build Coastguard Worker y += 16;
2129*4bdc9457SAndroid Build Coastguard Worker }
2130*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2131*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2132*4bdc9457SAndroid Build Coastguard Worker a += 8;
2133*4bdc9457SAndroid Build Coastguard Worker
2134*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va), _MM_FROUND_NO_EXC));
2135*4bdc9457SAndroid Build Coastguard Worker
2136*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
2137*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
2138*4bdc9457SAndroid Build Coastguard Worker
2139*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2140*4bdc9457SAndroid Build Coastguard Worker y += 8;
2141*4bdc9457SAndroid Build Coastguard Worker }
2142*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
2143*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2144*4bdc9457SAndroid Build Coastguard Worker
2145*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(vb, va), _MM_FROUND_NO_EXC));
2146*4bdc9457SAndroid Build Coastguard Worker
2147*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
2148*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
2149*4bdc9457SAndroid Build Coastguard Worker
2150*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2151*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
2152*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
2153*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
2154*4bdc9457SAndroid Build Coastguard Worker y += 4;
2155*4bdc9457SAndroid Build Coastguard Worker }
2156*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
2157*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
2158*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
2159*4bdc9457SAndroid Build Coastguard Worker y += 2;
2160*4bdc9457SAndroid Build Coastguard Worker }
2161*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
2162*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vh, 0);
2163*4bdc9457SAndroid Build Coastguard Worker }
2164*4bdc9457SAndroid Build Coastguard Worker }
2165*4bdc9457SAndroid Build Coastguard Worker }
2166*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vsqrdiff_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS (1)])2167*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vsqrdiff_ukernel__f16c_x16(
2168*4bdc9457SAndroid Build Coastguard Worker size_t n,
2169*4bdc9457SAndroid Build Coastguard Worker const void* restrict a_ptr,
2170*4bdc9457SAndroid Build Coastguard Worker const void* restrict b_ptr,
2171*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
2172*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2173*4bdc9457SAndroid Build Coastguard Worker {
2174*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
2175*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
2176*4bdc9457SAndroid Build Coastguard Worker assert(a_ptr != NULL);
2177*4bdc9457SAndroid Build Coastguard Worker assert(b_ptr != NULL);
2178*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
2179*4bdc9457SAndroid Build Coastguard Worker
2180*4bdc9457SAndroid Build Coastguard Worker const uint16_t* a = (const uint16_t*) a_ptr;
2181*4bdc9457SAndroid Build Coastguard Worker const uint16_t* b = (const uint16_t*) b_ptr;
2182*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
2183*4bdc9457SAndroid Build Coastguard Worker
2184*4bdc9457SAndroid Build Coastguard Worker
2185*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2186*4bdc9457SAndroid Build Coastguard Worker const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2187*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2188*4bdc9457SAndroid Build Coastguard Worker const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
2189*4bdc9457SAndroid Build Coastguard Worker const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
2190*4bdc9457SAndroid Build Coastguard Worker a += 16;
2191*4bdc9457SAndroid Build Coastguard Worker b += 16;
2192*4bdc9457SAndroid Build Coastguard Worker
2193*4bdc9457SAndroid Build Coastguard Worker __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
2194*4bdc9457SAndroid Build Coastguard Worker __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
2195*4bdc9457SAndroid Build Coastguard Worker
2196*4bdc9457SAndroid Build Coastguard Worker vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy01234567, vy01234567), _MM_FROUND_NO_EXC));
2197*4bdc9457SAndroid Build Coastguard Worker vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy456789AB, vy456789AB), _MM_FROUND_NO_EXC));
2198*4bdc9457SAndroid Build Coastguard Worker
2199*4bdc9457SAndroid Build Coastguard Worker
2200*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
2201*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
2202*4bdc9457SAndroid Build Coastguard Worker y += 16;
2203*4bdc9457SAndroid Build Coastguard Worker }
2204*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2205*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2206*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2207*4bdc9457SAndroid Build Coastguard Worker a += 8;
2208*4bdc9457SAndroid Build Coastguard Worker b += 8;
2209*4bdc9457SAndroid Build Coastguard Worker
2210*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2211*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_NO_EXC));
2212*4bdc9457SAndroid Build Coastguard Worker
2213*4bdc9457SAndroid Build Coastguard Worker
2214*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2215*4bdc9457SAndroid Build Coastguard Worker y += 8;
2216*4bdc9457SAndroid Build Coastguard Worker }
2217*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
2218*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2219*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2220*4bdc9457SAndroid Build Coastguard Worker
2221*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2222*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_NO_EXC));
2223*4bdc9457SAndroid Build Coastguard Worker
2224*4bdc9457SAndroid Build Coastguard Worker
2225*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2226*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
2227*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
2228*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
2229*4bdc9457SAndroid Build Coastguard Worker y += 4;
2230*4bdc9457SAndroid Build Coastguard Worker }
2231*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
2232*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
2233*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
2234*4bdc9457SAndroid Build Coastguard Worker y += 2;
2235*4bdc9457SAndroid Build Coastguard Worker }
2236*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
2237*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vh, 0);
2238*4bdc9457SAndroid Build Coastguard Worker }
2239*4bdc9457SAndroid Build Coastguard Worker }
2240*4bdc9457SAndroid Build Coastguard Worker }
2241*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vsqrdiffc_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS (1)])2242*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vsqrdiffc_ukernel__f16c_x16(
2243*4bdc9457SAndroid Build Coastguard Worker size_t n,
2244*4bdc9457SAndroid Build Coastguard Worker const void* restrict a_ptr,
2245*4bdc9457SAndroid Build Coastguard Worker const void* restrict b_ptr,
2246*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
2247*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2248*4bdc9457SAndroid Build Coastguard Worker {
2249*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
2250*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
2251*4bdc9457SAndroid Build Coastguard Worker assert(a_ptr != NULL);
2252*4bdc9457SAndroid Build Coastguard Worker assert(b_ptr != NULL);
2253*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
2254*4bdc9457SAndroid Build Coastguard Worker
2255*4bdc9457SAndroid Build Coastguard Worker const uint16_t* a = (const uint16_t*) a_ptr;
2256*4bdc9457SAndroid Build Coastguard Worker const uint16_t* b = (const uint16_t*) b_ptr;
2257*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
2258*4bdc9457SAndroid Build Coastguard Worker
2259*4bdc9457SAndroid Build Coastguard Worker
2260*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
2261*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2262*4bdc9457SAndroid Build Coastguard Worker const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2263*4bdc9457SAndroid Build Coastguard Worker const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
2264*4bdc9457SAndroid Build Coastguard Worker a += 16;
2265*4bdc9457SAndroid Build Coastguard Worker
2266*4bdc9457SAndroid Build Coastguard Worker __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va01234567, vb), _MM_FROUND_NO_EXC));
2267*4bdc9457SAndroid Build Coastguard Worker __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
2268*4bdc9457SAndroid Build Coastguard Worker
2269*4bdc9457SAndroid Build Coastguard Worker vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy01234567, vy01234567), _MM_FROUND_NO_EXC));
2270*4bdc9457SAndroid Build Coastguard Worker vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy456789AB, vy456789AB), _MM_FROUND_NO_EXC));
2271*4bdc9457SAndroid Build Coastguard Worker
2272*4bdc9457SAndroid Build Coastguard Worker
2273*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
2274*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
2275*4bdc9457SAndroid Build Coastguard Worker y += 16;
2276*4bdc9457SAndroid Build Coastguard Worker }
2277*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2278*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2279*4bdc9457SAndroid Build Coastguard Worker a += 8;
2280*4bdc9457SAndroid Build Coastguard Worker
2281*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2282*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_NO_EXC));
2283*4bdc9457SAndroid Build Coastguard Worker
2284*4bdc9457SAndroid Build Coastguard Worker
2285*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2286*4bdc9457SAndroid Build Coastguard Worker y += 8;
2287*4bdc9457SAndroid Build Coastguard Worker }
2288*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
2289*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2290*4bdc9457SAndroid Build Coastguard Worker
2291*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2292*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vy, vy), _MM_FROUND_NO_EXC));
2293*4bdc9457SAndroid Build Coastguard Worker
2294*4bdc9457SAndroid Build Coastguard Worker
2295*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2296*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
2297*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
2298*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
2299*4bdc9457SAndroid Build Coastguard Worker y += 4;
2300*4bdc9457SAndroid Build Coastguard Worker }
2301*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
2302*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
2303*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
2304*4bdc9457SAndroid Build Coastguard Worker y += 2;
2305*4bdc9457SAndroid Build Coastguard Worker }
2306*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
2307*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vh, 0);
2308*4bdc9457SAndroid Build Coastguard Worker }
2309*4bdc9457SAndroid Build Coastguard Worker }
2310*4bdc9457SAndroid Build Coastguard Worker }
2311*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vsub_minmax_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2312*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vsub_minmax_ukernel__f16c_x16(
2313*4bdc9457SAndroid Build Coastguard Worker size_t n,
2314*4bdc9457SAndroid Build Coastguard Worker const void* restrict a_ptr,
2315*4bdc9457SAndroid Build Coastguard Worker const void* restrict b_ptr,
2316*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
2317*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2318*4bdc9457SAndroid Build Coastguard Worker {
2319*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
2320*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
2321*4bdc9457SAndroid Build Coastguard Worker assert(a_ptr != NULL);
2322*4bdc9457SAndroid Build Coastguard Worker assert(b_ptr != NULL);
2323*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
2324*4bdc9457SAndroid Build Coastguard Worker
2325*4bdc9457SAndroid Build Coastguard Worker const uint16_t* a = (const uint16_t*) a_ptr;
2326*4bdc9457SAndroid Build Coastguard Worker const uint16_t* b = (const uint16_t*) b_ptr;
2327*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
2328*4bdc9457SAndroid Build Coastguard Worker
2329*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_min = _mm256_load_ps(params->avx.min);
2330*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_max = _mm256_load_ps(params->avx.max);
2331*4bdc9457SAndroid Build Coastguard Worker
2332*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2333*4bdc9457SAndroid Build Coastguard Worker const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2334*4bdc9457SAndroid Build Coastguard Worker const __m256 vb01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2335*4bdc9457SAndroid Build Coastguard Worker const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
2336*4bdc9457SAndroid Build Coastguard Worker const __m256 vb456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (b + 8)));
2337*4bdc9457SAndroid Build Coastguard Worker a += 16;
2338*4bdc9457SAndroid Build Coastguard Worker b += 16;
2339*4bdc9457SAndroid Build Coastguard Worker
2340*4bdc9457SAndroid Build Coastguard Worker __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va01234567, vb01234567), _MM_FROUND_NO_EXC));
2341*4bdc9457SAndroid Build Coastguard Worker __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va456789AB, vb456789AB), _MM_FROUND_NO_EXC));
2342*4bdc9457SAndroid Build Coastguard Worker
2343*4bdc9457SAndroid Build Coastguard Worker
2344*4bdc9457SAndroid Build Coastguard Worker vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2345*4bdc9457SAndroid Build Coastguard Worker vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
2346*4bdc9457SAndroid Build Coastguard Worker
2347*4bdc9457SAndroid Build Coastguard Worker vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2348*4bdc9457SAndroid Build Coastguard Worker vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
2349*4bdc9457SAndroid Build Coastguard Worker
2350*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
2351*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
2352*4bdc9457SAndroid Build Coastguard Worker y += 16;
2353*4bdc9457SAndroid Build Coastguard Worker }
2354*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2355*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2356*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2357*4bdc9457SAndroid Build Coastguard Worker a += 8;
2358*4bdc9457SAndroid Build Coastguard Worker b += 8;
2359*4bdc9457SAndroid Build Coastguard Worker
2360*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2361*4bdc9457SAndroid Build Coastguard Worker
2362*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
2363*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
2364*4bdc9457SAndroid Build Coastguard Worker
2365*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2366*4bdc9457SAndroid Build Coastguard Worker y += 8;
2367*4bdc9457SAndroid Build Coastguard Worker }
2368*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
2369*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2370*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) b));
2371*4bdc9457SAndroid Build Coastguard Worker
2372*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2373*4bdc9457SAndroid Build Coastguard Worker
2374*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
2375*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
2376*4bdc9457SAndroid Build Coastguard Worker
2377*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2378*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
2379*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
2380*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
2381*4bdc9457SAndroid Build Coastguard Worker y += 4;
2382*4bdc9457SAndroid Build Coastguard Worker }
2383*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
2384*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
2385*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
2386*4bdc9457SAndroid Build Coastguard Worker y += 2;
2387*4bdc9457SAndroid Build Coastguard Worker }
2388*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
2389*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vh, 0);
2390*4bdc9457SAndroid Build Coastguard Worker }
2391*4bdc9457SAndroid Build Coastguard Worker }
2392*4bdc9457SAndroid Build Coastguard Worker }
2393*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vsubc_minmax_ukernel__f16c_x16(size_t n,const void * restrict a_ptr,const void * restrict b_ptr,void * restrict y_ptr,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2394*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vsubc_minmax_ukernel__f16c_x16(
2395*4bdc9457SAndroid Build Coastguard Worker size_t n,
2396*4bdc9457SAndroid Build Coastguard Worker const void* restrict a_ptr,
2397*4bdc9457SAndroid Build Coastguard Worker const void* restrict b_ptr,
2398*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
2399*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2400*4bdc9457SAndroid Build Coastguard Worker {
2401*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
2402*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
2403*4bdc9457SAndroid Build Coastguard Worker assert(a_ptr != NULL);
2404*4bdc9457SAndroid Build Coastguard Worker assert(b_ptr != NULL);
2405*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
2406*4bdc9457SAndroid Build Coastguard Worker
2407*4bdc9457SAndroid Build Coastguard Worker const uint16_t* a = (const uint16_t*) a_ptr;
2408*4bdc9457SAndroid Build Coastguard Worker const uint16_t* b = (const uint16_t*) b_ptr;
2409*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
2410*4bdc9457SAndroid Build Coastguard Worker
2411*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_min = _mm256_load_ps(params->avx.min);
2412*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_max = _mm256_load_ps(params->avx.max);
2413*4bdc9457SAndroid Build Coastguard Worker
2414*4bdc9457SAndroid Build Coastguard Worker const __m256 vb = _mm256_cvtph_ps(_mm_set1_epi16((short) *b));
2415*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2416*4bdc9457SAndroid Build Coastguard Worker const __m256 va01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2417*4bdc9457SAndroid Build Coastguard Worker const __m256 va456789AB = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (a + 8)));
2418*4bdc9457SAndroid Build Coastguard Worker a += 16;
2419*4bdc9457SAndroid Build Coastguard Worker
2420*4bdc9457SAndroid Build Coastguard Worker __m256 vy01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va01234567, vb), _MM_FROUND_NO_EXC));
2421*4bdc9457SAndroid Build Coastguard Worker __m256 vy456789AB = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va456789AB, vb), _MM_FROUND_NO_EXC));
2422*4bdc9457SAndroid Build Coastguard Worker
2423*4bdc9457SAndroid Build Coastguard Worker
2424*4bdc9457SAndroid Build Coastguard Worker vy01234567 = _mm256_max_ps(vy01234567, vy_min);
2425*4bdc9457SAndroid Build Coastguard Worker vy456789AB = _mm256_max_ps(vy456789AB, vy_min);
2426*4bdc9457SAndroid Build Coastguard Worker
2427*4bdc9457SAndroid Build Coastguard Worker vy01234567 = _mm256_min_ps(vy01234567, vy_max);
2428*4bdc9457SAndroid Build Coastguard Worker vy456789AB = _mm256_min_ps(vy456789AB, vy_max);
2429*4bdc9457SAndroid Build Coastguard Worker
2430*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy01234567, _MM_FROUND_NO_EXC));
2431*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vy456789AB, _MM_FROUND_NO_EXC));
2432*4bdc9457SAndroid Build Coastguard Worker y += 16;
2433*4bdc9457SAndroid Build Coastguard Worker }
2434*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2435*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2436*4bdc9457SAndroid Build Coastguard Worker a += 8;
2437*4bdc9457SAndroid Build Coastguard Worker
2438*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2439*4bdc9457SAndroid Build Coastguard Worker
2440*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
2441*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
2442*4bdc9457SAndroid Build Coastguard Worker
2443*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC));
2444*4bdc9457SAndroid Build Coastguard Worker y += 8;
2445*4bdc9457SAndroid Build Coastguard Worker }
2446*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
2447*4bdc9457SAndroid Build Coastguard Worker const __m256 va = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) a));
2448*4bdc9457SAndroid Build Coastguard Worker
2449*4bdc9457SAndroid Build Coastguard Worker __m256 vy = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_sub_ps(va, vb), _MM_FROUND_NO_EXC));
2450*4bdc9457SAndroid Build Coastguard Worker
2451*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_max_ps(vy, vy_min);
2452*4bdc9457SAndroid Build Coastguard Worker vy = _mm256_min_ps(vy, vy_max);
2453*4bdc9457SAndroid Build Coastguard Worker
2454*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vy, _MM_FROUND_NO_EXC);
2455*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
2456*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
2457*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
2458*4bdc9457SAndroid Build Coastguard Worker y += 4;
2459*4bdc9457SAndroid Build Coastguard Worker }
2460*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
2461*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
2462*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
2463*4bdc9457SAndroid Build Coastguard Worker y += 2;
2464*4bdc9457SAndroid Build Coastguard Worker }
2465*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
2466*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vh, 0);
2467*4bdc9457SAndroid Build Coastguard Worker }
2468*4bdc9457SAndroid Build Coastguard Worker }
2469*4bdc9457SAndroid Build Coastguard Worker }
2470*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vclamp_ukernel__f16c_x16(size_t n,const void * restrict x_ptr,void * restrict y_ptr,const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS (1)])2471*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vclamp_ukernel__f16c_x16(
2472*4bdc9457SAndroid Build Coastguard Worker size_t n,
2473*4bdc9457SAndroid Build Coastguard Worker const void* restrict x_ptr,
2474*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
2475*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_minmax_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2476*4bdc9457SAndroid Build Coastguard Worker {
2477*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
2478*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
2479*4bdc9457SAndroid Build Coastguard Worker assert(x_ptr != NULL);
2480*4bdc9457SAndroid Build Coastguard Worker assert(y_ptr != NULL);
2481*4bdc9457SAndroid Build Coastguard Worker
2482*4bdc9457SAndroid Build Coastguard Worker const uint16_t* x = (const uint16_t*) x_ptr;
2483*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
2484*4bdc9457SAndroid Build Coastguard Worker
2485*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_min = _mm256_load_ps(params->avx.min);
2486*4bdc9457SAndroid Build Coastguard Worker const __m256 vy_max = _mm256_load_ps(params->avx.max);
2487*4bdc9457SAndroid Build Coastguard Worker
2488*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2489*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2490*4bdc9457SAndroid Build Coastguard Worker __m256 vacc89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (x + 8)));
2491*4bdc9457SAndroid Build Coastguard Worker x += 16;
2492*4bdc9457SAndroid Build Coastguard Worker
2493*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_max_ps(vacc01234567, vy_min);
2494*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEF = _mm256_max_ps(vacc89ABCDEF, vy_min);
2495*4bdc9457SAndroid Build Coastguard Worker
2496*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_min_ps(vacc01234567, vy_max);
2497*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEF = _mm256_min_ps(vacc89ABCDEF, vy_max);
2498*4bdc9457SAndroid Build Coastguard Worker
2499*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
2500*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 8), _mm256_cvtps_ph(vacc89ABCDEF, _MM_FROUND_NO_EXC));
2501*4bdc9457SAndroid Build Coastguard Worker y += 16;
2502*4bdc9457SAndroid Build Coastguard Worker }
2503*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2504*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2505*4bdc9457SAndroid Build Coastguard Worker x += 8;
2506*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_max_ps(vacc, vy_min);
2507*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_min_ps(vacc, vy_max);
2508*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2509*4bdc9457SAndroid Build Coastguard Worker y += 8;
2510*4bdc9457SAndroid Build Coastguard Worker }
2511*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
2512*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2513*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_max_ps(vacc, vy_min);
2514*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_min_ps(vacc, vy_max);
2515*4bdc9457SAndroid Build Coastguard Worker
2516*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2517*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
2518*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vh);
2519*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
2520*4bdc9457SAndroid Build Coastguard Worker y += 4;
2521*4bdc9457SAndroid Build Coastguard Worker }
2522*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
2523*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vh);
2524*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
2525*4bdc9457SAndroid Build Coastguard Worker y += 2;
2526*4bdc9457SAndroid Build Coastguard Worker }
2527*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
2528*4bdc9457SAndroid Build Coastguard Worker *y = _mm_extract_epi16(vh, 0);
2529*4bdc9457SAndroid Build Coastguard Worker }
2530*4bdc9457SAndroid Build Coastguard Worker }
2531*4bdc9457SAndroid Build Coastguard Worker }
2532*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vhswish_ukernel__f16c_x16(size_t n,const void * restrict x_ptr,void * restrict y_ptr,const union xnn_f16_hswish_params params[restrict XNN_MIN_ELEMENTS (1)])2533*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vhswish_ukernel__f16c_x16(
2534*4bdc9457SAndroid Build Coastguard Worker size_t n,
2535*4bdc9457SAndroid Build Coastguard Worker const void* restrict x_ptr,
2536*4bdc9457SAndroid Build Coastguard Worker void* restrict y_ptr,
2537*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_hswish_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2538*4bdc9457SAndroid Build Coastguard Worker {
2539*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
2540*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
2541*4bdc9457SAndroid Build Coastguard Worker
2542*4bdc9457SAndroid Build Coastguard Worker const uint16_t* x = (const uint16_t*) x_ptr;
2543*4bdc9457SAndroid Build Coastguard Worker uint16_t* y = (uint16_t*) y_ptr;
2544*4bdc9457SAndroid Build Coastguard Worker
2545*4bdc9457SAndroid Build Coastguard Worker const __m256 vsixth = _mm256_load_ps(params->avx.sixth);
2546*4bdc9457SAndroid Build Coastguard Worker const __m256 vthree = _mm256_load_ps(params->avx.three);
2547*4bdc9457SAndroid Build Coastguard Worker const __m128i vsix = _mm_load_si128((const __m128i*) params->avx.six);
2548*4bdc9457SAndroid Build Coastguard Worker const __m128i vzero = _mm_setzero_si128();
2549*4bdc9457SAndroid Build Coastguard Worker
2550*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2551*4bdc9457SAndroid Build Coastguard Worker __m256 vx01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2552*4bdc9457SAndroid Build Coastguard Worker __m256 vx89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (x + 8)));
2553*4bdc9457SAndroid Build Coastguard Worker x += 16;
2554*4bdc9457SAndroid Build Coastguard Worker
2555*4bdc9457SAndroid Build Coastguard Worker __m128i vacc01234567 = _mm256_cvtps_ph(_mm256_add_ps(vx01234567, vthree), _MM_FROUND_NO_EXC);
2556*4bdc9457SAndroid Build Coastguard Worker vx01234567 = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx01234567, vsixth), _MM_FROUND_NO_EXC));
2557*4bdc9457SAndroid Build Coastguard Worker __m128i vacc89ABCDEF = _mm256_cvtps_ph(_mm256_add_ps(vx89ABCDEF, vthree), _MM_FROUND_NO_EXC);
2558*4bdc9457SAndroid Build Coastguard Worker vx89ABCDEF = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx89ABCDEF, vsixth), _MM_FROUND_NO_EXC));
2559*4bdc9457SAndroid Build Coastguard Worker
2560*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm_max_epi16(vacc01234567, vzero);
2561*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEF = _mm_max_epi16(vacc89ABCDEF, vzero);
2562*4bdc9457SAndroid Build Coastguard Worker
2563*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm_min_epi16(vacc01234567, vsix);
2564*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEF = _mm_min_epi16(vacc89ABCDEF, vsix);
2565*4bdc9457SAndroid Build Coastguard Worker
2566*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc01234567), vx01234567), _MM_FROUND_NO_EXC);
2567*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEF = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc89ABCDEF), vx89ABCDEF), _MM_FROUND_NO_EXC);
2568*4bdc9457SAndroid Build Coastguard Worker
2569*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, vacc01234567);
2570*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (y + 8), vacc89ABCDEF);
2571*4bdc9457SAndroid Build Coastguard Worker y += 16;
2572*4bdc9457SAndroid Build Coastguard Worker }
2573*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2574*4bdc9457SAndroid Build Coastguard Worker __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2575*4bdc9457SAndroid Build Coastguard Worker x += 8;
2576*4bdc9457SAndroid Build Coastguard Worker __m128i vacc = _mm256_cvtps_ph(_mm256_add_ps(vx, vthree), _MM_FROUND_NO_EXC);
2577*4bdc9457SAndroid Build Coastguard Worker vx = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx, vsixth), _MM_FROUND_NO_EXC));
2578*4bdc9457SAndroid Build Coastguard Worker vacc = _mm_max_epi16(vacc, vzero);
2579*4bdc9457SAndroid Build Coastguard Worker vacc = _mm_min_epi16(vacc, vsix);
2580*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc), vx), _MM_FROUND_NO_EXC);
2581*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) y, vacc);
2582*4bdc9457SAndroid Build Coastguard Worker y += 8;
2583*4bdc9457SAndroid Build Coastguard Worker }
2584*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
2585*4bdc9457SAndroid Build Coastguard Worker __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) x));
2586*4bdc9457SAndroid Build Coastguard Worker __m128i vacc = _mm256_cvtps_ph(_mm256_add_ps(vx, vthree), _MM_FROUND_NO_EXC);
2587*4bdc9457SAndroid Build Coastguard Worker vx = _mm256_cvtph_ps(_mm256_cvtps_ph(_mm256_mul_ps(vx, vsixth), _MM_FROUND_NO_EXC));
2588*4bdc9457SAndroid Build Coastguard Worker vacc = _mm_max_epi16(vacc, vzero);
2589*4bdc9457SAndroid Build Coastguard Worker vacc = _mm_min_epi16(vacc, vsix);
2590*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_cvtps_ph(_mm256_mul_ps(_mm256_cvtph_ps(vacc), vx), _MM_FROUND_NO_EXC);
2591*4bdc9457SAndroid Build Coastguard Worker
2592*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
2593*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) y, vacc);
2594*4bdc9457SAndroid Build Coastguard Worker vacc = _mm_unpackhi_epi64(vacc, vacc);
2595*4bdc9457SAndroid Build Coastguard Worker y += 4;
2596*4bdc9457SAndroid Build Coastguard Worker }
2597*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
2598*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(y, vacc);
2599*4bdc9457SAndroid Build Coastguard Worker vacc = _mm_srli_epi64(vacc, 32);
2600*4bdc9457SAndroid Build Coastguard Worker y += 2;
2601*4bdc9457SAndroid Build Coastguard Worker }
2602*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
2603*4bdc9457SAndroid Build Coastguard Worker *y = (uint16_t) _mm_extract_epi16(vacc, 0);
2604*4bdc9457SAndroid Build Coastguard Worker }
2605*4bdc9457SAndroid Build Coastguard Worker }
2606*4bdc9457SAndroid Build Coastguard Worker }
2607*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vlrelu_ukernel__f16c_x16(size_t batch,const void * input,void * output,const union xnn_f16_lrelu_params params[restrict XNN_MIN_ELEMENTS (1)])2608*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vlrelu_ukernel__f16c_x16(
2609*4bdc9457SAndroid Build Coastguard Worker size_t batch,
2610*4bdc9457SAndroid Build Coastguard Worker const void* input,
2611*4bdc9457SAndroid Build Coastguard Worker void* output,
2612*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_lrelu_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2613*4bdc9457SAndroid Build Coastguard Worker {
2614*4bdc9457SAndroid Build Coastguard Worker assert(batch != 0);
2615*4bdc9457SAndroid Build Coastguard Worker assert(batch % sizeof(uint16_t) == 0);
2616*4bdc9457SAndroid Build Coastguard Worker
2617*4bdc9457SAndroid Build Coastguard Worker const __m256 vslope = _mm256_load_ps(params->avx.slope);
2618*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i = (const uint16_t*) input;
2619*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
2620*4bdc9457SAndroid Build Coastguard Worker for (; batch >= 16 * sizeof(uint16_t); batch -= 16 * sizeof(uint16_t)) {
2621*4bdc9457SAndroid Build Coastguard Worker const __m256 vx01234567 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2622*4bdc9457SAndroid Build Coastguard Worker const __m256 vx89ABCDEF = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2623*4bdc9457SAndroid Build Coastguard Worker i += 16;
2624*4bdc9457SAndroid Build Coastguard Worker
2625*4bdc9457SAndroid Build Coastguard Worker __m256 vacc01234567 = _mm256_mul_ps(vx01234567, vslope);
2626*4bdc9457SAndroid Build Coastguard Worker __m256 vacc89ABCDEF = _mm256_mul_ps(vx89ABCDEF, vslope);
2627*4bdc9457SAndroid Build Coastguard Worker
2628*4bdc9457SAndroid Build Coastguard Worker vacc01234567 = _mm256_blendv_ps(vx01234567, vacc01234567, vx01234567);
2629*4bdc9457SAndroid Build Coastguard Worker vacc89ABCDEF = _mm256_blendv_ps(vx89ABCDEF, vacc89ABCDEF, vx89ABCDEF);
2630*4bdc9457SAndroid Build Coastguard Worker
2631*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc01234567, _MM_FROUND_NO_EXC));
2632*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc89ABCDEF, _MM_FROUND_NO_EXC));
2633*4bdc9457SAndroid Build Coastguard Worker o += 16;
2634*4bdc9457SAndroid Build Coastguard Worker }
2635*4bdc9457SAndroid Build Coastguard Worker for (; batch >= 8 * sizeof(uint16_t); batch -= 8 * sizeof(uint16_t)) {
2636*4bdc9457SAndroid Build Coastguard Worker const __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2637*4bdc9457SAndroid Build Coastguard Worker i += 8;
2638*4bdc9457SAndroid Build Coastguard Worker
2639*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_mul_ps(vx, vslope);
2640*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_blendv_ps(vx, vacc, vx);
2641*4bdc9457SAndroid Build Coastguard Worker
2642*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2643*4bdc9457SAndroid Build Coastguard Worker o += 8;
2644*4bdc9457SAndroid Build Coastguard Worker }
2645*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(batch != 0) {
2646*4bdc9457SAndroid Build Coastguard Worker const __m256 vx = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2647*4bdc9457SAndroid Build Coastguard Worker
2648*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_mul_ps(vx, vslope);
2649*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_blendv_ps(vx, vacc, vx);
2650*4bdc9457SAndroid Build Coastguard Worker
2651*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2652*4bdc9457SAndroid Build Coastguard Worker if (batch & (4 * sizeof(uint16_t))) {
2653*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh);
2654*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
2655*4bdc9457SAndroid Build Coastguard Worker o += 4;
2656*4bdc9457SAndroid Build Coastguard Worker }
2657*4bdc9457SAndroid Build Coastguard Worker if (batch & (2 * sizeof(uint16_t))) {
2658*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh);
2659*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
2660*4bdc9457SAndroid Build Coastguard Worker o += 2;
2661*4bdc9457SAndroid Build Coastguard Worker }
2662*4bdc9457SAndroid Build Coastguard Worker if (batch & (1 * sizeof(uint16_t))) {
2663*4bdc9457SAndroid Build Coastguard Worker *o = _mm_extract_epi16(vh, 0);
2664*4bdc9457SAndroid Build Coastguard Worker }
2665*4bdc9457SAndroid Build Coastguard Worker }
2666*4bdc9457SAndroid Build Coastguard Worker }
2667*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vrndd_ukernel__f16c_x16(size_t n,const void * input,void * output,const union xnn_f16_rnd_params params[restrict XNN_MIN_ELEMENTS (1)])2668*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vrndd_ukernel__f16c_x16(
2669*4bdc9457SAndroid Build Coastguard Worker size_t n,
2670*4bdc9457SAndroid Build Coastguard Worker const void* input,
2671*4bdc9457SAndroid Build Coastguard Worker void* output,
2672*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
2673*4bdc9457SAndroid Build Coastguard Worker {
2674*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
2675*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
2676*4bdc9457SAndroid Build Coastguard Worker
2677*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i = (const uint16_t*) input;
2678*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
2679*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2680*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2681*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2682*4bdc9457SAndroid Build Coastguard Worker i += 16;
2683*4bdc9457SAndroid Build Coastguard Worker
2684*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm256_round_ps(vacc0, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
2685*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm256_round_ps(vacc1, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
2686*4bdc9457SAndroid Build Coastguard Worker
2687*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
2688*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
2689*4bdc9457SAndroid Build Coastguard Worker o += 16;
2690*4bdc9457SAndroid Build Coastguard Worker }
2691*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2692*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2693*4bdc9457SAndroid Build Coastguard Worker i += 8;
2694*4bdc9457SAndroid Build Coastguard Worker
2695*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
2696*4bdc9457SAndroid Build Coastguard Worker
2697*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2698*4bdc9457SAndroid Build Coastguard Worker o += 8;
2699*4bdc9457SAndroid Build Coastguard Worker }
2700*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
2701*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(uint16_t));
2702*4bdc9457SAndroid Build Coastguard Worker assert(n <= 7 * sizeof(uint16_t));
2703*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2704*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
2705*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2706*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
2707*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh);
2708*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
2709*4bdc9457SAndroid Build Coastguard Worker o += 4;
2710*4bdc9457SAndroid Build Coastguard Worker }
2711*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
2712*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh);
2713*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
2714*4bdc9457SAndroid Build Coastguard Worker o += 2;
2715*4bdc9457SAndroid Build Coastguard Worker }
2716*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
2717*4bdc9457SAndroid Build Coastguard Worker *o = (uint16_t) _mm_extract_epi16(vh, 0);
2718*4bdc9457SAndroid Build Coastguard Worker }
2719*4bdc9457SAndroid Build Coastguard Worker }
2720*4bdc9457SAndroid Build Coastguard Worker }
2721*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vrndne_ukernel__f16c_x16(size_t n,const void * input,void * output,const union xnn_f16_rnd_params params[restrict XNN_MIN_ELEMENTS (1)])2722*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vrndne_ukernel__f16c_x16(
2723*4bdc9457SAndroid Build Coastguard Worker size_t n,
2724*4bdc9457SAndroid Build Coastguard Worker const void* input,
2725*4bdc9457SAndroid Build Coastguard Worker void* output,
2726*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
2727*4bdc9457SAndroid Build Coastguard Worker {
2728*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
2729*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
2730*4bdc9457SAndroid Build Coastguard Worker
2731*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i = (const uint16_t*) input;
2732*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
2733*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2734*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2735*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2736*4bdc9457SAndroid Build Coastguard Worker i += 16;
2737*4bdc9457SAndroid Build Coastguard Worker
2738*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm256_round_ps(vacc0, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
2739*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm256_round_ps(vacc1, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
2740*4bdc9457SAndroid Build Coastguard Worker
2741*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
2742*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
2743*4bdc9457SAndroid Build Coastguard Worker o += 16;
2744*4bdc9457SAndroid Build Coastguard Worker }
2745*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2746*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2747*4bdc9457SAndroid Build Coastguard Worker i += 8;
2748*4bdc9457SAndroid Build Coastguard Worker
2749*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
2750*4bdc9457SAndroid Build Coastguard Worker
2751*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2752*4bdc9457SAndroid Build Coastguard Worker o += 8;
2753*4bdc9457SAndroid Build Coastguard Worker }
2754*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
2755*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(uint16_t));
2756*4bdc9457SAndroid Build Coastguard Worker assert(n <= 7 * sizeof(uint16_t));
2757*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2758*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
2759*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2760*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
2761*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh);
2762*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
2763*4bdc9457SAndroid Build Coastguard Worker o += 4;
2764*4bdc9457SAndroid Build Coastguard Worker }
2765*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
2766*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh);
2767*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
2768*4bdc9457SAndroid Build Coastguard Worker o += 2;
2769*4bdc9457SAndroid Build Coastguard Worker }
2770*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
2771*4bdc9457SAndroid Build Coastguard Worker *o = (uint16_t) _mm_extract_epi16(vh, 0);
2772*4bdc9457SAndroid Build Coastguard Worker }
2773*4bdc9457SAndroid Build Coastguard Worker }
2774*4bdc9457SAndroid Build Coastguard Worker }
2775*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vrndu_ukernel__f16c_x16(size_t n,const void * input,void * output,const union xnn_f16_rnd_params params[restrict XNN_MIN_ELEMENTS (1)])2776*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vrndu_ukernel__f16c_x16(
2777*4bdc9457SAndroid Build Coastguard Worker size_t n,
2778*4bdc9457SAndroid Build Coastguard Worker const void* input,
2779*4bdc9457SAndroid Build Coastguard Worker void* output,
2780*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
2781*4bdc9457SAndroid Build Coastguard Worker {
2782*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
2783*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
2784*4bdc9457SAndroid Build Coastguard Worker
2785*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i = (const uint16_t*) input;
2786*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
2787*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2788*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2789*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2790*4bdc9457SAndroid Build Coastguard Worker i += 16;
2791*4bdc9457SAndroid Build Coastguard Worker
2792*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm256_round_ps(vacc0, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
2793*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm256_round_ps(vacc1, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
2794*4bdc9457SAndroid Build Coastguard Worker
2795*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
2796*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
2797*4bdc9457SAndroid Build Coastguard Worker o += 16;
2798*4bdc9457SAndroid Build Coastguard Worker }
2799*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2800*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2801*4bdc9457SAndroid Build Coastguard Worker i += 8;
2802*4bdc9457SAndroid Build Coastguard Worker
2803*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
2804*4bdc9457SAndroid Build Coastguard Worker
2805*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2806*4bdc9457SAndroid Build Coastguard Worker o += 8;
2807*4bdc9457SAndroid Build Coastguard Worker }
2808*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
2809*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(uint16_t));
2810*4bdc9457SAndroid Build Coastguard Worker assert(n <= 7 * sizeof(uint16_t));
2811*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2812*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
2813*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2814*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
2815*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh);
2816*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
2817*4bdc9457SAndroid Build Coastguard Worker o += 4;
2818*4bdc9457SAndroid Build Coastguard Worker }
2819*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
2820*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh);
2821*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
2822*4bdc9457SAndroid Build Coastguard Worker o += 2;
2823*4bdc9457SAndroid Build Coastguard Worker }
2824*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
2825*4bdc9457SAndroid Build Coastguard Worker *o = (uint16_t) _mm_extract_epi16(vh, 0);
2826*4bdc9457SAndroid Build Coastguard Worker }
2827*4bdc9457SAndroid Build Coastguard Worker }
2828*4bdc9457SAndroid Build Coastguard Worker }
2829*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vrndz_ukernel__f16c_x16(size_t n,const void * input,void * output,const union xnn_f16_rnd_params params[restrict XNN_MIN_ELEMENTS (1)])2830*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vrndz_ukernel__f16c_x16(
2831*4bdc9457SAndroid Build Coastguard Worker size_t n,
2832*4bdc9457SAndroid Build Coastguard Worker const void* input,
2833*4bdc9457SAndroid Build Coastguard Worker void* output,
2834*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_rnd_params params[restrict XNN_MIN_ELEMENTS(1)])
2835*4bdc9457SAndroid Build Coastguard Worker {
2836*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
2837*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
2838*4bdc9457SAndroid Build Coastguard Worker
2839*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i = (const uint16_t*) input;
2840*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
2841*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2842*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2843*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2844*4bdc9457SAndroid Build Coastguard Worker i += 16;
2845*4bdc9457SAndroid Build Coastguard Worker
2846*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm256_round_ps(vacc0, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2847*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm256_round_ps(vacc1, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2848*4bdc9457SAndroid Build Coastguard Worker
2849*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
2850*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
2851*4bdc9457SAndroid Build Coastguard Worker o += 16;
2852*4bdc9457SAndroid Build Coastguard Worker }
2853*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2854*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2855*4bdc9457SAndroid Build Coastguard Worker i += 8;
2856*4bdc9457SAndroid Build Coastguard Worker
2857*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2858*4bdc9457SAndroid Build Coastguard Worker
2859*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2860*4bdc9457SAndroid Build Coastguard Worker o += 8;
2861*4bdc9457SAndroid Build Coastguard Worker }
2862*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
2863*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(uint16_t));
2864*4bdc9457SAndroid Build Coastguard Worker assert(n <= 7 * sizeof(uint16_t));
2865*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2866*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_round_ps(vacc, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
2867*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2868*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
2869*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh);
2870*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
2871*4bdc9457SAndroid Build Coastguard Worker o += 4;
2872*4bdc9457SAndroid Build Coastguard Worker }
2873*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
2874*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh);
2875*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
2876*4bdc9457SAndroid Build Coastguard Worker o += 2;
2877*4bdc9457SAndroid Build Coastguard Worker }
2878*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
2879*4bdc9457SAndroid Build Coastguard Worker *o = (uint16_t) _mm_extract_epi16(vh, 0);
2880*4bdc9457SAndroid Build Coastguard Worker }
2881*4bdc9457SAndroid Build Coastguard Worker }
2882*4bdc9457SAndroid Build Coastguard Worker }
2883*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vsqrt_ukernel__f16c_sqrt_x8(size_t n,const void * input,void * output,const union xnn_f16_sqrt_params params[restrict XNN_MIN_ELEMENTS (1)])2884*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vsqrt_ukernel__f16c_sqrt_x8(
2885*4bdc9457SAndroid Build Coastguard Worker size_t n,
2886*4bdc9457SAndroid Build Coastguard Worker const void* input,
2887*4bdc9457SAndroid Build Coastguard Worker void* output,
2888*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_sqrt_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2889*4bdc9457SAndroid Build Coastguard Worker {
2890*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
2891*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
2892*4bdc9457SAndroid Build Coastguard Worker assert(input != NULL);
2893*4bdc9457SAndroid Build Coastguard Worker assert(output != NULL);
2894*4bdc9457SAndroid Build Coastguard Worker
2895*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i = (const uint16_t*) input;
2896*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
2897*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2898*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2899*4bdc9457SAndroid Build Coastguard Worker i += 8;
2900*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_sqrt_ps(vacc);
2901*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2902*4bdc9457SAndroid Build Coastguard Worker o += 8;
2903*4bdc9457SAndroid Build Coastguard Worker }
2904*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
2905*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2906*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_sqrt_ps(vacc);
2907*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2908*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
2909*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh);
2910*4bdc9457SAndroid Build Coastguard Worker o += 4;
2911*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
2912*4bdc9457SAndroid Build Coastguard Worker }
2913*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
2914*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh);
2915*4bdc9457SAndroid Build Coastguard Worker o += 2;
2916*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
2917*4bdc9457SAndroid Build Coastguard Worker }
2918*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
2919*4bdc9457SAndroid Build Coastguard Worker *o = (uint16_t) _mm_extract_epi16(vh, 0);
2920*4bdc9457SAndroid Build Coastguard Worker }
2921*4bdc9457SAndroid Build Coastguard Worker }
2922*4bdc9457SAndroid Build Coastguard Worker }
2923*4bdc9457SAndroid Build Coastguard Worker
xnn_f16_vsqr_ukernel__f16c_x16(size_t n,const void * input,void * output,const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS (1)])2924*4bdc9457SAndroid Build Coastguard Worker void xnn_f16_vsqr_ukernel__f16c_x16(
2925*4bdc9457SAndroid Build Coastguard Worker size_t n,
2926*4bdc9457SAndroid Build Coastguard Worker const void* input,
2927*4bdc9457SAndroid Build Coastguard Worker void* output,
2928*4bdc9457SAndroid Build Coastguard Worker const union xnn_f16_default_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
2929*4bdc9457SAndroid Build Coastguard Worker {
2930*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
2931*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(uint16_t) == 0);
2932*4bdc9457SAndroid Build Coastguard Worker assert(input != NULL);
2933*4bdc9457SAndroid Build Coastguard Worker assert(output != NULL);
2934*4bdc9457SAndroid Build Coastguard Worker
2935*4bdc9457SAndroid Build Coastguard Worker const uint16_t* i = (const uint16_t*) input;
2936*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
2937*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(uint16_t); n -= 16 * sizeof(uint16_t)) {
2938*4bdc9457SAndroid Build Coastguard Worker __m256 vacc0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2939*4bdc9457SAndroid Build Coastguard Worker __m256 vacc1 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) (i + 8)));
2940*4bdc9457SAndroid Build Coastguard Worker i += 16;
2941*4bdc9457SAndroid Build Coastguard Worker
2942*4bdc9457SAndroid Build Coastguard Worker vacc0 = _mm256_mul_ps(vacc0, vacc0);
2943*4bdc9457SAndroid Build Coastguard Worker vacc1 = _mm256_mul_ps(vacc1, vacc1);
2944*4bdc9457SAndroid Build Coastguard Worker
2945*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc0, _MM_FROUND_NO_EXC));
2946*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vacc1, _MM_FROUND_NO_EXC));
2947*4bdc9457SAndroid Build Coastguard Worker o += 16;
2948*4bdc9457SAndroid Build Coastguard Worker }
2949*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(uint16_t); n -= 8 * sizeof(uint16_t)) {
2950*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2951*4bdc9457SAndroid Build Coastguard Worker i += 8;
2952*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_mul_ps(vacc, vacc);
2953*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC));
2954*4bdc9457SAndroid Build Coastguard Worker o += 8;
2955*4bdc9457SAndroid Build Coastguard Worker }
2956*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
2957*4bdc9457SAndroid Build Coastguard Worker __m256 vacc = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i*) i));
2958*4bdc9457SAndroid Build Coastguard Worker vacc = _mm256_mul_ps(vacc, vacc);
2959*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm256_cvtps_ph(vacc, _MM_FROUND_NO_EXC);
2960*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(uint16_t))) {
2961*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, vh);
2962*4bdc9457SAndroid Build Coastguard Worker o += 4;
2963*4bdc9457SAndroid Build Coastguard Worker vh = _mm_unpackhi_epi64(vh, vh);
2964*4bdc9457SAndroid Build Coastguard Worker }
2965*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(uint16_t))) {
2966*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh);
2967*4bdc9457SAndroid Build Coastguard Worker o += 2;
2968*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
2969*4bdc9457SAndroid Build Coastguard Worker }
2970*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(uint16_t))) {
2971*4bdc9457SAndroid Build Coastguard Worker *o = (uint16_t) _mm_extract_epi16(vh, 0);
2972*4bdc9457SAndroid Build Coastguard Worker }
2973*4bdc9457SAndroid Build Coastguard Worker }
2974*4bdc9457SAndroid Build Coastguard Worker }
2975*4bdc9457SAndroid Build Coastguard Worker
xnn_f32_f16_vcvt_ukernel__f16c_x16(size_t n,const float * input,void * output,const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS (1)])2976*4bdc9457SAndroid Build Coastguard Worker void xnn_f32_f16_vcvt_ukernel__f16c_x16(
2977*4bdc9457SAndroid Build Coastguard Worker size_t n,
2978*4bdc9457SAndroid Build Coastguard Worker const float* input,
2979*4bdc9457SAndroid Build Coastguard Worker void* output,
2980*4bdc9457SAndroid Build Coastguard Worker const union xnn_f32_f16_cvt_params params[restrict XNN_MIN_ELEMENTS(1)])
2981*4bdc9457SAndroid Build Coastguard Worker {
2982*4bdc9457SAndroid Build Coastguard Worker assert(n != 0);
2983*4bdc9457SAndroid Build Coastguard Worker assert(n % sizeof(float) == 0);
2984*4bdc9457SAndroid Build Coastguard Worker assert(input != NULL);
2985*4bdc9457SAndroid Build Coastguard Worker assert(output != NULL);
2986*4bdc9457SAndroid Build Coastguard Worker
2987*4bdc9457SAndroid Build Coastguard Worker uint16_t* o = (uint16_t*) output;
2988*4bdc9457SAndroid Build Coastguard Worker for (; n >= 16 * sizeof(float); n -= 16 * sizeof(float)) {
2989*4bdc9457SAndroid Build Coastguard Worker const __m256 vf0 = _mm256_loadu_ps(input);
2990*4bdc9457SAndroid Build Coastguard Worker const __m256 vf1 = _mm256_loadu_ps(input + 8);
2991*4bdc9457SAndroid Build Coastguard Worker input += 16;
2992*4bdc9457SAndroid Build Coastguard Worker
2993*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vf0, _MM_FROUND_NO_EXC));
2994*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) (o + 8), _mm256_cvtps_ph(vf1, _MM_FROUND_NO_EXC));
2995*4bdc9457SAndroid Build Coastguard Worker o += 16;
2996*4bdc9457SAndroid Build Coastguard Worker }
2997*4bdc9457SAndroid Build Coastguard Worker for (; n >= 8 * sizeof(float); n -= 8 * sizeof(float)) {
2998*4bdc9457SAndroid Build Coastguard Worker const __m256 vf = _mm256_loadu_ps(input);
2999*4bdc9457SAndroid Build Coastguard Worker input += 8;
3000*4bdc9457SAndroid Build Coastguard Worker
3001*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si128((__m128i*) o, _mm256_cvtps_ph(vf, _MM_FROUND_NO_EXC));
3002*4bdc9457SAndroid Build Coastguard Worker o += 8;
3003*4bdc9457SAndroid Build Coastguard Worker }
3004*4bdc9457SAndroid Build Coastguard Worker if XNN_UNLIKELY(n != 0) {
3005*4bdc9457SAndroid Build Coastguard Worker assert(n >= 1 * sizeof(float));
3006*4bdc9457SAndroid Build Coastguard Worker assert(n <= 7 * sizeof(float));
3007*4bdc9457SAndroid Build Coastguard Worker const __m256i vmask = _mm256_loadu_si256((const __m256i*) ((uintptr_t) ¶ms->f16c.mask_table[7] - n));
3008*4bdc9457SAndroid Build Coastguard Worker
3009*4bdc9457SAndroid Build Coastguard Worker const __m256 vf = _mm256_maskload_ps(input, vmask);
3010*4bdc9457SAndroid Build Coastguard Worker
3011*4bdc9457SAndroid Build Coastguard Worker __m128 vf_lo = _mm256_castps256_ps128(vf);
3012*4bdc9457SAndroid Build Coastguard Worker if (n & (4 * sizeof(float))) {
3013*4bdc9457SAndroid Build Coastguard Worker _mm_storel_epi64((__m128i*) o, _mm_cvtps_ph(vf_lo, _MM_FROUND_NO_EXC));
3014*4bdc9457SAndroid Build Coastguard Worker vf_lo = _mm256_extractf128_ps(vf, 1);
3015*4bdc9457SAndroid Build Coastguard Worker o += 4;
3016*4bdc9457SAndroid Build Coastguard Worker }
3017*4bdc9457SAndroid Build Coastguard Worker __m128i vh = _mm_cvtps_ph(vf_lo, _MM_FROUND_NO_EXC);
3018*4bdc9457SAndroid Build Coastguard Worker if (n & (2 * sizeof(float))) {
3019*4bdc9457SAndroid Build Coastguard Worker _mm_storeu_si32(o, vh);
3020*4bdc9457SAndroid Build Coastguard Worker vh = _mm_srli_epi64(vh, 32);
3021*4bdc9457SAndroid Build Coastguard Worker o += 2;
3022*4bdc9457SAndroid Build Coastguard Worker }
3023*4bdc9457SAndroid Build Coastguard Worker if (n & (1 * sizeof(float))) {
3024*4bdc9457SAndroid Build Coastguard Worker *((uint16_t*) o) = _mm_extract_epi16(vh, 0);
3025*4bdc9457SAndroid Build Coastguard Worker }
3026*4bdc9457SAndroid Build Coastguard Worker }
3027*4bdc9457SAndroid Build Coastguard Worker }
3028