1 /*
2 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <arm_neon.h>
13
14 #include "config/aom_config.h"
15 #include "config/av1_rtcd.h"
16
17 #include "aom_dsp/aom_dsp_common.h"
18 #include "aom_dsp/arm/mem_neon.h"
19 #include "aom_ports/mem.h"
20 #include "av1/common/arm/convolve_neon.h"
21 #include "av1/common/convolve.h"
22 #include "av1/common/filter.h"
23
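// Note: each 16-byte row of this table gathers four overlapping 4-sample
// windows (starting at offsets n, n + 1, n + 2, n + 3) from a source vector,
// which is the layout SDOT needs to form one 4-tap partial sum per 32-bit
// lane.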
24 DECLARE_ALIGNED(16, static const uint8_t, kDotProdPermuteTbl[48]) = {
25 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
26 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
27 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
28 };
29
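// Note: this table is used with vqtbl2q_s8, where indices 0-15 select bytes
// from the first input vector and indices 16-31 from the second, so each row
// below shifts an existing transposed block left and appends columns taken
// from the newly loaded block.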
30 DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = {
31 // Shift left and insert new last column in transposed 4x4 block.
32 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
33 // Shift left and insert two new columns in transposed 4x4 block.
34 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
35 // Shift left and insert three new columns in transposed 4x4 block.
36 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
37 };
38
39 static inline int16x4_t convolve12_4_x(uint8x16_t samples,
40 const int8x16_t filter,
41 const uint8x16x3_t permute_tbl) {
42 // Transform sample range to [-128, 127] for 8-bit signed dot product.
43 int8x16_t samples_128 =
44 vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
45
46 // Permute samples ready for dot product.
47 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
48 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
49 // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
50 int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
51 vqtbl1q_s8(samples_128, permute_tbl.val[1]),
52 vqtbl1q_s8(samples_128, permute_tbl.val[2]) };
53
54 // Dot product constants:
55 // Accumulate into 128 << FILTER_BITS to account for range transform.
56 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
57 // right shift by FILTER_BITS - instead of a first rounding right shift by
58 // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
59 // ROUND0_BITS.
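// Worked example, assuming the usual AV1 values FILTER_BITS == 7 and
// ROUND0_BITS == 3: the filter taps sum to 1 << FILTER_BITS == 128, so
// subtracting 128 from every sample lowers the convolution result by exactly
// 128 << FILTER_BITS == 16384; adding that back plus the shim
// 1 << (ROUND0_BITS - 1) == 4 gives acc == 16388.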
60 int32x4_t acc =
61 vdupq_n_s32((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1))));
62
63 int32x4_t sum = vdotq_laneq_s32(acc, perm_samples[0], filter, 0);
64 sum = vdotq_laneq_s32(sum, perm_samples[1], filter, 1);
65 sum = vdotq_laneq_s32(sum, perm_samples[2], filter, 2);
66
67 return vqrshrn_n_s32(sum, FILTER_BITS);
68 }
69
70 static inline uint8x8_t convolve12_8_x(uint8x16_t samples[2],
71 const int8x16_t filter,
72 const uint8x16x3_t permute_tbl) {
73 // Transform sample range to [-128, 127] for 8-bit signed dot product.
74 int8x16_t samples_128[2] = {
75 vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))),
76 vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128)))
77 };
78
79 // Permute samples ready for dot product.
80 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
81 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
82 // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
83 // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
84 int8x16_t perm_samples[4] = { vqtbl1q_s8(samples_128[0], permute_tbl.val[0]),
85 vqtbl1q_s8(samples_128[0], permute_tbl.val[1]),
86 vqtbl1q_s8(samples_128[0], permute_tbl.val[2]),
87 vqtbl1q_s8(samples_128[1],
88 permute_tbl.val[2]) };
89
90 // Dot product constants:
91 // Accumulate into 128 << FILTER_BITS to account for range transform.
92 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
93 // right shift by FILTER_BITS - instead of a first rounding right shift by
94 // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
95 // ROUND0_BITS.
96 int32x4_t acc =
97 vdupq_n_s32((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1))));
98
99 int32x4_t sum0123 = vdotq_laneq_s32(acc, perm_samples[0], filter, 0);
100 sum0123 = vdotq_laneq_s32(sum0123, perm_samples[1], filter, 1);
101 sum0123 = vdotq_laneq_s32(sum0123, perm_samples[2], filter, 2);
102
103 int32x4_t sum4567 = vdotq_laneq_s32(acc, perm_samples[1], filter, 0);
104 sum4567 = vdotq_laneq_s32(sum4567, perm_samples[2], filter, 1);
105 sum4567 = vdotq_laneq_s32(sum4567, perm_samples[3], filter, 2);
106
107 // Narrow and re-pack.
108 int16x8_t sum_s16 = vcombine_s16(vqrshrn_n_s32(sum0123, FILTER_BITS),
109 vqrshrn_n_s32(sum4567, FILTER_BITS));
110 return vqmovun_s16(sum_s16);
111 }
112
113 static inline void convolve_x_sr_12tap_neon_dotprod(
114 const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
115 int h, const int16_t *x_filter_ptr) {
116 // The no-op filter should never be used here.
117 assert(x_filter_ptr[5] != 128);
118
119 const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr);
120 const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8);
121 const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0));
122 const int8x16_t filter =
123 vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15));
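// The 12-tap filter is zero-padded to 16 taps so it fits one int8x16_t; only
// lanes 0-2 (taps 0-11) are referenced by the vdotq_laneq_s32 calls below.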
124
125 const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
126
127 if (w <= 4) {
128 do {
129 uint8x16_t s0, s1, s2, s3;
130 load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
131
132 int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl);
133 int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl);
134 int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl);
135 int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl);
136
137 uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
138 uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));
139
140 store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
141 store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
142
143 dst += 4 * dst_stride;
144 src += 4 * src_stride;
145 h -= 4;
146 } while (h != 0);
147 } else {
148 do {
149 const uint8_t *s = src;
150 uint8_t *d = dst;
151 int width = w;
152
153 do {
154 uint8x16_t s0[2], s1[2], s2[2], s3[2];
155 load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
156 load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
157
158 uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl);
159 uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl);
160 uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl);
161 uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl);
162
163 store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3);
164
165 s += 8;
166 d += 8;
167 width -= 8;
168 } while (width != 0);
169 src += 4 * src_stride;
170 dst += 4 * dst_stride;
171 h -= 4;
172 } while (h != 0);
173 }
174 }
175
176 static inline int16x4_t convolve4_4_x(const uint8x16_t samples,
177 const int8x8_t filters,
178 const uint8x16_t permute_tbl) {
179 // Transform sample range to [-128, 127] for 8-bit signed dot product.
180 int8x16_t samples_128 =
181 vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
182
183 // Permute samples ready for dot product.
184 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
185 int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl);
186
187 // Dot product constants:
188 // Accumulate into 128 << FILTER_BITS to account for range transform.
189 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
190 // right shift by FILTER_BITS - instead of a first rounding right shift by
191 // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
192 // ROUND0_BITS. Halve the total because we halved the filter values.
193 int32x4_t acc =
194 vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2);
195 int32x4_t sum = vdotq_lane_s32(acc, perm_samples, filters, 0);
196
197 // Further narrowing and packing is performed by the caller.
198 return vmovn_s32(sum);
199 }
200
201 static inline uint8x8_t convolve4_8_x(const uint8x16_t samples,
202 const int8x8_t filters,
203 const uint8x16x2_t permute_tbl) {
204 // Transform sample range to [-128, 127] for 8-bit signed dot product.
205 int8x16_t samples_128 =
206 vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
207
208 // Permute samples ready for dot product.
209 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
210 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
211 int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
212 vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
213
214 // Dot product constants:
215 // Accumulate into 128 << FILTER_BITS to account for range transform.
216 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
217 // right shift by FILTER_BITS - instead of a first rounding right shift by
218 // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
219 // ROUND0_BITS. Halve the total because we halved the filter values.
220 int32x4_t acc =
221 vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2);
222
223 int32x4_t sum0123 = vdotq_lane_s32(acc, perm_samples[0], filters, 0);
224 int32x4_t sum4567 = vdotq_lane_s32(acc, perm_samples[1], filters, 0);
225
226 // Narrow and re-pack.
227 int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
228 // We halved the filter values so -1 from right shift.
229 return vqrshrun_n_s16(sum, FILTER_BITS - 1);
230 }
231
232 static inline void convolve_x_sr_4tap_neon_dotprod(
233 const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
234 ptrdiff_t dst_stride, int width, int height, const int16_t *filter_x) {
235 const int16x4_t x_filter = vld1_s16(filter_x + 2);
236 // All 4-tap and bilinear filter values are even, so halve them to reduce
237 // intermediate precision requirements.
238 const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
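// Only lane 0 of the zero-padded 8-tap vector is referenced by the dot
// products; the halving is undone later by narrowing with a right shift of
// FILTER_BITS - 1 instead of FILTER_BITS.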
239
240 if (width == 4) {
241 const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl);
242
243 do {
244 uint8x16_t s0, s1, s2, s3;
245 load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
246
247 int16x4_t t0 = convolve4_4_x(s0, filter, permute_tbl);
248 int16x4_t t1 = convolve4_4_x(s1, filter, permute_tbl);
249 int16x4_t t2 = convolve4_4_x(s2, filter, permute_tbl);
250 int16x4_t t3 = convolve4_4_x(s3, filter, permute_tbl);
251 // We halved the filter values so -1 from right shift.
252 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS - 1);
253 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS - 1);
254
255 store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
256 store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
257
258 src += 4 * src_stride;
259 dst += 4 * dst_stride;
260 height -= 4;
261 } while (height != 0);
262 } else {
263 const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
264
265 do {
266 const uint8_t *s = src;
267 uint8_t *d = dst;
268 int w = width;
269
270 do {
271 uint8x16_t s0, s1, s2, s3;
272 load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
273
274 uint8x8_t d0 = convolve4_8_x(s0, filter, permute_tbl);
275 uint8x8_t d1 = convolve4_8_x(s1, filter, permute_tbl);
276 uint8x8_t d2 = convolve4_8_x(s2, filter, permute_tbl);
277 uint8x8_t d3 = convolve4_8_x(s3, filter, permute_tbl);
278
279 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
280
281 s += 8;
282 d += 8;
283 w -= 8;
284 } while (w != 0);
285 src += 4 * src_stride;
286 dst += 4 * dst_stride;
287 height -= 4;
288 } while (height != 0);
289 }
290 }
291
292 static inline uint8x8_t convolve8_8_x(uint8x16_t samples, const int8x8_t filter,
293 const uint8x16x3_t permute_tbl) {
294 // Transform sample range to [-128, 127] for 8-bit signed dot product.
295 int8x16_t samples_128 =
296 vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
297
298 // Permute samples ready for dot product.
299 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
300 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
301 // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
302 int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
303 vqtbl1q_s8(samples_128, permute_tbl.val[1]),
304 vqtbl1q_s8(samples_128, permute_tbl.val[2]) };
305
306 // Dot product constants:
307 // Accumulate into 128 << FILTER_BITS to account for range transform.
308 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
309 // right shift by FILTER_BITS - instead of a first rounding right shift by
310 // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
311 // ROUND0_BITS. Halve the total because we halved the filter values.
312 int32x4_t acc =
313 vdupq_n_s32(((128 << FILTER_BITS) + (1 << ((ROUND0_BITS - 1)))) / 2);
314
315 int32x4_t sum0123 = vdotq_lane_s32(acc, perm_samples[0], filter, 0);
316 sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filter, 1);
317
318 int32x4_t sum4567 = vdotq_lane_s32(acc, perm_samples[1], filter, 0);
319 sum4567 = vdotq_lane_s32(sum4567, perm_samples[2], filter, 1);
320
321 // Narrow and re-pack.
322 int16x8_t sum_s16 = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));
323 // We halved the convolution filter values so -1 from the right shift.
324 return vqrshrun_n_s16(sum_s16, FILTER_BITS - 1);
325 }
326
327 void av1_convolve_x_sr_neon_dotprod(const uint8_t *src, int src_stride,
328 uint8_t *dst, int dst_stride, int w, int h,
329 const InterpFilterParams *filter_params_x,
330 const int subpel_x_qn,
331 ConvolveParams *conv_params) {
332 if (w == 2 || h == 2) {
333 av1_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_x,
334 subpel_x_qn, conv_params);
335 return;
336 }
337
338 const uint8_t horiz_offset = filter_params_x->taps / 2 - 1;
339 src -= horiz_offset;
340
341 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
342 filter_params_x, subpel_x_qn & SUBPEL_MASK);
343
344 int filter_taps = get_filter_tap(filter_params_x, subpel_x_qn & SUBPEL_MASK);
345
346 if (filter_taps > 8) {
347 convolve_x_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h,
348 x_filter_ptr);
349 return;
350 }
351
352 if (filter_taps <= 4) {
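// The 4-tap kernels sit at taps 2..5 of the 8-tap filter array (the helper
// loads filter_x + 2), so the source pointer is advanced by 2 here to
// re-centre the window after the 8-tap horiz_offset applied above.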
353 convolve_x_sr_4tap_neon_dotprod(src + 2, src_stride, dst, dst_stride, w, h,
354 x_filter_ptr);
355 return;
356 }
357
358 const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
359
360 const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
361 // Filter values are even, so halve to reduce intermediate precision reqs.
362 const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
363
364 do {
365 int width = w;
366 const uint8_t *s = src;
367 uint8_t *d = dst;
368
369 do {
370 uint8x16_t s0, s1, s2, s3;
371 load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
372
373 uint8x8_t d0 = convolve8_8_x(s0, x_filter, permute_tbl);
374 uint8x8_t d1 = convolve8_8_x(s1, x_filter, permute_tbl);
375 uint8x8_t d2 = convolve8_8_x(s2, x_filter, permute_tbl);
376 uint8x8_t d3 = convolve8_8_x(s3, x_filter, permute_tbl);
377
378 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
379
380 s += 8;
381 d += 8;
382 width -= 8;
383 } while (width != 0);
384 src += 4 * src_stride;
385 dst += 4 * dst_stride;
386 h -= 4;
387 } while (h != 0);
388 }
389
390 static inline void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
391 int8x8_t a3, int8x16_t *b) {
392 // Transpose 8-bit elements and concatenate result rows as follows:
393 // a0: 00, 01, 02, 03, XX, XX, XX, XX
394 // a1: 10, 11, 12, 13, XX, XX, XX, XX
395 // a2: 20, 21, 22, 23, XX, XX, XX, XX
396 // a3: 30, 31, 32, 33, XX, XX, XX, XX
397 //
398 // b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
399
400 int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
401 int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
402 int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
403 int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
404
405 int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
406 int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];
407
408 int16x8_t a0123 =
409 vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23)).val[0];
410
411 *b = vreinterpretq_s8_s16(a0123);
412 }
413
414 static inline void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
415 int8x8_t a3, int8x16_t *b0,
416 int8x16_t *b1) {
417 // Transpose 8-bit elements and concatenate result rows as follows:
418 // a0: 00, 01, 02, 03, 04, 05, 06, 07
419 // a1: 10, 11, 12, 13, 14, 15, 16, 17
420 // a2: 20, 21, 22, 23, 24, 25, 26, 27
421 // a3: 30, 31, 32, 33, 34, 35, 36, 37
422 //
423 // b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
424 // b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
425
426 int8x16_t a0q = vcombine_s8(a0, vdup_n_s8(0));
427 int8x16_t a1q = vcombine_s8(a1, vdup_n_s8(0));
428 int8x16_t a2q = vcombine_s8(a2, vdup_n_s8(0));
429 int8x16_t a3q = vcombine_s8(a3, vdup_n_s8(0));
430
431 int8x16_t a01 = vzipq_s8(a0q, a1q).val[0];
432 int8x16_t a23 = vzipq_s8(a2q, a3q).val[0];
433
434 int16x8x2_t a0123 =
435 vzipq_s16(vreinterpretq_s16_s8(a01), vreinterpretq_s16_s8(a23));
436
437 *b0 = vreinterpretq_s8_s16(a0123.val[0]);
438 *b1 = vreinterpretq_s8_s16(a0123.val[1]);
439 }
440
441 static inline int16x4_t convolve12_4_y(const int8x16_t s0, const int8x16_t s1,
442 const int8x16_t s2,
443 const int8x8_t filters_0_7,
444 const int8x8_t filters_4_11) {
445 // The sample range transform and permutation are performed by the caller.
446 // Accumulate into 128 << FILTER_BITS to account for range transform.
447 const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS);
448 int32x4_t sum = vdotq_lane_s32(acc, s0, filters_0_7, 0);
449 sum = vdotq_lane_s32(sum, s1, filters_0_7, 1);
450 sum = vdotq_lane_s32(sum, s2, filters_4_11, 1);
451
452 // Further narrowing and packing is performed by the caller.
453 return vqmovn_s32(sum);
454 }
455
456 static inline uint8x8_t convolve12_8_y(
457 const int8x16_t s0_lo, const int8x16_t s0_hi, const int8x16_t s1_lo,
458 const int8x16_t s1_hi, const int8x16_t s2_lo, const int8x16_t s2_hi,
459 const int8x8_t filters_0_7, const int8x8_t filters_4_11) {
460 // The sample range transform and permutation are performed by the caller.
461 // Accumulate into 128 << FILTER_BITS to account for range transform.
462 const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS);
463
464 int32x4_t sum0123 = vdotq_lane_s32(acc, s0_lo, filters_0_7, 0);
465 sum0123 = vdotq_lane_s32(sum0123, s1_lo, filters_0_7, 1);
466 sum0123 = vdotq_lane_s32(sum0123, s2_lo, filters_4_11, 1);
467
468 int32x4_t sum4567 = vdotq_lane_s32(acc, s0_hi, filters_0_7, 0);
469 sum4567 = vdotq_lane_s32(sum4567, s1_hi, filters_0_7, 1);
470 sum4567 = vdotq_lane_s32(sum4567, s2_hi, filters_4_11, 1);
471
472 // Narrow and re-pack.
473 int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567));
474 return vqrshrun_n_s16(sum, FILTER_BITS);
475 }
476
477 static inline void convolve_y_sr_12tap_neon_dotprod(
478 const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride,
479 int w, int h, const int16_t *y_filter_ptr) {
480 // The no-op filter should never be used here.
481 assert(y_filter_ptr[5] != 128);
482
483 const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr));
484 const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4));
485
486 const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
487
488 if (w == 4) {
489 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA;
490 load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7,
491 &t8, &t9, &tA);
492 src_ptr += 11 * src_stride;
493
494 // Transform sample range to [-128, 127] for 8-bit signed dot product.
495 int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
496 int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
497 int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
498 int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
499 int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
500 int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
501 int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
502 int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
503 int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
504 int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
505 int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128)));
506
507 int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A;
508 transpose_concat_4x4(s0, s1, s2, s3, &s0123);
509 transpose_concat_4x4(s1, s2, s3, s4, &s1234);
510 transpose_concat_4x4(s2, s3, s4, s5, &s2345);
511 transpose_concat_4x4(s3, s4, s5, s6, &s3456);
512 transpose_concat_4x4(s4, s5, s6, s7, &s4567);
513 transpose_concat_4x4(s5, s6, s7, s8, &s5678);
514 transpose_concat_4x4(s6, s7, s8, s9, &s6789);
515 transpose_concat_4x4(s7, s8, s9, sA, &s789A);
516
517 do {
518 uint8x8_t tB, tC, tD, tE;
519 load_u8_8x4(src_ptr, src_stride, &tB, &tC, &tD, &tE);
520
521 int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128)));
522 int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128)));
523 int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128)));
524 int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128)));
525
526 int8x16_t s89AB, s9ABC, sABCD, sBCDE;
527 transpose_concat_4x4(sB, sC, sD, sE, &sBCDE);
528
529 // Merge new data into block from previous iteration.
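// vqtbl2q_s8 with kDotProdMergeBlockTbl takes bytes 0-15 from s789A and bytes
// 16-31 from sBCDE, so the three intermediate transposed blocks (s89AB, s9ABC,
// sABCD) are produced without additional transposes.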
530 int8x16x2_t samples_LUT = { { s789A, sBCDE } };
531 s89AB = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
532 s9ABC = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
533 sABCD = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
534
535 int16x4_t d0 =
536 convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11);
537 int16x4_t d1 =
538 convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11);
539 int16x4_t d2 =
540 convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11);
541 int16x4_t d3 =
542 convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11);
543 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
544 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
545
546 store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
547 store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
548
549 // Prepare block for next iteration - re-using as much as possible.
550 // Shuffle everything up four rows.
551 s0123 = s4567;
552 s1234 = s5678;
553 s2345 = s6789;
554 s3456 = s789A;
555 s4567 = s89AB;
556 s5678 = s9ABC;
557 s6789 = sABCD;
558 s789A = sBCDE;
559
560 src_ptr += 4 * src_stride;
561 dst_ptr += 4 * dst_stride;
562 h -= 4;
563 } while (h != 0);
564 } else {
565 do {
566 int height = h;
567 const uint8_t *s = src_ptr;
568 uint8_t *d = dst_ptr;
569
570 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA;
571 load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8,
572 &t9, &tA);
573 s += 11 * src_stride;
574
575 // Transform sample range to [-128, 127] for 8-bit signed dot product.
576 int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
577 int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
578 int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
579 int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
580 int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
581 int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
582 int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
583 int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
584 int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
585 int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
586 int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128)));
587
588 // This operation combines a conventional transpose and the sample
589 // permute (see horizontal case) required before computing the dot
590 // product.
591 int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
592 s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
593 s6789_hi, s789A_lo, s789A_hi;
594 transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
595 transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
596 transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
597 transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
598 transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);
599 transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi);
600 transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi);
601 transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);
602
603 do {
604 uint8x8_t tB, tC, tD, tE;
605 load_u8_8x4(s, src_stride, &tB, &tC, &tD, &tE);
606
607 int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128)));
608 int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128)));
609 int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128)));
610 int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128)));
611
612 int8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi,
613 sBCDE_lo, sBCDE_hi;
614 transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi);
615
616 // Merge new data into block from previous iteration.
617 int8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } };
618 s89AB_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]);
619 s9ABC_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]);
620 sABCD_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]);
621
622 int8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } };
623 s89AB_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]);
624 s9ABC_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]);
625 sABCD_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]);
626
627 uint8x8_t d0 =
628 convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo,
629 s89AB_hi, filter_0_7, filter_4_11);
630 uint8x8_t d1 =
631 convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo,
632 s9ABC_hi, filter_0_7, filter_4_11);
633 uint8x8_t d2 =
634 convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo,
635 sABCD_hi, filter_0_7, filter_4_11);
636 uint8x8_t d3 =
637 convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo,
638 sBCDE_hi, filter_0_7, filter_4_11);
639
640 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
641
642 // Prepare block for next iteration - re-using as much as possible.
643 // Shuffle everything up four rows.
644 s0123_lo = s4567_lo;
645 s0123_hi = s4567_hi;
646 s1234_lo = s5678_lo;
647 s1234_hi = s5678_hi;
648 s2345_lo = s6789_lo;
649 s2345_hi = s6789_hi;
650 s3456_lo = s789A_lo;
651 s3456_hi = s789A_hi;
652 s4567_lo = s89AB_lo;
653 s4567_hi = s89AB_hi;
654 s5678_lo = s9ABC_lo;
655 s5678_hi = s9ABC_hi;
656 s6789_lo = sABCD_lo;
657 s6789_hi = sABCD_hi;
658 s789A_lo = sBCDE_lo;
659 s789A_hi = sBCDE_hi;
660
661 s += 4 * src_stride;
662 d += 4 * dst_stride;
663 height -= 4;
664 } while (height != 0);
665 src_ptr += 8;
666 dst_ptr += 8;
667 w -= 8;
668 } while (w != 0);
669 }
670 }
671
672 static inline int16x4_t convolve8_4_y(const int8x16_t s0, const int8x16_t s1,
673 const int8x8_t filters) {
674 // The sample range transform and permutation are performed by the caller.
675 // Accumulate into 128 << FILTER_BITS to account for range transform.
676 const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS);
677 int32x4_t sum = vdotq_lane_s32(acc, s0, filters, 0);
678 sum = vdotq_lane_s32(sum, s1, filters, 1);
679
680 // Further narrowing and packing is performed by the caller.
681 return vqmovn_s32(sum);
682 }
683
684 static inline uint8x8_t convolve8_8_y(const int8x16_t s0_lo,
685 const int8x16_t s0_hi,
686 const int8x16_t s1_lo,
687 const int8x16_t s1_hi,
688 const int8x8_t filters) {
689 // The sample range transform and permutation are performed by the caller.
690 // Accumulate into 128 << FILTER_BITS to account for range transform.
691 const int32x4_t acc = vdupq_n_s32(128 << FILTER_BITS);
692
693 int32x4_t sum0123 = vdotq_lane_s32(acc, s0_lo, filters, 0);
694 sum0123 = vdotq_lane_s32(sum0123, s1_lo, filters, 1);
695
696 int32x4_t sum4567 = vdotq_lane_s32(acc, s0_hi, filters, 0);
697 sum4567 = vdotq_lane_s32(sum4567, s1_hi, filters, 1);
698
699 // Narrow and re-pack.
700 int16x8_t sum = vcombine_s16(vqmovn_s32(sum0123), vqmovn_s32(sum4567));
701 return vqrshrun_n_s16(sum, FILTER_BITS);
702 }
703
704 static inline void convolve_y_sr_8tap_neon_dotprod(
705 const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride,
706 int w, int h, const int16_t *y_filter_ptr) {
707 const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr));
708
709 const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
710
711 if (w == 4) {
712 uint8x8_t t0, t1, t2, t3, t4, t5, t6;
713 load_u8_8x7(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
714 src_ptr += 7 * src_stride;
715
716 // Transform sample range to [-128, 127] for 8-bit signed dot product.
717 int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
718 int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
719 int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
720 int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
721 int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
722 int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
723 int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
724
725 int8x16_t s0123, s1234, s2345, s3456;
726 transpose_concat_4x4(s0, s1, s2, s3, &s0123);
727 transpose_concat_4x4(s1, s2, s3, s4, &s1234);
728 transpose_concat_4x4(s2, s3, s4, s5, &s2345);
729 transpose_concat_4x4(s3, s4, s5, s6, &s3456);
730
731 do {
732 uint8x8_t t7, t8, t9, t10;
733 load_u8_8x4(src_ptr, src_stride, &t7, &t8, &t9, &t10);
734
735 int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
736 int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
737 int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
738 int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
739
740 int8x16_t s4567, s5678, s6789, s78910;
741 transpose_concat_4x4(s7, s8, s9, s10, &s78910);
742
743 // Merge new data into block from previous iteration.
744 int8x16x2_t samples_LUT = { { s3456, s78910 } };
745 s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
746 s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
747 s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
748
749 int16x4_t d0 = convolve8_4_y(s0123, s4567, filter);
750 int16x4_t d1 = convolve8_4_y(s1234, s5678, filter);
751 int16x4_t d2 = convolve8_4_y(s2345, s6789, filter);
752 int16x4_t d3 = convolve8_4_y(s3456, s78910, filter);
753 uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
754 uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
755
756 store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
757 store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
758
759 // Prepare block for next iteration - re-using as much as possible.
760 // Shuffle everything up four rows.
761 s0123 = s4567;
762 s1234 = s5678;
763 s2345 = s6789;
764 s3456 = s78910;
765
766 src_ptr += 4 * src_stride;
767 dst_ptr += 4 * dst_stride;
768 h -= 4;
769 } while (h != 0);
770 } else {
771 do {
772 int height = h;
773 const uint8_t *s = src_ptr;
774 uint8_t *d = dst_ptr;
775
776 uint8x8_t t0, t1, t2, t3, t4, t5, t6;
777 load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
778 s += 7 * src_stride;
779
780 // Transform sample range to [-128, 127] for 8-bit signed dot product.
781 int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
782 int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
783 int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
784 int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
785 int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
786 int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
787 int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
788
789 // This operation combines a conventional transpose and the sample
790 // permute (see horizontal case) required before computing the dot
791 // product.
792 int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
793 s3456_lo, s3456_hi;
794 transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
795 transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
796 transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
797 transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
798
799 do {
800 uint8x8_t t7, t8, t9, t10;
801 load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
802
803 int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
804 int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
805 int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
806 int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, vdup_n_u8(128)));
807
808 int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
809 s78910_lo, s78910_hi;
810 transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
811
812 // Merge new data into block from previous iteration.
813 int8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } };
814 s4567_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]);
815 s5678_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]);
816 s6789_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]);
817
818 int8x16x2_t samples_LUT_hi = { { s3456_hi, s78910_hi } };
819 s4567_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]);
820 s5678_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]);
821 s6789_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]);
822
823 uint8x8_t d0 =
824 convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter);
825 uint8x8_t d1 =
826 convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter);
827 uint8x8_t d2 =
828 convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter);
829 uint8x8_t d3 =
830 convolve8_8_y(s3456_lo, s3456_hi, s78910_lo, s78910_hi, filter);
831
832 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
833
834 // Prepare block for next iteration - re-using as much as possible.
835 // Shuffle everything up four rows.
836 s0123_lo = s4567_lo;
837 s0123_hi = s4567_hi;
838 s1234_lo = s5678_lo;
839 s1234_hi = s5678_hi;
840 s2345_lo = s6789_lo;
841 s2345_hi = s6789_hi;
842 s3456_lo = s78910_lo;
843 s3456_hi = s78910_hi;
844
845 s += 4 * src_stride;
846 d += 4 * dst_stride;
847 height -= 4;
848 } while (height != 0);
849 src_ptr += 8;
850 dst_ptr += 8;
851 w -= 8;
852 } while (w != 0);
853 }
854 }
855
856 void av1_convolve_y_sr_neon_dotprod(const uint8_t *src, int src_stride,
857 uint8_t *dst, int dst_stride, int w, int h,
858 const InterpFilterParams *filter_params_y,
859 const int subpel_y_qn) {
860 if (w == 2 || h == 2) {
861 av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, filter_params_y,
862 subpel_y_qn);
863 return;
864 }
865
866 const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
867
868 if (y_filter_taps <= 6) {
869 av1_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h,
870 filter_params_y, subpel_y_qn);
871 return;
872 }
873
874 const int vert_offset = y_filter_taps / 2 - 1;
875 src -= vert_offset * src_stride;
876
877 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
878 filter_params_y, subpel_y_qn & SUBPEL_MASK);
879
880 if (y_filter_taps > 8) {
881 convolve_y_sr_12tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h,
882 y_filter_ptr);
883 return;
884 }
885
886 convolve_y_sr_8tap_neon_dotprod(src, src_stride, dst, dst_stride, w, h,
887 y_filter_ptr);
888 }
889
890 static inline int16x4_t convolve12_4_2d_h(uint8x16_t samples,
891 const int8x16_t filters,
892 const int32x4_t horiz_const,
893 const uint8x16x3_t permute_tbl) {
894 // Transform sample range to [-128, 127] for 8-bit signed dot product.
895 int8x16_t samples_128 =
896 vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
897
898 // Permute samples ready for dot product.
899 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
900 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
901 // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
902 int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
903 vqtbl1q_s8(samples_128, permute_tbl.val[1]),
904 vqtbl1q_s8(samples_128, permute_tbl.val[2]) };
905
906 // Accumulate dot product into 'correction' to account for range transform.
907 int32x4_t sum = vdotq_laneq_s32(horiz_const, perm_samples[0], filters, 0);
908 sum = vdotq_laneq_s32(sum, perm_samples[1], filters, 1);
909 sum = vdotq_laneq_s32(sum, perm_samples[2], filters, 2);
910
911 // Narrow and re-pack.
912 return vshrn_n_s32(sum, ROUND0_BITS);
913 }
914
915 static inline int16x8_t convolve12_8_2d_h(uint8x16_t samples[2],
916 const int8x16_t filters,
917 const int32x4_t correction,
918 const uint8x16x3_t permute_tbl) {
919 // Transform sample range to [-128, 127] for 8-bit signed dot product.
920 int8x16_t samples_128[2] = {
921 vreinterpretq_s8_u8(vsubq_u8(samples[0], vdupq_n_u8(128))),
922 vreinterpretq_s8_u8(vsubq_u8(samples[1], vdupq_n_u8(128)))
923 };
924
925 // Permute samples ready for dot product.
926 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
927 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
928 // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
929 // {12, 13, 14, 15, 13, 14, 15, 16, 14, 15, 16, 17, 15, 16, 17, 18 }
930 int8x16_t perm_samples[4] = { vqtbl1q_s8(samples_128[0], permute_tbl.val[0]),
931 vqtbl1q_s8(samples_128[0], permute_tbl.val[1]),
932 vqtbl1q_s8(samples_128[0], permute_tbl.val[2]),
933 vqtbl1q_s8(samples_128[1],
934 permute_tbl.val[2]) };
935
936 // Accumulate dot product into 'correction' to account for range transform.
937 int32x4_t sum0123 = vdotq_laneq_s32(correction, perm_samples[0], filters, 0);
938 sum0123 = vdotq_laneq_s32(sum0123, perm_samples[1], filters, 1);
939 sum0123 = vdotq_laneq_s32(sum0123, perm_samples[2], filters, 2);
940
941 int32x4_t sum4567 = vdotq_laneq_s32(correction, perm_samples[1], filters, 0);
942 sum4567 = vdotq_laneq_s32(sum4567, perm_samples[2], filters, 1);
943 sum4567 = vdotq_laneq_s32(sum4567, perm_samples[3], filters, 2);
944
945 // Narrow and re-pack.
946 return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS),
947 vshrn_n_s32(sum4567, ROUND0_BITS));
948 }
949
950 static inline void convolve_2d_sr_horiz_12tap_neon_dotprod(
951 const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
952 const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
953 const int16x4_t x_filter_8_11) {
954 // The no-op filter should never be used here.
955 assert(vgetq_lane_s16(x_filter_0_7, 5) != 128);
956
957 const int bd = 8;
958
959 // Narrow filter values to 8-bit.
960 const int16x8x2_t x_filter_s16 = {
961 { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
962 };
963 const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
964 vmovn_s16(x_filter_s16.val[1]));
965
966 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
967 // shifts - which are generally faster than rounding shifts on modern CPUs.
968 const int32_t horiz_const =
969 ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
970 // Dot product constants.
971 const int32x4_t correction = vdupq_n_s32((128 << FILTER_BITS) + horiz_const);
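// With the usual bd == 8, FILTER_BITS == 7 and ROUND0_BITS == 3 this works
// out to (1 << 14) + (1 << 2) + (128 << 7) == 32772.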
972 const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
973
974 if (w <= 4) {
975 do {
976 uint8x16_t s0, s1, s2, s3;
977 load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
978
979 int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, permute_tbl);
980 int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, correction, permute_tbl);
981 int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, correction, permute_tbl);
982 int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, correction, permute_tbl);
983
984 store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
985
986 src_ptr += 4 * src_stride;
987 dst_ptr += 4 * dst_stride;
988 h -= 4;
989 } while (h > 4);
990
991 do {
992 uint8x16_t s0 = vld1q_u8(src_ptr);
993 int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, permute_tbl);
994 vst1_s16(dst_ptr, d0);
995
996 src_ptr += src_stride;
997 dst_ptr += dst_stride;
998 } while (--h != 0);
999
1000 } else {
1001 do {
1002 const uint8_t *s = src_ptr;
1003 int16_t *d = dst_ptr;
1004 int width = w;
1005
1006 do {
1007 uint8x16_t s0[2], s1[2], s2[2], s3[2];
1008 load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
1009 load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
1010
1011 int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, permute_tbl);
1012 int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction, permute_tbl);
1013 int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction, permute_tbl);
1014 int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction, permute_tbl);
1015
1016 store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
1017
1018 s += 8;
1019 d += 8;
1020 width -= 8;
1021 } while (width != 0);
1022 src_ptr += 4 * src_stride;
1023 dst_ptr += 4 * dst_stride;
1024 h -= 4;
1025 } while (h > 4);
1026
1027 do {
1028 const uint8_t *s = src_ptr;
1029 int16_t *d = dst_ptr;
1030 int width = w;
1031
1032 do {
1033 uint8x16_t s0[2];
1034 s0[0] = vld1q_u8(s);
1035 s0[1] = vld1q_u8(s + 4);
1036 int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, permute_tbl);
1037 vst1q_s16(d, d0);
1038
1039 s += 8;
1040 d += 8;
1041 width -= 8;
1042 } while (width != 0);
1043 src_ptr += src_stride;
1044 dst_ptr += dst_stride;
1045 } while (--h != 0);
1046 }
1047 }
1048
1049 static inline int16x4_t convolve4_4_2d_h(const uint8x16_t samples,
1050 const int8x8_t filters,
1051 const uint8x16_t permute_tbl,
1052 const int32x4_t correction) {
1053 // Transform sample range to [-128, 127] for 8-bit signed dot product.
1054 int8x16_t samples_128 =
1055 vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
1056
1057 // Permute samples ready for dot product.
1058 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
1059 int8x16_t perm_samples = vqtbl1q_s8(samples_128, permute_tbl);
1060
1061 // Accumulate into 'correction' to account for range transform.
1062 int32x4_t sum = vdotq_lane_s32(correction, perm_samples, filters, 0);
1063
1064 // We halved the convolution filter values so -1 from the right shift.
1065 return vshrn_n_s32(sum, ROUND0_BITS - 1);
1066 }
1067
1068 static inline int16x8_t convolve4_8_2d_h(const uint8x16_t samples,
1069 const int8x8_t filters,
1070 const uint8x16x2_t permute_tbl,
1071 const int32x4_t correction) {
1072 // Transform sample range to [-128, 127] for 8-bit signed dot product.
1073 int8x16_t samples_128 =
1074 vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
1075
1076 // Permute samples ready for dot product.
1077 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
1078 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
1079 int8x16_t perm_samples[2] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
1080 vqtbl1q_s8(samples_128, permute_tbl.val[1]) };
1081
1082 // Accumulate into 'correction' to account for range transform.
1083 int32x4_t sum0123 = vdotq_lane_s32(correction, perm_samples[0], filters, 0);
1084 int32x4_t sum4567 = vdotq_lane_s32(correction, perm_samples[1], filters, 0);
1085
1086 // Narrow and re-pack.
1087 // We halved the filter values so -1 from right shift.
1088 return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1),
1089 vshrn_n_s32(sum4567, ROUND0_BITS - 1));
1090 }
1091
1092 static inline void convolve_2d_sr_horiz_4tap_neon_dotprod(
1093 const uint8_t *src, ptrdiff_t src_stride, int16_t *dst,
1094 ptrdiff_t dst_stride, int w, int h, const int16_t *filter_x) {
1095 const int bd = 8;
1096 const int16x4_t x_filter = vld1_s16(filter_x + 2);
1097 // All 4-tap and bilinear filter values are even, so halve them to reduce
1098 // intermediate precision requirements.
1099 const int8x8_t filter = vshrn_n_s16(vcombine_s16(x_filter, vdup_n_s16(0)), 1);
1100
1101 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
1102 // shifts - which are generally faster than rounding shifts on modern CPUs.
1103 const int32_t horiz_const =
1104 ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
1105 // Accumulate into 128 << FILTER_BITS to account for range transform.
1106 // Halve the total because we halved the filter values.
1107 const int32x4_t correction =
1108 vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);
1109
1110 if (w == 4) {
1111 const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl);
1112
1113 do {
1114 uint8x16_t s0, s1, s2, s3;
1115 load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
1116
1117 int16x4_t d0 = convolve4_4_2d_h(s0, filter, permute_tbl, correction);
1118 int16x4_t d1 = convolve4_4_2d_h(s1, filter, permute_tbl, correction);
1119 int16x4_t d2 = convolve4_4_2d_h(s2, filter, permute_tbl, correction);
1120 int16x4_t d3 = convolve4_4_2d_h(s3, filter, permute_tbl, correction);
1121
1122 store_s16_4x4(dst, dst_stride, d0, d1, d2, d3);
1123
1124 src += 4 * src_stride;
1125 dst += 4 * dst_stride;
1126 h -= 4;
1127 } while (h > 4);
1128
1129 do {
1130 uint8x16_t s0 = vld1q_u8(src);
1131 int16x4_t d0 = convolve4_4_2d_h(s0, filter, permute_tbl, correction);
1132 vst1_s16(dst, d0);
1133
1134 src += src_stride;
1135 dst += dst_stride;
1136 } while (--h != 0);
1137 } else {
1138 const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
1139 do {
1140 const uint8_t *s = src;
1141 int16_t *d = dst;
1142 int width = w;
1143
1144 do {
1145 uint8x16_t s0, s1, s2, s3;
1146 load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
1147
1148 int16x8_t d0 = convolve4_8_2d_h(s0, filter, permute_tbl, correction);
1149 int16x8_t d1 = convolve4_8_2d_h(s1, filter, permute_tbl, correction);
1150 int16x8_t d2 = convolve4_8_2d_h(s2, filter, permute_tbl, correction);
1151 int16x8_t d3 = convolve4_8_2d_h(s3, filter, permute_tbl, correction);
1152
1153 store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
1154
1155 s += 8;
1156 d += 8;
1157 width -= 8;
1158 } while (width != 0);
1159 src += 4 * src_stride;
1160 dst += 4 * dst_stride;
1161 h -= 4;
1162 } while (h > 4);
1163
1164 do {
1165 const uint8_t *s = src;
1166 int16_t *d = dst;
1167 int width = w;
1168
1169 do {
1170 uint8x16_t s0 = vld1q_u8(s);
1171 int16x8_t d0 = convolve4_8_2d_h(s0, filter, permute_tbl, correction);
1172 vst1q_s16(d, d0);
1173
1174 s += 8;
1175 d += 8;
1176 width -= 8;
1177 } while (width != 0);
1178 src += src_stride;
1179 dst += dst_stride;
1180 } while (--h != 0);
1181 }
1182 }
1183
1184 static inline int16x8_t convolve8_8_2d_h(uint8x16_t samples,
1185 const int8x8_t filters,
1186 const int32x4_t correction,
1187 const uint8x16x3_t permute_tbl) {
1188 // Transform sample range to [-128, 127] for 8-bit signed dot product.
1189 int8x16_t samples_128 =
1190 vreinterpretq_s8_u8(vsubq_u8(samples, vdupq_n_u8(128)));
1191
1192 // Permute samples ready for dot product.
1193 // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
1194 // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
1195 // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
1196 int8x16_t perm_samples[3] = { vqtbl1q_s8(samples_128, permute_tbl.val[0]),
1197 vqtbl1q_s8(samples_128, permute_tbl.val[1]),
1198 vqtbl1q_s8(samples_128, permute_tbl.val[2]) };
1199
1200 // Accumulate dot product into 'correction' to account for range transform.
1201 int32x4_t sum0123 = vdotq_lane_s32(correction, perm_samples[0], filters, 0);
1202 sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filters, 1);
1203
1204 int32x4_t sum4567 = vdotq_lane_s32(correction, perm_samples[1], filters, 0);
1205 sum4567 = vdotq_lane_s32(sum4567, perm_samples[2], filters, 1);
1206
1207 // Narrow and re-pack.
1208 // We halved the convolution filter values so -1 from the right shift.
1209 return vcombine_s16(vshrn_n_s32(sum0123, ROUND0_BITS - 1),
1210 vshrn_n_s32(sum4567, ROUND0_BITS - 1));
1211 }
1212
1213 static inline void convolve_2d_sr_horiz_8tap_neon_dotprod(
1214 const uint8_t *src, int src_stride, int16_t *im_block, int im_stride, int w,
1215 int im_h, const int16_t *x_filter_ptr) {
1216 const int16x8_t x_filter_s16 = vld1q_s16(x_filter_ptr);
1217 // Filter values are even, so halve to reduce intermediate precision reqs.
1218 const int8x8_t x_filter = vshrn_n_s16(x_filter_s16, 1);
1219
1220 const int bd = 8;
1221 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
1222 // shifts - which are generally faster than rounding shifts on modern CPUs.
1223 const int32_t horiz_const =
1224 ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
1225 // Halve the total because we halved the filter values.
1226 const int32x4_t correction =
1227 vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);
1228
1229 const uint8_t *src_ptr = src;
1230 int16_t *dst_ptr = im_block;
1231 int dst_stride = im_stride;
1232 int height = im_h;
1233
1234 const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
1235 do {
1236 const uint8_t *s = src_ptr;
1237 int16_t *d = dst_ptr;
1238 int width = w;
1239
1240 do {
1241 uint8x16_t s0, s1, s2, s3;
1242 load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
1243
1244 int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, permute_tbl);
1245 int16x8_t d1 = convolve8_8_2d_h(s1, x_filter, correction, permute_tbl);
1246 int16x8_t d2 = convolve8_8_2d_h(s2, x_filter, correction, permute_tbl);
1247 int16x8_t d3 = convolve8_8_2d_h(s3, x_filter, correction, permute_tbl);
1248
1249 store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
1250
1251 s += 8;
1252 d += 8;
1253 width -= 8;
1254 } while (width != 0);
1255 src_ptr += 4 * src_stride;
1256 dst_ptr += 4 * dst_stride;
1257 height -= 4;
1258 } while (height > 4);
1259
1260 do {
1261 const uint8_t *s = src_ptr;
1262 int16_t *d = dst_ptr;
1263 int width = w;
1264
1265 do {
1266 uint8x16_t s0 = vld1q_u8(s);
1267 int16x8_t d0 = convolve8_8_2d_h(s0, x_filter, correction, permute_tbl);
1268 vst1q_s16(d, d0);
1269
1270 s += 8;
1271 d += 8;
1272 width -= 8;
1273 } while (width != 0);
1274 src_ptr += src_stride;
1275 dst_ptr += dst_stride;
1276 } while (--height != 0);
1277 }
1278
1279 static inline void convolve_2d_sr_6tap_neon_dotprod(
1280 const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
1281 int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) {
1282 const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
1283 // Filter values are even, so halve to reduce intermediate precision reqs.
1284 const int8x8_t x_filter = vshrn_n_s16(vld1q_s16(x_filter_ptr), 1);
1285
1286 const int bd = 8;
1287 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
1288 // shifts - which are generally faster than rounding shifts on modern CPUs.
1289 const int32_t horiz_const =
1290 ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
1291 // Accumulate into 128 << FILTER_BITS to account for range transform.
1292 // Halve the total because we halved the filter values.
1293 const int32x4_t correction =
1294 vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);
1295 const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1));
1296 const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
1297
1298 do {
1299 const uint8_t *s = src;
1300 uint8_t *d = dst;
1301 int height = h;
1302
1303 uint8x16_t h_s0, h_s1, h_s2, h_s3, h_s4;
1304 load_u8_16x5(s, src_stride, &h_s0, &h_s1, &h_s2, &h_s3, &h_s4);
1305 s += 5 * src_stride;
1306
1307 int16x8_t v_s0 = convolve8_8_2d_h(h_s0, x_filter, correction, permute_tbl);
1308 int16x8_t v_s1 = convolve8_8_2d_h(h_s1, x_filter, correction, permute_tbl);
1309 int16x8_t v_s2 = convolve8_8_2d_h(h_s2, x_filter, correction, permute_tbl);
1310 int16x8_t v_s3 = convolve8_8_2d_h(h_s3, x_filter, correction, permute_tbl);
1311 int16x8_t v_s4 = convolve8_8_2d_h(h_s4, x_filter, correction, permute_tbl);
1312
1313 do {
1314 uint8x16_t h_s5, h_s6, h_s7, h_s8;
1315 load_u8_16x4(s, src_stride, &h_s5, &h_s6, &h_s7, &h_s8);
1316
1317 int16x8_t v_s5 =
1318 convolve8_8_2d_h(h_s5, x_filter, correction, permute_tbl);
1319 int16x8_t v_s6 =
1320 convolve8_8_2d_h(h_s6, x_filter, correction, permute_tbl);
1321 int16x8_t v_s7 =
1322 convolve8_8_2d_h(h_s7, x_filter, correction, permute_tbl);
1323 int16x8_t v_s8 =
1324 convolve8_8_2d_h(h_s8, x_filter, correction, permute_tbl);
1325
1326 uint8x8_t d0 = convolve6_8_2d_v(v_s0, v_s1, v_s2, v_s3, v_s4, v_s5,
1327 y_filter, vert_const);
1328 uint8x8_t d1 = convolve6_8_2d_v(v_s1, v_s2, v_s3, v_s4, v_s5, v_s6,
1329 y_filter, vert_const);
1330 uint8x8_t d2 = convolve6_8_2d_v(v_s2, v_s3, v_s4, v_s5, v_s6, v_s7,
1331 y_filter, vert_const);
1332 uint8x8_t d3 = convolve6_8_2d_v(v_s3, v_s4, v_s5, v_s6, v_s7, v_s8,
1333 y_filter, vert_const);
1334
1335 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
1336
1337 v_s0 = v_s4;
1338 v_s1 = v_s5;
1339 v_s2 = v_s6;
1340 v_s3 = v_s7;
1341 v_s4 = v_s8;
1342
1343 s += 4 * src_stride;
1344 d += 4 * dst_stride;
1345 height -= 4;
1346 } while (height != 0);
1347 src += 8;
1348 dst += 8;
1349 w -= 8;
1350 } while (w != 0);
1351 }
1352
1353 static inline void convolve_2d_sr_4tap_neon_dotprod(
1354 const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
1355 int h, const int16_t *x_filter_ptr, const int16_t *y_filter_ptr) {
1356 const int bd = 8;
1357 const int16x8_t vert_const = vdupq_n_s16(1 << (bd - 1));
1358
1359 const int16x4_t y_filter = vld1_s16(y_filter_ptr + 2);
1360 const int16x4_t x_filter_s16 = vld1_s16(x_filter_ptr + 2);
1361 // All 4-tap and bilinear filter values are even, so halve them to reduce
1362 // intermediate precision requirements.
1363 const int8x8_t x_filter =
1364 vshrn_n_s16(vcombine_s16(x_filter_s16, vdup_n_s16(0)), 1);
1365
1366 // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
1367 // shifts - which are generally faster than rounding shifts on modern CPUs.
1368 const int32_t horiz_const =
1369 ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
1370 // Accumulate into 128 << FILTER_BITS to account for range transform.
1371 // Halve the total because we halved the filter values.
1372 const int32x4_t correction =
1373 vdupq_n_s32(((128 << FILTER_BITS) + horiz_const) / 2);
1374
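  // Blocks of width 4 use a path that computes 4 pixels per row; wider blocks
  // are processed in 8-pixel wide columns.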
1375 if (w == 4) {
1376 const uint8x16_t permute_tbl = vld1q_u8(kDotProdPermuteTbl);
1377
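    // The 4-tap vertical filter needs 4 horizontally filtered rows to produce
    // each output row, so filter the first 3 source rows up front.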
1378 uint8x16_t h_s0, h_s1, h_s2;
1379 load_u8_16x3(src, src_stride, &h_s0, &h_s1, &h_s2);
1380
1381 int16x4_t v_s0 = convolve4_4_2d_h(h_s0, x_filter, permute_tbl, correction);
1382 int16x4_t v_s1 = convolve4_4_2d_h(h_s1, x_filter, permute_tbl, correction);
1383 int16x4_t v_s2 = convolve4_4_2d_h(h_s2, x_filter, permute_tbl, correction);
1384
1385 src += 3 * src_stride;
1386
1387 do {
1388 uint8x16_t h_s3, h_s4, h_s5, h_s6;
1389 load_u8_16x4(src, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
1390
1391 int16x4_t v_s3 =
1392 convolve4_4_2d_h(h_s3, x_filter, permute_tbl, correction);
1393 int16x4_t v_s4 =
1394 convolve4_4_2d_h(h_s4, x_filter, permute_tbl, correction);
1395 int16x4_t v_s5 =
1396 convolve4_4_2d_h(h_s5, x_filter, permute_tbl, correction);
1397 int16x4_t v_s6 =
1398 convolve4_4_2d_h(h_s6, x_filter, permute_tbl, correction);
1399
1400 int16x4_t d0 = convolve4_4_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter);
1401 int16x4_t d1 = convolve4_4_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter);
1402 int16x4_t d2 = convolve4_4_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter);
1403 int16x4_t d3 = convolve4_4_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter);
1404
1405 uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), vert_const));
1406 uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), vert_const));
1407
1408 store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
1409 store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
1410
1411 v_s0 = v_s4;
1412 v_s1 = v_s5;
1413 v_s2 = v_s6;
1414
1415 src += 4 * src_stride;
1416 dst += 4 * dst_stride;
1417 h -= 4;
1418 } while (h != 0);
1419 } else {
1420 const uint8x16x2_t permute_tbl = vld1q_u8_x2(kDotProdPermuteTbl);
1421
1422 do {
1423 int height = h;
1424 const uint8_t *s = src;
1425 uint8_t *d = dst;
1426
1427 uint8x16_t h_s0, h_s1, h_s2;
1428       load_u8_16x3(s, src_stride, &h_s0, &h_s1, &h_s2);
1429
1430 int16x8_t v_s0 =
1431 convolve4_8_2d_h(h_s0, x_filter, permute_tbl, correction);
1432 int16x8_t v_s1 =
1433 convolve4_8_2d_h(h_s1, x_filter, permute_tbl, correction);
1434 int16x8_t v_s2 =
1435 convolve4_8_2d_h(h_s2, x_filter, permute_tbl, correction);
1436
1437 s += 3 * src_stride;
1438
1439 do {
1440 uint8x16_t h_s3, h_s4, h_s5, h_s6;
1441 load_u8_16x4(s, src_stride, &h_s3, &h_s4, &h_s5, &h_s6);
1442
1443 int16x8_t v_s3 =
1444 convolve4_8_2d_h(h_s3, x_filter, permute_tbl, correction);
1445 int16x8_t v_s4 =
1446 convolve4_8_2d_h(h_s4, x_filter, permute_tbl, correction);
1447 int16x8_t v_s5 =
1448 convolve4_8_2d_h(h_s5, x_filter, permute_tbl, correction);
1449 int16x8_t v_s6 =
1450 convolve4_8_2d_h(h_s6, x_filter, permute_tbl, correction);
1451
1452 uint8x8_t d0 =
1453 convolve4_8_2d_v(v_s0, v_s1, v_s2, v_s3, y_filter, vert_const);
1454 uint8x8_t d1 =
1455 convolve4_8_2d_v(v_s1, v_s2, v_s3, v_s4, y_filter, vert_const);
1456 uint8x8_t d2 =
1457 convolve4_8_2d_v(v_s2, v_s3, v_s4, v_s5, y_filter, vert_const);
1458 uint8x8_t d3 =
1459 convolve4_8_2d_v(v_s3, v_s4, v_s5, v_s6, y_filter, vert_const);
1460
1461 store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
1462
1463 v_s0 = v_s4;
1464 v_s1 = v_s5;
1465 v_s2 = v_s6;
1466
1467 s += 4 * src_stride;
1468 d += 4 * dst_stride;
1469 height -= 4;
1470 } while (height != 0);
1471 src += 8;
1472 dst += 8;
1473 w -= 8;
1474 } while (w != 0);
1475 }
1476 }
1477
1478 void av1_convolve_2d_sr_neon_dotprod(const uint8_t *src, int src_stride,
1479 uint8_t *dst, int dst_stride, int w, int h,
1480 const InterpFilterParams *filter_params_x,
1481 const InterpFilterParams *filter_params_y,
1482 const int subpel_x_qn,
1483 const int subpel_y_qn,
1484 ConvolveParams *conv_params) {
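  // The SIMD paths below process at least 4 pixels at a time, so fall back to
  // the scalar C implementation for blocks with a dimension of 2.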
1485 if (w == 2 || h == 2) {
1486 av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
1487 filter_params_x, filter_params_y, subpel_x_qn,
1488 subpel_y_qn, conv_params);
1489 return;
1490 }
1491
1492 const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn);
1493 const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn);
1494 const int clamped_y_taps = y_filter_taps < 4 ? 4 : y_filter_taps;
1495 const int im_h = h + clamped_y_taps - 1;
1496 const int im_stride = MAX_SB_SIZE;
1497 const int vert_offset = clamped_y_taps / 2 - 1;
1498 const int horiz_offset = filter_params_x->taps / 2 - 1;
1499 const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
1500
1501 const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel(
1502 filter_params_x, subpel_x_qn & SUBPEL_MASK);
1503 const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel(
1504 filter_params_y, subpel_y_qn & SUBPEL_MASK);
1505
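  // Filters with more than 8 taps (the 12-tap filter) take a dedicated path:
  // horizontal filtering into an intermediate buffer followed by a 12-tap
  // vertical pass.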
1506 if (filter_params_x->taps > 8) {
1507 DECLARE_ALIGNED(16, int16_t,
1508 im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
1509
1510 const int16x8_t x_filter_0_7 = vld1q_s16(x_filter_ptr);
1511 const int16x4_t x_filter_8_11 = vld1_s16(x_filter_ptr + 8);
1512 const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr);
1513 const int16x4_t y_filter_8_11 = vld1_s16(y_filter_ptr + 8);
1514
1515 convolve_2d_sr_horiz_12tap_neon_dotprod(src_ptr, src_stride, im_block,
1516 im_stride, w, im_h, x_filter_0_7,
1517 x_filter_8_11);
1518
1519 convolve_2d_sr_vert_12tap_neon(im_block, im_stride, dst, dst_stride, w, h,
1520 y_filter_0_7, y_filter_8_11);
1521 } else {
1522 if (x_filter_taps >= 6 && y_filter_taps == 6) {
1523 convolve_2d_sr_6tap_neon_dotprod(src_ptr, src_stride, dst, dst_stride, w,
1524 h, x_filter_ptr, y_filter_ptr);
1525 return;
1526 }
1527
1528 if (x_filter_taps <= 4 && y_filter_taps <= 4) {
1529 convolve_2d_sr_4tap_neon_dotprod(src_ptr + 2, src_stride, dst, dst_stride,
1530 w, h, x_filter_ptr, y_filter_ptr);
1531 return;
1532 }
1533
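    // Remaining filter combinations: filter horizontally into an intermediate
    // buffer, then apply the matching vertical filter into the destination.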
1534 DECLARE_ALIGNED(16, int16_t,
1535 im_block[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
1536
1537 if (x_filter_taps <= 4) {
1538 convolve_2d_sr_horiz_4tap_neon_dotprod(src_ptr + 2, src_stride, im_block,
1539 im_stride, w, im_h, x_filter_ptr);
1540 } else {
1541 convolve_2d_sr_horiz_8tap_neon_dotprod(src_ptr, src_stride, im_block,
1542 im_stride, w, im_h, x_filter_ptr);
1543 }
1544
1545 const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
1546
1547 if (clamped_y_taps <= 4) {
1548 convolve_2d_sr_vert_4tap_neon(im_block, im_stride, dst, dst_stride, w, h,
1549 y_filter_ptr);
1550 } else if (clamped_y_taps == 6) {
1551 convolve_2d_sr_vert_6tap_neon(im_block, im_stride, dst, dst_stride, w, h,
1552 y_filter);
1553 } else {
1554 convolve_2d_sr_vert_8tap_neon(im_block, im_stride, dst, dst_stride, w, h,
1555 y_filter);
1556 }
1557 }
1558 }
1559