1 /*
2 * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <arm_neon.h>
13
14 #include "config/aom_config.h"
15 #include "config/aom_dsp_rtcd.h"
16 #include "config/av1_rtcd.h"
17
18 #include "aom/aom_integer.h"
19 #include "aom_dsp/arm/mem_neon.h"
20 #include "aom_dsp/arm/sum_neon.h"
21 #include "aom_dsp/arm/transpose_neon.h"
22 #include "aom_dsp/intrapred_common.h"
23
24 // -----------------------------------------------------------------------------
25 // DC
26
27 static inline void highbd_dc_store_4xh(uint16_t *dst, ptrdiff_t stride, int h,
28 uint16x4_t dc) {
29 for (int i = 0; i < h; ++i) {
30 vst1_u16(dst + i * stride, dc);
31 }
32 }
33
34 static inline void highbd_dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int h,
35 uint16x8_t dc) {
36 for (int i = 0; i < h; ++i) {
37 vst1q_u16(dst + i * stride, dc);
38 }
39 }
40
41 static inline void highbd_dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int h,
42 uint16x8_t dc) {
43 for (int i = 0; i < h; ++i) {
44 vst1q_u16(dst + i * stride, dc);
45 vst1q_u16(dst + i * stride + 8, dc);
46 }
47 }
48
49 static inline void highbd_dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int h,
50 uint16x8_t dc) {
51 for (int i = 0; i < h; ++i) {
52 vst1q_u16(dst + i * stride, dc);
53 vst1q_u16(dst + i * stride + 8, dc);
54 vst1q_u16(dst + i * stride + 16, dc);
55 vst1q_u16(dst + i * stride + 24, dc);
56 }
57 }
58
59 static inline void highbd_dc_store_64xh(uint16_t *dst, ptrdiff_t stride, int h,
60 uint16x8_t dc) {
61 for (int i = 0; i < h; ++i) {
62 vst1q_u16(dst + i * stride, dc);
63 vst1q_u16(dst + i * stride + 8, dc);
64 vst1q_u16(dst + i * stride + 16, dc);
65 vst1q_u16(dst + i * stride + 24, dc);
66 vst1q_u16(dst + i * stride + 32, dc);
67 vst1q_u16(dst + i * stride + 40, dc);
68 vst1q_u16(dst + i * stride + 48, dc);
69 vst1q_u16(dst + i * stride + 56, dc);
70 }
71 }
72
73 static inline uint32x4_t horizontal_add_and_broadcast_long_u16x8(uint16x8_t a) {
74 // The input may be up to 16 bits wide (from the 64x64 DC partial sum), so
75 // widen to 32 bits before accumulating.
76 const uint32x4_t b = vpaddlq_u16(a);
77 #if AOM_ARCH_AARCH64
78 const uint32x4_t c = vpaddq_u32(b, b);
79 return vpaddq_u32(c, c);
80 #else
81 const uint32x2_t c = vadd_u32(vget_low_u32(b), vget_high_u32(b));
82 const uint32x2_t d = vpadd_u32(c, c);
83 return vcombine_u32(d, d);
84 #endif
85 }
86
87 static inline uint16x8_t highbd_dc_load_partial_sum_4(const uint16_t *left) {
88 // Nothing to do since sum is already one vector, but saves needing to
89 // special-case w == 4 or h == 4. The combine will be zero cost for a sane
90 // compiler since vld1 already sets the top half of a vector to zero as part
91 // of the operation.
92 return vcombine_u16(vld1_u16(left), vdup_n_u16(0));
93 }
94
95 static inline uint16x8_t highbd_dc_load_partial_sum_8(const uint16_t *left) {
96 // Nothing to do since sum is already one vector, but saves needing to
97 // special-case w == 8 or h == 8.
98 return vld1q_u16(left);
99 }
100
101 static inline uint16x8_t highbd_dc_load_partial_sum_16(const uint16_t *left) {
102 const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits
103 const uint16x8_t a1 = vld1q_u16(left + 8);
104 return vaddq_u16(a0, a1); // up to 13 bits
105 }
106
107 static inline uint16x8_t highbd_dc_load_partial_sum_32(const uint16_t *left) {
108 const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits
109 const uint16x8_t a1 = vld1q_u16(left + 8);
110 const uint16x8_t a2 = vld1q_u16(left + 16);
111 const uint16x8_t a3 = vld1q_u16(left + 24);
112 const uint16x8_t b0 = vaddq_u16(a0, a1); // up to 13 bits
113 const uint16x8_t b1 = vaddq_u16(a2, a3);
114 return vaddq_u16(b0, b1); // up to 14 bits
115 }
116
117 static inline uint16x8_t highbd_dc_load_partial_sum_64(const uint16_t *left) {
118 const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits
119 const uint16x8_t a1 = vld1q_u16(left + 8);
120 const uint16x8_t a2 = vld1q_u16(left + 16);
121 const uint16x8_t a3 = vld1q_u16(left + 24);
122 const uint16x8_t a4 = vld1q_u16(left + 32);
123 const uint16x8_t a5 = vld1q_u16(left + 40);
124 const uint16x8_t a6 = vld1q_u16(left + 48);
125 const uint16x8_t a7 = vld1q_u16(left + 56);
126 const uint16x8_t b0 = vaddq_u16(a0, a1); // up to 13 bits
127 const uint16x8_t b1 = vaddq_u16(a2, a3);
128 const uint16x8_t b2 = vaddq_u16(a4, a5);
129 const uint16x8_t b3 = vaddq_u16(a6, a7);
130 const uint16x8_t c0 = vaddq_u16(b0, b1); // up to 14 bits
131 const uint16x8_t c1 = vaddq_u16(b2, b3);
132 return vaddq_u16(c0, c1); // up to 15 bits
133 }
134
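// For the square block sizes, sum(above) + sum(left) is divided by w + h, so
// the rounding shift below is log2(w + h).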
135 #define HIGHBD_DC_PREDICTOR(w, h, shift) \
136 void aom_highbd_dc_predictor_##w##x##h##_neon( \
137 uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
138 const uint16_t *left, int bd) { \
139 (void)bd; \
140 const uint16x8_t a = highbd_dc_load_partial_sum_##w(above); \
141 const uint16x8_t l = highbd_dc_load_partial_sum_##h(left); \
142 const uint32x4_t sum = \
143 horizontal_add_and_broadcast_long_u16x8(vaddq_u16(a, l)); \
144 const uint16x4_t dc0 = vrshrn_n_u32(sum, shift); \
145 highbd_dc_store_##w##xh(dst, stride, (h), vdupq_lane_u16(dc0, 0)); \
146 }
147
148 void aom_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
149 const uint16_t *above,
150 const uint16_t *left, int bd) {
151 // In the rectangular cases we simply widen the shorter vector to uint16x8_t
152 // in order to accumulate. In the 4x4 case, however, there is no shorter
153 // vector to widen, so it is beneficial to do the whole calculation in
154 // uint16x4_t instead.
155 (void)bd;
156 const uint16x4_t a = vld1_u16(above); // up to 12 bits
157 const uint16x4_t l = vld1_u16(left);
158 uint16x4_t sum = vpadd_u16(a, l); // up to 13 bits
159 sum = vpadd_u16(sum, sum); // up to 14 bits
160 sum = vpadd_u16(sum, sum);
161 const uint16x4_t dc = vrshr_n_u16(sum, 3);
162 highbd_dc_store_4xh(dst, stride, 4, dc);
163 }
164
165 HIGHBD_DC_PREDICTOR(8, 8, 4)
166 HIGHBD_DC_PREDICTOR(16, 16, 5)
167 HIGHBD_DC_PREDICTOR(32, 32, 6)
168 HIGHBD_DC_PREDICTOR(64, 64, 7)
169
170 #undef HIGHBD_DC_PREDICTOR
171
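// Approximates num / d for d = (1 << shift1) * k as
// ((num >> shift1) * multiplier) >> shift2, with multiplier ~= (1 << shift2) / k.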
172 static inline int divide_using_multiply_shift(int num, int shift1,
173 int multiplier, int shift2) {
174 const int interm = num >> shift1;
175 return interm * multiplier >> shift2;
176 }
177
178 #define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
179 #define HIGHBD_DC_MULTIPLIER_1X4 0x6667
180 #define HIGHBD_DC_SHIFT2 17
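// The rectangular DC predictors divide by w + h, which is 3 or 5 times a power
// of two. shift1 removes the power-of-two factor and the multiplier
// approximates the rest: 0xAAAB ~= (1 << 17) / 3 and 0x6667 ~= (1 << 17) / 5,
// with HIGHBD_DC_SHIFT2 undoing the (1 << 17) scaling.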
181
182 static inline int highbd_dc_predictor_rect(int bw, int bh, int sum, int shift1,
183 uint32_t multiplier) {
184 return divide_using_multiply_shift(sum + ((bw + bh) >> 1), shift1, multiplier,
185 HIGHBD_DC_SHIFT2);
186 }
187
188 #undef HIGHBD_DC_SHIFT2
189
190 #define HIGHBD_DC_PREDICTOR_RECT(w, h, q, shift, mult) \
191 void aom_highbd_dc_predictor_##w##x##h##_neon( \
192 uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
193 const uint16_t *left, int bd) { \
194 (void)bd; \
195 uint16x8_t sum_above = highbd_dc_load_partial_sum_##w(above); \
196 uint16x8_t sum_left = highbd_dc_load_partial_sum_##h(left); \
197 uint16x8_t sum_vec = vaddq_u16(sum_left, sum_above); \
198 int sum = horizontal_add_u16x8(sum_vec); \
199 int dc0 = highbd_dc_predictor_rect((w), (h), sum, (shift), (mult)); \
200 highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u16(dc0)); \
201 }
202
203 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
204 HIGHBD_DC_PREDICTOR_RECT(4, 8, , 2, HIGHBD_DC_MULTIPLIER_1X2)
205 HIGHBD_DC_PREDICTOR_RECT(4, 16, , 2, HIGHBD_DC_MULTIPLIER_1X4)
206 HIGHBD_DC_PREDICTOR_RECT(8, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X2)
207 HIGHBD_DC_PREDICTOR_RECT(8, 16, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
208 HIGHBD_DC_PREDICTOR_RECT(8, 32, q, 3, HIGHBD_DC_MULTIPLIER_1X4)
209 HIGHBD_DC_PREDICTOR_RECT(16, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X4)
210 HIGHBD_DC_PREDICTOR_RECT(16, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
211 HIGHBD_DC_PREDICTOR_RECT(16, 32, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
212 HIGHBD_DC_PREDICTOR_RECT(16, 64, q, 4, HIGHBD_DC_MULTIPLIER_1X4)
213 HIGHBD_DC_PREDICTOR_RECT(32, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X4)
214 HIGHBD_DC_PREDICTOR_RECT(32, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
215 HIGHBD_DC_PREDICTOR_RECT(32, 64, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
216 HIGHBD_DC_PREDICTOR_RECT(64, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X4)
217 HIGHBD_DC_PREDICTOR_RECT(64, 32, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
218 #else
219 HIGHBD_DC_PREDICTOR_RECT(4, 8, , 2, HIGHBD_DC_MULTIPLIER_1X2)
220 HIGHBD_DC_PREDICTOR_RECT(8, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X2)
221 HIGHBD_DC_PREDICTOR_RECT(8, 16, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
222 HIGHBD_DC_PREDICTOR_RECT(16, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
223 HIGHBD_DC_PREDICTOR_RECT(16, 32, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
224 HIGHBD_DC_PREDICTOR_RECT(32, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
225 HIGHBD_DC_PREDICTOR_RECT(32, 64, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
226 HIGHBD_DC_PREDICTOR_RECT(64, 32, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
227 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
228
229 #undef HIGHBD_DC_PREDICTOR_RECT
230 #undef HIGHBD_DC_MULTIPLIER_1X2
231 #undef HIGHBD_DC_MULTIPLIER_1X4
232
233 // -----------------------------------------------------------------------------
234 // DC_128
235
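// DC_128 fills the block with the mid-point of the bit-depth range:
// 0x80 << (bd - 8) == 1 << (bd - 1), e.g. 512 for 10-bit content.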
236 #define HIGHBD_DC_PREDICTOR_128(w, h, q) \
237 void aom_highbd_dc_128_predictor_##w##x##h##_neon( \
238 uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
239 const uint16_t *left, int bd) { \
240 (void)above; \
241 (void)bd; \
242 (void)left; \
243 highbd_dc_store_##w##xh(dst, stride, (h), \
244 vdup##q##_n_u16(0x80 << (bd - 8))); \
245 }
246
247 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
248 HIGHBD_DC_PREDICTOR_128(4, 4, )
249 HIGHBD_DC_PREDICTOR_128(4, 8, )
250 HIGHBD_DC_PREDICTOR_128(4, 16, )
251 HIGHBD_DC_PREDICTOR_128(8, 4, q)
252 HIGHBD_DC_PREDICTOR_128(8, 8, q)
253 HIGHBD_DC_PREDICTOR_128(8, 16, q)
254 HIGHBD_DC_PREDICTOR_128(8, 32, q)
255 HIGHBD_DC_PREDICTOR_128(16, 4, q)
256 HIGHBD_DC_PREDICTOR_128(16, 8, q)
257 HIGHBD_DC_PREDICTOR_128(16, 16, q)
258 HIGHBD_DC_PREDICTOR_128(16, 32, q)
259 HIGHBD_DC_PREDICTOR_128(16, 64, q)
260 HIGHBD_DC_PREDICTOR_128(32, 8, q)
261 HIGHBD_DC_PREDICTOR_128(32, 16, q)
262 HIGHBD_DC_PREDICTOR_128(32, 32, q)
263 HIGHBD_DC_PREDICTOR_128(32, 64, q)
264 HIGHBD_DC_PREDICTOR_128(64, 16, q)
265 HIGHBD_DC_PREDICTOR_128(64, 32, q)
266 HIGHBD_DC_PREDICTOR_128(64, 64, q)
267 #else
268 HIGHBD_DC_PREDICTOR_128(4, 4, )
269 HIGHBD_DC_PREDICTOR_128(4, 8, )
270 HIGHBD_DC_PREDICTOR_128(8, 4, q)
271 HIGHBD_DC_PREDICTOR_128(8, 8, q)
272 HIGHBD_DC_PREDICTOR_128(8, 16, q)
273 HIGHBD_DC_PREDICTOR_128(16, 8, q)
274 HIGHBD_DC_PREDICTOR_128(16, 16, q)
275 HIGHBD_DC_PREDICTOR_128(16, 32, q)
276 HIGHBD_DC_PREDICTOR_128(32, 16, q)
277 HIGHBD_DC_PREDICTOR_128(32, 32, q)
278 HIGHBD_DC_PREDICTOR_128(32, 64, q)
279 HIGHBD_DC_PREDICTOR_128(64, 32, q)
280 HIGHBD_DC_PREDICTOR_128(64, 64, q)
281 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
282
283 #undef HIGHBD_DC_PREDICTOR_128
284
285 // -----------------------------------------------------------------------------
286 // DC_LEFT
287
288 static inline uint32x4_t highbd_dc_load_sum_4(const uint16_t *left) {
289 const uint16x4_t a = vld1_u16(left); // up to 12 bits
290 const uint16x4_t b = vpadd_u16(a, a); // up to 13 bits
291 return vcombine_u32(vpaddl_u16(b), vdup_n_u32(0));
292 }
293
294 static inline uint32x4_t highbd_dc_load_sum_8(const uint16_t *left) {
295 return horizontal_add_and_broadcast_long_u16x8(vld1q_u16(left));
296 }
297
298 static inline uint32x4_t highbd_dc_load_sum_16(const uint16_t *left) {
299 return horizontal_add_and_broadcast_long_u16x8(
300 highbd_dc_load_partial_sum_16(left));
301 }
302
303 static inline uint32x4_t highbd_dc_load_sum_32(const uint16_t *left) {
304 return horizontal_add_and_broadcast_long_u16x8(
305 highbd_dc_load_partial_sum_32(left));
306 }
307
308 static inline uint32x4_t highbd_dc_load_sum_64(const uint16_t *left) {
309 return horizontal_add_and_broadcast_long_u16x8(
310 highbd_dc_load_partial_sum_64(left));
311 }
312
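// DC_LEFT averages only the left column, so the rounding shift is log2(h).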
313 #define DC_PREDICTOR_LEFT(w, h, shift, q) \
314 void aom_highbd_dc_left_predictor_##w##x##h##_neon( \
315 uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
316 const uint16_t *left, int bd) { \
317 (void)above; \
318 (void)bd; \
319 const uint32x4_t sum = highbd_dc_load_sum_##h(left); \
320 const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift)); \
321 highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
322 }
323
324 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
325 DC_PREDICTOR_LEFT(4, 4, 2, )
326 DC_PREDICTOR_LEFT(4, 8, 3, )
327 DC_PREDICTOR_LEFT(4, 16, 4, )
328 DC_PREDICTOR_LEFT(8, 4, 2, q)
329 DC_PREDICTOR_LEFT(8, 8, 3, q)
330 DC_PREDICTOR_LEFT(8, 16, 4, q)
331 DC_PREDICTOR_LEFT(8, 32, 5, q)
332 DC_PREDICTOR_LEFT(16, 4, 2, q)
333 DC_PREDICTOR_LEFT(16, 8, 3, q)
334 DC_PREDICTOR_LEFT(16, 16, 4, q)
335 DC_PREDICTOR_LEFT(16, 32, 5, q)
336 DC_PREDICTOR_LEFT(16, 64, 6, q)
337 DC_PREDICTOR_LEFT(32, 8, 3, q)
338 DC_PREDICTOR_LEFT(32, 16, 4, q)
339 DC_PREDICTOR_LEFT(32, 32, 5, q)
340 DC_PREDICTOR_LEFT(32, 64, 6, q)
341 DC_PREDICTOR_LEFT(64, 16, 4, q)
342 DC_PREDICTOR_LEFT(64, 32, 5, q)
343 DC_PREDICTOR_LEFT(64, 64, 6, q)
344 #else
345 DC_PREDICTOR_LEFT(4, 4, 2, )
346 DC_PREDICTOR_LEFT(4, 8, 3, )
347 DC_PREDICTOR_LEFT(8, 4, 2, q)
348 DC_PREDICTOR_LEFT(8, 8, 3, q)
349 DC_PREDICTOR_LEFT(8, 16, 4, q)
350 DC_PREDICTOR_LEFT(16, 8, 3, q)
351 DC_PREDICTOR_LEFT(16, 16, 4, q)
352 DC_PREDICTOR_LEFT(16, 32, 5, q)
353 DC_PREDICTOR_LEFT(32, 16, 4, q)
354 DC_PREDICTOR_LEFT(32, 32, 5, q)
355 DC_PREDICTOR_LEFT(32, 64, 6, q)
356 DC_PREDICTOR_LEFT(64, 32, 5, q)
357 DC_PREDICTOR_LEFT(64, 64, 6, q)
358 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
359
360 #undef DC_PREDICTOR_LEFT
361
362 // -----------------------------------------------------------------------------
363 // DC_TOP
364
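// DC_TOP averages only the above row, so the rounding shift is log2(w).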
365 #define DC_PREDICTOR_TOP(w, h, shift, q) \
366 void aom_highbd_dc_top_predictor_##w##x##h##_neon( \
367 uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
368 const uint16_t *left, int bd) { \
369 (void)bd; \
370 (void)left; \
371 const uint32x4_t sum = highbd_dc_load_sum_##w(above); \
372 const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift)); \
373 highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
374 }
375
376 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
377 DC_PREDICTOR_TOP(4, 4, 2, )
378 DC_PREDICTOR_TOP(4, 8, 2, )
379 DC_PREDICTOR_TOP(4, 16, 2, )
380 DC_PREDICTOR_TOP(8, 4, 3, q)
381 DC_PREDICTOR_TOP(8, 8, 3, q)
382 DC_PREDICTOR_TOP(8, 16, 3, q)
383 DC_PREDICTOR_TOP(8, 32, 3, q)
384 DC_PREDICTOR_TOP(16, 4, 4, q)
385 DC_PREDICTOR_TOP(16, 8, 4, q)
386 DC_PREDICTOR_TOP(16, 16, 4, q)
387 DC_PREDICTOR_TOP(16, 32, 4, q)
388 DC_PREDICTOR_TOP(16, 64, 4, q)
389 DC_PREDICTOR_TOP(32, 8, 5, q)
390 DC_PREDICTOR_TOP(32, 16, 5, q)
391 DC_PREDICTOR_TOP(32, 32, 5, q)
392 DC_PREDICTOR_TOP(32, 64, 5, q)
393 DC_PREDICTOR_TOP(64, 16, 6, q)
394 DC_PREDICTOR_TOP(64, 32, 6, q)
395 DC_PREDICTOR_TOP(64, 64, 6, q)
396 #else
397 DC_PREDICTOR_TOP(4, 4, 2, )
398 DC_PREDICTOR_TOP(4, 8, 2, )
399 DC_PREDICTOR_TOP(8, 4, 3, q)
400 DC_PREDICTOR_TOP(8, 8, 3, q)
401 DC_PREDICTOR_TOP(8, 16, 3, q)
402 DC_PREDICTOR_TOP(16, 8, 4, q)
403 DC_PREDICTOR_TOP(16, 16, 4, q)
404 DC_PREDICTOR_TOP(16, 32, 4, q)
405 DC_PREDICTOR_TOP(32, 16, 5, q)
406 DC_PREDICTOR_TOP(32, 32, 5, q)
407 DC_PREDICTOR_TOP(32, 64, 5, q)
408 DC_PREDICTOR_TOP(64, 32, 6, q)
409 DC_PREDICTOR_TOP(64, 64, 6, q)
410 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
411
412 #undef DC_PREDICTOR_TOP
413
414 // -----------------------------------------------------------------------------
415 // V_PRED
416
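// V_PRED copies the above row into every row of the block. Each helper below
// stores two rows per loop iteration.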
417 #define HIGHBD_V_NXM(W, H) \
418 void aom_highbd_v_predictor_##W##x##H##_neon( \
419 uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
420 const uint16_t *left, int bd) { \
421 (void)left; \
422 (void)bd; \
423 vertical##W##xh_neon(dst, stride, above, H); \
424 }
425
426 static inline uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) {
427 uint16x8x2_t x;
428 // Clang and GCC emit an ldp for this pair of loads.
429 x.val[0] = vld1q_u16(ptr);
430 x.val[1] = vld1q_u16(ptr + 8);
431 return x;
432 }
433
434 static inline void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) {
435 vst1q_u16(ptr, x.val[0]);
436 vst1q_u16(ptr + 8, x.val[1]);
437 }
438
439 static inline void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride,
440 const uint16_t *const above, int height) {
441 const uint16x4_t row = vld1_u16(above);
442 int y = height;
443 do {
444 vst1_u16(dst, row);
445 vst1_u16(dst + stride, row);
446 dst += stride << 1;
447 y -= 2;
448 } while (y != 0);
449 }
450
451 static inline void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride,
452 const uint16_t *const above, int height) {
453 const uint16x8_t row = vld1q_u16(above);
454 int y = height;
455 do {
456 vst1q_u16(dst, row);
457 vst1q_u16(dst + stride, row);
458 dst += stride << 1;
459 y -= 2;
460 } while (y != 0);
461 }
462
463 static inline void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride,
464 const uint16_t *const above, int height) {
465 const uint16x8x2_t row = load_uint16x8x2(above);
466 int y = height;
467 do {
468 store_uint16x8x2(dst, row);
469 store_uint16x8x2(dst + stride, row);
470 dst += stride << 1;
471 y -= 2;
472 } while (y != 0);
473 }
474
475 static inline uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) {
476 uint16x8x4_t x;
477 // Clang and GCC emit ldp instructions for these loads.
478 x.val[0] = vld1q_u16(ptr);
479 x.val[1] = vld1q_u16(ptr + 8);
480 x.val[2] = vld1q_u16(ptr + 16);
481 x.val[3] = vld1q_u16(ptr + 24);
482 return x;
483 }
484
485 static inline void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) {
486 vst1q_u16(ptr, x.val[0]);
487 vst1q_u16(ptr + 8, x.val[1]);
488 vst1q_u16(ptr + 16, x.val[2]);
489 vst1q_u16(ptr + 24, x.val[3]);
490 }
491
492 static inline void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride,
493 const uint16_t *const above, int height) {
494 const uint16x8x4_t row = load_uint16x8x4(above);
495 int y = height;
496 do {
497 store_uint16x8x4(dst, row);
498 store_uint16x8x4(dst + stride, row);
499 dst += stride << 1;
500 y -= 2;
501 } while (y != 0);
502 }
503
504 static inline void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride,
505 const uint16_t *const above, int height) {
506 uint16_t *dst32 = dst + 32;
507 const uint16x8x4_t row = load_uint16x8x4(above);
508 const uint16x8x4_t row32 = load_uint16x8x4(above + 32);
509 int y = height;
510 do {
511 store_uint16x8x4(dst, row);
512 store_uint16x8x4(dst32, row32);
513 store_uint16x8x4(dst + stride, row);
514 store_uint16x8x4(dst32 + stride, row32);
515 dst += stride << 1;
516 dst32 += stride << 1;
517 y -= 2;
518 } while (y != 0);
519 }
520
521 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
522 HIGHBD_V_NXM(4, 4)
523 HIGHBD_V_NXM(4, 8)
524 HIGHBD_V_NXM(4, 16)
525
526 HIGHBD_V_NXM(8, 4)
527 HIGHBD_V_NXM(8, 8)
528 HIGHBD_V_NXM(8, 16)
529 HIGHBD_V_NXM(8, 32)
530
531 HIGHBD_V_NXM(16, 4)
532 HIGHBD_V_NXM(16, 8)
533 HIGHBD_V_NXM(16, 16)
534 HIGHBD_V_NXM(16, 32)
535 HIGHBD_V_NXM(16, 64)
536
537 HIGHBD_V_NXM(32, 8)
538 HIGHBD_V_NXM(32, 16)
539 HIGHBD_V_NXM(32, 32)
540 HIGHBD_V_NXM(32, 64)
541
542 HIGHBD_V_NXM(64, 16)
543 HIGHBD_V_NXM(64, 32)
544 HIGHBD_V_NXM(64, 64)
545 #else
546 HIGHBD_V_NXM(4, 4)
547 HIGHBD_V_NXM(4, 8)
548
549 HIGHBD_V_NXM(8, 4)
550 HIGHBD_V_NXM(8, 8)
551 HIGHBD_V_NXM(8, 16)
552
553 HIGHBD_V_NXM(16, 8)
554 HIGHBD_V_NXM(16, 16)
555 HIGHBD_V_NXM(16, 32)
556
557 HIGHBD_V_NXM(32, 16)
558 HIGHBD_V_NXM(32, 32)
559 HIGHBD_V_NXM(32, 64)
560
561 HIGHBD_V_NXM(64, 32)
562 HIGHBD_V_NXM(64, 64)
563 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
564
565 // -----------------------------------------------------------------------------
566 // H_PRED
567
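// H_PRED fills each row with the corresponding left-column sample; the *x4
// helpers below expand four left samples into four rows at a time.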
568 static inline void highbd_h_store_4x4(uint16_t *dst, ptrdiff_t stride,
569 uint16x4_t left) {
570 vst1_u16(dst + 0 * stride, vdup_lane_u16(left, 0));
571 vst1_u16(dst + 1 * stride, vdup_lane_u16(left, 1));
572 vst1_u16(dst + 2 * stride, vdup_lane_u16(left, 2));
573 vst1_u16(dst + 3 * stride, vdup_lane_u16(left, 3));
574 }
575
576 static inline void highbd_h_store_8x4(uint16_t *dst, ptrdiff_t stride,
577 uint16x4_t left) {
578 vst1q_u16(dst + 0 * stride, vdupq_lane_u16(left, 0));
579 vst1q_u16(dst + 1 * stride, vdupq_lane_u16(left, 1));
580 vst1q_u16(dst + 2 * stride, vdupq_lane_u16(left, 2));
581 vst1q_u16(dst + 3 * stride, vdupq_lane_u16(left, 3));
582 }
583
584 static inline void highbd_h_store_16x1(uint16_t *dst, uint16x8_t left) {
585 vst1q_u16(dst + 0, left);
586 vst1q_u16(dst + 8, left);
587 }
588
589 static inline void highbd_h_store_16x4(uint16_t *dst, ptrdiff_t stride,
590 uint16x4_t left) {
591 highbd_h_store_16x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
592 highbd_h_store_16x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
593 highbd_h_store_16x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
594 highbd_h_store_16x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
595 }
596
597 static inline void highbd_h_store_32x1(uint16_t *dst, uint16x8_t left) {
598 vst1q_u16(dst + 0, left);
599 vst1q_u16(dst + 8, left);
600 vst1q_u16(dst + 16, left);
601 vst1q_u16(dst + 24, left);
602 }
603
604 static inline void highbd_h_store_32x4(uint16_t *dst, ptrdiff_t stride,
605 uint16x4_t left) {
606 highbd_h_store_32x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
607 highbd_h_store_32x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
608 highbd_h_store_32x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
609 highbd_h_store_32x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
610 }
611
612 static inline void highbd_h_store_64x1(uint16_t *dst, uint16x8_t left) {
613 vst1q_u16(dst + 0, left);
614 vst1q_u16(dst + 8, left);
615 vst1q_u16(dst + 16, left);
616 vst1q_u16(dst + 24, left);
617 vst1q_u16(dst + 32, left);
618 vst1q_u16(dst + 40, left);
619 vst1q_u16(dst + 48, left);
620 vst1q_u16(dst + 56, left);
621 }
622
623 static inline void highbd_h_store_64x4(uint16_t *dst, ptrdiff_t stride,
624 uint16x4_t left) {
625 highbd_h_store_64x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
626 highbd_h_store_64x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
627 highbd_h_store_64x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
628 highbd_h_store_64x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
629 }
630
631 void aom_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
632 const uint16_t *above,
633 const uint16_t *left, int bd) {
634 (void)above;
635 (void)bd;
636 highbd_h_store_4x4(dst, stride, vld1_u16(left));
637 }
638
639 void aom_highbd_h_predictor_4x8_neon(uint16_t *dst, ptrdiff_t stride,
640 const uint16_t *above,
641 const uint16_t *left, int bd) {
642 (void)above;
643 (void)bd;
644 uint16x8_t l = vld1q_u16(left);
645 highbd_h_store_4x4(dst + 0 * stride, stride, vget_low_u16(l));
646 highbd_h_store_4x4(dst + 4 * stride, stride, vget_high_u16(l));
647 }
648
649 void aom_highbd_h_predictor_8x4_neon(uint16_t *dst, ptrdiff_t stride,
650 const uint16_t *above,
651 const uint16_t *left, int bd) {
652 (void)above;
653 (void)bd;
654 highbd_h_store_8x4(dst, stride, vld1_u16(left));
655 }
656
657 void aom_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
658 const uint16_t *above,
659 const uint16_t *left, int bd) {
660 (void)above;
661 (void)bd;
662 uint16x8_t l = vld1q_u16(left);
663 highbd_h_store_8x4(dst + 0 * stride, stride, vget_low_u16(l));
664 highbd_h_store_8x4(dst + 4 * stride, stride, vget_high_u16(l));
665 }
666
667 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
668 void aom_highbd_h_predictor_16x4_neon(uint16_t *dst, ptrdiff_t stride,
669 const uint16_t *above,
670 const uint16_t *left, int bd) {
671 (void)above;
672 (void)bd;
673 highbd_h_store_16x4(dst, stride, vld1_u16(left));
674 }
675 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
676
677 void aom_highbd_h_predictor_16x8_neon(uint16_t *dst, ptrdiff_t stride,
678 const uint16_t *above,
679 const uint16_t *left, int bd) {
680 (void)above;
681 (void)bd;
682 uint16x8_t l = vld1q_u16(left);
683 highbd_h_store_16x4(dst + 0 * stride, stride, vget_low_u16(l));
684 highbd_h_store_16x4(dst + 4 * stride, stride, vget_high_u16(l));
685 }
686
687 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
688 void aom_highbd_h_predictor_32x8_neon(uint16_t *dst, ptrdiff_t stride,
689 const uint16_t *above,
690 const uint16_t *left, int bd) {
691 (void)above;
692 (void)bd;
693 uint16x8_t l = vld1q_u16(left);
694 highbd_h_store_32x4(dst + 0 * stride, stride, vget_low_u16(l));
695 highbd_h_store_32x4(dst + 4 * stride, stride, vget_high_u16(l));
696 }
697 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
698
699 // For cases where height >= 16 we use pairs of loads to get LDP instructions.
700 #define HIGHBD_H_WXH_LARGE(w, h) \
701 void aom_highbd_h_predictor_##w##x##h##_neon( \
702 uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
703 const uint16_t *left, int bd) { \
704 (void)above; \
705 (void)bd; \
706 for (int i = 0; i < (h) / 16; ++i) { \
707 uint16x8_t l0 = vld1q_u16(left + 0); \
708 uint16x8_t l1 = vld1q_u16(left + 8); \
709 highbd_h_store_##w##x4(dst + 0 * stride, stride, vget_low_u16(l0)); \
710 highbd_h_store_##w##x4(dst + 4 * stride, stride, vget_high_u16(l0)); \
711 highbd_h_store_##w##x4(dst + 8 * stride, stride, vget_low_u16(l1)); \
712 highbd_h_store_##w##x4(dst + 12 * stride, stride, vget_high_u16(l1)); \
713 left += 16; \
714 dst += 16 * stride; \
715 } \
716 }
717
718 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
719 HIGHBD_H_WXH_LARGE(4, 16)
720 HIGHBD_H_WXH_LARGE(8, 16)
721 HIGHBD_H_WXH_LARGE(8, 32)
722 HIGHBD_H_WXH_LARGE(16, 16)
723 HIGHBD_H_WXH_LARGE(16, 32)
724 HIGHBD_H_WXH_LARGE(16, 64)
725 HIGHBD_H_WXH_LARGE(32, 16)
726 HIGHBD_H_WXH_LARGE(32, 32)
727 HIGHBD_H_WXH_LARGE(32, 64)
728 HIGHBD_H_WXH_LARGE(64, 16)
729 HIGHBD_H_WXH_LARGE(64, 32)
730 HIGHBD_H_WXH_LARGE(64, 64)
731 #else
732 HIGHBD_H_WXH_LARGE(8, 16)
733 HIGHBD_H_WXH_LARGE(16, 16)
734 HIGHBD_H_WXH_LARGE(16, 32)
735 HIGHBD_H_WXH_LARGE(32, 16)
736 HIGHBD_H_WXH_LARGE(32, 32)
737 HIGHBD_H_WXH_LARGE(32, 64)
738 HIGHBD_H_WXH_LARGE(64, 32)
739 HIGHBD_H_WXH_LARGE(64, 64)
740 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
741
742 #undef HIGHBD_H_WXH_LARGE
743
744 // -----------------------------------------------------------------------------
745 // PAETH
746
747 static inline void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride,
748 const uint16_t *const top_row,
749 const uint16_t *const left_column,
750 int width, int height) {
751 const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
752 const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
753 uint16x8_t top;
754 if (width == 4) {
755 top = vcombine_u16(vld1_u16(top_row), vdup_n_u16(0));
756 } else { // width == 8
757 top = vld1q_u16(top_row);
758 }
759
760 for (int y = 0; y < height; ++y) {
761 const uint16x8_t left = vdupq_n_u16(left_column[y]);
762
763 const uint16x8_t left_dist = vabdq_u16(top, top_left);
764 const uint16x8_t top_dist = vabdq_u16(left, top_left);
765 const uint16x8_t top_left_dist =
766 vabdq_u16(vaddq_u16(top, left), top_left_x2);
767
768 const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
769 const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
770 const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
771
772 // if (left_dist <= top_dist && left_dist <= top_left_dist)
773 const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
774 // dest[x] = left_column[y];
775 // Fill all the unused spaces with 'top'. They will be overwritten when
776 // the positions for top_left are known.
777 uint16x8_t result = vbslq_u16(left_mask, left, top);
778 // else if (top_dist <= top_left_dist)
779 // dest[x] = top_row[x];
780 // Add these values to the mask. They were already set.
781 const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
782 // else
783 // dest[x] = top_left;
784 result = vbslq_u16(left_or_top_mask, result, top_left);
785
786 if (width == 4) {
787 vst1_u16(dest, vget_low_u16(result));
788 } else { // width == 8
789 vst1q_u16(dest, result);
790 }
791 dest += stride;
792 }
793 }
794
795 #define HIGHBD_PAETH_NXM(W, H) \
796 void aom_highbd_paeth_predictor_##W##x##H##_neon( \
797 uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
798 const uint16_t *left, int bd) { \
799 (void)bd; \
800 highbd_paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
801 }
802
803 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
804 HIGHBD_PAETH_NXM(4, 4)
805 HIGHBD_PAETH_NXM(4, 8)
806 HIGHBD_PAETH_NXM(4, 16)
807 HIGHBD_PAETH_NXM(8, 4)
808 HIGHBD_PAETH_NXM(8, 8)
809 HIGHBD_PAETH_NXM(8, 16)
810 HIGHBD_PAETH_NXM(8, 32)
811 #else
812 HIGHBD_PAETH_NXM(4, 4)
813 HIGHBD_PAETH_NXM(4, 8)
814 HIGHBD_PAETH_NXM(8, 4)
815 HIGHBD_PAETH_NXM(8, 8)
816 HIGHBD_PAETH_NXM(8, 16)
817 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
818
819 // Select the closest values and collect them.
820 static inline uint16x8_t select_paeth(const uint16x8_t top,
821 const uint16x8_t left,
822 const uint16x8_t top_left,
823 const uint16x8_t left_le_top,
824 const uint16x8_t left_le_top_left,
825 const uint16x8_t top_le_top_left) {
826 // if (left_dist <= top_dist && left_dist <= top_left_dist)
827 const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
828 // dest[x] = left_column[y];
829 // Fill all the unused spaces with 'top'. They will be overwritten when
830 // the positions for top_left are known.
831 const uint16x8_t result = vbslq_u16(left_mask, left, top);
832 // else if (top_dist <= top_left_dist)
833 // dest[x] = top_row[x];
834 // Add these values to the mask. They were already set.
835 const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
836 // else
837 // dest[x] = top_left;
838 return vbslq_u16(left_or_top_mask, result, top_left);
839 }
840
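// Emits the Paeth selection for one group of eight columns. top_dist
// (|left - top_left|) does not depend on x, so the caller computes it once per
// row.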
841 #define PAETH_PREDICTOR(num) \
842 do { \
843 const uint16x8_t left_dist = vabdq_u16(top[num], top_left); \
844 const uint16x8_t top_left_dist = \
845 vabdq_u16(vaddq_u16(top[num], left), top_left_x2); \
846 const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist); \
847 const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist); \
848 const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist); \
849 const uint16x8_t result = \
850 select_paeth(top[num], left, top_left, left_le_top, left_le_top_left, \
851 top_le_top_left); \
852 vst1q_u16(dest + (num * 8), result); \
853 } while (0)
854
855 #define LOAD_TOP_ROW(num) vld1q_u16(top_row + (num * 8))
856
857 static inline void highbd_paeth16_plus_x_h_neon(
858 uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row,
859 const uint16_t *const left_column, int width, int height) {
860 const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
861 const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
862 uint16x8_t top[8];
863 top[0] = LOAD_TOP_ROW(0);
864 top[1] = LOAD_TOP_ROW(1);
865 if (width > 16) {
866 top[2] = LOAD_TOP_ROW(2);
867 top[3] = LOAD_TOP_ROW(3);
868 if (width == 64) {
869 top[4] = LOAD_TOP_ROW(4);
870 top[5] = LOAD_TOP_ROW(5);
871 top[6] = LOAD_TOP_ROW(6);
872 top[7] = LOAD_TOP_ROW(7);
873 }
874 }
875
876 for (int y = 0; y < height; ++y) {
877 const uint16x8_t left = vdupq_n_u16(left_column[y]);
878 const uint16x8_t top_dist = vabdq_u16(left, top_left);
879 PAETH_PREDICTOR(0);
880 PAETH_PREDICTOR(1);
881 if (width > 16) {
882 PAETH_PREDICTOR(2);
883 PAETH_PREDICTOR(3);
884 if (width == 64) {
885 PAETH_PREDICTOR(4);
886 PAETH_PREDICTOR(5);
887 PAETH_PREDICTOR(6);
888 PAETH_PREDICTOR(7);
889 }
890 }
891 dest += stride;
892 }
893 }
894
895 #define HIGHBD_PAETH_NXM_WIDE(W, H) \
896 void aom_highbd_paeth_predictor_##W##x##H##_neon( \
897 uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
898 const uint16_t *left, int bd) { \
899 (void)bd; \
900 highbd_paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
901 }
902
903 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
904 HIGHBD_PAETH_NXM_WIDE(16, 4)
905 HIGHBD_PAETH_NXM_WIDE(16, 8)
906 HIGHBD_PAETH_NXM_WIDE(16, 16)
907 HIGHBD_PAETH_NXM_WIDE(16, 32)
908 HIGHBD_PAETH_NXM_WIDE(16, 64)
909 HIGHBD_PAETH_NXM_WIDE(32, 8)
910 HIGHBD_PAETH_NXM_WIDE(32, 16)
911 HIGHBD_PAETH_NXM_WIDE(32, 32)
912 HIGHBD_PAETH_NXM_WIDE(32, 64)
913 HIGHBD_PAETH_NXM_WIDE(64, 16)
914 HIGHBD_PAETH_NXM_WIDE(64, 32)
915 HIGHBD_PAETH_NXM_WIDE(64, 64)
916 #else
917 HIGHBD_PAETH_NXM_WIDE(16, 8)
918 HIGHBD_PAETH_NXM_WIDE(16, 16)
919 HIGHBD_PAETH_NXM_WIDE(16, 32)
920 HIGHBD_PAETH_NXM_WIDE(32, 16)
921 HIGHBD_PAETH_NXM_WIDE(32, 32)
922 HIGHBD_PAETH_NXM_WIDE(32, 64)
923 HIGHBD_PAETH_NXM_WIDE(64, 32)
924 HIGHBD_PAETH_NXM_WIDE(64, 64)
925 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
926
927 // -----------------------------------------------------------------------------
928 // SMOOTH
929
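// The SMOOTH predictor blends a vertical and a horizontal interpolation:
//   pred(r, c) = (w_y[r] * top[c] + (256 - w_y[r]) * bottom_left +
//                 w_x[c] * left[r] + (256 - w_x[c]) * top_right + 256) >> 9
// with the weights taken from smooth_weights_u16.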
930 // Computes 256 - v in each lane via int8 negation (valid for v in [1, 255]).
931 static inline uint16x4_t negate_s8(const uint16x4_t v) {
932 return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
933 }
934
935 static inline void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride,
936 const uint16_t *const top_row,
937 const uint16_t *const left_column,
938 const int height) {
939 const uint16_t top_right = top_row[3];
940 const uint16_t bottom_left = left_column[height - 1];
941 const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
942
943 const uint16x4_t top_v = vld1_u16(top_row);
944 const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
945 const uint16x4_t weights_x_v = vld1_u16(smooth_weights_u16);
946 const uint16x4_t scaled_weights_x = negate_s8(weights_x_v);
947 const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
948
949 for (int y = 0; y < height; ++y) {
950 // Each variable in the running summation is named for the last item to be
951 // accumulated.
952 const uint32x4_t weighted_top =
953 vmlal_n_u16(weighted_tr, top_v, weights_y[y]);
954 const uint32x4_t weighted_left =
955 vmlal_n_u16(weighted_top, weights_x_v, left_column[y]);
956 const uint32x4_t weighted_bl =
957 vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]);
958
959 const uint16x4_t pred =
960 vrshrn_n_u32(weighted_bl, SMOOTH_WEIGHT_LOG2_SCALE + 1);
961 vst1_u16(dst, pred);
962 dst += stride;
963 }
964 }
965
966 // Common code between 8xH and [16|32|64]xH.
967 static inline void highbd_calculate_pred8(
968 uint16_t *dst, const uint32x4_t weighted_corners_low,
969 const uint32x4_t weighted_corners_high, const uint16x4x2_t top_vals,
970 const uint16x4x2_t weights_x, const uint16_t left_y,
971 const uint16_t weight_y) {
972 // Each variable in the running summation is named for the last item to be
973 // accumulated.
974 const uint32x4_t weighted_top_low =
975 vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y);
976 const uint32x4_t weighted_edges_low =
977 vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y);
978
979 const uint16x4_t pred_low =
980 vrshrn_n_u32(weighted_edges_low, SMOOTH_WEIGHT_LOG2_SCALE + 1);
981 vst1_u16(dst, pred_low);
982
983 const uint32x4_t weighted_top_high =
984 vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y);
985 const uint32x4_t weighted_edges_high =
986 vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y);
987
988 const uint16x4_t pred_high =
989 vrshrn_n_u32(weighted_edges_high, SMOOTH_WEIGHT_LOG2_SCALE + 1);
990 vst1_u16(dst + 4, pred_high);
991 }
992
993 static void highbd_smooth_8xh_neon(uint16_t *dst, ptrdiff_t stride,
994 const uint16_t *const top_row,
995 const uint16_t *const left_column,
996 const int height) {
997 const uint16_t top_right = top_row[7];
998 const uint16_t bottom_left = left_column[height - 1];
999 const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
1000
1001 const uint16x4x2_t top_vals = { { vld1_u16(top_row),
1002 vld1_u16(top_row + 4) } };
1003 const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
1004 const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
1005 vld1_u16(smooth_weights_u16 + 8) } };
1006 const uint32x4_t weighted_tr_low =
1007 vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
1008 const uint32x4_t weighted_tr_high =
1009 vmull_n_u16(negate_s8(weights_x.val[1]), top_right);
1010
1011 for (int y = 0; y < height; ++y) {
1012 const uint32x4_t weighted_bl =
1013 vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
1014 const uint32x4_t weighted_corners_low =
1015 vaddq_u32(weighted_bl, weighted_tr_low);
1016 const uint32x4_t weighted_corners_high =
1017 vaddq_u32(weighted_bl, weighted_tr_high);
1018 highbd_calculate_pred8(dst, weighted_corners_low, weighted_corners_high,
1019 top_vals, weights_x, left_column[y], weights_y[y]);
1020 dst += stride;
1021 }
1022 }
1023
1024 #define HIGHBD_SMOOTH_NXM(W, H) \
1025 void aom_highbd_smooth_predictor_##W##x##H##_neon( \
1026 uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
1027 const uint16_t *left, int bd) { \
1028 (void)bd; \
1029 highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H); \
1030 }
1031
1032 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1033 HIGHBD_SMOOTH_NXM(4, 4)
1034 HIGHBD_SMOOTH_NXM(4, 8)
1035 HIGHBD_SMOOTH_NXM(8, 4)
1036 HIGHBD_SMOOTH_NXM(8, 8)
1037 HIGHBD_SMOOTH_NXM(4, 16)
1038 HIGHBD_SMOOTH_NXM(8, 16)
1039 HIGHBD_SMOOTH_NXM(8, 32)
1040 #else
1041 HIGHBD_SMOOTH_NXM(4, 4)
1042 HIGHBD_SMOOTH_NXM(4, 8)
1043 HIGHBD_SMOOTH_NXM(8, 4)
1044 HIGHBD_SMOOTH_NXM(8, 8)
1045 HIGHBD_SMOOTH_NXM(8, 16)
1046 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1047
1048 #undef HIGHBD_SMOOTH_NXM
1049
1050 // For width 16 and above.
1051 #define HIGHBD_SMOOTH_PREDICTOR(W) \
1052 static void highbd_smooth_##W##xh_neon( \
1053 uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \
1054 const uint16_t *const left_column, const int height) { \
1055 const uint16_t top_right = top_row[(W)-1]; \
1056 const uint16_t bottom_left = left_column[height - 1]; \
1057 const uint16_t *const weights_y = smooth_weights_u16 + height - 4; \
1058 \
1059 /* Precompute weighted values that don't vary with |y|. */ \
1060 uint32x4_t weighted_tr_low[(W) >> 3]; \
1061 uint32x4_t weighted_tr_high[(W) >> 3]; \
1062 for (int i = 0; i < (W) >> 3; ++i) { \
1063 const int x = i << 3; \
1064 const uint16x4_t weights_x_low = \
1065 vld1_u16(smooth_weights_u16 + (W)-4 + x); \
1066 weighted_tr_low[i] = vmull_n_u16(negate_s8(weights_x_low), top_right); \
1067 const uint16x4_t weights_x_high = \
1068 vld1_u16(smooth_weights_u16 + (W) + x); \
1069 weighted_tr_high[i] = vmull_n_u16(negate_s8(weights_x_high), top_right); \
1070 } \
1071 \
1072 const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); \
1073 for (int y = 0; y < height; ++y) { \
1074 const uint32x4_t weighted_bl = \
1075 vmull_n_u16(bottom_left_v, 256 - weights_y[y]); \
1076 uint16_t *dst_x = dst; \
1077 for (int i = 0; i < (W) >> 3; ++i) { \
1078 const int x = i << 3; \
1079 const uint16x4x2_t top_vals = { { vld1_u16(top_row + x), \
1080 vld1_u16(top_row + x + 4) } }; \
1081 const uint32x4_t weighted_corners_low = \
1082 vaddq_u32(weighted_bl, weighted_tr_low[i]); \
1083 const uint32x4_t weighted_corners_high = \
1084 vaddq_u32(weighted_bl, weighted_tr_high[i]); \
1085 /* Accumulate weighted edge values and store. */ \
1086 const uint16x4x2_t weights_x = { \
1087 { vld1_u16(smooth_weights_u16 + (W)-4 + x), \
1088 vld1_u16(smooth_weights_u16 + (W) + x) } \
1089 }; \
1090 highbd_calculate_pred8(dst_x, weighted_corners_low, \
1091 weighted_corners_high, top_vals, weights_x, \
1092 left_column[y], weights_y[y]); \
1093 dst_x += 8; \
1094 } \
1095 dst += stride; \
1096 } \
1097 }
1098
1099 HIGHBD_SMOOTH_PREDICTOR(16)
1100 HIGHBD_SMOOTH_PREDICTOR(32)
1101 HIGHBD_SMOOTH_PREDICTOR(64)
1102
1103 #undef HIGHBD_SMOOTH_PREDICTOR
1104
1105 #define HIGHBD_SMOOTH_NXM_WIDE(W, H) \
1106 void aom_highbd_smooth_predictor_##W##x##H##_neon( \
1107 uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
1108 const uint16_t *left, int bd) { \
1109 (void)bd; \
1110 highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H); \
1111 }
1112
1113 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1114 HIGHBD_SMOOTH_NXM_WIDE(16, 4)
1115 HIGHBD_SMOOTH_NXM_WIDE(16, 8)
1116 HIGHBD_SMOOTH_NXM_WIDE(16, 16)
1117 HIGHBD_SMOOTH_NXM_WIDE(16, 32)
1118 HIGHBD_SMOOTH_NXM_WIDE(16, 64)
1119 HIGHBD_SMOOTH_NXM_WIDE(32, 8)
1120 HIGHBD_SMOOTH_NXM_WIDE(32, 16)
1121 HIGHBD_SMOOTH_NXM_WIDE(32, 32)
1122 HIGHBD_SMOOTH_NXM_WIDE(32, 64)
1123 HIGHBD_SMOOTH_NXM_WIDE(64, 16)
1124 HIGHBD_SMOOTH_NXM_WIDE(64, 32)
1125 HIGHBD_SMOOTH_NXM_WIDE(64, 64)
1126 #else
1127 HIGHBD_SMOOTH_NXM_WIDE(16, 8)
1128 HIGHBD_SMOOTH_NXM_WIDE(16, 16)
1129 HIGHBD_SMOOTH_NXM_WIDE(16, 32)
1130 HIGHBD_SMOOTH_NXM_WIDE(32, 16)
1131 HIGHBD_SMOOTH_NXM_WIDE(32, 32)
1132 HIGHBD_SMOOTH_NXM_WIDE(32, 64)
1133 HIGHBD_SMOOTH_NXM_WIDE(64, 32)
1134 HIGHBD_SMOOTH_NXM_WIDE(64, 64)
1135 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1136
1137 #undef HIGHBD_SMOOTH_NXM_WIDE
1138
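// SMOOTH_V keeps only the vertical part of the SMOOTH blend:
//   pred(r, c) = (w_y[r] * top[c] + (256 - w_y[r]) * bottom_left + 128) >> 8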
1139 static void highbd_smooth_v_4xh_neon(uint16_t *dst, ptrdiff_t stride,
1140 const uint16_t *const top_row,
1141 const uint16_t *const left_column,
1142 const int height) {
1143 const uint16_t bottom_left = left_column[height - 1];
1144 const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
1145
1146 const uint16x4_t top_v = vld1_u16(top_row);
1147 const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
1148
1149 for (int y = 0; y < height; ++y) {
1150 const uint32x4_t weighted_bl =
1151 vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
1152 const uint32x4_t weighted_top =
1153 vmlal_n_u16(weighted_bl, top_v, weights_y[y]);
1154 vst1_u16(dst, vrshrn_n_u32(weighted_top, SMOOTH_WEIGHT_LOG2_SCALE));
1155
1156 dst += stride;
1157 }
1158 }
1159
1160 static void highbd_smooth_v_8xh_neon(uint16_t *dst, const ptrdiff_t stride,
1161 const uint16_t *const top_row,
1162 const uint16_t *const left_column,
1163 const int height) {
1164 const uint16_t bottom_left = left_column[height - 1];
1165 const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
1166
1167 const uint16x4_t top_low = vld1_u16(top_row);
1168 const uint16x4_t top_high = vld1_u16(top_row + 4);
1169 const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
1170
1171 for (int y = 0; y < height; ++y) {
1172 const uint32x4_t weighted_bl =
1173 vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
1174
1175 const uint32x4_t weighted_top_low =
1176 vmlal_n_u16(weighted_bl, top_low, weights_y[y]);
1177 vst1_u16(dst, vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE));
1178
1179 const uint32x4_t weighted_top_high =
1180 vmlal_n_u16(weighted_bl, top_high, weights_y[y]);
1181 vst1_u16(dst + 4,
1182 vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE));
1183 dst += stride;
1184 }
1185 }
1186
1187 #define HIGHBD_SMOOTH_V_NXM(W, H) \
1188 void aom_highbd_smooth_v_predictor_##W##x##H##_neon( \
1189 uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
1190 const uint16_t *left, int bd) { \
1191 (void)bd; \
1192 highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
1193 }
1194
1195 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1196 HIGHBD_SMOOTH_V_NXM(4, 4)
1197 HIGHBD_SMOOTH_V_NXM(4, 8)
1198 HIGHBD_SMOOTH_V_NXM(4, 16)
1199 HIGHBD_SMOOTH_V_NXM(8, 4)
1200 HIGHBD_SMOOTH_V_NXM(8, 8)
1201 HIGHBD_SMOOTH_V_NXM(8, 16)
1202 HIGHBD_SMOOTH_V_NXM(8, 32)
1203 #else
1204 HIGHBD_SMOOTH_V_NXM(4, 4)
1205 HIGHBD_SMOOTH_V_NXM(4, 8)
1206 HIGHBD_SMOOTH_V_NXM(8, 4)
1207 HIGHBD_SMOOTH_V_NXM(8, 8)
1208 HIGHBD_SMOOTH_V_NXM(8, 16)
1209 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1210
1211 #undef HIGHBD_SMOOTH_V_NXM
1212
1213 // For width 16 and above.
1214 #define HIGHBD_SMOOTH_V_PREDICTOR(W) \
1215 static void highbd_smooth_v_##W##xh_neon( \
1216 uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row, \
1217 const uint16_t *const left_column, const int height) { \
1218 const uint16_t bottom_left = left_column[height - 1]; \
1219 const uint16_t *const weights_y = smooth_weights_u16 + height - 4; \
1220 \
1221 uint16x4x2_t top_vals[(W) >> 3]; \
1222 for (int i = 0; i < (W) >> 3; ++i) { \
1223 const int x = i << 3; \
1224 top_vals[i].val[0] = vld1_u16(top_row + x); \
1225 top_vals[i].val[1] = vld1_u16(top_row + x + 4); \
1226 } \
1227 \
1228 const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); \
1229 for (int y = 0; y < height; ++y) { \
1230 const uint32x4_t weighted_bl = \
1231 vmull_n_u16(bottom_left_v, 256 - weights_y[y]); \
1232 \
1233 uint16_t *dst_x = dst; \
1234 for (int i = 0; i < (W) >> 3; ++i) { \
1235 const uint32x4_t weighted_top_low = \
1236 vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]); \
1237 vst1_u16(dst_x, \
1238 vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE)); \
1239 \
1240 const uint32x4_t weighted_top_high = \
1241 vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]); \
1242 vst1_u16(dst_x + 4, \
1243 vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
1244 dst_x += 8; \
1245 } \
1246 dst += stride; \
1247 } \
1248 }
1249
1250 HIGHBD_SMOOTH_V_PREDICTOR(16)
1251 HIGHBD_SMOOTH_V_PREDICTOR(32)
1252 HIGHBD_SMOOTH_V_PREDICTOR(64)
1253
1254 #undef HIGHBD_SMOOTH_V_PREDICTOR
1255
1256 #define HIGHBD_SMOOTH_V_NXM_WIDE(W, H) \
1257 void aom_highbd_smooth_v_predictor_##W##x##H##_neon( \
1258 uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
1259 const uint16_t *left, int bd) { \
1260 (void)bd; \
1261 highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
1262 }
1263
1264 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1265 HIGHBD_SMOOTH_V_NXM_WIDE(16, 4)
1266 HIGHBD_SMOOTH_V_NXM_WIDE(16, 8)
1267 HIGHBD_SMOOTH_V_NXM_WIDE(16, 16)
1268 HIGHBD_SMOOTH_V_NXM_WIDE(16, 32)
1269 HIGHBD_SMOOTH_V_NXM_WIDE(16, 64)
1270 HIGHBD_SMOOTH_V_NXM_WIDE(32, 8)
1271 HIGHBD_SMOOTH_V_NXM_WIDE(32, 16)
1272 HIGHBD_SMOOTH_V_NXM_WIDE(32, 32)
1273 HIGHBD_SMOOTH_V_NXM_WIDE(32, 64)
1274 HIGHBD_SMOOTH_V_NXM_WIDE(64, 16)
1275 HIGHBD_SMOOTH_V_NXM_WIDE(64, 32)
1276 HIGHBD_SMOOTH_V_NXM_WIDE(64, 64)
1277 #else
1278 HIGHBD_SMOOTH_V_NXM_WIDE(16, 8)
1279 HIGHBD_SMOOTH_V_NXM_WIDE(16, 16)
1280 HIGHBD_SMOOTH_V_NXM_WIDE(16, 32)
1281 HIGHBD_SMOOTH_V_NXM_WIDE(32, 16)
1282 HIGHBD_SMOOTH_V_NXM_WIDE(32, 32)
1283 HIGHBD_SMOOTH_V_NXM_WIDE(32, 64)
1284 HIGHBD_SMOOTH_V_NXM_WIDE(64, 32)
1285 HIGHBD_SMOOTH_V_NXM_WIDE(64, 64)
1286 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1287
1288 #undef HIGHBD_SMOOTH_V_NXM_WIDE
1289
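// SMOOTH_H keeps only the horizontal part of the SMOOTH blend:
//   pred(r, c) = (w_x[c] * left[r] + (256 - w_x[c]) * top_right + 128) >> 8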
1290 static inline void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride,
1291 const uint16_t *const top_row,
1292 const uint16_t *const left_column,
1293 const int height) {
1294 const uint16_t top_right = top_row[3];
1295
1296 const uint16x4_t weights_x = vld1_u16(smooth_weights_u16);
1297 const uint16x4_t scaled_weights_x = negate_s8(weights_x);
1298
1299 const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
1300 for (int y = 0; y < height; ++y) {
1301 const uint32x4_t weighted_left =
1302 vmlal_n_u16(weighted_tr, weights_x, left_column[y]);
1303 vst1_u16(dst, vrshrn_n_u32(weighted_left, SMOOTH_WEIGHT_LOG2_SCALE));
1304 dst += stride;
1305 }
1306 }
1307
1308 static inline void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride,
1309 const uint16_t *const top_row,
1310 const uint16_t *const left_column,
1311 const int height) {
1312 const uint16_t top_right = top_row[7];
1313
1314 const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
1315 vld1_u16(smooth_weights_u16 + 8) } };
1316
1317 const uint32x4_t weighted_tr_low =
1318 vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
1319 const uint32x4_t weighted_tr_high =
1320 vmull_n_u16(negate_s8(weights_x.val[1]), top_right);
1321
1322 for (int y = 0; y < height; ++y) {
1323 const uint16_t left_y = left_column[y];
1324 const uint32x4_t weighted_left_low =
1325 vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y);
1326 vst1_u16(dst, vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));
1327
1328 const uint32x4_t weighted_left_high =
1329 vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y);
1330 vst1_u16(dst + 4,
1331 vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE));
1332 dst += stride;
1333 }
1334 }
1335
1336 #define HIGHBD_SMOOTH_H_NXM(W, H) \
1337 void aom_highbd_smooth_h_predictor_##W##x##H##_neon( \
1338 uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
1339 const uint16_t *left, int bd) { \
1340 (void)bd; \
1341 highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
1342 }
1343
1344 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1345 HIGHBD_SMOOTH_H_NXM(4, 4)
1346 HIGHBD_SMOOTH_H_NXM(4, 8)
1347 HIGHBD_SMOOTH_H_NXM(4, 16)
1348 HIGHBD_SMOOTH_H_NXM(8, 4)
1349 HIGHBD_SMOOTH_H_NXM(8, 8)
1350 HIGHBD_SMOOTH_H_NXM(8, 16)
1351 HIGHBD_SMOOTH_H_NXM(8, 32)
1352 #else
1353 HIGHBD_SMOOTH_H_NXM(4, 4)
1354 HIGHBD_SMOOTH_H_NXM(4, 8)
1355 HIGHBD_SMOOTH_H_NXM(8, 4)
1356 HIGHBD_SMOOTH_H_NXM(8, 8)
1357 HIGHBD_SMOOTH_H_NXM(8, 16)
1358 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1359
1360 #undef HIGHBD_SMOOTH_H_NXM
1361
1362 // For width 16 and above.
1363 #define HIGHBD_SMOOTH_H_PREDICTOR(W) \
1364 static void highbd_smooth_h_##W##xh_neon( \
1365 uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \
1366 const uint16_t *const left_column, const int height) { \
1367 const uint16_t top_right = top_row[(W)-1]; \
1368 \
1369 uint16x4_t weights_x_low[(W) >> 3]; \
1370 uint16x4_t weights_x_high[(W) >> 3]; \
1371 uint32x4_t weighted_tr_low[(W) >> 3]; \
1372 uint32x4_t weighted_tr_high[(W) >> 3]; \
1373 for (int i = 0; i < (W) >> 3; ++i) { \
1374 const int x = i << 3; \
1375 weights_x_low[i] = vld1_u16(smooth_weights_u16 + (W)-4 + x); \
1376 weighted_tr_low[i] = \
1377 vmull_n_u16(negate_s8(weights_x_low[i]), top_right); \
1378 weights_x_high[i] = vld1_u16(smooth_weights_u16 + (W) + x); \
1379 weighted_tr_high[i] = \
1380 vmull_n_u16(negate_s8(weights_x_high[i]), top_right); \
1381 } \
1382 \
1383 for (int y = 0; y < height; ++y) { \
1384 uint16_t *dst_x = dst; \
1385 const uint16_t left_y = left_column[y]; \
1386 for (int i = 0; i < (W) >> 3; ++i) { \
1387 const uint32x4_t weighted_left_low = \
1388 vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y); \
1389 vst1_u16(dst_x, \
1390 vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE)); \
1391 \
1392 const uint32x4_t weighted_left_high = \
1393 vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y); \
1394 vst1_u16(dst_x + 4, \
1395 vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
1396 dst_x += 8; \
1397 } \
1398 dst += stride; \
1399 } \
1400 }
1401
1402 HIGHBD_SMOOTH_H_PREDICTOR(16)
1403 HIGHBD_SMOOTH_H_PREDICTOR(32)
1404 HIGHBD_SMOOTH_H_PREDICTOR(64)
1405
1406 #undef HIGHBD_SMOOTH_H_PREDICTOR
1407
1408 #define HIGHBD_SMOOTH_H_NXM_WIDE(W, H) \
1409 void aom_highbd_smooth_h_predictor_##W##x##H##_neon( \
1410 uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
1411 const uint16_t *left, int bd) { \
1412 (void)bd; \
1413 highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
1414 }
1415
1416 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1417 HIGHBD_SMOOTH_H_NXM_WIDE(16, 4)
1418 HIGHBD_SMOOTH_H_NXM_WIDE(16, 8)
1419 HIGHBD_SMOOTH_H_NXM_WIDE(16, 16)
1420 HIGHBD_SMOOTH_H_NXM_WIDE(16, 32)
1421 HIGHBD_SMOOTH_H_NXM_WIDE(16, 64)
1422 HIGHBD_SMOOTH_H_NXM_WIDE(32, 8)
1423 HIGHBD_SMOOTH_H_NXM_WIDE(32, 16)
1424 HIGHBD_SMOOTH_H_NXM_WIDE(32, 32)
1425 HIGHBD_SMOOTH_H_NXM_WIDE(32, 64)
1426 HIGHBD_SMOOTH_H_NXM_WIDE(64, 16)
1427 HIGHBD_SMOOTH_H_NXM_WIDE(64, 32)
1428 HIGHBD_SMOOTH_H_NXM_WIDE(64, 64)
1429 #else
1430 HIGHBD_SMOOTH_H_NXM_WIDE(16, 8)
1431 HIGHBD_SMOOTH_H_NXM_WIDE(16, 16)
1432 HIGHBD_SMOOTH_H_NXM_WIDE(16, 32)
1433 HIGHBD_SMOOTH_H_NXM_WIDE(32, 16)
1434 HIGHBD_SMOOTH_H_NXM_WIDE(32, 32)
1435 HIGHBD_SMOOTH_H_NXM_WIDE(32, 64)
1436 HIGHBD_SMOOTH_H_NXM_WIDE(64, 32)
1437 HIGHBD_SMOOTH_H_NXM_WIDE(64, 64)
1438 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1439
1440 #undef HIGHBD_SMOOTH_H_NXM_WIDE
1441
1442 // -----------------------------------------------------------------------------
1443 // Z1
1444
1445 static const int16_t iota1_s16[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
1446 static const int16_t iota2_s16[] = { 0, 2, 4, 6, 8, 10, 12, 14 };
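// iota1_s16 enumerates lane indices so that a single compare against
// (max_base_x - base) yields the clamp mask; iota2_s16 steps by two, presumably
// for the upsampled path where each lane advances two source samples.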
1447
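// In scalar terms the interpolation below computes, for a pre-doubled shift in
// [0, 62]:
//   pred = (a0 * (64 - shift) + a1 * shift + 32) >> 6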
1448 static AOM_FORCE_INLINE uint16x4_t highbd_dr_z1_apply_shift_x4(uint16x4_t a0,
1449 uint16x4_t a1,
1450 int shift) {
1451   // The C implementation of the z1 predictor uses (32 - shift) and a right
1452   // shift by 5. Here the shift value is kept doubled, so we use (64 - shift)
1453   // and a right shift by 6, avoiding an unnecessary halving of the shift.
1454 uint32x4_t res = vmull_n_u16(a1, shift);
1455 res = vmlal_n_u16(res, a0, 64 - shift);
1456 return vrshrn_n_u32(res, 6);
1457 }
1458
1459 static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0,
1460 uint16x8_t a1,
1461 int shift) {
1462 return vcombine_u16(
1463 highbd_dr_z1_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1), shift),
1464 highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift));
1465 }
1466
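// Byte-shuffle table used to clamp vector loads near the end of the source
// array. Row n keeps the n in-range uint16 elements that end just before the
// maximum index in lanes 0..n-1 and fills the remaining lanes with the element
// at the maximum index, so no lane uses data past the last valid sample.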
1467 // clang-format off
1468 static const uint8_t kLoadMaxShuffles[] = {
1469 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
1470 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
1471 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
1472 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
1473 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15,
1474 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15,
1475 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 14, 15,
1476 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1477 };
1478 // clang-format on
1479
1480 static inline uint16x8_t zn_load_masked_neon(const uint16_t *ptr,
1481 int shuffle_idx) {
1482 uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
1483 uint8x16_t src = vreinterpretq_u8_u16(vld1q_u16(ptr));
1484 #if AOM_ARCH_AARCH64
1485 return vreinterpretq_u16_u8(vqtbl1q_u8(src, shuffle));
1486 #else
1487 uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
1488 uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
1489 uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
1490 return vreinterpretq_u16_u8(vcombine_u8(lo, hi));
1491 #endif
1492 }
1493
1494 static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst,
1495 ptrdiff_t stride, int bw,
1496 int bh,
1497 const uint16_t *above,
1498 int dx) {
1499 assert(bw % 4 == 0);
1500 assert(bh % 4 == 0);
1501 assert(dx > 0);
1502
1503 const int max_base_x = (bw + bh) - 1;
1504 const int above_max = above[max_base_x];
1505
1506 const int16x8_t iota1x8 = vld1q_s16(iota1_s16);
1507 const int16x4_t iota1x4 = vget_low_s16(iota1x8);
1508
1509 int x = dx;
1510 int r = 0;
1511 do {
1512 const int base = x >> 6;
1513 if (base >= max_base_x) {
1514 for (int i = r; i < bh; ++i) {
1515 aom_memset16(dst, above_max, bw);
1516 dst += stride;
1517 }
1518 return;
1519 }
1520
1521 // The C implementation of the z1 predictor when not upsampling uses:
1522 // ((x & 0x3f) >> 1)
1523 // The right shift is unnecessary here since we instead shift by +1 later,
1524 // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
1525 const int shift = x & 0x3e;
1526
1527 if (bw == 4) {
1528 const uint16x4_t a0 = vld1_u16(&above[base]);
1529 const uint16x4_t a1 = vld1_u16(&above[base + 1]);
1530 const uint16x4_t val = highbd_dr_z1_apply_shift_x4(a0, a1, shift);
1531 const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota1x4);
1532 const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max));
1533 vst1_u16(dst, res);
1534 } else {
1535 int c = 0;
1536 do {
1537 uint16x8_t a0;
1538 uint16x8_t a1;
1539 if (base + c >= max_base_x) {
1540 a0 = a1 = vdupq_n_u16(above_max);
1541 } else {
1542 if (base + c + 7 >= max_base_x) {
1543 int shuffle_idx = max_base_x - base - c;
1544 a0 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
1545 } else {
1546 a0 = vld1q_u16(above + base + c);
1547 }
1548 if (base + c + 8 >= max_base_x) {
1549 int shuffle_idx = max_base_x - base - c - 1;
1550 a1 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
1551 } else {
1552 a1 = vld1q_u16(above + base + c + 1);
1553 }
1554 }
1555
1556 vst1q_u16(dst + c, highbd_dr_z1_apply_shift_x8(a0, a1, shift));
1557 c += 8;
1558 } while (c < bw);
1559 }
1560
1561 dst += stride;
1562 x += dx;
1563 } while (++r < bh);
1564 }
1565
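// Variant for upsample_above: `above` has already been upsampled to half-pel
// positions, so the base index advances by two per output pixel and vld2
// de-interleaves the samples into the a0/a1 pair needed for interpolation.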
1566 static void highbd_dr_prediction_z1_upsample1_neon(uint16_t *dst,
1567 ptrdiff_t stride, int bw,
1568 int bh,
1569 const uint16_t *above,
1570 int dx) {
1571 assert(bw % 4 == 0);
1572 assert(bh % 4 == 0);
1573 assert(dx > 0);
1574
1575 const int max_base_x = ((bw + bh) - 1) << 1;
1576 const int above_max = above[max_base_x];
1577
1578 const int16x8_t iota2x8 = vld1q_s16(iota2_s16);
1579 const int16x4_t iota2x4 = vget_low_s16(iota2x8);
1580
1581 int x = dx;
1582 int r = 0;
1583 do {
1584 const int base = x >> 5;
1585 if (base >= max_base_x) {
1586 for (int i = r; i < bh; ++i) {
1587 aom_memset16(dst, above_max, bw);
1588 dst += stride;
1589 }
1590 return;
1591 }
1592
1593 // The C implementation of the z1 predictor when upsampling uses:
1594 // (((x << 1) & 0x3f) >> 1)
1595 // The right shift is unnecessary here since we instead shift by +1 later,
1596 // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
1597 const int shift = (x << 1) & 0x3e;
1598
1599 if (bw == 4) {
1600 const uint16x4x2_t a01 = vld2_u16(&above[base]);
1601 const uint16x4_t val =
1602 highbd_dr_z1_apply_shift_x4(a01.val[0], a01.val[1], shift);
1603 const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota2x4);
1604 const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max));
1605 vst1_u16(dst, res);
1606 } else {
1607 int c = 0;
1608 do {
1609 const uint16x8x2_t a01 = vld2q_u16(&above[base + 2 * c]);
1610 const uint16x8_t val =
1611 highbd_dr_z1_apply_shift_x8(a01.val[0], a01.val[1], shift);
1612 const uint16x8_t cmp =
1613 vcgtq_s16(vdupq_n_s16(max_base_x - base - 2 * c), iota2x8);
1614 const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max));
1615 vst1q_u16(dst + c, res);
1616 c += 8;
1617 } while (c < bw);
1618 }
1619
1620 dst += stride;
1621 x += dx;
1622 } while (++r < bh);
1623 }
1624
1625 // Directional prediction, zone 1: 0 < angle < 90
1626 void av1_highbd_dr_prediction_z1_neon(uint16_t *dst, ptrdiff_t stride, int bw,
1627 int bh, const uint16_t *above,
1628 const uint16_t *left, int upsample_above,
1629 int dx, int dy, int bd) {
1630 (void)left;
1631 (void)dy;
1632 (void)bd;
1633 assert(dy == 1);
1634
1635 if (upsample_above) {
1636 highbd_dr_prediction_z1_upsample1_neon(dst, stride, bw, bh, above, dx);
1637 } else {
1638 highbd_dr_prediction_z1_upsample0_neon(dst, stride, bw, bh, above, dx);
1639 }
1640 }
1641
1642 // -----------------------------------------------------------------------------
1643 // Z2
1644
1645 #if AOM_ARCH_AARCH64
1646 // Incrementally shift more elements from `above` into the result, merging with
1647 // existing `left` elements.
1648 // X0, X1, X2, X3
1649 // Y0, X0, X1, X2
1650 // Y0, Y1, X0, X1
1651 // Y0, Y1, Y2, X0
1652 // Y0, Y1, Y2, Y3
1653 // clang-format off
1654 static const uint8_t z2_merge_shuffles_u16x4[5][8] = {
1655 { 8, 9, 10, 11, 12, 13, 14, 15 },
1656 { 0, 1, 8, 9, 10, 11, 12, 13 },
1657 { 0, 1, 2, 3, 8, 9, 10, 11 },
1658 { 0, 1, 2, 3, 4, 5, 8, 9 },
1659 { 0, 1, 2, 3, 4, 5, 6, 7 },
1660 };
1661 // clang-format on
1662
1663 // Incrementally shift more elements from `above` into the result, merging with
1664 // existing `left` elements.
1665 // X0, X1, X2, X3, X4, X5, X6, X7
1666 // Y0, X0, X1, X2, X3, X4, X5, X6
1667 // Y0, Y1, X0, X1, X2, X3, X4, X5
1668 // Y0, Y1, Y2, X0, X1, X2, X3, X4
1669 // Y0, Y1, Y2, Y3, X0, X1, X2, X3
1670 // Y0, Y1, Y2, Y3, Y4, X0, X1, X2
1671 // Y0, Y1, Y2, Y3, Y4, Y5, X0, X1
1672 // Y0, Y1, Y2, Y3, Y4, Y5, Y6, X0
1673 // Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7
1674 // clang-format off
1675 static const uint8_t z2_merge_shuffles_u16x8[9][16] = {
1676 { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
1677 { 0, 1, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
1678 { 0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 },
1679 { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 },
1680 { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 },
1681 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 18, 19, 20, 21 },
1682 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19 },
1683 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17 },
1684 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
1685 };
1686 // clang-format on
1687
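// Lane masks: z2_y_iter_masks_u16x4[n] / z2_y_iter_masks_u16x8[n] keep only
// the first n lanes. They zero out lanes of a left-column gather that were
// not actually loaded for the current iteration.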
1688 // clang-format off
1689 static const uint16_t z2_y_iter_masks_u16x4[5][4] = {
1690 { 0U, 0U, 0U, 0U },
1691 { 0xffffU, 0U, 0U, 0U },
1692 { 0xffffU, 0xffffU, 0U, 0U },
1693 { 0xffffU, 0xffffU, 0xffffU, 0U },
1694 { 0xffffU, 0xffffU, 0xffffU, 0xffffU },
1695 };
1696 // clang-format on
1697
1698 // clang-format off
1699 static const uint16_t z2_y_iter_masks_u16x8[9][8] = {
1700 { 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U },
1701 { 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U, 0U },
1702 { 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U },
1703 { 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U },
1704 { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U },
1705 { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U },
1706 { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U },
1707 { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U },
1708 { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU },
1709 };
1710 // clang-format on
1711
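// The following helpers gather left-column samples with a byte tbl lookup for
// the case where the required span of `left` already fits in one or two
// vector registers. Callers invoke them twice, with the source data offset by
// one element, to obtain the l0/l1 pair needed for interpolation.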
1712 static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x8(
1713 const uint16x8_t left_data, const int16x4_t indices, int base, int n) {
1714 // Need to adjust indices to operate on 0-based indices rather than
1715 // `base`-based indices and then adjust from uint16x4 indices to uint8x8
1716 // indices so we can use a tbl instruction (which only operates on bytes).
1717 uint8x8_t left_indices =
1718 vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
1719 left_indices = vtrn1_u8(left_indices, left_indices);
1720 left_indices = vadd_u8(left_indices, left_indices);
1721 left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100)));
1722 const uint16x4_t ret = vreinterpret_u16_u8(
1723 vqtbl1_u8(vreinterpretq_u8_u16(left_data), left_indices));
1724 return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n]));
1725 }
1726
1727 static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x16(
1728 const uint16x8x2_t left_data, const int16x4_t indices, int base, int n) {
1729 // Need to adjust indices to operate on 0-based indices rather than
1730 // `base`-based indices and then adjust from uint16x4 indices to uint8x8
1731 // indices so we can use a tbl instruction (which only operates on bytes).
1732 uint8x8_t left_indices =
1733 vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
1734 left_indices = vtrn1_u8(left_indices, left_indices);
1735 left_indices = vadd_u8(left_indices, left_indices);
1736 left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100)));
1737 uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]),
1738 vreinterpretq_u8_u16(left_data.val[1]) } };
1739 const uint16x4_t ret = vreinterpret_u16_u8(vqtbl2_u8(data_u8, left_indices));
1740 return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n]));
1741 }
1742
1743 static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x8(
1744 const uint16x8_t left_data, const int16x8_t indices, int base, int n) {
1745 // Need to adjust indices to operate on 0-based indices rather than
1746   // `base`-based indices and then adjust from uint16x8 indices to uint8x16
1747 // indices so we can use a tbl instruction (which only operates on bytes).
1748 uint8x16_t left_indices =
1749 vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
1750 left_indices = vtrn1q_u8(left_indices, left_indices);
1751 left_indices = vaddq_u8(left_indices, left_indices);
1752 left_indices =
1753 vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100)));
1754 const uint16x8_t ret = vreinterpretq_u16_u8(
1755 vqtbl1q_u8(vreinterpretq_u8_u16(left_data), left_indices));
1756 return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n]));
1757 }
1758
1759 static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x16(
1760 const uint16x8x2_t left_data, const int16x8_t indices, int base, int n) {
1761 // Need to adjust indices to operate on 0-based indices rather than
1762   // `base`-based indices and then adjust from uint16x8 indices to uint8x16
1763 // indices so we can use a tbl instruction (which only operates on bytes).
1764 uint8x16_t left_indices =
1765 vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
1766 left_indices = vtrn1q_u8(left_indices, left_indices);
1767 left_indices = vaddq_u8(left_indices, left_indices);
1768 left_indices =
1769 vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100)));
1770 uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]),
1771 vreinterpretq_u8_u16(left_data.val[1]) } };
1772 const uint16x8_t ret =
1773 vreinterpretq_u16_u8(vqtbl2q_u8(data_u8, left_indices));
1774 return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n]));
1775 }
1776 #endif // AOM_ARCH_AARCH64
1777
1778 // TODO(aomedia:349428506): enable this for armv7 after SIGBUS is fixed.
1779 #if AOM_ARCH_AARCH64
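// General gathers from the left column: return val[0] = left[indices[i]] and
// val[1] = left[indices[i] + 1] for the first n lanes, loading both samples
// of each pair with a single two-element load and de-interleaving with uzp.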
1780 static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4(
1781 const uint16_t *left, const int16x4_t indices, int n) {
1782 assert(n > 0);
1783 assert(n <= 4);
1784 // Load two elements at a time and then uzp them into separate vectors, to
1785 // reduce the number of memory accesses.
1786 uint32x2_t ret0_u32 = vdup_n_u32(0);
1787 uint32x2_t ret1_u32 = vdup_n_u32(0);
1788
1789 // Use a single vget_lane_u64 to minimize vector to general purpose register
1790 // transfers and then mask off the bits we actually want.
1791 const uint64_t indices0123 = vget_lane_u64(vreinterpret_u64_s16(indices), 0);
1792 const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
1793 const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
1794 const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
1795 const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);
1796
1797 // At time of writing both Clang and GCC produced better code with these
1798 // nested if-statements compared to a switch statement with fallthrough.
1799 load_unaligned_u32_2x1_lane(ret0_u32, left + idx0, 0);
1800 if (n > 1) {
1801 load_unaligned_u32_2x1_lane(ret0_u32, left + idx1, 1);
1802 if (n > 2) {
1803 load_unaligned_u32_2x1_lane(ret1_u32, left + idx2, 0);
1804 if (n > 3) {
1805 load_unaligned_u32_2x1_lane(ret1_u32, left + idx3, 1);
1806 }
1807 }
1808 }
1809 return vuzp_u16(vreinterpret_u16_u32(ret0_u32),
1810 vreinterpret_u16_u32(ret1_u32));
1811 }
1812
1813 static AOM_FORCE_INLINE uint16x8x2_t highbd_dr_prediction_z2_gather_left_x8(
1814 const uint16_t *left, const int16x8_t indices, int n) {
1815 assert(n > 0);
1816 assert(n <= 8);
1817 // Load two elements at a time and then uzp them into separate vectors, to
1818 // reduce the number of memory accesses.
1819 uint32x4_t ret0_u32 = vdupq_n_u32(0);
1820 uint32x4_t ret1_u32 = vdupq_n_u32(0);
1821
1822   // Use a pair of vgetq_lane_u64 to minimize vector to general purpose
1823   // register transfers and then mask off the bits we actually want.
1824 const uint64_t indices0123 =
1825 vgetq_lane_u64(vreinterpretq_u64_s16(indices), 0);
1826 const uint64_t indices4567 =
1827 vgetq_lane_u64(vreinterpretq_u64_s16(indices), 1);
1828 const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
1829 const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
1830 const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
1831 const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);
1832 const int idx4 = (int16_t)((indices4567 >> 0) & 0xffffU);
1833 const int idx5 = (int16_t)((indices4567 >> 16) & 0xffffU);
1834 const int idx6 = (int16_t)((indices4567 >> 32) & 0xffffU);
1835 const int idx7 = (int16_t)((indices4567 >> 48) & 0xffffU);
1836
1837 // At time of writing both Clang and GCC produced better code with these
1838 // nested if-statements compared to a switch statement with fallthrough.
1839 load_unaligned_u32_4x1_lane(ret0_u32, left + idx0, 0);
1840 if (n > 1) {
1841 load_unaligned_u32_4x1_lane(ret0_u32, left + idx1, 1);
1842 if (n > 2) {
1843 load_unaligned_u32_4x1_lane(ret0_u32, left + idx2, 2);
1844 if (n > 3) {
1845 load_unaligned_u32_4x1_lane(ret0_u32, left + idx3, 3);
1846 if (n > 4) {
1847 load_unaligned_u32_4x1_lane(ret1_u32, left + idx4, 0);
1848 if (n > 5) {
1849 load_unaligned_u32_4x1_lane(ret1_u32, left + idx5, 1);
1850 if (n > 6) {
1851 load_unaligned_u32_4x1_lane(ret1_u32, left + idx6, 2);
1852 if (n > 7) {
1853 load_unaligned_u32_4x1_lane(ret1_u32, left + idx7, 3);
1854 }
1855 }
1856 }
1857 }
1858 }
1859 }
1860 }
1861 return vuzpq_u16(vreinterpretq_u16_u32(ret0_u32),
1862 vreinterpretq_u16_u32(ret1_u32));
1863 }
1864
1865 static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_merge_x4(
1866 uint16x4_t out_x, uint16x4_t out_y, int base_shift) {
1867 assert(base_shift >= 0);
1868 assert(base_shift <= 4);
1869 // On AArch64 we can permute the data from the `above` and `left` vectors
1870 // into a single vector in a single load (of the permute vector) + tbl.
1871 #if AOM_ARCH_AARCH64
1872 const uint8x8x2_t out_yx = { { vreinterpret_u8_u16(out_y),
1873 vreinterpret_u8_u16(out_x) } };
1874 return vreinterpret_u16_u8(
1875 vtbl2_u8(out_yx, vld1_u8(z2_merge_shuffles_u16x4[base_shift])));
1876 #else
1877 uint16x4_t out = out_y;
1878 for (int c2 = base_shift, x_idx = 0; c2 < 4; ++c2, ++x_idx) {
1879 out[c2] = out_x[x_idx];
1880 }
1881 return out;
1882 #endif
1883 }
1884
1885 static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_merge_x8(
1886 uint16x8_t out_x, uint16x8_t out_y, int base_shift) {
1887 assert(base_shift >= 0);
1888 assert(base_shift <= 8);
1889 // On AArch64 we can permute the data from the `above` and `left` vectors
1890 // into a single vector in a single load (of the permute vector) + tbl.
1891 #if AOM_ARCH_AARCH64
1892 const uint8x16x2_t out_yx = { { vreinterpretq_u8_u16(out_y),
1893 vreinterpretq_u8_u16(out_x) } };
1894 return vreinterpretq_u16_u8(
1895 vqtbl2q_u8(out_yx, vld1q_u8(z2_merge_shuffles_u16x8[base_shift])));
1896 #else
1897 uint16x8_t out = out_y;
1898 for (int c2 = base_shift, x_idx = 0; c2 < 8; ++c2, ++x_idx) {
1899 out[c2] = out_x[x_idx];
1900 }
1901 return out;
1902 #endif
1903 }
1904
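// Two-tap interpolation used by the z2 predictor:
//   out[i] = (a0[i] * (32 - shift[i]) + a1[i] * shift[i] + 16) >> 5.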
1905 static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_apply_shift_x4(
1906 uint16x4_t a0, uint16x4_t a1, int16x4_t shift) {
1907 uint32x4_t res = vmull_u16(a1, vreinterpret_u16_s16(shift));
1908 res =
1909 vmlal_u16(res, a0, vsub_u16(vdup_n_u16(32), vreinterpret_u16_s16(shift)));
1910 return vrshrn_n_u32(res, 5);
1911 }
1912
1913 static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_apply_shift_x8(
1914 uint16x8_t a0, uint16x8_t a1, int16x8_t shift) {
1915 return vcombine_u16(
1916 highbd_dr_prediction_z2_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1),
1917 vget_low_s16(shift)),
1918 highbd_dr_prediction_z2_apply_shift_x4(
1919 vget_high_u16(a0), vget_high_u16(a1), vget_high_s16(shift)));
1920 }
1921
1922 static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_step_x4(
1923 const uint16_t *above, const uint16x4_t above0, const uint16x4_t above1,
1924 const uint16_t *left, int dx, int dy, int r, int c) {
1925 const int16x4_t iota = vld1_s16(iota1_s16);
1926
1927 const int x0 = (c << 6) - (r + 1) * dx;
1928 const int y0 = (r << 6) - (c + 1) * dy;
1929
1930 const int16x4_t x0123 = vadd_s16(vdup_n_s16(x0), vshl_n_s16(iota, 6));
1931 const int16x4_t y0123 = vsub_s16(vdup_n_s16(y0), vmul_n_s16(iota, dy));
1932 const int16x4_t shift_x0123 =
1933 vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1);
1934 const int16x4_t shift_y0123 =
1935 vshr_n_s16(vand_s16(y0123, vdup_n_s16(0x3F)), 1);
1936 const int16x4_t base_y0123 = vshr_n_s16(y0123, 6);
1937
1938 const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c;
1939
1940 // Based on the value of `base_shift` there are three possible cases to
1941 // compute the result:
1942 // 1) base_shift <= 0: We can load and operate entirely on data from the
1943 // `above` input vector.
1944 // 2) base_shift < vl: We can load from `above[-1]` and shift
1945 // `vl - base_shift` elements across to the end of the
1946 // vector, then compute the remainder from `left`.
1947 // 3) base_shift >= vl: We can load and operate entirely on data from the
1948 // `left` input vector.
1949
1950 if (base_shift <= 0) {
1951 const int base_x = x0 >> 6;
1952 const uint16x4_t a0 = vld1_u16(above + base_x);
1953 const uint16x4_t a1 = vld1_u16(above + base_x + 1);
1954 return highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
1955 } else if (base_shift < 4) {
1956 const uint16x4x2_t l01 = highbd_dr_prediction_z2_gather_left_x4(
1957 left + 1, base_y0123, base_shift);
1958 const uint16x4_t out16_y = highbd_dr_prediction_z2_apply_shift_x4(
1959 l01.val[0], l01.val[1], shift_y0123);
1960
1961 // No need to reload from above in the loop, just use pre-loaded constants.
1962 const uint16x4_t out16_x =
1963 highbd_dr_prediction_z2_apply_shift_x4(above0, above1, shift_x0123);
1964
1965 return highbd_dr_prediction_z2_merge_x4(out16_x, out16_y, base_shift);
1966 } else {
1967 const uint16x4x2_t l01 =
1968 highbd_dr_prediction_z2_gather_left_x4(left + 1, base_y0123, 4);
1969 return highbd_dr_prediction_z2_apply_shift_x4(l01.val[0], l01.val[1],
1970 shift_y0123);
1971 }
1972 }
1973
1974 static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_step_x8(
1975 const uint16_t *above, const uint16x8_t above0, const uint16x8_t above1,
1976 const uint16_t *left, int dx, int dy, int r, int c) {
1977 const int16x8_t iota = vld1q_s16(iota1_s16);
1978
1979 const int x0 = (c << 6) - (r + 1) * dx;
1980 const int y0 = (r << 6) - (c + 1) * dy;
1981
1982 const int16x8_t x01234567 = vaddq_s16(vdupq_n_s16(x0), vshlq_n_s16(iota, 6));
1983 const int16x8_t y01234567 = vsubq_s16(vdupq_n_s16(y0), vmulq_n_s16(iota, dy));
1984 const int16x8_t shift_x01234567 =
1985 vshrq_n_s16(vandq_s16(x01234567, vdupq_n_s16(0x3F)), 1);
1986 const int16x8_t shift_y01234567 =
1987 vshrq_n_s16(vandq_s16(y01234567, vdupq_n_s16(0x3F)), 1);
1988 const int16x8_t base_y01234567 = vshrq_n_s16(y01234567, 6);
1989
1990 const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c;
1991
1992 // Based on the value of `base_shift` there are three possible cases to
1993 // compute the result:
1994 // 1) base_shift <= 0: We can load and operate entirely on data from the
1995 // `above` input vector.
1996 // 2) base_shift < vl: We can load from `above[-1]` and shift
1997 // `vl - base_shift` elements across to the end of the
1998 // vector, then compute the remainder from `left`.
1999 // 3) base_shift >= vl: We can load and operate entirely on data from the
2000 // `left` input vector.
2001
2002 if (base_shift <= 0) {
2003 const int base_x = x0 >> 6;
2004 const uint16x8_t a0 = vld1q_u16(above + base_x);
2005 const uint16x8_t a1 = vld1q_u16(above + base_x + 1);
2006 return highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
2007 } else if (base_shift < 8) {
2008 const uint16x8x2_t l01 = highbd_dr_prediction_z2_gather_left_x8(
2009 left + 1, base_y01234567, base_shift);
2010 const uint16x8_t out16_y = highbd_dr_prediction_z2_apply_shift_x8(
2011 l01.val[0], l01.val[1], shift_y01234567);
2012
2013 // No need to reload from above in the loop, just use pre-loaded constants.
2014 const uint16x8_t out16_x =
2015 highbd_dr_prediction_z2_apply_shift_x8(above0, above1, shift_x01234567);
2016
2017 return highbd_dr_prediction_z2_merge_x8(out16_x, out16_y, base_shift);
2018 } else {
2019 const uint16x8x2_t l01 =
2020 highbd_dr_prediction_z2_gather_left_x8(left + 1, base_y01234567, 8);
2021 return highbd_dr_prediction_z2_apply_shift_x8(l01.val[0], l01.val[1],
2022 shift_y01234567);
2023 }
2024 }
2025
2026 // Left array is accessed from -1 through `bh - 1` inclusive.
2027 // Above array is accessed from -1 through `bw - 1` inclusive.
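// Each row is produced in blocks of 4 or 8 pixels; the per-block step
// functions above decide whether a block is sourced entirely from `above`,
// entirely from `left`, or a mix of the two, based on `base_shift`.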
2028 #define HIGHBD_DR_PREDICTOR_Z2_WXH(bw, bh) \
2029 static void highbd_dr_prediction_z2_##bw##x##bh##_neon( \
2030 uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
2031 const uint16_t *left, int upsample_above, int upsample_left, int dx, \
2032 int dy, int bd) { \
2033 (void)bd; \
2034 (void)upsample_above; \
2035 (void)upsample_left; \
2036 assert(!upsample_above); \
2037 assert(!upsample_left); \
2038 assert(bw % 4 == 0); \
2039 assert(bh % 4 == 0); \
2040 assert(dx > 0); \
2041 assert(dy > 0); \
2042 \
2043 uint16_t left_data[bh + 1]; \
2044 memcpy(left_data, left - 1, (bh + 1) * sizeof(uint16_t)); \
2045 \
2046 uint16x8_t a0, a1; \
2047 if (bw == 4) { \
2048 a0 = vcombine_u16(vld1_u16(above - 1), vdup_n_u16(0)); \
2049 a1 = vcombine_u16(vld1_u16(above + 0), vdup_n_u16(0)); \
2050 } else { \
2051 a0 = vld1q_u16(above - 1); \
2052 a1 = vld1q_u16(above + 0); \
2053 } \
2054 \
2055 int r = 0; \
2056 do { \
2057 if (bw == 4) { \
2058 vst1_u16(dst, highbd_dr_prediction_z2_step_x4( \
2059 above, vget_low_u16(a0), vget_low_u16(a1), \
2060 left_data, dx, dy, r, 0)); \
2061 } else { \
2062 int c = 0; \
2063 do { \
2064 vst1q_u16(dst + c, highbd_dr_prediction_z2_step_x8( \
2065 above, a0, a1, left_data, dx, dy, r, c)); \
2066 c += 8; \
2067 } while (c < bw); \
2068 } \
2069 dst += stride; \
2070 } while (++r < bh); \
2071 }
2072
2073 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2074 HIGHBD_DR_PREDICTOR_Z2_WXH(4, 16)
2075 HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16)
2076 HIGHBD_DR_PREDICTOR_Z2_WXH(8, 32)
2077 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 4)
2078 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8)
2079 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16)
2080 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32)
2081 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 64)
2082 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 8)
2083 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 16)
2084 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32)
2085 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64)
2086 HIGHBD_DR_PREDICTOR_Z2_WXH(64, 16)
2087 HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32)
2088 HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64)
2089 #else
2090 HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16)
2091 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8)
2092 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16)
2093 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32)
2094 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32)
2095 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64)
2096 HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32)
2097 HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64)
2098 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2099
2100 #undef HIGHBD_DR_PREDICTOR_Z2_WXH
2101
2102 typedef void (*highbd_dr_prediction_z2_ptr)(uint16_t *dst, ptrdiff_t stride,
2103 const uint16_t *above,
2104 const uint16_t *left,
2105 int upsample_above,
2106 int upsample_left, int dx, int dy,
2107 int bd);
2108
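// Specialised small-block z2 predictors. Unlike the generic macro above,
// these must also handle the upsample_above/upsample_left cases, where the
// edge arrays hold interleaved half-pel samples and the fractional shift
// uses one fewer bit.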
2109 static void highbd_dr_prediction_z2_4x4_neon(uint16_t *dst, ptrdiff_t stride,
2110 const uint16_t *above,
2111 const uint16_t *left,
2112 int upsample_above,
2113 int upsample_left, int dx, int dy,
2114 int bd) {
2115 (void)bd;
2116 assert(dx > 0);
2117 assert(dy > 0);
2118
2119 const int frac_bits_x = 6 - upsample_above;
2120 const int frac_bits_y = 6 - upsample_left;
2121 const int min_base_x = -(1 << (upsample_above + frac_bits_x));
2122
2123 // if `upsample_left` then we need -2 through 6 inclusive from `left`.
2124 // else we only need -1 through 3 inclusive.
2125
2126 #if AOM_ARCH_AARCH64
2127 uint16x8_t left_data0, left_data1;
2128 if (upsample_left) {
2129 left_data0 = vld1q_u16(left - 2);
2130 left_data1 = vld1q_u16(left - 1);
2131 } else {
2132 left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0));
2133 left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0));
2134 }
2135 #endif
2136
2137 const int16x4_t iota0123 = vld1_s16(iota1_s16);
2138 const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1);
2139
2140 for (int r = 0; r < 4; ++r) {
2141 const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
2142 const int x0 = (r + 1) * dx;
2143 const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0));
2144 const int base_x0 = (-x0) >> frac_bits_x;
2145 if (base_shift <= 0) {
2146 uint16x4_t a0, a1;
2147 int16x4_t shift_x0123;
2148 if (upsample_above) {
2149 const uint16x4x2_t a01 = vld2_u16(above + base_x0);
2150 a0 = a01.val[0];
2151 a1 = a01.val[1];
2152 shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
2153 } else {
2154 a0 = vld1_u16(above + base_x0);
2155 a1 = vld1_u16(above + base_x0 + 1);
2156 shift_x0123 = vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1);
2157 }
2158 vst1_u16(dst,
2159 highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123));
2160 } else if (base_shift < 4) {
2161 // Calculate Y component from `left`.
2162 const int y_iters = base_shift;
2163 const int16x4_t y0123 =
2164 vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
2165 const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
2166 const int16x4_t shift_y0123 = vshr_n_s16(
2167 vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
2168 uint16x4_t l0, l1;
2169 #if AOM_ARCH_AARCH64
2170 const int left_data_base = upsample_left ? -2 : -1;
2171 l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123,
2172 left_data_base, y_iters);
2173 l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123,
2174 left_data_base, y_iters);
2175 #else
2176 const uint16x4x2_t l01 =
2177 highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters);
2178 l0 = l01.val[0];
2179 l1 = l01.val[1];
2180 #endif
2181
2182 const uint16x4_t out_y =
2183 highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123);
2184
2185 // Calculate X component from `above`.
2186 const int16x4_t shift_x0123 = vshr_n_s16(
2187 vand_s16(vmul_n_s16(x0123, 1 << upsample_above), vdup_n_s16(0x3F)),
2188 1);
2189 uint16x4_t a0, a1;
2190 if (upsample_above) {
2191 const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
2192 a0 = a01.val[0];
2193 a1 = a01.val[1];
2194 } else {
2195 a0 = vld1_u16(above - 1);
2196 a1 = vld1_u16(above + 0);
2197 }
2198 const uint16x4_t out_x =
2199 highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
2200
2201 // Combine X and Y vectors.
2202 const uint16x4_t out =
2203 highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift);
2204 vst1_u16(dst, out);
2205 } else {
2206 const int16x4_t y0123 =
2207 vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
2208 const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
2209 const int16x4_t shift_y0123 = vshr_n_s16(
2210 vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
2211 uint16x4_t l0, l1;
2212 #if AOM_ARCH_AARCH64
2213 const int left_data_base = upsample_left ? -2 : -1;
2214 l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123,
2215 left_data_base, 4);
2216 l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123,
2217 left_data_base, 4);
2218 #else
2219 const uint16x4x2_t l01 =
2220 highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4);
2221 l0 = l01.val[0];
2222 l1 = l01.val[1];
2223 #endif
2224 vst1_u16(dst,
2225 highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123));
2226 }
2227 dst += stride;
2228 }
2229 }
2230
2231 static void highbd_dr_prediction_z2_4x8_neon(uint16_t *dst, ptrdiff_t stride,
2232 const uint16_t *above,
2233 const uint16_t *left,
2234 int upsample_above,
2235 int upsample_left, int dx, int dy,
2236 int bd) {
2237 (void)bd;
2238 assert(dx > 0);
2239 assert(dy > 0);
2240
2241 const int frac_bits_x = 6 - upsample_above;
2242 const int frac_bits_y = 6 - upsample_left;
2243 const int min_base_x = -(1 << (upsample_above + frac_bits_x));
2244
2245 // if `upsample_left` then we need -2 through 14 inclusive from `left`.
2246 // else we only need -1 through 6 inclusive.
2247
2248 #if AOM_ARCH_AARCH64
2249 uint16x8x2_t left_data0, left_data1;
2250 if (upsample_left) {
2251 left_data0 = vld1q_u16_x2(left - 2);
2252 left_data1 = vld1q_u16_x2(left - 1);
2253 } else {
2254 left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } };
2255 left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } };
2256 }
2257 #endif
2258
2259 const int16x4_t iota0123 = vld1_s16(iota1_s16);
2260 const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1);
2261
2262 for (int r = 0; r < 8; ++r) {
2263 const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
2264 const int x0 = (r + 1) * dx;
2265 const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0));
2266 const int base_x0 = (-x0) >> frac_bits_x;
2267 if (base_shift <= 0) {
2268 uint16x4_t a0, a1;
2269 int16x4_t shift_x0123;
2270 if (upsample_above) {
2271 const uint16x4x2_t a01 = vld2_u16(above + base_x0);
2272 a0 = a01.val[0];
2273 a1 = a01.val[1];
2274 shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
2275 } else {
2276 a0 = vld1_u16(above + base_x0);
2277 a1 = vld1_u16(above + base_x0 + 1);
2278 shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F));
2279 }
2280 vst1_u16(dst,
2281 highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123));
2282 } else if (base_shift < 4) {
2283 // Calculate Y component from `left`.
2284 const int y_iters = base_shift;
2285 const int16x4_t y0123 =
2286 vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
2287 const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
2288 const int16x4_t shift_y0123 = vshr_n_s16(
2289 vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
2290
2291 uint16x4_t l0, l1;
2292 #if AOM_ARCH_AARCH64
2293 const int left_data_base = upsample_left ? -2 : -1;
2294 l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(
2295 left_data0, base_y0123, left_data_base, y_iters);
2296 l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(
2297 left_data1, base_y0123, left_data_base, y_iters);
2298 #else
2299 const uint16x4x2_t l01 =
2300 highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters);
2301 l0 = l01.val[0];
2302 l1 = l01.val[1];
2303 #endif
2304
2305 const uint16x4_t out_y =
2306 highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123);
2307
2308 // Calculate X component from `above`.
2309 uint16x4_t a0, a1;
2310 int16x4_t shift_x0123;
2311 if (upsample_above) {
2312 const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
2313 a0 = a01.val[0];
2314 a1 = a01.val[1];
2315 shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
2316 } else {
2317 a0 = vld1_u16(above - 1);
2318 a1 = vld1_u16(above + 0);
2319 shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F));
2320 }
2321 const uint16x4_t out_x =
2322 highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
2323
2324 // Combine X and Y vectors.
2325 const uint16x4_t out =
2326 highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift);
2327 vst1_u16(dst, out);
2328 } else {
2329 const int16x4_t y0123 =
2330 vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
2331 const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
2332 const int16x4_t shift_y0123 = vshr_n_s16(
2333 vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
2334
2335 uint16x4_t l0, l1;
2336 #if AOM_ARCH_AARCH64
2337 const int left_data_base = upsample_left ? -2 : -1;
2338 l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data0, base_y0123,
2339 left_data_base, 4);
2340 l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data1, base_y0123,
2341 left_data_base, 4);
2342 #else
2343 const uint16x4x2_t l01 =
2344 highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4);
2345 l0 = l01.val[0];
2346 l1 = l01.val[1];
2347 #endif
2348
2349 vst1_u16(dst,
2350 highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123));
2351 }
2352 dst += stride;
2353 }
2354 }
2355
2356 static void highbd_dr_prediction_z2_8x4_neon(uint16_t *dst, ptrdiff_t stride,
2357 const uint16_t *above,
2358 const uint16_t *left,
2359 int upsample_above,
2360 int upsample_left, int dx, int dy,
2361 int bd) {
2362 (void)bd;
2363 assert(dx > 0);
2364 assert(dy > 0);
2365
2366 const int frac_bits_x = 6 - upsample_above;
2367 const int frac_bits_y = 6 - upsample_left;
2368 const int min_base_x = -(1 << (upsample_above + frac_bits_x));
2369
2370 // if `upsample_left` then we need -2 through 6 inclusive from `left`.
2371 // else we only need -1 through 3 inclusive.
2372
2373 #if AOM_ARCH_AARCH64
2374 uint16x8_t left_data0, left_data1;
2375 if (upsample_left) {
2376 left_data0 = vld1q_u16(left - 2);
2377 left_data1 = vld1q_u16(left - 1);
2378 } else {
2379 left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0));
2380 left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0));
2381 }
2382 #endif
2383
2384 const int16x8_t iota01234567 = vld1q_s16(iota1_s16);
2385 const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1);
2386
2387 for (int r = 0; r < 4; ++r) {
2388 const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
2389 const int x0 = (r + 1) * dx;
2390 const int16x8_t x01234567 =
2391 vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0));
2392 const int base_x0 = (-x0) >> frac_bits_x;
2393 if (base_shift <= 0) {
2394 uint16x8_t a0, a1;
2395 int16x8_t shift_x01234567;
2396 if (upsample_above) {
2397 const uint16x8x2_t a01 = vld2q_u16(above + base_x0);
2398 a0 = a01.val[0];
2399 a1 = a01.val[1];
2400 shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
2401 } else {
2402 a0 = vld1q_u16(above + base_x0);
2403 a1 = vld1q_u16(above + base_x0 + 1);
2404 shift_x01234567 =
2405 vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
2406 }
2407 vst1q_u16(
2408 dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567));
2409 } else if (base_shift < 8) {
2410 // Calculate Y component from `left`.
2411 const int y_iters = base_shift;
2412 const int16x8_t y01234567 =
2413 vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
2414 const int16x8_t base_y01234567 =
2415 vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
2416 const int16x8_t shift_y01234567 =
2417 vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
2418 vdupq_n_s16(0x3F)),
2419 1);
2420
2421 uint16x8_t l0, l1;
2422 #if AOM_ARCH_AARCH64
2423 const int left_data_base = upsample_left ? -2 : -1;
2424 l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
2425 left_data0, base_y01234567, left_data_base, y_iters);
2426 l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
2427 left_data1, base_y01234567, left_data_base, y_iters);
2428 #else
2429 const uint16x8x2_t l01 =
2430 highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters);
2431 l0 = l01.val[0];
2432 l1 = l01.val[1];
2433 #endif
2434
2435 const uint16x8_t out_y =
2436 highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567);
2437
2438 // Calculate X component from `above`.
2439 uint16x8_t a0, a1;
2440 int16x8_t shift_x01234567;
2441 if (upsample_above) {
2442 const uint16x8x2_t a01 =
2443 vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
2444 a0 = a01.val[0];
2445 a1 = a01.val[1];
2446 shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
2447 } else {
2448 a0 = vld1q_u16(above - 1);
2449 a1 = vld1q_u16(above + 0);
2450 shift_x01234567 =
2451 vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
2452 }
2453 const uint16x8_t out_x =
2454 highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
2455
2456 // Combine X and Y vectors.
2457 const uint16x8_t out =
2458 highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift);
2459 vst1q_u16(dst, out);
2460 } else {
2461 const int16x8_t y01234567 =
2462 vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
2463 const int16x8_t base_y01234567 =
2464 vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
2465 const int16x8_t shift_y01234567 =
2466 vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
2467 vdupq_n_s16(0x3F)),
2468 1);
2469
2470 uint16x8_t l0, l1;
2471 #if AOM_ARCH_AARCH64
2472 const int left_data_base = upsample_left ? -2 : -1;
2473 l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
2474 left_data0, base_y01234567, left_data_base, 8);
2475 l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
2476 left_data1, base_y01234567, left_data_base, 8);
2477 #else
2478 const uint16x8x2_t l01 =
2479 highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
2480 l0 = l01.val[0];
2481 l1 = l01.val[1];
2482 #endif
2483
2484 vst1q_u16(
2485 dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
2486 }
2487 dst += stride;
2488 }
2489 }
2490
2491 static void highbd_dr_prediction_z2_8x8_neon(uint16_t *dst, ptrdiff_t stride,
2492 const uint16_t *above,
2493 const uint16_t *left,
2494 int upsample_above,
2495 int upsample_left, int dx, int dy,
2496 int bd) {
2497 (void)bd;
2498 assert(dx > 0);
2499 assert(dy > 0);
2500
2501 const int frac_bits_x = 6 - upsample_above;
2502 const int frac_bits_y = 6 - upsample_left;
2503 const int min_base_x = -(1 << (upsample_above + frac_bits_x));
2504
2505 // if `upsample_left` then we need -2 through 14 inclusive from `left`.
2506 // else we only need -1 through 6 inclusive.
2507
2508 #if AOM_ARCH_AARCH64
2509 uint16x8x2_t left_data0, left_data1;
2510 if (upsample_left) {
2511 left_data0 = vld1q_u16_x2(left - 2);
2512 left_data1 = vld1q_u16_x2(left - 1);
2513 } else {
2514 left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } };
2515 left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } };
2516 }
2517 #endif
2518
2519 const int16x8_t iota01234567 = vld1q_s16(iota1_s16);
2520 const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1);
2521
2522 for (int r = 0; r < 8; ++r) {
2523 const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
2524 const int x0 = (r + 1) * dx;
2525 const int16x8_t x01234567 =
2526 vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0));
2527 const int base_x0 = (-x0) >> frac_bits_x;
2528 if (base_shift <= 0) {
2529 uint16x8_t a0, a1;
2530 int16x8_t shift_x01234567;
2531 if (upsample_above) {
2532 const uint16x8x2_t a01 = vld2q_u16(above + base_x0);
2533 a0 = a01.val[0];
2534 a1 = a01.val[1];
2535 shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
2536 } else {
2537 a0 = vld1q_u16(above + base_x0);
2538 a1 = vld1q_u16(above + base_x0 + 1);
2539 shift_x01234567 =
2540 vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
2541 }
2542 vst1q_u16(
2543 dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567));
2544 } else if (base_shift < 8) {
2545 // Calculate Y component from `left`.
2546 const int y_iters = base_shift;
2547 const int16x8_t y01234567 =
2548 vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
2549 const int16x8_t base_y01234567 =
2550 vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
2551 const int16x8_t shift_y01234567 =
2552 vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
2553 vdupq_n_s16(0x3F)),
2554 1);
2555
2556 uint16x8_t l0, l1;
2557 #if AOM_ARCH_AARCH64
2558 const int left_data_base = upsample_left ? -2 : -1;
2559 l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
2560 left_data0, base_y01234567, left_data_base, y_iters);
2561 l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
2562 left_data1, base_y01234567, left_data_base, y_iters);
2563 #else
2564 const uint16x8x2_t l01 =
2565 highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters);
2566 l0 = l01.val[0];
2567 l1 = l01.val[1];
2568 #endif
2569
2570 const uint16x8_t out_y =
2571 highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567);
2572
2573 // Calculate X component from `above`.
2574 uint16x8_t a0, a1;
2575 int16x8_t shift_x01234567;
2576 if (upsample_above) {
2577 const uint16x8x2_t a01 =
2578 vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
2579 a0 = a01.val[0];
2580 a1 = a01.val[1];
2581 shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
2582 } else {
2583 a0 = vld1q_u16(above - 1);
2584 a1 = vld1q_u16(above + 0);
2585 shift_x01234567 =
2586 vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
2587 }
2588 const uint16x8_t out_x =
2589 highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
2590
2591 // Combine X and Y vectors.
2592 const uint16x8_t out =
2593 highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift);
2594 vst1q_u16(dst, out);
2595 } else {
2596 const int16x8_t y01234567 =
2597 vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
2598 const int16x8_t base_y01234567 =
2599 vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
2600 const int16x8_t shift_y01234567 =
2601 vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
2602 vdupq_n_s16(0x3F)),
2603 1);
2604
2605 uint16x8_t l0, l1;
2606 #if AOM_ARCH_AARCH64
2607 const int left_data_base = upsample_left ? -2 : -1;
2608 l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
2609 left_data0, base_y01234567, left_data_base, 8);
2610 l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
2611 left_data1, base_y01234567, left_data_base, 8);
2612 #else
2613 const uint16x8x2_t l01 =
2614 highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
2615 l0 = l01.val[0];
2616 l1 = l01.val[1];
2617 #endif
2618
2619 vst1q_u16(
2620 dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
2621 }
2622 dst += stride;
2623 }
2624 }
2625
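// Dispatch table indexed by [get_msb(bw)][get_msb(bh)]; entries are NULL for
// block shapes that are not instantiated in the current build configuration.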
2626 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2627 static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = {
2628 { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2629 { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2630 { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon,
2631 &highbd_dr_prediction_z2_4x8_neon, &highbd_dr_prediction_z2_4x16_neon, NULL,
2632 NULL },
2633 { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon,
2634 &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon,
2635 &highbd_dr_prediction_z2_8x32_neon, NULL },
2636 { NULL, NULL, &highbd_dr_prediction_z2_16x4_neon,
2637 &highbd_dr_prediction_z2_16x8_neon, &highbd_dr_prediction_z2_16x16_neon,
2638 &highbd_dr_prediction_z2_16x32_neon, &highbd_dr_prediction_z2_16x64_neon },
2639 { NULL, NULL, NULL, &highbd_dr_prediction_z2_32x8_neon,
2640 &highbd_dr_prediction_z2_32x16_neon, &highbd_dr_prediction_z2_32x32_neon,
2641 &highbd_dr_prediction_z2_32x64_neon },
2642 { NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x16_neon,
2643 &highbd_dr_prediction_z2_64x32_neon, &highbd_dr_prediction_z2_64x64_neon },
2644 };
2645 #else
2646 static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = {
2647 { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2648 { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2649 { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon,
2650 &highbd_dr_prediction_z2_4x8_neon, NULL, NULL, NULL },
2651 { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon,
2652 &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon, NULL,
2653 NULL },
2654 { NULL, NULL, NULL, &highbd_dr_prediction_z2_16x8_neon,
2655 &highbd_dr_prediction_z2_16x16_neon, &highbd_dr_prediction_z2_16x32_neon,
2656 NULL },
2657 { NULL, NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_32x32_neon,
2658 &highbd_dr_prediction_z2_32x64_neon },
2659 { NULL, NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x32_neon,
2660 &highbd_dr_prediction_z2_64x64_neon },
2661 };
2662 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2663
2664 // Directional prediction, zone 2: 90 < angle < 180
2665 void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw,
2666 int bh, const uint16_t *above,
2667 const uint16_t *left, int upsample_above,
2668 int upsample_left, int dx, int dy,
2669 int bd) {
2670 highbd_dr_prediction_z2_ptr f =
2671 dr_predictor_z2_arr_neon[get_msb(bw)][get_msb(bh)];
2672 assert(f != NULL);
2673 f(dst, stride, above, left, upsample_above, upsample_left, dx, dy, bd);
2674 }
2675 #endif // AOM_ARCH_AARCH64
2676
2677 // -----------------------------------------------------------------------------
2678 // Z3
2679
2680 // Both the lane to use and the shift amount must be immediates.
2681 #define HIGHBD_DR_PREDICTOR_Z3_STEP_X4(out, iota, base, in0, in1, s0, s1, \
2682 lane, shift) \
2683 do { \
2684 uint32x4_t val = vmull_lane_u16((in0), (s0), (lane)); \
2685 val = vmlal_lane_u16(val, (in1), (s1), (lane)); \
2686 const uint16x4_t cmp = vadd_u16((iota), vdup_n_u16(base)); \
2687 const uint16x4_t res = vrshrn_n_u32(val, (shift)); \
2688 *(out) = vbsl_u16(vclt_u16(cmp, vdup_n_u16(max_base_y)), res, \
2689 vdup_n_u16(left_max)); \
2690 } while (0)
2691
2692 #define HIGHBD_DR_PREDICTOR_Z3_STEP_X8(out, iota, base, in0, in1, s0, s1, \
2693 lane, shift) \
2694 do { \
2695 uint32x4_t val_lo = vmull_lane_u16(vget_low_u16(in0), (s0), (lane)); \
2696 val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane)); \
2697 uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane)); \
2698 val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane)); \
2699 *(out) = vcombine_u16(vrshrn_n_u32(val_lo, (shift)), \
2700 vrshrn_n_u32(val_hi, (shift))); \
2701 } while (0)
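// Note: unlike the x4 variant, the x8 step does not clamp against max_base_y
// itself; in the non-upsampled path that clamping is instead performed by
// z3_load_left_neon below.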
2702
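// Load eight elements starting at `left0 + ofs` and at `left0 + ofs + 1`,
// replacing any lanes at or beyond `max_ofs` with the element at `max_ofs`
// via the masked-load shuffle so that neither load reads beyond
// left0[max_ofs].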
2703 static inline uint16x8x2_t z3_load_left_neon(const uint16_t *left0, int ofs,
2704 int max_ofs) {
2705 uint16x8_t r0;
2706 uint16x8_t r1;
2707 if (ofs + 7 >= max_ofs) {
2708 int shuffle_idx = max_ofs - ofs;
2709 r0 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
2710 } else {
2711 r0 = vld1q_u16(left0 + ofs);
2712 }
2713 if (ofs + 8 >= max_ofs) {
2714 int shuffle_idx = max_ofs - ofs - 1;
2715 r1 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
2716 } else {
2717 r1 = vld1q_u16(left0 + ofs + 1);
2718 }
2719 return (uint16x8x2_t){ { r0, r1 } };
2720 }
2721
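// The z3 predictor walks down the `left` array, so we compute four output
// columns at a time and transpose the 4x4 (or 4x8) tile into rows before
// storing.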
2722 static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
2723 ptrdiff_t stride, int bw,
2724 int bh, const uint16_t *left,
2725 int dy) {
2726 assert(bw % 4 == 0);
2727 assert(bh % 4 == 0);
2728 assert(dy > 0);
2729
2730 // Factor out left + 1 to give the compiler a better chance of recognising
2731 // that the offsets used for the loads from left and left + 1 are otherwise
2732 // identical.
2733 const uint16_t *left1 = left + 1;
2734
2735 const int max_base_y = (bw + bh - 1);
2736 const int left_max = left[max_base_y];
2737 const int frac_bits = 6;
2738
2739 const uint16x8_t iota1x8 = vreinterpretq_u16_s16(vld1q_s16(iota1_s16));
2740 const uint16x4_t iota1x4 = vget_low_u16(iota1x8);
2741
2742 // The C implementation of the z3 predictor when not upsampling uses:
2743 // ((y & 0x3f) >> 1)
2744 // The right shift is unnecessary here since we instead shift by +1 later,
2745 // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
2746 const uint16x4_t shift_mask = vdup_n_u16(0x3e);
2747
2748 if (bh == 4) {
2749 int y = dy;
2750 int c = 0;
2751 do {
2752 // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
2753 // multiply instructions.
2754 const uint16x4_t shifts1 =
2755 vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
2756 const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
2757 const int base0 = (y + 0 * dy) >> frac_bits;
2758 const int base1 = (y + 1 * dy) >> frac_bits;
2759 const int base2 = (y + 2 * dy) >> frac_bits;
2760 const int base3 = (y + 3 * dy) >> frac_bits;
2761 uint16x4_t out[4];
2762 if (base0 >= max_base_y) {
2763 out[0] = vdup_n_u16(left_max);
2764 } else {
2765 const uint16x4_t l00 = vld1_u16(left + base0);
2766 const uint16x4_t l01 = vld1_u16(left1 + base0);
2767 HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota1x4, base0, l00, l01,
2768 shifts0, shifts1, 0, 6);
2769 }
2770 if (base1 >= max_base_y) {
2771 out[1] = vdup_n_u16(left_max);
2772 } else {
2773 const uint16x4_t l10 = vld1_u16(left + base1);
2774 const uint16x4_t l11 = vld1_u16(left1 + base1);
2775 HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota1x4, base1, l10, l11,
2776 shifts0, shifts1, 1, 6);
2777 }
2778 if (base2 >= max_base_y) {
2779 out[2] = vdup_n_u16(left_max);
2780 } else {
2781 const uint16x4_t l20 = vld1_u16(left + base2);
2782 const uint16x4_t l21 = vld1_u16(left1 + base2);
2783 HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota1x4, base2, l20, l21,
2784 shifts0, shifts1, 2, 6);
2785 }
2786 if (base3 >= max_base_y) {
2787 out[3] = vdup_n_u16(left_max);
2788 } else {
2789 const uint16x4_t l30 = vld1_u16(left + base3);
2790 const uint16x4_t l31 = vld1_u16(left1 + base3);
2791 HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota1x4, base3, l30, l31,
2792 shifts0, shifts1, 3, 6);
2793 }
2794 transpose_array_inplace_u16_4x4(out);
2795 for (int r2 = 0; r2 < 4; ++r2) {
2796 vst1_u16(dst + r2 * stride + c, out[r2]);
2797 }
2798 y += 4 * dy;
2799 c += 4;
2800 } while (c < bw);
2801 } else {
2802 int y = dy;
2803 int c = 0;
2804 do {
2805 int r = 0;
2806 do {
2807 // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
2808 // multiply instructions.
        const uint16x4_t shifts1 =
            vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
        const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
        const int base0 = ((y + 0 * dy) >> frac_bits) + r;
        const int base1 = ((y + 1 * dy) >> frac_bits) + r;
        const int base2 = ((y + 2 * dy) >> frac_bits) + r;
        const int base3 = ((y + 3 * dy) >> frac_bits) + r;
        uint16x8_t out[4];
        if (base0 >= max_base_y) {
          out[0] = vdupq_n_u16(left_max);
        } else {
          const uint16x8x2_t l0 = z3_load_left_neon(left, base0, max_base_y);
          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l0.val[0],
                                         l0.val[1], shifts0, shifts1, 0, 6);
        }
        if (base1 >= max_base_y) {
          out[1] = vdupq_n_u16(left_max);
        } else {
          const uint16x8x2_t l1 = z3_load_left_neon(left, base1, max_base_y);
          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l1.val[0],
                                         l1.val[1], shifts0, shifts1, 1, 6);
        }
        if (base2 >= max_base_y) {
          out[2] = vdupq_n_u16(left_max);
        } else {
          const uint16x8x2_t l2 = z3_load_left_neon(left, base2, max_base_y);
          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l2.val[0],
                                         l2.val[1], shifts0, shifts1, 2, 6);
        }
        if (base3 >= max_base_y) {
          out[3] = vdupq_n_u16(left_max);
        } else {
          const uint16x8x2_t l3 = z3_load_left_neon(left, base3, max_base_y);
          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l3.val[0],
                                         l3.val[1], shifts0, shifts1, 3, 6);
        }
        transpose_array_inplace_u16_4x8(out);
        for (int r2 = 0; r2 < 4; ++r2) {
          vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2]));
        }
        for (int r2 = 0; r2 < 4; ++r2) {
          vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2]));
        }
        r += 8;
      } while (r < bh);
      y += 4 * dy;
      c += 4;
    } while (c < bw);
  }
}

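// When upsample_left is set, the left reference array has been upsampled to
// twice the density, alternating original and interpolated half-sample
// values. The vld2 loads below de-interleave the two sets, and the fractional
// pixel position needs only 5 bits here instead of 6.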
static void highbd_dr_prediction_z3_upsample1_neon(uint16_t *dst,
                                                    ptrdiff_t stride, int bw,
                                                    int bh,
                                                    const uint16_t *left,
                                                    int dy) {
  assert(bw % 4 == 0);
  assert(bh % 4 == 0);
  assert(dy > 0);

  const int max_base_y = (bw + bh - 1) << 1;
  const int left_max = left[max_base_y];
  const int frac_bits = 5;

  const uint16x4_t iota1x4 = vreinterpret_u16_s16(vld1_s16(iota1_s16));
  const uint16x8_t iota2x8 = vreinterpretq_u16_s16(vld1q_s16(iota2_s16));
  const uint16x4_t iota2x4 = vget_low_u16(iota2x8);

  // The C implementation of the z3 predictor when upsampling uses:
  //   (((x << 1) & 0x3f) >> 1)
  // The two shifts are unnecessary here since the lowest bit is guaranteed to
  // be zero when the mask is applied, so adjust the mask to 0x1f to avoid
  // needing the shifts at all.
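  // For example, with x = 38:
  //   ((38 << 1) & 0x3f) >> 1 = (76 & 0x3f) >> 1 = 12 >> 1 = 6 = 38 & 0x1f.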
  const uint16x4_t shift_mask = vdup_n_u16(0x1F);

  if (bh == 4) {
    int y = dy;
    int c = 0;
    do {
      // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
      // multiply instructions.
      const uint16x4_t shifts1 =
          vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
      const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1);
      const int base0 = (y + 0 * dy) >> frac_bits;
      const int base1 = (y + 1 * dy) >> frac_bits;
      const int base2 = (y + 2 * dy) >> frac_bits;
      const int base3 = (y + 3 * dy) >> frac_bits;
      const uint16x4x2_t l0 = vld2_u16(left + base0);
      const uint16x4x2_t l1 = vld2_u16(left + base1);
      const uint16x4x2_t l2 = vld2_u16(left + base2);
      const uint16x4x2_t l3 = vld2_u16(left + base3);
      uint16x4_t out[4];
      HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota2x4, base0, l0.val[0],
                                     l0.val[1], shifts0, shifts1, 0, 5);
      HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota2x4, base1, l1.val[0],
                                     l1.val[1], shifts0, shifts1, 1, 5);
      HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota2x4, base2, l2.val[0],
                                     l2.val[1], shifts0, shifts1, 2, 5);
      HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota2x4, base3, l3.val[0],
                                     l3.val[1], shifts0, shifts1, 3, 5);
      transpose_array_inplace_u16_4x4(out);
      for (int r2 = 0; r2 < 4; ++r2) {
        vst1_u16(dst + r2 * stride + c, out[r2]);
      }
      y += 4 * dy;
      c += 4;
    } while (c < bw);
  } else {
    assert(bh % 8 == 0);

    int y = dy;
    int c = 0;
    do {
      int r = 0;
      do {
        // Fully unroll the 4x8 block to allow us to use immediate lane-indexed
        // multiply instructions.
        const uint16x4_t shifts1 =
            vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
        const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1);
        const int base0 = ((y + 0 * dy) >> frac_bits) + (r * 2);
        const int base1 = ((y + 1 * dy) >> frac_bits) + (r * 2);
        const int base2 = ((y + 2 * dy) >> frac_bits) + (r * 2);
        const int base3 = ((y + 3 * dy) >> frac_bits) + (r * 2);
        const uint16x8x2_t l0 = vld2q_u16(left + base0);
        const uint16x8x2_t l1 = vld2q_u16(left + base1);
        const uint16x8x2_t l2 = vld2q_u16(left + base2);
        const uint16x8x2_t l3 = vld2q_u16(left + base3);
        uint16x8_t out[4];
        HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota2x8, base0, l0.val[0],
                                       l0.val[1], shifts0, shifts1, 0, 5);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota2x8, base1, l1.val[0],
                                       l1.val[1], shifts0, shifts1, 1, 5);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota2x8, base2, l2.val[0],
                                       l2.val[1], shifts0, shifts1, 2, 5);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota2x8, base3, l3.val[0],
                                       l3.val[1], shifts0, shifts1, 3, 5);
        transpose_array_inplace_u16_4x8(out);
        for (int r2 = 0; r2 < 4; ++r2) {
          vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2]));
        }
        for (int r2 = 0; r2 < 4; ++r2) {
          vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2]));
        }
        r += 8;
      } while (r < bh);
      y += 4 * dy;
      c += 4;
    } while (c < bw);
  }
}

// Directional prediction, zone 3: 180 < angle < 270
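// The two specializations above differ in how the left reference is sampled
// (unit stride vs. 2x-upsampled), so dispatch on upsample_left.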
void av1_highbd_dr_prediction_z3_neon(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int upsample_left,
                                      int dx, int dy, int bd) {
  (void)above;
  (void)dx;
  (void)bd;
  assert(bw % 4 == 0);
  assert(bh % 4 == 0);
  assert(dx == 1);
  assert(dy > 0);

  if (upsample_left) {
    highbd_dr_prediction_z3_upsample1_neon(dst, stride, bw, bh, left, dy);
  } else {
    highbd_dr_prediction_z3_upsample0_neon(dst, stride, bw, bh, left, dy);
  }
}

#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X4
#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X8