xref: /aosp_15_r20/external/libaom/aom_dsp/arm/highbd_intrapred_neon.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <arm_neon.h>
13 
14 #include "config/aom_config.h"
15 #include "config/aom_dsp_rtcd.h"
16 #include "config/av1_rtcd.h"
17 
18 #include "aom/aom_integer.h"
19 #include "aom_dsp/arm/mem_neon.h"
20 #include "aom_dsp/arm/sum_neon.h"
21 #include "aom_dsp/arm/transpose_neon.h"
22 #include "aom_dsp/intrapred_common.h"
23 
24 // -----------------------------------------------------------------------------
25 // DC
26 
highbd_dc_store_4xh(uint16_t * dst,ptrdiff_t stride,int h,uint16x4_t dc)27 static inline void highbd_dc_store_4xh(uint16_t *dst, ptrdiff_t stride, int h,
28                                        uint16x4_t dc) {
29   for (int i = 0; i < h; ++i) {
30     vst1_u16(dst + i * stride, dc);
31   }
32 }
33 
highbd_dc_store_8xh(uint16_t * dst,ptrdiff_t stride,int h,uint16x8_t dc)34 static inline void highbd_dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int h,
35                                        uint16x8_t dc) {
36   for (int i = 0; i < h; ++i) {
37     vst1q_u16(dst + i * stride, dc);
38   }
39 }
40 
highbd_dc_store_16xh(uint16_t * dst,ptrdiff_t stride,int h,uint16x8_t dc)41 static inline void highbd_dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int h,
42                                         uint16x8_t dc) {
43   for (int i = 0; i < h; ++i) {
44     vst1q_u16(dst + i * stride, dc);
45     vst1q_u16(dst + i * stride + 8, dc);
46   }
47 }
48 
highbd_dc_store_32xh(uint16_t * dst,ptrdiff_t stride,int h,uint16x8_t dc)49 static inline void highbd_dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int h,
50                                         uint16x8_t dc) {
51   for (int i = 0; i < h; ++i) {
52     vst1q_u16(dst + i * stride, dc);
53     vst1q_u16(dst + i * stride + 8, dc);
54     vst1q_u16(dst + i * stride + 16, dc);
55     vst1q_u16(dst + i * stride + 24, dc);
56   }
57 }
58 
highbd_dc_store_64xh(uint16_t * dst,ptrdiff_t stride,int h,uint16x8_t dc)59 static inline void highbd_dc_store_64xh(uint16_t *dst, ptrdiff_t stride, int h,
60                                         uint16x8_t dc) {
61   for (int i = 0; i < h; ++i) {
62     vst1q_u16(dst + i * stride, dc);
63     vst1q_u16(dst + i * stride + 8, dc);
64     vst1q_u16(dst + i * stride + 16, dc);
65     vst1q_u16(dst + i * stride + 24, dc);
66     vst1q_u16(dst + i * stride + 32, dc);
67     vst1q_u16(dst + i * stride + 40, dc);
68     vst1q_u16(dst + i * stride + 48, dc);
69     vst1q_u16(dst + i * stride + 56, dc);
70   }
71 }
72 
horizontal_add_and_broadcast_long_u16x8(uint16x8_t a)73 static inline uint32x4_t horizontal_add_and_broadcast_long_u16x8(uint16x8_t a) {
74   // Need to assume input is up to 16 bits wide from dc 64x64 partial sum, so
75   // promote first.
76   const uint32x4_t b = vpaddlq_u16(a);
77 #if AOM_ARCH_AARCH64
78   const uint32x4_t c = vpaddq_u32(b, b);
79   return vpaddq_u32(c, c);
80 #else
81   const uint32x2_t c = vadd_u32(vget_low_u32(b), vget_high_u32(b));
82   const uint32x2_t d = vpadd_u32(c, c);
83   return vcombine_u32(d, d);
84 #endif
85 }
86 
highbd_dc_load_partial_sum_4(const uint16_t * left)87 static inline uint16x8_t highbd_dc_load_partial_sum_4(const uint16_t *left) {
88   // Nothing to do since sum is already one vector, but saves needing to
89   // special case w=4 or h=4 cases. The combine will be zero cost for a sane
90   // compiler since vld1 already sets the top half of a vector to zero as part
91   // of the operation.
92   return vcombine_u16(vld1_u16(left), vdup_n_u16(0));
93 }
94 
highbd_dc_load_partial_sum_8(const uint16_t * left)95 static inline uint16x8_t highbd_dc_load_partial_sum_8(const uint16_t *left) {
96   // Nothing to do since sum is already one vector, but saves needing to
97   // special case w=8 or h=8 cases.
98   return vld1q_u16(left);
99 }
100 
highbd_dc_load_partial_sum_16(const uint16_t * left)101 static inline uint16x8_t highbd_dc_load_partial_sum_16(const uint16_t *left) {
102   const uint16x8_t a0 = vld1q_u16(left + 0);  // up to 12 bits
103   const uint16x8_t a1 = vld1q_u16(left + 8);
104   return vaddq_u16(a0, a1);  // up to 13 bits
105 }
106 
highbd_dc_load_partial_sum_32(const uint16_t * left)107 static inline uint16x8_t highbd_dc_load_partial_sum_32(const uint16_t *left) {
108   const uint16x8_t a0 = vld1q_u16(left + 0);  // up to 12 bits
109   const uint16x8_t a1 = vld1q_u16(left + 8);
110   const uint16x8_t a2 = vld1q_u16(left + 16);
111   const uint16x8_t a3 = vld1q_u16(left + 24);
112   const uint16x8_t b0 = vaddq_u16(a0, a1);  // up to 13 bits
113   const uint16x8_t b1 = vaddq_u16(a2, a3);
114   return vaddq_u16(b0, b1);  // up to 14 bits
115 }
116 
highbd_dc_load_partial_sum_64(const uint16_t * left)117 static inline uint16x8_t highbd_dc_load_partial_sum_64(const uint16_t *left) {
118   const uint16x8_t a0 = vld1q_u16(left + 0);  // up to 12 bits
119   const uint16x8_t a1 = vld1q_u16(left + 8);
120   const uint16x8_t a2 = vld1q_u16(left + 16);
121   const uint16x8_t a3 = vld1q_u16(left + 24);
122   const uint16x8_t a4 = vld1q_u16(left + 32);
123   const uint16x8_t a5 = vld1q_u16(left + 40);
124   const uint16x8_t a6 = vld1q_u16(left + 48);
125   const uint16x8_t a7 = vld1q_u16(left + 56);
126   const uint16x8_t b0 = vaddq_u16(a0, a1);  // up to 13 bits
127   const uint16x8_t b1 = vaddq_u16(a2, a3);
128   const uint16x8_t b2 = vaddq_u16(a4, a5);
129   const uint16x8_t b3 = vaddq_u16(a6, a7);
130   const uint16x8_t c0 = vaddq_u16(b0, b1);  // up to 14 bits
131   const uint16x8_t c1 = vaddq_u16(b2, b3);
132   return vaddq_u16(c0, c1);  // up to 15 bits
133 }
134 
135 #define HIGHBD_DC_PREDICTOR(w, h, shift)                               \
136   void aom_highbd_dc_predictor_##w##x##h##_neon(                       \
137       uint16_t *dst, ptrdiff_t stride, const uint16_t *above,          \
138       const uint16_t *left, int bd) {                                  \
139     (void)bd;                                                          \
140     const uint16x8_t a = highbd_dc_load_partial_sum_##w(above);        \
141     const uint16x8_t l = highbd_dc_load_partial_sum_##h(left);         \
142     const uint32x4_t sum =                                             \
143         horizontal_add_and_broadcast_long_u16x8(vaddq_u16(a, l));      \
144     const uint16x4_t dc0 = vrshrn_n_u32(sum, shift);                   \
145     highbd_dc_store_##w##xh(dst, stride, (h), vdupq_lane_u16(dc0, 0)); \
146   }
147 
aom_highbd_dc_predictor_4x4_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)148 void aom_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
149                                       const uint16_t *above,
150                                       const uint16_t *left, int bd) {
151   // In the rectangular cases we simply extend the shorter vector to uint16x8
152   // in order to accumulate, however in the 4x4 case there is no shorter vector
153   // to extend so it is beneficial to do the whole calculation in uint16x4
154   // instead.
155   (void)bd;
156   const uint16x4_t a = vld1_u16(above);  // up to 12 bits
157   const uint16x4_t l = vld1_u16(left);
158   uint16x4_t sum = vpadd_u16(a, l);  // up to 13 bits
159   sum = vpadd_u16(sum, sum);         // up to 14 bits
160   sum = vpadd_u16(sum, sum);
161   const uint16x4_t dc = vrshr_n_u16(sum, 3);
162   highbd_dc_store_4xh(dst, stride, 4, dc);
163 }
164 
165 HIGHBD_DC_PREDICTOR(8, 8, 4)
166 HIGHBD_DC_PREDICTOR(16, 16, 5)
167 HIGHBD_DC_PREDICTOR(32, 32, 6)
168 HIGHBD_DC_PREDICTOR(64, 64, 7)
169 
170 #undef HIGHBD_DC_PREDICTOR
171 
divide_using_multiply_shift(int num,int shift1,int multiplier,int shift2)172 static inline int divide_using_multiply_shift(int num, int shift1,
173                                               int multiplier, int shift2) {
174   const int interm = num >> shift1;
175   return interm * multiplier >> shift2;
176 }
177 
178 #define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
179 #define HIGHBD_DC_MULTIPLIER_1X4 0x6667
180 #define HIGHBD_DC_SHIFT2 17
181 
highbd_dc_predictor_rect(int bw,int bh,int sum,int shift1,uint32_t multiplier)182 static inline int highbd_dc_predictor_rect(int bw, int bh, int sum, int shift1,
183                                            uint32_t multiplier) {
184   return divide_using_multiply_shift(sum + ((bw + bh) >> 1), shift1, multiplier,
185                                      HIGHBD_DC_SHIFT2);
186 }
187 
188 #undef HIGHBD_DC_SHIFT2
189 
190 #define HIGHBD_DC_PREDICTOR_RECT(w, h, q, shift, mult)                  \
191   void aom_highbd_dc_predictor_##w##x##h##_neon(                        \
192       uint16_t *dst, ptrdiff_t stride, const uint16_t *above,           \
193       const uint16_t *left, int bd) {                                   \
194     (void)bd;                                                           \
195     uint16x8_t sum_above = highbd_dc_load_partial_sum_##w(above);       \
196     uint16x8_t sum_left = highbd_dc_load_partial_sum_##h(left);         \
197     uint16x8_t sum_vec = vaddq_u16(sum_left, sum_above);                \
198     int sum = horizontal_add_u16x8(sum_vec);                            \
199     int dc0 = highbd_dc_predictor_rect((w), (h), sum, (shift), (mult)); \
200     highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u16(dc0));    \
201   }
202 
203 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
204 HIGHBD_DC_PREDICTOR_RECT(4, 8, , 2, HIGHBD_DC_MULTIPLIER_1X2)
205 HIGHBD_DC_PREDICTOR_RECT(4, 16, , 2, HIGHBD_DC_MULTIPLIER_1X4)
206 HIGHBD_DC_PREDICTOR_RECT(8, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X2)
207 HIGHBD_DC_PREDICTOR_RECT(8, 16, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
208 HIGHBD_DC_PREDICTOR_RECT(8, 32, q, 3, HIGHBD_DC_MULTIPLIER_1X4)
209 HIGHBD_DC_PREDICTOR_RECT(16, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X4)
210 HIGHBD_DC_PREDICTOR_RECT(16, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
211 HIGHBD_DC_PREDICTOR_RECT(16, 32, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
212 HIGHBD_DC_PREDICTOR_RECT(16, 64, q, 4, HIGHBD_DC_MULTIPLIER_1X4)
213 HIGHBD_DC_PREDICTOR_RECT(32, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X4)
214 HIGHBD_DC_PREDICTOR_RECT(32, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
215 HIGHBD_DC_PREDICTOR_RECT(32, 64, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
216 HIGHBD_DC_PREDICTOR_RECT(64, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X4)
217 HIGHBD_DC_PREDICTOR_RECT(64, 32, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
218 #else
219 HIGHBD_DC_PREDICTOR_RECT(4, 8, , 2, HIGHBD_DC_MULTIPLIER_1X2)
220 HIGHBD_DC_PREDICTOR_RECT(8, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X2)
221 HIGHBD_DC_PREDICTOR_RECT(8, 16, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
222 HIGHBD_DC_PREDICTOR_RECT(16, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
223 HIGHBD_DC_PREDICTOR_RECT(16, 32, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
224 HIGHBD_DC_PREDICTOR_RECT(32, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
225 HIGHBD_DC_PREDICTOR_RECT(32, 64, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
226 HIGHBD_DC_PREDICTOR_RECT(64, 32, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
227 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
228 
229 #undef HIGHBD_DC_PREDICTOR_RECT
230 #undef HIGHBD_DC_MULTIPLIER_1X2
231 #undef HIGHBD_DC_MULTIPLIER_1X4
232 
233 // -----------------------------------------------------------------------------
234 // DC_128
235 
236 #define HIGHBD_DC_PREDICTOR_128(w, h, q)                        \
237   void aom_highbd_dc_128_predictor_##w##x##h##_neon(            \
238       uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
239       const uint16_t *left, int bd) {                           \
240     (void)above;                                                \
241     (void)bd;                                                   \
242     (void)left;                                                 \
243     highbd_dc_store_##w##xh(dst, stride, (h),                   \
244                             vdup##q##_n_u16(0x80 << (bd - 8))); \
245   }
246 
247 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
248 HIGHBD_DC_PREDICTOR_128(4, 4, )
249 HIGHBD_DC_PREDICTOR_128(4, 8, )
250 HIGHBD_DC_PREDICTOR_128(4, 16, )
251 HIGHBD_DC_PREDICTOR_128(8, 4, q)
252 HIGHBD_DC_PREDICTOR_128(8, 8, q)
253 HIGHBD_DC_PREDICTOR_128(8, 16, q)
254 HIGHBD_DC_PREDICTOR_128(8, 32, q)
255 HIGHBD_DC_PREDICTOR_128(16, 4, q)
256 HIGHBD_DC_PREDICTOR_128(16, 8, q)
257 HIGHBD_DC_PREDICTOR_128(16, 16, q)
258 HIGHBD_DC_PREDICTOR_128(16, 32, q)
259 HIGHBD_DC_PREDICTOR_128(16, 64, q)
260 HIGHBD_DC_PREDICTOR_128(32, 8, q)
261 HIGHBD_DC_PREDICTOR_128(32, 16, q)
262 HIGHBD_DC_PREDICTOR_128(32, 32, q)
263 HIGHBD_DC_PREDICTOR_128(32, 64, q)
264 HIGHBD_DC_PREDICTOR_128(64, 16, q)
265 HIGHBD_DC_PREDICTOR_128(64, 32, q)
266 HIGHBD_DC_PREDICTOR_128(64, 64, q)
267 #else
268 HIGHBD_DC_PREDICTOR_128(4, 4, )
269 HIGHBD_DC_PREDICTOR_128(4, 8, )
270 HIGHBD_DC_PREDICTOR_128(8, 4, q)
271 HIGHBD_DC_PREDICTOR_128(8, 8, q)
272 HIGHBD_DC_PREDICTOR_128(8, 16, q)
273 HIGHBD_DC_PREDICTOR_128(16, 8, q)
274 HIGHBD_DC_PREDICTOR_128(16, 16, q)
275 HIGHBD_DC_PREDICTOR_128(16, 32, q)
276 HIGHBD_DC_PREDICTOR_128(32, 16, q)
277 HIGHBD_DC_PREDICTOR_128(32, 32, q)
278 HIGHBD_DC_PREDICTOR_128(32, 64, q)
279 HIGHBD_DC_PREDICTOR_128(64, 32, q)
280 HIGHBD_DC_PREDICTOR_128(64, 64, q)
281 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
282 
283 #undef HIGHBD_DC_PREDICTOR_128
284 
285 // -----------------------------------------------------------------------------
286 // DC_LEFT
287 
highbd_dc_load_sum_4(const uint16_t * left)288 static inline uint32x4_t highbd_dc_load_sum_4(const uint16_t *left) {
289   const uint16x4_t a = vld1_u16(left);   // up to 12 bits
290   const uint16x4_t b = vpadd_u16(a, a);  // up to 13 bits
291   return vcombine_u32(vpaddl_u16(b), vdup_n_u32(0));
292 }
293 
highbd_dc_load_sum_8(const uint16_t * left)294 static inline uint32x4_t highbd_dc_load_sum_8(const uint16_t *left) {
295   return horizontal_add_and_broadcast_long_u16x8(vld1q_u16(left));
296 }
297 
highbd_dc_load_sum_16(const uint16_t * left)298 static inline uint32x4_t highbd_dc_load_sum_16(const uint16_t *left) {
299   return horizontal_add_and_broadcast_long_u16x8(
300       highbd_dc_load_partial_sum_16(left));
301 }
302 
highbd_dc_load_sum_32(const uint16_t * left)303 static inline uint32x4_t highbd_dc_load_sum_32(const uint16_t *left) {
304   return horizontal_add_and_broadcast_long_u16x8(
305       highbd_dc_load_partial_sum_32(left));
306 }
307 
highbd_dc_load_sum_64(const uint16_t * left)308 static inline uint32x4_t highbd_dc_load_sum_64(const uint16_t *left) {
309   return horizontal_add_and_broadcast_long_u16x8(
310       highbd_dc_load_partial_sum_64(left));
311 }
312 
313 #define DC_PREDICTOR_LEFT(w, h, shift, q)                                  \
314   void aom_highbd_dc_left_predictor_##w##x##h##_neon(                      \
315       uint16_t *dst, ptrdiff_t stride, const uint16_t *above,              \
316       const uint16_t *left, int bd) {                                      \
317     (void)above;                                                           \
318     (void)bd;                                                              \
319     const uint32x4_t sum = highbd_dc_load_sum_##h(left);                   \
320     const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift));                     \
321     highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
322   }
323 
324 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
325 DC_PREDICTOR_LEFT(4, 4, 2, )
326 DC_PREDICTOR_LEFT(4, 8, 3, )
327 DC_PREDICTOR_LEFT(4, 16, 4, )
328 DC_PREDICTOR_LEFT(8, 4, 2, q)
329 DC_PREDICTOR_LEFT(8, 8, 3, q)
330 DC_PREDICTOR_LEFT(8, 16, 4, q)
331 DC_PREDICTOR_LEFT(8, 32, 5, q)
332 DC_PREDICTOR_LEFT(16, 4, 2, q)
333 DC_PREDICTOR_LEFT(16, 8, 3, q)
334 DC_PREDICTOR_LEFT(16, 16, 4, q)
335 DC_PREDICTOR_LEFT(16, 32, 5, q)
336 DC_PREDICTOR_LEFT(16, 64, 6, q)
337 DC_PREDICTOR_LEFT(32, 8, 3, q)
338 DC_PREDICTOR_LEFT(32, 16, 4, q)
339 DC_PREDICTOR_LEFT(32, 32, 5, q)
340 DC_PREDICTOR_LEFT(32, 64, 6, q)
341 DC_PREDICTOR_LEFT(64, 16, 4, q)
342 DC_PREDICTOR_LEFT(64, 32, 5, q)
343 DC_PREDICTOR_LEFT(64, 64, 6, q)
344 #else
345 DC_PREDICTOR_LEFT(4, 4, 2, )
346 DC_PREDICTOR_LEFT(4, 8, 3, )
347 DC_PREDICTOR_LEFT(8, 4, 2, q)
348 DC_PREDICTOR_LEFT(8, 8, 3, q)
349 DC_PREDICTOR_LEFT(8, 16, 4, q)
350 DC_PREDICTOR_LEFT(16, 8, 3, q)
351 DC_PREDICTOR_LEFT(16, 16, 4, q)
352 DC_PREDICTOR_LEFT(16, 32, 5, q)
353 DC_PREDICTOR_LEFT(32, 16, 4, q)
354 DC_PREDICTOR_LEFT(32, 32, 5, q)
355 DC_PREDICTOR_LEFT(32, 64, 6, q)
356 DC_PREDICTOR_LEFT(64, 32, 5, q)
357 DC_PREDICTOR_LEFT(64, 64, 6, q)
358 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
359 
360 #undef DC_PREDICTOR_LEFT
361 
362 // -----------------------------------------------------------------------------
363 // DC_TOP
364 
365 #define DC_PREDICTOR_TOP(w, h, shift, q)                                   \
366   void aom_highbd_dc_top_predictor_##w##x##h##_neon(                       \
367       uint16_t *dst, ptrdiff_t stride, const uint16_t *above,              \
368       const uint16_t *left, int bd) {                                      \
369     (void)bd;                                                              \
370     (void)left;                                                            \
371     const uint32x4_t sum = highbd_dc_load_sum_##w(above);                  \
372     const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift));                     \
373     highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
374   }
375 
376 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
377 DC_PREDICTOR_TOP(4, 4, 2, )
378 DC_PREDICTOR_TOP(4, 8, 2, )
379 DC_PREDICTOR_TOP(4, 16, 2, )
380 DC_PREDICTOR_TOP(8, 4, 3, q)
381 DC_PREDICTOR_TOP(8, 8, 3, q)
382 DC_PREDICTOR_TOP(8, 16, 3, q)
383 DC_PREDICTOR_TOP(8, 32, 3, q)
384 DC_PREDICTOR_TOP(16, 4, 4, q)
385 DC_PREDICTOR_TOP(16, 8, 4, q)
386 DC_PREDICTOR_TOP(16, 16, 4, q)
387 DC_PREDICTOR_TOP(16, 32, 4, q)
388 DC_PREDICTOR_TOP(16, 64, 4, q)
389 DC_PREDICTOR_TOP(32, 8, 5, q)
390 DC_PREDICTOR_TOP(32, 16, 5, q)
391 DC_PREDICTOR_TOP(32, 32, 5, q)
392 DC_PREDICTOR_TOP(32, 64, 5, q)
393 DC_PREDICTOR_TOP(64, 16, 6, q)
394 DC_PREDICTOR_TOP(64, 32, 6, q)
395 DC_PREDICTOR_TOP(64, 64, 6, q)
396 #else
397 DC_PREDICTOR_TOP(4, 4, 2, )
398 DC_PREDICTOR_TOP(4, 8, 2, )
399 DC_PREDICTOR_TOP(8, 4, 3, q)
400 DC_PREDICTOR_TOP(8, 8, 3, q)
401 DC_PREDICTOR_TOP(8, 16, 3, q)
402 DC_PREDICTOR_TOP(16, 8, 4, q)
403 DC_PREDICTOR_TOP(16, 16, 4, q)
404 DC_PREDICTOR_TOP(16, 32, 4, q)
405 DC_PREDICTOR_TOP(32, 16, 5, q)
406 DC_PREDICTOR_TOP(32, 32, 5, q)
407 DC_PREDICTOR_TOP(32, 64, 5, q)
408 DC_PREDICTOR_TOP(64, 32, 6, q)
409 DC_PREDICTOR_TOP(64, 64, 6, q)
410 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
411 
412 #undef DC_PREDICTOR_TOP
413 
414 // -----------------------------------------------------------------------------
415 // V_PRED
416 
417 #define HIGHBD_V_NXM(W, H)                                    \
418   void aom_highbd_v_predictor_##W##x##H##_neon(               \
419       uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
420       const uint16_t *left, int bd) {                         \
421     (void)left;                                               \
422     (void)bd;                                                 \
423     vertical##W##xh_neon(dst, stride, above, H);              \
424   }
425 
load_uint16x8x2(uint16_t const * ptr)426 static inline uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) {
427   uint16x8x2_t x;
428   // Clang/gcc uses ldp here.
429   x.val[0] = vld1q_u16(ptr);
430   x.val[1] = vld1q_u16(ptr + 8);
431   return x;
432 }
433 
store_uint16x8x2(uint16_t * ptr,uint16x8x2_t x)434 static inline void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) {
435   vst1q_u16(ptr, x.val[0]);
436   vst1q_u16(ptr + 8, x.val[1]);
437 }
438 
vertical4xh_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * const above,int height)439 static inline void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride,
440                                     const uint16_t *const above, int height) {
441   const uint16x4_t row = vld1_u16(above);
442   int y = height;
443   do {
444     vst1_u16(dst, row);
445     vst1_u16(dst + stride, row);
446     dst += stride << 1;
447     y -= 2;
448   } while (y != 0);
449 }
450 
vertical8xh_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * const above,int height)451 static inline void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride,
452                                     const uint16_t *const above, int height) {
453   const uint16x8_t row = vld1q_u16(above);
454   int y = height;
455   do {
456     vst1q_u16(dst, row);
457     vst1q_u16(dst + stride, row);
458     dst += stride << 1;
459     y -= 2;
460   } while (y != 0);
461 }
462 
vertical16xh_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * const above,int height)463 static inline void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride,
464                                      const uint16_t *const above, int height) {
465   const uint16x8x2_t row = load_uint16x8x2(above);
466   int y = height;
467   do {
468     store_uint16x8x2(dst, row);
469     store_uint16x8x2(dst + stride, row);
470     dst += stride << 1;
471     y -= 2;
472   } while (y != 0);
473 }
474 
load_uint16x8x4(uint16_t const * ptr)475 static inline uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) {
476   uint16x8x4_t x;
477   // Clang/gcc uses ldp here.
478   x.val[0] = vld1q_u16(ptr);
479   x.val[1] = vld1q_u16(ptr + 8);
480   x.val[2] = vld1q_u16(ptr + 16);
481   x.val[3] = vld1q_u16(ptr + 24);
482   return x;
483 }
484 
store_uint16x8x4(uint16_t * ptr,uint16x8x4_t x)485 static inline void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) {
486   vst1q_u16(ptr, x.val[0]);
487   vst1q_u16(ptr + 8, x.val[1]);
488   vst1q_u16(ptr + 16, x.val[2]);
489   vst1q_u16(ptr + 24, x.val[3]);
490 }
491 
vertical32xh_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * const above,int height)492 static inline void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride,
493                                      const uint16_t *const above, int height) {
494   const uint16x8x4_t row = load_uint16x8x4(above);
495   int y = height;
496   do {
497     store_uint16x8x4(dst, row);
498     store_uint16x8x4(dst + stride, row);
499     dst += stride << 1;
500     y -= 2;
501   } while (y != 0);
502 }
503 
vertical64xh_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * const above,int height)504 static inline void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride,
505                                      const uint16_t *const above, int height) {
506   uint16_t *dst32 = dst + 32;
507   const uint16x8x4_t row = load_uint16x8x4(above);
508   const uint16x8x4_t row32 = load_uint16x8x4(above + 32);
509   int y = height;
510   do {
511     store_uint16x8x4(dst, row);
512     store_uint16x8x4(dst32, row32);
513     store_uint16x8x4(dst + stride, row);
514     store_uint16x8x4(dst32 + stride, row32);
515     dst += stride << 1;
516     dst32 += stride << 1;
517     y -= 2;
518   } while (y != 0);
519 }
520 
521 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
522 HIGHBD_V_NXM(4, 4)
523 HIGHBD_V_NXM(4, 8)
524 HIGHBD_V_NXM(4, 16)
525 
526 HIGHBD_V_NXM(8, 4)
527 HIGHBD_V_NXM(8, 8)
528 HIGHBD_V_NXM(8, 16)
529 HIGHBD_V_NXM(8, 32)
530 
531 HIGHBD_V_NXM(16, 4)
532 HIGHBD_V_NXM(16, 8)
533 HIGHBD_V_NXM(16, 16)
534 HIGHBD_V_NXM(16, 32)
535 HIGHBD_V_NXM(16, 64)
536 
537 HIGHBD_V_NXM(32, 8)
538 HIGHBD_V_NXM(32, 16)
539 HIGHBD_V_NXM(32, 32)
540 HIGHBD_V_NXM(32, 64)
541 
542 HIGHBD_V_NXM(64, 16)
543 HIGHBD_V_NXM(64, 32)
544 HIGHBD_V_NXM(64, 64)
545 #else
546 HIGHBD_V_NXM(4, 4)
547 HIGHBD_V_NXM(4, 8)
548 
549 HIGHBD_V_NXM(8, 4)
550 HIGHBD_V_NXM(8, 8)
551 HIGHBD_V_NXM(8, 16)
552 
553 HIGHBD_V_NXM(16, 8)
554 HIGHBD_V_NXM(16, 16)
555 HIGHBD_V_NXM(16, 32)
556 
557 HIGHBD_V_NXM(32, 16)
558 HIGHBD_V_NXM(32, 32)
559 HIGHBD_V_NXM(32, 64)
560 
561 HIGHBD_V_NXM(64, 32)
562 HIGHBD_V_NXM(64, 64)
563 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
564 
565 // -----------------------------------------------------------------------------
566 // H_PRED
567 
highbd_h_store_4x4(uint16_t * dst,ptrdiff_t stride,uint16x4_t left)568 static inline void highbd_h_store_4x4(uint16_t *dst, ptrdiff_t stride,
569                                       uint16x4_t left) {
570   vst1_u16(dst + 0 * stride, vdup_lane_u16(left, 0));
571   vst1_u16(dst + 1 * stride, vdup_lane_u16(left, 1));
572   vst1_u16(dst + 2 * stride, vdup_lane_u16(left, 2));
573   vst1_u16(dst + 3 * stride, vdup_lane_u16(left, 3));
574 }
575 
highbd_h_store_8x4(uint16_t * dst,ptrdiff_t stride,uint16x4_t left)576 static inline void highbd_h_store_8x4(uint16_t *dst, ptrdiff_t stride,
577                                       uint16x4_t left) {
578   vst1q_u16(dst + 0 * stride, vdupq_lane_u16(left, 0));
579   vst1q_u16(dst + 1 * stride, vdupq_lane_u16(left, 1));
580   vst1q_u16(dst + 2 * stride, vdupq_lane_u16(left, 2));
581   vst1q_u16(dst + 3 * stride, vdupq_lane_u16(left, 3));
582 }
583 
highbd_h_store_16x1(uint16_t * dst,uint16x8_t left)584 static inline void highbd_h_store_16x1(uint16_t *dst, uint16x8_t left) {
585   vst1q_u16(dst + 0, left);
586   vst1q_u16(dst + 8, left);
587 }
588 
highbd_h_store_16x4(uint16_t * dst,ptrdiff_t stride,uint16x4_t left)589 static inline void highbd_h_store_16x4(uint16_t *dst, ptrdiff_t stride,
590                                        uint16x4_t left) {
591   highbd_h_store_16x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
592   highbd_h_store_16x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
593   highbd_h_store_16x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
594   highbd_h_store_16x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
595 }
596 
highbd_h_store_32x1(uint16_t * dst,uint16x8_t left)597 static inline void highbd_h_store_32x1(uint16_t *dst, uint16x8_t left) {
598   vst1q_u16(dst + 0, left);
599   vst1q_u16(dst + 8, left);
600   vst1q_u16(dst + 16, left);
601   vst1q_u16(dst + 24, left);
602 }
603 
highbd_h_store_32x4(uint16_t * dst,ptrdiff_t stride,uint16x4_t left)604 static inline void highbd_h_store_32x4(uint16_t *dst, ptrdiff_t stride,
605                                        uint16x4_t left) {
606   highbd_h_store_32x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
607   highbd_h_store_32x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
608   highbd_h_store_32x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
609   highbd_h_store_32x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
610 }
611 
highbd_h_store_64x1(uint16_t * dst,uint16x8_t left)612 static inline void highbd_h_store_64x1(uint16_t *dst, uint16x8_t left) {
613   vst1q_u16(dst + 0, left);
614   vst1q_u16(dst + 8, left);
615   vst1q_u16(dst + 16, left);
616   vst1q_u16(dst + 24, left);
617   vst1q_u16(dst + 32, left);
618   vst1q_u16(dst + 40, left);
619   vst1q_u16(dst + 48, left);
620   vst1q_u16(dst + 56, left);
621 }
622 
highbd_h_store_64x4(uint16_t * dst,ptrdiff_t stride,uint16x4_t left)623 static inline void highbd_h_store_64x4(uint16_t *dst, ptrdiff_t stride,
624                                        uint16x4_t left) {
625   highbd_h_store_64x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
626   highbd_h_store_64x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
627   highbd_h_store_64x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
628   highbd_h_store_64x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
629 }
630 
aom_highbd_h_predictor_4x4_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)631 void aom_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
632                                      const uint16_t *above,
633                                      const uint16_t *left, int bd) {
634   (void)above;
635   (void)bd;
636   highbd_h_store_4x4(dst, stride, vld1_u16(left));
637 }
638 
aom_highbd_h_predictor_4x8_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)639 void aom_highbd_h_predictor_4x8_neon(uint16_t *dst, ptrdiff_t stride,
640                                      const uint16_t *above,
641                                      const uint16_t *left, int bd) {
642   (void)above;
643   (void)bd;
644   uint16x8_t l = vld1q_u16(left);
645   highbd_h_store_4x4(dst + 0 * stride, stride, vget_low_u16(l));
646   highbd_h_store_4x4(dst + 4 * stride, stride, vget_high_u16(l));
647 }
648 
aom_highbd_h_predictor_8x4_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)649 void aom_highbd_h_predictor_8x4_neon(uint16_t *dst, ptrdiff_t stride,
650                                      const uint16_t *above,
651                                      const uint16_t *left, int bd) {
652   (void)above;
653   (void)bd;
654   highbd_h_store_8x4(dst, stride, vld1_u16(left));
655 }
656 
aom_highbd_h_predictor_8x8_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)657 void aom_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
658                                      const uint16_t *above,
659                                      const uint16_t *left, int bd) {
660   (void)above;
661   (void)bd;
662   uint16x8_t l = vld1q_u16(left);
663   highbd_h_store_8x4(dst + 0 * stride, stride, vget_low_u16(l));
664   highbd_h_store_8x4(dst + 4 * stride, stride, vget_high_u16(l));
665 }
666 
667 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_highbd_h_predictor_16x4_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)668 void aom_highbd_h_predictor_16x4_neon(uint16_t *dst, ptrdiff_t stride,
669                                       const uint16_t *above,
670                                       const uint16_t *left, int bd) {
671   (void)above;
672   (void)bd;
673   highbd_h_store_16x4(dst, stride, vld1_u16(left));
674 }
675 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
676 
aom_highbd_h_predictor_16x8_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)677 void aom_highbd_h_predictor_16x8_neon(uint16_t *dst, ptrdiff_t stride,
678                                       const uint16_t *above,
679                                       const uint16_t *left, int bd) {
680   (void)above;
681   (void)bd;
682   uint16x8_t l = vld1q_u16(left);
683   highbd_h_store_16x4(dst + 0 * stride, stride, vget_low_u16(l));
684   highbd_h_store_16x4(dst + 4 * stride, stride, vget_high_u16(l));
685 }
686 
687 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
aom_highbd_h_predictor_32x8_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)688 void aom_highbd_h_predictor_32x8_neon(uint16_t *dst, ptrdiff_t stride,
689                                       const uint16_t *above,
690                                       const uint16_t *left, int bd) {
691   (void)above;
692   (void)bd;
693   uint16x8_t l = vld1q_u16(left);
694   highbd_h_store_32x4(dst + 0 * stride, stride, vget_low_u16(l));
695   highbd_h_store_32x4(dst + 4 * stride, stride, vget_high_u16(l));
696 }
697 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
698 
699 // For cases where height >= 16 we use pairs of loads to get LDP instructions.
700 #define HIGHBD_H_WXH_LARGE(w, h)                                            \
701   void aom_highbd_h_predictor_##w##x##h##_neon(                             \
702       uint16_t *dst, ptrdiff_t stride, const uint16_t *above,               \
703       const uint16_t *left, int bd) {                                       \
704     (void)above;                                                            \
705     (void)bd;                                                               \
706     for (int i = 0; i < (h) / 16; ++i) {                                    \
707       uint16x8_t l0 = vld1q_u16(left + 0);                                  \
708       uint16x8_t l1 = vld1q_u16(left + 8);                                  \
709       highbd_h_store_##w##x4(dst + 0 * stride, stride, vget_low_u16(l0));   \
710       highbd_h_store_##w##x4(dst + 4 * stride, stride, vget_high_u16(l0));  \
711       highbd_h_store_##w##x4(dst + 8 * stride, stride, vget_low_u16(l1));   \
712       highbd_h_store_##w##x4(dst + 12 * stride, stride, vget_high_u16(l1)); \
713       left += 16;                                                           \
714       dst += 16 * stride;                                                   \
715     }                                                                       \
716   }
717 
718 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
719 HIGHBD_H_WXH_LARGE(4, 16)
720 HIGHBD_H_WXH_LARGE(8, 16)
721 HIGHBD_H_WXH_LARGE(8, 32)
722 HIGHBD_H_WXH_LARGE(16, 16)
723 HIGHBD_H_WXH_LARGE(16, 32)
724 HIGHBD_H_WXH_LARGE(16, 64)
725 HIGHBD_H_WXH_LARGE(32, 16)
726 HIGHBD_H_WXH_LARGE(32, 32)
727 HIGHBD_H_WXH_LARGE(32, 64)
728 HIGHBD_H_WXH_LARGE(64, 16)
729 HIGHBD_H_WXH_LARGE(64, 32)
730 HIGHBD_H_WXH_LARGE(64, 64)
731 #else
732 HIGHBD_H_WXH_LARGE(8, 16)
733 HIGHBD_H_WXH_LARGE(16, 16)
734 HIGHBD_H_WXH_LARGE(16, 32)
735 HIGHBD_H_WXH_LARGE(32, 16)
736 HIGHBD_H_WXH_LARGE(32, 32)
737 HIGHBD_H_WXH_LARGE(32, 64)
738 HIGHBD_H_WXH_LARGE(64, 32)
739 HIGHBD_H_WXH_LARGE(64, 64)
740 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
741 
742 #undef HIGHBD_H_WXH_LARGE
743 
744 // -----------------------------------------------------------------------------
745 // PAETH
746 
highbd_paeth_4or8_x_h_neon(uint16_t * dest,ptrdiff_t stride,const uint16_t * const top_row,const uint16_t * const left_column,int width,int height)747 static inline void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride,
748                                               const uint16_t *const top_row,
749                                               const uint16_t *const left_column,
750                                               int width, int height) {
751   const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
752   const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
753   uint16x8_t top;
754   if (width == 4) {
755     top = vcombine_u16(vld1_u16(top_row), vdup_n_u16(0));
756   } else {  // width == 8
757     top = vld1q_u16(top_row);
758   }
759 
760   for (int y = 0; y < height; ++y) {
761     const uint16x8_t left = vdupq_n_u16(left_column[y]);
762 
763     const uint16x8_t left_dist = vabdq_u16(top, top_left);
764     const uint16x8_t top_dist = vabdq_u16(left, top_left);
765     const uint16x8_t top_left_dist =
766         vabdq_u16(vaddq_u16(top, left), top_left_x2);
767 
768     const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
769     const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
770     const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
771 
772     // if (left_dist <= top_dist && left_dist <= top_left_dist)
773     const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
774     //   dest[x] = left_column[y];
775     // Fill all the unused spaces with 'top'. They will be overwritten when
776     // the positions for top_left are known.
777     uint16x8_t result = vbslq_u16(left_mask, left, top);
778     // else if (top_dist <= top_left_dist)
779     //   dest[x] = top_row[x];
780     // Add these values to the mask. They were already set.
781     const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
782     // else
783     //   dest[x] = top_left;
784     result = vbslq_u16(left_or_top_mask, result, top_left);
785 
786     if (width == 4) {
787       vst1_u16(dest, vget_low_u16(result));
788     } else {  // width == 8
789       vst1q_u16(dest, result);
790     }
791     dest += stride;
792   }
793 }
794 
795 #define HIGHBD_PAETH_NXM(W, H)                                  \
796   void aom_highbd_paeth_predictor_##W##x##H##_neon(             \
797       uint16_t *dst, ptrdiff_t stride, const uint16_t *above,   \
798       const uint16_t *left, int bd) {                           \
799     (void)bd;                                                   \
800     highbd_paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
801   }
802 
803 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
804 HIGHBD_PAETH_NXM(4, 4)
805 HIGHBD_PAETH_NXM(4, 8)
806 HIGHBD_PAETH_NXM(4, 16)
807 HIGHBD_PAETH_NXM(8, 4)
808 HIGHBD_PAETH_NXM(8, 8)
809 HIGHBD_PAETH_NXM(8, 16)
810 HIGHBD_PAETH_NXM(8, 32)
811 #else
812 HIGHBD_PAETH_NXM(4, 4)
813 HIGHBD_PAETH_NXM(4, 8)
814 HIGHBD_PAETH_NXM(8, 4)
815 HIGHBD_PAETH_NXM(8, 8)
816 HIGHBD_PAETH_NXM(8, 16)
817 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
818 
819 // Select the closest values and collect them.
select_paeth(const uint16x8_t top,const uint16x8_t left,const uint16x8_t top_left,const uint16x8_t left_le_top,const uint16x8_t left_le_top_left,const uint16x8_t top_le_top_left)820 static inline uint16x8_t select_paeth(const uint16x8_t top,
821                                       const uint16x8_t left,
822                                       const uint16x8_t top_left,
823                                       const uint16x8_t left_le_top,
824                                       const uint16x8_t left_le_top_left,
825                                       const uint16x8_t top_le_top_left) {
826   // if (left_dist <= top_dist && left_dist <= top_left_dist)
827   const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
828   //   dest[x] = left_column[y];
829   // Fill all the unused spaces with 'top'. They will be overwritten when
830   // the positions for top_left are known.
831   const uint16x8_t result = vbslq_u16(left_mask, left, top);
832   // else if (top_dist <= top_left_dist)
833   //   dest[x] = top_row[x];
834   // Add these values to the mask. They were already set.
835   const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
836   // else
837   //   dest[x] = top_left;
838   return vbslq_u16(left_or_top_mask, result, top_left);
839 }
840 
841 #define PAETH_PREDICTOR(num)                                                  \
842   do {                                                                        \
843     const uint16x8_t left_dist = vabdq_u16(top[num], top_left);               \
844     const uint16x8_t top_left_dist =                                          \
845         vabdq_u16(vaddq_u16(top[num], left), top_left_x2);                    \
846     const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);            \
847     const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);  \
848     const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);    \
849     const uint16x8_t result =                                                 \
850         select_paeth(top[num], left, top_left, left_le_top, left_le_top_left, \
851                      top_le_top_left);                                        \
852     vst1q_u16(dest + (num * 8), result);                                      \
853   } while (0)
854 
855 #define LOAD_TOP_ROW(num) vld1q_u16(top_row + (num * 8))
856 
highbd_paeth16_plus_x_h_neon(uint16_t * dest,ptrdiff_t stride,const uint16_t * const top_row,const uint16_t * const left_column,int width,int height)857 static inline void highbd_paeth16_plus_x_h_neon(
858     uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row,
859     const uint16_t *const left_column, int width, int height) {
860   const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
861   const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
862   uint16x8_t top[8];
863   top[0] = LOAD_TOP_ROW(0);
864   top[1] = LOAD_TOP_ROW(1);
865   if (width > 16) {
866     top[2] = LOAD_TOP_ROW(2);
867     top[3] = LOAD_TOP_ROW(3);
868     if (width == 64) {
869       top[4] = LOAD_TOP_ROW(4);
870       top[5] = LOAD_TOP_ROW(5);
871       top[6] = LOAD_TOP_ROW(6);
872       top[7] = LOAD_TOP_ROW(7);
873     }
874   }
875 
876   for (int y = 0; y < height; ++y) {
877     const uint16x8_t left = vdupq_n_u16(left_column[y]);
878     const uint16x8_t top_dist = vabdq_u16(left, top_left);
879     PAETH_PREDICTOR(0);
880     PAETH_PREDICTOR(1);
881     if (width > 16) {
882       PAETH_PREDICTOR(2);
883       PAETH_PREDICTOR(3);
884       if (width == 64) {
885         PAETH_PREDICTOR(4);
886         PAETH_PREDICTOR(5);
887         PAETH_PREDICTOR(6);
888         PAETH_PREDICTOR(7);
889       }
890     }
891     dest += stride;
892   }
893 }
894 
895 #define HIGHBD_PAETH_NXM_WIDE(W, H)                               \
896   void aom_highbd_paeth_predictor_##W##x##H##_neon(               \
897       uint16_t *dst, ptrdiff_t stride, const uint16_t *above,     \
898       const uint16_t *left, int bd) {                             \
899     (void)bd;                                                     \
900     highbd_paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
901   }
902 
903 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
904 HIGHBD_PAETH_NXM_WIDE(16, 4)
905 HIGHBD_PAETH_NXM_WIDE(16, 8)
906 HIGHBD_PAETH_NXM_WIDE(16, 16)
907 HIGHBD_PAETH_NXM_WIDE(16, 32)
908 HIGHBD_PAETH_NXM_WIDE(16, 64)
909 HIGHBD_PAETH_NXM_WIDE(32, 8)
910 HIGHBD_PAETH_NXM_WIDE(32, 16)
911 HIGHBD_PAETH_NXM_WIDE(32, 32)
912 HIGHBD_PAETH_NXM_WIDE(32, 64)
913 HIGHBD_PAETH_NXM_WIDE(64, 16)
914 HIGHBD_PAETH_NXM_WIDE(64, 32)
915 HIGHBD_PAETH_NXM_WIDE(64, 64)
916 #else
917 HIGHBD_PAETH_NXM_WIDE(16, 8)
918 HIGHBD_PAETH_NXM_WIDE(16, 16)
919 HIGHBD_PAETH_NXM_WIDE(16, 32)
920 HIGHBD_PAETH_NXM_WIDE(32, 16)
921 HIGHBD_PAETH_NXM_WIDE(32, 32)
922 HIGHBD_PAETH_NXM_WIDE(32, 64)
923 HIGHBD_PAETH_NXM_WIDE(64, 32)
924 HIGHBD_PAETH_NXM_WIDE(64, 64)
925 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
926 
927 // -----------------------------------------------------------------------------
928 // SMOOTH
929 
930 // 256 - v = vneg_s8(v)
negate_s8(const uint16x4_t v)931 static inline uint16x4_t negate_s8(const uint16x4_t v) {
932   return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
933 }
934 
highbd_smooth_4xh_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * const top_row,const uint16_t * const left_column,const int height)935 static inline void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride,
936                                           const uint16_t *const top_row,
937                                           const uint16_t *const left_column,
938                                           const int height) {
939   const uint16_t top_right = top_row[3];
940   const uint16_t bottom_left = left_column[height - 1];
941   const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
942 
943   const uint16x4_t top_v = vld1_u16(top_row);
944   const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
945   const uint16x4_t weights_x_v = vld1_u16(smooth_weights_u16);
946   const uint16x4_t scaled_weights_x = negate_s8(weights_x_v);
947   const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
948 
949   for (int y = 0; y < height; ++y) {
950     // Each variable in the running summation is named for the last item to be
951     // accumulated.
952     const uint32x4_t weighted_top =
953         vmlal_n_u16(weighted_tr, top_v, weights_y[y]);
954     const uint32x4_t weighted_left =
955         vmlal_n_u16(weighted_top, weights_x_v, left_column[y]);
956     const uint32x4_t weighted_bl =
957         vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]);
958 
959     const uint16x4_t pred =
960         vrshrn_n_u32(weighted_bl, SMOOTH_WEIGHT_LOG2_SCALE + 1);
961     vst1_u16(dst, pred);
962     dst += stride;
963   }
964 }
965 
966 // Common code between 8xH and [16|32|64]xH.
highbd_calculate_pred8(uint16_t * dst,const uint32x4_t weighted_corners_low,const uint32x4_t weighted_corners_high,const uint16x4x2_t top_vals,const uint16x4x2_t weights_x,const uint16_t left_y,const uint16_t weight_y)967 static inline void highbd_calculate_pred8(
968     uint16_t *dst, const uint32x4_t weighted_corners_low,
969     const uint32x4_t weighted_corners_high, const uint16x4x2_t top_vals,
970     const uint16x4x2_t weights_x, const uint16_t left_y,
971     const uint16_t weight_y) {
972   // Each variable in the running summation is named for the last item to be
973   // accumulated.
974   const uint32x4_t weighted_top_low =
975       vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y);
976   const uint32x4_t weighted_edges_low =
977       vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y);
978 
979   const uint16x4_t pred_low =
980       vrshrn_n_u32(weighted_edges_low, SMOOTH_WEIGHT_LOG2_SCALE + 1);
981   vst1_u16(dst, pred_low);
982 
983   const uint32x4_t weighted_top_high =
984       vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y);
985   const uint32x4_t weighted_edges_high =
986       vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y);
987 
988   const uint16x4_t pred_high =
989       vrshrn_n_u32(weighted_edges_high, SMOOTH_WEIGHT_LOG2_SCALE + 1);
990   vst1_u16(dst + 4, pred_high);
991 }
992 
highbd_smooth_8xh_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * const top_row,const uint16_t * const left_column,const int height)993 static void highbd_smooth_8xh_neon(uint16_t *dst, ptrdiff_t stride,
994                                    const uint16_t *const top_row,
995                                    const uint16_t *const left_column,
996                                    const int height) {
997   const uint16_t top_right = top_row[7];
998   const uint16_t bottom_left = left_column[height - 1];
999   const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
1000 
1001   const uint16x4x2_t top_vals = { { vld1_u16(top_row),
1002                                     vld1_u16(top_row + 4) } };
1003   const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
1004   const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
1005                                      vld1_u16(smooth_weights_u16 + 8) } };
1006   const uint32x4_t weighted_tr_low =
1007       vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
1008   const uint32x4_t weighted_tr_high =
1009       vmull_n_u16(negate_s8(weights_x.val[1]), top_right);
1010 
1011   for (int y = 0; y < height; ++y) {
1012     const uint32x4_t weighted_bl =
1013         vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
1014     const uint32x4_t weighted_corners_low =
1015         vaddq_u32(weighted_bl, weighted_tr_low);
1016     const uint32x4_t weighted_corners_high =
1017         vaddq_u32(weighted_bl, weighted_tr_high);
1018     highbd_calculate_pred8(dst, weighted_corners_low, weighted_corners_high,
1019                            top_vals, weights_x, left_column[y], weights_y[y]);
1020     dst += stride;
1021   }
1022 }
1023 
1024 #define HIGHBD_SMOOTH_NXM(W, H)                                 \
1025   void aom_highbd_smooth_predictor_##W##x##H##_neon(            \
1026       uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
1027       const uint16_t *left, int bd) {                           \
1028     (void)bd;                                                   \
1029     highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H);  \
1030   }
1031 
1032 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1033 HIGHBD_SMOOTH_NXM(4, 4)
1034 HIGHBD_SMOOTH_NXM(4, 8)
1035 HIGHBD_SMOOTH_NXM(8, 4)
1036 HIGHBD_SMOOTH_NXM(8, 8)
1037 HIGHBD_SMOOTH_NXM(4, 16)
1038 HIGHBD_SMOOTH_NXM(8, 16)
1039 HIGHBD_SMOOTH_NXM(8, 32)
1040 #else
1041 HIGHBD_SMOOTH_NXM(4, 4)
1042 HIGHBD_SMOOTH_NXM(4, 8)
1043 HIGHBD_SMOOTH_NXM(8, 4)
1044 HIGHBD_SMOOTH_NXM(8, 8)
1045 HIGHBD_SMOOTH_NXM(8, 16)
1046 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1047 
1048 #undef HIGHBD_SMOOTH_NXM
1049 
1050 // For width 16 and above.
1051 #define HIGHBD_SMOOTH_PREDICTOR(W)                                             \
1052   static void highbd_smooth_##W##xh_neon(                                      \
1053       uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row,          \
1054       const uint16_t *const left_column, const int height) {                   \
1055     const uint16_t top_right = top_row[(W)-1];                                 \
1056     const uint16_t bottom_left = left_column[height - 1];                      \
1057     const uint16_t *const weights_y = smooth_weights_u16 + height - 4;         \
1058                                                                                \
1059     /* Precompute weighted values that don't vary with |y|. */                 \
1060     uint32x4_t weighted_tr_low[(W) >> 3];                                      \
1061     uint32x4_t weighted_tr_high[(W) >> 3];                                     \
1062     for (int i = 0; i < (W) >> 3; ++i) {                                       \
1063       const int x = i << 3;                                                    \
1064       const uint16x4_t weights_x_low =                                         \
1065           vld1_u16(smooth_weights_u16 + (W)-4 + x);                            \
1066       weighted_tr_low[i] = vmull_n_u16(negate_s8(weights_x_low), top_right);   \
1067       const uint16x4_t weights_x_high =                                        \
1068           vld1_u16(smooth_weights_u16 + (W) + x);                              \
1069       weighted_tr_high[i] = vmull_n_u16(negate_s8(weights_x_high), top_right); \
1070     }                                                                          \
1071                                                                                \
1072     const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);                  \
1073     for (int y = 0; y < height; ++y) {                                         \
1074       const uint32x4_t weighted_bl =                                           \
1075           vmull_n_u16(bottom_left_v, 256 - weights_y[y]);                      \
1076       uint16_t *dst_x = dst;                                                   \
1077       for (int i = 0; i < (W) >> 3; ++i) {                                     \
1078         const int x = i << 3;                                                  \
1079         const uint16x4x2_t top_vals = { { vld1_u16(top_row + x),               \
1080                                           vld1_u16(top_row + x + 4) } };       \
1081         const uint32x4_t weighted_corners_low =                                \
1082             vaddq_u32(weighted_bl, weighted_tr_low[i]);                        \
1083         const uint32x4_t weighted_corners_high =                               \
1084             vaddq_u32(weighted_bl, weighted_tr_high[i]);                       \
1085         /* Accumulate weighted edge values and store. */                       \
1086         const uint16x4x2_t weights_x = {                                       \
1087           { vld1_u16(smooth_weights_u16 + (W)-4 + x),                          \
1088             vld1_u16(smooth_weights_u16 + (W) + x) }                           \
1089         };                                                                     \
1090         highbd_calculate_pred8(dst_x, weighted_corners_low,                    \
1091                                weighted_corners_high, top_vals, weights_x,     \
1092                                left_column[y], weights_y[y]);                  \
1093         dst_x += 8;                                                            \
1094       }                                                                        \
1095       dst += stride;                                                           \
1096     }                                                                          \
1097   }
1098 
1099 HIGHBD_SMOOTH_PREDICTOR(16)
1100 HIGHBD_SMOOTH_PREDICTOR(32)
1101 HIGHBD_SMOOTH_PREDICTOR(64)
1102 
1103 #undef HIGHBD_SMOOTH_PREDICTOR
1104 
1105 #define HIGHBD_SMOOTH_NXM_WIDE(W, H)                            \
1106   void aom_highbd_smooth_predictor_##W##x##H##_neon(            \
1107       uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
1108       const uint16_t *left, int bd) {                           \
1109     (void)bd;                                                   \
1110     highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H);  \
1111   }
1112 
1113 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1114 HIGHBD_SMOOTH_NXM_WIDE(16, 4)
1115 HIGHBD_SMOOTH_NXM_WIDE(16, 8)
1116 HIGHBD_SMOOTH_NXM_WIDE(16, 16)
1117 HIGHBD_SMOOTH_NXM_WIDE(16, 32)
1118 HIGHBD_SMOOTH_NXM_WIDE(16, 64)
1119 HIGHBD_SMOOTH_NXM_WIDE(32, 8)
1120 HIGHBD_SMOOTH_NXM_WIDE(32, 16)
1121 HIGHBD_SMOOTH_NXM_WIDE(32, 32)
1122 HIGHBD_SMOOTH_NXM_WIDE(32, 64)
1123 HIGHBD_SMOOTH_NXM_WIDE(64, 16)
1124 HIGHBD_SMOOTH_NXM_WIDE(64, 32)
1125 HIGHBD_SMOOTH_NXM_WIDE(64, 64)
1126 #else
1127 HIGHBD_SMOOTH_NXM_WIDE(16, 8)
1128 HIGHBD_SMOOTH_NXM_WIDE(16, 16)
1129 HIGHBD_SMOOTH_NXM_WIDE(16, 32)
1130 HIGHBD_SMOOTH_NXM_WIDE(32, 16)
1131 HIGHBD_SMOOTH_NXM_WIDE(32, 32)
1132 HIGHBD_SMOOTH_NXM_WIDE(32, 64)
1133 HIGHBD_SMOOTH_NXM_WIDE(64, 32)
1134 HIGHBD_SMOOTH_NXM_WIDE(64, 64)
1135 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1136 
1137 #undef HIGHBD_SMOOTH_NXM_WIDE
1138 
highbd_smooth_v_4xh_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * const top_row,const uint16_t * const left_column,const int height)1139 static void highbd_smooth_v_4xh_neon(uint16_t *dst, ptrdiff_t stride,
1140                                      const uint16_t *const top_row,
1141                                      const uint16_t *const left_column,
1142                                      const int height) {
1143   const uint16_t bottom_left = left_column[height - 1];
1144   const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
1145 
1146   const uint16x4_t top_v = vld1_u16(top_row);
1147   const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
1148 
1149   for (int y = 0; y < height; ++y) {
1150     const uint32x4_t weighted_bl =
1151         vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
1152     const uint32x4_t weighted_top =
1153         vmlal_n_u16(weighted_bl, top_v, weights_y[y]);
1154     vst1_u16(dst, vrshrn_n_u32(weighted_top, SMOOTH_WEIGHT_LOG2_SCALE));
1155 
1156     dst += stride;
1157   }
1158 }
1159 
highbd_smooth_v_8xh_neon(uint16_t * dst,const ptrdiff_t stride,const uint16_t * const top_row,const uint16_t * const left_column,const int height)1160 static void highbd_smooth_v_8xh_neon(uint16_t *dst, const ptrdiff_t stride,
1161                                      const uint16_t *const top_row,
1162                                      const uint16_t *const left_column,
1163                                      const int height) {
1164   const uint16_t bottom_left = left_column[height - 1];
1165   const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
1166 
1167   const uint16x4_t top_low = vld1_u16(top_row);
1168   const uint16x4_t top_high = vld1_u16(top_row + 4);
1169   const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
1170 
1171   for (int y = 0; y < height; ++y) {
1172     const uint32x4_t weighted_bl =
1173         vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
1174 
1175     const uint32x4_t weighted_top_low =
1176         vmlal_n_u16(weighted_bl, top_low, weights_y[y]);
1177     vst1_u16(dst, vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE));
1178 
1179     const uint32x4_t weighted_top_high =
1180         vmlal_n_u16(weighted_bl, top_high, weights_y[y]);
1181     vst1_u16(dst + 4,
1182              vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE));
1183     dst += stride;
1184   }
1185 }
1186 
1187 #define HIGHBD_SMOOTH_V_NXM(W, H)                                \
1188   void aom_highbd_smooth_v_predictor_##W##x##H##_neon(           \
1189       uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
1190       const uint16_t *left, int bd) {                            \
1191     (void)bd;                                                    \
1192     highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
1193   }
1194 
1195 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1196 HIGHBD_SMOOTH_V_NXM(4, 4)
1197 HIGHBD_SMOOTH_V_NXM(4, 8)
1198 HIGHBD_SMOOTH_V_NXM(4, 16)
1199 HIGHBD_SMOOTH_V_NXM(8, 4)
1200 HIGHBD_SMOOTH_V_NXM(8, 8)
1201 HIGHBD_SMOOTH_V_NXM(8, 16)
1202 HIGHBD_SMOOTH_V_NXM(8, 32)
1203 #else
1204 HIGHBD_SMOOTH_V_NXM(4, 4)
1205 HIGHBD_SMOOTH_V_NXM(4, 8)
1206 HIGHBD_SMOOTH_V_NXM(8, 4)
1207 HIGHBD_SMOOTH_V_NXM(8, 8)
1208 HIGHBD_SMOOTH_V_NXM(8, 16)
1209 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1210 
1211 #undef HIGHBD_SMOOTH_V_NXM
1212 
1213 // For width 16 and above.
1214 #define HIGHBD_SMOOTH_V_PREDICTOR(W)                                         \
1215   static void highbd_smooth_v_##W##xh_neon(                                  \
1216       uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row,  \
1217       const uint16_t *const left_column, const int height) {                 \
1218     const uint16_t bottom_left = left_column[height - 1];                    \
1219     const uint16_t *const weights_y = smooth_weights_u16 + height - 4;       \
1220                                                                              \
1221     uint16x4x2_t top_vals[(W) >> 3];                                         \
1222     for (int i = 0; i < (W) >> 3; ++i) {                                     \
1223       const int x = i << 3;                                                  \
1224       top_vals[i].val[0] = vld1_u16(top_row + x);                            \
1225       top_vals[i].val[1] = vld1_u16(top_row + x + 4);                        \
1226     }                                                                        \
1227                                                                              \
1228     const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);                \
1229     for (int y = 0; y < height; ++y) {                                       \
1230       const uint32x4_t weighted_bl =                                         \
1231           vmull_n_u16(bottom_left_v, 256 - weights_y[y]);                    \
1232                                                                              \
1233       uint16_t *dst_x = dst;                                                 \
1234       for (int i = 0; i < (W) >> 3; ++i) {                                   \
1235         const uint32x4_t weighted_top_low =                                  \
1236             vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]);      \
1237         vst1_u16(dst_x,                                                      \
1238                  vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE));  \
1239                                                                              \
1240         const uint32x4_t weighted_top_high =                                 \
1241             vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]);      \
1242         vst1_u16(dst_x + 4,                                                  \
1243                  vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
1244         dst_x += 8;                                                          \
1245       }                                                                      \
1246       dst += stride;                                                         \
1247     }                                                                        \
1248   }
1249 
1250 HIGHBD_SMOOTH_V_PREDICTOR(16)
1251 HIGHBD_SMOOTH_V_PREDICTOR(32)
1252 HIGHBD_SMOOTH_V_PREDICTOR(64)
1253 
1254 #undef HIGHBD_SMOOTH_V_PREDICTOR
1255 
1256 #define HIGHBD_SMOOTH_V_NXM_WIDE(W, H)                           \
1257   void aom_highbd_smooth_v_predictor_##W##x##H##_neon(           \
1258       uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
1259       const uint16_t *left, int bd) {                            \
1260     (void)bd;                                                    \
1261     highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
1262   }
1263 
1264 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1265 HIGHBD_SMOOTH_V_NXM_WIDE(16, 4)
1266 HIGHBD_SMOOTH_V_NXM_WIDE(16, 8)
1267 HIGHBD_SMOOTH_V_NXM_WIDE(16, 16)
1268 HIGHBD_SMOOTH_V_NXM_WIDE(16, 32)
1269 HIGHBD_SMOOTH_V_NXM_WIDE(16, 64)
1270 HIGHBD_SMOOTH_V_NXM_WIDE(32, 8)
1271 HIGHBD_SMOOTH_V_NXM_WIDE(32, 16)
1272 HIGHBD_SMOOTH_V_NXM_WIDE(32, 32)
1273 HIGHBD_SMOOTH_V_NXM_WIDE(32, 64)
1274 HIGHBD_SMOOTH_V_NXM_WIDE(64, 16)
1275 HIGHBD_SMOOTH_V_NXM_WIDE(64, 32)
1276 HIGHBD_SMOOTH_V_NXM_WIDE(64, 64)
1277 #else
1278 HIGHBD_SMOOTH_V_NXM_WIDE(16, 8)
1279 HIGHBD_SMOOTH_V_NXM_WIDE(16, 16)
1280 HIGHBD_SMOOTH_V_NXM_WIDE(16, 32)
1281 HIGHBD_SMOOTH_V_NXM_WIDE(32, 16)
1282 HIGHBD_SMOOTH_V_NXM_WIDE(32, 32)
1283 HIGHBD_SMOOTH_V_NXM_WIDE(32, 64)
1284 HIGHBD_SMOOTH_V_NXM_WIDE(64, 32)
1285 HIGHBD_SMOOTH_V_NXM_WIDE(64, 64)
1286 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1287 
1288 #undef HIGHBD_SMOOTH_V_NXM_WIDE
1289 
highbd_smooth_h_4xh_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * const top_row,const uint16_t * const left_column,const int height)1290 static inline void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride,
1291                                             const uint16_t *const top_row,
1292                                             const uint16_t *const left_column,
1293                                             const int height) {
1294   const uint16_t top_right = top_row[3];
1295 
1296   const uint16x4_t weights_x = vld1_u16(smooth_weights_u16);
1297   const uint16x4_t scaled_weights_x = negate_s8(weights_x);
1298 
1299   const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
1300   for (int y = 0; y < height; ++y) {
1301     const uint32x4_t weighted_left =
1302         vmlal_n_u16(weighted_tr, weights_x, left_column[y]);
1303     vst1_u16(dst, vrshrn_n_u32(weighted_left, SMOOTH_WEIGHT_LOG2_SCALE));
1304     dst += stride;
1305   }
1306 }
1307 
highbd_smooth_h_8xh_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * const top_row,const uint16_t * const left_column,const int height)1308 static inline void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride,
1309                                             const uint16_t *const top_row,
1310                                             const uint16_t *const left_column,
1311                                             const int height) {
1312   const uint16_t top_right = top_row[7];
1313 
1314   const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
1315                                      vld1_u16(smooth_weights_u16 + 8) } };
1316 
1317   const uint32x4_t weighted_tr_low =
1318       vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
1319   const uint32x4_t weighted_tr_high =
1320       vmull_n_u16(negate_s8(weights_x.val[1]), top_right);
1321 
1322   for (int y = 0; y < height; ++y) {
1323     const uint16_t left_y = left_column[y];
1324     const uint32x4_t weighted_left_low =
1325         vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y);
1326     vst1_u16(dst, vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));
1327 
1328     const uint32x4_t weighted_left_high =
1329         vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y);
1330     vst1_u16(dst + 4,
1331              vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE));
1332     dst += stride;
1333   }
1334 }
1335 
1336 #define HIGHBD_SMOOTH_H_NXM(W, H)                                \
1337   void aom_highbd_smooth_h_predictor_##W##x##H##_neon(           \
1338       uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
1339       const uint16_t *left, int bd) {                            \
1340     (void)bd;                                                    \
1341     highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
1342   }
1343 
1344 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1345 HIGHBD_SMOOTH_H_NXM(4, 4)
1346 HIGHBD_SMOOTH_H_NXM(4, 8)
1347 HIGHBD_SMOOTH_H_NXM(4, 16)
1348 HIGHBD_SMOOTH_H_NXM(8, 4)
1349 HIGHBD_SMOOTH_H_NXM(8, 8)
1350 HIGHBD_SMOOTH_H_NXM(8, 16)
1351 HIGHBD_SMOOTH_H_NXM(8, 32)
1352 #else
1353 HIGHBD_SMOOTH_H_NXM(4, 4)
1354 HIGHBD_SMOOTH_H_NXM(4, 8)
1355 HIGHBD_SMOOTH_H_NXM(8, 4)
1356 HIGHBD_SMOOTH_H_NXM(8, 8)
1357 HIGHBD_SMOOTH_H_NXM(8, 16)
1358 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1359 
1360 #undef HIGHBD_SMOOTH_H_NXM
1361 
1362 // For width 16 and above.
1363 #define HIGHBD_SMOOTH_H_PREDICTOR(W)                                          \
1364   static void highbd_smooth_h_##W##xh_neon(                                   \
1365       uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row,         \
1366       const uint16_t *const left_column, const int height) {                  \
1367     const uint16_t top_right = top_row[(W)-1];                                \
1368                                                                               \
1369     uint16x4_t weights_x_low[(W) >> 3];                                       \
1370     uint16x4_t weights_x_high[(W) >> 3];                                      \
1371     uint32x4_t weighted_tr_low[(W) >> 3];                                     \
1372     uint32x4_t weighted_tr_high[(W) >> 3];                                    \
1373     for (int i = 0; i < (W) >> 3; ++i) {                                      \
1374       const int x = i << 3;                                                   \
1375       weights_x_low[i] = vld1_u16(smooth_weights_u16 + (W)-4 + x);            \
1376       weighted_tr_low[i] =                                                    \
1377           vmull_n_u16(negate_s8(weights_x_low[i]), top_right);                \
1378       weights_x_high[i] = vld1_u16(smooth_weights_u16 + (W) + x);             \
1379       weighted_tr_high[i] =                                                   \
1380           vmull_n_u16(negate_s8(weights_x_high[i]), top_right);               \
1381     }                                                                         \
1382                                                                               \
1383     for (int y = 0; y < height; ++y) {                                        \
1384       uint16_t *dst_x = dst;                                                  \
1385       const uint16_t left_y = left_column[y];                                 \
1386       for (int i = 0; i < (W) >> 3; ++i) {                                    \
1387         const uint32x4_t weighted_left_low =                                  \
1388             vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y);        \
1389         vst1_u16(dst_x,                                                       \
1390                  vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));  \
1391                                                                               \
1392         const uint32x4_t weighted_left_high =                                 \
1393             vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y);      \
1394         vst1_u16(dst_x + 4,                                                   \
1395                  vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
1396         dst_x += 8;                                                           \
1397       }                                                                       \
1398       dst += stride;                                                          \
1399     }                                                                         \
1400   }
1401 
1402 HIGHBD_SMOOTH_H_PREDICTOR(16)
1403 HIGHBD_SMOOTH_H_PREDICTOR(32)
1404 HIGHBD_SMOOTH_H_PREDICTOR(64)
1405 
1406 #undef HIGHBD_SMOOTH_H_PREDICTOR
1407 
1408 #define HIGHBD_SMOOTH_H_NXM_WIDE(W, H)                           \
1409   void aom_highbd_smooth_h_predictor_##W##x##H##_neon(           \
1410       uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
1411       const uint16_t *left, int bd) {                            \
1412     (void)bd;                                                    \
1413     highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
1414   }
1415 
1416 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1417 HIGHBD_SMOOTH_H_NXM_WIDE(16, 4)
1418 HIGHBD_SMOOTH_H_NXM_WIDE(16, 8)
1419 HIGHBD_SMOOTH_H_NXM_WIDE(16, 16)
1420 HIGHBD_SMOOTH_H_NXM_WIDE(16, 32)
1421 HIGHBD_SMOOTH_H_NXM_WIDE(16, 64)
1422 HIGHBD_SMOOTH_H_NXM_WIDE(32, 8)
1423 HIGHBD_SMOOTH_H_NXM_WIDE(32, 16)
1424 HIGHBD_SMOOTH_H_NXM_WIDE(32, 32)
1425 HIGHBD_SMOOTH_H_NXM_WIDE(32, 64)
1426 HIGHBD_SMOOTH_H_NXM_WIDE(64, 16)
1427 HIGHBD_SMOOTH_H_NXM_WIDE(64, 32)
1428 HIGHBD_SMOOTH_H_NXM_WIDE(64, 64)
1429 #else
1430 HIGHBD_SMOOTH_H_NXM_WIDE(16, 8)
1431 HIGHBD_SMOOTH_H_NXM_WIDE(16, 16)
1432 HIGHBD_SMOOTH_H_NXM_WIDE(16, 32)
1433 HIGHBD_SMOOTH_H_NXM_WIDE(32, 16)
1434 HIGHBD_SMOOTH_H_NXM_WIDE(32, 32)
1435 HIGHBD_SMOOTH_H_NXM_WIDE(32, 64)
1436 HIGHBD_SMOOTH_H_NXM_WIDE(64, 32)
1437 HIGHBD_SMOOTH_H_NXM_WIDE(64, 64)
1438 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
1439 
1440 #undef HIGHBD_SMOOTH_H_NXM_WIDE
1441 
1442 // -----------------------------------------------------------------------------
1443 // Z1
1444 
1445 static int16_t iota1_s16[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
1446 static int16_t iota2_s16[] = { 0, 2, 4, 6, 8, 10, 12, 14 };
1447 
highbd_dr_z1_apply_shift_x4(uint16x4_t a0,uint16x4_t a1,int shift)1448 static AOM_FORCE_INLINE uint16x4_t highbd_dr_z1_apply_shift_x4(uint16x4_t a0,
1449                                                                uint16x4_t a1,
1450                                                                int shift) {
1451   // The C implementation of the z1 predictor uses (32 - shift) and a right
1452   // shift by 5, however we instead double shift to avoid an unnecessary right
1453   // shift by 1.
1454   uint32x4_t res = vmull_n_u16(a1, shift);
1455   res = vmlal_n_u16(res, a0, 64 - shift);
1456   return vrshrn_n_u32(res, 6);
1457 }
1458 
highbd_dr_z1_apply_shift_x8(uint16x8_t a0,uint16x8_t a1,int shift)1459 static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0,
1460                                                                uint16x8_t a1,
1461                                                                int shift) {
1462   return vcombine_u16(
1463       highbd_dr_z1_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1), shift),
1464       highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift));
1465 }
1466 
1467 // clang-format off
1468 static const uint8_t kLoadMaxShuffles[] = {
1469   14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
1470   12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
1471   10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
1472    8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
1473    6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15,
1474    4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15,
1475    2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15,
1476    0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
1477 };
1478 // clang-format on
1479 
zn_load_masked_neon(const uint16_t * ptr,int shuffle_idx)1480 static inline uint16x8_t zn_load_masked_neon(const uint16_t *ptr,
1481                                              int shuffle_idx) {
1482   uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
1483   uint8x16_t src = vreinterpretq_u8_u16(vld1q_u16(ptr));
1484 #if AOM_ARCH_AARCH64
1485   return vreinterpretq_u16_u8(vqtbl1q_u8(src, shuffle));
1486 #else
1487   uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
1488   uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
1489   uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
1490   return vreinterpretq_u16_u8(vcombine_u8(lo, hi));
1491 #endif
1492 }
1493 
highbd_dr_prediction_z1_upsample0_neon(uint16_t * dst,ptrdiff_t stride,int bw,int bh,const uint16_t * above,int dx)1494 static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst,
1495                                                    ptrdiff_t stride, int bw,
1496                                                    int bh,
1497                                                    const uint16_t *above,
1498                                                    int dx) {
1499   assert(bw % 4 == 0);
1500   assert(bh % 4 == 0);
1501   assert(dx > 0);
1502 
1503   const int max_base_x = (bw + bh) - 1;
1504   const int above_max = above[max_base_x];
1505 
1506   const int16x8_t iota1x8 = vld1q_s16(iota1_s16);
1507   const int16x4_t iota1x4 = vget_low_s16(iota1x8);
1508 
1509   int x = dx;
1510   int r = 0;
1511   do {
1512     const int base = x >> 6;
1513     if (base >= max_base_x) {
1514       for (int i = r; i < bh; ++i) {
1515         aom_memset16(dst, above_max, bw);
1516         dst += stride;
1517       }
1518       return;
1519     }
1520 
1521     // The C implementation of the z1 predictor when not upsampling uses:
1522     // ((x & 0x3f) >> 1)
1523     // The right shift is unnecessary here since we instead shift by +1 later,
1524     // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
1525     const int shift = x & 0x3e;
1526 
1527     if (bw == 4) {
1528       const uint16x4_t a0 = vld1_u16(&above[base]);
1529       const uint16x4_t a1 = vld1_u16(&above[base + 1]);
1530       const uint16x4_t val = highbd_dr_z1_apply_shift_x4(a0, a1, shift);
1531       const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota1x4);
1532       const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max));
1533       vst1_u16(dst, res);
1534     } else {
1535       int c = 0;
1536       do {
1537         uint16x8_t a0;
1538         uint16x8_t a1;
1539         if (base + c >= max_base_x) {
1540           a0 = a1 = vdupq_n_u16(above_max);
1541         } else {
1542           if (base + c + 7 >= max_base_x) {
1543             int shuffle_idx = max_base_x - base - c;
1544             a0 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
1545           } else {
1546             a0 = vld1q_u16(above + base + c);
1547           }
1548           if (base + c + 8 >= max_base_x) {
1549             int shuffle_idx = max_base_x - base - c - 1;
1550             a1 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
1551           } else {
1552             a1 = vld1q_u16(above + base + c + 1);
1553           }
1554         }
1555 
1556         vst1q_u16(dst + c, highbd_dr_z1_apply_shift_x8(a0, a1, shift));
1557         c += 8;
1558       } while (c < bw);
1559     }
1560 
1561     dst += stride;
1562     x += dx;
1563   } while (++r < bh);
1564 }
1565 
highbd_dr_prediction_z1_upsample1_neon(uint16_t * dst,ptrdiff_t stride,int bw,int bh,const uint16_t * above,int dx)1566 static void highbd_dr_prediction_z1_upsample1_neon(uint16_t *dst,
1567                                                    ptrdiff_t stride, int bw,
1568                                                    int bh,
1569                                                    const uint16_t *above,
1570                                                    int dx) {
1571   assert(bw % 4 == 0);
1572   assert(bh % 4 == 0);
1573   assert(dx > 0);
1574 
1575   const int max_base_x = ((bw + bh) - 1) << 1;
1576   const int above_max = above[max_base_x];
1577 
1578   const int16x8_t iota2x8 = vld1q_s16(iota2_s16);
1579   const int16x4_t iota2x4 = vget_low_s16(iota2x8);
1580 
1581   int x = dx;
1582   int r = 0;
1583   do {
1584     const int base = x >> 5;
1585     if (base >= max_base_x) {
1586       for (int i = r; i < bh; ++i) {
1587         aom_memset16(dst, above_max, bw);
1588         dst += stride;
1589       }
1590       return;
1591     }
1592 
1593     // The C implementation of the z1 predictor when upsampling uses:
1594     // (((x << 1) & 0x3f) >> 1)
1595     // The right shift is unnecessary here since we instead shift by +1 later,
1596     // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
1597     const int shift = (x << 1) & 0x3e;
1598 
1599     if (bw == 4) {
1600       const uint16x4x2_t a01 = vld2_u16(&above[base]);
1601       const uint16x4_t val =
1602           highbd_dr_z1_apply_shift_x4(a01.val[0], a01.val[1], shift);
1603       const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota2x4);
1604       const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max));
1605       vst1_u16(dst, res);
1606     } else {
1607       int c = 0;
1608       do {
1609         const uint16x8x2_t a01 = vld2q_u16(&above[base + 2 * c]);
1610         const uint16x8_t val =
1611             highbd_dr_z1_apply_shift_x8(a01.val[0], a01.val[1], shift);
1612         const uint16x8_t cmp =
1613             vcgtq_s16(vdupq_n_s16(max_base_x - base - 2 * c), iota2x8);
1614         const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max));
1615         vst1q_u16(dst + c, res);
1616         c += 8;
1617       } while (c < bw);
1618     }
1619 
1620     dst += stride;
1621     x += dx;
1622   } while (++r < bh);
1623 }
1624 
1625 // Directional prediction, zone 1: 0 < angle < 90
av1_highbd_dr_prediction_z1_neon(uint16_t * dst,ptrdiff_t stride,int bw,int bh,const uint16_t * above,const uint16_t * left,int upsample_above,int dx,int dy,int bd)1626 void av1_highbd_dr_prediction_z1_neon(uint16_t *dst, ptrdiff_t stride, int bw,
1627                                       int bh, const uint16_t *above,
1628                                       const uint16_t *left, int upsample_above,
1629                                       int dx, int dy, int bd) {
1630   (void)left;
1631   (void)dy;
1632   (void)bd;
1633   assert(dy == 1);
1634 
1635   if (upsample_above) {
1636     highbd_dr_prediction_z1_upsample1_neon(dst, stride, bw, bh, above, dx);
1637   } else {
1638     highbd_dr_prediction_z1_upsample0_neon(dst, stride, bw, bh, above, dx);
1639   }
1640 }
1641 
1642 // -----------------------------------------------------------------------------
1643 // Z2
1644 
1645 #if AOM_ARCH_AARCH64
1646 // Incrementally shift more elements from `above` into the result, merging with
1647 // existing `left` elements.
1648 // X0, X1, X2, X3
1649 // Y0, X0, X1, X2
1650 // Y0, Y1, X0, X1
1651 // Y0, Y1, Y2, X0
1652 // Y0, Y1, Y2, Y3
1653 // clang-format off
1654 static const uint8_t z2_merge_shuffles_u16x4[5][8] = {
1655   {  8,  9, 10, 11, 12, 13, 14, 15 },
1656   {  0,  1,  8,  9, 10, 11, 12, 13 },
1657   {  0,  1,  2,  3,  8,  9, 10, 11 },
1658   {  0,  1,  2,  3,  4,  5,  8,  9 },
1659   {  0,  1,  2,  3,  4,  5,  6,  7 },
1660 };
1661 // clang-format on
1662 
1663 // Incrementally shift more elements from `above` into the result, merging with
1664 // existing `left` elements.
1665 // X0, X1, X2, X3, X4, X5, X6, X7
1666 // Y0, X0, X1, X2, X3, X4, X5, X6
1667 // Y0, Y1, X0, X1, X2, X3, X4, X5
1668 // Y0, Y1, Y2, X0, X1, X2, X3, X4
1669 // Y0, Y1, Y2, Y3, X0, X1, X2, X3
1670 // Y0, Y1, Y2, Y3, Y4, X0, X1, X2
1671 // Y0, Y1, Y2, Y3, Y4, Y5, X0, X1
1672 // Y0, Y1, Y2, Y3, Y4, Y5, Y6, X0
1673 // Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7
1674 // clang-format off
1675 static const uint8_t z2_merge_shuffles_u16x8[9][16] = {
1676   { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
1677   {  0,  1, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
1678   {  0,  1,  2,  3, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 },
1679   {  0,  1,  2,  3,  4,  5, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 },
1680   {  0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23 },
1681   {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 16, 17, 18, 19, 20, 21 },
1682   {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 16, 17, 18, 19 },
1683   {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 16, 17 },
1684   {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
1685 };
1686 // clang-format on
1687 
1688 // clang-format off
1689 static const uint16_t z2_y_iter_masks_u16x4[5][4] = {
1690   {      0U,      0U,      0U,      0U },
1691   { 0xffffU,      0U,      0U,      0U },
1692   { 0xffffU, 0xffffU,      0U,      0U },
1693   { 0xffffU, 0xffffU, 0xffffU,      0U },
1694   { 0xffffU, 0xffffU, 0xffffU, 0xffffU },
1695 };
1696 // clang-format on
1697 
1698 // clang-format off
1699 static const uint16_t z2_y_iter_masks_u16x8[9][8] = {
1700   {      0U,      0U,      0U,      0U,      0U,      0U,      0U,      0U },
1701   { 0xffffU,      0U,      0U,      0U,      0U,      0U,      0U,      0U },
1702   { 0xffffU, 0xffffU,      0U,      0U,      0U,      0U,      0U,      0U },
1703   { 0xffffU, 0xffffU, 0xffffU,      0U,      0U,      0U,      0U,      0U },
1704   { 0xffffU, 0xffffU, 0xffffU, 0xffffU,      0U,      0U,      0U,      0U },
1705   { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU,      0U,      0U,      0U },
1706   { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU,      0U,      0U },
1707   { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU,      0U },
1708   { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU },
1709 };
1710 // clang-format on
1711 
highbd_dr_prediction_z2_tbl_left_x4_from_x8(const uint16x8_t left_data,const int16x4_t indices,int base,int n)1712 static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x8(
1713     const uint16x8_t left_data, const int16x4_t indices, int base, int n) {
1714   // Need to adjust indices to operate on 0-based indices rather than
1715   // `base`-based indices and then adjust from uint16x4 indices to uint8x8
1716   // indices so we can use a tbl instruction (which only operates on bytes).
1717   uint8x8_t left_indices =
1718       vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
1719   left_indices = vtrn1_u8(left_indices, left_indices);
1720   left_indices = vadd_u8(left_indices, left_indices);
1721   left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100)));
1722   const uint16x4_t ret = vreinterpret_u16_u8(
1723       vqtbl1_u8(vreinterpretq_u8_u16(left_data), left_indices));
1724   return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n]));
1725 }
1726 
highbd_dr_prediction_z2_tbl_left_x4_from_x16(const uint16x8x2_t left_data,const int16x4_t indices,int base,int n)1727 static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x16(
1728     const uint16x8x2_t left_data, const int16x4_t indices, int base, int n) {
1729   // Need to adjust indices to operate on 0-based indices rather than
1730   // `base`-based indices and then adjust from uint16x4 indices to uint8x8
1731   // indices so we can use a tbl instruction (which only operates on bytes).
1732   uint8x8_t left_indices =
1733       vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
1734   left_indices = vtrn1_u8(left_indices, left_indices);
1735   left_indices = vadd_u8(left_indices, left_indices);
1736   left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100)));
1737   uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]),
1738                              vreinterpretq_u8_u16(left_data.val[1]) } };
1739   const uint16x4_t ret = vreinterpret_u16_u8(vqtbl2_u8(data_u8, left_indices));
1740   return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n]));
1741 }
1742 
highbd_dr_prediction_z2_tbl_left_x8_from_x8(const uint16x8_t left_data,const int16x8_t indices,int base,int n)1743 static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x8(
1744     const uint16x8_t left_data, const int16x8_t indices, int base, int n) {
1745   // Need to adjust indices to operate on 0-based indices rather than
1746   // `base`-based indices and then adjust from uint16x4 indices to uint8x8
1747   // indices so we can use a tbl instruction (which only operates on bytes).
1748   uint8x16_t left_indices =
1749       vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
1750   left_indices = vtrn1q_u8(left_indices, left_indices);
1751   left_indices = vaddq_u8(left_indices, left_indices);
1752   left_indices =
1753       vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100)));
1754   const uint16x8_t ret = vreinterpretq_u16_u8(
1755       vqtbl1q_u8(vreinterpretq_u8_u16(left_data), left_indices));
1756   return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n]));
1757 }
1758 
highbd_dr_prediction_z2_tbl_left_x8_from_x16(const uint16x8x2_t left_data,const int16x8_t indices,int base,int n)1759 static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x16(
1760     const uint16x8x2_t left_data, const int16x8_t indices, int base, int n) {
1761   // Need to adjust indices to operate on 0-based indices rather than
1762   // `base`-based indices and then adjust from uint16x4 indices to uint8x8
1763   // indices so we can use a tbl instruction (which only operates on bytes).
1764   uint8x16_t left_indices =
1765       vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
1766   left_indices = vtrn1q_u8(left_indices, left_indices);
1767   left_indices = vaddq_u8(left_indices, left_indices);
1768   left_indices =
1769       vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100)));
1770   uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]),
1771                              vreinterpretq_u8_u16(left_data.val[1]) } };
1772   const uint16x8_t ret =
1773       vreinterpretq_u16_u8(vqtbl2q_u8(data_u8, left_indices));
1774   return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n]));
1775 }
1776 #endif  // AOM_ARCH_AARCH64
1777 
1778 // TODO(aomedia:349428506): enable this for armv7 after SIGBUS is fixed.
1779 #if AOM_ARCH_AARCH64
highbd_dr_prediction_z2_gather_left_x4(const uint16_t * left,const int16x4_t indices,int n)1780 static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4(
1781     const uint16_t *left, const int16x4_t indices, int n) {
1782   assert(n > 0);
1783   assert(n <= 4);
1784   // Load two elements at a time and then uzp them into separate vectors, to
1785   // reduce the number of memory accesses.
1786   uint32x2_t ret0_u32 = vdup_n_u32(0);
1787   uint32x2_t ret1_u32 = vdup_n_u32(0);
1788 
1789   // Use a single vget_lane_u64 to minimize vector to general purpose register
1790   // transfers and then mask off the bits we actually want.
1791   const uint64_t indices0123 = vget_lane_u64(vreinterpret_u64_s16(indices), 0);
1792   const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
1793   const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
1794   const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
1795   const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);
1796 
1797   // At time of writing both Clang and GCC produced better code with these
1798   // nested if-statements compared to a switch statement with fallthrough.
1799   load_unaligned_u32_2x1_lane(ret0_u32, left + idx0, 0);
1800   if (n > 1) {
1801     load_unaligned_u32_2x1_lane(ret0_u32, left + idx1, 1);
1802     if (n > 2) {
1803       load_unaligned_u32_2x1_lane(ret1_u32, left + idx2, 0);
1804       if (n > 3) {
1805         load_unaligned_u32_2x1_lane(ret1_u32, left + idx3, 1);
1806       }
1807     }
1808   }
1809   return vuzp_u16(vreinterpret_u16_u32(ret0_u32),
1810                   vreinterpret_u16_u32(ret1_u32));
1811 }
1812 
highbd_dr_prediction_z2_gather_left_x8(const uint16_t * left,const int16x8_t indices,int n)1813 static AOM_FORCE_INLINE uint16x8x2_t highbd_dr_prediction_z2_gather_left_x8(
1814     const uint16_t *left, const int16x8_t indices, int n) {
1815   assert(n > 0);
1816   assert(n <= 8);
1817   // Load two elements at a time and then uzp them into separate vectors, to
1818   // reduce the number of memory accesses.
1819   uint32x4_t ret0_u32 = vdupq_n_u32(0);
1820   uint32x4_t ret1_u32 = vdupq_n_u32(0);
1821 
1822   // Use a pair of vget_lane_u64 to minimize vector to general purpose register
1823   // transfers and then mask off the bits we actually want.
1824   const uint64_t indices0123 =
1825       vgetq_lane_u64(vreinterpretq_u64_s16(indices), 0);
1826   const uint64_t indices4567 =
1827       vgetq_lane_u64(vreinterpretq_u64_s16(indices), 1);
1828   const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
1829   const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
1830   const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
1831   const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);
1832   const int idx4 = (int16_t)((indices4567 >> 0) & 0xffffU);
1833   const int idx5 = (int16_t)((indices4567 >> 16) & 0xffffU);
1834   const int idx6 = (int16_t)((indices4567 >> 32) & 0xffffU);
1835   const int idx7 = (int16_t)((indices4567 >> 48) & 0xffffU);
1836 
1837   // At time of writing both Clang and GCC produced better code with these
1838   // nested if-statements compared to a switch statement with fallthrough.
1839   load_unaligned_u32_4x1_lane(ret0_u32, left + idx0, 0);
1840   if (n > 1) {
1841     load_unaligned_u32_4x1_lane(ret0_u32, left + idx1, 1);
1842     if (n > 2) {
1843       load_unaligned_u32_4x1_lane(ret0_u32, left + idx2, 2);
1844       if (n > 3) {
1845         load_unaligned_u32_4x1_lane(ret0_u32, left + idx3, 3);
1846         if (n > 4) {
1847           load_unaligned_u32_4x1_lane(ret1_u32, left + idx4, 0);
1848           if (n > 5) {
1849             load_unaligned_u32_4x1_lane(ret1_u32, left + idx5, 1);
1850             if (n > 6) {
1851               load_unaligned_u32_4x1_lane(ret1_u32, left + idx6, 2);
1852               if (n > 7) {
1853                 load_unaligned_u32_4x1_lane(ret1_u32, left + idx7, 3);
1854               }
1855             }
1856           }
1857         }
1858       }
1859     }
1860   }
1861   return vuzpq_u16(vreinterpretq_u16_u32(ret0_u32),
1862                    vreinterpretq_u16_u32(ret1_u32));
1863 }
1864 
highbd_dr_prediction_z2_merge_x4(uint16x4_t out_x,uint16x4_t out_y,int base_shift)1865 static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_merge_x4(
1866     uint16x4_t out_x, uint16x4_t out_y, int base_shift) {
1867   assert(base_shift >= 0);
1868   assert(base_shift <= 4);
1869   // On AArch64 we can permute the data from the `above` and `left` vectors
1870   // into a single vector in a single load (of the permute vector) + tbl.
1871 #if AOM_ARCH_AARCH64
1872   const uint8x8x2_t out_yx = { { vreinterpret_u8_u16(out_y),
1873                                  vreinterpret_u8_u16(out_x) } };
1874   return vreinterpret_u16_u8(
1875       vtbl2_u8(out_yx, vld1_u8(z2_merge_shuffles_u16x4[base_shift])));
1876 #else
1877   uint16x4_t out = out_y;
1878   for (int c2 = base_shift, x_idx = 0; c2 < 4; ++c2, ++x_idx) {
1879     out[c2] = out_x[x_idx];
1880   }
1881   return out;
1882 #endif
1883 }
1884 
highbd_dr_prediction_z2_merge_x8(uint16x8_t out_x,uint16x8_t out_y,int base_shift)1885 static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_merge_x8(
1886     uint16x8_t out_x, uint16x8_t out_y, int base_shift) {
1887   assert(base_shift >= 0);
1888   assert(base_shift <= 8);
1889   // On AArch64 we can permute the data from the `above` and `left` vectors
1890   // into a single vector in a single load (of the permute vector) + tbl.
1891 #if AOM_ARCH_AARCH64
1892   const uint8x16x2_t out_yx = { { vreinterpretq_u8_u16(out_y),
1893                                   vreinterpretq_u8_u16(out_x) } };
1894   return vreinterpretq_u16_u8(
1895       vqtbl2q_u8(out_yx, vld1q_u8(z2_merge_shuffles_u16x8[base_shift])));
1896 #else
1897   uint16x8_t out = out_y;
1898   for (int c2 = base_shift, x_idx = 0; c2 < 8; ++c2, ++x_idx) {
1899     out[c2] = out_x[x_idx];
1900   }
1901   return out;
1902 #endif
1903 }
1904 
highbd_dr_prediction_z2_apply_shift_x4(uint16x4_t a0,uint16x4_t a1,int16x4_t shift)1905 static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_apply_shift_x4(
1906     uint16x4_t a0, uint16x4_t a1, int16x4_t shift) {
1907   uint32x4_t res = vmull_u16(a1, vreinterpret_u16_s16(shift));
1908   res =
1909       vmlal_u16(res, a0, vsub_u16(vdup_n_u16(32), vreinterpret_u16_s16(shift)));
1910   return vrshrn_n_u32(res, 5);
1911 }
1912 
highbd_dr_prediction_z2_apply_shift_x8(uint16x8_t a0,uint16x8_t a1,int16x8_t shift)1913 static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_apply_shift_x8(
1914     uint16x8_t a0, uint16x8_t a1, int16x8_t shift) {
1915   return vcombine_u16(
1916       highbd_dr_prediction_z2_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1),
1917                                              vget_low_s16(shift)),
1918       highbd_dr_prediction_z2_apply_shift_x4(
1919           vget_high_u16(a0), vget_high_u16(a1), vget_high_s16(shift)));
1920 }
1921 
highbd_dr_prediction_z2_step_x4(const uint16_t * above,const uint16x4_t above0,const uint16x4_t above1,const uint16_t * left,int dx,int dy,int r,int c)1922 static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_step_x4(
1923     const uint16_t *above, const uint16x4_t above0, const uint16x4_t above1,
1924     const uint16_t *left, int dx, int dy, int r, int c) {
1925   const int16x4_t iota = vld1_s16(iota1_s16);
1926 
1927   const int x0 = (c << 6) - (r + 1) * dx;
1928   const int y0 = (r << 6) - (c + 1) * dy;
1929 
1930   const int16x4_t x0123 = vadd_s16(vdup_n_s16(x0), vshl_n_s16(iota, 6));
1931   const int16x4_t y0123 = vsub_s16(vdup_n_s16(y0), vmul_n_s16(iota, dy));
1932   const int16x4_t shift_x0123 =
1933       vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1);
1934   const int16x4_t shift_y0123 =
1935       vshr_n_s16(vand_s16(y0123, vdup_n_s16(0x3F)), 1);
1936   const int16x4_t base_y0123 = vshr_n_s16(y0123, 6);
1937 
1938   const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c;
1939 
1940   // Based on the value of `base_shift` there are three possible cases to
1941   // compute the result:
1942   // 1) base_shift <= 0: We can load and operate entirely on data from the
1943   //                     `above` input vector.
1944   // 2) base_shift < vl: We can load from `above[-1]` and shift
1945   //                     `vl - base_shift` elements across to the end of the
1946   //                     vector, then compute the remainder from `left`.
1947   // 3) base_shift >= vl: We can load and operate entirely on data from the
1948   //                      `left` input vector.
1949 
1950   if (base_shift <= 0) {
1951     const int base_x = x0 >> 6;
1952     const uint16x4_t a0 = vld1_u16(above + base_x);
1953     const uint16x4_t a1 = vld1_u16(above + base_x + 1);
1954     return highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
1955   } else if (base_shift < 4) {
1956     const uint16x4x2_t l01 = highbd_dr_prediction_z2_gather_left_x4(
1957         left + 1, base_y0123, base_shift);
1958     const uint16x4_t out16_y = highbd_dr_prediction_z2_apply_shift_x4(
1959         l01.val[0], l01.val[1], shift_y0123);
1960 
1961     // No need to reload from above in the loop, just use pre-loaded constants.
1962     const uint16x4_t out16_x =
1963         highbd_dr_prediction_z2_apply_shift_x4(above0, above1, shift_x0123);
1964 
1965     return highbd_dr_prediction_z2_merge_x4(out16_x, out16_y, base_shift);
1966   } else {
1967     const uint16x4x2_t l01 =
1968         highbd_dr_prediction_z2_gather_left_x4(left + 1, base_y0123, 4);
1969     return highbd_dr_prediction_z2_apply_shift_x4(l01.val[0], l01.val[1],
1970                                                   shift_y0123);
1971   }
1972 }
1973 
highbd_dr_prediction_z2_step_x8(const uint16_t * above,const uint16x8_t above0,const uint16x8_t above1,const uint16_t * left,int dx,int dy,int r,int c)1974 static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_step_x8(
1975     const uint16_t *above, const uint16x8_t above0, const uint16x8_t above1,
1976     const uint16_t *left, int dx, int dy, int r, int c) {
1977   const int16x8_t iota = vld1q_s16(iota1_s16);
1978 
1979   const int x0 = (c << 6) - (r + 1) * dx;
1980   const int y0 = (r << 6) - (c + 1) * dy;
1981 
1982   const int16x8_t x01234567 = vaddq_s16(vdupq_n_s16(x0), vshlq_n_s16(iota, 6));
1983   const int16x8_t y01234567 = vsubq_s16(vdupq_n_s16(y0), vmulq_n_s16(iota, dy));
1984   const int16x8_t shift_x01234567 =
1985       vshrq_n_s16(vandq_s16(x01234567, vdupq_n_s16(0x3F)), 1);
1986   const int16x8_t shift_y01234567 =
1987       vshrq_n_s16(vandq_s16(y01234567, vdupq_n_s16(0x3F)), 1);
1988   const int16x8_t base_y01234567 = vshrq_n_s16(y01234567, 6);
1989 
1990   const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c;
1991 
1992   // Based on the value of `base_shift` there are three possible cases to
1993   // compute the result:
1994   // 1) base_shift <= 0: We can load and operate entirely on data from the
1995   //                     `above` input vector.
1996   // 2) base_shift < vl: We can load from `above[-1]` and shift
1997   //                     `vl - base_shift` elements across to the end of the
1998   //                     vector, then compute the remainder from `left`.
1999   // 3) base_shift >= vl: We can load and operate entirely on data from the
2000   //                      `left` input vector.
2001 
2002   if (base_shift <= 0) {
2003     const int base_x = x0 >> 6;
2004     const uint16x8_t a0 = vld1q_u16(above + base_x);
2005     const uint16x8_t a1 = vld1q_u16(above + base_x + 1);
2006     return highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
2007   } else if (base_shift < 8) {
2008     const uint16x8x2_t l01 = highbd_dr_prediction_z2_gather_left_x8(
2009         left + 1, base_y01234567, base_shift);
2010     const uint16x8_t out16_y = highbd_dr_prediction_z2_apply_shift_x8(
2011         l01.val[0], l01.val[1], shift_y01234567);
2012 
2013     // No need to reload from above in the loop, just use pre-loaded constants.
2014     const uint16x8_t out16_x =
2015         highbd_dr_prediction_z2_apply_shift_x8(above0, above1, shift_x01234567);
2016 
2017     return highbd_dr_prediction_z2_merge_x8(out16_x, out16_y, base_shift);
2018   } else {
2019     const uint16x8x2_t l01 =
2020         highbd_dr_prediction_z2_gather_left_x8(left + 1, base_y01234567, 8);
2021     return highbd_dr_prediction_z2_apply_shift_x8(l01.val[0], l01.val[1],
2022                                                   shift_y01234567);
2023   }
2024 }
2025 
2026 // Left array is accessed from -1 through `bh - 1` inclusive.
2027 // Above array is accessed from -1 through `bw - 1` inclusive.
2028 #define HIGHBD_DR_PREDICTOR_Z2_WXH(bw, bh)                                 \
2029   static void highbd_dr_prediction_z2_##bw##x##bh##_neon(                  \
2030       uint16_t *dst, ptrdiff_t stride, const uint16_t *above,              \
2031       const uint16_t *left, int upsample_above, int upsample_left, int dx, \
2032       int dy, int bd) {                                                    \
2033     (void)bd;                                                              \
2034     (void)upsample_above;                                                  \
2035     (void)upsample_left;                                                   \
2036     assert(!upsample_above);                                               \
2037     assert(!upsample_left);                                                \
2038     assert(bw % 4 == 0);                                                   \
2039     assert(bh % 4 == 0);                                                   \
2040     assert(dx > 0);                                                        \
2041     assert(dy > 0);                                                        \
2042                                                                            \
2043     uint16_t left_data[bh + 1];                                            \
2044     memcpy(left_data, left - 1, (bh + 1) * sizeof(uint16_t));              \
2045                                                                            \
2046     uint16x8_t a0, a1;                                                     \
2047     if (bw == 4) {                                                         \
2048       a0 = vcombine_u16(vld1_u16(above - 1), vdup_n_u16(0));               \
2049       a1 = vcombine_u16(vld1_u16(above + 0), vdup_n_u16(0));               \
2050     } else {                                                               \
2051       a0 = vld1q_u16(above - 1);                                           \
2052       a1 = vld1q_u16(above + 0);                                           \
2053     }                                                                      \
2054                                                                            \
2055     int r = 0;                                                             \
2056     do {                                                                   \
2057       if (bw == 4) {                                                       \
2058         vst1_u16(dst, highbd_dr_prediction_z2_step_x4(                     \
2059                           above, vget_low_u16(a0), vget_low_u16(a1),       \
2060                           left_data, dx, dy, r, 0));                       \
2061       } else {                                                             \
2062         int c = 0;                                                         \
2063         do {                                                               \
2064           vst1q_u16(dst + c, highbd_dr_prediction_z2_step_x8(              \
2065                                  above, a0, a1, left_data, dx, dy, r, c)); \
2066           c += 8;                                                          \
2067         } while (c < bw);                                                  \
2068       }                                                                    \
2069       dst += stride;                                                       \
2070     } while (++r < bh);                                                    \
2071   }
2072 
2073 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2074 HIGHBD_DR_PREDICTOR_Z2_WXH(4, 16)
2075 HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16)
2076 HIGHBD_DR_PREDICTOR_Z2_WXH(8, 32)
2077 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 4)
2078 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8)
2079 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16)
2080 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32)
2081 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 64)
2082 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 8)
2083 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 16)
2084 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32)
2085 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64)
2086 HIGHBD_DR_PREDICTOR_Z2_WXH(64, 16)
2087 HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32)
2088 HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64)
2089 #else
2090 HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16)
2091 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8)
2092 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16)
2093 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32)
2094 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32)
2095 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64)
2096 HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32)
2097 HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64)
2098 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2099 
2100 #undef HIGHBD_DR_PREDICTOR_Z2_WXH
2101 
2102 typedef void (*highbd_dr_prediction_z2_ptr)(uint16_t *dst, ptrdiff_t stride,
2103                                             const uint16_t *above,
2104                                             const uint16_t *left,
2105                                             int upsample_above,
2106                                             int upsample_left, int dx, int dy,
2107                                             int bd);
2108 
highbd_dr_prediction_z2_4x4_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int upsample_above,int upsample_left,int dx,int dy,int bd)2109 static void highbd_dr_prediction_z2_4x4_neon(uint16_t *dst, ptrdiff_t stride,
2110                                              const uint16_t *above,
2111                                              const uint16_t *left,
2112                                              int upsample_above,
2113                                              int upsample_left, int dx, int dy,
2114                                              int bd) {
2115   (void)bd;
2116   assert(dx > 0);
2117   assert(dy > 0);
2118 
2119   const int frac_bits_x = 6 - upsample_above;
2120   const int frac_bits_y = 6 - upsample_left;
2121   const int min_base_x = -(1 << (upsample_above + frac_bits_x));
2122 
2123   // if `upsample_left` then we need -2 through 6 inclusive from `left`.
2124   // else we only need -1 through 3 inclusive.
2125 
2126 #if AOM_ARCH_AARCH64
2127   uint16x8_t left_data0, left_data1;
2128   if (upsample_left) {
2129     left_data0 = vld1q_u16(left - 2);
2130     left_data1 = vld1q_u16(left - 1);
2131   } else {
2132     left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0));
2133     left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0));
2134   }
2135 #endif
2136 
2137   const int16x4_t iota0123 = vld1_s16(iota1_s16);
2138   const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1);
2139 
2140   for (int r = 0; r < 4; ++r) {
2141     const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
2142     const int x0 = (r + 1) * dx;
2143     const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0));
2144     const int base_x0 = (-x0) >> frac_bits_x;
2145     if (base_shift <= 0) {
2146       uint16x4_t a0, a1;
2147       int16x4_t shift_x0123;
2148       if (upsample_above) {
2149         const uint16x4x2_t a01 = vld2_u16(above + base_x0);
2150         a0 = a01.val[0];
2151         a1 = a01.val[1];
2152         shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
2153       } else {
2154         a0 = vld1_u16(above + base_x0);
2155         a1 = vld1_u16(above + base_x0 + 1);
2156         shift_x0123 = vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1);
2157       }
2158       vst1_u16(dst,
2159                highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123));
2160     } else if (base_shift < 4) {
2161       // Calculate Y component from `left`.
2162       const int y_iters = base_shift;
2163       const int16x4_t y0123 =
2164           vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
2165       const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
2166       const int16x4_t shift_y0123 = vshr_n_s16(
2167           vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
2168       uint16x4_t l0, l1;
2169 #if AOM_ARCH_AARCH64
2170       const int left_data_base = upsample_left ? -2 : -1;
2171       l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123,
2172                                                        left_data_base, y_iters);
2173       l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123,
2174                                                        left_data_base, y_iters);
2175 #else
2176       const uint16x4x2_t l01 =
2177           highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters);
2178       l0 = l01.val[0];
2179       l1 = l01.val[1];
2180 #endif
2181 
2182       const uint16x4_t out_y =
2183           highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123);
2184 
2185       // Calculate X component from `above`.
2186       const int16x4_t shift_x0123 = vshr_n_s16(
2187           vand_s16(vmul_n_s16(x0123, 1 << upsample_above), vdup_n_s16(0x3F)),
2188           1);
2189       uint16x4_t a0, a1;
2190       if (upsample_above) {
2191         const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
2192         a0 = a01.val[0];
2193         a1 = a01.val[1];
2194       } else {
2195         a0 = vld1_u16(above - 1);
2196         a1 = vld1_u16(above + 0);
2197       }
2198       const uint16x4_t out_x =
2199           highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
2200 
2201       // Combine X and Y vectors.
2202       const uint16x4_t out =
2203           highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift);
2204       vst1_u16(dst, out);
2205     } else {
2206       const int16x4_t y0123 =
2207           vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
2208       const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
2209       const int16x4_t shift_y0123 = vshr_n_s16(
2210           vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
2211       uint16x4_t l0, l1;
2212 #if AOM_ARCH_AARCH64
2213       const int left_data_base = upsample_left ? -2 : -1;
2214       l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123,
2215                                                        left_data_base, 4);
2216       l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123,
2217                                                        left_data_base, 4);
2218 #else
2219       const uint16x4x2_t l01 =
2220           highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4);
2221       l0 = l01.val[0];
2222       l1 = l01.val[1];
2223 #endif
2224       vst1_u16(dst,
2225                highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123));
2226     }
2227     dst += stride;
2228   }
2229 }
2230 
highbd_dr_prediction_z2_4x8_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int upsample_above,int upsample_left,int dx,int dy,int bd)2231 static void highbd_dr_prediction_z2_4x8_neon(uint16_t *dst, ptrdiff_t stride,
2232                                              const uint16_t *above,
2233                                              const uint16_t *left,
2234                                              int upsample_above,
2235                                              int upsample_left, int dx, int dy,
2236                                              int bd) {
2237   (void)bd;
2238   assert(dx > 0);
2239   assert(dy > 0);
2240 
2241   const int frac_bits_x = 6 - upsample_above;
2242   const int frac_bits_y = 6 - upsample_left;
2243   const int min_base_x = -(1 << (upsample_above + frac_bits_x));
2244 
2245   // if `upsample_left` then we need -2 through 14 inclusive from `left`.
2246   // else we only need -1 through 6 inclusive.
2247 
2248 #if AOM_ARCH_AARCH64
2249   uint16x8x2_t left_data0, left_data1;
2250   if (upsample_left) {
2251     left_data0 = vld1q_u16_x2(left - 2);
2252     left_data1 = vld1q_u16_x2(left - 1);
2253   } else {
2254     left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } };
2255     left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } };
2256   }
2257 #endif
2258 
2259   const int16x4_t iota0123 = vld1_s16(iota1_s16);
2260   const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1);
2261 
2262   for (int r = 0; r < 8; ++r) {
2263     const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
2264     const int x0 = (r + 1) * dx;
2265     const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0));
2266     const int base_x0 = (-x0) >> frac_bits_x;
2267     if (base_shift <= 0) {
2268       uint16x4_t a0, a1;
2269       int16x4_t shift_x0123;
2270       if (upsample_above) {
2271         const uint16x4x2_t a01 = vld2_u16(above + base_x0);
2272         a0 = a01.val[0];
2273         a1 = a01.val[1];
2274         shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
2275       } else {
2276         a0 = vld1_u16(above + base_x0);
2277         a1 = vld1_u16(above + base_x0 + 1);
2278         shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F));
2279       }
2280       vst1_u16(dst,
2281                highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123));
2282     } else if (base_shift < 4) {
2283       // Calculate Y component from `left`.
2284       const int y_iters = base_shift;
2285       const int16x4_t y0123 =
2286           vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
2287       const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
2288       const int16x4_t shift_y0123 = vshr_n_s16(
2289           vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
2290 
2291       uint16x4_t l0, l1;
2292 #if AOM_ARCH_AARCH64
2293       const int left_data_base = upsample_left ? -2 : -1;
2294       l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(
2295           left_data0, base_y0123, left_data_base, y_iters);
2296       l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(
2297           left_data1, base_y0123, left_data_base, y_iters);
2298 #else
2299       const uint16x4x2_t l01 =
2300           highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters);
2301       l0 = l01.val[0];
2302       l1 = l01.val[1];
2303 #endif
2304 
2305       const uint16x4_t out_y =
2306           highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123);
2307 
2308       // Calculate X component from `above`.
2309       uint16x4_t a0, a1;
2310       int16x4_t shift_x0123;
2311       if (upsample_above) {
2312         const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
2313         a0 = a01.val[0];
2314         a1 = a01.val[1];
2315         shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
2316       } else {
2317         a0 = vld1_u16(above - 1);
2318         a1 = vld1_u16(above + 0);
2319         shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F));
2320       }
2321       const uint16x4_t out_x =
2322           highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
2323 
2324       // Combine X and Y vectors.
2325       const uint16x4_t out =
2326           highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift);
2327       vst1_u16(dst, out);
2328     } else {
2329       const int16x4_t y0123 =
2330           vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
2331       const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
2332       const int16x4_t shift_y0123 = vshr_n_s16(
2333           vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
2334 
2335       uint16x4_t l0, l1;
2336 #if AOM_ARCH_AARCH64
2337       const int left_data_base = upsample_left ? -2 : -1;
2338       l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data0, base_y0123,
2339                                                         left_data_base, 4);
2340       l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data1, base_y0123,
2341                                                         left_data_base, 4);
2342 #else
2343       const uint16x4x2_t l01 =
2344           highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4);
2345       l0 = l01.val[0];
2346       l1 = l01.val[1];
2347 #endif
2348 
2349       vst1_u16(dst,
2350                highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123));
2351     }
2352     dst += stride;
2353   }
2354 }
2355 
highbd_dr_prediction_z2_8x4_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int upsample_above,int upsample_left,int dx,int dy,int bd)2356 static void highbd_dr_prediction_z2_8x4_neon(uint16_t *dst, ptrdiff_t stride,
2357                                              const uint16_t *above,
2358                                              const uint16_t *left,
2359                                              int upsample_above,
2360                                              int upsample_left, int dx, int dy,
2361                                              int bd) {
2362   (void)bd;
2363   assert(dx > 0);
2364   assert(dy > 0);
2365 
2366   const int frac_bits_x = 6 - upsample_above;
2367   const int frac_bits_y = 6 - upsample_left;
2368   const int min_base_x = -(1 << (upsample_above + frac_bits_x));
2369 
2370   // if `upsample_left` then we need -2 through 6 inclusive from `left`.
2371   // else we only need -1 through 3 inclusive.
2372 
2373 #if AOM_ARCH_AARCH64
2374   uint16x8_t left_data0, left_data1;
2375   if (upsample_left) {
2376     left_data0 = vld1q_u16(left - 2);
2377     left_data1 = vld1q_u16(left - 1);
2378   } else {
2379     left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0));
2380     left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0));
2381   }
2382 #endif
2383 
2384   const int16x8_t iota01234567 = vld1q_s16(iota1_s16);
2385   const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1);
2386 
2387   for (int r = 0; r < 4; ++r) {
2388     const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
2389     const int x0 = (r + 1) * dx;
2390     const int16x8_t x01234567 =
2391         vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0));
2392     const int base_x0 = (-x0) >> frac_bits_x;
2393     if (base_shift <= 0) {
2394       uint16x8_t a0, a1;
2395       int16x8_t shift_x01234567;
2396       if (upsample_above) {
2397         const uint16x8x2_t a01 = vld2q_u16(above + base_x0);
2398         a0 = a01.val[0];
2399         a1 = a01.val[1];
2400         shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
2401       } else {
2402         a0 = vld1q_u16(above + base_x0);
2403         a1 = vld1q_u16(above + base_x0 + 1);
2404         shift_x01234567 =
2405             vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
2406       }
2407       vst1q_u16(
2408           dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567));
2409     } else if (base_shift < 8) {
2410       // Calculate Y component from `left`.
2411       const int y_iters = base_shift;
2412       const int16x8_t y01234567 =
2413           vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
2414       const int16x8_t base_y01234567 =
2415           vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
2416       const int16x8_t shift_y01234567 =
2417           vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
2418                                 vdupq_n_s16(0x3F)),
2419                       1);
2420 
2421       uint16x8_t l0, l1;
2422 #if AOM_ARCH_AARCH64
2423       const int left_data_base = upsample_left ? -2 : -1;
2424       l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
2425           left_data0, base_y01234567, left_data_base, y_iters);
2426       l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
2427           left_data1, base_y01234567, left_data_base, y_iters);
2428 #else
2429       const uint16x8x2_t l01 =
2430           highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters);
2431       l0 = l01.val[0];
2432       l1 = l01.val[1];
2433 #endif
2434 
2435       const uint16x8_t out_y =
2436           highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567);
2437 
2438       // Calculate X component from `above`.
2439       uint16x8_t a0, a1;
2440       int16x8_t shift_x01234567;
2441       if (upsample_above) {
2442         const uint16x8x2_t a01 =
2443             vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
2444         a0 = a01.val[0];
2445         a1 = a01.val[1];
2446         shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
2447       } else {
2448         a0 = vld1q_u16(above - 1);
2449         a1 = vld1q_u16(above + 0);
2450         shift_x01234567 =
2451             vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
2452       }
2453       const uint16x8_t out_x =
2454           highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
2455 
2456       // Combine X and Y vectors.
2457       const uint16x8_t out =
2458           highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift);
2459       vst1q_u16(dst, out);
2460     } else {
2461       const int16x8_t y01234567 =
2462           vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
2463       const int16x8_t base_y01234567 =
2464           vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
2465       const int16x8_t shift_y01234567 =
2466           vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
2467                                 vdupq_n_s16(0x3F)),
2468                       1);
2469 
2470       uint16x8_t l0, l1;
2471 #if AOM_ARCH_AARCH64
2472       const int left_data_base = upsample_left ? -2 : -1;
2473       l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
2474           left_data0, base_y01234567, left_data_base, 8);
2475       l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
2476           left_data1, base_y01234567, left_data_base, 8);
2477 #else
2478       const uint16x8x2_t l01 =
2479           highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
2480       l0 = l01.val[0];
2481       l1 = l01.val[1];
2482 #endif
2483 
2484       vst1q_u16(
2485           dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
2486     }
2487     dst += stride;
2488   }
2489 }
2490 
highbd_dr_prediction_z2_8x8_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int upsample_above,int upsample_left,int dx,int dy,int bd)2491 static void highbd_dr_prediction_z2_8x8_neon(uint16_t *dst, ptrdiff_t stride,
2492                                              const uint16_t *above,
2493                                              const uint16_t *left,
2494                                              int upsample_above,
2495                                              int upsample_left, int dx, int dy,
2496                                              int bd) {
2497   (void)bd;
2498   assert(dx > 0);
2499   assert(dy > 0);
2500 
2501   const int frac_bits_x = 6 - upsample_above;
2502   const int frac_bits_y = 6 - upsample_left;
2503   const int min_base_x = -(1 << (upsample_above + frac_bits_x));
2504 
2505   // if `upsample_left` then we need -2 through 14 inclusive from `left`.
2506   // else we only need -1 through 6 inclusive.
2507 
2508 #if AOM_ARCH_AARCH64
2509   uint16x8x2_t left_data0, left_data1;
2510   if (upsample_left) {
2511     left_data0 = vld1q_u16_x2(left - 2);
2512     left_data1 = vld1q_u16_x2(left - 1);
2513   } else {
2514     left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } };
2515     left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } };
2516   }
2517 #endif
2518 
2519   const int16x8_t iota01234567 = vld1q_s16(iota1_s16);
2520   const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1);
2521 
2522   for (int r = 0; r < 8; ++r) {
2523     const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
2524     const int x0 = (r + 1) * dx;
2525     const int16x8_t x01234567 =
2526         vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0));
2527     const int base_x0 = (-x0) >> frac_bits_x;
2528     if (base_shift <= 0) {
2529       uint16x8_t a0, a1;
2530       int16x8_t shift_x01234567;
2531       if (upsample_above) {
2532         const uint16x8x2_t a01 = vld2q_u16(above + base_x0);
2533         a0 = a01.val[0];
2534         a1 = a01.val[1];
2535         shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
2536       } else {
2537         a0 = vld1q_u16(above + base_x0);
2538         a1 = vld1q_u16(above + base_x0 + 1);
2539         shift_x01234567 =
2540             vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
2541       }
2542       vst1q_u16(
2543           dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567));
2544     } else if (base_shift < 8) {
2545       // Calculate Y component from `left`.
2546       const int y_iters = base_shift;
2547       const int16x8_t y01234567 =
2548           vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
2549       const int16x8_t base_y01234567 =
2550           vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
2551       const int16x8_t shift_y01234567 =
2552           vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
2553                                 vdupq_n_s16(0x3F)),
2554                       1);
2555 
2556       uint16x8_t l0, l1;
2557 #if AOM_ARCH_AARCH64
2558       const int left_data_base = upsample_left ? -2 : -1;
2559       l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
2560           left_data0, base_y01234567, left_data_base, y_iters);
2561       l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
2562           left_data1, base_y01234567, left_data_base, y_iters);
2563 #else
2564       const uint16x8x2_t l01 =
2565           highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters);
2566       l0 = l01.val[0];
2567       l1 = l01.val[1];
2568 #endif
2569 
2570       const uint16x8_t out_y =
2571           highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567);
2572 
2573       // Calculate X component from `above`.
2574       uint16x8_t a0, a1;
2575       int16x8_t shift_x01234567;
2576       if (upsample_above) {
2577         const uint16x8x2_t a01 =
2578             vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
2579         a0 = a01.val[0];
2580         a1 = a01.val[1];
2581         shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
2582       } else {
2583         a0 = vld1q_u16(above - 1);
2584         a1 = vld1q_u16(above + 0);
2585         shift_x01234567 =
2586             vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
2587       }
2588       const uint16x8_t out_x =
2589           highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
2590 
2591       // Combine X and Y vectors.
2592       const uint16x8_t out =
2593           highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift);
2594       vst1q_u16(dst, out);
2595     } else {
2596       const int16x8_t y01234567 =
2597           vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
2598       const int16x8_t base_y01234567 =
2599           vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
2600       const int16x8_t shift_y01234567 =
2601           vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
2602                                 vdupq_n_s16(0x3F)),
2603                       1);
2604 
2605       uint16x8_t l0, l1;
2606 #if AOM_ARCH_AARCH64
2607       const int left_data_base = upsample_left ? -2 : -1;
2608       l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
2609           left_data0, base_y01234567, left_data_base, 8);
2610       l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
2611           left_data1, base_y01234567, left_data_base, 8);
2612 #else
2613       const uint16x8x2_t l01 =
2614           highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
2615       l0 = l01.val[0];
2616       l1 = l01.val[1];
2617 #endif
2618 
2619       vst1q_u16(
2620           dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
2621     }
2622     dst += stride;
2623   }
2624 }
2625 
2626 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2627 static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = {
2628   { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2629   { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2630   { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon,
2631     &highbd_dr_prediction_z2_4x8_neon, &highbd_dr_prediction_z2_4x16_neon, NULL,
2632     NULL },
2633   { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon,
2634     &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon,
2635     &highbd_dr_prediction_z2_8x32_neon, NULL },
2636   { NULL, NULL, &highbd_dr_prediction_z2_16x4_neon,
2637     &highbd_dr_prediction_z2_16x8_neon, &highbd_dr_prediction_z2_16x16_neon,
2638     &highbd_dr_prediction_z2_16x32_neon, &highbd_dr_prediction_z2_16x64_neon },
2639   { NULL, NULL, NULL, &highbd_dr_prediction_z2_32x8_neon,
2640     &highbd_dr_prediction_z2_32x16_neon, &highbd_dr_prediction_z2_32x32_neon,
2641     &highbd_dr_prediction_z2_32x64_neon },
2642   { NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x16_neon,
2643     &highbd_dr_prediction_z2_64x32_neon, &highbd_dr_prediction_z2_64x64_neon },
2644 };
2645 #else
2646 static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = {
2647   { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2648   { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
2649   { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon,
2650     &highbd_dr_prediction_z2_4x8_neon, NULL, NULL, NULL },
2651   { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon,
2652     &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon, NULL,
2653     NULL },
2654   { NULL, NULL, NULL, &highbd_dr_prediction_z2_16x8_neon,
2655     &highbd_dr_prediction_z2_16x16_neon, &highbd_dr_prediction_z2_16x32_neon,
2656     NULL },
2657   { NULL, NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_32x32_neon,
2658     &highbd_dr_prediction_z2_32x64_neon },
2659   { NULL, NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x32_neon,
2660     &highbd_dr_prediction_z2_64x64_neon },
2661 };
2662 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
2663 
2664 // Directional prediction, zone 2: 90 < angle < 180
av1_highbd_dr_prediction_z2_neon(uint16_t * dst,ptrdiff_t stride,int bw,int bh,const uint16_t * above,const uint16_t * left,int upsample_above,int upsample_left,int dx,int dy,int bd)2665 void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw,
2666                                       int bh, const uint16_t *above,
2667                                       const uint16_t *left, int upsample_above,
2668                                       int upsample_left, int dx, int dy,
2669                                       int bd) {
2670   highbd_dr_prediction_z2_ptr f =
2671       dr_predictor_z2_arr_neon[get_msb(bw)][get_msb(bh)];
2672   assert(f != NULL);
2673   f(dst, stride, above, left, upsample_above, upsample_left, dx, dy, bd);
2674 }
2675 #endif  // AOM_ARCH_AARCH64
2676 
2677 // -----------------------------------------------------------------------------
2678 // Z3
2679 
2680 // Both the lane to the use and the shift amount must be immediates.
2681 #define HIGHBD_DR_PREDICTOR_Z3_STEP_X4(out, iota, base, in0, in1, s0, s1, \
2682                                        lane, shift)                       \
2683   do {                                                                    \
2684     uint32x4_t val = vmull_lane_u16((in0), (s0), (lane));                 \
2685     val = vmlal_lane_u16(val, (in1), (s1), (lane));                       \
2686     const uint16x4_t cmp = vadd_u16((iota), vdup_n_u16(base));            \
2687     const uint16x4_t res = vrshrn_n_u32(val, (shift));                    \
2688     *(out) = vbsl_u16(vclt_u16(cmp, vdup_n_u16(max_base_y)), res,         \
2689                       vdup_n_u16(left_max));                              \
2690   } while (0)
2691 
2692 #define HIGHBD_DR_PREDICTOR_Z3_STEP_X8(out, iota, base, in0, in1, s0, s1, \
2693                                        lane, shift)                       \
2694   do {                                                                    \
2695     uint32x4_t val_lo = vmull_lane_u16(vget_low_u16(in0), (s0), (lane));  \
2696     val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane));     \
2697     uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane)); \
2698     val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane));    \
2699     *(out) = vcombine_u16(vrshrn_n_u32(val_lo, (shift)),                  \
2700                           vrshrn_n_u32(val_hi, (shift)));                 \
2701   } while (0)
2702 
z3_load_left_neon(const uint16_t * left0,int ofs,int max_ofs)2703 static inline uint16x8x2_t z3_load_left_neon(const uint16_t *left0, int ofs,
2704                                              int max_ofs) {
2705   uint16x8_t r0;
2706   uint16x8_t r1;
2707   if (ofs + 7 >= max_ofs) {
2708     int shuffle_idx = max_ofs - ofs;
2709     r0 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
2710   } else {
2711     r0 = vld1q_u16(left0 + ofs);
2712   }
2713   if (ofs + 8 >= max_ofs) {
2714     int shuffle_idx = max_ofs - ofs - 1;
2715     r1 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
2716   } else {
2717     r1 = vld1q_u16(left0 + ofs + 1);
2718   }
2719   return (uint16x8x2_t){ { r0, r1 } };
2720 }
2721 
highbd_dr_prediction_z3_upsample0_neon(uint16_t * dst,ptrdiff_t stride,int bw,int bh,const uint16_t * left,int dy)2722 static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
2723                                                    ptrdiff_t stride, int bw,
2724                                                    int bh, const uint16_t *left,
2725                                                    int dy) {
2726   assert(bw % 4 == 0);
2727   assert(bh % 4 == 0);
2728   assert(dy > 0);
2729 
2730   // Factor out left + 1 to give the compiler a better chance of recognising
2731   // that the offsets used for the loads from left and left + 1 are otherwise
2732   // identical.
2733   const uint16_t *left1 = left + 1;
2734 
2735   const int max_base_y = (bw + bh - 1);
2736   const int left_max = left[max_base_y];
2737   const int frac_bits = 6;
2738 
2739   const uint16x8_t iota1x8 = vreinterpretq_u16_s16(vld1q_s16(iota1_s16));
2740   const uint16x4_t iota1x4 = vget_low_u16(iota1x8);
2741 
2742   // The C implementation of the z3 predictor when not upsampling uses:
2743   // ((y & 0x3f) >> 1)
2744   // The right shift is unnecessary here since we instead shift by +1 later,
2745   // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
2746   const uint16x4_t shift_mask = vdup_n_u16(0x3e);
2747 
2748   if (bh == 4) {
2749     int y = dy;
2750     int c = 0;
2751     do {
2752       // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
2753       // multiply instructions.
2754       const uint16x4_t shifts1 =
2755           vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
2756       const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
2757       const int base0 = (y + 0 * dy) >> frac_bits;
2758       const int base1 = (y + 1 * dy) >> frac_bits;
2759       const int base2 = (y + 2 * dy) >> frac_bits;
2760       const int base3 = (y + 3 * dy) >> frac_bits;
2761       uint16x4_t out[4];
2762       if (base0 >= max_base_y) {
2763         out[0] = vdup_n_u16(left_max);
2764       } else {
2765         const uint16x4_t l00 = vld1_u16(left + base0);
2766         const uint16x4_t l01 = vld1_u16(left1 + base0);
2767         HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota1x4, base0, l00, l01,
2768                                        shifts0, shifts1, 0, 6);
2769       }
2770       if (base1 >= max_base_y) {
2771         out[1] = vdup_n_u16(left_max);
2772       } else {
2773         const uint16x4_t l10 = vld1_u16(left + base1);
2774         const uint16x4_t l11 = vld1_u16(left1 + base1);
2775         HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota1x4, base1, l10, l11,
2776                                        shifts0, shifts1, 1, 6);
2777       }
2778       if (base2 >= max_base_y) {
2779         out[2] = vdup_n_u16(left_max);
2780       } else {
2781         const uint16x4_t l20 = vld1_u16(left + base2);
2782         const uint16x4_t l21 = vld1_u16(left1 + base2);
2783         HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota1x4, base2, l20, l21,
2784                                        shifts0, shifts1, 2, 6);
2785       }
2786       if (base3 >= max_base_y) {
2787         out[3] = vdup_n_u16(left_max);
2788       } else {
2789         const uint16x4_t l30 = vld1_u16(left + base3);
2790         const uint16x4_t l31 = vld1_u16(left1 + base3);
2791         HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota1x4, base3, l30, l31,
2792                                        shifts0, shifts1, 3, 6);
2793       }
2794       transpose_array_inplace_u16_4x4(out);
2795       for (int r2 = 0; r2 < 4; ++r2) {
2796         vst1_u16(dst + r2 * stride + c, out[r2]);
2797       }
2798       y += 4 * dy;
2799       c += 4;
2800     } while (c < bw);
2801   } else {
2802     int y = dy;
2803     int c = 0;
2804     do {
2805       int r = 0;
2806       do {
2807         // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
2808         // multiply instructions.
2809         const uint16x4_t shifts1 =
2810             vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
2811         const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
2812         const int base0 = ((y + 0 * dy) >> frac_bits) + r;
2813         const int base1 = ((y + 1 * dy) >> frac_bits) + r;
2814         const int base2 = ((y + 2 * dy) >> frac_bits) + r;
2815         const int base3 = ((y + 3 * dy) >> frac_bits) + r;
2816         uint16x8_t out[4];
2817         if (base0 >= max_base_y) {
2818           out[0] = vdupq_n_u16(left_max);
2819         } else {
2820           const uint16x8x2_t l0 = z3_load_left_neon(left, base0, max_base_y);
2821           HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l0.val[0],
2822                                          l0.val[1], shifts0, shifts1, 0, 6);
2823         }
2824         if (base1 >= max_base_y) {
2825           out[1] = vdupq_n_u16(left_max);
2826         } else {
2827           const uint16x8x2_t l1 = z3_load_left_neon(left, base1, max_base_y);
2828           HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l1.val[0],
2829                                          l1.val[1], shifts0, shifts1, 1, 6);
2830         }
2831         if (base2 >= max_base_y) {
2832           out[2] = vdupq_n_u16(left_max);
2833         } else {
2834           const uint16x8x2_t l2 = z3_load_left_neon(left, base2, max_base_y);
2835           HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l2.val[0],
2836                                          l2.val[1], shifts0, shifts1, 2, 6);
2837         }
2838         if (base3 >= max_base_y) {
2839           out[3] = vdupq_n_u16(left_max);
2840         } else {
2841           const uint16x8x2_t l3 = z3_load_left_neon(left, base3, max_base_y);
2842           HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l3.val[0],
2843                                          l3.val[1], shifts0, shifts1, 3, 6);
2844         }
2845         transpose_array_inplace_u16_4x8(out);
2846         for (int r2 = 0; r2 < 4; ++r2) {
2847           vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2]));
2848         }
2849         for (int r2 = 0; r2 < 4; ++r2) {
2850           vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2]));
2851         }
2852         r += 8;
2853       } while (r < bh);
2854       y += 4 * dy;
2855       c += 4;
2856     } while (c < bw);
2857   }
2858 }
2859 
highbd_dr_prediction_z3_upsample1_neon(uint16_t * dst,ptrdiff_t stride,int bw,int bh,const uint16_t * left,int dy)2860 static void highbd_dr_prediction_z3_upsample1_neon(uint16_t *dst,
2861                                                    ptrdiff_t stride, int bw,
2862                                                    int bh, const uint16_t *left,
2863                                                    int dy) {
2864   assert(bw % 4 == 0);
2865   assert(bh % 4 == 0);
2866   assert(dy > 0);
2867 
2868   const int max_base_y = (bw + bh - 1) << 1;
2869   const int left_max = left[max_base_y];
2870   const int frac_bits = 5;
2871 
2872   const uint16x4_t iota1x4 = vreinterpret_u16_s16(vld1_s16(iota1_s16));
2873   const uint16x8_t iota2x8 = vreinterpretq_u16_s16(vld1q_s16(iota2_s16));
2874   const uint16x4_t iota2x4 = vget_low_u16(iota2x8);
2875 
2876   // The C implementation of the z3 predictor when upsampling uses:
2877   // (((x << 1) & 0x3f) >> 1)
2878   // The two shifts are unnecessary here since the lowest bit is guaranteed to
2879   // be zero when the mask is applied, so adjust the mask to 0x1f to avoid
2880   // needing the shifts at all.
2881   const uint16x4_t shift_mask = vdup_n_u16(0x1F);
2882 
2883   if (bh == 4) {
2884     int y = dy;
2885     int c = 0;
2886     do {
2887       // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
2888       // multiply instructions.
2889       const uint16x4_t shifts1 =
2890           vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
2891       const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1);
2892       const int base0 = (y + 0 * dy) >> frac_bits;
2893       const int base1 = (y + 1 * dy) >> frac_bits;
2894       const int base2 = (y + 2 * dy) >> frac_bits;
2895       const int base3 = (y + 3 * dy) >> frac_bits;
2896       const uint16x4x2_t l0 = vld2_u16(left + base0);
2897       const uint16x4x2_t l1 = vld2_u16(left + base1);
2898       const uint16x4x2_t l2 = vld2_u16(left + base2);
2899       const uint16x4x2_t l3 = vld2_u16(left + base3);
2900       uint16x4_t out[4];
2901       HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota2x4, base0, l0.val[0],
2902                                      l0.val[1], shifts0, shifts1, 0, 5);
2903       HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota2x4, base1, l1.val[0],
2904                                      l1.val[1], shifts0, shifts1, 1, 5);
2905       HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota2x4, base2, l2.val[0],
2906                                      l2.val[1], shifts0, shifts1, 2, 5);
2907       HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota2x4, base3, l3.val[0],
2908                                      l3.val[1], shifts0, shifts1, 3, 5);
2909       transpose_array_inplace_u16_4x4(out);
2910       for (int r2 = 0; r2 < 4; ++r2) {
2911         vst1_u16(dst + r2 * stride + c, out[r2]);
2912       }
2913       y += 4 * dy;
2914       c += 4;
2915     } while (c < bw);
2916   } else {
2917     assert(bh % 8 == 0);
2918 
2919     int y = dy;
2920     int c = 0;
2921     do {
2922       int r = 0;
2923       do {
2924         // Fully unroll the 4x8 block to allow us to use immediate lane-indexed
2925         // multiply instructions.
2926         const uint16x4_t shifts1 =
2927             vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
2928         const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1);
2929         const int base0 = ((y + 0 * dy) >> frac_bits) + (r * 2);
2930         const int base1 = ((y + 1 * dy) >> frac_bits) + (r * 2);
2931         const int base2 = ((y + 2 * dy) >> frac_bits) + (r * 2);
2932         const int base3 = ((y + 3 * dy) >> frac_bits) + (r * 2);
2933         const uint16x8x2_t l0 = vld2q_u16(left + base0);
2934         const uint16x8x2_t l1 = vld2q_u16(left + base1);
2935         const uint16x8x2_t l2 = vld2q_u16(left + base2);
2936         const uint16x8x2_t l3 = vld2q_u16(left + base3);
2937         uint16x8_t out[4];
2938         HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota2x8, base0, l0.val[0],
2939                                        l0.val[1], shifts0, shifts1, 0, 5);
2940         HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota2x8, base1, l1.val[0],
2941                                        l1.val[1], shifts0, shifts1, 1, 5);
2942         HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota2x8, base2, l2.val[0],
2943                                        l2.val[1], shifts0, shifts1, 2, 5);
2944         HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota2x8, base3, l3.val[0],
2945                                        l3.val[1], shifts0, shifts1, 3, 5);
2946         transpose_array_inplace_u16_4x8(out);
2947         for (int r2 = 0; r2 < 4; ++r2) {
2948           vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2]));
2949         }
2950         for (int r2 = 0; r2 < 4; ++r2) {
2951           vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2]));
2952         }
2953         r += 8;
2954       } while (r < bh);
2955       y += 4 * dy;
2956       c += 4;
2957     } while (c < bw);
2958   }
2959 }
2960 
2961 // Directional prediction, zone 3: 180 < angle < 270
av1_highbd_dr_prediction_z3_neon(uint16_t * dst,ptrdiff_t stride,int bw,int bh,const uint16_t * above,const uint16_t * left,int upsample_left,int dx,int dy,int bd)2962 void av1_highbd_dr_prediction_z3_neon(uint16_t *dst, ptrdiff_t stride, int bw,
2963                                       int bh, const uint16_t *above,
2964                                       const uint16_t *left, int upsample_left,
2965                                       int dx, int dy, int bd) {
2966   (void)above;
2967   (void)dx;
2968   (void)bd;
2969   assert(bw % 4 == 0);
2970   assert(bh % 4 == 0);
2971   assert(dx == 1);
2972   assert(dy > 0);
2973 
2974   if (upsample_left) {
2975     highbd_dr_prediction_z3_upsample1_neon(dst, stride, bw, bh, left, dy);
2976   } else {
2977     highbd_dr_prediction_z3_upsample0_neon(dst, stride, bw, bh, left, dy);
2978   }
2979 }
2980 
2981 #undef HIGHBD_DR_PREDICTOR_Z3_STEP_X4
2982 #undef HIGHBD_DR_PREDICTOR_Z3_STEP_X8
2983