/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "sum_neon.h"
#include "vpx/vpx_integer.h"

//------------------------------------------------------------------------------
// DC 4x4

static INLINE uint16_t dc_sum_4(const uint16_t *ref) {
  const uint16x4_t ref_u16 = vld1_u16(ref);
  return horizontal_add_uint16x4(ref_u16);
}

static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
                                const uint16x4_t dc) {
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
    vst1_u16(dst, dc);
  }
}

void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  const uint16x4_t a = vld1_u16(above);
  const uint16x4_t l = vld1_u16(left);
  const uint16_t sum = horizontal_add_uint16x4(vadd_u16(a, l));
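  // dc = (above[0..3] + left[0..3] + 4) >> 3: the rounded average of the
  // eight border samples.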
  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 3);
  (void)bd;
  dc_store_4x4(dst, stride, dc);
}

void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const uint16_t sum = dc_sum_4(left);
  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2);
  (void)above;
  (void)bd;
  dc_store_4x4(dst, stride, dc);
}

void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16_t sum = dc_sum_4(above);
  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2);
  (void)left;
  (void)bd;
  dc_store_4x4(dst, stride, dc);
}

void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
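  // Fill the block with the mid-range value for this bit depth, e.g. 512 for
  // 10-bit input (the high bit depth equivalent of 128).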
  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
  (void)above;
  (void)left;
  dc_store_4x4(dst, stride, dc);
}

//------------------------------------------------------------------------------
// DC 8x8

static INLINE uint16_t dc_sum_8(const uint16_t *ref) {
  const uint16x8_t ref_u16 = vld1q_u16(ref);
  return horizontal_add_uint16x8(ref_u16);
}

static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
                                const uint16x8_t dc) {
  int i;
  for (i = 0; i < 8; ++i, dst += stride) {
    vst1q_u16(dst, dc);
  }
}

void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  const uint16x8_t above_u16 = vld1q_u16(above);
  const uint16x8_t left_u16 = vld1q_u16(left);
  const uint16x8_t p0 = vaddq_u16(above_u16, left_u16);
  const uint16_t sum = horizontal_add_uint16x8(p0);
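  // dc = (sum of the 16 border samples + 8) >> 4.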
  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4);
  (void)bd;
  dc_store_8x8(dst, stride, dc);
}

void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const uint16_t sum = dc_sum_8(left);
  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3);
  (void)above;
  (void)bd;
  dc_store_8x8(dst, stride, dc);
}

void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16_t sum = dc_sum_8(above);
  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3);
  (void)left;
  (void)bd;
  dc_store_8x8(dst, stride, dc);
}

void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1));
  (void)above;
  (void)left;
  dc_store_8x8(dst, stride, dc);
}

//------------------------------------------------------------------------------
// DC 16x16

static INLINE uint16_t dc_sum_16(const uint16_t *ref) {
  const uint16x8_t ref_u16_0 = vld1q_u16(ref + 0);
  const uint16x8_t ref_u16_1 = vld1q_u16(ref + 8);
  const uint16x8_t p0 = vaddq_u16(ref_u16_0, ref_u16_1);
  return horizontal_add_uint16x8(p0);
}

static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
                                  const uint16x8_t dc) {
  int i;
  for (i = 0; i < 16; ++i, dst += stride) {
    vst1q_u16(dst + 0, dc);
    vst1q_u16(dst + 8, dc);
  }
}

void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const uint16x8_t a0 = vld1q_u16(above + 0);
  const uint16x8_t a1 = vld1q_u16(above + 8);
  const uint16x8_t l0 = vld1q_u16(left + 0);
  const uint16x8_t l1 = vld1q_u16(left + 8);
  const uint16x8_t pa = vaddq_u16(a0, a1);
  const uint16x8_t pl = vaddq_u16(l0, l1);
  const uint16x8_t pal0 = vaddq_u16(pa, pl);
  const uint32_t sum = horizontal_add_uint16x8(pal0);
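  // The 32-sample sum can exceed 16 bits for 12-bit input, so perform the
  // rounding shift in 32 bits and narrow back down to 16 bits.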
  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
  (void)bd;
  dc_store_16x16(dst, stride, dc);
}

void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
  const uint16_t sum = dc_sum_16(left);
  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4);
  (void)above;
  (void)bd;
  dc_store_16x16(dst, stride, dc);
}

void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const uint16_t sum = dc_sum_16(above);
  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4);
  (void)left;
  (void)bd;
  dc_store_16x16(dst, stride, dc);
}

void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1));
  (void)above;
  (void)left;
  dc_store_16x16(dst, stride, dc);
}

//------------------------------------------------------------------------------
// DC 32x32

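// The sum of 32 reference samples can exceed 16 bits for 12-bit input, so
// accumulate and return it as a 32-bit value.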
static INLINE uint32_t dc_sum_32(const uint16_t *ref) {
  const uint16x8_t r0 = vld1q_u16(ref + 0);
  const uint16x8_t r1 = vld1q_u16(ref + 8);
  const uint16x8_t r2 = vld1q_u16(ref + 16);
  const uint16x8_t r3 = vld1q_u16(ref + 24);
  const uint16x8_t p0 = vaddq_u16(r0, r1);
  const uint16x8_t p1 = vaddq_u16(r2, r3);
  const uint16x8_t p2 = vaddq_u16(p0, p1);
  return horizontal_add_uint16x8(p2);
}

static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
                                  const uint16x8_t dc) {
  int i;
  for (i = 0; i < 32; ++i) {
    vst1q_u16(dst + 0, dc);
    vst1q_u16(dst + 8, dc);
    vst1q_u16(dst + 16, dc);
    vst1q_u16(dst + 24, dc);
    dst += stride;
  }
}

void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const uint16x8_t a0 = vld1q_u16(above + 0);
  const uint16x8_t a1 = vld1q_u16(above + 8);
  const uint16x8_t a2 = vld1q_u16(above + 16);
  const uint16x8_t a3 = vld1q_u16(above + 24);
  const uint16x8_t l0 = vld1q_u16(left + 0);
  const uint16x8_t l1 = vld1q_u16(left + 8);
  const uint16x8_t l2 = vld1q_u16(left + 16);
  const uint16x8_t l3 = vld1q_u16(left + 24);
  const uint16x8_t pa0 = vaddq_u16(a0, a1);
  const uint16x8_t pa1 = vaddq_u16(a2, a3);
  const uint16x8_t pl0 = vaddq_u16(l0, l1);
  const uint16x8_t pl1 = vaddq_u16(l2, l3);
  const uint16x8_t pa = vaddq_u16(pa0, pa1);
  const uint16x8_t pl = vaddq_u16(pl0, pl1);
  const uint16x8_t pal0 = vaddq_u16(pa, pl);
  const uint32_t sum = horizontal_add_uint16x8(pal0);
  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 6), 0);
  (void)bd;
  dc_store_32x32(dst, stride, dc);
}

void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
  const uint32_t sum = dc_sum_32(left);
  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
  (void)above;
  (void)bd;
  dc_store_32x32(dst, stride, dc);
}

void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const uint32_t sum = dc_sum_32(above);
  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
  (void)left;
  (void)bd;
  dc_store_32x32(dst, stride, dc);
}

void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1));
  (void)above;
  (void)left;
  dc_store_32x32(dst, stride, dc);
}

// -----------------------------------------------------------------------------

void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  uint16x8_t a0, a1, a2, d0;
  uint16_t a7;
  (void)left;
  (void)bd;

  a0 = vld1q_u16(above);
  a7 = above[7];

  // [ above[1], ..., above[6], x, x ]
  a1 = vextq_u16(a0, a0, 1);
  // [ above[2], ..., above[7], x, x ]
  a2 = vextq_u16(a0, a0, 2);

  // d0[0] = AVG3(above[0], above[1], above[2]);
  // ...
  // d0[5] = AVG3(above[5], above[6], above[7]);
  // d0[6] = x (don't care)
  // d0[7] = x (don't care)
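  // AVG3(a, b, c) = (a + 2 * b + c + 2) >> 2, computed exactly as a halving
  // add of the two outer taps followed by a rounding halving add with the
  // centre tap.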
  d0 = vrhaddq_u16(vhaddq_u16(a0, a2), a1);

  // We want:
  // stride=0 [ d0[0], d0[1], d0[2],    d0[3] ]
  // stride=1 [ d0[1], d0[2], d0[3],    d0[4] ]
  // stride=2 [ d0[2], d0[3], d0[4],    d0[5] ]
  // stride=3 [ d0[3], d0[4], d0[5], above[7] ]
  vst1_u16(dst + 0 * stride, vget_low_u16(d0));
  vst1_u16(dst + 1 * stride, vget_low_u16(vextq_u16(d0, d0, 1)));
  vst1_u16(dst + 2 * stride, vget_low_u16(vextq_u16(d0, d0, 2)));
  vst1_u16(dst + 3 * stride, vget_low_u16(vextq_u16(d0, d0, 3)));

  // We stored d0[6] above, so fix it up to above[7] here.
  dst[3 * stride + 3] = a7;
}

void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  uint16x8_t ax0, a0, a1, a7, d0;
  (void)left;
  (void)bd;

  a0 = vld1q_u16(above + 0);
  a1 = vld1q_u16(above + 1);
  a7 = vld1q_dup_u16(above + 7);

  // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can
  // shift in above[7] later, so shift a0 across by one to get the right
  // inputs:
  // [ x, above[0], ... , above[6] ]
  ax0 = vextq_u16(a0, a0, 7);

  // d0[0] = x (don't care)
  // d0[1] = AVG3(above[0], above[1], above[2]);
  // ...
  // d0[7] = AVG3(above[6], above[7], above[8]);
  d0 = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);

  // Undo the earlier ext, incrementally shift in duplicates of above[7].
  vst1q_u16(dst + 0 * stride, vextq_u16(d0, a7, 1));
  vst1q_u16(dst + 1 * stride, vextq_u16(d0, a7, 2));
  vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 3));
  vst1q_u16(dst + 3 * stride, vextq_u16(d0, a7, 4));
  vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 5));
  vst1q_u16(dst + 5 * stride, vextq_u16(d0, a7, 6));
  vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 7));
  vst1q_u16(dst + 7 * stride, a7);
}

void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  uint16x8_t ax0, a0, a1, a7, a8, a9, a15, d0[2];
  (void)left;
  (void)bd;

  a0 = vld1q_u16(above + 0);
  a1 = vld1q_u16(above + 1);
  a7 = vld1q_u16(above + 7);
  a8 = vld1q_u16(above + 8);
  a9 = vld1q_u16(above + 9);
  a15 = vld1q_dup_u16(above + 15);

  // [ x, above[0], ... , above[6] ]
  ax0 = vextq_u16(a0, a0, 7);

  // We have one unused lane here to leave room to shift in above[15] in the
  // last lane:
  // d0[0][0] = x (don't care)
  // d0[0][1] = AVG3(above[0], above[1], above[2]);
  // ...
  // d0[0][7] = AVG3(above[6], above[7], above[8]);
  // d0[1][0] = AVG3(above[7], above[8], above[9]);
  // ...
  // d0[1][7] = AVG3(above[14], above[15], above[16]);
  d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);
  d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8);

  // Incrementally shift in duplicates of above[15].
  vst1q_u16(dst + 0 * stride + 0, vextq_u16(d0[0], d0[1], 1));
  vst1q_u16(dst + 0 * stride + 8, vextq_u16(d0[1], a15, 1));
  vst1q_u16(dst + 1 * stride + 0, vextq_u16(d0[0], d0[1], 2));
  vst1q_u16(dst + 1 * stride + 8, vextq_u16(d0[1], a15, 2));
  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 3));
  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], a15, 3));
  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d0[0], d0[1], 4));
  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d0[1], a15, 4));
  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 5));
  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], a15, 5));
  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d0[0], d0[1], 6));
  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d0[1], a15, 6));
  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 7));
  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], a15, 7));
  vst1q_u16(dst + 7 * stride + 0, d0[1]);
  vst1q_u16(dst + 7 * stride + 8, a15);

  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[1], a15, 1));
  vst1q_u16(dst + 8 * stride + 8, a15);
  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d0[1], a15, 2));
  vst1q_u16(dst + 9 * stride + 8, a15);
  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[1], a15, 3));
  vst1q_u16(dst + 10 * stride + 8, a15);
  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d0[1], a15, 4));
  vst1q_u16(dst + 11 * stride + 8, a15);
  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[1], a15, 5));
  vst1q_u16(dst + 12 * stride + 8, a15);
  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d0[1], a15, 6));
  vst1q_u16(dst + 13 * stride + 8, a15);
  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[1], a15, 7));
  vst1q_u16(dst + 14 * stride + 8, a15);
  vst1q_u16(dst + 15 * stride + 0, a15);
  vst1q_u16(dst + 15 * stride + 8, a15);
}

void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  uint16x8_t ax0, a0, a1, a7, a8, a9, a15, a16, a17, a23, a24, a25, a31, d0[4];
  int i;
  (void)left;
  (void)bd;

  a0 = vld1q_u16(above + 0);
  a1 = vld1q_u16(above + 1);
  a7 = vld1q_u16(above + 7);
  a8 = vld1q_u16(above + 8);
  a9 = vld1q_u16(above + 9);
  a15 = vld1q_u16(above + 15);
  a16 = vld1q_u16(above + 16);
  a17 = vld1q_u16(above + 17);
  a23 = vld1q_u16(above + 23);
  a24 = vld1q_u16(above + 24);
  a25 = vld1q_u16(above + 25);
  a31 = vld1q_dup_u16(above + 31);

  // [ x, above[0], ... , above[6] ]
  ax0 = vextq_u16(a0, a0, 7);

  d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);
  d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8);
  d0[2] = vrhaddq_u16(vhaddq_u16(a15, a17), a16);
  d0[3] = vrhaddq_u16(vhaddq_u16(a23, a25), a24);

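  // Each iteration shifts the 32 AVG3 results across by one lane (the first
  // shift also discards the unused lane 0 of d0[0]), back-fills with
  // above[31] and then stores a full row.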
  for (i = 0; i < 32; ++i) {
    d0[0] = vextq_u16(d0[0], d0[1], 1);
    d0[1] = vextq_u16(d0[1], d0[2], 1);
    d0[2] = vextq_u16(d0[2], d0[3], 1);
    d0[3] = vextq_u16(d0[3], a31, 1);
    vst1q_u16(dst + 0, d0[0]);
    vst1q_u16(dst + 8, d0[1]);
    vst1q_u16(dst + 16, d0[2]);
    vst1q_u16(dst + 24, d0[3]);
    dst += stride;
  }
}

// -----------------------------------------------------------------------------

void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  uint16x4_t a0, a1, a2, a3, d0, d1, d2, d3;
  (void)left;
  (void)bd;

  a0 = vld1_u16(above + 0);
  a1 = vld1_u16(above + 1);
  a2 = vld1_u16(above + 2);
  a3 = vld1_u16(above + 3);

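  // Even rows are AVG2(above[i], above[i + 1]) = (a + b + 1) >> 1 and odd
  // rows are AVG3, with each pair of rows starting one sample further along
  // the above row: d0/d1 form rows 0/1 and d2/d3 form rows 2/3.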
  d0 = vrhadd_u16(a0, a1);
  d1 = vrhadd_u16(vhadd_u16(a0, a2), a1);
  d2 = vrhadd_u16(a1, a2);
  d3 = vrhadd_u16(vhadd_u16(a1, a3), a2);

  // Note that here we are performing a full avg calculation for the final
  // elements rather than storing a duplicate of above[3], which differs
  // (correctly) from the general scheme employed by the bs={8,16,32}
  // implementations in order to match the original C implementation.
  vst1_u16(dst + 0 * stride, d0);
  vst1_u16(dst + 1 * stride, d1);
  vst1_u16(dst + 2 * stride, d2);
  vst1_u16(dst + 3 * stride, d3);
}

void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  uint16x8_t a0, a1, a2, a7, d0, d1, d0_ext, d1_ext;
  (void)left;
  (void)bd;

  a0 = vld1q_u16(above + 0);
  a1 = vld1q_u16(above + 1);
  a2 = vld1q_u16(above + 2);
  a7 = vld1q_dup_u16(above + 7);

  d0 = vrhaddq_u16(a0, a1);
  d1 = vrhaddq_u16(vhaddq_u16(a0, a2), a1);

  // We want to store:
  // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ]
  // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ]
  // stride=2 [ d0[1], d0[2], d0[3], d0[4], d0[5], d0[6],  a[7],  a[7] ]
  // stride=3 [ d1[1], d1[2], d1[3], d1[4], d1[5], d1[6],  a[7],  a[7] ]
  // stride=4 [ d0[2], d0[3], d0[4], d0[5], d0[6],  a[7],  a[7],  a[7] ]
  // stride=5 [ d1[2], d1[3], d1[4], d1[5], d1[6],  a[7],  a[7],  a[7] ]
  // stride=6 [ d0[3], d0[4], d0[5], d0[6],  a[7],  a[7],  a[7],  a[7] ]
  // stride=7 [ d1[3], d1[4], d1[5], d1[6],  a[7],  a[7],  a[7],  a[7] ]
  // Note in particular that d0[7] and d1[7] are only ever referenced in the
  // stride=0 and stride=1 cases respectively; in later strides they are
  // replaced by a copy of above[7]. These would be equivalent if
  // above[i] == above[7] for i > 7, however that is not always the case.

  // Strip out d0[7] and d1[7] so that we can replace them with an additional
  // copy of above[7]. The first vector here doesn't matter, so just reuse
  // d0/d1.
  d0_ext = vextq_u16(d0, d0, 7);
  d1_ext = vextq_u16(d1, d1, 7);

  // Shuffle in duplicates of above[7] and store.
  vst1q_u16(dst + 0 * stride, d0);
  vst1q_u16(dst + 1 * stride, d1);
  vst1q_u16(dst + 2 * stride, vextq_u16(d0_ext, a7, 2));
  vst1q_u16(dst + 3 * stride, vextq_u16(d1_ext, a7, 2));
  vst1q_u16(dst + 4 * stride, vextq_u16(d0_ext, a7, 3));
  vst1q_u16(dst + 5 * stride, vextq_u16(d1_ext, a7, 3));
  vst1q_u16(dst + 6 * stride, vextq_u16(d0_ext, a7, 4));
  vst1q_u16(dst + 7 * stride, vextq_u16(d1_ext, a7, 4));
}

void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation.
  uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0[2], d1[2], d0_ext, d1_ext;
  (void)left;
  (void)bd;

  a0 = vld1q_u16(above + 0);
  a1 = vld1q_u16(above + 1);
  a2 = vld1q_u16(above + 2);
  a8 = vld1q_u16(above + 8);
  a9 = vld1q_u16(above + 9);
  a10 = vld1q_u16(above + 10);
  a15 = vld1q_dup_u16(above + 15);

  d0[0] = vrhaddq_u16(a0, a1);
  d0[1] = vrhaddq_u16(a8, a9);
  d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
  d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9);

  // Strip out the final element of d0/d1 so that we can replace it with an
  // additional copy of above[15]. The first vector here doesn't matter, so
  // just reuse the same vector.
  d0_ext = vextq_u16(d0[1], d0[1], 7);
  d1_ext = vextq_u16(d1[1], d1[1], 7);

  // Shuffle in duplicates of above[15] and store. Note that cases involving
  // {d0,d1}_ext require an extra shift to undo the shifting out of the final
  // element from above.
  vst1q_u16(dst + 0 * stride + 0, d0[0]);
  vst1q_u16(dst + 0 * stride + 8, d0[1]);
  vst1q_u16(dst + 1 * stride + 0, d1[0]);
  vst1q_u16(dst + 1 * stride + 8, d1[1]);
  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1));
  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_ext, a15, 2));
  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1));
  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_ext, a15, 2));
  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2));
  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_ext, a15, 3));
  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2));
  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_ext, a15, 3));
  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3));
  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_ext, a15, 4));
  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3));
  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_ext, a15, 4));
  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4));
  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_ext, a15, 5));
  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4));
  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_ext, a15, 5));
  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5));
  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_ext, a15, 6));
  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5));
  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_ext, a15, 6));
  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6));
  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_ext, a15, 7));
  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6));
  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_ext, a15, 7));
  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7));
  vst1q_u16(dst + 14 * stride + 8, a15);
  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7));
  vst1q_u16(dst + 15 * stride + 8, a15);
}

void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation.
  uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4],
      d1[4], d0_ext, d1_ext;
  (void)left;
  (void)bd;

  a0 = vld1q_u16(above + 0);
  a1 = vld1q_u16(above + 1);
  a2 = vld1q_u16(above + 2);
  a8 = vld1q_u16(above + 8);
  a9 = vld1q_u16(above + 9);
  a10 = vld1q_u16(above + 10);
  a16 = vld1q_u16(above + 16);
  a17 = vld1q_u16(above + 17);
  a18 = vld1q_u16(above + 18);
  a24 = vld1q_u16(above + 24);
  a25 = vld1q_u16(above + 25);
  a26 = vld1q_u16(above + 26);
  a31 = vld1q_dup_u16(above + 31);

  d0[0] = vrhaddq_u16(a0, a1);
  d0[1] = vrhaddq_u16(a8, a9);
  d0[2] = vrhaddq_u16(a16, a17);
  d0[3] = vrhaddq_u16(a24, a25);
  d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
  d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9);
  d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17);
  d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25);

  // Strip out the final element of d0/d1 so that we can replace it with an
  // additional copy of above[31]. The first vector here doesn't matter, so
  // just reuse the same vector.
  d0_ext = vextq_u16(d0[3], d0[3], 7);
  d1_ext = vextq_u16(d1[3], d1[3], 7);

  // Shuffle in duplicates of above[31] and store. Note that cases involving
  // {d0,d1}_ext require an extra shift to undo the shifting out of the final
  // element from above.

  vst1q_u16(dst + 0 * stride + 0, d0[0]);
  vst1q_u16(dst + 0 * stride + 8, d0[1]);
  vst1q_u16(dst + 0 * stride + 16, d0[2]);
  vst1q_u16(dst + 0 * stride + 24, d0[3]);
  vst1q_u16(dst + 1 * stride + 0, d1[0]);
  vst1q_u16(dst + 1 * stride + 8, d1[1]);
  vst1q_u16(dst + 1 * stride + 16, d1[2]);
  vst1q_u16(dst + 1 * stride + 24, d1[3]);

  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1));
  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1));
  vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1));
  vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0_ext, a31, 2));
  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1));
  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1));
  vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1));
  vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1_ext, a31, 2));

  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2));
  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2));
  vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2));
  vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0_ext, a31, 3));
  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2));
  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2));
  vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2));
  vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1_ext, a31, 3));

  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3));
  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3));
  vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3));
  vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0_ext, a31, 4));
  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3));
  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3));
  vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3));
  vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1_ext, a31, 4));

  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4));
  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4));
  vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[2], d0[3], 4));
  vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0_ext, a31, 5));
  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4));
  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4));
  vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4));
  vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1_ext, a31, 5));

  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5));
  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5));
  vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5));
  vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0_ext, a31, 6));
  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5));
  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5));
  vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5));
  vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1_ext, a31, 6));

  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6));
  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6));
  vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6));
  vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0_ext, a31, 7));
  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6));
  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6));
  vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6));
  vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1_ext, a31, 7));

  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7));
  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7));
  vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[2], d0[3], 7));
  vst1q_u16(dst + 14 * stride + 24, a31);
  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7));
  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7));
  vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7));
  vst1q_u16(dst + 15 * stride + 24, a31);

  vst1q_u16(dst + 16 * stride + 0, d0[1]);
  vst1q_u16(dst + 16 * stride + 8, d0[2]);
  vst1q_u16(dst + 16 * stride + 16, vextq_u16(d0_ext, a31, 1));
  vst1q_u16(dst + 16 * stride + 24, a31);
  vst1q_u16(dst + 17 * stride + 0, d1[1]);
  vst1q_u16(dst + 17 * stride + 8, d1[2]);
  vst1q_u16(dst + 17 * stride + 16, vextq_u16(d1_ext, a31, 1));
  vst1q_u16(dst + 17 * stride + 24, a31);

  vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1));
  vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1));
  vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0_ext, a31, 2));
  vst1q_u16(dst + 18 * stride + 24, a31);
  vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1));
  vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1));
  vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1_ext, a31, 2));
  vst1q_u16(dst + 19 * stride + 24, a31);

  vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2));
  vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2));
  vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0_ext, a31, 3));
  vst1q_u16(dst + 20 * stride + 24, a31);
  vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2));
  vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2));
  vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1_ext, a31, 3));
  vst1q_u16(dst + 21 * stride + 24, a31);

  vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3));
  vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3));
  vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0_ext, a31, 4));
  vst1q_u16(dst + 22 * stride + 24, a31);
  vst1q_u16(dst + 23 * stride + 0, vextq_u16(d1[1], d1[2], 3));
  vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3));
  vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1_ext, a31, 4));
  vst1q_u16(dst + 23 * stride + 24, a31);

  vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4));
  vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4));
  vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0_ext, a31, 5));
  vst1q_u16(dst + 24 * stride + 24, a31);
  vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4));
  vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4));
  vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1_ext, a31, 5));
  vst1q_u16(dst + 25 * stride + 24, a31);

  vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5));
  vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5));
  vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0_ext, a31, 6));
  vst1q_u16(dst + 26 * stride + 24, a31);
  vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5));
  vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5));
  vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1_ext, a31, 6));
  vst1q_u16(dst + 27 * stride + 24, a31);

  vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6));
  vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6));
  vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0_ext, a31, 7));
  vst1q_u16(dst + 28 * stride + 24, a31);
  vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6));
  vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6));
  vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1_ext, a31, 7));
  vst1q_u16(dst + 29 * stride + 24, a31);

  vst1q_u16(dst + 30 * stride + 0, vextq_u16(d0[1], d0[2], 7));
  vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7));
  vst1q_u16(dst + 30 * stride + 16, a31);
  vst1q_u16(dst + 30 * stride + 24, a31);
  vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7));
  vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7));
  vst1q_u16(dst + 31 * stride + 16, a31);
  vst1q_u16(dst + 31 * stride + 24, a31);
}

// -----------------------------------------------------------------------------

void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  uint16x4_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1;
  (void)bd;

  az = vld1_u16(above - 1);
  a0 = vld1_u16(above + 0);
  // [ left[0], above[-1], above[0], above[1] ]
  l0az = vext_u16(vld1_dup_u16(left), az, 3);

  l0 = vld1_u16(left + 0);
  // The last lane here is unused; reading left[4] could cause a buffer
  // over-read, so just fill with a duplicate of left[0] to avoid needing to
  // materialize a zero:
  // [ left[1], left[2], left[3], x ]
  l1 = vext_u16(l0, l0, 1);
  // [ above[-1], left[0], left[1], left[2] ]
  azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3);

  d0 = vrhadd_u16(az, a0);
  d1 = vrhadd_u16(vhadd_u16(l0az, a0), az);

  col0 = vrhadd_u16(vhadd_u16(azl0, l1), l0);
  col0_even = vdup_lane_u16(col0, 0);
  col0_odd = vdup_lane_u16(col0, 1);

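  // d0 is the AVG2 row and d1 the AVG3 row; col0[0]/col0[1] provide column 0
  // of rows 2 and 3 as the rows shift right:
  // stride=0 [ d0[0],   d0[1], d0[2], d0[3] ]
  // stride=1 [ d1[0],   d1[1], d1[2], d1[3] ]
  // stride=2 [ col0[0], d0[0], d0[1], d0[2] ]
  // stride=3 [ col0[1], d1[0], d1[1], d1[2] ]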
  vst1_u16(dst + 0 * stride, d0);
  vst1_u16(dst + 1 * stride, d1);
  vst1_u16(dst + 2 * stride, vext_u16(col0_even, d0, 3));
  vst1_u16(dst + 3 * stride, vext_u16(col0_odd, d1, 3));
}

void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  uint16x8_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1;
  (void)bd;

  az = vld1q_u16(above - 1);
  a0 = vld1q_u16(above + 0);
  // [ left[0], above[-1], ..., above[5] ]
  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);

  l0 = vld1q_u16(left + 0);
  // The last lane here is unused; reading left[8] could cause a buffer
  // over-read, so just fill with a duplicate of left[0] to avoid needing to
  // materialize a zero:
  // [ left[1], ... , left[7], x ]
  l1 = vextq_u16(l0, l0, 1);
  // [ above[-1], left[0], ..., left[6] ]
  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);

  // d0[0] = AVG2(above[-1], above[0])
  // ...
  // d0[7] = AVG2(above[6], above[7])
  d0 = vrhaddq_u16(az, a0);

  // d1[0] = AVG3(left[0], above[-1], above[0])
  // d1[1] = AVG3(above[-1], above[0], above[1])
  // ...
  // d1[7] = AVG3(above[5], above[6], above[7])
  d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az);

  // The ext instruction shifts elements in from the end of the vector rather
  // than the start, so reverse the vector to put the elements to be shifted in
  // at the end:
  // col0[7] = AVG3(above[-1], left[0], left[1])
  // col0[6] = AVG3(left[0], left[1], left[2])
  // ...
  // col0[0] = AVG3(left[6], left[7], left[8])
  col0 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
  col0 = vrev64q_u16(vextq_u16(col0, col0, 4));

  // We don't care about the first parameter to this uzp since we only ever use
  // the high three elements; we just use col0 again since it is already
  // available:
  // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ]
  // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ]
  col0_even = vuzpq_u16(col0, col0).val[1];
  col0_odd = vuzpq_u16(col0, col0).val[0];

  // Incrementally shift more elements from col0 into d0/1:
  // stride=0 [ d0[0],   d0[1],   d0[2],   d0[3], d0[4], d0[5], d0[6], d0[7] ]
  // stride=1 [ d1[0],   d1[1],   d1[2],   d1[3], d1[4], d1[5], d1[6], d1[7] ]
  // stride=2 [ col0[7], d0[0],   d0[1],   d0[2], d0[3], d0[4], d0[5], d0[6] ]
  // stride=3 [ col0[6], d1[0],   d1[1],   d1[2], d1[3], d1[4], d1[5], d1[6] ]
  // stride=4 [ col0[5], col0[7], d0[0],   d0[1], d0[2], d0[3], d0[4], d0[5] ]
  // stride=5 [ col0[4], col0[6], d1[0],   d1[1], d1[2], d1[3], d1[4], d1[5] ]
  // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ]
  // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ]
  vst1q_u16(dst + 0 * stride, d0);
  vst1q_u16(dst + 1 * stride, d1);
  vst1q_u16(dst + 2 * stride, vextq_u16(col0_even, d0, 7));
  vst1q_u16(dst + 3 * stride, vextq_u16(col0_odd, d1, 7));
  vst1q_u16(dst + 4 * stride, vextq_u16(col0_even, d0, 6));
  vst1q_u16(dst + 5 * stride, vextq_u16(col0_odd, d1, 6));
  vst1q_u16(dst + 6 * stride, vextq_u16(col0_even, d0, 5));
  vst1q_u16(dst + 7 * stride, vextq_u16(col0_odd, d1, 5));
}

void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, col0_lo,
      col0_hi, col0_even, col0_odd, d0_lo, d0_hi, d1_lo, d1_hi;
  (void)bd;

  az = vld1q_u16(above - 1);
  a0 = vld1q_u16(above + 0);
  a6 = vld1q_u16(above + 6);
  a7 = vld1q_u16(above + 7);
  a8 = vld1q_u16(above + 8);
  // [ left[0], above[-1], ..., above[5] ]
  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);

  l0 = vld1q_u16(left + 0);
  l1 = vld1q_u16(left + 1);
  l7 = vld1q_u16(left + 7);
  l8 = vld1q_u16(left + 8);
  // The last lane here is unused; reading left[16] could cause a buffer
  // over-read, so just fill with a duplicate of left[8] to avoid needing to
  // materialize a zero:
  // [ left[9], ... , left[15], x ]
  l9 = vextq_u16(l8, l8, 1);
  // [ above[-1], left[0], ..., left[6] ]
  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);

  d0_lo = vrhaddq_u16(az, a0);
  d0_hi = vrhaddq_u16(a7, a8);
  d1_lo = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
  d1_hi = vrhaddq_u16(vhaddq_u16(a6, a8), a7);

  col0_lo = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
  col0_hi = vrhaddq_u16(vhaddq_u16(l7, l9), l8);

  // Reverse within each vector, then swap the array indices in the uzp to
  // complete the reversal across all 16 elements.
  col0_lo = vrev64q_u16(vextq_u16(col0_lo, col0_lo, 4));
  col0_hi = vrev64q_u16(vextq_u16(col0_hi, col0_hi, 4));
  col0_even = vuzpq_u16(col0_hi, col0_lo).val[1];
  col0_odd = vuzpq_u16(col0_hi, col0_lo).val[0];

  vst1q_u16(dst + 0 * stride + 0, d0_lo);
  vst1q_u16(dst + 0 * stride + 8, d0_hi);
  vst1q_u16(dst + 1 * stride + 0, d1_lo);
  vst1q_u16(dst + 1 * stride + 8, d1_hi);

  vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even, d0_lo, 7));
  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_lo, d0_hi, 7));
  vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd, d1_lo, 7));
  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_lo, d1_hi, 7));

  vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even, d0_lo, 6));
  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_lo, d0_hi, 6));
  vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd, d1_lo, 6));
  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_lo, d1_hi, 6));

  vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even, d0_lo, 5));
  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_lo, d0_hi, 5));
  vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd, d1_lo, 5));
  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_lo, d1_hi, 5));

  vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even, d0_lo, 4));
  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_lo, d0_hi, 4));
  vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd, d1_lo, 4));
  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_lo, d1_hi, 4));

  vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even, d0_lo, 3));
  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_lo, d0_hi, 3));
  vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd, d1_lo, 3));
  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_lo, d1_hi, 3));

  vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even, d0_lo, 2));
  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_lo, d0_hi, 2));
  vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd, d1_lo, 2));
  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_lo, d1_hi, 2));

  vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even, d0_lo, 1));
  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_lo, d0_hi, 1));
  vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd, d1_lo, 1));
  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_lo, d1_hi, 1));
}

void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7,
      l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], col0[4],
      col0_even[2], col0_odd[2];
  (void)bd;

  az = vld1q_u16(above - 1);
  a0 = vld1q_u16(above + 0);
  a6 = vld1q_u16(above + 6);
  a7 = vld1q_u16(above + 7);
  a8 = vld1q_u16(above + 8);
  a14 = vld1q_u16(above + 14);
  a15 = vld1q_u16(above + 15);
  a16 = vld1q_u16(above + 16);
  a22 = vld1q_u16(above + 22);
  a23 = vld1q_u16(above + 23);
  a24 = vld1q_u16(above + 24);
  // [ left[0], above[-1], ..., above[5] ]
  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);

  l0 = vld1q_u16(left + 0);
  l1 = vld1q_u16(left + 1);
  l7 = vld1q_u16(left + 7);
  l8 = vld1q_u16(left + 8);
  l9 = vld1q_u16(left + 9);
  l15 = vld1q_u16(left + 15);
  l16 = vld1q_u16(left + 16);
  l17 = vld1q_u16(left + 17);
  l23 = vld1q_u16(left + 23);
  l24 = vld1q_u16(left + 24);
  l25 = vld1q_u16(left + 25);
  // The last lane here is unused; reading left[32] could cause a buffer
  // over-read, so just fill with a duplicate of left[24] to avoid needing to
  // materialize a zero:
  // [ left[25], ... , left[31], x ]
  l25 = vextq_u16(l24, l24, 1);
  // [ above[-1], left[0], ..., left[6] ]
  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);

  d0[0] = vrhaddq_u16(az, a0);
  d0[1] = vrhaddq_u16(a7, a8);
  d0[2] = vrhaddq_u16(a15, a16);
  d0[3] = vrhaddq_u16(a23, a24);
  d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
  d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
  d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15);
  d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23);

  col0[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
  col0[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
  col0[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16);
  col0[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24);

  // Reverse within each vector, then swap the array indices in both the uzp
  // and the col0_{even,odd} assignment to complete the reversal across all
  // 32 elements.
  col0[0] = vrev64q_u16(vextq_u16(col0[0], col0[0], 4));
  col0[1] = vrev64q_u16(vextq_u16(col0[1], col0[1], 4));
  col0[2] = vrev64q_u16(vextq_u16(col0[2], col0[2], 4));
  col0[3] = vrev64q_u16(vextq_u16(col0[3], col0[3], 4));

  col0_even[1] = vuzpq_u16(col0[1], col0[0]).val[1];
  col0_even[0] = vuzpq_u16(col0[3], col0[2]).val[1];
  col0_odd[1] = vuzpq_u16(col0[1], col0[0]).val[0];
  col0_odd[0] = vuzpq_u16(col0[3], col0[2]).val[0];

  vst1q_u16(dst + 0 * stride + 0, d0[0]);
  vst1q_u16(dst + 0 * stride + 8, d0[1]);
  vst1q_u16(dst + 0 * stride + 16, d0[2]);
  vst1q_u16(dst + 0 * stride + 24, d0[3]);
  vst1q_u16(dst + 1 * stride + 0, d1[0]);
  vst1q_u16(dst + 1 * stride + 8, d1[1]);
  vst1q_u16(dst + 1 * stride + 16, d1[2]);
  vst1q_u16(dst + 1 * stride + 24, d1[3]);

  vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even[1], d0[0], 7));
  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[0], d0[1], 7));
  vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[1], d0[2], 7));
  vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[2], d0[3], 7));
  vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd[1], d1[0], 7));
  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 7));
  vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 7));
  vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 7));

  vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even[1], d0[0], 6));
  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[0], d0[1], 6));
  vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[1], d0[2], 6));
  vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[2], d0[3], 6));
  vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd[1], d1[0], 6));
  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[0], d1[1], 6));
  vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[1], d1[2], 6));
  vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[2], d1[3], 6));

  vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even[1], d0[0], 5));
  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[0], d0[1], 5));
  vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[1], d0[2], 5));
  vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[2], d0[3], 5));
  vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd[1], d1[0], 5));
  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[0], d1[1], 5));
  vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[1], d1[2], 5));
  vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[2], d1[3], 5));

  vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even[1], d0[0], 4));
  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[0], d0[1], 4));
  vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[1], d0[2], 4));
  vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[2], d0[3], 4));
  vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd[1], d1[0], 4));
  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[0], d1[1], 4));
  vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[1], d1[2], 4));
  vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[2], d1[3], 4));

  vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even[1], d0[0], 3));
  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[0], d0[1], 3));
  vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[1], d0[2], 3));
  vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[2], d0[3], 3));
  vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd[1], d1[0], 3));
  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[0], d1[1], 3));
  vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[1], d1[2], 3));
  vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[2], d1[3], 3));

  vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even[1], d0[0], 2));
  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[0], d0[1], 2));
  vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[1], d0[2], 2));
  vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[2], d0[3], 2));
  vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd[1], d1[0], 2));
  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[0], d1[1], 2));
  vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[1], d1[2], 2));
  vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[2], d1[3], 2));

  vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even[1], d0[0], 1));
  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[0], d0[1], 1));
  vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[1], d0[2], 1));
  vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[2], d0[3], 1));
  vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd[1], d1[0], 1));
  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[0], d1[1], 1));
  vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[1], d1[2], 1));
  vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[2], d1[3], 1));

  vst1q_u16(dst + 16 * stride + 0, col0_even[1]);
  vst1q_u16(dst + 16 * stride + 8, d0[0]);
  vst1q_u16(dst + 16 * stride + 16, d0[1]);
  vst1q_u16(dst + 16 * stride + 24, d0[2]);
  vst1q_u16(dst + 17 * stride + 0, col0_odd[1]);
  vst1q_u16(dst + 17 * stride + 8, d1[0]);
  vst1q_u16(dst + 17 * stride + 16, d1[1]);
  vst1q_u16(dst + 17 * stride + 24, d1[2]);

  vst1q_u16(dst + 18 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 7));
  vst1q_u16(dst + 18 * stride + 8, vextq_u16(col0_even[1], d0[0], 7));
  vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[0], d0[1], 7));
  vst1q_u16(dst + 18 * stride + 24, vextq_u16(d0[1], d0[2], 7));
  vst1q_u16(dst + 19 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 7));
  vst1q_u16(dst + 19 * stride + 8, vextq_u16(col0_odd[1], d1[0], 7));
  vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[0], d1[1], 7));
  vst1q_u16(dst + 19 * stride + 24, vextq_u16(d1[1], d1[2], 7));

  vst1q_u16(dst + 20 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 6));
  vst1q_u16(dst + 20 * stride + 8, vextq_u16(col0_even[1], d0[0], 6));
  vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[0], d0[1], 6));
  vst1q_u16(dst + 20 * stride + 24, vextq_u16(d0[1], d0[2], 6));
  vst1q_u16(dst + 21 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 6));
  vst1q_u16(dst + 21 * stride + 8, vextq_u16(col0_odd[1], d1[0], 6));
  vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[0], d1[1], 6));
  vst1q_u16(dst + 21 * stride + 24, vextq_u16(d1[1], d1[2], 6));

  vst1q_u16(dst + 22 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 5));
  vst1q_u16(dst + 22 * stride + 8, vextq_u16(col0_even[1], d0[0], 5));
  vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[0], d0[1], 5));
  vst1q_u16(dst + 22 * stride + 24, vextq_u16(d0[1], d0[2], 5));
  vst1q_u16(dst + 23 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 5));
  vst1q_u16(dst + 23 * stride + 8, vextq_u16(col0_odd[1], d1[0], 5));
  vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[0], d1[1], 5));
  vst1q_u16(dst + 23 * stride + 24, vextq_u16(d1[1], d1[2], 5));

  vst1q_u16(dst + 24 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 4));
  vst1q_u16(dst + 24 * stride + 8, vextq_u16(col0_even[1], d0[0], 4));
  vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[0], d0[1], 4));
  vst1q_u16(dst + 24 * stride + 24, vextq_u16(d0[1], d0[2], 4));
  vst1q_u16(dst + 25 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 4));
  vst1q_u16(dst + 25 * stride + 8, vextq_u16(col0_odd[1], d1[0], 4));
  vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[0], d1[1], 4));
  vst1q_u16(dst + 25 * stride + 24, vextq_u16(d1[1], d1[2], 4));

  vst1q_u16(dst + 26 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 3));
  vst1q_u16(dst + 26 * stride + 8, vextq_u16(col0_even[1], d0[0], 3));
  vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[0], d0[1], 3));
  vst1q_u16(dst + 26 * stride + 24, vextq_u16(d0[1], d0[2], 3));
  vst1q_u16(dst + 27 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 3));
  vst1q_u16(dst + 27 * stride + 8, vextq_u16(col0_odd[1], d1[0], 3));
  vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[0], d1[1], 3));
  vst1q_u16(dst + 27 * stride + 24, vextq_u16(d1[1], d1[2], 3));

  vst1q_u16(dst + 28 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 2));
  vst1q_u16(dst + 28 * stride + 8, vextq_u16(col0_even[1], d0[0], 2));
  vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[0], d0[1], 2));
  vst1q_u16(dst + 28 * stride + 24, vextq_u16(d0[1], d0[2], 2));
  vst1q_u16(dst + 29 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 2));
  vst1q_u16(dst + 29 * stride + 8, vextq_u16(col0_odd[1], d1[0], 2));
  vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[0], d1[1], 2));
  vst1q_u16(dst + 29 * stride + 24, vextq_u16(d1[1], d1[2], 2));

  vst1q_u16(dst + 30 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 1));
  vst1q_u16(dst + 30 * stride + 8, vextq_u16(col0_even[1], d0[0], 1));
  vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[0], d0[1], 1));
  vst1q_u16(dst + 30 * stride + 24, vextq_u16(d0[1], d0[2], 1));
  vst1q_u16(dst + 31 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 1));
  vst1q_u16(dst + 31 * stride + 8, vextq_u16(col0_odd[1], d1[0], 1));
  vst1q_u16(dst + 31 * stride + 16, vextq_u16(d1[0], d1[1], 1));
  vst1q_u16(dst + 31 * stride + 24, vextq_u16(d1[1], d1[2], 1));
}

// -----------------------------------------------------------------------------

1181 void vpx_highbd_d153_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
1182                                         const uint16_t *above,
1183                                         const uint16_t *left, int bd) {
1184   // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
1185   uint16x4_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d20_lo, d20_hi;
1186   (void)bd;
1187 
1188   az = vld1_u16(above - 1);
1189   a0 = vld1_u16(above + 0);
1190   // [ left[0], above[-1], above[0], above[1] ]
1191   l0az = vext_u16(vld1_dup_u16(left), az, 3);
1192 
1193   l0 = vld1_u16(left);
1194   // The last lane here is unused; reading left[4] could cause a buffer
1195   // over-read, so just fill with a duplicate of left[0] to avoid needing to
1196   // materialize a zero:
1197   // [ left[1], left[2], left[3], x ]
1198   l1 = vext_u16(l0, l0, 1);
1199   // [ above[-1], left[0], left[1], left[2] ]
1200   azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3);
1201 
1202   d0 = vrhadd_u16(azl0, l0);
1203   d1 = vrhadd_u16(vhadd_u16(l0az, a0), az);
1204   d2 = vrhadd_u16(vhadd_u16(azl0, l1), l0);
1205 
1206   d20_lo = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[0];
1207   d20_hi = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[1];
1208 
1209   // Incrementally shift more elements from d0/d2 reversed into d1:
1210   // stride=0 [ d0[0], d1[0], d1[1], d1[2] ]
1211   // stride=1 [ d0[1], d2[0], d0[0], d1[0] ]
1212   // stride=2 [ d0[2], d2[1], d0[1], d2[0] ]
1213   // stride=3 [ d0[3], d2[2], d0[2], d2[1] ]
1214   vst1_u16(dst + 0 * stride, vext_u16(d20_hi, d1, 3));
1215   vst1_u16(dst + 1 * stride, vext_u16(d20_hi, d1, 1));
1216   vst1_u16(dst + 2 * stride, vext_u16(d20_lo, d20_hi, 3));
1217   vst1_u16(dst + 3 * stride, vext_u16(d20_lo, d20_hi, 1));
1218 }
1219 
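/* Illustrative usage sketch (not part of the library): predicting a 10-bit
 * 8x8 block directly with the function below. The buffer names are
 * hypothetical; the loads only require that above[-1..7] and left[0..7] are
 * readable and that stride is given in uint16_t elements.
 *
 *   uint16_t block[8 * 8];
 *   uint16_t border_above[9];  // border_above[0] stands in for above[-1]
 *   uint16_t border_left[8];
 *   // ... fill border_above / border_left from the reconstructed frame ...
 *   vpx_highbd_d153_predictor_8x8_neon(block, 8, border_above + 1,
 *                                      border_left, 10);
 */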
1220 void vpx_highbd_d153_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
1221                                         const uint16_t *above,
1222                                         const uint16_t *left, int bd) {
1223   uint16x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d0_rev, d2_rev, d20_lo,
1224       d20_hi;
1225   (void)bd;
1226 
1227   az = vld1q_u16(above - 1);
1228   a0 = vld1q_u16(above + 0);
1229   // [ left[0], above[-1], ... , above[5] ]
1230   l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
1231 
1232   l0 = vld1q_u16(left);
1233   // The last lane here is unused; reading left[8] could cause a buffer
1234   // over-read, so just fill with a duplicate of left[0] to avoid needing to
1235   // materialize a zero:
1236   // [ left[1], ... , left[7], x ]
1237   l1 = vextq_u16(l0, l0, 1);
1238   // [ above[-1], left[0], ... , left[6] ]
1239   azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
1240 
1241   // d0[0] = AVG2(above[-1], left[0])
1242   // d0[1] = AVG2(left[0], left[1])
1243   // ...
1244   // d0[7] = AVG2(left[6], left[7])
1245   d0 = vrhaddq_u16(azl0, l0);
1246 
1247   // d1[0] = AVG3(left[0], above[-1], above[0])
1248   // d1[1] = AVG3(above[-1], above[0], above[1])
1249   // ...
1250   // d1[7] = AVG3(above[5], above[6], above[7])
1251   d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
1252 
1253   // d2[0] = AVG3(above[-1], left[0], left[1])
1254   // d2[1] = AVG3(left[0], left[1], left[2])
1255   // ...
1256   // d2[7] = AVG3(left[6], left[7], left[8])
1257   d2 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
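  // Note: vrhaddq_u16(vhaddq_u16(a, c), b) equals (a + 2 * b + c + 2) >> 2,
  // i.e. AVG3(a, b, c), so the three-tap averages above are computed entirely
  // in 16-bit lanes without widening.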
1258 
1259   // The ext instruction shifts elements in from the end of the vector rather
1260   // than the start, so reverse the vectors to put the elements to be shifted
1261   // in at the end:
1262   d0_rev = vrev64q_u16(vextq_u16(d0, d0, 4));
1263   d2_rev = vrev64q_u16(vextq_u16(d2, d2, 4));
1264 
1265   d20_lo = vzipq_u16(d2_rev, d0_rev).val[0];
1266   d20_hi = vzipq_u16(d2_rev, d0_rev).val[1];
1267 
1268   // Incrementally shift more elements from d0/d2 reversed into d1:
1269   // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
1270   // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ]
1271   // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ]
1272   // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ]
1273   // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ]
1274   // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ]
1275   // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ]
1276   // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ]
1277   vst1q_u16(dst + 0 * stride, vextq_u16(d20_hi, d1, 7));
1278   vst1q_u16(dst + 1 * stride, vextq_u16(d20_hi, d1, 5));
1279   vst1q_u16(dst + 2 * stride, vextq_u16(d20_hi, d1, 3));
1280   vst1q_u16(dst + 3 * stride, vextq_u16(d20_hi, d1, 1));
1281   vst1q_u16(dst + 4 * stride, vextq_u16(d20_lo, d20_hi, 7));
1282   vst1q_u16(dst + 5 * stride, vextq_u16(d20_lo, d20_hi, 5));
1283   vst1q_u16(dst + 6 * stride, vextq_u16(d20_lo, d20_hi, 3));
1284   vst1q_u16(dst + 7 * stride, vextq_u16(d20_lo, d20_hi, 1));
1285 }
1286 
1287 void vpx_highbd_d153_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
1288                                           const uint16_t *above,
1289                                           const uint16_t *left, int bd) {
1290   // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
1291   uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, d0[2], d1[2],
1292       d2[2], d20[4];
1293   (void)bd;
1294 
1295   az = vld1q_u16(above - 1);
1296   a0 = vld1q_u16(above + 0);
1297   a6 = vld1q_u16(above + 6);
1298   a7 = vld1q_u16(above + 7);
1299   a8 = vld1q_u16(above + 8);
1300   // [ left[0], above[-1], ... , above[13] ]
1301   l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
1302 
1303   l0 = vld1q_u16(left + 0);
1304   l1 = vld1q_u16(left + 1);
1305   l7 = vld1q_u16(left + 7);
1306   l8 = vld1q_u16(left + 8);
1307   // The last lane here is unused; reading left[16] could cause a buffer
1308   // over-read, so just fill with a duplicate of left[8] to avoid needing to
1309   // materialize a zero:
1310   // [ left[9], ... , left[15], x ]
1311   l9 = vextq_u16(l8, l8, 1);
1312   // [ above[-1], left[0], ... , left[14] ]
1313   azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
1314 
1315   d0[0] = vrhaddq_u16(azl0, l0);
1316   d0[1] = vrhaddq_u16(l7, l8);
1317   d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
1318   d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
1319   d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
1320   d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
1321 
1322   d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4));
1323   d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4));
1324   d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4));
1325   d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4));
1326 
1327   d20[0] = vzipq_u16(d2[1], d0[1]).val[0];
1328   d20[1] = vzipq_u16(d2[1], d0[1]).val[1];
1329   d20[2] = vzipq_u16(d2[0], d0[0]).val[0];
1330   d20[3] = vzipq_u16(d2[0], d0[0]).val[1];
1331 
1332   vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[3], d1[0], 7));
1333   vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7));
1334   vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[3], d1[0], 5));
1335   vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5));
1336   vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[3], d1[0], 3));
1337   vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3));
1338   vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[3], d1[0], 1));
1339   vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1));
1340 
1341   vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[2], d20[3], 7));
1342   vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[3], d1[0], 7));
1343   vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[2], d20[3], 5));
1344   vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[3], d1[0], 5));
1345   vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[2], d20[3], 3));
1346   vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[3], d1[0], 3));
1347   vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[2], d20[3], 1));
1348   vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[3], d1[0], 1));
1349 
1350   vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[1], d20[2], 7));
1351   vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[2], d20[3], 7));
1352   vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[1], d20[2], 5));
1353   vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[2], d20[3], 5));
1354   vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[1], d20[2], 3));
1355   vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[2], d20[3], 3));
1356   vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[1], d20[2], 1));
1357   vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[2], d20[3], 1));
1358 
1359   vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[0], d20[1], 7));
1360   vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[1], d20[2], 7));
1361   vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[0], d20[1], 5));
1362   vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[1], d20[2], 5));
1363   vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[0], d20[1], 3));
1364   vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[1], d20[2], 3));
1365   vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[0], d20[1], 1));
1366   vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[1], d20[2], 1));
1367 }
1368 
1369 void vpx_highbd_d153_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
1370                                           const uint16_t *above,
1371                                           const uint16_t *left, int bd) {
1372   // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
1373   uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7,
1374       l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], d2[4], d20[8];
1375   (void)bd;
1376 
1377   az = vld1q_u16(above - 1);
1378   a0 = vld1q_u16(above + 0);
1379   a6 = vld1q_u16(above + 6);
1380   a7 = vld1q_u16(above + 7);
1381   a8 = vld1q_u16(above + 8);
1382   a14 = vld1q_u16(above + 14);
1383   a15 = vld1q_u16(above + 15);
1384   a16 = vld1q_u16(above + 16);
1385   a22 = vld1q_u16(above + 22);
1386   a23 = vld1q_u16(above + 23);
1387   a24 = vld1q_u16(above + 24);
1388   // [ left[0], above[-1], ... , above[29] ]
1389   l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
1390 
1391   l0 = vld1q_u16(left + 0);
1392   l1 = vld1q_u16(left + 1);
1393   l7 = vld1q_u16(left + 7);
1394   l8 = vld1q_u16(left + 8);
1395   l9 = vld1q_u16(left + 9);
1396   l15 = vld1q_u16(left + 15);
1397   l16 = vld1q_u16(left + 16);
1398   l17 = vld1q_u16(left + 17);
1399   l23 = vld1q_u16(left + 23);
1400   l24 = vld1q_u16(left + 24);
1401   // The last lane here is unused; reading left[32] could cause a buffer
1402   // over-read, so just fill with a duplicate of left[24] to avoid needing to
1403   // materialize a zero:
1404   // [ left[25], ... , left[31], x ]
1405   l25 = vextq_u16(l24, l24, 1);
1406   // [ above[-1], left[0], ... , left[30] ]
1407   azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
1408 
1409   d0[0] = vrhaddq_u16(azl0, l0);
1410   d0[1] = vrhaddq_u16(l7, l8);
1411   d0[2] = vrhaddq_u16(l15, l16);
1412   d0[3] = vrhaddq_u16(l23, l24);
1413 
1414   d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
1415   d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
1416   d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15);
1417   d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23);
1418 
1419   d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
1420   d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
1421   d2[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16);
1422   d2[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24);
1423 
1424   d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4));
1425   d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4));
1426   d0[2] = vrev64q_u16(vextq_u16(d0[2], d0[2], 4));
1427   d0[3] = vrev64q_u16(vextq_u16(d0[3], d0[3], 4));
1428   d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4));
1429   d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4));
1430   d2[2] = vrev64q_u16(vextq_u16(d2[2], d2[2], 4));
1431   d2[3] = vrev64q_u16(vextq_u16(d2[3], d2[3], 4));
1432 
1433   d20[0] = vzipq_u16(d2[3], d0[3]).val[0];
1434   d20[1] = vzipq_u16(d2[3], d0[3]).val[1];
1435   d20[2] = vzipq_u16(d2[2], d0[2]).val[0];
1436   d20[3] = vzipq_u16(d2[2], d0[2]).val[1];
1437   d20[4] = vzipq_u16(d2[1], d0[1]).val[0];
1438   d20[5] = vzipq_u16(d2[1], d0[1]).val[1];
1439   d20[6] = vzipq_u16(d2[0], d0[0]).val[0];
1440   d20[7] = vzipq_u16(d2[0], d0[0]).val[1];
1441 
1442   vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[7], d1[0], 7));
1443   vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7));
1444   vst1q_u16(dst + 0 * stride + 16, vextq_u16(d1[1], d1[2], 7));
1445   vst1q_u16(dst + 0 * stride + 24, vextq_u16(d1[2], d1[3], 7));
1446   vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[7], d1[0], 5));
1447   vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5));
1448   vst1q_u16(dst + 1 * stride + 16, vextq_u16(d1[1], d1[2], 5));
1449   vst1q_u16(dst + 1 * stride + 24, vextq_u16(d1[2], d1[3], 5));
1450   vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[7], d1[0], 3));
1451   vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3));
1452   vst1q_u16(dst + 2 * stride + 16, vextq_u16(d1[1], d1[2], 3));
1453   vst1q_u16(dst + 2 * stride + 24, vextq_u16(d1[2], d1[3], 3));
1454   vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[7], d1[0], 1));
1455   vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1));
1456   vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 1));
1457   vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 1));
1458 
1459   vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[6], d20[7], 7));
1460   vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[7], d1[0], 7));
1461   vst1q_u16(dst + 4 * stride + 16, vextq_u16(d1[0], d1[1], 7));
1462   vst1q_u16(dst + 4 * stride + 24, vextq_u16(d1[1], d1[2], 7));
1463   vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[6], d20[7], 5));
1464   vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[7], d1[0], 5));
1465   vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[0], d1[1], 5));
1466   vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[1], d1[2], 5));
1467   vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[6], d20[7], 3));
1468   vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[7], d1[0], 3));
1469   vst1q_u16(dst + 6 * stride + 16, vextq_u16(d1[0], d1[1], 3));
1470   vst1q_u16(dst + 6 * stride + 24, vextq_u16(d1[1], d1[2], 3));
1471   vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[6], d20[7], 1));
1472   vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[7], d1[0], 1));
1473   vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[0], d1[1], 1));
1474   vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[1], d1[2], 1));
1475 
1476   vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[5], d20[6], 7));
1477   vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[6], d20[7], 7));
1478   vst1q_u16(dst + 8 * stride + 16, vextq_u16(d20[7], d1[0], 7));
1479   vst1q_u16(dst + 8 * stride + 24, vextq_u16(d1[0], d1[1], 7));
1480   vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[5], d20[6], 5));
1481   vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[6], d20[7], 5));
1482   vst1q_u16(dst + 9 * stride + 16, vextq_u16(d20[7], d1[0], 5));
1483   vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[0], d1[1], 5));
1484   vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[5], d20[6], 3));
1485   vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[6], d20[7], 3));
1486   vst1q_u16(dst + 10 * stride + 16, vextq_u16(d20[7], d1[0], 3));
1487   vst1q_u16(dst + 10 * stride + 24, vextq_u16(d1[0], d1[1], 3));
1488   vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[5], d20[6], 1));
1489   vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[6], d20[7], 1));
1490   vst1q_u16(dst + 11 * stride + 16, vextq_u16(d20[7], d1[0], 1));
1491   vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[0], d1[1], 1));
1492 
1493   vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[4], d20[5], 7));
1494   vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[5], d20[6], 7));
1495   vst1q_u16(dst + 12 * stride + 16, vextq_u16(d20[6], d20[7], 7));
1496   vst1q_u16(dst + 12 * stride + 24, vextq_u16(d20[7], d1[0], 7));
1497   vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[4], d20[5], 5));
1498   vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[5], d20[6], 5));
1499   vst1q_u16(dst + 13 * stride + 16, vextq_u16(d20[6], d20[7], 5));
1500   vst1q_u16(dst + 13 * stride + 24, vextq_u16(d20[7], d1[0], 5));
1501   vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[4], d20[5], 3));
1502   vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[5], d20[6], 3));
1503   vst1q_u16(dst + 14 * stride + 16, vextq_u16(d20[6], d20[7], 3));
1504   vst1q_u16(dst + 14 * stride + 24, vextq_u16(d20[7], d1[0], 3));
1505   vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[4], d20[5], 1));
1506   vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[5], d20[6], 1));
1507   vst1q_u16(dst + 15 * stride + 16, vextq_u16(d20[6], d20[7], 1));
1508   vst1q_u16(dst + 15 * stride + 24, vextq_u16(d20[7], d1[0], 1));
1509 
1510   vst1q_u16(dst + 16 * stride + 0, vextq_u16(d20[3], d20[4], 7));
1511   vst1q_u16(dst + 16 * stride + 8, vextq_u16(d20[4], d20[5], 7));
1512   vst1q_u16(dst + 16 * stride + 16, vextq_u16(d20[5], d20[6], 7));
1513   vst1q_u16(dst + 16 * stride + 24, vextq_u16(d20[6], d20[7], 7));
1514   vst1q_u16(dst + 17 * stride + 0, vextq_u16(d20[3], d20[4], 5));
1515   vst1q_u16(dst + 17 * stride + 8, vextq_u16(d20[4], d20[5], 5));
1516   vst1q_u16(dst + 17 * stride + 16, vextq_u16(d20[5], d20[6], 5));
1517   vst1q_u16(dst + 17 * stride + 24, vextq_u16(d20[6], d20[7], 5));
1518   vst1q_u16(dst + 18 * stride + 0, vextq_u16(d20[3], d20[4], 3));
1519   vst1q_u16(dst + 18 * stride + 8, vextq_u16(d20[4], d20[5], 3));
1520   vst1q_u16(dst + 18 * stride + 16, vextq_u16(d20[5], d20[6], 3));
1521   vst1q_u16(dst + 18 * stride + 24, vextq_u16(d20[6], d20[7], 3));
1522   vst1q_u16(dst + 19 * stride + 0, vextq_u16(d20[3], d20[4], 1));
1523   vst1q_u16(dst + 19 * stride + 8, vextq_u16(d20[4], d20[5], 1));
1524   vst1q_u16(dst + 19 * stride + 16, vextq_u16(d20[5], d20[6], 1));
1525   vst1q_u16(dst + 19 * stride + 24, vextq_u16(d20[6], d20[7], 1));
1526 
1527   vst1q_u16(dst + 20 * stride + 0, vextq_u16(d20[2], d20[3], 7));
1528   vst1q_u16(dst + 20 * stride + 8, vextq_u16(d20[3], d20[4], 7));
1529   vst1q_u16(dst + 20 * stride + 16, vextq_u16(d20[4], d20[5], 7));
1530   vst1q_u16(dst + 20 * stride + 24, vextq_u16(d20[5], d20[6], 7));
1531   vst1q_u16(dst + 21 * stride + 0, vextq_u16(d20[2], d20[3], 5));
1532   vst1q_u16(dst + 21 * stride + 8, vextq_u16(d20[3], d20[4], 5));
1533   vst1q_u16(dst + 21 * stride + 16, vextq_u16(d20[4], d20[5], 5));
1534   vst1q_u16(dst + 21 * stride + 24, vextq_u16(d20[5], d20[6], 5));
1535   vst1q_u16(dst + 22 * stride + 0, vextq_u16(d20[2], d20[3], 3));
1536   vst1q_u16(dst + 22 * stride + 8, vextq_u16(d20[3], d20[4], 3));
1537   vst1q_u16(dst + 22 * stride + 16, vextq_u16(d20[4], d20[5], 3));
1538   vst1q_u16(dst + 22 * stride + 24, vextq_u16(d20[5], d20[6], 3));
1539   vst1q_u16(dst + 23 * stride + 0, vextq_u16(d20[2], d20[3], 1));
1540   vst1q_u16(dst + 23 * stride + 8, vextq_u16(d20[3], d20[4], 1));
1541   vst1q_u16(dst + 23 * stride + 16, vextq_u16(d20[4], d20[5], 1));
1542   vst1q_u16(dst + 23 * stride + 24, vextq_u16(d20[5], d20[6], 1));
1543 
1544   vst1q_u16(dst + 24 * stride + 0, vextq_u16(d20[1], d20[2], 7));
1545   vst1q_u16(dst + 24 * stride + 8, vextq_u16(d20[2], d20[3], 7));
1546   vst1q_u16(dst + 24 * stride + 16, vextq_u16(d20[3], d20[4], 7));
1547   vst1q_u16(dst + 24 * stride + 24, vextq_u16(d20[4], d20[5], 7));
1548   vst1q_u16(dst + 25 * stride + 0, vextq_u16(d20[1], d20[2], 5));
1549   vst1q_u16(dst + 25 * stride + 8, vextq_u16(d20[2], d20[3], 5));
1550   vst1q_u16(dst + 25 * stride + 16, vextq_u16(d20[3], d20[4], 5));
1551   vst1q_u16(dst + 25 * stride + 24, vextq_u16(d20[4], d20[5], 5));
1552   vst1q_u16(dst + 26 * stride + 0, vextq_u16(d20[1], d20[2], 3));
1553   vst1q_u16(dst + 26 * stride + 8, vextq_u16(d20[2], d20[3], 3));
1554   vst1q_u16(dst + 26 * stride + 16, vextq_u16(d20[3], d20[4], 3));
1555   vst1q_u16(dst + 26 * stride + 24, vextq_u16(d20[4], d20[5], 3));
1556   vst1q_u16(dst + 27 * stride + 0, vextq_u16(d20[1], d20[2], 1));
1557   vst1q_u16(dst + 27 * stride + 8, vextq_u16(d20[2], d20[3], 1));
1558   vst1q_u16(dst + 27 * stride + 16, vextq_u16(d20[3], d20[4], 1));
1559   vst1q_u16(dst + 27 * stride + 24, vextq_u16(d20[4], d20[5], 1));
1560 
1561   vst1q_u16(dst + 28 * stride + 0, vextq_u16(d20[0], d20[1], 7));
1562   vst1q_u16(dst + 28 * stride + 8, vextq_u16(d20[1], d20[2], 7));
1563   vst1q_u16(dst + 28 * stride + 16, vextq_u16(d20[2], d20[3], 7));
1564   vst1q_u16(dst + 28 * stride + 24, vextq_u16(d20[3], d20[4], 7));
1565   vst1q_u16(dst + 29 * stride + 0, vextq_u16(d20[0], d20[1], 5));
1566   vst1q_u16(dst + 29 * stride + 8, vextq_u16(d20[1], d20[2], 5));
1567   vst1q_u16(dst + 29 * stride + 16, vextq_u16(d20[2], d20[3], 5));
1568   vst1q_u16(dst + 29 * stride + 24, vextq_u16(d20[3], d20[4], 5));
1569   vst1q_u16(dst + 30 * stride + 0, vextq_u16(d20[0], d20[1], 3));
1570   vst1q_u16(dst + 30 * stride + 8, vextq_u16(d20[1], d20[2], 3));
1571   vst1q_u16(dst + 30 * stride + 16, vextq_u16(d20[2], d20[3], 3));
1572   vst1q_u16(dst + 30 * stride + 24, vextq_u16(d20[3], d20[4], 3));
1573   vst1q_u16(dst + 31 * stride + 0, vextq_u16(d20[0], d20[1], 1));
1574   vst1q_u16(dst + 31 * stride + 8, vextq_u16(d20[1], d20[2], 1));
1575   vst1q_u16(dst + 31 * stride + 16, vextq_u16(d20[2], d20[3], 1));
1576   vst1q_u16(dst + 31 * stride + 24, vextq_u16(d20[3], d20[4], 1));
1577 }
1578 
1579 // -----------------------------------------------------------------------------
1580 
1581 void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
1582                                         const uint16_t *above,
1583                                         const uint16_t *left, int bd) {
1584   const uint16x8_t XA0123___ = vld1q_u16(above - 1);
1585   const uint16x4_t L0123 = vld1_u16(left);
1586   const uint16x4_t L3210 = vrev64_u16(L0123);
1587   const uint16x8_t L____3210 = vcombine_u16(L0123, L3210);
1588   const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___));
1589   const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5);
1590   const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6);
1591   const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_);
1592   const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123);
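  // avg2 now holds the three-tap average of every consecutive triple along
  // [ left[3], left[2], left[1], left[0], above[-1], above[0], ... ]; each
  // output row below is a 4-wide window into it that slides one step toward
  // the left column as the row index increases.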
1593   const uint16x4_t row_0 = vget_low_u16(avg2);
1594   const uint16x4_t row_1 = vget_high_u16(avg2);
1595   const uint16x4_t r0 = vext_u16(row_0, row_1, 3);
1596   const uint16x4_t r1 = vext_u16(row_0, row_1, 2);
1597   const uint16x4_t r2 = vext_u16(row_0, row_1, 1);
1598   (void)bd;
1599   vst1_u16(dst, r0);
1600   dst += stride;
1601   vst1_u16(dst, r1);
1602   dst += stride;
1603   vst1_u16(dst, r2);
1604   dst += stride;
1605   vst1_u16(dst, row_0);
1606 }
1607 
1608 void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
1609                                         const uint16_t *above,
1610                                         const uint16_t *left, int bd) {
1611   const uint16x8_t XA0123456 = vld1q_u16(above - 1);
1612   const uint16x8_t A01234567 = vld1q_u16(above);
1613   const uint16x8_t A1234567_ = vld1q_u16(above + 1);
1614   const uint16x8_t L01234567 = vld1q_u16(left);
1615   const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
1616   const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
1617   const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
1618   const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
1619   const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
1620   const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0);
1621   const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_);
1622   const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X);
1623   const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567);
1624   const uint16x8_t r0 = vextq_u16(row_0, row_1, 7);
1625   const uint16x8_t r1 = vextq_u16(row_0, row_1, 6);
1626   const uint16x8_t r2 = vextq_u16(row_0, row_1, 5);
1627   const uint16x8_t r3 = vextq_u16(row_0, row_1, 4);
1628   const uint16x8_t r4 = vextq_u16(row_0, row_1, 3);
1629   const uint16x8_t r5 = vextq_u16(row_0, row_1, 2);
1630   const uint16x8_t r6 = vextq_u16(row_0, row_1, 1);
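  // row_0 and row_1 together hold the 16 three-tap averages running from the
  // bottom of the left column, through above[-1], and across the above row;
  // r0 is the top output row and each later row slides the 8-wide window one
  // step toward the left column.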
1631   (void)bd;
1632   vst1q_u16(dst, r0);
1633   dst += stride;
1634   vst1q_u16(dst, r1);
1635   dst += stride;
1636   vst1q_u16(dst, r2);
1637   dst += stride;
1638   vst1q_u16(dst, r3);
1639   dst += stride;
1640   vst1q_u16(dst, r4);
1641   dst += stride;
1642   vst1q_u16(dst, r5);
1643   dst += stride;
1644   vst1q_u16(dst, r6);
1645   dst += stride;
1646   vst1q_u16(dst, row_0);
1647 }
1648 
1649 static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride,
1650                                  const uint16x8_t row_0,
1651                                  const uint16x8_t row_1) {
1652   vst1q_u16(*dst, row_0);
1653   *dst += 8;
1654   vst1q_u16(*dst, row_1);
1655   *dst += stride - 8;
1656 }
1657 
1658 void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
1659                                           const uint16_t *above,
1660                                           const uint16_t *left, int bd) {
1661   const uint16x8_t L01234567 = vld1q_u16(left);
1662   const uint16x8_t L89abcdef = vld1q_u16(left + 8);
1663   const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
1664   const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
1665   const uint16x4_t Lba98 = vrev64_u16(vget_low_u16(L89abcdef));
1666   const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef));
1667   const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
1668   const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98);
1669   const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1);
1670   const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2);
1671   const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876);
1672   const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987);
1673 
1674   const uint16x8_t XA0123456 = vld1q_u16(above - 1);
1675   const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
1676   const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
1677   const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0);
1678   const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X);
1679 
1680   const uint16x8_t A01234567 = vld1q_u16(above);
1681   const uint16x8_t A12345678 = vld1q_u16(above + 1);
1682   const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678);
1683   const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567);
1684 
1685   const uint16x8_t A789abcde = vld1q_u16(above + 7);
1686   const uint16x8_t A89abcdef = vld1q_u16(above + 8);
1687   const uint16x8_t A9abcdef_ = vld1q_u16(above + 9);
1688   const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_);
1689   const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef);
1690 
1691   const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7);
1692   const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7);
1693   const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6);
1694   const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6);
1695   const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5);
1696   const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5);
1697   const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4);
1698   const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4);
1699   const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3);
1700   const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3);
1701   const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2);
1702   const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2);
1703   const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1);
1704   const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1);
1705   const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7);
1706   const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6);
1707   const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5);
1708   const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4);
1709   const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3);
1710   const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2);
1711   const uint16x8_t re_0 = vextq_u16(row_0, row_1, 1);
1712   (void)bd;
1713 
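  // row_0..row_3 hold 32 consecutive three-tap averages, starting from the
  // far end of the left column, passing through above[-1], and continuing
  // across the above row; each d135_store_16 call writes one 16-wide window,
  // which slides one step toward the left column per output row.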
1714   d135_store_16(&dst, stride, r0_0, r0_1);
1715   d135_store_16(&dst, stride, r1_0, r1_1);
1716   d135_store_16(&dst, stride, r2_0, r2_1);
1717   d135_store_16(&dst, stride, r3_0, r3_1);
1718   d135_store_16(&dst, stride, r4_0, r4_1);
1719   d135_store_16(&dst, stride, r5_0, r5_1);
1720   d135_store_16(&dst, stride, r6_0, r6_1);
1721   d135_store_16(&dst, stride, row_1, row_2);
1722   d135_store_16(&dst, stride, r8_0, r0_0);
1723   d135_store_16(&dst, stride, r9_0, r1_0);
1724   d135_store_16(&dst, stride, ra_0, r2_0);
1725   d135_store_16(&dst, stride, rb_0, r3_0);
1726   d135_store_16(&dst, stride, rc_0, r4_0);
1727   d135_store_16(&dst, stride, rd_0, r5_0);
1728   d135_store_16(&dst, stride, re_0, r6_0);
1729   vst1q_u16(dst, row_0);
1730   dst += 8;
1731   vst1q_u16(dst, row_1);
1732 }
1733 
1734 void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
1735                                           const uint16_t *above,
1736                                           const uint16_t *left, int bd) {
1737   const uint16x8_t LL01234567 = vld1q_u16(left + 16);
1738   const uint16x8_t LL89abcdef = vld1q_u16(left + 24);
1739   const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567));
1740   const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567));
1741   const uint16x4_t LLba98 = vrev64_u16(vget_low_u16(LL89abcdef));
1742   const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef));
1743   const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210);
1744   const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98);
1745   const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1);
1746   const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2);
1747   const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876);
1748   uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987);
1749 
1750   const uint16x8_t LU01234567 = vld1q_u16(left);
1751   const uint16x8_t LU89abcdef = vld1q_u16(left + 8);
1752   const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567));
1753   const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567));
1754   const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef));
1755   const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef));
1756   const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210);
1757   const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98);
1758   const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1);
1759   const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2);
1760   const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe);
1761   uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf);
1762 
1763   const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1);
1764   const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2);
1765   const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876);
1766   uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987);
1767 
1768   const uint16x8_t XAL0123456 = vld1q_u16(above - 1);
1769   const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1);
1770   const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2);
1771   const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0);
1772   uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X);
1773 
1774   const uint16x8_t AL01234567 = vld1q_u16(above);
1775   const uint16x8_t AL12345678 = vld1q_u16(above + 1);
1776   const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678);
1777   uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567);
1778 
1779   const uint16x8_t AL789abcde = vld1q_u16(above + 7);
1780   const uint16x8_t AL89abcdef = vld1q_u16(above + 8);
1781   const uint16x8_t AL9abcdefg = vld1q_u16(above + 9);
1782   const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg);
1783   uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef);
1784 
1785   const uint16x8_t ALfR0123456 = vld1q_u16(above + 15);
1786   const uint16x8_t AR01234567 = vld1q_u16(above + 16);
1787   const uint16x8_t AR12345678 = vld1q_u16(above + 17);
1788   const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678);
1789   uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567);
1790 
1791   const uint16x8_t AR789abcde = vld1q_u16(above + 23);
1792   const uint16x8_t AR89abcdef = vld1q_u16(above + 24);
1793   const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25);
1794   const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_);
1795   uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef);
1796   int i, j;
1797   (void)bd;
1798 
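  // Write the block from the bottom row upwards: the bottom row is the
  // 32-wide window starting at the far end of the left column (row_0..row_3),
  // and each row above it shifts that window one element toward the above
  // row, refilling from row_4..row_7 as lanes are consumed.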
1799   dst += 31 * stride;
1800   for (i = 0; i < 4; ++i) {
1801     for (j = 0; j < 8; ++j) {
1802       vst1q_u16(dst, row_0);
1803       dst += 8;
1804       vst1q_u16(dst, row_1);
1805       dst += 8;
1806       vst1q_u16(dst, row_2);
1807       dst += 8;
1808       vst1q_u16(dst, row_3);
1809       dst -= stride + 24;
1810       row_0 = vextq_u16(row_0, row_1, 1);
1811       row_1 = vextq_u16(row_1, row_2, 1);
1812       row_2 = vextq_u16(row_2, row_3, 1);
1813       row_3 = vextq_u16(row_3, row_4, 1);
1814       row_4 = vextq_u16(row_4, row_4, 1);
1815     }
1816     row_4 = row_5;
1817     row_5 = row_6;
1818     row_6 = row_7;
1819   }
1820 }
1821 
1822 //------------------------------------------------------------------------------
1823 
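// The d207 predictors use only the left column: c0 holds the two-tap averages
// of adjacent left samples and c1 the three-tap averages; the two are zipped
// so that each output row advances two lanes (one step down the left column),
// padding with the last left sample once the column is exhausted.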
1824 void vpx_highbd_d207_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
1825                                         const uint16_t *above,
1826                                         const uint16_t *left, int bd) {
1827   uint16x4_t l0, l1, l2, l3, c0, c1, c01_lo, c01_hi;
1828   (void)above;
1829   (void)bd;
1830 
1831   l0 = vld1_u16(left + 0);
1832   l3 = vld1_dup_u16(left + 3);
1833 
1834   // [ left[1], left[2], left[3], left[3] ]
1835   l1 = vext_u16(l0, l3, 1);
1836   // [ left[2], left[3], left[3], left[3] ]
1837   l2 = vext_u16(l0, l3, 2);
1838 
1839   c0 = vrhadd_u16(l0, l1);
1840   c1 = vrhadd_u16(vhadd_u16(l0, l2), l1);
1841 
1842   c01_lo = vzip_u16(c0, c1).val[0];
1843   c01_hi = vzip_u16(c0, c1).val[1];
1844 
1845   // stride=0 [ c0[0], c1[0],   c0[1],   c1[1] ]
1846   // stride=1 [ c0[1], c1[1],   c0[2],   c1[2] ]
1847   // stride=2 [ c0[2], c1[2],   c0[3],   c1[3] ]
1848   // stride=3 [ c0[3], c1[3], left[3], left[3] ]
1849   vst1_u16(dst + 0 * stride, c01_lo);
1850   vst1_u16(dst + 1 * stride, vext_u16(c01_lo, c01_hi, 2));
1851   vst1_u16(dst + 2 * stride, c01_hi);
1852   vst1_u16(dst + 3 * stride, vext_u16(c01_hi, l3, 2));
1853 }
1854 
1855 void vpx_highbd_d207_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
1856                                         const uint16_t *above,
1857                                         const uint16_t *left, int bd) {
1858   uint16x8_t l0, l1, l2, l7, c0, c1, c01_lo, c01_hi;
1859   (void)above;
1860   (void)bd;
1861 
1862   l0 = vld1q_u16(left + 0);
1863   l7 = vld1q_dup_u16(left + 7);
1864 
1865   // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ]
1866   l1 = vextq_u16(l0, l7, 1);
1867   // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ]
1868   l2 = vextq_u16(l0, l7, 2);
1869 
1870   c0 = vrhaddq_u16(l0, l1);
1871   c1 = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
1872 
1873   c01_lo = vzipq_u16(c0, c1).val[0];
1874   c01_hi = vzipq_u16(c0, c1).val[1];
1875 
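  // Same layout as the 4x4 version: each row advances two lanes through the
  // zipped { c0, c1 } pairs, with the final rows padded by left[7] (l7).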
1876   vst1q_u16(dst + 0 * stride, c01_lo);
1877   vst1q_u16(dst + 1 * stride, vextq_u16(c01_lo, c01_hi, 2));
1878   vst1q_u16(dst + 2 * stride, vextq_u16(c01_lo, c01_hi, 4));
1879   vst1q_u16(dst + 3 * stride, vextq_u16(c01_lo, c01_hi, 6));
1880   vst1q_u16(dst + 4 * stride, c01_hi);
1881   vst1q_u16(dst + 5 * stride, vextq_u16(c01_hi, l7, 2));
1882   vst1q_u16(dst + 6 * stride, vextq_u16(c01_hi, l7, 4));
1883   vst1q_u16(dst + 7 * stride, vextq_u16(c01_hi, l7, 6));
1884 }
1885 
1886 void vpx_highbd_d207_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
1887                                           const uint16_t *above,
1888                                           const uint16_t *left, int bd) {
1889   uint16x8_t l0, l1, l2, l8, l9, l10, l15, c0[2], c1[2], c01[4];
1890   (void)above;
1891   (void)bd;
1892 
1893   l0 = vld1q_u16(left + 0);
1894   l1 = vld1q_u16(left + 1);
1895   l2 = vld1q_u16(left + 2);
1896   l8 = vld1q_u16(left + 8);
1897   l15 = vld1q_dup_u16(left + 15);
1898 
1899   l9 = vextq_u16(l8, l15, 1);
1900   l10 = vextq_u16(l8, l15, 2);
1901 
1902   c0[0] = vrhaddq_u16(l0, l1);
1903   c0[1] = vrhaddq_u16(l8, l9);
1904   c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
1905   c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9);
1906 
1907   c01[0] = vzipq_u16(c0[0], c1[0]).val[0];
1908   c01[1] = vzipq_u16(c0[0], c1[0]).val[1];
1909   c01[2] = vzipq_u16(c0[1], c1[1]).val[0];
1910   c01[3] = vzipq_u16(c0[1], c1[1]).val[1];
1911 
1912   vst1q_u16(dst + 0 * stride + 0, c01[0]);
1913   vst1q_u16(dst + 0 * stride + 8, c01[1]);
1914   vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2));
1915   vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2));
1916   vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4));
1917   vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4));
1918   vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6));
1919   vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6));
1920 
1921   vst1q_u16(dst + 4 * stride + 0, c01[1]);
1922   vst1q_u16(dst + 4 * stride + 8, c01[2]);
1923   vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2));
1924   vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2));
1925   vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4));
1926   vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4));
1927   vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6));
1928   vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6));
1929 
1930   vst1q_u16(dst + 8 * stride + 0, c01[2]);
1931   vst1q_u16(dst + 8 * stride + 8, c01[3]);
1932   vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2));
1933   vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], l15, 2));
1934   vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4));
1935   vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], l15, 4));
1936   vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6));
1937   vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], l15, 6));
1938 
1939   vst1q_u16(dst + 12 * stride + 0, c01[3]);
1940   vst1q_u16(dst + 12 * stride + 8, l15);
1941   vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], l15, 2));
1942   vst1q_u16(dst + 13 * stride + 8, l15);
1943   vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], l15, 4));
1944   vst1q_u16(dst + 14 * stride + 8, l15);
1945   vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], l15, 6));
1946   vst1q_u16(dst + 15 * stride + 8, l15);
1947 }
1948 
1949 void vpx_highbd_d207_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
1950                                           const uint16_t *above,
1951                                           const uint16_t *left, int bd) {
1952   uint16x8_t l0, l1, l2, l8, l9, l10, l16, l17, l18, l24, l25, l26, l31, c0[4],
1953       c1[4], c01[8];
1954   (void)above;
1955   (void)bd;
1956 
1957   l0 = vld1q_u16(left + 0);
1958   l1 = vld1q_u16(left + 1);
1959   l2 = vld1q_u16(left + 2);
1960   l8 = vld1q_u16(left + 8);
1961   l9 = vld1q_u16(left + 9);
1962   l10 = vld1q_u16(left + 10);
1963   l16 = vld1q_u16(left + 16);
1964   l17 = vld1q_u16(left + 17);
1965   l18 = vld1q_u16(left + 18);
1966   l24 = vld1q_u16(left + 24);
1967   l31 = vld1q_dup_u16(left + 31);
1968 
1969   l25 = vextq_u16(l24, l31, 1);
1970   l26 = vextq_u16(l24, l31, 2);
1971 
1972   c0[0] = vrhaddq_u16(l0, l1);
1973   c0[1] = vrhaddq_u16(l8, l9);
1974   c0[2] = vrhaddq_u16(l16, l17);
1975   c0[3] = vrhaddq_u16(l24, l25);
1976   c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
1977   c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9);
1978   c1[2] = vrhaddq_u16(vhaddq_u16(l16, l18), l17);
1979   c1[3] = vrhaddq_u16(vhaddq_u16(l24, l26), l25);
1980 
1981   c01[0] = vzipq_u16(c0[0], c1[0]).val[0];
1982   c01[1] = vzipq_u16(c0[0], c1[0]).val[1];
1983   c01[2] = vzipq_u16(c0[1], c1[1]).val[0];
1984   c01[3] = vzipq_u16(c0[1], c1[1]).val[1];
1985   c01[4] = vzipq_u16(c0[2], c1[2]).val[0];
1986   c01[5] = vzipq_u16(c0[2], c1[2]).val[1];
1987   c01[6] = vzipq_u16(c0[3], c1[3]).val[0];
1988   c01[7] = vzipq_u16(c0[3], c1[3]).val[1];
1989 
1990   vst1q_u16(dst + 0 * stride + 0, c01[0]);
1991   vst1q_u16(dst + 0 * stride + 8, c01[1]);
1992   vst1q_u16(dst + 0 * stride + 16, c01[2]);
1993   vst1q_u16(dst + 0 * stride + 24, c01[3]);
1994   vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2));
1995   vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2));
1996   vst1q_u16(dst + 1 * stride + 16, vextq_u16(c01[2], c01[3], 2));
1997   vst1q_u16(dst + 1 * stride + 24, vextq_u16(c01[3], c01[4], 2));
1998   vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4));
1999   vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4));
2000   vst1q_u16(dst + 2 * stride + 16, vextq_u16(c01[2], c01[3], 4));
2001   vst1q_u16(dst + 2 * stride + 24, vextq_u16(c01[3], c01[4], 4));
2002   vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6));
2003   vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6));
2004   vst1q_u16(dst + 3 * stride + 16, vextq_u16(c01[2], c01[3], 6));
2005   vst1q_u16(dst + 3 * stride + 24, vextq_u16(c01[3], c01[4], 6));
2006 
2007   vst1q_u16(dst + 4 * stride + 0, c01[1]);
2008   vst1q_u16(dst + 4 * stride + 8, c01[2]);
2009   vst1q_u16(dst + 4 * stride + 16, c01[3]);
2010   vst1q_u16(dst + 4 * stride + 24, c01[4]);
2011   vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2));
2012   vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2));
2013   vst1q_u16(dst + 5 * stride + 16, vextq_u16(c01[3], c01[4], 2));
2014   vst1q_u16(dst + 5 * stride + 24, vextq_u16(c01[4], c01[5], 2));
2015   vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4));
2016   vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4));
2017   vst1q_u16(dst + 6 * stride + 16, vextq_u16(c01[3], c01[4], 4));
2018   vst1q_u16(dst + 6 * stride + 24, vextq_u16(c01[4], c01[5], 4));
2019   vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6));
2020   vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6));
2021   vst1q_u16(dst + 7 * stride + 16, vextq_u16(c01[3], c01[4], 6));
2022   vst1q_u16(dst + 7 * stride + 24, vextq_u16(c01[4], c01[5], 6));
2023 
2024   vst1q_u16(dst + 8 * stride + 0, c01[2]);
2025   vst1q_u16(dst + 8 * stride + 8, c01[3]);
2026   vst1q_u16(dst + 8 * stride + 16, c01[4]);
2027   vst1q_u16(dst + 8 * stride + 24, c01[5]);
2028   vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2));
2029   vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], c01[4], 2));
2030   vst1q_u16(dst + 9 * stride + 16, vextq_u16(c01[4], c01[5], 2));
2031   vst1q_u16(dst + 9 * stride + 24, vextq_u16(c01[5], c01[6], 2));
2032   vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4));
2033   vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], c01[4], 4));
2034   vst1q_u16(dst + 10 * stride + 16, vextq_u16(c01[4], c01[5], 4));
2035   vst1q_u16(dst + 10 * stride + 24, vextq_u16(c01[5], c01[6], 4));
2036   vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6));
2037   vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], c01[4], 6));
2038   vst1q_u16(dst + 11 * stride + 16, vextq_u16(c01[4], c01[5], 6));
2039   vst1q_u16(dst + 11 * stride + 24, vextq_u16(c01[5], c01[6], 6));
2040 
2041   vst1q_u16(dst + 12 * stride + 0, c01[3]);
2042   vst1q_u16(dst + 12 * stride + 8, c01[4]);
2043   vst1q_u16(dst + 12 * stride + 16, c01[5]);
2044   vst1q_u16(dst + 12 * stride + 24, c01[6]);
2045   vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], c01[4], 2));
2046   vst1q_u16(dst + 13 * stride + 8, vextq_u16(c01[4], c01[5], 2));
2047   vst1q_u16(dst + 13 * stride + 16, vextq_u16(c01[5], c01[6], 2));
2048   vst1q_u16(dst + 13 * stride + 24, vextq_u16(c01[6], c01[7], 2));
2049   vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], c01[4], 4));
2050   vst1q_u16(dst + 14 * stride + 8, vextq_u16(c01[4], c01[5], 4));
2051   vst1q_u16(dst + 14 * stride + 16, vextq_u16(c01[5], c01[6], 4));
2052   vst1q_u16(dst + 14 * stride + 24, vextq_u16(c01[6], c01[7], 4));
2053   vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], c01[4], 6));
2054   vst1q_u16(dst + 15 * stride + 8, vextq_u16(c01[4], c01[5], 6));
2055   vst1q_u16(dst + 15 * stride + 16, vextq_u16(c01[5], c01[6], 6));
2056   vst1q_u16(dst + 15 * stride + 24, vextq_u16(c01[6], c01[7], 6));
2057 
2058   vst1q_u16(dst + 16 * stride + 0, c01[4]);
2059   vst1q_u16(dst + 16 * stride + 8, c01[5]);
2060   vst1q_u16(dst + 16 * stride + 16, c01[6]);
2061   vst1q_u16(dst + 16 * stride + 24, c01[7]);
2062   vst1q_u16(dst + 17 * stride + 0, vextq_u16(c01[4], c01[5], 2));
2063   vst1q_u16(dst + 17 * stride + 8, vextq_u16(c01[5], c01[6], 2));
2064   vst1q_u16(dst + 17 * stride + 16, vextq_u16(c01[6], c01[7], 2));
2065   vst1q_u16(dst + 17 * stride + 24, vextq_u16(c01[7], l31, 2));
2066   vst1q_u16(dst + 18 * stride + 0, vextq_u16(c01[4], c01[5], 4));
2067   vst1q_u16(dst + 18 * stride + 8, vextq_u16(c01[5], c01[6], 4));
2068   vst1q_u16(dst + 18 * stride + 16, vextq_u16(c01[6], c01[7], 4));
2069   vst1q_u16(dst + 18 * stride + 24, vextq_u16(c01[7], l31, 4));
2070   vst1q_u16(dst + 19 * stride + 0, vextq_u16(c01[4], c01[5], 6));
2071   vst1q_u16(dst + 19 * stride + 8, vextq_u16(c01[5], c01[6], 6));
2072   vst1q_u16(dst + 19 * stride + 16, vextq_u16(c01[6], c01[7], 6));
2073   vst1q_u16(dst + 19 * stride + 24, vextq_u16(c01[7], l31, 6));
2074 
2075   vst1q_u16(dst + 20 * stride + 0, c01[5]);
2076   vst1q_u16(dst + 20 * stride + 8, c01[6]);
2077   vst1q_u16(dst + 20 * stride + 16, c01[7]);
2078   vst1q_u16(dst + 20 * stride + 24, l31);
2079   vst1q_u16(dst + 21 * stride + 0, vextq_u16(c01[5], c01[6], 2));
2080   vst1q_u16(dst + 21 * stride + 8, vextq_u16(c01[6], c01[7], 2));
2081   vst1q_u16(dst + 21 * stride + 16, vextq_u16(c01[7], l31, 2));
2082   vst1q_u16(dst + 21 * stride + 24, vextq_u16(l31, l31, 2));
2083   vst1q_u16(dst + 22 * stride + 0, vextq_u16(c01[5], c01[6], 4));
2084   vst1q_u16(dst + 22 * stride + 8, vextq_u16(c01[6], c01[7], 4));
2085   vst1q_u16(dst + 22 * stride + 16, vextq_u16(c01[7], l31, 4));
2086   vst1q_u16(dst + 22 * stride + 24, vextq_u16(l31, l31, 4));
2087   vst1q_u16(dst + 23 * stride + 0, vextq_u16(c01[5], c01[6], 6));
2088   vst1q_u16(dst + 23 * stride + 8, vextq_u16(c01[6], c01[7], 6));
2089   vst1q_u16(dst + 23 * stride + 16, vextq_u16(c01[7], l31, 6));
2090   vst1q_u16(dst + 23 * stride + 24, vextq_u16(l31, l31, 6));
2091 
2092   vst1q_u16(dst + 24 * stride + 0, c01[6]);
2093   vst1q_u16(dst + 24 * stride + 8, c01[7]);
2094   vst1q_u16(dst + 24 * stride + 16, l31);
2095   vst1q_u16(dst + 24 * stride + 24, l31);
2096   vst1q_u16(dst + 25 * stride + 0, vextq_u16(c01[6], c01[7], 2));
2097   vst1q_u16(dst + 25 * stride + 8, vextq_u16(c01[7], l31, 2));
2098   vst1q_u16(dst + 25 * stride + 16, vextq_u16(l31, l31, 2));
2099   vst1q_u16(dst + 25 * stride + 24, vextq_u16(l31, l31, 2));
2100   vst1q_u16(dst + 26 * stride + 0, vextq_u16(c01[6], c01[7], 4));
2101   vst1q_u16(dst + 26 * stride + 8, vextq_u16(c01[7], l31, 4));
2102   vst1q_u16(dst + 26 * stride + 16, vextq_u16(l31, l31, 4));
2103   vst1q_u16(dst + 26 * stride + 24, vextq_u16(l31, l31, 4));
2104   vst1q_u16(dst + 27 * stride + 0, vextq_u16(c01[6], c01[7], 6));
2105   vst1q_u16(dst + 27 * stride + 8, vextq_u16(c01[7], l31, 6));
2106   vst1q_u16(dst + 27 * stride + 16, vextq_u16(l31, l31, 6));
2107   vst1q_u16(dst + 27 * stride + 24, vextq_u16(l31, l31, 6));
2108 
2109   vst1q_u16(dst + 28 * stride + 0, c01[7]);
2110   vst1q_u16(dst + 28 * stride + 8, l31);
2111   vst1q_u16(dst + 28 * stride + 16, l31);
2112   vst1q_u16(dst + 28 * stride + 24, l31);
2113   vst1q_u16(dst + 29 * stride + 0, vextq_u16(c01[7], l31, 2));
2114   vst1q_u16(dst + 29 * stride + 8, vextq_u16(l31, l31, 2));
2115   vst1q_u16(dst + 29 * stride + 16, vextq_u16(l31, l31, 2));
2116   vst1q_u16(dst + 29 * stride + 24, vextq_u16(l31, l31, 2));
2117   vst1q_u16(dst + 30 * stride + 0, vextq_u16(c01[7], l31, 4));
2118   vst1q_u16(dst + 30 * stride + 8, vextq_u16(l31, l31, 4));
2119   vst1q_u16(dst + 30 * stride + 16, vextq_u16(l31, l31, 4));
2120   vst1q_u16(dst + 30 * stride + 24, vextq_u16(l31, l31, 4));
2121   vst1q_u16(dst + 31 * stride + 0, vextq_u16(c01[7], l31, 6));
2122   vst1q_u16(dst + 31 * stride + 8, vextq_u16(l31, l31, 6));
2123   vst1q_u16(dst + 31 * stride + 16, vextq_u16(l31, l31, 6));
2124   vst1q_u16(dst + 31 * stride + 24, vextq_u16(l31, l31, 6));
2125 }
2126 
2127 //------------------------------------------------------------------------------
2128 
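// Vertical prediction: every row of the block is a copy of the above row.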
2129 void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
2130                                      const uint16_t *above,
2131                                      const uint16_t *left, int bd) {
2132   const uint16x4_t row = vld1_u16(above);
2133   int i;
2134   (void)left;
2135   (void)bd;
2136 
2137   for (i = 0; i < 4; i++, dst += stride) {
2138     vst1_u16(dst, row);
2139   }
2140 }
2141 
2142 void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
2143                                      const uint16_t *above,
2144                                      const uint16_t *left, int bd) {
2145   const uint16x8_t row = vld1q_u16(above);
2146   int i;
2147   (void)left;
2148   (void)bd;
2149 
2150   for (i = 0; i < 8; i++, dst += stride) {
2151     vst1q_u16(dst, row);
2152   }
2153 }
2154 
2155 void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
2156                                        const uint16_t *above,
2157                                        const uint16_t *left, int bd) {
2158   const uint16x8_t row0 = vld1q_u16(above + 0);
2159   const uint16x8_t row1 = vld1q_u16(above + 8);
2160   int i;
2161   (void)left;
2162   (void)bd;
2163 
2164   for (i = 0; i < 16; i++) {
2165     vst1q_u16(dst + 0, row0);
2166     vst1q_u16(dst + 8, row1);
2167     dst += stride;
2168   }
2169 }
2170 
2171 void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
2172                                        const uint16_t *above,
2173                                        const uint16_t *left, int bd) {
2174   const uint16x8_t row0 = vld1q_u16(above + 0);
2175   const uint16x8_t row1 = vld1q_u16(above + 8);
2176   const uint16x8_t row2 = vld1q_u16(above + 16);
2177   const uint16x8_t row3 = vld1q_u16(above + 24);
2178   int i;
2179   (void)left;
2180   (void)bd;
2181 
2182   for (i = 0; i < 32; i++) {
2183     vst1q_u16(dst + 0, row0);
2184     vst1q_u16(dst + 8, row1);
2185     vst1q_u16(dst + 16, row2);
2186     vst1q_u16(dst + 24, row3);
2187     dst += stride;
2188   }
2189 }
2190 
2191 // -----------------------------------------------------------------------------
2192 
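// Horizontal prediction: row i of the block is filled with left[i].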
2193 void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
2194                                      const uint16_t *above,
2195                                      const uint16_t *left, int bd) {
2196   const uint16x4_t left_u16 = vld1_u16(left);
2197   uint16x4_t row;
2198   (void)above;
2199   (void)bd;
2200 
2201   row = vdup_lane_u16(left_u16, 0);
2202   vst1_u16(dst, row);
2203   dst += stride;
2204   row = vdup_lane_u16(left_u16, 1);
2205   vst1_u16(dst, row);
2206   dst += stride;
2207   row = vdup_lane_u16(left_u16, 2);
2208   vst1_u16(dst, row);
2209   dst += stride;
2210   row = vdup_lane_u16(left_u16, 3);
2211   vst1_u16(dst, row);
2212 }
2213 
2214 void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
2215                                      const uint16_t *above,
2216                                      const uint16_t *left, int bd) {
2217   const uint16x8_t left_u16 = vld1q_u16(left);
2218   const uint16x4_t left_low = vget_low_u16(left_u16);
2219   const uint16x4_t left_high = vget_high_u16(left_u16);
2220   uint16x8_t row;
2221   (void)above;
2222   (void)bd;
2223 
2224   row = vdupq_lane_u16(left_low, 0);
2225   vst1q_u16(dst, row);
2226   dst += stride;
2227   row = vdupq_lane_u16(left_low, 1);
2228   vst1q_u16(dst, row);
2229   dst += stride;
2230   row = vdupq_lane_u16(left_low, 2);
2231   vst1q_u16(dst, row);
2232   dst += stride;
2233   row = vdupq_lane_u16(left_low, 3);
2234   vst1q_u16(dst, row);
2235   dst += stride;
2236   row = vdupq_lane_u16(left_high, 0);
2237   vst1q_u16(dst, row);
2238   dst += stride;
2239   row = vdupq_lane_u16(left_high, 1);
2240   vst1q_u16(dst, row);
2241   dst += stride;
2242   row = vdupq_lane_u16(left_high, 2);
2243   vst1q_u16(dst, row);
2244   dst += stride;
2245   row = vdupq_lane_u16(left_high, 3);
2246   vst1q_u16(dst, row);
2247 }
2248 
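// Store one 16-pixel row of a single replicated value as two 8-lane stores,
// then advance *dst to the start of the next row (stride is in uint16_t
// elements, not bytes).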
2249 static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride,
2250                               const uint16x8_t row) {
2251   // Note: vst1q is faster than vst2q
2252   vst1q_u16(*dst, row);
2253   *dst += 8;
2254   vst1q_u16(*dst, row);
2255   *dst += stride - 8;
2256 }
2257 
2258 void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
2259                                        const uint16_t *above,
2260                                        const uint16_t *left, int bd) {
2261   int i;
2262   (void)above;
2263   (void)bd;
2264 
2265   for (i = 0; i < 2; i++, left += 8) {
2266     const uint16x8_t left_u16q = vld1q_u16(left);
2267     const uint16x4_t left_low = vget_low_u16(left_u16q);
2268     const uint16x4_t left_high = vget_high_u16(left_u16q);
2269     uint16x8_t row;
2270 
2271     row = vdupq_lane_u16(left_low, 0);
2272     h_store_16(&dst, stride, row);
2273     row = vdupq_lane_u16(left_low, 1);
2274     h_store_16(&dst, stride, row);
2275     row = vdupq_lane_u16(left_low, 2);
2276     h_store_16(&dst, stride, row);
2277     row = vdupq_lane_u16(left_low, 3);
2278     h_store_16(&dst, stride, row);
2279     row = vdupq_lane_u16(left_high, 0);
2280     h_store_16(&dst, stride, row);
2281     row = vdupq_lane_u16(left_high, 1);
2282     h_store_16(&dst, stride, row);
2283     row = vdupq_lane_u16(left_high, 2);
2284     h_store_16(&dst, stride, row);
2285     row = vdupq_lane_u16(left_high, 3);
2286     h_store_16(&dst, stride, row);
2287   }
2288 }
2289 
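// Same as h_store_16 but for a 32-pixel row: four 8-lane stores of the same
// replicated value, then step to the next row.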
2290 static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride,
2291                               const uint16x8_t row) {
2292   // Note: vst1q is faster than vst2q
2293   vst1q_u16(*dst, row);
2294   *dst += 8;
2295   vst1q_u16(*dst, row);
2296   *dst += 8;
2297   vst1q_u16(*dst, row);
2298   *dst += 8;
2299   vst1q_u16(*dst, row);
2300   *dst += stride - 24;
2301 }
2302 
2303 void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
2304                                        const uint16_t *above,
2305                                        const uint16_t *left, int bd) {
2306   int i;
2307   (void)above;
2308   (void)bd;
2309 
2310   for (i = 0; i < 4; i++, left += 8) {
2311     const uint16x8_t left_u16q = vld1q_u16(left);
2312     const uint16x4_t left_low = vget_low_u16(left_u16q);
2313     const uint16x4_t left_high = vget_high_u16(left_u16q);
2314     uint16x8_t row;
2315 
2316     row = vdupq_lane_u16(left_low, 0);
2317     h_store_32(&dst, stride, row);
2318     row = vdupq_lane_u16(left_low, 1);
2319     h_store_32(&dst, stride, row);
2320     row = vdupq_lane_u16(left_low, 2);
2321     h_store_32(&dst, stride, row);
2322     row = vdupq_lane_u16(left_low, 3);
2323     h_store_32(&dst, stride, row);
2324     row = vdupq_lane_u16(left_high, 0);
2325     h_store_32(&dst, stride, row);
2326     row = vdupq_lane_u16(left_high, 1);
2327     h_store_32(&dst, stride, row);
2328     row = vdupq_lane_u16(left_high, 2);
2329     h_store_32(&dst, stride, row);
2330     row = vdupq_lane_u16(left_high, 3);
2331     h_store_32(&dst, stride, row);
2332   }
2333 }
2334 
2335 // -----------------------------------------------------------------------------
2336 
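// TrueMotion (TM) prediction: pred[r][c] = clip(left[r] + above[c] - above[-1])
// with the clip range [0, (1 << bd) - 1]. The column term above[c] - above[-1]
// is precomputed once as `sub`, each left sample is broadcast and added,
// vminq_s16 applies the upper clamp, and vqshluq_n_s16(x, 0) saturates
// negative results to zero for the lower clamp. A scalar sketch of the same
// computation, assuming a hypothetical clip_pixel(v, bd) helper that clamps
// to [0, (1 << bd) - 1]:
//   for (r = 0; r < size; ++r)
//     for (c = 0; c < size; ++c)
//       dst[r * stride + c] = clip_pixel(left[r] + above[c] - above[-1], bd);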
2337 void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
2338                                       const uint16_t *above,
2339                                       const uint16_t *left, int bd) {
2340   const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
2341   const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
2342   const int16x4_t above_s16d = vld1_s16((const int16_t *)above);
2343   const int16x8_t above_s16 = vcombine_s16(above_s16d, above_s16d);
2344   const int16x4_t left_s16 = vld1_s16((const int16_t *)left);
2345   const int16x8_t sub = vsubq_s16(above_s16, top_left);
2346   int16x8_t sum;
2347   uint16x8_t row;
2348 
2349   sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1));
2350   sum = vaddq_s16(sum, sub);
2351   sum = vminq_s16(sum, max);
2352   row = vqshluq_n_s16(sum, 0);
2353   vst1_u16(dst, vget_low_u16(row));
2354   dst += stride;
2355   vst1_u16(dst, vget_high_u16(row));
2356   dst += stride;
2357 
2358   sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3));
2359   sum = vaddq_s16(sum, sub);
2360   sum = vminq_s16(sum, max);
2361   row = vqshluq_n_s16(sum, 0);
2362   vst1_u16(dst, vget_low_u16(row));
2363   dst += stride;
2364   vst1_u16(dst, vget_high_u16(row));
2365 }
2366 
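// Compute and store one 8-wide TM row: add the broadcast left sample to the
// precomputed (above - top_left) terms, clamp to max, and saturate negatives
// to zero via vqshluq_n_s16.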
2367 static INLINE void tm_8_kernel(uint16_t **dst, const ptrdiff_t stride,
2368                                const int16x8_t left_dup, const int16x8_t sub,
2369                                const int16x8_t max) {
2370   uint16x8_t row;
2371   int16x8_t sum = vaddq_s16(left_dup, sub);
2372   sum = vminq_s16(sum, max);
2373   row = vqshluq_n_s16(sum, 0);
2374   vst1q_u16(*dst, row);
2375   *dst += stride;
2376 }
2377 
2378 void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
2379                                       const uint16_t *above,
2380                                       const uint16_t *left, int bd) {
2381   const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
2382   const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
2383   const int16x8_t above_s16 = vld1q_s16((const int16_t *)above);
2384   const int16x8_t left_s16 = vld1q_s16((const int16_t *)left);
2385   const int16x8_t sub = vsubq_s16(above_s16, top_left);
2386   int16x4_t left_s16d;
2387   int16x8_t left_dup;
2388   int i;
2389 
2390   left_s16d = vget_low_s16(left_s16);
2391 
2392   for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16)) {
2393     left_dup = vdupq_lane_s16(left_s16d, 0);
2394     tm_8_kernel(&dst, stride, left_dup, sub, max);
2395 
2396     left_dup = vdupq_lane_s16(left_s16d, 1);
2397     tm_8_kernel(&dst, stride, left_dup, sub, max);
2398 
2399     left_dup = vdupq_lane_s16(left_s16d, 2);
2400     tm_8_kernel(&dst, stride, left_dup, sub, max);
2401 
2402     left_dup = vdupq_lane_s16(left_s16d, 3);
2403     tm_8_kernel(&dst, stride, left_dup, sub, max);
2404   }
2405 }
2406 
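// Same as tm_8_kernel but for a 16-wide row: the broadcast left sample is
// added to both 8-lane halves of the (above - top_left) row before clamping
// and storing.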
2407 static INLINE void tm_16_kernel(uint16_t **dst, const ptrdiff_t stride,
2408                                 const int16x8_t left_dup, const int16x8_t sub0,
2409                                 const int16x8_t sub1, const int16x8_t max) {
2410   uint16x8_t row0, row1;
2411   int16x8_t sum0 = vaddq_s16(left_dup, sub0);
2412   int16x8_t sum1 = vaddq_s16(left_dup, sub1);
2413   sum0 = vminq_s16(sum0, max);
2414   sum1 = vminq_s16(sum1, max);
2415   row0 = vqshluq_n_s16(sum0, 0);
2416   row1 = vqshluq_n_s16(sum1, 0);
2417   vst1q_u16(*dst, row0);
2418   *dst += 8;
2419   vst1q_u16(*dst, row1);
2420   *dst += stride - 8;
2421 }
2422 
2423 void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
2424                                         const uint16_t *above,
2425                                         const uint16_t *left, int bd) {
2426   const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
2427   const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
2428   const int16x8_t above0 = vld1q_s16((const int16_t *)above);
2429   const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
2430   const int16x8_t sub0 = vsubq_s16(above0, top_left);
2431   const int16x8_t sub1 = vsubq_s16(above1, top_left);
2432   int16x8_t left_dup;
2433   int i, j;
2434 
2435   for (j = 0; j < 2; j++, left += 8) {
2436     const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
2437     int16x4_t left_s16d = vget_low_s16(left_s16q);
2438     for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
2439       left_dup = vdupq_lane_s16(left_s16d, 0);
2440       tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
2441 
2442       left_dup = vdupq_lane_s16(left_s16d, 1);
2443       tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
2444 
2445       left_dup = vdupq_lane_s16(left_s16d, 2);
2446       tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
2447 
2448       left_dup = vdupq_lane_s16(left_s16d, 3);
2449       tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
2450     }
2451   }
2452 }
2453 
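// 32-wide variant of the TM row kernel: four 8-lane halves per row.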
2454 static INLINE void tm_32_kernel(uint16_t **dst, const ptrdiff_t stride,
2455                                 const int16x8_t left_dup, const int16x8_t sub0,
2456                                 const int16x8_t sub1, const int16x8_t sub2,
2457                                 const int16x8_t sub3, const int16x8_t max) {
2458   uint16x8_t row0, row1, row2, row3;
2459   int16x8_t sum0 = vaddq_s16(left_dup, sub0);
2460   int16x8_t sum1 = vaddq_s16(left_dup, sub1);
2461   int16x8_t sum2 = vaddq_s16(left_dup, sub2);
2462   int16x8_t sum3 = vaddq_s16(left_dup, sub3);
2463   sum0 = vminq_s16(sum0, max);
2464   sum1 = vminq_s16(sum1, max);
2465   sum2 = vminq_s16(sum2, max);
2466   sum3 = vminq_s16(sum3, max);
2467   row0 = vqshluq_n_s16(sum0, 0);
2468   row1 = vqshluq_n_s16(sum1, 0);
2469   row2 = vqshluq_n_s16(sum2, 0);
2470   row3 = vqshluq_n_s16(sum3, 0);
2471   vst1q_u16(*dst, row0);
2472   *dst += 8;
2473   vst1q_u16(*dst, row1);
2474   *dst += 8;
2475   vst1q_u16(*dst, row2);
2476   *dst += 8;
2477   vst1q_u16(*dst, row3);
2478   *dst += stride - 24;
2479 }
2480 
2481 void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
2482                                         const uint16_t *above,
2483                                         const uint16_t *left, int bd) {
2484   const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
2485   const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
2486   const int16x8_t above0 = vld1q_s16((const int16_t *)above);
2487   const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
2488   const int16x8_t above2 = vld1q_s16((const int16_t *)(above + 16));
2489   const int16x8_t above3 = vld1q_s16((const int16_t *)(above + 24));
2490   const int16x8_t sub0 = vsubq_s16(above0, top_left);
2491   const int16x8_t sub1 = vsubq_s16(above1, top_left);
2492   const int16x8_t sub2 = vsubq_s16(above2, top_left);
2493   const int16x8_t sub3 = vsubq_s16(above3, top_left);
2494   int16x8_t left_dup;
2495   int i, j;
2496 
2497   for (i = 0; i < 4; i++, left += 8) {
2498     const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
2499     int16x4_t left_s16d = vget_low_s16(left_s16q);
2500     for (j = 0; j < 2; j++, left_s16d = vget_high_s16(left_s16q)) {
2501       left_dup = vdupq_lane_s16(left_s16d, 0);
2502       tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
2503 
2504       left_dup = vdupq_lane_s16(left_s16d, 1);
2505       tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
2506 
2507       left_dup = vdupq_lane_s16(left_s16d, 2);
2508       tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
2509 
2510       left_dup = vdupq_lane_s16(left_s16d, 3);
2511       tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
2512     }
2513   }
2514 }
2515