/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "mem_neon.h"
#include "sum_neon.h"
#include "vpx/vpx_integer.h"

//------------------------------------------------------------------------------
// DC 4x4

static INLINE uint16_t dc_sum_4(const uint8_t *ref) {
  return horizontal_add_uint8x4(load_unaligned_u8_4x1(ref));
}

static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride,
                                const uint8x8_t dc) {
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc), 0);
  }
}

void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t a = load_unaligned_u8_4x1(above);
  const uint8x8_t l = load_unaligned_u8_4x1(left);
  const uint16x4_t al = vget_low_u16(vaddl_u8(a, l));
  const uint16_t sum = horizontal_add_uint16x4(al);
  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3);
  dc_store_4x4(dst, stride, dc);
}
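
// Illustrative scalar sketch (not part of the original source): for the 4x4
// DC predictor the vector code above reduces to
//
//   int i, sum = 0;
//   for (i = 0; i < 4; ++i) sum += above[i] + left[i];
//   dc = (uint8_t)((sum + 4) >> 3);  // vrshrn_n_u16(., 3) is a rounding shift
//
// and the resulting dc value is written to every pixel of the block.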

void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint16_t sum = dc_sum_4(left);
  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2);
  (void)above;
  dc_store_4x4(dst, stride, dc);
}

void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint16_t sum = dc_sum_4(above);
  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2);
  (void)left;
  dc_store_4x4(dst, stride, dc);
}

void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8x8_t dc = vdup_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_4x4(dst, stride, dc);
}

//------------------------------------------------------------------------------
// DC 8x8

static INLINE uint16_t dc_sum_8(const uint8_t *ref) {
  return horizontal_add_uint8x8(vld1_u8(ref));
}

static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride,
                                const uint8x8_t dc) {
  int i;
  for (i = 0; i < 8; ++i, dst += stride) {
    vst1_u8(dst, dc);
  }
}

void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t above_u8 = vld1_u8(above);
  const uint8x8_t left_u8 = vld1_u8(left);
  const uint16x8_t al = vaddl_u8(above_u8, left_u8);
  const uint16_t sum = horizontal_add_uint16x8(al);
  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 4);
  dc_store_8x8(dst, stride, dc);
}

void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint16_t sum = dc_sum_8(left);
  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3);
  (void)above;
  dc_store_8x8(dst, stride, dc);
}

void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint16_t sum = dc_sum_8(above);
  const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3);
  (void)left;
  dc_store_8x8(dst, stride, dc);
}

void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8x8_t dc = vdup_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_8x8(dst, stride, dc);
}

//------------------------------------------------------------------------------
// DC 16x16

static INLINE uint16_t dc_sum_16(const uint8_t *ref) {
  return horizontal_add_uint8x16(vld1q_u8(ref));
}

static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride,
                                  const uint8x16_t dc) {
  int i;
  for (i = 0; i < 16; ++i, dst += stride) {
    vst1q_u8(dst + 0, dc);
  }
}

void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t ref0 = vld1q_u8(above);
  const uint8x16_t ref1 = vld1q_u8(left);
  const uint16x8_t a = vpaddlq_u8(ref0);
  const uint16x8_t l = vpaddlq_u8(ref1);
  const uint16x8_t al = vaddq_u16(a, l);
  const uint16_t sum = horizontal_add_uint16x8(al);
  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0);
  dc_store_16x16(dst, stride, dc);
}

void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint16_t sum = dc_sum_16(left);
  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0);
  (void)above;
  dc_store_16x16(dst, stride, dc);
}

void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint16_t sum = dc_sum_16(above);
  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0);
  (void)left;
  dc_store_16x16(dst, stride, dc);
}

void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8x16_t dc = vdupq_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_16x16(dst, stride, dc);
}

//------------------------------------------------------------------------------
// DC 32x32

static INLINE uint16_t dc_sum_32(const uint8_t *ref) {
  const uint8x16_t r0 = vld1q_u8(ref + 0);
  const uint8x16_t r1 = vld1q_u8(ref + 16);
  const uint16x8_t r01 = vaddq_u16(vpaddlq_u8(r0), vpaddlq_u8(r1));
  return horizontal_add_uint16x8(r01);
}
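
// Illustrative scalar sketch (not part of the original source): dc_sum_32 is
// just the plain sum of the 32 reference bytes,
//
//   uint16_t sum = 0;
//   for (i = 0; i < 32; ++i) sum += ref[i];
//
// computed with widening pairwise adds (vpaddlq_u8) so the 8-bit inputs are
// accumulated in 16 bits; 32 * 255 = 8160, so the sum cannot overflow.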

static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride,
                                  const uint8x16_t dc) {
  int i;
  for (i = 0; i < 32; ++i, dst += stride) {
    vst1q_u8(dst + 0, dc);
    vst1q_u8(dst + 16, dc);
  }
}

void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t a0 = vld1q_u8(above + 0);
  const uint8x16_t a1 = vld1q_u8(above + 16);
  const uint8x16_t l0 = vld1q_u8(left + 0);
  const uint8x16_t l1 = vld1q_u8(left + 16);
  const uint16x8_t a01 = vaddq_u16(vpaddlq_u8(a0), vpaddlq_u8(a1));
  const uint16x8_t l01 = vaddq_u16(vpaddlq_u8(l0), vpaddlq_u8(l1));
  const uint16x8_t al = vaddq_u16(a01, l01);
  const uint16_t sum = horizontal_add_uint16x8(al);
  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 6), 0);
  dc_store_32x32(dst, stride, dc);
}

void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint16_t sum = dc_sum_32(left);
  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0);
  (void)above;
  dc_store_32x32(dst, stride, dc);
}

void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint16_t sum = dc_sum_32(above);
  const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0);
  (void)left;
  dc_store_32x32(dst, stride, dc);
}

void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8x16_t dc = vdupq_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_32x32(dst, stride, dc);
}

// -----------------------------------------------------------------------------

void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t a0, a1, a2, d0;
  uint8_t a7;
  (void)left;

  a0 = vld1_u8(above);
  a7 = above[7];

  // [ above[1], ..., above[6], x, x ]
  a1 = vext_u8(a0, a0, 1);
  // [ above[2], ..., above[7], x, x ]
  a2 = vext_u8(a0, a0, 2);

  // d0[0] = AVG3(above[0], above[1], above[2]);
  // ...
  // d0[5] = AVG3(above[5], above[6], above[7]);
  // d0[6] = x (don't care)
  // d0[7] = x (don't care)
  d0 = vrhadd_u8(vhadd_u8(a0, a2), a1);

  // We want:
  // stride=0 [ d0[0], d0[1], d0[2],    d0[3] ]
  // stride=1 [ d0[1], d0[2], d0[3],    d0[4] ]
  // stride=2 [ d0[2], d0[3], d0[4],    d0[5] ]
  // stride=3 [ d0[3], d0[4], d0[5], above[7] ]
  store_u8_4x1(dst + 0 * stride, d0);
  store_u8_4x1(dst + 1 * stride, vext_u8(d0, d0, 1));
  store_u8_4x1(dst + 2 * stride, vext_u8(d0, d0, 2));
  store_u8_4x1(dst + 3 * stride, vext_u8(d0, d0, 3));

  // The store above wrote d0[6] (a don't-care lane) into this position, so
  // overwrite it with above[7].
  dst[3 * stride + 3] = a7;
}
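
// Illustrative sketch (not part of the original source): the vhadd/vrhadd
// pair above is an exact way to evaluate the scalar three-tap average
//
//   AVG3(a, b, c) = (a + 2 * b + c + 2) >> 2
//
// without leaving 8 bits, since for any uint8_t values
//   AVG3(a, b, c) == (((a + c) >> 1) + b + 1) >> 1
//                 == vrhadd_u8(vhadd_u8(a, c), b)  (applied lane by lane).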

void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t ax0, a0, a1, a7, d0;
  (void)left;

  a0 = vld1_u8(above + 0);
  a1 = vld1_u8(above + 1);
  a7 = vld1_dup_u8(above + 7);

  // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can
  // shift in above[7] later, so shift a0 across by one to get the right
  // inputs:
  // [ x, above[0], ... , above[6] ]
  ax0 = vext_u8(a0, a0, 7);

  // d0[0] = x (don't care)
  // d0[1] = AVG3(above[0], above[1], above[2]);
  // ...
  // d0[7] = AVG3(above[6], above[7], above[8]);
  d0 = vrhadd_u8(vhadd_u8(ax0, a1), a0);

  // Undo the earlier ext, incrementally shift in duplicates of above[7].
  vst1_u8(dst + 0 * stride, vext_u8(d0, a7, 1));
  vst1_u8(dst + 1 * stride, vext_u8(d0, a7, 2));
  vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 3));
  vst1_u8(dst + 3 * stride, vext_u8(d0, a7, 4));
  vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 5));
  vst1_u8(dst + 5 * stride, vext_u8(d0, a7, 6));
  vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 7));
  vst1_u8(dst + 7 * stride, a7);
}

void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
  uint8x16_t ax0, a0, a1, a15, d0;
  (void)left;

  a0 = vld1q_u8(above + 0);
  a1 = vld1q_u8(above + 1);
  a15 = vld1q_dup_u8(above + 15);

  // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can
  // shift in above[15] later, so shift a0 across by one to get the right
  // inputs:
  // [ x, above[0], ... , above[14] ]
  ax0 = vextq_u8(a0, a0, 15);

  // d0[0] = x (don't care)
  // d0[1] = AVG3(above[0], above[1], above[2]);
  // ...
  // d0[15] = AVG3(above[14], above[15], above[16]);
  d0 = vrhaddq_u8(vhaddq_u8(ax0, a1), a0);

  // Undo the earlier ext, incrementally shift in duplicates of above[15].
  vst1q_u8(dst + 0 * stride, vextq_u8(d0, a15, 1));
  vst1q_u8(dst + 1 * stride, vextq_u8(d0, a15, 2));
  vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 3));
  vst1q_u8(dst + 3 * stride, vextq_u8(d0, a15, 4));
  vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 5));
  vst1q_u8(dst + 5 * stride, vextq_u8(d0, a15, 6));
  vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 7));
  vst1q_u8(dst + 7 * stride, vextq_u8(d0, a15, 8));
  vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 9));
  vst1q_u8(dst + 9 * stride, vextq_u8(d0, a15, 10));
  vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 11));
  vst1q_u8(dst + 11 * stride, vextq_u8(d0, a15, 12));
  vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 13));
  vst1q_u8(dst + 13 * stride, vextq_u8(d0, a15, 14));
  vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 15));
  vst1q_u8(dst + 15 * stride, a15);
}

void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
  uint8x16_t ax0, a0, a1, a15, a16, a17, a31, d0[2];
  (void)left;

  a0 = vld1q_u8(above + 0);
  a1 = vld1q_u8(above + 1);
  a15 = vld1q_u8(above + 15);
  a16 = vld1q_u8(above + 16);
  a17 = vld1q_u8(above + 17);
  a31 = vld1q_dup_u8(above + 31);

  // We want to calculate the AVG3 result in lanes 1-15 inclusive of the low
  // half so we can shift in the high half (and, further down, above[31])
  // later, so shift a0 across by one to get the right inputs:
  // [ x, above[0], ... , above[14] ]
  ax0 = vextq_u8(a0, a0, 15);

  // d0[0] = x (don't care)
  // d0[1] = AVG3(above[0], above[1], above[2]);
  // ...
  // d0[15] = AVG3(above[14], above[15], above[16]);
  d0[0] = vrhaddq_u8(vhaddq_u8(ax0, a1), a0);
  d0[1] = vrhaddq_u8(vhaddq_u8(a15, a17), a16);

  // Undo the earlier ext, incrementally shifting in elements of d0[1] followed
  // by duplicates of above[31].
  vst1q_u8(dst + 0 * stride + 0, vextq_u8(d0[0], d0[1], 1));
  vst1q_u8(dst + 0 * stride + 16, vextq_u8(d0[1], a31, 1));
  vst1q_u8(dst + 1 * stride + 0, vextq_u8(d0[0], d0[1], 2));
  vst1q_u8(dst + 1 * stride + 16, vextq_u8(d0[1], a31, 2));
  vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0[0], d0[1], 3));
  vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0[1], a31, 3));
  vst1q_u8(dst + 3 * stride + 0, vextq_u8(d0[0], d0[1], 4));
  vst1q_u8(dst + 3 * stride + 16, vextq_u8(d0[1], a31, 4));
  vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0[0], d0[1], 5));
  vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0[1], a31, 5));
  vst1q_u8(dst + 5 * stride + 0, vextq_u8(d0[0], d0[1], 6));
  vst1q_u8(dst + 5 * stride + 16, vextq_u8(d0[1], a31, 6));
  vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0[0], d0[1], 7));
  vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0[1], a31, 7));
  vst1q_u8(dst + 7 * stride + 0, vextq_u8(d0[0], d0[1], 8));
  vst1q_u8(dst + 7 * stride + 16, vextq_u8(d0[1], a31, 8));
  vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0[0], d0[1], 9));
  vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0[1], a31, 9));
  vst1q_u8(dst + 9 * stride + 0, vextq_u8(d0[0], d0[1], 10));
  vst1q_u8(dst + 9 * stride + 16, vextq_u8(d0[1], a31, 10));
  vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0[0], d0[1], 11));
  vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0[1], a31, 11));
  vst1q_u8(dst + 11 * stride + 0, vextq_u8(d0[0], d0[1], 12));
  vst1q_u8(dst + 11 * stride + 16, vextq_u8(d0[1], a31, 12));
  vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0[0], d0[1], 13));
  vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0[1], a31, 13));
  vst1q_u8(dst + 13 * stride + 0, vextq_u8(d0[0], d0[1], 14));
  vst1q_u8(dst + 13 * stride + 16, vextq_u8(d0[1], a31, 14));
  vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0[0], d0[1], 15));
  vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0[1], a31, 15));
  vst1q_u8(dst + 15 * stride + 0, d0[1]);
  vst1q_u8(dst + 15 * stride + 16, a31);

  vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0[1], a31, 1));
  vst1q_u8(dst + 16 * stride + 16, a31);
  vst1q_u8(dst + 17 * stride + 0, vextq_u8(d0[1], a31, 2));
  vst1q_u8(dst + 17 * stride + 16, a31);
  vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0[1], a31, 3));
  vst1q_u8(dst + 18 * stride + 16, a31);
  vst1q_u8(dst + 19 * stride + 0, vextq_u8(d0[1], a31, 4));
  vst1q_u8(dst + 19 * stride + 16, a31);
  vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0[1], a31, 5));
  vst1q_u8(dst + 20 * stride + 16, a31);
  vst1q_u8(dst + 21 * stride + 0, vextq_u8(d0[1], a31, 6));
  vst1q_u8(dst + 21 * stride + 16, a31);
  vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0[1], a31, 7));
  vst1q_u8(dst + 22 * stride + 16, a31);
  vst1q_u8(dst + 23 * stride + 0, vextq_u8(d0[1], a31, 8));
  vst1q_u8(dst + 23 * stride + 16, a31);
  vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0[1], a31, 9));
  vst1q_u8(dst + 24 * stride + 16, a31);
  vst1q_u8(dst + 25 * stride + 0, vextq_u8(d0[1], a31, 10));
  vst1q_u8(dst + 25 * stride + 16, a31);
  vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0[1], a31, 11));
  vst1q_u8(dst + 26 * stride + 16, a31);
  vst1q_u8(dst + 27 * stride + 0, vextq_u8(d0[1], a31, 12));
  vst1q_u8(dst + 27 * stride + 16, a31);
  vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0[1], a31, 13));
  vst1q_u8(dst + 28 * stride + 16, a31);
  vst1q_u8(dst + 29 * stride + 0, vextq_u8(d0[1], a31, 14));
  vst1q_u8(dst + 29 * stride + 16, a31);
  vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0[1], a31, 15));
  vst1q_u8(dst + 30 * stride + 16, a31);
  vst1q_u8(dst + 31 * stride + 0, a31);
  vst1q_u8(dst + 31 * stride + 16, a31);
}

// -----------------------------------------------------------------------------

void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t a0, a1, a2, a3, d0, d1, d2, d3;
  (void)left;

  a0 = load_unaligned_u8_4x1(above + 0);
  a1 = load_unaligned_u8_4x1(above + 1);
  a2 = load_unaligned_u8_4x1(above + 2);
  a3 = load_unaligned_u8_4x1(above + 3);

  d0 = vrhadd_u8(a0, a1);
  d1 = vrhadd_u8(vhadd_u8(a0, a2), a1);
  d2 = vrhadd_u8(a1, a2);
  d3 = vrhadd_u8(vhadd_u8(a1, a3), a2);

  store_u8_4x1(dst + 0 * stride, d0);
  store_u8_4x1(dst + 1 * stride, d1);
  store_u8_4x1(dst + 2 * stride, d2);
  store_u8_4x1(dst + 3 * stride, d3);
}
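
// Illustrative scalar sketch (not part of the original source): d63 rows
// alternate between the two averaging kernels and advance along the above row
// by one pixel every two rows. For the 4x4 case above this is
//
//   row 0: dst[c] = AVG2(above[c],     above[c + 1]);
//   row 1: dst[c] = AVG3(above[c],     above[c + 1], above[c + 2]);
//   row 2: dst[c] = AVG2(above[c + 1], above[c + 2]);
//   row 3: dst[c] = AVG3(above[c + 1], above[c + 2], above[c + 3]);
//
// where AVG2(a, b) = (a + b + 1) >> 1 is computed by vrhadd_u8 directly.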

void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t a0, a1, a2, a7, d0, d1;
  (void)left;

  a0 = vld1_u8(above + 0);
  a1 = vld1_u8(above + 1);
  a2 = vld1_u8(above + 2);
  a7 = vld1_dup_u8(above + 7);

  d0 = vrhadd_u8(a0, a1);
  d1 = vrhadd_u8(vhadd_u8(a0, a2), a1);

  vst1_u8(dst + 0 * stride, d0);
  vst1_u8(dst + 1 * stride, d1);

  d0 = vext_u8(d0, d0, 7);
  d1 = vext_u8(d1, d1, 7);

  vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 2));
  vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 2));
  vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 3));
  vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 3));
  vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 4));
  vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 4));
}

void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
  uint8x16_t a0, a1, a2, a15, d0, d1;
  (void)left;

  a0 = vld1q_u8(above + 0);
  a1 = vld1q_u8(above + 1);
  a2 = vld1q_u8(above + 2);
  a15 = vld1q_dup_u8(above + 15);

  d0 = vrhaddq_u8(a0, a1);
  d1 = vrhaddq_u8(vhaddq_u8(a0, a2), a1);

  vst1q_u8(dst + 0 * stride, d0);
  vst1q_u8(dst + 1 * stride, d1);

  d0 = vextq_u8(d0, d0, 15);
  d1 = vextq_u8(d1, d1, 15);

  vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 2));
  vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 2));
  vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 3));
  vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 3));
  vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 4));
  vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 4));
  vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 5));
  vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 5));
  vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 6));
  vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 6));
  vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 7));
  vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 7));
  vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 8));
  vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 8));
}

void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
  uint8x16_t a0, a1, a2, a16, a17, a18, a31, d0_lo, d0_hi, d1_lo, d1_hi;
  (void)left;

  a0 = vld1q_u8(above + 0);
  a1 = vld1q_u8(above + 1);
  a2 = vld1q_u8(above + 2);
  a16 = vld1q_u8(above + 16);
  a17 = vld1q_u8(above + 17);
  a18 = vld1q_u8(above + 18);
  a31 = vld1q_dup_u8(above + 31);

  d0_lo = vrhaddq_u8(a0, a1);
  d0_hi = vrhaddq_u8(a16, a17);
  d1_lo = vrhaddq_u8(vhaddq_u8(a0, a2), a1);
  d1_hi = vrhaddq_u8(vhaddq_u8(a16, a18), a17);

  vst1q_u8(dst + 0 * stride + 0, d0_lo);
  vst1q_u8(dst + 0 * stride + 16, d0_hi);
  vst1q_u8(dst + 1 * stride + 0, d1_lo);
  vst1q_u8(dst + 1 * stride + 16, d1_hi);

  d0_hi = vextq_u8(d0_lo, d0_hi, 15);
  d0_lo = vextq_u8(d0_lo, d0_lo, 15);
  d1_hi = vextq_u8(d1_lo, d1_hi, 15);
  d1_lo = vextq_u8(d1_lo, d1_lo, 15);

  vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 2));
  vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 2));
  vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 2));
  vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 2));
  vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 3));
  vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 3));
  vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 3));
  vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 3));
  vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 4));
  vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 4));
  vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 4));
  vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 4));
  vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 5));
  vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 5));
  vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 5));
  vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 5));
  vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 6));
  vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 6));
  vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 6));
  vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 6));
  vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 7));
  vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 7));
  vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 7));
  vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 7));
  vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 8));
  vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 8));
  vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 8));
  vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 8));
  vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 9));
  vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 9));
  vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 9));
  vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 9));
  vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 10));
  vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 10));
  vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 10));
  vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 10));
  vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 11));
  vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 11));
  vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 11));
  vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 11));
  vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 12));
  vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 12));
  vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 12));
  vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 12));
  vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 13));
  vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 13));
  vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 13));
  vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 13));
  vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 14));
  vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 14));
  vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 14));
  vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 14));
  vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 15));
  vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 15));
  vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 15));
  vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 15));
  vst1q_u8(dst + 30 * stride + 0, d0_hi);
  vst1q_u8(dst + 30 * stride + 16, a31);
  vst1q_u8(dst + 31 * stride + 0, d1_hi);
  vst1q_u8(dst + 31 * stride + 16, a31);
}

// -----------------------------------------------------------------------------

void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  // See vpx_d117_predictor_8x8_neon for more details on the implementation.
  uint8x8_t az, a0, l0az, d0, d1, d2, d3, col0, col1;

  az = load_unaligned_u8_4x1(above - 1);
  a0 = load_unaligned_u8_4x1(above + 0);
  // [ left[0], above[-1], above[0], above[1], x, x, x, x ]
  l0az = vext_u8(vld1_dup_u8(left), az, 7);

  col0 = vdup_n_u8((above[-1] + 2 * left[0] + left[1] + 2) >> 2);
  col1 = vdup_n_u8((left[0] + 2 * left[1] + left[2] + 2) >> 2);

  d0 = vrhadd_u8(az, a0);
  d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
  d2 = vext_u8(col0, d0, 7);
  d3 = vext_u8(col1, d1, 7);

  store_u8_4x1(dst + 0 * stride, d0);
  store_u8_4x1(dst + 1 * stride, d1);
  store_u8_4x1(dst + 2 * stride, d2);
  store_u8_4x1(dst + 3 * stride, d3);
}

void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint8x8_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd;

  az = vld1_u8(above - 1);
  a0 = vld1_u8(above + 0);
  // [ left[0], above[-1], ... , above[5] ]
  l0az = vext_u8(vld1_dup_u8(left), az, 7);

  l0 = vld1_u8(left + 0);
  // The last lane here is unused; reading left[8] could cause a buffer
  // over-read, so just fill with a duplicate of left[0] to avoid needing to
  // materialize a zero:
  // [ left[1], ... , left[7], x ]
  l1 = vext_u8(l0, l0, 1);
  // [ above[-1], left[0], ... , left[6] ]
  azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);

  // d0[0] = AVG2(above[-1], above[0])
  // d0[1] = AVG2(above[0], above[1])
  // ...
  // d0[7] = AVG2(above[6], above[7])
  d0 = vrhadd_u8(az, a0);

  // d1[0] = AVG3(left[0], above[-1], above[0])
  // d1[1] = AVG3(above[-1], above[0], above[1])
  // ...
  // d1[7] = AVG3(above[5], above[6], above[7])
  d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);

  // The ext instruction shifts elements in from the end of the vector rather
  // than the start, so reverse the vector to put the elements to be shifted in
  // at the end. The lowest two lanes here are unused:
  // col0[7] = AVG3(above[-1], left[0], left[1])
  // col0[6] = AVG3(left[0], left[1], left[2])
  // ...
  // col0[2] = AVG3(left[4], left[5], left[6])
  // col0[1] = x (don't care)
  // col0[0] = x (don't care)
  col0 = vrev64_u8(vrhadd_u8(vhadd_u8(azl0, l1), l0));
  // We don't care about the first parameter to this uzp since we only ever use
  // the high three elements, so we just use col0 again since it is already
  // available:
  // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ]
  // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ]
  col0_even = vuzp_u8(col0, col0).val[1];
  col0_odd = vuzp_u8(col0, col0).val[0];

  // Incrementally shift more elements from col0 into d0/1:
  // stride=0 [ d0[0],   d0[1],   d0[2],   d0[3], d0[4], d0[5], d0[6], d0[7] ]
  // stride=1 [ d1[0],   d1[1],   d1[2],   d1[3], d1[4], d1[5], d1[6], d1[7] ]
  // stride=2 [ col0[7], d0[0],   d0[1],   d0[2], d0[3], d0[4], d0[5], d0[6] ]
  // stride=3 [ col0[6], d1[0],   d1[1],   d1[2], d1[3], d1[4], d1[5], d1[6] ]
  // stride=4 [ col0[5], col0[7], d0[0],   d0[1], d0[2], d0[3], d0[4], d0[5] ]
  // stride=5 [ col0[4], col0[6], d1[0],   d1[1], d1[2], d1[3], d1[4], d1[5] ]
  // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ]
  // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ]
  vst1_u8(dst + 0 * stride, d0);
  vst1_u8(dst + 1 * stride, d1);
  vst1_u8(dst + 2 * stride, vext_u8(col0_even, d0, 7));
  vst1_u8(dst + 3 * stride, vext_u8(col0_odd, d1, 7));
  vst1_u8(dst + 4 * stride, vext_u8(col0_even, d0, 6));
  vst1_u8(dst + 5 * stride, vext_u8(col0_odd, d1, 6));
  vst1_u8(dst + 6 * stride, vext_u8(col0_even, d0, 5));
  vst1_u8(dst + 7 * stride, vext_u8(col0_odd, d1, 5));
}
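
// Illustrative scalar sketch (not part of the original source): d117 follows
// a simple recurrence which the shifts above implement. Taking left[-1] to
// mean above[-1]:
//
//   dst[0][c] = AVG2(above[c - 1], above[c]);                  // d0
//   dst[1][c] = AVG3(above[c - 2], above[c - 1], above[c]);    // d1
//   dst[r][0] = AVG3(left[r - 3], left[r - 2], left[r - 1]);   // col0, r >= 2
//   dst[r][c] = dst[r - 2][c - 1];                             // r >= 2, c >= 1
//
// (at c == 0 the d1 row uses left[0] in place of above[-2]), so each pair of
// rows repeats the pair two rows above it shifted right by one pixel, with one
// new col0 value shifted in on the left.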

void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  // See vpx_d117_predictor_8x8_neon for more details on the implementation.
  uint8x16_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd;

  az = vld1q_u8(above - 1);
  a0 = vld1q_u8(above + 0);
  // [ left[0], above[-1], ... , above[13] ]
  l0az = vextq_u8(vld1q_dup_u8(left), az, 15);

  l0 = vld1q_u8(left + 0);
  // The last lane here is unused; reading left[16] could cause a buffer
  // over-read, so just fill with a duplicate of left[0] to avoid needing to
  // materialize a zero:
  // [ left[1], ... , left[15], x ]
  l1 = vextq_u8(l0, l0, 1);
  // [ above[-1], left[0], ... , left[14] ]
  azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);

  d0 = vrhaddq_u8(az, a0);
  d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az);

  col0 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
  col0 = vrev64q_u8(vextq_u8(col0, col0, 8));

  // The low nine lanes here are unused, so the first input to the uzp does not
  // matter; just use a duplicate of col0 since we already have it. This also
  // means that the lowest lane of col0 here is unused.
  col0_even = vuzpq_u8(col0, col0).val[1];
  col0_odd = vuzpq_u8(col0, col0).val[0];

  vst1q_u8(dst + 0 * stride, d0);
  vst1q_u8(dst + 1 * stride, d1);
  vst1q_u8(dst + 2 * stride, vextq_u8(col0_even, d0, 15));
  vst1q_u8(dst + 3 * stride, vextq_u8(col0_odd, d1, 15));
  vst1q_u8(dst + 4 * stride, vextq_u8(col0_even, d0, 14));
  vst1q_u8(dst + 5 * stride, vextq_u8(col0_odd, d1, 14));
  vst1q_u8(dst + 6 * stride, vextq_u8(col0_even, d0, 13));
  vst1q_u8(dst + 7 * stride, vextq_u8(col0_odd, d1, 13));
  vst1q_u8(dst + 8 * stride, vextq_u8(col0_even, d0, 12));
  vst1q_u8(dst + 9 * stride, vextq_u8(col0_odd, d1, 12));
  vst1q_u8(dst + 10 * stride, vextq_u8(col0_even, d0, 11));
  vst1q_u8(dst + 11 * stride, vextq_u8(col0_odd, d1, 11));
  vst1q_u8(dst + 12 * stride, vextq_u8(col0_even, d0, 10));
  vst1q_u8(dst + 13 * stride, vextq_u8(col0_odd, d1, 10));
  vst1q_u8(dst + 14 * stride, vextq_u8(col0_even, d0, 9));
  vst1q_u8(dst + 15 * stride, vextq_u8(col0_odd, d1, 9));
}

void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  // See vpx_d117_predictor_8x8_neon for more details on the implementation.
  uint8x16_t az, a0, a14, a15, a16, l0az, d0_lo, d0_hi, d1_lo, d1_hi, l0, l1,
      l15, l16, l17, azl0, col0_lo, col0_hi, col0_even, col0_odd;

  az = vld1q_u8(above - 1);
  a0 = vld1q_u8(above + 0);
  a14 = vld1q_u8(above + 14);
  a15 = vld1q_u8(above + 15);
  a16 = vld1q_u8(above + 16);
  // [ left[0], above[-1], ... , above[13] ]
  l0az = vextq_u8(vld1q_dup_u8(left), az, 15);

  l0 = vld1q_u8(left + 0);
  l1 = vld1q_u8(left + 1);
  l15 = vld1q_u8(left + 15);
  l16 = vld1q_u8(left + 16);
  // The last lane here is unused; reading left[32] would cause a buffer
  // over-read (observed as an address-sanitizer failure), so just fill with a
  // duplicate of left[16] to avoid needing to materialize a zero:
  // [ left[17], ... , left[31], x ]
  l17 = vextq_u8(l16, l16, 1);
  // [ above[-1], left[0], ... , left[14] ]
  azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);

  d0_lo = vrhaddq_u8(az, a0);
  d0_hi = vrhaddq_u8(a15, a16);
  d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
  d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15);

  // The last lane of col0_hi is unused here.
  col0_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
  col0_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16);

  col0_lo = vrev64q_u8(vextq_u8(col0_lo, col0_lo, 8));
  col0_hi = vrev64q_u8(vextq_u8(col0_hi, col0_hi, 8));

  // The first lane of each of these is unused since they are only ever used as
  // ext(col0, _, i) where i >= 1.
  col0_even = vuzpq_u8(col0_hi, col0_lo).val[1];
  col0_odd = vuzpq_u8(col0_hi, col0_lo).val[0];

  vst1q_u8(dst + 0 * stride + 0, d0_lo);
  vst1q_u8(dst + 0 * stride + 16, d0_hi);
  vst1q_u8(dst + 1 * stride + 0, d1_lo);
  vst1q_u8(dst + 1 * stride + 16, d1_hi);
  vst1q_u8(dst + 2 * stride + 0, vextq_u8(col0_even, d0_lo, 15));
  vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_lo, d0_hi, 15));
  vst1q_u8(dst + 3 * stride + 0, vextq_u8(col0_odd, d1_lo, 15));
  vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 15));
  vst1q_u8(dst + 4 * stride + 0, vextq_u8(col0_even, d0_lo, 14));
  vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_lo, d0_hi, 14));
  vst1q_u8(dst + 5 * stride + 0, vextq_u8(col0_odd, d1_lo, 14));
  vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 14));
  vst1q_u8(dst + 6 * stride + 0, vextq_u8(col0_even, d0_lo, 13));
  vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_lo, d0_hi, 13));
  vst1q_u8(dst + 7 * stride + 0, vextq_u8(col0_odd, d1_lo, 13));
  vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 13));
  vst1q_u8(dst + 8 * stride + 0, vextq_u8(col0_even, d0_lo, 12));
  vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_lo, d0_hi, 12));
  vst1q_u8(dst + 9 * stride + 0, vextq_u8(col0_odd, d1_lo, 12));
  vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_lo, d1_hi, 12));
  vst1q_u8(dst + 10 * stride + 0, vextq_u8(col0_even, d0_lo, 11));
  vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_lo, d0_hi, 11));
  vst1q_u8(dst + 11 * stride + 0, vextq_u8(col0_odd, d1_lo, 11));
  vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_lo, d1_hi, 11));
  vst1q_u8(dst + 12 * stride + 0, vextq_u8(col0_even, d0_lo, 10));
  vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_lo, d0_hi, 10));
  vst1q_u8(dst + 13 * stride + 0, vextq_u8(col0_odd, d1_lo, 10));
  vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_lo, d1_hi, 10));
  vst1q_u8(dst + 14 * stride + 0, vextq_u8(col0_even, d0_lo, 9));
  vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_lo, d0_hi, 9));
  vst1q_u8(dst + 15 * stride + 0, vextq_u8(col0_odd, d1_lo, 9));
  vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_lo, d1_hi, 9));
  vst1q_u8(dst + 16 * stride + 0, vextq_u8(col0_even, d0_lo, 8));
  vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_lo, d0_hi, 8));
  vst1q_u8(dst + 17 * stride + 0, vextq_u8(col0_odd, d1_lo, 8));
  vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_lo, d1_hi, 8));
  vst1q_u8(dst + 18 * stride + 0, vextq_u8(col0_even, d0_lo, 7));
  vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_lo, d0_hi, 7));
  vst1q_u8(dst + 19 * stride + 0, vextq_u8(col0_odd, d1_lo, 7));
  vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_lo, d1_hi, 7));
  vst1q_u8(dst + 20 * stride + 0, vextq_u8(col0_even, d0_lo, 6));
  vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_lo, d0_hi, 6));
  vst1q_u8(dst + 21 * stride + 0, vextq_u8(col0_odd, d1_lo, 6));
  vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_lo, d1_hi, 6));
  vst1q_u8(dst + 22 * stride + 0, vextq_u8(col0_even, d0_lo, 5));
  vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_lo, d0_hi, 5));
  vst1q_u8(dst + 23 * stride + 0, vextq_u8(col0_odd, d1_lo, 5));
  vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_lo, d1_hi, 5));
  vst1q_u8(dst + 24 * stride + 0, vextq_u8(col0_even, d0_lo, 4));
  vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_lo, d0_hi, 4));
  vst1q_u8(dst + 25 * stride + 0, vextq_u8(col0_odd, d1_lo, 4));
  vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_lo, d1_hi, 4));
  vst1q_u8(dst + 26 * stride + 0, vextq_u8(col0_even, d0_lo, 3));
  vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_lo, d0_hi, 3));
  vst1q_u8(dst + 27 * stride + 0, vextq_u8(col0_odd, d1_lo, 3));
  vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_lo, d1_hi, 3));
  vst1q_u8(dst + 28 * stride + 0, vextq_u8(col0_even, d0_lo, 2));
  vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_lo, d0_hi, 2));
  vst1q_u8(dst + 29 * stride + 0, vextq_u8(col0_odd, d1_lo, 2));
  vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_lo, d1_hi, 2));
  vst1q_u8(dst + 30 * stride + 0, vextq_u8(col0_even, d0_lo, 1));
  vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_lo, d0_hi, 1));
  vst1q_u8(dst + 31 * stride + 0, vextq_u8(col0_odd, d1_lo, 1));
  vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_lo, d1_hi, 1));
}

// -----------------------------------------------------------------------------

void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x8_t XA0123 = vld1_u8(above - 1);
  const uint8x8_t L0123 = vld1_u8(left);
  const uint8x8_t L3210 = vrev64_u8(L0123);
  const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4);
  const uint8x8_t L210XA0123 = vext_u8(L3210, XA0123, 5);
  const uint8x8_t L10XA0123_ = vext_u8(L210XA0123, L210XA0123, 1);
  const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012);
  const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123);

  store_u8_4x1(dst + 0 * stride, vext_u8(avg2, avg2, 3));
  store_u8_4x1(dst + 1 * stride, vext_u8(avg2, avg2, 2));
  store_u8_4x1(dst + 2 * stride, vext_u8(avg2, avg2, 1));
  store_u8_4x1(dst + 3 * stride, avg2);
}
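
// Illustrative scalar sketch (not part of the original source): d135 applies
// AVG3 along the single boundary running from the bottom of the left column
// to the end of the above row,
//
//   border[] = { left[3], left[2], left[1], left[0],
//                above[-1], above[0], above[1], above[2], above[3] };
//   avg2[i]  = AVG3(border[i], border[i + 1], border[i + 2]);
//
// and each output row is a 4-pixel window into avg2: row r stores
// avg2[3 - r] .. avg2[6 - r], moving one step down the border per row.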

void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x8_t XA0123456 = vld1_u8(above - 1);
  const uint8x8_t A01234567 = vld1_u8(above);
  const uint8x8_t A1234567_ = vld1_u8(above + 1);
  const uint8x8_t L01234567 = vld1_u8(left);
  const uint8x8_t L76543210 = vrev64_u8(L01234567);
  const uint8x8_t L6543210X = vext_u8(L76543210, XA0123456, 1);
  const uint8x8_t L543210XA0 = vext_u8(L76543210, XA0123456, 2);
  const uint8x16_t L76543210XA0123456 = vcombine_u8(L76543210, XA0123456);
  const uint8x16_t L6543210XA01234567 = vcombine_u8(L6543210X, A01234567);
  const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_);
  const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_);
  const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567);

  vst1_u8(dst + 0 * stride, vget_low_u8(vextq_u8(row, row, 7)));
  vst1_u8(dst + 1 * stride, vget_low_u8(vextq_u8(row, row, 6)));
  vst1_u8(dst + 2 * stride, vget_low_u8(vextq_u8(row, row, 5)));
  vst1_u8(dst + 3 * stride, vget_low_u8(vextq_u8(row, row, 4)));
  vst1_u8(dst + 4 * stride, vget_low_u8(vextq_u8(row, row, 3)));
  vst1_u8(dst + 5 * stride, vget_low_u8(vextq_u8(row, row, 2)));
  vst1_u8(dst + 6 * stride, vget_low_u8(vextq_u8(row, row, 1)));
  vst1_u8(dst + 7 * stride, vget_low_u8(row));
}
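
// Illustrative sketch (not part of the original source): the 8x8 version does
// the same thing with a single 16-lane boundary,
//
//   boundary[] = { left[7], ..., left[0], above[-1], above[0], ..., above[7] };
//   row[i]     = AVG3(boundary[i], boundary[i + 1], boundary[i + 2]);
//
// so one vhaddq/vrhaddq pair produces every distinct output value and each
// output row is an 8-pixel vextq_u8 window into row, stepping one lane
// towards the start of the boundary per row down the block.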

static INLINE void d135_store_16x8(
    uint8_t **dst, const ptrdiff_t stride, const uint8x16_t row_0,
    const uint8x16_t row_1, const uint8x16_t row_2, const uint8x16_t row_3,
    const uint8x16_t row_4, const uint8x16_t row_5, const uint8x16_t row_6,
    const uint8x16_t row_7) {
  vst1q_u8(*dst, row_0);
  *dst += stride;
  vst1q_u8(*dst, row_1);
  *dst += stride;
  vst1q_u8(*dst, row_2);
  *dst += stride;
  vst1q_u8(*dst, row_3);
  *dst += stride;
  vst1q_u8(*dst, row_4);
  *dst += stride;
  vst1q_u8(*dst, row_5);
  *dst += stride;
  vst1q_u8(*dst, row_6);
  *dst += stride;
  vst1q_u8(*dst, row_7);
  *dst += stride;
}

void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8x16_t XA0123456789abcde = vld1q_u8(above - 1);
  const uint8x16_t A0123456789abcdef = vld1q_u8(above);
  const uint8x16_t A123456789abcdef_ = vld1q_u8(above + 1);
  const uint8x16_t L0123456789abcdef = vld1q_u8(left);
  const uint8x8_t L76543210 = vrev64_u8(vget_low_u8(L0123456789abcdef));
  const uint8x8_t Lfedcba98 = vrev64_u8(vget_high_u8(L0123456789abcdef));
  const uint8x16_t Lfedcba9876543210 = vcombine_u8(Lfedcba98, L76543210);
  const uint8x16_t Ledcba9876543210X =
      vextq_u8(Lfedcba9876543210, XA0123456789abcde, 1);
  const uint8x16_t Ldcba9876543210XA0 =
      vextq_u8(Lfedcba9876543210, XA0123456789abcde, 2);
  const uint8x16_t avg_0 = vhaddq_u8(Lfedcba9876543210, Ldcba9876543210XA0);
  const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_);
  const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X);
  const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef);

  const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15);
  const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14);
  const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13);
  const uint8x16_t r_3 = vextq_u8(row_0, row_1, 12);
  const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11);
  const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10);
  const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9);
  const uint8x16_t r_7 = vextq_u8(row_0, row_1, 8);
  const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7);
  const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6);
  const uint8x16_t r_a = vextq_u8(row_0, row_1, 5);
  const uint8x16_t r_b = vextq_u8(row_0, row_1, 4);
  const uint8x16_t r_c = vextq_u8(row_0, row_1, 3);
  const uint8x16_t r_d = vextq_u8(row_0, row_1, 2);
  const uint8x16_t r_e = vextq_u8(row_0, row_1, 1);

  d135_store_16x8(&dst, stride, r_0, r_1, r_2, r_3, r_4, r_5, r_6, r_7);
  d135_store_16x8(&dst, stride, r_8, r_9, r_a, r_b, r_c, r_d, r_e, row_0);
}

static INLINE void d135_store_32x2(uint8_t **dst, const ptrdiff_t stride,
                                   const uint8x16_t row_0,
                                   const uint8x16_t row_1,
                                   const uint8x16_t row_2) {
  uint8_t *dst2 = *dst;
  vst1q_u8(dst2, row_1);
  dst2 += 16;
  vst1q_u8(dst2, row_2);
  dst2 += 16 * stride - 16;
  vst1q_u8(dst2, row_0);
  dst2 += 16;
  vst1q_u8(dst2, row_1);
  *dst += stride;
}

void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8x16_t LL0123456789abcdef = vld1q_u8(left + 16);
  const uint8x16_t LU0123456789abcdef = vld1q_u8(left);
  const uint8x8_t LL76543210 = vrev64_u8(vget_low_u8(LL0123456789abcdef));
  const uint8x8_t LU76543210 = vrev64_u8(vget_low_u8(LU0123456789abcdef));
  const uint8x8_t LLfedcba98 = vrev64_u8(vget_high_u8(LL0123456789abcdef));
  const uint8x8_t LUfedcba98 = vrev64_u8(vget_high_u8(LU0123456789abcdef));
  const uint8x16_t LLfedcba9876543210 = vcombine_u8(LLfedcba98, LL76543210);
  const uint8x16_t LUfedcba9876543210 = vcombine_u8(LUfedcba98, LU76543210);
  const uint8x16_t LLedcba9876543210Uf =
      vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 1);
  const uint8x16_t LLdcba9876543210Ufe =
      vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 2);
  const uint8x16_t avg_0 = vhaddq_u8(LLfedcba9876543210, LLdcba9876543210Ufe);
  const uint8x16_t row_0 = vrhaddq_u8(avg_0, LLedcba9876543210Uf);

  const uint8x16_t XAL0123456789abcde = vld1q_u8(above - 1);
  const uint8x16_t LUedcba9876543210X =
      vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 1);
  const uint8x16_t LUdcba9876543210XA0 =
      vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 2);
  const uint8x16_t avg_1 = vhaddq_u8(LUfedcba9876543210, LUdcba9876543210XA0);
  const uint8x16_t row_1 = vrhaddq_u8(avg_1, LUedcba9876543210X);

  const uint8x16_t AL0123456789abcdef = vld1q_u8(above);
  const uint8x16_t AL123456789abcdefg = vld1q_u8(above + 1);
  const uint8x16_t ALfR0123456789abcde = vld1q_u8(above + 15);
  const uint8x16_t AR0123456789abcdef = vld1q_u8(above + 16);
  const uint8x16_t AR123456789abcdef_ = vld1q_u8(above + 17);
  const uint8x16_t avg_2 = vhaddq_u8(XAL0123456789abcde, AL123456789abcdefg);
  const uint8x16_t row_2 = vrhaddq_u8(avg_2, AL0123456789abcdef);
  const uint8x16_t avg_3 = vhaddq_u8(ALfR0123456789abcde, AR123456789abcdef_);
  const uint8x16_t row_3 = vrhaddq_u8(avg_3, AR0123456789abcdef);

  {
    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15);
    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 15);
    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 15);
    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
  }

  {
    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 14);
    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 14);
    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 14);
    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
  }

  {
    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 13);
    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 13);
    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 13);
    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
  }

  {
    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 12);
    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 12);
    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 12);
    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
  }

  {
    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 11);
    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 11);
    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 11);
    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
  }

  {
    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 10);
    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 10);
    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 10);
    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
  }

  {
    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 9);
    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 9);
    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 9);
    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
  }

  {
    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 8);
    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 8);
    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 8);
    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
  }

  {
    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 7);
    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 7);
    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 7);
    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
  }

  {
    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 6);
    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 6);
    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 6);
    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
  }

  {
    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 5);
    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 5);
    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 5);
    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
  }

  {
    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 4);
    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 4);
    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 4);
    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
  }

  {
    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 3);
    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 3);
    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 3);
    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
  }

  {
    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 2);
    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 2);
    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 2);
    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
  }

  {
    const uint8x16_t r_0 = vextq_u8(row_0, row_1, 1);
    const uint8x16_t r_1 = vextq_u8(row_1, row_2, 1);
    const uint8x16_t r_2 = vextq_u8(row_2, row_3, 1);
    d135_store_32x2(&dst, stride, r_0, r_1, r_2);
  }

  d135_store_32x2(&dst, stride, row_0, row_1, row_2);
}

// -----------------------------------------------------------------------------

void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  // See vpx_d153_predictor_8x8_neon for more details on the implementation.
  uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02;

  az = load_unaligned_u8_4x1(above - 1);
  a0 = load_unaligned_u8_4x1(above + 0);
  // [ left[0], above[-1], above[0], above[1], x, x, x, x ]
  l0az = vext_u8(vld1_dup_u8(left), az, 7);

  l0 = load_unaligned_u8_4x1(left + 0);
  l1 = load_unaligned_u8_4x1(left + 1);
  // [ above[-1], left[0], left[1], left[2], x, x, x, x ]
  azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);

  d0 = vrhadd_u8(azl0, l0);
  d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
  d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0);

  d02 = vrev64_u8(vzip_u8(d0, d2).val[0]);

  store_u8_4x1(dst + 0 * stride, vext_u8(d02, d1, 7));
  store_u8_4x1(dst + 1 * stride, vext_u8(d02, d1, 5));
  store_u8_4x1(dst + 2 * stride, vext_u8(d02, d1, 3));
  store_u8_4x1(dst + 3 * stride, vext_u8(d02, d1, 1));
}

void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi;

  az = vld1_u8(above - 1);
  a0 = vld1_u8(above + 0);
  // [ left[0], above[-1], ... , above[5] ]
  l0az = vext_u8(vld1_dup_u8(left), az, 7);

  l0 = vld1_u8(left);
1162   // The last lane here is unused; reading left[8] could cause a buffer
1163   // over-read, so just fill with a duplicate of left[0] to avoid needing to
1164   // materialize a zero:
1165   // [ left[1], ... , left[7], x ]
1166   l1 = vext_u8(l0, l0, 1);
1167   // [ above[-1], left[0], ... , left[6] ]
1168   azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);
1169 
1170   // d0[0] = AVG2(above[-1], left[0])
1171   // d0[1] = AVG2(left[0], left[1])
1172   // ...
1173   // d0[7] = AVG2(left[6], left[7])
1174   d0 = vrhadd_u8(azl0, l0);
1175 
1176   // d1[0] = AVG3(left[0], above[-1], above[0])
1177   // d1[1] = AVG3(above[-1], above[0], above[1])
1178   // ...
1179   // d1[7] = AVG3(above[5], above[6], above[7])
1180   d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
1181 
1182   // d2[0] = AVG3(above[-1], left[0], left[1])
1183   // d2[1] = AVG3(left[0], left[1], left[2])
1184   // ...
1185   // d2[6] = AVG3(left[5], left[6], left[7])
1186   // d2[7] = x (don't care)
1187   d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0);
1188 
1189   // The ext instruction shifts elements in from the end of the vector rather
1190   // than the start, so reverse the vectors to put the elements to be shifted
1191   // in at the end. The lowest lane of d02_lo is unused.
1192   d02_lo = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[0];
1193   d02_hi = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[1];
1194 
1195   // Incrementally shift more elements from d0/d2 reversed into d1:
1196   // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
1197   // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ]
1198   // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ]
1199   // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ]
1200   // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ]
1201   // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ]
1202   // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ]
1203   // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ]
1204   vst1_u8(dst + 0 * stride, vext_u8(d02_hi, d1, 7));
1205   vst1_u8(dst + 1 * stride, vext_u8(d02_hi, d1, 5));
1206   vst1_u8(dst + 2 * stride, vext_u8(d02_hi, d1, 3));
1207   vst1_u8(dst + 3 * stride, vext_u8(d02_hi, d1, 1));
1208   vst1_u8(dst + 4 * stride, vext_u8(d02_lo, d02_hi, 7));
1209   vst1_u8(dst + 5 * stride, vext_u8(d02_lo, d02_hi, 5));
1210   vst1_u8(dst + 6 * stride, vext_u8(d02_lo, d02_hi, 3));
1211   vst1_u8(dst + 7 * stride, vext_u8(d02_lo, d02_hi, 1));
1212 }
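
// The d153 predictors above are assembled from two rounding averages,
// referred to as AVG2/AVG3 in the comments. As an illustrative sketch (the
// names avg2/avg3 and the #if 0 guard are additions, not part of the build),
// they compute:
#if 0
static INLINE uint8_t avg2(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}

static INLINE uint8_t avg3(uint8_t a, uint8_t b, uint8_t c) {
  // The vrhadd_u8(vhadd_u8(a, c), b) sequence used above matches this
  // lane-wise: a truncating halving add followed by a rounding halving add
  // reproduces the (a + 2 * b + c + 2) >> 2 rounding exactly.
  return (uint8_t)((a + 2 * b + c + 2) >> 2);
}
#endif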
1213 
1214 void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
1215                                    const uint8_t *above, const uint8_t *left) {
1216   // See vpx_d153_predictor_8x8_neon for more details on the implementation.
1217   uint8x16_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi;
1218 
1219   az = vld1q_u8(above - 1);
1220   a0 = vld1q_u8(above + 0);
1221   // [ left[0], above[-1], ... , above[13] ]
1222   l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
1223 
1224   l0 = vld1q_u8(left + 0);
1225   // The last lane here is unused; reading left[16] could cause a buffer
1226   // over-read, so just fill with a duplicate of left[0] to avoid needing to
1227   // materialize a zero:
1228   // [ left[1], ... , left[15], x ]
1229   l1 = vextq_u8(l0, l0, 1);
1230   // [ above[-1], left[0], ... , left[14] ]
1231   azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
1232 
1233   d0 = vrhaddq_u8(azl0, l0);
1234   d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
1235   d2 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
1236 
1237   d0 = vrev64q_u8(vextq_u8(d0, d0, 8));
1238   d2 = vrev64q_u8(vextq_u8(d2, d2, 8));
1239 
1240   // The lowest lane of d02_lo is unused.
1241   d02_lo = vzipq_u8(d2, d0).val[0];
1242   d02_hi = vzipq_u8(d2, d0).val[1];
1243 
1244   vst1q_u8(dst + 0 * stride, vextq_u8(d02_hi, d1, 15));
1245   vst1q_u8(dst + 1 * stride, vextq_u8(d02_hi, d1, 13));
1246   vst1q_u8(dst + 2 * stride, vextq_u8(d02_hi, d1, 11));
1247   vst1q_u8(dst + 3 * stride, vextq_u8(d02_hi, d1, 9));
1248   vst1q_u8(dst + 4 * stride, vextq_u8(d02_hi, d1, 7));
1249   vst1q_u8(dst + 5 * stride, vextq_u8(d02_hi, d1, 5));
1250   vst1q_u8(dst + 6 * stride, vextq_u8(d02_hi, d1, 3));
1251   vst1q_u8(dst + 7 * stride, vextq_u8(d02_hi, d1, 1));
1252   vst1q_u8(dst + 8 * stride, vextq_u8(d02_lo, d02_hi, 15));
1253   vst1q_u8(dst + 9 * stride, vextq_u8(d02_lo, d02_hi, 13));
1254   vst1q_u8(dst + 10 * stride, vextq_u8(d02_lo, d02_hi, 11));
1255   vst1q_u8(dst + 11 * stride, vextq_u8(d02_lo, d02_hi, 9));
1256   vst1q_u8(dst + 12 * stride, vextq_u8(d02_lo, d02_hi, 7));
1257   vst1q_u8(dst + 13 * stride, vextq_u8(d02_lo, d02_hi, 5));
1258   vst1q_u8(dst + 14 * stride, vextq_u8(d02_lo, d02_hi, 3));
1259   vst1q_u8(dst + 15 * stride, vextq_u8(d02_lo, d02_hi, 1));
1260 }
1261 
1262 void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
1263                                    const uint8_t *above, const uint8_t *left) {
1264   // See vpx_d153_predictor_8x8_neon for more details on the implementation.
1265   uint8x16_t az, a0, a14, a15, a16, l0az, l0, l1, l15, l16, l17, azl0, d0_lo,
1266       d0_hi, d1_lo, d1_hi, d2_lo, d2_hi;
1267   uint8x16x2_t d02_hi, d02_lo;
1268 
1269   az = vld1q_u8(above - 1);
1270   a0 = vld1q_u8(above + 0);
1271   a14 = vld1q_u8(above + 14);
1272   a15 = vld1q_u8(above + 15);
1273   a16 = vld1q_u8(above + 16);
1274   // [ left[0], above[-1], ... , above[13] ]
1275   l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
1276 
1277   l0 = vld1q_u8(left);
1278   l1 = vld1q_u8(left + 1);
1279   l15 = vld1q_u8(left + 15);
1280   l16 = vld1q_u8(left + 16);
1281   // The last lane here is unused; reading left[32] would cause a buffer
1282   // over-read (observed as an address-sanitizer failure), so just fill with a
1283   // duplicate of left[16] to avoid needing to materialize a zero:
1284   // [ left[17], ... , left[31], x ]
1285   l17 = vextq_u8(l16, l16, 1);
1286   // [ above[-1], left[0], ... , left[14] ]
1287   azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
1288 
1289   d0_lo = vrhaddq_u8(azl0, l0);
1290   d0_hi = vrhaddq_u8(l15, l16);
1291 
1292   d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
1293   d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15);
1294 
1295   // The highest lane of d2_hi is unused.
1296   d2_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
1297   d2_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16);
1298 
1299   d0_lo = vrev64q_u8(vextq_u8(d0_lo, d0_lo, 8));
1300   d0_hi = vrev64q_u8(vextq_u8(d0_hi, d0_hi, 8));
1301 
1302   d2_lo = vrev64q_u8(vextq_u8(d2_lo, d2_lo, 8));
1303   d2_hi = vrev64q_u8(vextq_u8(d2_hi, d2_hi, 8));
1304 
1305   // d02_hi.val[0][0] is unused here.
1306   d02_hi = vzipq_u8(d2_hi, d0_hi);
1307   d02_lo = vzipq_u8(d2_lo, d0_lo);
1308 
1309   vst1q_u8(dst + 0 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 15));
1310   vst1q_u8(dst + 0 * stride + 16, vextq_u8(d1_lo, d1_hi, 15));
1311   vst1q_u8(dst + 1 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 13));
1312   vst1q_u8(dst + 1 * stride + 16, vextq_u8(d1_lo, d1_hi, 13));
1313   vst1q_u8(dst + 2 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 11));
1314   vst1q_u8(dst + 2 * stride + 16, vextq_u8(d1_lo, d1_hi, 11));
1315   vst1q_u8(dst + 3 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 9));
1316   vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 9));
1317   vst1q_u8(dst + 4 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 7));
1318   vst1q_u8(dst + 4 * stride + 16, vextq_u8(d1_lo, d1_hi, 7));
1319   vst1q_u8(dst + 5 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 5));
1320   vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 5));
1321   vst1q_u8(dst + 6 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 3));
1322   vst1q_u8(dst + 6 * stride + 16, vextq_u8(d1_lo, d1_hi, 3));
1323   vst1q_u8(dst + 7 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 1));
1324   vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 1));
1325   vst1q_u8(dst + 8 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15));
1326   vst1q_u8(dst + 8 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 15));
1327   vst1q_u8(dst + 9 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13));
1328   vst1q_u8(dst + 9 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 13));
1329   vst1q_u8(dst + 10 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11));
1330   vst1q_u8(dst + 10 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 11));
1331   vst1q_u8(dst + 11 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9));
1332   vst1q_u8(dst + 11 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 9));
1333   vst1q_u8(dst + 12 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7));
1334   vst1q_u8(dst + 12 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 7));
1335   vst1q_u8(dst + 13 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5));
1336   vst1q_u8(dst + 13 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 5));
1337   vst1q_u8(dst + 14 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3));
1338   vst1q_u8(dst + 14 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 3));
1339   vst1q_u8(dst + 15 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1));
1340   vst1q_u8(dst + 15 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 1));
1341   vst1q_u8(dst + 16 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15));
1342   vst1q_u8(dst + 16 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15));
1343   vst1q_u8(dst + 17 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13));
1344   vst1q_u8(dst + 17 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13));
1345   vst1q_u8(dst + 18 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11));
1346   vst1q_u8(dst + 18 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11));
1347   vst1q_u8(dst + 19 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9));
1348   vst1q_u8(dst + 19 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9));
1349   vst1q_u8(dst + 20 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7));
1350   vst1q_u8(dst + 20 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7));
1351   vst1q_u8(dst + 21 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5));
1352   vst1q_u8(dst + 21 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5));
1353   vst1q_u8(dst + 22 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3));
1354   vst1q_u8(dst + 22 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3));
1355   vst1q_u8(dst + 23 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1));
1356   vst1q_u8(dst + 23 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1));
1357   vst1q_u8(dst + 24 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 15));
1358   vst1q_u8(dst + 24 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15));
1359   vst1q_u8(dst + 25 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 13));
1360   vst1q_u8(dst + 25 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13));
1361   vst1q_u8(dst + 26 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 11));
1362   vst1q_u8(dst + 26 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11));
1363   vst1q_u8(dst + 27 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 9));
1364   vst1q_u8(dst + 27 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9));
1365   vst1q_u8(dst + 28 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 7));
1366   vst1q_u8(dst + 28 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7));
1367   vst1q_u8(dst + 29 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 5));
1368   vst1q_u8(dst + 29 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5));
1369   vst1q_u8(dst + 30 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 3));
1370   vst1q_u8(dst + 30 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3));
1371   vst1q_u8(dst + 31 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 1));
1372   vst1q_u8(dst + 31 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1));
1373 }
1374 
1375 // -----------------------------------------------------------------------------
1376 
1377 void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
1378                                  const uint8_t *above, const uint8_t *left) {
1379   uint8x8_t l0, l3, l1, l2, c0, c1, c01, d0, d1;
1380   (void)above;
1381 
1382   // We need the low half lanes here for the c0/c1 arithmetic but the high half
1383   // lanes for the ext:
1384   // [ left[0], left[1], left[2], left[3], left[0], left[1], left[2], left[3] ]
1385   l0 = load_replicate_u8_4x1(left + 0);
1386   l3 = vld1_dup_u8(left + 3);
1387 
1388   // [ left[1], left[2], left[3], left[3], x, x, x, x ]
1389   l1 = vext_u8(l0, l3, 5);
1390   // [ left[2], left[3], left[3], left[3], x, x, x, x ]
1391   l2 = vext_u8(l0, l3, 6);
1392 
1393   c0 = vrhadd_u8(l0, l1);
1394   c1 = vrhadd_u8(vhadd_u8(l0, l2), l1);
1395 
1396   // [ c0[0], c1[0], c0[1], c1[1], c0[2], c1[2], c0[3], c1[3] ]
1397   c01 = vzip_u8(c0, c1).val[0];
1398 
1399   d0 = c01;
1400   d1 = vext_u8(c01, l3, 2);
1401 
1402   // Store the high half of the vector for stride={2,3} to avoid needing
1403   // additional ext instructions:
1404   // stride=0 [ c0[0], c1[0],   c0[1],   c1[1] ]
1405   // stride=1 [ c0[1], c1[1],   c0[2],   c1[2] ]
1406   // stride=2 [ c0[2], c1[2],   c0[3],   c1[3] ]
1407   // stride=3 [ c0[3], c1[3], left[3], left[3] ]
1408   store_u8_4x1(dst + 0 * stride, d0);
1409   store_u8_4x1(dst + 1 * stride, d1);
1410   store_u8_4x1_high(dst + 2 * stride, d0);
1411   store_u8_4x1_high(dst + 3 * stride, d1);
1412 }
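
// For reference, a minimal scalar sketch of the 4x4 interleave/shift pattern
// above (the name d207_4x4_sketch and the #if 0 guard are illustrative
// additions): column pairs are c0[i] = AVG2(left[i], left[i+1]) and
// c1[i] = AVG3(left[i], left[i+1], left[i+2]), reads past left[3] clamp to
// left[3], and each successive row starts one (c0, c1) pair further along.
#if 0
static void d207_4x4_sketch(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *left) {
  uint8_t c01[10];
  int i, r, c;
  for (i = 0; i < 5; ++i) {
    const int j0 = i < 3 ? i : 3;
    const int j1 = i + 1 < 3 ? i + 1 : 3;
    const int j2 = i + 2 < 3 ? i + 2 : 3;
    c01[2 * i + 0] = (uint8_t)((left[j0] + left[j1] + 1) >> 1);
    c01[2 * i + 1] = (uint8_t)((left[j0] + 2 * left[j1] + left[j2] + 2) >> 2);
  }
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) dst[r * stride + c] = c01[2 * r + c];
  }
}
#endif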
1413 
1414 void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
1415                                  const uint8_t *above, const uint8_t *left) {
1416   uint8x8_t l7, l0, l1, l2, c0, c1, c01_lo, c01_hi;
1417   (void)above;
1418 
1419   l0 = vld1_u8(left + 0);
1420   l7 = vld1_dup_u8(left + 7);
1421 
1422   // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ]
1423   l1 = vext_u8(l0, l7, 1);
1424   // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ]
1425   l2 = vext_u8(l0, l7, 2);
1426 
1427   c0 = vrhadd_u8(l0, l1);
1428   c1 = vrhadd_u8(vhadd_u8(l0, l2), l1);
1429 
1430   c01_lo = vzip_u8(c0, c1).val[0];
1431   c01_hi = vzip_u8(c0, c1).val[1];
1432 
1433   vst1_u8(dst + 0 * stride, c01_lo);
1434   vst1_u8(dst + 1 * stride, vext_u8(c01_lo, c01_hi, 2));
1435   vst1_u8(dst + 2 * stride, vext_u8(c01_lo, c01_hi, 4));
1436   vst1_u8(dst + 3 * stride, vext_u8(c01_lo, c01_hi, 6));
1437   vst1_u8(dst + 4 * stride, c01_hi);
1438   vst1_u8(dst + 5 * stride, vext_u8(c01_hi, l7, 2));
1439   vst1_u8(dst + 6 * stride, vext_u8(c01_hi, l7, 4));
1440   vst1_u8(dst + 7 * stride, vext_u8(c01_hi, l7, 6));
1441 }
1442 
1443 void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
1444                                    const uint8_t *above, const uint8_t *left) {
1445   uint8x16_t l15, l0, l1, l2, c0, c1, c01_lo, c01_hi;
1446   (void)above;
1447 
1448   l0 = vld1q_u8(left + 0);
1449   l15 = vld1q_dup_u8(left + 15);
1450 
1451   l1 = vextq_u8(l0, l15, 1);
1452   l2 = vextq_u8(l0, l15, 2);
1453 
1454   c0 = vrhaddq_u8(l0, l1);
1455   c1 = vrhaddq_u8(vhaddq_u8(l0, l2), l1);
1456 
1457   c01_lo = vzipq_u8(c0, c1).val[0];
1458   c01_hi = vzipq_u8(c0, c1).val[1];
1459 
1460   vst1q_u8(dst + 0 * stride, c01_lo);
1461   vst1q_u8(dst + 1 * stride, vextq_u8(c01_lo, c01_hi, 2));
1462   vst1q_u8(dst + 2 * stride, vextq_u8(c01_lo, c01_hi, 4));
1463   vst1q_u8(dst + 3 * stride, vextq_u8(c01_lo, c01_hi, 6));
1464   vst1q_u8(dst + 4 * stride, vextq_u8(c01_lo, c01_hi, 8));
1465   vst1q_u8(dst + 5 * stride, vextq_u8(c01_lo, c01_hi, 10));
1466   vst1q_u8(dst + 6 * stride, vextq_u8(c01_lo, c01_hi, 12));
1467   vst1q_u8(dst + 7 * stride, vextq_u8(c01_lo, c01_hi, 14));
1468   vst1q_u8(dst + 8 * stride, c01_hi);
1469   vst1q_u8(dst + 9 * stride, vextq_u8(c01_hi, l15, 2));
1470   vst1q_u8(dst + 10 * stride, vextq_u8(c01_hi, l15, 4));
1471   vst1q_u8(dst + 11 * stride, vextq_u8(c01_hi, l15, 6));
1472   vst1q_u8(dst + 12 * stride, vextq_u8(c01_hi, l15, 8));
1473   vst1q_u8(dst + 13 * stride, vextq_u8(c01_hi, l15, 10));
1474   vst1q_u8(dst + 14 * stride, vextq_u8(c01_hi, l15, 12));
1475   vst1q_u8(dst + 15 * stride, vextq_u8(c01_hi, l15, 14));
1476 }
1477 
1478 void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
1479                                    const uint8_t *above, const uint8_t *left) {
1480   uint8x16_t l0_lo, l0_hi, l1_lo, l1_hi, l2_lo, l2_hi, l31, c0_lo, c0_hi, c1_lo,
1481       c1_hi, c01[4];
1482   (void)above;
1483 
1484   l0_lo = vld1q_u8(left + 0);
1485   l0_hi = vld1q_u8(left + 16);
1486   l31 = vld1q_dup_u8(left + 31);
1487 
1488   l1_lo = vextq_u8(l0_lo, l0_hi, 1);
1489   l1_hi = vextq_u8(l0_hi, l31, 1);
1490   l2_lo = vextq_u8(l0_lo, l0_hi, 2);
1491   l2_hi = vextq_u8(l0_hi, l31, 2);
1492 
1493   c0_lo = vrhaddq_u8(l0_lo, l1_lo);
1494   c0_hi = vrhaddq_u8(l0_hi, l1_hi);
1495   c1_lo = vrhaddq_u8(vhaddq_u8(l0_lo, l2_lo), l1_lo);
1496   c1_hi = vrhaddq_u8(vhaddq_u8(l0_hi, l2_hi), l1_hi);
1497 
1498   c01[0] = vzipq_u8(c0_lo, c1_lo).val[0];
1499   c01[1] = vzipq_u8(c0_lo, c1_lo).val[1];
1500   c01[2] = vzipq_u8(c0_hi, c1_hi).val[0];
1501   c01[3] = vzipq_u8(c0_hi, c1_hi).val[1];
1502 
1503   vst1q_u8(dst + 0 * stride + 0, c01[0]);
1504   vst1q_u8(dst + 0 * stride + 16, c01[1]);
1505   vst1q_u8(dst + 1 * stride + 0, vextq_u8(c01[0], c01[1], 2));
1506   vst1q_u8(dst + 1 * stride + 16, vextq_u8(c01[1], c01[2], 2));
1507   vst1q_u8(dst + 2 * stride + 0, vextq_u8(c01[0], c01[1], 4));
1508   vst1q_u8(dst + 2 * stride + 16, vextq_u8(c01[1], c01[2], 4));
1509   vst1q_u8(dst + 3 * stride + 0, vextq_u8(c01[0], c01[1], 6));
1510   vst1q_u8(dst + 3 * stride + 16, vextq_u8(c01[1], c01[2], 6));
1511   vst1q_u8(dst + 4 * stride + 0, vextq_u8(c01[0], c01[1], 8));
1512   vst1q_u8(dst + 4 * stride + 16, vextq_u8(c01[1], c01[2], 8));
1513   vst1q_u8(dst + 5 * stride + 0, vextq_u8(c01[0], c01[1], 10));
1514   vst1q_u8(dst + 5 * stride + 16, vextq_u8(c01[1], c01[2], 10));
1515   vst1q_u8(dst + 6 * stride + 0, vextq_u8(c01[0], c01[1], 12));
1516   vst1q_u8(dst + 6 * stride + 16, vextq_u8(c01[1], c01[2], 12));
1517   vst1q_u8(dst + 7 * stride + 0, vextq_u8(c01[0], c01[1], 14));
1518   vst1q_u8(dst + 7 * stride + 16, vextq_u8(c01[1], c01[2], 14));
1519   vst1q_u8(dst + 8 * stride + 0, c01[1]);
1520   vst1q_u8(dst + 8 * stride + 16, c01[2]);
1521   vst1q_u8(dst + 9 * stride + 0, vextq_u8(c01[1], c01[2], 2));
1522   vst1q_u8(dst + 9 * stride + 16, vextq_u8(c01[2], c01[3], 2));
1523   vst1q_u8(dst + 10 * stride + 0, vextq_u8(c01[1], c01[2], 4));
1524   vst1q_u8(dst + 10 * stride + 16, vextq_u8(c01[2], c01[3], 4));
1525   vst1q_u8(dst + 11 * stride + 0, vextq_u8(c01[1], c01[2], 6));
1526   vst1q_u8(dst + 11 * stride + 16, vextq_u8(c01[2], c01[3], 6));
1527   vst1q_u8(dst + 12 * stride + 0, vextq_u8(c01[1], c01[2], 8));
1528   vst1q_u8(dst + 12 * stride + 16, vextq_u8(c01[2], c01[3], 8));
1529   vst1q_u8(dst + 13 * stride + 0, vextq_u8(c01[1], c01[2], 10));
1530   vst1q_u8(dst + 13 * stride + 16, vextq_u8(c01[2], c01[3], 10));
1531   vst1q_u8(dst + 14 * stride + 0, vextq_u8(c01[1], c01[2], 12));
1532   vst1q_u8(dst + 14 * stride + 16, vextq_u8(c01[2], c01[3], 12));
1533   vst1q_u8(dst + 15 * stride + 0, vextq_u8(c01[1], c01[2], 14));
1534   vst1q_u8(dst + 15 * stride + 16, vextq_u8(c01[2], c01[3], 14));
1535   vst1q_u8(dst + 16 * stride + 0, c01[2]);
1536   vst1q_u8(dst + 16 * stride + 16, c01[3]);
1537   vst1q_u8(dst + 17 * stride + 0, vextq_u8(c01[2], c01[3], 2));
1538   vst1q_u8(dst + 17 * stride + 16, vextq_u8(c01[3], l31, 2));
1539   vst1q_u8(dst + 18 * stride + 0, vextq_u8(c01[2], c01[3], 4));
1540   vst1q_u8(dst + 18 * stride + 16, vextq_u8(c01[3], l31, 4));
1541   vst1q_u8(dst + 19 * stride + 0, vextq_u8(c01[2], c01[3], 6));
1542   vst1q_u8(dst + 19 * stride + 16, vextq_u8(c01[3], l31, 6));
1543   vst1q_u8(dst + 20 * stride + 0, vextq_u8(c01[2], c01[3], 8));
1544   vst1q_u8(dst + 20 * stride + 16, vextq_u8(c01[3], l31, 8));
1545   vst1q_u8(dst + 21 * stride + 0, vextq_u8(c01[2], c01[3], 10));
1546   vst1q_u8(dst + 21 * stride + 16, vextq_u8(c01[3], l31, 10));
1547   vst1q_u8(dst + 22 * stride + 0, vextq_u8(c01[2], c01[3], 12));
1548   vst1q_u8(dst + 22 * stride + 16, vextq_u8(c01[3], l31, 12));
1549   vst1q_u8(dst + 23 * stride + 0, vextq_u8(c01[2], c01[3], 14));
1550   vst1q_u8(dst + 23 * stride + 16, vextq_u8(c01[3], l31, 14));
1551   vst1q_u8(dst + 24 * stride + 0, c01[3]);
1552   vst1q_u8(dst + 24 * stride + 16, l31);
1553   vst1q_u8(dst + 25 * stride + 0, vextq_u8(c01[3], l31, 2));
1554   vst1q_u8(dst + 25 * stride + 16, l31);
1555   vst1q_u8(dst + 26 * stride + 0, vextq_u8(c01[3], l31, 4));
1556   vst1q_u8(dst + 26 * stride + 16, l31);
1557   vst1q_u8(dst + 27 * stride + 0, vextq_u8(c01[3], l31, 6));
1558   vst1q_u8(dst + 27 * stride + 16, l31);
1559   vst1q_u8(dst + 28 * stride + 0, vextq_u8(c01[3], l31, 8));
1560   vst1q_u8(dst + 28 * stride + 16, l31);
1561   vst1q_u8(dst + 29 * stride + 0, vextq_u8(c01[3], l31, 10));
1562   vst1q_u8(dst + 29 * stride + 16, l31);
1563   vst1q_u8(dst + 30 * stride + 0, vextq_u8(c01[3], l31, 12));
1564   vst1q_u8(dst + 30 * stride + 16, l31);
1565   vst1q_u8(dst + 31 * stride + 0, vextq_u8(c01[3], l31, 14));
1566   vst1q_u8(dst + 31 * stride + 16, l31);
1567 }
1568 
1569 // -----------------------------------------------------------------------------
1570 
1571 #if !HAVE_NEON_ASM
1572 
1573 void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
1574                               const uint8_t *above, const uint8_t *left) {
1575   const uint32_t d = *(const uint32_t *)above;
1576   int i;
1577   (void)left;
1578 
1579   for (i = 0; i < 4; i++, dst += stride) {
1580     *(uint32_t *)dst = d;
1581   }
1582 }
1583 
1584 void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
1585                               const uint8_t *above, const uint8_t *left) {
1586   const uint8x8_t d = vld1_u8(above);
1587   int i;
1588   (void)left;
1589 
1590   for (i = 0; i < 8; i++, dst += stride) {
1591     vst1_u8(dst, d);
1592   }
1593 }
1594 
1595 void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
1596                                 const uint8_t *above, const uint8_t *left) {
1597   const uint8x16_t d = vld1q_u8(above);
1598   int i;
1599   (void)left;
1600 
1601   for (i = 0; i < 16; i++, dst += stride) {
1602     vst1q_u8(dst, d);
1603   }
1604 }
1605 
1606 void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
1607                                 const uint8_t *above, const uint8_t *left) {
1608   const uint8x16_t d0 = vld1q_u8(above);
1609   const uint8x16_t d1 = vld1q_u8(above + 16);
1610   int i;
1611   (void)left;
1612 
1613   for (i = 0; i < 32; i++) {
1614     // Note: performance was worse using vst2q_u8 under gcc-4.9 & clang-3.8.
1615     // clang-3.8 unrolled the loop fully with no filler, so the cause is likely
1616     // the latency of the instruction.
1617     vst1q_u8(dst, d0);
1618     dst += 16;
1619     vst1q_u8(dst, d1);
1620     dst += stride - 16;
1621   }
1622 }
1623 
1624 // -----------------------------------------------------------------------------
1625 
1626 void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
1627                               const uint8_t *above, const uint8_t *left) {
1628   const uint32x2_t zero = vdup_n_u32(0);
1629   const uint8x8_t left_u8 =
1630       vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)left, zero, 0));
1631   uint8x8_t d;
1632   (void)above;
1633 
1634   d = vdup_lane_u8(left_u8, 0);
1635   vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
1636   dst += stride;
1637   d = vdup_lane_u8(left_u8, 1);
1638   vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
1639   dst += stride;
1640   d = vdup_lane_u8(left_u8, 2);
1641   vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
1642   dst += stride;
1643   d = vdup_lane_u8(left_u8, 3);
1644   vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
1645 }
1646 
1647 void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
1648                               const uint8_t *above, const uint8_t *left) {
1649   const uint8x8_t left_u8 = vld1_u8(left);
1650   uint8x8_t d;
1651   (void)above;
1652 
1653   d = vdup_lane_u8(left_u8, 0);
1654   vst1_u8(dst, d);
1655   dst += stride;
1656   d = vdup_lane_u8(left_u8, 1);
1657   vst1_u8(dst, d);
1658   dst += stride;
1659   d = vdup_lane_u8(left_u8, 2);
1660   vst1_u8(dst, d);
1661   dst += stride;
1662   d = vdup_lane_u8(left_u8, 3);
1663   vst1_u8(dst, d);
1664   dst += stride;
1665   d = vdup_lane_u8(left_u8, 4);
1666   vst1_u8(dst, d);
1667   dst += stride;
1668   d = vdup_lane_u8(left_u8, 5);
1669   vst1_u8(dst, d);
1670   dst += stride;
1671   d = vdup_lane_u8(left_u8, 6);
1672   vst1_u8(dst, d);
1673   dst += stride;
1674   d = vdup_lane_u8(left_u8, 7);
1675   vst1_u8(dst, d);
1676 }
1677 
1678 static INLINE void h_store_16x8(uint8_t **dst, const ptrdiff_t stride,
1679                                 const uint8x8_t left) {
1680   const uint8x16_t row_0 = vdupq_lane_u8(left, 0);
1681   const uint8x16_t row_1 = vdupq_lane_u8(left, 1);
1682   const uint8x16_t row_2 = vdupq_lane_u8(left, 2);
1683   const uint8x16_t row_3 = vdupq_lane_u8(left, 3);
1684   const uint8x16_t row_4 = vdupq_lane_u8(left, 4);
1685   const uint8x16_t row_5 = vdupq_lane_u8(left, 5);
1686   const uint8x16_t row_6 = vdupq_lane_u8(left, 6);
1687   const uint8x16_t row_7 = vdupq_lane_u8(left, 7);
1688 
1689   vst1q_u8(*dst, row_0);
1690   *dst += stride;
1691   vst1q_u8(*dst, row_1);
1692   *dst += stride;
1693   vst1q_u8(*dst, row_2);
1694   *dst += stride;
1695   vst1q_u8(*dst, row_3);
1696   *dst += stride;
1697   vst1q_u8(*dst, row_4);
1698   *dst += stride;
1699   vst1q_u8(*dst, row_5);
1700   *dst += stride;
1701   vst1q_u8(*dst, row_6);
1702   *dst += stride;
1703   vst1q_u8(*dst, row_7);
1704   *dst += stride;
1705 }
1706 
1707 void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
1708                                 const uint8_t *above, const uint8_t *left) {
1709   const uint8x16_t left_u8q = vld1q_u8(left);
1710   (void)above;
1711 
1712   h_store_16x8(&dst, stride, vget_low_u8(left_u8q));
1713   h_store_16x8(&dst, stride, vget_high_u8(left_u8q));
1714 }
1715 
1716 static INLINE void h_store_32x8(uint8_t **dst, const ptrdiff_t stride,
1717                                 const uint8x8_t left) {
1718   const uint8x16_t row_0 = vdupq_lane_u8(left, 0);
1719   const uint8x16_t row_1 = vdupq_lane_u8(left, 1);
1720   const uint8x16_t row_2 = vdupq_lane_u8(left, 2);
1721   const uint8x16_t row_3 = vdupq_lane_u8(left, 3);
1722   const uint8x16_t row_4 = vdupq_lane_u8(left, 4);
1723   const uint8x16_t row_5 = vdupq_lane_u8(left, 5);
1724   const uint8x16_t row_6 = vdupq_lane_u8(left, 6);
1725   const uint8x16_t row_7 = vdupq_lane_u8(left, 7);
1726 
1727   vst1q_u8(*dst, row_0);  // Note clang-3.8 produced poor code w/vst2q_u8
1728   *dst += 16;
1729   vst1q_u8(*dst, row_0);
1730   *dst += stride - 16;
1731   vst1q_u8(*dst, row_1);
1732   *dst += 16;
1733   vst1q_u8(*dst, row_1);
1734   *dst += stride - 16;
1735   vst1q_u8(*dst, row_2);
1736   *dst += 16;
1737   vst1q_u8(*dst, row_2);
1738   *dst += stride - 16;
1739   vst1q_u8(*dst, row_3);
1740   *dst += 16;
1741   vst1q_u8(*dst, row_3);
1742   *dst += stride - 16;
1743   vst1q_u8(*dst, row_4);
1744   *dst += 16;
1745   vst1q_u8(*dst, row_4);
1746   *dst += stride - 16;
1747   vst1q_u8(*dst, row_5);
1748   *dst += 16;
1749   vst1q_u8(*dst, row_5);
1750   *dst += stride - 16;
1751   vst1q_u8(*dst, row_6);
1752   *dst += 16;
1753   vst1q_u8(*dst, row_6);
1754   *dst += stride - 16;
1755   vst1q_u8(*dst, row_7);
1756   *dst += 16;
1757   vst1q_u8(*dst, row_7);
1758   *dst += stride - 16;
1759 }
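
// For reference, the vst2q_u8 alternative mentioned in the note above would
// look roughly like the sketch below (hypothetical): because both 16-byte
// halves of each 32-wide row hold the same duplicated byte, the interleaving
// store writes the identical 32 bytes in a single instruction, but it
// generated poor code with clang-3.8.
//
//   const uint8x16x2_t row = { { row_0, row_0 } };
//   vst2q_u8(*dst, row);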
1760 
1761 void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
1762                                 const uint8_t *above, const uint8_t *left) {
1763   int i;
1764   (void)above;
1765 
1766   for (i = 0; i < 2; i++, left += 16) {
1767     const uint8x16_t left_u8 = vld1q_u8(left);
1768     h_store_32x8(&dst, stride, vget_low_u8(left_u8));
1769     h_store_32x8(&dst, stride, vget_high_u8(left_u8));
1770   }
1771 }
1772 
1773 // -----------------------------------------------------------------------------
1774 
1775 static INLINE int16x8_t convert_u8_to_s16(uint8x8_t v) {
1776   return vreinterpretq_s16_u16(vmovl_u8(v));
1777 }
1778 
1779 void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
1780                                const uint8_t *above, const uint8_t *left) {
1781   const uint8x8_t top_left = vld1_dup_u8(above - 1);
1782   const uint8x8_t left_u8 = vld1_u8(left);
1783   const uint8x8_t above_u8 = vld1_u8(above);
1784   const int16x4_t left_s16 = vget_low_s16(convert_u8_to_s16(left_u8));
1785   int16x8_t sub, sum;
1786   uint32x2_t d;
1787 
1788   sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left));
1789   // Avoid vcombine_s16() which generates lots of redundant code with clang-3.8.
1790   sub = vreinterpretq_s16_s64(
1791       vdupq_lane_s64(vreinterpret_s64_s16(vget_low_s16(sub)), 0));
1792 
1793   sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1));
1794   sum = vaddq_s16(sum, sub);
1795   d = vreinterpret_u32_u8(vqmovun_s16(sum));
1796   vst1_lane_u32((uint32_t *)dst, d, 0);
1797   dst += stride;
1798   vst1_lane_u32((uint32_t *)dst, d, 1);
1799   dst += stride;
1800 
1801   sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3));
1802   sum = vaddq_s16(sum, sub);
1803   d = vreinterpret_u32_u8(vqmovun_s16(sum));
1804   vst1_lane_u32((uint32_t *)dst, d, 0);
1805   dst += stride;
1806   vst1_lane_u32((uint32_t *)dst, d, 1);
1807 }
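
// The TM predictors in this section add a per-row left value to
// (above[c] - above[-1]) and saturate the result to 8 bits via vqmovun_s16.
// A minimal scalar sketch of that computation for a bs x bs block (the name
// tm_sketch and the #if 0 guard are illustrative additions):
#if 0
static void tm_sketch(uint8_t *dst, ptrdiff_t stride, int bs,
                      const uint8_t *above, const uint8_t *left) {
  int r, c;
  for (r = 0; r < bs; ++r) {
    for (c = 0; c < bs; ++c) {
      const int v = left[r] + above[c] - above[-1];
      dst[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    dst += stride;
  }
}
#endif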
1808 
1809 static INLINE void tm_8_kernel(uint8_t **dst, const ptrdiff_t stride,
1810                                const int16x8_t left_dup, const int16x8_t sub) {
1811   const int16x8_t sum = vaddq_s16(left_dup, sub);
1812   const uint8x8_t d = vqmovun_s16(sum);
1813   vst1_u8(*dst, d);
1814   *dst += stride;
1815 }
1816 
1817 void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
1818                                const uint8_t *above, const uint8_t *left) {
1819   const uint8x8_t top_left = vld1_dup_u8(above - 1);
1820   const uint8x8_t above_u8 = vld1_u8(above);
1821   const uint8x8_t left_u8 = vld1_u8(left);
1822   const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
1823   const int16x8_t sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left));
1824   int16x4_t left_s16d = vget_low_s16(left_s16q);
1825   int i;
1826 
1827   for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
1828     int16x8_t left_dup;
1829 
1830     left_dup = vdupq_lane_s16(left_s16d, 0);
1831     tm_8_kernel(&dst, stride, left_dup, sub);
1832     left_dup = vdupq_lane_s16(left_s16d, 1);
1833     tm_8_kernel(&dst, stride, left_dup, sub);
1834     left_dup = vdupq_lane_s16(left_s16d, 2);
1835     tm_8_kernel(&dst, stride, left_dup, sub);
1836     left_dup = vdupq_lane_s16(left_s16d, 3);
1837     tm_8_kernel(&dst, stride, left_dup, sub);
1838   }
1839 }
1840 
1841 static INLINE void tm_16_kernel(uint8_t **dst, const ptrdiff_t stride,
1842                                 const int16x8_t left_dup, const int16x8_t sub0,
1843                                 const int16x8_t sub1) {
1844   const int16x8_t sum0 = vaddq_s16(left_dup, sub0);
1845   const int16x8_t sum1 = vaddq_s16(left_dup, sub1);
1846   const uint8x8_t d0 = vqmovun_s16(sum0);
1847   const uint8x8_t d1 = vqmovun_s16(sum1);
1848   vst1_u8(*dst, d0);
1849   *dst += 8;
1850   vst1_u8(*dst, d1);
1851   *dst += stride - 8;
1852 }
1853 
1854 void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
1855                                  const uint8_t *above, const uint8_t *left) {
1856   const uint8x16_t top_left = vld1q_dup_u8(above - 1);
1857   const uint8x16_t above_u8 = vld1q_u8(above);
1858   const int16x8_t sub0 = vreinterpretq_s16_u16(
1859       vsubl_u8(vget_low_u8(above_u8), vget_low_u8(top_left)));
1860   const int16x8_t sub1 = vreinterpretq_s16_u16(
1861       vsubl_u8(vget_high_u8(above_u8), vget_high_u8(top_left)));
1862   int16x8_t left_dup;
1863   int i;
1864 
1865   for (i = 0; i < 2; i++, left += 8) {
1866     const uint8x8_t left_u8 = vld1_u8(left);
1867     const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
1868     const int16x4_t left_low = vget_low_s16(left_s16q);
1869     const int16x4_t left_high = vget_high_s16(left_s16q);
1870 
1871     left_dup = vdupq_lane_s16(left_low, 0);
1872     tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
1873     left_dup = vdupq_lane_s16(left_low, 1);
1874     tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
1875     left_dup = vdupq_lane_s16(left_low, 2);
1876     tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
1877     left_dup = vdupq_lane_s16(left_low, 3);
1878     tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
1879 
1880     left_dup = vdupq_lane_s16(left_high, 0);
1881     tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
1882     left_dup = vdupq_lane_s16(left_high, 1);
1883     tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
1884     left_dup = vdupq_lane_s16(left_high, 2);
1885     tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
1886     left_dup = vdupq_lane_s16(left_high, 3);
1887     tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
1888   }
1889 }
1890 
1891 static INLINE void tm_32_kernel(uint8_t **dst, const ptrdiff_t stride,
1892                                 const int16x8_t left_dup, const int16x8_t sub0,
1893                                 const int16x8_t sub1, const int16x8_t sub2,
1894                                 const int16x8_t sub3) {
1895   const int16x8_t sum0 = vaddq_s16(left_dup, sub0);
1896   const int16x8_t sum1 = vaddq_s16(left_dup, sub1);
1897   const int16x8_t sum2 = vaddq_s16(left_dup, sub2);
1898   const int16x8_t sum3 = vaddq_s16(left_dup, sub3);
1899   const uint8x8_t d0 = vqmovun_s16(sum0);
1900   const uint8x8_t d1 = vqmovun_s16(sum1);
1901   const uint8x8_t d2 = vqmovun_s16(sum2);
1902   const uint8x8_t d3 = vqmovun_s16(sum3);
1903 
1904   vst1q_u8(*dst, vcombine_u8(d0, d1));
1905   *dst += 16;
1906   vst1q_u8(*dst, vcombine_u8(d2, d3));
1907   *dst += stride - 16;
1908 }
1909 
1910 void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
1911                                  const uint8_t *above, const uint8_t *left) {
1912   const uint8x16_t top_left = vld1q_dup_u8(above - 1);
1913   const uint8x16_t above_low = vld1q_u8(above);
1914   const uint8x16_t above_high = vld1q_u8(above + 16);
1915   const int16x8_t sub0 = vreinterpretq_s16_u16(
1916       vsubl_u8(vget_low_u8(above_low), vget_low_u8(top_left)));
1917   const int16x8_t sub1 = vreinterpretq_s16_u16(
1918       vsubl_u8(vget_high_u8(above_low), vget_high_u8(top_left)));
1919   const int16x8_t sub2 = vreinterpretq_s16_u16(
1920       vsubl_u8(vget_low_u8(above_high), vget_low_u8(top_left)));
1921   const int16x8_t sub3 = vreinterpretq_s16_u16(
1922       vsubl_u8(vget_high_u8(above_high), vget_high_u8(top_left)));
1923   int16x8_t left_dup;
1924   int i, j;
1925 
1926   for (j = 0; j < 4; j++, left += 8) {
1927     const uint8x8_t left_u8 = vld1_u8(left);
1928     const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
1929     int16x4_t left_s16d = vget_low_s16(left_s16q);
1930     for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
1931       left_dup = vdupq_lane_s16(left_s16d, 0);
1932       tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
1933       left_dup = vdupq_lane_s16(left_s16d, 1);
1934       tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
1935       left_dup = vdupq_lane_s16(left_s16d, 2);
1936       tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
1937       left_dup = vdupq_lane_s16(left_s16d, 3);
1938       tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
1939     }
1940   }
1941 }
1942 #endif  // !HAVE_NEON_ASM
1943