1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 #include "mem_neon.h"
16 #include "sum_neon.h"
17 #include "vpx/vpx_integer.h"
18
19 //------------------------------------------------------------------------------
20 // DC 4x4
21
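// Note on the DC predictors below: each NxN vpx_dc_predictor fills the block
// with (sum(above[0..N-1]) + sum(left[0..N-1]) + N) >> log2(2 * N), using
// vrshrn_n_u16 for the rounding shift. The _left/_top variants average a
// single edge as (sum + N / 2) >> log2(N), and the _128 variants use the
// fixed value 0x80 (128).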
static INLINE uint16_t dc_sum_4(const uint8_t *ref) {
23 return horizontal_add_uint8x4(load_unaligned_u8_4x1(ref));
24 }
25
static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride,
27 const uint8x8_t dc) {
28 int i;
29 for (i = 0; i < 4; ++i, dst += stride) {
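    // Store lanes 0-3 of dc as a single 32-bit write.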
30 vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc), 0);
31 }
32 }
33
void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
35 const uint8_t *above, const uint8_t *left) {
36 const uint8x8_t a = load_unaligned_u8_4x1(above);
37 const uint8x8_t l = load_unaligned_u8_4x1(left);
38 const uint16x4_t al = vget_low_u16(vaddl_u8(a, l));
39 const uint16_t sum = horizontal_add_uint16x4(al);
40 const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3);
41 dc_store_4x4(dst, stride, dc);
42 }
43
void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
45 const uint8_t *above, const uint8_t *left) {
46 const uint16_t sum = dc_sum_4(left);
47 const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2);
48 (void)above;
49 dc_store_4x4(dst, stride, dc);
50 }
51
void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
53 const uint8_t *above, const uint8_t *left) {
54 const uint16_t sum = dc_sum_4(above);
55 const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2);
56 (void)left;
57 dc_store_4x4(dst, stride, dc);
58 }
59
void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
61 const uint8_t *above, const uint8_t *left) {
62 const uint8x8_t dc = vdup_n_u8(0x80);
63 (void)above;
64 (void)left;
65 dc_store_4x4(dst, stride, dc);
66 }
67
68 //------------------------------------------------------------------------------
69 // DC 8x8
70
static INLINE uint16_t dc_sum_8(const uint8_t *ref) {
72 return horizontal_add_uint8x8(vld1_u8(ref));
73 }
74
static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride,
76 const uint8x8_t dc) {
77 int i;
78 for (i = 0; i < 8; ++i, dst += stride) {
79 vst1_u8(dst, dc);
80 }
81 }
82
void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
84 const uint8_t *above, const uint8_t *left) {
85 const uint8x8_t above_u8 = vld1_u8(above);
86 const uint8x8_t left_u8 = vld1_u8(left);
87 const uint16x8_t al = vaddl_u8(above_u8, left_u8);
88 const uint16_t sum = horizontal_add_uint16x8(al);
89 const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 4);
90 dc_store_8x8(dst, stride, dc);
91 }
92
void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
94 const uint8_t *above, const uint8_t *left) {
95 const uint16_t sum = dc_sum_8(left);
96 const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3);
97 (void)above;
98 dc_store_8x8(dst, stride, dc);
99 }
100
void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
102 const uint8_t *above, const uint8_t *left) {
103 const uint16_t sum = dc_sum_8(above);
104 const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3);
105 (void)left;
106 dc_store_8x8(dst, stride, dc);
107 }
108
void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
110 const uint8_t *above, const uint8_t *left) {
111 const uint8x8_t dc = vdup_n_u8(0x80);
112 (void)above;
113 (void)left;
114 dc_store_8x8(dst, stride, dc);
115 }
116
117 //------------------------------------------------------------------------------
118 // DC 16x16
119
static INLINE uint16_t dc_sum_16(const uint8_t *ref) {
121 return horizontal_add_uint8x16(vld1q_u8(ref));
122 }
123
static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride,
125 const uint8x16_t dc) {
126 int i;
127 for (i = 0; i < 16; ++i, dst += stride) {
128 vst1q_u8(dst + 0, dc);
129 }
130 }
131
void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
133 const uint8_t *above, const uint8_t *left) {
134 const uint8x16_t ref0 = vld1q_u8(above);
135 const uint8x16_t ref1 = vld1q_u8(left);
136 const uint16x8_t a = vpaddlq_u8(ref0);
137 const uint16x8_t l = vpaddlq_u8(ref1);
138 const uint16x8_t al = vaddq_u16(a, l);
139 const uint16_t sum = horizontal_add_uint16x8(al);
140 const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0);
141 dc_store_16x16(dst, stride, dc);
142 }
143
void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
145 const uint8_t *above,
146 const uint8_t *left) {
147 const uint16_t sum = dc_sum_16(left);
148 const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0);
149 (void)above;
150 dc_store_16x16(dst, stride, dc);
151 }
152
void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
154 const uint8_t *above,
155 const uint8_t *left) {
156 const uint16_t sum = dc_sum_16(above);
157 const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0);
158 (void)left;
159 dc_store_16x16(dst, stride, dc);
160 }
161
void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
163 const uint8_t *above,
164 const uint8_t *left) {
165 const uint8x16_t dc = vdupq_n_u8(0x80);
166 (void)above;
167 (void)left;
168 dc_store_16x16(dst, stride, dc);
169 }
170
171 //------------------------------------------------------------------------------
172 // DC 32x32
173
static INLINE uint16_t dc_sum_32(const uint8_t *ref) {
175 const uint8x16_t r0 = vld1q_u8(ref + 0);
176 const uint8x16_t r1 = vld1q_u8(ref + 16);
177 const uint16x8_t r01 = vaddq_u16(vpaddlq_u8(r0), vpaddlq_u8(r1));
178 return horizontal_add_uint16x8(r01);
179 }
180
static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride,
182 const uint8x16_t dc) {
183 int i;
184 for (i = 0; i < 32; ++i, dst += stride) {
185 vst1q_u8(dst + 0, dc);
186 vst1q_u8(dst + 16, dc);
187 }
188 }
189
void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
191 const uint8_t *above, const uint8_t *left) {
192 const uint8x16_t a0 = vld1q_u8(above + 0);
193 const uint8x16_t a1 = vld1q_u8(above + 16);
194 const uint8x16_t l0 = vld1q_u8(left + 0);
195 const uint8x16_t l1 = vld1q_u8(left + 16);
196 const uint16x8_t a01 = vaddq_u16(vpaddlq_u8(a0), vpaddlq_u8(a1));
197 const uint16x8_t l01 = vaddq_u16(vpaddlq_u8(l0), vpaddlq_u8(l1));
198 const uint16x8_t al = vaddq_u16(a01, l01);
199 const uint16_t sum = horizontal_add_uint16x8(al);
200 const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 6), 0);
201 dc_store_32x32(dst, stride, dc);
202 }
203
void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
205 const uint8_t *above,
206 const uint8_t *left) {
207 const uint16_t sum = dc_sum_32(left);
208 const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0);
209 (void)above;
210 dc_store_32x32(dst, stride, dc);
211 }
212
void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
214 const uint8_t *above,
215 const uint8_t *left) {
216 const uint16_t sum = dc_sum_32(above);
217 const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0);
218 (void)left;
219 dc_store_32x32(dst, stride, dc);
220 }
221
void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
223 const uint8_t *above,
224 const uint8_t *left) {
225 const uint8x16_t dc = vdupq_n_u8(0x80);
226 (void)above;
227 (void)left;
228 dc_store_32x32(dst, stride, dc);
229 }
230
231 // -----------------------------------------------------------------------------
232
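// A note on the directional predictors below: they rely on two NEON idioms
// for the AVG2/AVG3 operations referenced in the comments,
//   AVG2(a, b) = (a + b + 1) >> 1          -> vrhadd_u8(a, b)
//   AVG3(a, b, c) = (a + 2*b + c + 2) >> 2 -> vrhadd_u8(vhadd_u8(a, c), b)
// The truncating halving add of the two outer samples followed by a rounding
// halving add with the center sample matches the scalar AVG3 expression for
// all 8-bit inputs.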
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
234 const uint8_t *above, const uint8_t *left) {
235 uint8x8_t a0, a1, a2, d0;
236 uint8_t a7;
237 (void)left;
238
239 a0 = vld1_u8(above);
240 a7 = above[7];
241
242 // [ above[1], ..., above[6], x, x ]
243 a1 = vext_u8(a0, a0, 1);
244 // [ above[2], ..., above[7], x, x ]
245 a2 = vext_u8(a0, a0, 2);
246
247 // d0[0] = AVG3(above[0], above[1], above[2]);
248 // ...
249 // d0[5] = AVG3(above[5], above[6], above[7]);
250 // d0[6] = x (don't care)
251 // d0[7] = x (don't care)
252 d0 = vrhadd_u8(vhadd_u8(a0, a2), a1);
253
254 // We want:
255 // stride=0 [ d0[0], d0[1], d0[2], d0[3] ]
256 // stride=1 [ d0[1], d0[2], d0[3], d0[4] ]
257 // stride=2 [ d0[2], d0[3], d0[4], d0[5] ]
  // stride=3 [ d0[3], d0[4], d0[5], above[7] ]
259 store_u8_4x1(dst + 0 * stride, d0);
260 store_u8_4x1(dst + 1 * stride, vext_u8(d0, d0, 1));
261 store_u8_4x1(dst + 2 * stride, vext_u8(d0, d0, 2));
262 store_u8_4x1(dst + 3 * stride, vext_u8(d0, d0, 3));
263
264 // We stored d0[6] above, so fixup into above[7].
265 dst[3 * stride + 3] = a7;
266 }
267
void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
269 const uint8_t *above, const uint8_t *left) {
270 uint8x8_t ax0, a0, a1, a7, d0;
271 (void)left;
272
273 a0 = vld1_u8(above + 0);
274 a1 = vld1_u8(above + 1);
275 a7 = vld1_dup_u8(above + 7);
276
277 // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can
278 // shift in above[7] later, so shift a0 across by one to get the right
279 // inputs:
280 // [ x, above[0], ... , above[6] ]
281 ax0 = vext_u8(a0, a0, 7);
282
283 // d0[0] = x (don't care)
284 // d0[1] = AVG3(above[0], above[1], above[2]);
285 // ...
286 // d0[7] = AVG3(above[6], above[7], above[8]);
287 d0 = vrhadd_u8(vhadd_u8(ax0, a1), a0);
288
289 // Undo the earlier ext, incrementally shift in duplicates of above[7].
290 vst1_u8(dst + 0 * stride, vext_u8(d0, a7, 1));
291 vst1_u8(dst + 1 * stride, vext_u8(d0, a7, 2));
292 vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 3));
293 vst1_u8(dst + 3 * stride, vext_u8(d0, a7, 4));
294 vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 5));
295 vst1_u8(dst + 5 * stride, vext_u8(d0, a7, 6));
296 vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 7));
297 vst1_u8(dst + 7 * stride, a7);
298 }
299
void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
301 const uint8_t *above, const uint8_t *left) {
302 uint8x16_t ax0, a0, a1, a15, d0;
303 (void)left;
304
305 a0 = vld1q_u8(above + 0);
306 a1 = vld1q_u8(above + 1);
307 a15 = vld1q_dup_u8(above + 15);
308
309 // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can
310 // shift in above[15] later, so shift a0 across by one to get the right
311 // inputs:
312 // [ x, above[0], ... , above[14] ]
313 ax0 = vextq_u8(a0, a0, 15);
314
315 // d0[0] = x (don't care)
316 // d0[1] = AVG3(above[0], above[1], above[2]);
317 // ...
318 // d0[15] = AVG3(above[14], above[15], above[16]);
319 d0 = vrhaddq_u8(vhaddq_u8(ax0, a1), a0);
320
321 // Undo the earlier ext, incrementally shift in duplicates of above[15].
322 vst1q_u8(dst + 0 * stride, vextq_u8(d0, a15, 1));
323 vst1q_u8(dst + 1 * stride, vextq_u8(d0, a15, 2));
324 vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 3));
325 vst1q_u8(dst + 3 * stride, vextq_u8(d0, a15, 4));
326 vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 5));
327 vst1q_u8(dst + 5 * stride, vextq_u8(d0, a15, 6));
328 vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 7));
329 vst1q_u8(dst + 7 * stride, vextq_u8(d0, a15, 8));
330 vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 9));
331 vst1q_u8(dst + 9 * stride, vextq_u8(d0, a15, 10));
332 vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 11));
333 vst1q_u8(dst + 11 * stride, vextq_u8(d0, a15, 12));
334 vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 13));
335 vst1q_u8(dst + 13 * stride, vextq_u8(d0, a15, 14));
336 vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 15));
337 vst1q_u8(dst + 15 * stride, a15);
338 }
339
void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
341 const uint8_t *above, const uint8_t *left) {
342 uint8x16_t ax0, a0, a1, a15, a16, a17, a31, d0[2];
343 (void)left;
344
345 a0 = vld1q_u8(above + 0);
346 a1 = vld1q_u8(above + 1);
347 a15 = vld1q_u8(above + 15);
348 a16 = vld1q_u8(above + 16);
349 a17 = vld1q_u8(above + 17);
350 a31 = vld1q_dup_u8(above + 31);
351
  // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can
  // shift in d0[1] (and, eventually, above[31]) later, so shift a0 across by
  // one to get the right inputs:
  // [ x, above[0], ... , above[14] ]
356 ax0 = vextq_u8(a0, a0, 15);
357
  // d0[0][0] = x (don't care)
  // d0[0][1] = AVG3(above[0], above[1], above[2]);
  // ...
  // d0[0][15] = AVG3(above[14], above[15], above[16]);
  // d0[1] holds the AVG3 results for the next 16 positions, starting with
  // AVG3(above[15], above[16], above[17]).
362 d0[0] = vrhaddq_u8(vhaddq_u8(ax0, a1), a0);
363 d0[1] = vrhaddq_u8(vhaddq_u8(a15, a17), a16);
364
  // Undo the earlier ext, incrementally shift in duplicates of above[31].
366 vst1q_u8(dst + 0 * stride + 0, vextq_u8(d0[0], d0[1], 1));
367 vst1q_u8(dst + 0 * stride + 16, vextq_u8(d0[1], a31, 1));
368 vst1q_u8(dst + 1 * stride + 0, vextq_u8(d0[0], d0[1], 2));
369 vst1q_u8(dst + 1 * stride + 16, vextq_u8(d0[1], a31, 2));
370 vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0[0], d0[1], 3));
371 vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0[1], a31, 3));
372 vst1q_u8(dst + 3 * stride + 0, vextq_u8(d0[0], d0[1], 4));
373 vst1q_u8(dst + 3 * stride + 16, vextq_u8(d0[1], a31, 4));
374 vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0[0], d0[1], 5));
375 vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0[1], a31, 5));
376 vst1q_u8(dst + 5 * stride + 0, vextq_u8(d0[0], d0[1], 6));
377 vst1q_u8(dst + 5 * stride + 16, vextq_u8(d0[1], a31, 6));
378 vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0[0], d0[1], 7));
379 vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0[1], a31, 7));
380 vst1q_u8(dst + 7 * stride + 0, vextq_u8(d0[0], d0[1], 8));
381 vst1q_u8(dst + 7 * stride + 16, vextq_u8(d0[1], a31, 8));
382 vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0[0], d0[1], 9));
383 vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0[1], a31, 9));
384 vst1q_u8(dst + 9 * stride + 0, vextq_u8(d0[0], d0[1], 10));
385 vst1q_u8(dst + 9 * stride + 16, vextq_u8(d0[1], a31, 10));
386 vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0[0], d0[1], 11));
387 vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0[1], a31, 11));
388 vst1q_u8(dst + 11 * stride + 0, vextq_u8(d0[0], d0[1], 12));
389 vst1q_u8(dst + 11 * stride + 16, vextq_u8(d0[1], a31, 12));
390 vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0[0], d0[1], 13));
391 vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0[1], a31, 13));
392 vst1q_u8(dst + 13 * stride + 0, vextq_u8(d0[0], d0[1], 14));
393 vst1q_u8(dst + 13 * stride + 16, vextq_u8(d0[1], a31, 14));
394 vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0[0], d0[1], 15));
395 vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0[1], a31, 15));
396 vst1q_u8(dst + 15 * stride + 0, d0[1]);
397 vst1q_u8(dst + 15 * stride + 16, a31);
398
399 vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0[1], a31, 1));
400 vst1q_u8(dst + 16 * stride + 16, a31);
401 vst1q_u8(dst + 17 * stride + 0, vextq_u8(d0[1], a31, 2));
402 vst1q_u8(dst + 17 * stride + 16, a31);
403 vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0[1], a31, 3));
404 vst1q_u8(dst + 18 * stride + 16, a31);
405 vst1q_u8(dst + 19 * stride + 0, vextq_u8(d0[1], a31, 4));
406 vst1q_u8(dst + 19 * stride + 16, a31);
407 vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0[1], a31, 5));
408 vst1q_u8(dst + 20 * stride + 16, a31);
409 vst1q_u8(dst + 21 * stride + 0, vextq_u8(d0[1], a31, 6));
410 vst1q_u8(dst + 21 * stride + 16, a31);
411 vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0[1], a31, 7));
412 vst1q_u8(dst + 22 * stride + 16, a31);
413 vst1q_u8(dst + 23 * stride + 0, vextq_u8(d0[1], a31, 8));
414 vst1q_u8(dst + 23 * stride + 16, a31);
415 vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0[1], a31, 9));
416 vst1q_u8(dst + 24 * stride + 16, a31);
417 vst1q_u8(dst + 25 * stride + 0, vextq_u8(d0[1], a31, 10));
418 vst1q_u8(dst + 25 * stride + 16, a31);
419 vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0[1], a31, 11));
420 vst1q_u8(dst + 26 * stride + 16, a31);
421 vst1q_u8(dst + 27 * stride + 0, vextq_u8(d0[1], a31, 12));
422 vst1q_u8(dst + 27 * stride + 16, a31);
423 vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0[1], a31, 13));
424 vst1q_u8(dst + 28 * stride + 16, a31);
425 vst1q_u8(dst + 29 * stride + 0, vextq_u8(d0[1], a31, 14));
426 vst1q_u8(dst + 29 * stride + 16, a31);
427 vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0[1], a31, 15));
428 vst1q_u8(dst + 30 * stride + 16, a31);
429 vst1q_u8(dst + 31 * stride + 0, a31);
430 vst1q_u8(dst + 31 * stride + 16, a31);
431 }
432
433 // -----------------------------------------------------------------------------
434
void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
436 const uint8_t *above, const uint8_t *left) {
437 uint8x8_t a0, a1, a2, a3, d0, d1, d2, d3;
438 (void)left;
439
440 a0 = load_unaligned_u8_4x1(above + 0);
441 a1 = load_unaligned_u8_4x1(above + 1);
442 a2 = load_unaligned_u8_4x1(above + 2);
443 a3 = load_unaligned_u8_4x1(above + 3);
444
445 d0 = vrhadd_u8(a0, a1);
446 d1 = vrhadd_u8(vhadd_u8(a0, a2), a1);
447 d2 = vrhadd_u8(a1, a2);
448 d3 = vrhadd_u8(vhadd_u8(a1, a3), a2);
449
450 store_u8_4x1(dst + 0 * stride, d0);
451 store_u8_4x1(dst + 1 * stride, d1);
452 store_u8_4x1(dst + 2 * stride, d2);
453 store_u8_4x1(dst + 3 * stride, d3);
454 }
455
void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
457 const uint8_t *above, const uint8_t *left) {
458 uint8x8_t a0, a1, a2, a7, d0, d1;
459 (void)left;
460
461 a0 = vld1_u8(above + 0);
462 a1 = vld1_u8(above + 1);
463 a2 = vld1_u8(above + 2);
464 a7 = vld1_dup_u8(above + 7);
465
466 d0 = vrhadd_u8(a0, a1);
467 d1 = vrhadd_u8(vhadd_u8(a0, a2), a1);
468
469 vst1_u8(dst + 0 * stride, d0);
470 vst1_u8(dst + 1 * stride, d1);
471
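  // Rotate d0/d1 down by one lane so that the remaining rows can be formed
  // with a single vext against the above[7] splat: each later vext shifts the
  // row one lane further along and fills the freed tail lanes with duplicates
  // of above[7].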
472 d0 = vext_u8(d0, d0, 7);
473 d1 = vext_u8(d1, d1, 7);
474
475 vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 2));
476 vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 2));
477 vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 3));
478 vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 3));
479 vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 4));
480 vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 4));
481 }
482
void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
484 const uint8_t *above, const uint8_t *left) {
485 uint8x16_t a0, a1, a2, a15, d0, d1;
486 (void)left;
487
488 a0 = vld1q_u8(above + 0);
489 a1 = vld1q_u8(above + 1);
490 a2 = vld1q_u8(above + 2);
491 a15 = vld1q_dup_u8(above + 15);
492
493 d0 = vrhaddq_u8(a0, a1);
494 d1 = vrhaddq_u8(vhaddq_u8(a0, a2), a1);
495
496 vst1q_u8(dst + 0 * stride, d0);
497 vst1q_u8(dst + 1 * stride, d1);
498
499 d0 = vextq_u8(d0, d0, 15);
500 d1 = vextq_u8(d1, d1, 15);
501
502 vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 2));
503 vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 2));
504 vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 3));
505 vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 3));
506 vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 4));
507 vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 4));
508 vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 5));
509 vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 5));
510 vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 6));
511 vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 6));
512 vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 7));
513 vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 7));
514 vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 8));
515 vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 8));
516 }
517
void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
519 const uint8_t *above, const uint8_t *left) {
520 uint8x16_t a0, a1, a2, a16, a17, a18, a31, d0_lo, d0_hi, d1_lo, d1_hi;
521 (void)left;
522
523 a0 = vld1q_u8(above + 0);
524 a1 = vld1q_u8(above + 1);
525 a2 = vld1q_u8(above + 2);
526 a16 = vld1q_u8(above + 16);
527 a17 = vld1q_u8(above + 17);
528 a18 = vld1q_u8(above + 18);
529 a31 = vld1q_dup_u8(above + 31);
530
531 d0_lo = vrhaddq_u8(a0, a1);
532 d0_hi = vrhaddq_u8(a16, a17);
533 d1_lo = vrhaddq_u8(vhaddq_u8(a0, a2), a1);
534 d1_hi = vrhaddq_u8(vhaddq_u8(a16, a18), a17);
535
536 vst1q_u8(dst + 0 * stride + 0, d0_lo);
537 vst1q_u8(dst + 0 * stride + 16, d0_hi);
538 vst1q_u8(dst + 1 * stride + 0, d1_lo);
539 vst1q_u8(dst + 1 * stride + 16, d1_hi);
540
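  // 32-lane version of the rotation used in the 8x8/16x16 predictors: shift
  // the AVG rows down by one lane across the lo/hi pairs so the vext calls
  // below can shift each row along and pad its tail with duplicates of
  // above[31].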
541 d0_hi = vextq_u8(d0_lo, d0_hi, 15);
542 d0_lo = vextq_u8(d0_lo, d0_lo, 15);
543 d1_hi = vextq_u8(d1_lo, d1_hi, 15);
544 d1_lo = vextq_u8(d1_lo, d1_lo, 15);
545
546 vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 2));
547 vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 2));
548 vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 2));
549 vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 2));
550 vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 3));
551 vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 3));
552 vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 3));
553 vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 3));
554 vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 4));
555 vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 4));
556 vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 4));
557 vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 4));
558 vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 5));
559 vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 5));
560 vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 5));
561 vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 5));
562 vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 6));
563 vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 6));
564 vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 6));
565 vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 6));
566 vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 7));
567 vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 7));
568 vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 7));
569 vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 7));
570 vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 8));
571 vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 8));
572 vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 8));
573 vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 8));
574 vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 9));
575 vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 9));
576 vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 9));
577 vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 9));
578 vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 10));
579 vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 10));
580 vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 10));
581 vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 10));
582 vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 11));
583 vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 11));
584 vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 11));
585 vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 11));
586 vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 12));
587 vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 12));
588 vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 12));
589 vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 12));
590 vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 13));
591 vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 13));
592 vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 13));
593 vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 13));
594 vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 14));
595 vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 14));
596 vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 14));
597 vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 14));
598 vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 15));
599 vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 15));
600 vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 15));
601 vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 15));
602 vst1q_u8(dst + 30 * stride + 0, d0_hi);
603 vst1q_u8(dst + 30 * stride + 16, a31);
604 vst1q_u8(dst + 31 * stride + 0, d1_hi);
605 vst1q_u8(dst + 31 * stride + 16, a31);
606 }
607
608 // -----------------------------------------------------------------------------
609
void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
611 const uint8_t *above, const uint8_t *left) {
612 // See vpx_d117_predictor_8x8_neon for more details on the implementation.
613 uint8x8_t az, a0, l0az, d0, d1, d2, d3, col0, col1;
614
615 az = load_unaligned_u8_4x1(above - 1);
616 a0 = load_unaligned_u8_4x1(above + 0);
617 // [ left[0], above[-1], above[0], above[1], x, x, x, x ]
618 l0az = vext_u8(vld1_dup_u8(left), az, 7);
619
620 col0 = vdup_n_u8((above[-1] + 2 * left[0] + left[1] + 2) >> 2);
621 col1 = vdup_n_u8((left[0] + 2 * left[1] + left[2] + 2) >> 2);
622
623 d0 = vrhadd_u8(az, a0);
624 d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
625 d2 = vext_u8(col0, d0, 7);
626 d3 = vext_u8(col1, d1, 7);
627
628 store_u8_4x1(dst + 0 * stride, d0);
629 store_u8_4x1(dst + 1 * stride, d1);
630 store_u8_4x1(dst + 2 * stride, d2);
631 store_u8_4x1(dst + 3 * stride, d3);
632 }
633
void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
635 const uint8_t *above, const uint8_t *left) {
636 uint8x8_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd;
637
638 az = vld1_u8(above - 1);
639 a0 = vld1_u8(above + 0);
640 // [ left[0], above[-1], ... , above[5] ]
641 l0az = vext_u8(vld1_dup_u8(left), az, 7);
642
643 l0 = vld1_u8(left + 0);
644 // The last lane here is unused, reading left[8] could cause a buffer
645 // over-read, so just fill with a duplicate of left[0] to avoid needing to
646 // materialize a zero:
647 // [ left[1], ... , left[7], x ]
648 l1 = vext_u8(l0, l0, 1);
649 // [ above[-1], left[0], ... , left[6] ]
650 azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);
651
652 // d0[0] = AVG2(above[-1], above[0])
653 // d0[1] = AVG2(above[0], above[1])
654 // ...
655 // d0[7] = AVG2(above[6], above[7])
656 d0 = vrhadd_u8(az, a0);
657
658 // d1[0] = AVG3(left[0], above[-1], above[0])
659 // d1[1] = AVG3(above[-1], above[0], above[1])
660 // ...
661 // d1[7] = AVG3(above[5], above[6], above[7])
662 d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
663
664 // The ext instruction shifts elements in from the end of the vector rather
665 // than the start, so reverse the vector to put the elements to be shifted in
666 // at the end. The lowest two lanes here are unused:
667 // col0[7] = AVG3(above[-1], left[0], left[1])
668 // col0[6] = AVG3(left[0], left[1], left[2])
669 // ...
670 // col0[2] = AVG3(left[4], left[5], left[6])
671 // col0[1] = x (don't care)
672 // col0[0] = x (don't care)
673 col0 = vrev64_u8(vrhadd_u8(vhadd_u8(azl0, l1), l0));
674
675 // We don't care about the first parameter to this uzp since we only ever use
676 // the high three elements, we just use col0 again since it is already
677 // available:
678 // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ]
679 // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ]
680 col0_even = vuzp_u8(col0, col0).val[1];
681 col0_odd = vuzp_u8(col0, col0).val[0];
682
683 // Incrementally shift more elements from col0 into d0/1:
684 // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ]
685 // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ]
686 // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ]
687 // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
688 // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ]
689 // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ]
690 // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ]
691 // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ]
692 vst1_u8(dst + 0 * stride, d0);
693 vst1_u8(dst + 1 * stride, d1);
694 vst1_u8(dst + 2 * stride, vext_u8(col0_even, d0, 7));
695 vst1_u8(dst + 3 * stride, vext_u8(col0_odd, d1, 7));
696 vst1_u8(dst + 4 * stride, vext_u8(col0_even, d0, 6));
697 vst1_u8(dst + 5 * stride, vext_u8(col0_odd, d1, 6));
698 vst1_u8(dst + 6 * stride, vext_u8(col0_even, d0, 5));
699 vst1_u8(dst + 7 * stride, vext_u8(col0_odd, d1, 5));
700 }
701
void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
703 const uint8_t *above, const uint8_t *left) {
704 // See vpx_d117_predictor_8x8_neon for more details on the implementation.
705 uint8x16_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd;
706
707 az = vld1q_u8(above - 1);
708 a0 = vld1q_u8(above + 0);
709 // [ left[0], above[-1], ... , above[13] ]
710 l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
711
712 l0 = vld1q_u8(left + 0);
713 // The last lane here is unused, reading left[16] could cause a buffer
714 // over-read, so just fill with a duplicate of left[0] to avoid needing to
715 // materialize a zero:
716 // [ left[1], ... , left[15], x ]
717 l1 = vextq_u8(l0, l0, 1);
718 // [ above[-1], left[0], ... , left[14] ]
719 azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
720
721 d0 = vrhaddq_u8(az, a0);
722 d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
723
724 col0 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
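  // Reverse all 16 lanes: vext swaps the two 64-bit halves and vrev64 then
  // reverses each half. The same idiom appears in the predictors further
  // down.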
725 col0 = vrev64q_u8(vextq_u8(col0, col0, 8));
726
  // The low nine lanes here are unused, so the first input to the uzp does
  // not matter; just use a duplicate of col0 since it is already available.
  // This also means that the lowest lane of col0 here is unused.
730 col0_even = vuzpq_u8(col0, col0).val[1];
731 col0_odd = vuzpq_u8(col0, col0).val[0];
732
733 vst1q_u8(dst + 0 * stride, d0);
734 vst1q_u8(dst + 1 * stride, d1);
735 vst1q_u8(dst + 2 * stride, vextq_u8(col0_even, d0, 15));
736 vst1q_u8(dst + 3 * stride, vextq_u8(col0_odd, d1, 15));
737 vst1q_u8(dst + 4 * stride, vextq_u8(col0_even, d0, 14));
738 vst1q_u8(dst + 5 * stride, vextq_u8(col0_odd, d1, 14));
739 vst1q_u8(dst + 6 * stride, vextq_u8(col0_even, d0, 13));
740 vst1q_u8(dst + 7 * stride, vextq_u8(col0_odd, d1, 13));
741 vst1q_u8(dst + 8 * stride, vextq_u8(col0_even, d0, 12));
742 vst1q_u8(dst + 9 * stride, vextq_u8(col0_odd, d1, 12));
743 vst1q_u8(dst + 10 * stride, vextq_u8(col0_even, d0, 11));
744 vst1q_u8(dst + 11 * stride, vextq_u8(col0_odd, d1, 11));
745 vst1q_u8(dst + 12 * stride, vextq_u8(col0_even, d0, 10));
746 vst1q_u8(dst + 13 * stride, vextq_u8(col0_odd, d1, 10));
747 vst1q_u8(dst + 14 * stride, vextq_u8(col0_even, d0, 9));
748 vst1q_u8(dst + 15 * stride, vextq_u8(col0_odd, d1, 9));
749 }
750
void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
752 const uint8_t *above, const uint8_t *left) {
753 // See vpx_d117_predictor_8x8_neon for more details on the implementation.
754 uint8x16_t az, a0, a14, a15, a16, l0az, d0_lo, d0_hi, d1_lo, d1_hi, l0, l1,
755 l15, l16, l17, azl0, col0_lo, col0_hi, col0_even, col0_odd;
756
757 az = vld1q_u8(above - 1);
758 a0 = vld1q_u8(above + 0);
759 a14 = vld1q_u8(above + 14);
760 a15 = vld1q_u8(above + 15);
761 a16 = vld1q_u8(above + 16);
762 // [ left[0], above[-1], ... , above[13] ]
763 l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
764
765 l0 = vld1q_u8(left + 0);
766 l1 = vld1q_u8(left + 1);
767 l15 = vld1q_u8(left + 15);
768 l16 = vld1q_u8(left + 16);
769 // The last lane here is unused, reading left[32] would cause a buffer
770 // over-read (observed as an address-sanitizer failure), so just fill with a
771 // duplicate of left[16] to avoid needing to materialize a zero:
772 // [ left[17], ... , left[31], x ]
773 l17 = vextq_u8(l16, l16, 1);
774 // [ above[-1], left[0], ... , left[14] ]
775 azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
776
777 d0_lo = vrhaddq_u8(az, a0);
778 d0_hi = vrhaddq_u8(a15, a16);
779 d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
780 d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15);
781
782 // The last lane of col0_hi is unused here.
783 col0_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
784 col0_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16);
785
786 col0_lo = vrev64q_u8(vextq_u8(col0_lo, col0_lo, 8));
787 col0_hi = vrev64q_u8(vextq_u8(col0_hi, col0_hi, 8));
788
  // The first lane of each of these is unused since they are only ever used
  // as ext(col0, _, i) where i >= 1.
791 col0_even = vuzpq_u8(col0_hi, col0_lo).val[1];
792 col0_odd = vuzpq_u8(col0_hi, col0_lo).val[0];
793
794 vst1q_u8(dst + 0 * stride + 0, d0_lo);
795 vst1q_u8(dst + 0 * stride + 16, d0_hi);
796 vst1q_u8(dst + 1 * stride + 0, d1_lo);
797 vst1q_u8(dst + 1 * stride + 16, d1_hi);
798 vst1q_u8(dst + 2 * stride + 0, vextq_u8(col0_even, d0_lo, 15));
799 vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_lo, d0_hi, 15));
800 vst1q_u8(dst + 3 * stride + 0, vextq_u8(col0_odd, d1_lo, 15));
801 vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 15));
802 vst1q_u8(dst + 4 * stride + 0, vextq_u8(col0_even, d0_lo, 14));
803 vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_lo, d0_hi, 14));
804 vst1q_u8(dst + 5 * stride + 0, vextq_u8(col0_odd, d1_lo, 14));
805 vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 14));
806 vst1q_u8(dst + 6 * stride + 0, vextq_u8(col0_even, d0_lo, 13));
807 vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_lo, d0_hi, 13));
808 vst1q_u8(dst + 7 * stride + 0, vextq_u8(col0_odd, d1_lo, 13));
809 vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 13));
810 vst1q_u8(dst + 8 * stride + 0, vextq_u8(col0_even, d0_lo, 12));
811 vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_lo, d0_hi, 12));
812 vst1q_u8(dst + 9 * stride + 0, vextq_u8(col0_odd, d1_lo, 12));
813 vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_lo, d1_hi, 12));
814 vst1q_u8(dst + 10 * stride + 0, vextq_u8(col0_even, d0_lo, 11));
815 vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_lo, d0_hi, 11));
816 vst1q_u8(dst + 11 * stride + 0, vextq_u8(col0_odd, d1_lo, 11));
817 vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_lo, d1_hi, 11));
818 vst1q_u8(dst + 12 * stride + 0, vextq_u8(col0_even, d0_lo, 10));
819 vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_lo, d0_hi, 10));
820 vst1q_u8(dst + 13 * stride + 0, vextq_u8(col0_odd, d1_lo, 10));
821 vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_lo, d1_hi, 10));
822 vst1q_u8(dst + 14 * stride + 0, vextq_u8(col0_even, d0_lo, 9));
823 vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_lo, d0_hi, 9));
824 vst1q_u8(dst + 15 * stride + 0, vextq_u8(col0_odd, d1_lo, 9));
825 vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_lo, d1_hi, 9));
826 vst1q_u8(dst + 16 * stride + 0, vextq_u8(col0_even, d0_lo, 8));
827 vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_lo, d0_hi, 8));
828 vst1q_u8(dst + 17 * stride + 0, vextq_u8(col0_odd, d1_lo, 8));
829 vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_lo, d1_hi, 8));
830 vst1q_u8(dst + 18 * stride + 0, vextq_u8(col0_even, d0_lo, 7));
831 vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_lo, d0_hi, 7));
832 vst1q_u8(dst + 19 * stride + 0, vextq_u8(col0_odd, d1_lo, 7));
833 vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_lo, d1_hi, 7));
834 vst1q_u8(dst + 20 * stride + 0, vextq_u8(col0_even, d0_lo, 6));
835 vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_lo, d0_hi, 6));
836 vst1q_u8(dst + 21 * stride + 0, vextq_u8(col0_odd, d1_lo, 6));
837 vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_lo, d1_hi, 6));
838 vst1q_u8(dst + 22 * stride + 0, vextq_u8(col0_even, d0_lo, 5));
839 vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_lo, d0_hi, 5));
840 vst1q_u8(dst + 23 * stride + 0, vextq_u8(col0_odd, d1_lo, 5));
841 vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_lo, d1_hi, 5));
842 vst1q_u8(dst + 24 * stride + 0, vextq_u8(col0_even, d0_lo, 4));
843 vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_lo, d0_hi, 4));
844 vst1q_u8(dst + 25 * stride + 0, vextq_u8(col0_odd, d1_lo, 4));
845 vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_lo, d1_hi, 4));
846 vst1q_u8(dst + 26 * stride + 0, vextq_u8(col0_even, d0_lo, 3));
847 vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_lo, d0_hi, 3));
848 vst1q_u8(dst + 27 * stride + 0, vextq_u8(col0_odd, d1_lo, 3));
849 vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_lo, d1_hi, 3));
850 vst1q_u8(dst + 28 * stride + 0, vextq_u8(col0_even, d0_lo, 2));
851 vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_lo, d0_hi, 2));
852 vst1q_u8(dst + 29 * stride + 0, vextq_u8(col0_odd, d1_lo, 2));
853 vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_lo, d1_hi, 2));
854 vst1q_u8(dst + 30 * stride + 0, vextq_u8(col0_even, d0_lo, 1));
855 vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_lo, d0_hi, 1));
856 vst1q_u8(dst + 31 * stride + 0, vextq_u8(col0_odd, d1_lo, 1));
857 vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_lo, d1_hi, 1));
858 }
859
860 // -----------------------------------------------------------------------------
861
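// In the d135 predictors the vector names spell out their lane contents:
// digits (and the hex letters a-f) index into above, a leading L marks lanes
// taken from left, X is the top-left pixel above[-1] and a trailing
// underscore marks a lane whose value does not matter. The 32x32 version
// additionally uses LL/LU and AL/AR prefixes for the two halves of left and
// above.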
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
863 const uint8_t *above, const uint8_t *left) {
864 const uint8x8_t XA0123 = vld1_u8(above - 1);
865 const uint8x8_t L0123 = vld1_u8(left);
866 const uint8x8_t L3210 = vrev64_u8(L0123);
867 const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4);
868 const uint8x8_t L210XA0123 = vext_u8(L3210, XA0123, 5);
869 const uint8x8_t L10XA0123_ = vext_u8(L210XA0123, L210XA0123, 1);
870 const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012);
871 const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123);
872
873 store_u8_4x1(dst + 0 * stride, vext_u8(avg2, avg2, 3));
874 store_u8_4x1(dst + 1 * stride, vext_u8(avg2, avg2, 2));
875 store_u8_4x1(dst + 2 * stride, vext_u8(avg2, avg2, 1));
876 store_u8_4x1(dst + 3 * stride, avg2);
877 }
878
void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
880 const uint8_t *above, const uint8_t *left) {
881 const uint8x8_t XA0123456 = vld1_u8(above - 1);
882 const uint8x8_t A01234567 = vld1_u8(above);
883 const uint8x8_t A1234567_ = vld1_u8(above + 1);
884 const uint8x8_t L01234567 = vld1_u8(left);
885 const uint8x8_t L76543210 = vrev64_u8(L01234567);
886 const uint8x8_t L6543210X = vext_u8(L76543210, XA0123456, 1);
887 const uint8x8_t L543210XA0 = vext_u8(L76543210, XA0123456, 2);
888 const uint8x16_t L76543210XA0123456 = vcombine_u8(L76543210, XA0123456);
889 const uint8x16_t L6543210XA01234567 = vcombine_u8(L6543210X, A01234567);
890 const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_);
891 const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_);
892 const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567);
893
894 vst1_u8(dst + 0 * stride, vget_low_u8(vextq_u8(row, row, 7)));
895 vst1_u8(dst + 1 * stride, vget_low_u8(vextq_u8(row, row, 6)));
896 vst1_u8(dst + 2 * stride, vget_low_u8(vextq_u8(row, row, 5)));
897 vst1_u8(dst + 3 * stride, vget_low_u8(vextq_u8(row, row, 4)));
898 vst1_u8(dst + 4 * stride, vget_low_u8(vextq_u8(row, row, 3)));
899 vst1_u8(dst + 5 * stride, vget_low_u8(vextq_u8(row, row, 2)));
900 vst1_u8(dst + 6 * stride, vget_low_u8(vextq_u8(row, row, 1)));
901 vst1_u8(dst + 7 * stride, vget_low_u8(row));
902 }
903
static INLINE void d135_store_16x8(
905 uint8_t **dst, const ptrdiff_t stride, const uint8x16_t row_0,
906 const uint8x16_t row_1, const uint8x16_t row_2, const uint8x16_t row_3,
907 const uint8x16_t row_4, const uint8x16_t row_5, const uint8x16_t row_6,
908 const uint8x16_t row_7) {
909 vst1q_u8(*dst, row_0);
910 *dst += stride;
911 vst1q_u8(*dst, row_1);
912 *dst += stride;
913 vst1q_u8(*dst, row_2);
914 *dst += stride;
915 vst1q_u8(*dst, row_3);
916 *dst += stride;
917 vst1q_u8(*dst, row_4);
918 *dst += stride;
919 vst1q_u8(*dst, row_5);
920 *dst += stride;
921 vst1q_u8(*dst, row_6);
922 *dst += stride;
923 vst1q_u8(*dst, row_7);
924 *dst += stride;
925 }
926
void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
928 const uint8_t *above, const uint8_t *left) {
929 const uint8x16_t XA0123456789abcde = vld1q_u8(above - 1);
930 const uint8x16_t A0123456789abcdef = vld1q_u8(above);
931 const uint8x16_t A123456789abcdef_ = vld1q_u8(above + 1);
932 const uint8x16_t L0123456789abcdef = vld1q_u8(left);
933 const uint8x8_t L76543210 = vrev64_u8(vget_low_u8(L0123456789abcdef));
934 const uint8x8_t Lfedcba98 = vrev64_u8(vget_high_u8(L0123456789abcdef));
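  // Combine into left[] fully reversed: [ left[15], ... , left[0] ]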
935 const uint8x16_t Lfedcba9876543210 = vcombine_u8(Lfedcba98, L76543210);
936 const uint8x16_t Ledcba9876543210X =
937 vextq_u8(Lfedcba9876543210, XA0123456789abcde, 1);
938 const uint8x16_t Ldcba9876543210XA0 =
939 vextq_u8(Lfedcba9876543210, XA0123456789abcde, 2);
940 const uint8x16_t avg_0 = vhaddq_u8(Lfedcba9876543210, Ldcba9876543210XA0);
941 const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_);
942 const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X);
943 const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef);
944
945 const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15);
946 const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14);
947 const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13);
948 const uint8x16_t r_3 = vextq_u8(row_0, row_1, 12);
949 const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11);
950 const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10);
951 const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9);
952 const uint8x16_t r_7 = vextq_u8(row_0, row_1, 8);
953 const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7);
954 const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6);
955 const uint8x16_t r_a = vextq_u8(row_0, row_1, 5);
956 const uint8x16_t r_b = vextq_u8(row_0, row_1, 4);
957 const uint8x16_t r_c = vextq_u8(row_0, row_1, 3);
958 const uint8x16_t r_d = vextq_u8(row_0, row_1, 2);
959 const uint8x16_t r_e = vextq_u8(row_0, row_1, 1);
960
961 d135_store_16x8(&dst, stride, r_0, r_1, r_2, r_3, r_4, r_5, r_6, r_7);
962 d135_store_16x8(&dst, stride, r_8, r_9, r_a, r_b, r_c, r_d, r_e, row_0);
963 }
964
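// Each call stores one full 32-wide row in the top half of the block (as
// row_1 | row_2) and the corresponding row 16 lines below it (as
// row_0 | row_1), then advances *dst by a single stride.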
static INLINE void d135_store_32x2(uint8_t **dst, const ptrdiff_t stride,
966 const uint8x16_t row_0,
967 const uint8x16_t row_1,
968 const uint8x16_t row_2) {
969 uint8_t *dst2 = *dst;
970 vst1q_u8(dst2, row_1);
971 dst2 += 16;
972 vst1q_u8(dst2, row_2);
973 dst2 += 16 * stride - 16;
974 vst1q_u8(dst2, row_0);
975 dst2 += 16;
976 vst1q_u8(dst2, row_1);
977 *dst += stride;
978 }
979
void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
981 const uint8_t *above, const uint8_t *left) {
982 const uint8x16_t LL0123456789abcdef = vld1q_u8(left + 16);
983 const uint8x16_t LU0123456789abcdef = vld1q_u8(left);
984 const uint8x8_t LL76543210 = vrev64_u8(vget_low_u8(LL0123456789abcdef));
985 const uint8x8_t LU76543210 = vrev64_u8(vget_low_u8(LU0123456789abcdef));
986 const uint8x8_t LLfedcba98 = vrev64_u8(vget_high_u8(LL0123456789abcdef));
987 const uint8x8_t LUfedcba98 = vrev64_u8(vget_high_u8(LU0123456789abcdef));
988 const uint8x16_t LLfedcba9876543210 = vcombine_u8(LLfedcba98, LL76543210);
989 const uint8x16_t LUfedcba9876543210 = vcombine_u8(LUfedcba98, LU76543210);
990 const uint8x16_t LLedcba9876543210Uf =
991 vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 1);
992 const uint8x16_t LLdcba9876543210Ufe =
993 vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 2);
994 const uint8x16_t avg_0 = vhaddq_u8(LLfedcba9876543210, LLdcba9876543210Ufe);
995 const uint8x16_t row_0 = vrhaddq_u8(avg_0, LLedcba9876543210Uf);
996
997 const uint8x16_t XAL0123456789abcde = vld1q_u8(above - 1);
998 const uint8x16_t LUedcba9876543210X =
999 vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 1);
1000 const uint8x16_t LUdcba9876543210XA0 =
1001 vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 2);
1002 const uint8x16_t avg_1 = vhaddq_u8(LUfedcba9876543210, LUdcba9876543210XA0);
1003 const uint8x16_t row_1 = vrhaddq_u8(avg_1, LUedcba9876543210X);
1004
1005 const uint8x16_t AL0123456789abcdef = vld1q_u8(above);
1006 const uint8x16_t AL123456789abcdefg = vld1q_u8(above + 1);
1007 const uint8x16_t ALfR0123456789abcde = vld1q_u8(above + 15);
1008 const uint8x16_t AR0123456789abcdef = vld1q_u8(above + 16);
1009 const uint8x16_t AR123456789abcdef_ = vld1q_u8(above + 17);
1010 const uint8x16_t avg_2 = vhaddq_u8(XAL0123456789abcde, AL123456789abcdefg);
1011 const uint8x16_t row_2 = vrhaddq_u8(avg_2, AL0123456789abcdef);
1012 const uint8x16_t avg_3 = vhaddq_u8(ALfR0123456789abcde, AR123456789abcdef_);
1013 const uint8x16_t row_3 = vrhaddq_u8(avg_3, AR0123456789abcdef);
1014
1015 {
1016 const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15);
1017 const uint8x16_t r_1 = vextq_u8(row_1, row_2, 15);
1018 const uint8x16_t r_2 = vextq_u8(row_2, row_3, 15);
1019 d135_store_32x2(&dst, stride, r_0, r_1, r_2);
1020 }
1021
1022 {
1023 const uint8x16_t r_0 = vextq_u8(row_0, row_1, 14);
1024 const uint8x16_t r_1 = vextq_u8(row_1, row_2, 14);
1025 const uint8x16_t r_2 = vextq_u8(row_2, row_3, 14);
1026 d135_store_32x2(&dst, stride, r_0, r_1, r_2);
1027 }
1028
1029 {
1030 const uint8x16_t r_0 = vextq_u8(row_0, row_1, 13);
1031 const uint8x16_t r_1 = vextq_u8(row_1, row_2, 13);
1032 const uint8x16_t r_2 = vextq_u8(row_2, row_3, 13);
1033 d135_store_32x2(&dst, stride, r_0, r_1, r_2);
1034 }
1035
1036 {
1037 const uint8x16_t r_0 = vextq_u8(row_0, row_1, 12);
1038 const uint8x16_t r_1 = vextq_u8(row_1, row_2, 12);
1039 const uint8x16_t r_2 = vextq_u8(row_2, row_3, 12);
1040 d135_store_32x2(&dst, stride, r_0, r_1, r_2);
1041 }
1042
1043 {
1044 const uint8x16_t r_0 = vextq_u8(row_0, row_1, 11);
1045 const uint8x16_t r_1 = vextq_u8(row_1, row_2, 11);
1046 const uint8x16_t r_2 = vextq_u8(row_2, row_3, 11);
1047 d135_store_32x2(&dst, stride, r_0, r_1, r_2);
1048 }
1049
1050 {
1051 const uint8x16_t r_0 = vextq_u8(row_0, row_1, 10);
1052 const uint8x16_t r_1 = vextq_u8(row_1, row_2, 10);
1053 const uint8x16_t r_2 = vextq_u8(row_2, row_3, 10);
1054 d135_store_32x2(&dst, stride, r_0, r_1, r_2);
1055 }
1056
1057 {
1058 const uint8x16_t r_0 = vextq_u8(row_0, row_1, 9);
1059 const uint8x16_t r_1 = vextq_u8(row_1, row_2, 9);
1060 const uint8x16_t r_2 = vextq_u8(row_2, row_3, 9);
1061 d135_store_32x2(&dst, stride, r_0, r_1, r_2);
1062 }
1063
1064 {
1065 const uint8x16_t r_0 = vextq_u8(row_0, row_1, 8);
1066 const uint8x16_t r_1 = vextq_u8(row_1, row_2, 8);
1067 const uint8x16_t r_2 = vextq_u8(row_2, row_3, 8);
1068 d135_store_32x2(&dst, stride, r_0, r_1, r_2);
1069 }
1070
1071 {
1072 const uint8x16_t r_0 = vextq_u8(row_0, row_1, 7);
1073 const uint8x16_t r_1 = vextq_u8(row_1, row_2, 7);
1074 const uint8x16_t r_2 = vextq_u8(row_2, row_3, 7);
1075 d135_store_32x2(&dst, stride, r_0, r_1, r_2);
1076 }
1077
1078 {
1079 const uint8x16_t r_0 = vextq_u8(row_0, row_1, 6);
1080 const uint8x16_t r_1 = vextq_u8(row_1, row_2, 6);
1081 const uint8x16_t r_2 = vextq_u8(row_2, row_3, 6);
1082 d135_store_32x2(&dst, stride, r_0, r_1, r_2);
1083 }
1084
1085 {
1086 const uint8x16_t r_0 = vextq_u8(row_0, row_1, 5);
1087 const uint8x16_t r_1 = vextq_u8(row_1, row_2, 5);
1088 const uint8x16_t r_2 = vextq_u8(row_2, row_3, 5);
1089 d135_store_32x2(&dst, stride, r_0, r_1, r_2);
1090 }
1091
1092 {
1093 const uint8x16_t r_0 = vextq_u8(row_0, row_1, 4);
1094 const uint8x16_t r_1 = vextq_u8(row_1, row_2, 4);
1095 const uint8x16_t r_2 = vextq_u8(row_2, row_3, 4);
1096 d135_store_32x2(&dst, stride, r_0, r_1, r_2);
1097 }
1098
1099 {
1100 const uint8x16_t r_0 = vextq_u8(row_0, row_1, 3);
1101 const uint8x16_t r_1 = vextq_u8(row_1, row_2, 3);
1102 const uint8x16_t r_2 = vextq_u8(row_2, row_3, 3);
1103 d135_store_32x2(&dst, stride, r_0, r_1, r_2);
1104 }
1105
1106 {
1107 const uint8x16_t r_0 = vextq_u8(row_0, row_1, 2);
1108 const uint8x16_t r_1 = vextq_u8(row_1, row_2, 2);
1109 const uint8x16_t r_2 = vextq_u8(row_2, row_3, 2);
1110 d135_store_32x2(&dst, stride, r_0, r_1, r_2);
1111 }
1112
1113 {
1114 const uint8x16_t r_0 = vextq_u8(row_0, row_1, 1);
1115 const uint8x16_t r_1 = vextq_u8(row_1, row_2, 1);
1116 const uint8x16_t r_2 = vextq_u8(row_2, row_3, 1);
1117 d135_store_32x2(&dst, stride, r_0, r_1, r_2);
1118 }
1119
1120 d135_store_32x2(&dst, stride, row_0, row_1, row_2);
1121 }
1122
1123 // -----------------------------------------------------------------------------
1124
void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
1126 const uint8_t *above, const uint8_t *left) {
1127 // See vpx_d153_predictor_8x8_neon for more details on the implementation.
1128 uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02;
1129
1130 az = load_unaligned_u8_4x1(above - 1);
1131 a0 = load_unaligned_u8_4x1(above + 0);
1132 // [ left[0], above[-1], above[0], above[1], x, x, x, x ]
1133 l0az = vext_u8(vld1_dup_u8(left), az, 7);
1134
1135 l0 = load_unaligned_u8_4x1(left + 0);
1136 l1 = load_unaligned_u8_4x1(left + 1);
1137 // [ above[-1], left[0], left[1], left[2], x, x, x, x ]
1138 azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);
1139
1140 d0 = vrhadd_u8(azl0, l0);
1141 d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
1142 d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0);
1143
1144 d02 = vrev64_u8(vzip_u8(d0, d2).val[0]);
1145
1146 store_u8_4x1(dst + 0 * stride, vext_u8(d02, d1, 7));
1147 store_u8_4x1(dst + 1 * stride, vext_u8(d02, d1, 5));
1148 store_u8_4x1(dst + 2 * stride, vext_u8(d02, d1, 3));
1149 store_u8_4x1(dst + 3 * stride, vext_u8(d02, d1, 1));
1150 }
1151
void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
1153 const uint8_t *above, const uint8_t *left) {
1154 uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi;
1155
1156 az = vld1_u8(above - 1);
1157 a0 = vld1_u8(above + 0);
1158 // [ left[0], above[-1], ... , above[5] ]
1159 l0az = vext_u8(vld1_dup_u8(left), az, 7);
1160
1161 l0 = vld1_u8(left);
1162 // The last lane here is unused, reading left[8] could cause a buffer
1163 // over-read, so just fill with a duplicate of left[0] to avoid needing to
1164 // materialize a zero:
1165 // [ left[1], ... , left[7], x ]
1166 l1 = vext_u8(l0, l0, 1);
1167 // [ above[-1], left[0], ... , left[6] ]
1168 azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7);
1169
1170 // d0[0] = AVG2(above[-1], left[0])
1171 // d0[1] = AVG2(left[0], left[1])
1172 // ...
1173 // d0[7] = AVG2(left[6], left[7])
1174 d0 = vrhadd_u8(azl0, l0);
1175
1176 // d1[0] = AVG3(left[0], above[-1], above[0])
1177 // d1[1] = AVG3(above[-1], above[0], above[1])
1178 // ...
1179 // d1[7] = AVG3(above[5], above[6], above[7])
1180 d1 = vrhadd_u8(vhadd_u8(l0az, a0), az);
1181
1182 // d2[0] = AVG3(above[-1], left[0], left[1])
1183 // d2[1] = AVG3(left[0], left[1], left[2])
1184 // ...
1185 // d2[6] = AVG3(left[5], left[6], left[7])
1186 // d2[7] = x (don't care)
1187 d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0);
1188
1189 // The ext instruction shifts elements in from the end of the vector rather
1190 // than the start, so reverse the vectors to put the elements to be shifted
1191 // in at the end. The lowest lane of d02_lo is unused.
1192 d02_lo = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[0];
1193 d02_hi = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[1];
1194
1195 // Incrementally shift more elements from d0/d2 reversed into d1:
1196 // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
1197 // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ]
1198 // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ]
1199 // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ]
1200 // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ]
1201 // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ]
1202 // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ]
1203 // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ]
1204 vst1_u8(dst + 0 * stride, vext_u8(d02_hi, d1, 7));
1205 vst1_u8(dst + 1 * stride, vext_u8(d02_hi, d1, 5));
1206 vst1_u8(dst + 2 * stride, vext_u8(d02_hi, d1, 3));
1207 vst1_u8(dst + 3 * stride, vext_u8(d02_hi, d1, 1));
1208 vst1_u8(dst + 4 * stride, vext_u8(d02_lo, d02_hi, 7));
1209 vst1_u8(dst + 5 * stride, vext_u8(d02_lo, d02_hi, 5));
1210 vst1_u8(dst + 6 * stride, vext_u8(d02_lo, d02_hi, 3));
1211 vst1_u8(dst + 7 * stride, vext_u8(d02_lo, d02_hi, 1));
1212 }
1213
void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
1215 const uint8_t *above, const uint8_t *left) {
1216 // See vpx_d153_predictor_8x8_neon for more details on the implementation.
1217 uint8x16_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi;
1218
1219 az = vld1q_u8(above - 1);
1220 a0 = vld1q_u8(above + 0);
1221 // [ left[0], above[-1], ... , above[13] ]
1222 l0az = vextq_u8(vld1q_dup_u8(left), az, 15);
1223
1224 l0 = vld1q_u8(left + 0);
1225 // The last lane here is unused, reading left[16] could cause a buffer
1226 // over-read, so just fill with a duplicate of left[0] to avoid needing to
1227 // materialize a zero:
1228 // [ left[1], ... , left[15], x ]
1229 l1 = vextq_u8(l0, l0, 1);
1230 // [ above[-1], left[0], ... , left[14] ]
1231 azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);
1232
1233 d0 = vrhaddq_u8(azl0, l0);
1234 d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
1235 d2 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
1236
1237 d0 = vrev64q_u8(vextq_u8(d0, d0, 8));
1238 d2 = vrev64q_u8(vextq_u8(d2, d2, 8));
1239
1240 // The lowest lane of d02_lo is unused.
1241 d02_lo = vzipq_u8(d2, d0).val[0];
1242 d02_hi = vzipq_u8(d2, d0).val[1];
1243
1244 vst1q_u8(dst + 0 * stride, vextq_u8(d02_hi, d1, 15));
1245 vst1q_u8(dst + 1 * stride, vextq_u8(d02_hi, d1, 13));
1246 vst1q_u8(dst + 2 * stride, vextq_u8(d02_hi, d1, 11));
1247 vst1q_u8(dst + 3 * stride, vextq_u8(d02_hi, d1, 9));
1248 vst1q_u8(dst + 4 * stride, vextq_u8(d02_hi, d1, 7));
1249 vst1q_u8(dst + 5 * stride, vextq_u8(d02_hi, d1, 5));
1250 vst1q_u8(dst + 6 * stride, vextq_u8(d02_hi, d1, 3));
1251 vst1q_u8(dst + 7 * stride, vextq_u8(d02_hi, d1, 1));
1252 vst1q_u8(dst + 8 * stride, vextq_u8(d02_lo, d02_hi, 15));
1253 vst1q_u8(dst + 9 * stride, vextq_u8(d02_lo, d02_hi, 13));
1254 vst1q_u8(dst + 10 * stride, vextq_u8(d02_lo, d02_hi, 11));
1255 vst1q_u8(dst + 11 * stride, vextq_u8(d02_lo, d02_hi, 9));
1256 vst1q_u8(dst + 12 * stride, vextq_u8(d02_lo, d02_hi, 7));
1257 vst1q_u8(dst + 13 * stride, vextq_u8(d02_lo, d02_hi, 5));
1258 vst1q_u8(dst + 14 * stride, vextq_u8(d02_lo, d02_hi, 3));
1259 vst1q_u8(dst + 15 * stride, vextq_u8(d02_lo, d02_hi, 1));
1260 }
1261
void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  // See vpx_d153_predictor_8x8_neon for more details on the implementation.
  uint8x16_t az, a0, a14, a15, a16, l0az, l0, l1, l15, l16, l17, azl0, d0_lo,
      d0_hi, d1_lo, d1_hi, d2_lo, d2_hi;
  uint8x16x2_t d02_hi, d02_lo;

  az = vld1q_u8(above - 1);
  a0 = vld1q_u8(above + 0);
  a14 = vld1q_u8(above + 14);
  a15 = vld1q_u8(above + 15);
  a16 = vld1q_u8(above + 16);
  // [ left[0], above[-1], ... , above[13] ]
  l0az = vextq_u8(vld1q_dup_u8(left), az, 15);

  l0 = vld1q_u8(left);
  l1 = vld1q_u8(left + 1);
  l15 = vld1q_u8(left + 15);
  l16 = vld1q_u8(left + 16);
  // The last lane here is unused, reading left[32] would cause a buffer
  // over-read (observed as an address-sanitizer failure), so just fill with a
  // duplicate of left[16] to avoid needing to materialize a zero:
  // [ left[17], ... , left[31], x ]
  l17 = vextq_u8(l16, l16, 1);
  // [ above[-1], left[0], ... , left[14] ]
  azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15);

  d0_lo = vrhaddq_u8(azl0, l0);
  d0_hi = vrhaddq_u8(l15, l16);

  d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az);
  d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15);

  // The highest lane of d2_hi is unused.
  d2_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0);
  d2_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16);

  d0_lo = vrev64q_u8(vextq_u8(d0_lo, d0_lo, 8));
  d0_hi = vrev64q_u8(vextq_u8(d0_hi, d0_hi, 8));

  d2_lo = vrev64q_u8(vextq_u8(d2_lo, d2_lo, 8));
  d2_hi = vrev64q_u8(vextq_u8(d2_hi, d2_hi, 8));

  // d02_hi.val[0][0] is unused here.
  d02_hi = vzipq_u8(d2_hi, d0_hi);
  d02_lo = vzipq_u8(d2_lo, d0_lo);

  vst1q_u8(dst + 0 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 15));
  vst1q_u8(dst + 0 * stride + 16, vextq_u8(d1_lo, d1_hi, 15));
  vst1q_u8(dst + 1 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 13));
  vst1q_u8(dst + 1 * stride + 16, vextq_u8(d1_lo, d1_hi, 13));
  vst1q_u8(dst + 2 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 11));
  vst1q_u8(dst + 2 * stride + 16, vextq_u8(d1_lo, d1_hi, 11));
  vst1q_u8(dst + 3 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 9));
  vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 9));
  vst1q_u8(dst + 4 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 7));
  vst1q_u8(dst + 4 * stride + 16, vextq_u8(d1_lo, d1_hi, 7));
  vst1q_u8(dst + 5 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 5));
  vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 5));
  vst1q_u8(dst + 6 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 3));
  vst1q_u8(dst + 6 * stride + 16, vextq_u8(d1_lo, d1_hi, 3));
  vst1q_u8(dst + 7 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 1));
  vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 1));
  vst1q_u8(dst + 8 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15));
  vst1q_u8(dst + 8 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 15));
  vst1q_u8(dst + 9 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13));
  vst1q_u8(dst + 9 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 13));
  vst1q_u8(dst + 10 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11));
  vst1q_u8(dst + 10 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 11));
  vst1q_u8(dst + 11 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9));
  vst1q_u8(dst + 11 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 9));
  vst1q_u8(dst + 12 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7));
  vst1q_u8(dst + 12 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 7));
  vst1q_u8(dst + 13 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5));
  vst1q_u8(dst + 13 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 5));
  vst1q_u8(dst + 14 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3));
  vst1q_u8(dst + 14 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 3));
  vst1q_u8(dst + 15 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1));
  vst1q_u8(dst + 15 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 1));
  vst1q_u8(dst + 16 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15));
  vst1q_u8(dst + 16 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15));
  vst1q_u8(dst + 17 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13));
  vst1q_u8(dst + 17 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13));
  vst1q_u8(dst + 18 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11));
  vst1q_u8(dst + 18 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11));
  vst1q_u8(dst + 19 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9));
  vst1q_u8(dst + 19 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9));
  vst1q_u8(dst + 20 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7));
  vst1q_u8(dst + 20 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7));
  vst1q_u8(dst + 21 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5));
  vst1q_u8(dst + 21 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5));
  vst1q_u8(dst + 22 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3));
  vst1q_u8(dst + 22 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3));
  vst1q_u8(dst + 23 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1));
  vst1q_u8(dst + 23 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1));
  vst1q_u8(dst + 24 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 15));
  vst1q_u8(dst + 24 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15));
  vst1q_u8(dst + 25 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 13));
  vst1q_u8(dst + 25 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13));
  vst1q_u8(dst + 26 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 11));
  vst1q_u8(dst + 26 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11));
  vst1q_u8(dst + 27 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 9));
  vst1q_u8(dst + 27 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9));
  vst1q_u8(dst + 28 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 7));
  vst1q_u8(dst + 28 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7));
  vst1q_u8(dst + 29 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 5));
  vst1q_u8(dst + 29 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5));
  vst1q_u8(dst + 30 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 3));
  vst1q_u8(dst + 30 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3));
  vst1q_u8(dst + 31 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 1));
  vst1q_u8(dst + 31 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1));
}

// -----------------------------------------------------------------------------

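// d207 prediction uses only the left column. As a sketch of the scalar
// reference behaviour the NEON versions below rely on (assuming the usual
// libvpx AVG2/AVG3 definitions): row r starts with AVG2(left[r], left[r + 1])
// then AVG3(left[r], left[r + 1], left[r + 2]), and from column 2 onwards
// repeats the row below shifted left by two, with out-of-range positions
// padded with the last left pixel. The implementations therefore interleave
// the AVG2 results (c0) with the AVG3 results (c1) and emit each row as a
// 2-lane ext of that combined c01 vector.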
void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint8x8_t l0, l3, l1, l2, c0, c1, c01, d0, d1;
  (void)above;

  // We need the low half lanes here for the c0/c1 arithmetic but the high half
  // lanes for the ext:
  // [ left[0], left[1], left[2], left[3], left[0], left[1], left[2], left[3] ]
  l0 = load_replicate_u8_4x1(left + 0);
  l3 = vld1_dup_u8(left + 3);

  // [ left[1], left[2], left[3], left[3], x, x, x, x ]
  l1 = vext_u8(l0, l3, 5);
  // [ left[2], left[3], left[3], left[3], x, x, x, x ]
  l2 = vext_u8(l0, l3, 6);

  c0 = vrhadd_u8(l0, l1);
  c1 = vrhadd_u8(vhadd_u8(l0, l2), l1);

  // [ c0[0], c1[0], c0[1], c1[1], c0[2], c1[2], c0[3], c1[3] ]
  c01 = vzip_u8(c0, c1).val[0];

  d0 = c01;
  d1 = vext_u8(c01, l3, 2);

  // Store the high half of the vector for stride={2,3} to avoid needing
  // additional ext instructions:
  // stride=0 [ c0[0], c1[0], c0[1], c1[1] ]
  // stride=1 [ c0[1], c1[1], c0[2], c1[2] ]
  // stride=2 [ c0[2], c1[2], c0[3], c1[3] ]
  // stride=3 [ c0[3], c1[3], left[3], left[3] ]
  store_u8_4x1(dst + 0 * stride, d0);
  store_u8_4x1(dst + 1 * stride, d1);
  store_u8_4x1_high(dst + 2 * stride, d0);
  store_u8_4x1_high(dst + 3 * stride, d1);
}

void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint8x8_t l7, l0, l1, l2, c0, c1, c01_lo, c01_hi;
  (void)above;

  l0 = vld1_u8(left + 0);
  l7 = vld1_dup_u8(left + 7);

  // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ]
  l1 = vext_u8(l0, l7, 1);
  // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ]
  l2 = vext_u8(l0, l7, 2);

  c0 = vrhadd_u8(l0, l1);
  c1 = vrhadd_u8(vhadd_u8(l0, l2), l1);

  c01_lo = vzip_u8(c0, c1).val[0];
  c01_hi = vzip_u8(c0, c1).val[1];

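  // Row r of the output is the interleaved c01 vector shifted along by
  // 2 * r lanes; once the shift passes the end of c01_hi the trailing lanes
  // are taken from l7, i.e. repeated left[7].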
  vst1_u8(dst + 0 * stride, c01_lo);
  vst1_u8(dst + 1 * stride, vext_u8(c01_lo, c01_hi, 2));
  vst1_u8(dst + 2 * stride, vext_u8(c01_lo, c01_hi, 4));
  vst1_u8(dst + 3 * stride, vext_u8(c01_lo, c01_hi, 6));
  vst1_u8(dst + 4 * stride, c01_hi);
  vst1_u8(dst + 5 * stride, vext_u8(c01_hi, l7, 2));
  vst1_u8(dst + 6 * stride, vext_u8(c01_hi, l7, 4));
  vst1_u8(dst + 7 * stride, vext_u8(c01_hi, l7, 6));
}

void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  uint8x16_t l15, l0, l1, l2, c0, c1, c01_lo, c01_hi;
  (void)above;

  l0 = vld1q_u8(left + 0);
  l15 = vld1q_dup_u8(left + 15);

  l1 = vextq_u8(l0, l15, 1);
  l2 = vextq_u8(l0, l15, 2);

  c0 = vrhaddq_u8(l0, l1);
  c1 = vrhaddq_u8(vhaddq_u8(l0, l2), l1);

  c01_lo = vzipq_u8(c0, c1).val[0];
  c01_hi = vzipq_u8(c0, c1).val[1];

  vst1q_u8(dst + 0 * stride, c01_lo);
  vst1q_u8(dst + 1 * stride, vextq_u8(c01_lo, c01_hi, 2));
  vst1q_u8(dst + 2 * stride, vextq_u8(c01_lo, c01_hi, 4));
  vst1q_u8(dst + 3 * stride, vextq_u8(c01_lo, c01_hi, 6));
  vst1q_u8(dst + 4 * stride, vextq_u8(c01_lo, c01_hi, 8));
  vst1q_u8(dst + 5 * stride, vextq_u8(c01_lo, c01_hi, 10));
  vst1q_u8(dst + 6 * stride, vextq_u8(c01_lo, c01_hi, 12));
  vst1q_u8(dst + 7 * stride, vextq_u8(c01_lo, c01_hi, 14));
  vst1q_u8(dst + 8 * stride, c01_hi);
  vst1q_u8(dst + 9 * stride, vextq_u8(c01_hi, l15, 2));
  vst1q_u8(dst + 10 * stride, vextq_u8(c01_hi, l15, 4));
  vst1q_u8(dst + 11 * stride, vextq_u8(c01_hi, l15, 6));
  vst1q_u8(dst + 12 * stride, vextq_u8(c01_hi, l15, 8));
  vst1q_u8(dst + 13 * stride, vextq_u8(c01_hi, l15, 10));
  vst1q_u8(dst + 14 * stride, vextq_u8(c01_hi, l15, 12));
  vst1q_u8(dst + 15 * stride, vextq_u8(c01_hi, l15, 14));
}

void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  uint8x16_t l0_lo, l0_hi, l1_lo, l1_hi, l2_lo, l2_hi, l31, c0_lo, c0_hi, c1_lo,
      c1_hi, c01[4];
  (void)above;

  l0_lo = vld1q_u8(left + 0);
  l0_hi = vld1q_u8(left + 16);
  l31 = vld1q_dup_u8(left + 31);

  l1_lo = vextq_u8(l0_lo, l0_hi, 1);
  l1_hi = vextq_u8(l0_hi, l31, 1);
  l2_lo = vextq_u8(l0_lo, l0_hi, 2);
  l2_hi = vextq_u8(l0_hi, l31, 2);

  c0_lo = vrhaddq_u8(l0_lo, l1_lo);
  c0_hi = vrhaddq_u8(l0_hi, l1_hi);
  c1_lo = vrhaddq_u8(vhaddq_u8(l0_lo, l2_lo), l1_lo);
  c1_hi = vrhaddq_u8(vhaddq_u8(l0_hi, l2_hi), l1_hi);

  c01[0] = vzipq_u8(c0_lo, c1_lo).val[0];
  c01[1] = vzipq_u8(c0_lo, c1_lo).val[1];
  c01[2] = vzipq_u8(c0_hi, c1_hi).val[0];
  c01[3] = vzipq_u8(c0_hi, c1_hi).val[1];

  vst1q_u8(dst + 0 * stride + 0, c01[0]);
  vst1q_u8(dst + 0 * stride + 16, c01[1]);
  vst1q_u8(dst + 1 * stride + 0, vextq_u8(c01[0], c01[1], 2));
  vst1q_u8(dst + 1 * stride + 16, vextq_u8(c01[1], c01[2], 2));
  vst1q_u8(dst + 2 * stride + 0, vextq_u8(c01[0], c01[1], 4));
  vst1q_u8(dst + 2 * stride + 16, vextq_u8(c01[1], c01[2], 4));
  vst1q_u8(dst + 3 * stride + 0, vextq_u8(c01[0], c01[1], 6));
  vst1q_u8(dst + 3 * stride + 16, vextq_u8(c01[1], c01[2], 6));
  vst1q_u8(dst + 4 * stride + 0, vextq_u8(c01[0], c01[1], 8));
  vst1q_u8(dst + 4 * stride + 16, vextq_u8(c01[1], c01[2], 8));
  vst1q_u8(dst + 5 * stride + 0, vextq_u8(c01[0], c01[1], 10));
  vst1q_u8(dst + 5 * stride + 16, vextq_u8(c01[1], c01[2], 10));
  vst1q_u8(dst + 6 * stride + 0, vextq_u8(c01[0], c01[1], 12));
  vst1q_u8(dst + 6 * stride + 16, vextq_u8(c01[1], c01[2], 12));
  vst1q_u8(dst + 7 * stride + 0, vextq_u8(c01[0], c01[1], 14));
  vst1q_u8(dst + 7 * stride + 16, vextq_u8(c01[1], c01[2], 14));
  vst1q_u8(dst + 8 * stride + 0, c01[1]);
  vst1q_u8(dst + 8 * stride + 16, c01[2]);
  vst1q_u8(dst + 9 * stride + 0, vextq_u8(c01[1], c01[2], 2));
  vst1q_u8(dst + 9 * stride + 16, vextq_u8(c01[2], c01[3], 2));
  vst1q_u8(dst + 10 * stride + 0, vextq_u8(c01[1], c01[2], 4));
  vst1q_u8(dst + 10 * stride + 16, vextq_u8(c01[2], c01[3], 4));
  vst1q_u8(dst + 11 * stride + 0, vextq_u8(c01[1], c01[2], 6));
  vst1q_u8(dst + 11 * stride + 16, vextq_u8(c01[2], c01[3], 6));
  vst1q_u8(dst + 12 * stride + 0, vextq_u8(c01[1], c01[2], 8));
  vst1q_u8(dst + 12 * stride + 16, vextq_u8(c01[2], c01[3], 8));
  vst1q_u8(dst + 13 * stride + 0, vextq_u8(c01[1], c01[2], 10));
  vst1q_u8(dst + 13 * stride + 16, vextq_u8(c01[2], c01[3], 10));
  vst1q_u8(dst + 14 * stride + 0, vextq_u8(c01[1], c01[2], 12));
  vst1q_u8(dst + 14 * stride + 16, vextq_u8(c01[2], c01[3], 12));
  vst1q_u8(dst + 15 * stride + 0, vextq_u8(c01[1], c01[2], 14));
  vst1q_u8(dst + 15 * stride + 16, vextq_u8(c01[2], c01[3], 14));
  vst1q_u8(dst + 16 * stride + 0, c01[2]);
  vst1q_u8(dst + 16 * stride + 16, c01[3]);
  vst1q_u8(dst + 17 * stride + 0, vextq_u8(c01[2], c01[3], 2));
  vst1q_u8(dst + 17 * stride + 16, vextq_u8(c01[3], l31, 2));
  vst1q_u8(dst + 18 * stride + 0, vextq_u8(c01[2], c01[3], 4));
  vst1q_u8(dst + 18 * stride + 16, vextq_u8(c01[3], l31, 4));
  vst1q_u8(dst + 19 * stride + 0, vextq_u8(c01[2], c01[3], 6));
  vst1q_u8(dst + 19 * stride + 16, vextq_u8(c01[3], l31, 6));
  vst1q_u8(dst + 20 * stride + 0, vextq_u8(c01[2], c01[3], 8));
  vst1q_u8(dst + 20 * stride + 16, vextq_u8(c01[3], l31, 8));
  vst1q_u8(dst + 21 * stride + 0, vextq_u8(c01[2], c01[3], 10));
  vst1q_u8(dst + 21 * stride + 16, vextq_u8(c01[3], l31, 10));
  vst1q_u8(dst + 22 * stride + 0, vextq_u8(c01[2], c01[3], 12));
  vst1q_u8(dst + 22 * stride + 16, vextq_u8(c01[3], l31, 12));
  vst1q_u8(dst + 23 * stride + 0, vextq_u8(c01[2], c01[3], 14));
  vst1q_u8(dst + 23 * stride + 16, vextq_u8(c01[3], l31, 14));
  vst1q_u8(dst + 24 * stride + 0, c01[3]);
  vst1q_u8(dst + 24 * stride + 16, l31);
  vst1q_u8(dst + 25 * stride + 0, vextq_u8(c01[3], l31, 2));
  vst1q_u8(dst + 25 * stride + 16, l31);
  vst1q_u8(dst + 26 * stride + 0, vextq_u8(c01[3], l31, 4));
  vst1q_u8(dst + 26 * stride + 16, l31);
  vst1q_u8(dst + 27 * stride + 0, vextq_u8(c01[3], l31, 6));
  vst1q_u8(dst + 27 * stride + 16, l31);
  vst1q_u8(dst + 28 * stride + 0, vextq_u8(c01[3], l31, 8));
  vst1q_u8(dst + 28 * stride + 16, l31);
  vst1q_u8(dst + 29 * stride + 0, vextq_u8(c01[3], l31, 10));
  vst1q_u8(dst + 29 * stride + 16, l31);
  vst1q_u8(dst + 30 * stride + 0, vextq_u8(c01[3], l31, 12));
  vst1q_u8(dst + 30 * stride + 16, l31);
  vst1q_u8(dst + 31 * stride + 0, vextq_u8(c01[3], l31, 14));
  vst1q_u8(dst + 31 * stride + 16, l31);
}

// -----------------------------------------------------------------------------

#if !HAVE_NEON_ASM

void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint32_t d = *(const uint32_t *)above;
  int i;
  (void)left;

  for (i = 0; i < 4; i++, dst += stride) {
    *(uint32_t *)dst = d;
  }
}

void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d = vld1_u8(above);
  int i;
  (void)left;

  for (i = 0; i < 8; i++, dst += stride) {
    vst1_u8(dst, d);
  }
}

void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d = vld1q_u8(above);
  int i;
  (void)left;

  for (i = 0; i < 16; i++, dst += stride) {
    vst1q_u8(dst, d);
  }
}

void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  int i;
  (void)left;

  for (i = 0; i < 32; i++) {
    // Note: performance was worse using vst2q_u8 under gcc-4.9 & clang-3.8.
    // clang-3.8 unrolled the loop fully with no filler so the cause is likely
    // the latency of the instruction.
    vst1q_u8(dst, d0);
    dst += 16;
    vst1q_u8(dst, d1);
    dst += stride - 16;
  }
}

// -----------------------------------------------------------------------------

void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint32x2_t zero = vdup_n_u32(0);
  const uint8x8_t left_u8 =
      vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)left, zero, 0));
  uint8x8_t d;
  (void)above;

  d = vdup_lane_u8(left_u8, 0);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
  dst += stride;
  d = vdup_lane_u8(left_u8, 1);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
  dst += stride;
  d = vdup_lane_u8(left_u8, 2);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
  dst += stride;
  d = vdup_lane_u8(left_u8, 3);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
}

void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint8x8_t left_u8 = vld1_u8(left);
  uint8x8_t d;
  (void)above;

  d = vdup_lane_u8(left_u8, 0);
  vst1_u8(dst, d);
  dst += stride;
  d = vdup_lane_u8(left_u8, 1);
  vst1_u8(dst, d);
  dst += stride;
  d = vdup_lane_u8(left_u8, 2);
  vst1_u8(dst, d);
  dst += stride;
  d = vdup_lane_u8(left_u8, 3);
  vst1_u8(dst, d);
  dst += stride;
  d = vdup_lane_u8(left_u8, 4);
  vst1_u8(dst, d);
  dst += stride;
  d = vdup_lane_u8(left_u8, 5);
  vst1_u8(dst, d);
  dst += stride;
  d = vdup_lane_u8(left_u8, 6);
  vst1_u8(dst, d);
  dst += stride;
  d = vdup_lane_u8(left_u8, 7);
  vst1_u8(dst, d);
}

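// Writes eight 16-pixel-wide rows, each filled with one successive lane of
// `left`, advancing *dst by one row per store.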
static INLINE void h_store_16x8(uint8_t **dst, const ptrdiff_t stride,
                                const uint8x8_t left) {
  const uint8x16_t row_0 = vdupq_lane_u8(left, 0);
  const uint8x16_t row_1 = vdupq_lane_u8(left, 1);
  const uint8x16_t row_2 = vdupq_lane_u8(left, 2);
  const uint8x16_t row_3 = vdupq_lane_u8(left, 3);
  const uint8x16_t row_4 = vdupq_lane_u8(left, 4);
  const uint8x16_t row_5 = vdupq_lane_u8(left, 5);
  const uint8x16_t row_6 = vdupq_lane_u8(left, 6);
  const uint8x16_t row_7 = vdupq_lane_u8(left, 7);

  vst1q_u8(*dst, row_0);
  *dst += stride;
  vst1q_u8(*dst, row_1);
  *dst += stride;
  vst1q_u8(*dst, row_2);
  *dst += stride;
  vst1q_u8(*dst, row_3);
  *dst += stride;
  vst1q_u8(*dst, row_4);
  *dst += stride;
  vst1q_u8(*dst, row_5);
  *dst += stride;
  vst1q_u8(*dst, row_6);
  *dst += stride;
  vst1q_u8(*dst, row_7);
  *dst += stride;
}

void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t left_u8q = vld1q_u8(left);
  (void)above;

  h_store_16x8(&dst, stride, vget_low_u8(left_u8q));
  h_store_16x8(&dst, stride, vget_high_u8(left_u8q));
}

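// As h_store_16x8, but each of the eight rows is 32 pixels wide and written
// as two 16-byte stores.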
static INLINE void h_store_32x8(uint8_t **dst, const ptrdiff_t stride,
                                const uint8x8_t left) {
  const uint8x16_t row_0 = vdupq_lane_u8(left, 0);
  const uint8x16_t row_1 = vdupq_lane_u8(left, 1);
  const uint8x16_t row_2 = vdupq_lane_u8(left, 2);
  const uint8x16_t row_3 = vdupq_lane_u8(left, 3);
  const uint8x16_t row_4 = vdupq_lane_u8(left, 4);
  const uint8x16_t row_5 = vdupq_lane_u8(left, 5);
  const uint8x16_t row_6 = vdupq_lane_u8(left, 6);
  const uint8x16_t row_7 = vdupq_lane_u8(left, 7);

  vst1q_u8(*dst, row_0);  // Note clang-3.8 produced poor code w/vst2q_u8
  *dst += 16;
  vst1q_u8(*dst, row_0);
  *dst += stride - 16;
  vst1q_u8(*dst, row_1);
  *dst += 16;
  vst1q_u8(*dst, row_1);
  *dst += stride - 16;
  vst1q_u8(*dst, row_2);
  *dst += 16;
  vst1q_u8(*dst, row_2);
  *dst += stride - 16;
  vst1q_u8(*dst, row_3);
  *dst += 16;
  vst1q_u8(*dst, row_3);
  *dst += stride - 16;
  vst1q_u8(*dst, row_4);
  *dst += 16;
  vst1q_u8(*dst, row_4);
  *dst += stride - 16;
  vst1q_u8(*dst, row_5);
  *dst += 16;
  vst1q_u8(*dst, row_5);
  *dst += stride - 16;
  vst1q_u8(*dst, row_6);
  *dst += 16;
  vst1q_u8(*dst, row_6);
  *dst += stride - 16;
  vst1q_u8(*dst, row_7);
  *dst += 16;
  vst1q_u8(*dst, row_7);
  *dst += stride - 16;
}

void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int i;
  (void)above;

  for (i = 0; i < 2; i++, left += 16) {
    const uint8x16_t left_u8 = vld1q_u8(left);
    h_store_32x8(&dst, stride, vget_low_u8(left_u8));
    h_store_32x8(&dst, stride, vget_high_u8(left_u8));
  }
}

// -----------------------------------------------------------------------------

static INLINE int16x8_t convert_u8_to_s16(uint8x8_t v) {
  return vreinterpretq_s16_u16(vmovl_u8(v));
}

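// TM (TrueMotion) prediction: each output pixel is
// left[r] + above[c] - above[-1], clamped to [0, 255]. The functions below
// widen (above[c] - above[-1]) to 16 bits once, add the replicated left value
// for each row and narrow back with saturation via vqmovun_s16.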
void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t top_left = vld1_dup_u8(above - 1);
  const uint8x8_t left_u8 = vld1_u8(left);
  const uint8x8_t above_u8 = vld1_u8(above);
  const int16x4_t left_s16 = vget_low_s16(convert_u8_to_s16(left_u8));
  int16x8_t sub, sum;
  uint32x2_t d;

  sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left));
  // Avoid vcombine_s16() which generates lots of redundant code with clang-3.8.
  sub = vreinterpretq_s16_s64(
      vdupq_lane_s64(vreinterpret_s64_s16(vget_low_s16(sub)), 0));

  sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1));
  sum = vaddq_s16(sum, sub);
  d = vreinterpret_u32_u8(vqmovun_s16(sum));
  vst1_lane_u32((uint32_t *)dst, d, 0);
  dst += stride;
  vst1_lane_u32((uint32_t *)dst, d, 1);
  dst += stride;

  sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3));
  sum = vaddq_s16(sum, sub);
  d = vreinterpret_u32_u8(vqmovun_s16(sum));
  vst1_lane_u32((uint32_t *)dst, d, 0);
  dst += stride;
  vst1_lane_u32((uint32_t *)dst, d, 1);
}

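// Writes one 8-wide TM row: adds the replicated left value to the precomputed
// (above - top_left) differences, saturates back to 8 bits and advances *dst
// by one row.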
static INLINE void tm_8_kernel(uint8_t **dst, const ptrdiff_t stride,
                               const int16x8_t left_dup, const int16x8_t sub) {
  const int16x8_t sum = vaddq_s16(left_dup, sub);
  const uint8x8_t d = vqmovun_s16(sum);
  vst1_u8(*dst, d);
  *dst += stride;
}

void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t top_left = vld1_dup_u8(above - 1);
  const uint8x8_t above_u8 = vld1_u8(above);
  const uint8x8_t left_u8 = vld1_u8(left);
  const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
  const int16x8_t sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left));
  int16x4_t left_s16d = vget_low_s16(left_s16q);
  int i;

  for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
    int16x8_t left_dup;

    left_dup = vdupq_lane_s16(left_s16d, 0);
    tm_8_kernel(&dst, stride, left_dup, sub);
    left_dup = vdupq_lane_s16(left_s16d, 1);
    tm_8_kernel(&dst, stride, left_dup, sub);
    left_dup = vdupq_lane_s16(left_s16d, 2);
    tm_8_kernel(&dst, stride, left_dup, sub);
    left_dup = vdupq_lane_s16(left_s16d, 3);
    tm_8_kernel(&dst, stride, left_dup, sub);
  }
}

static INLINE void tm_16_kernel(uint8_t **dst, const ptrdiff_t stride,
                                const int16x8_t left_dup, const int16x8_t sub0,
                                const int16x8_t sub1) {
  const int16x8_t sum0 = vaddq_s16(left_dup, sub0);
  const int16x8_t sum1 = vaddq_s16(left_dup, sub1);
  const uint8x8_t d0 = vqmovun_s16(sum0);
  const uint8x8_t d1 = vqmovun_s16(sum1);
  vst1_u8(*dst, d0);
  *dst += 8;
  vst1_u8(*dst, d1);
  *dst += stride - 8;
}

void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t top_left = vld1q_dup_u8(above - 1);
  const uint8x16_t above_u8 = vld1q_u8(above);
  const int16x8_t sub0 = vreinterpretq_s16_u16(
      vsubl_u8(vget_low_u8(above_u8), vget_low_u8(top_left)));
  const int16x8_t sub1 = vreinterpretq_s16_u16(
      vsubl_u8(vget_high_u8(above_u8), vget_high_u8(top_left)));
  int16x8_t left_dup;
  int i;

  for (i = 0; i < 2; i++, left += 8) {
    const uint8x8_t left_u8 = vld1_u8(left);
    const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
    const int16x4_t left_low = vget_low_s16(left_s16q);
    const int16x4_t left_high = vget_high_s16(left_s16q);

    left_dup = vdupq_lane_s16(left_low, 0);
    tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
    left_dup = vdupq_lane_s16(left_low, 1);
    tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
    left_dup = vdupq_lane_s16(left_low, 2);
    tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
    left_dup = vdupq_lane_s16(left_low, 3);
    tm_16_kernel(&dst, stride, left_dup, sub0, sub1);

    left_dup = vdupq_lane_s16(left_high, 0);
    tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
    left_dup = vdupq_lane_s16(left_high, 1);
    tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
    left_dup = vdupq_lane_s16(left_high, 2);
    tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
    left_dup = vdupq_lane_s16(left_high, 3);
    tm_16_kernel(&dst, stride, left_dup, sub0, sub1);
  }
}

static INLINE void tm_32_kernel(uint8_t **dst, const ptrdiff_t stride,
                                const int16x8_t left_dup, const int16x8_t sub0,
                                const int16x8_t sub1, const int16x8_t sub2,
                                const int16x8_t sub3) {
  const int16x8_t sum0 = vaddq_s16(left_dup, sub0);
  const int16x8_t sum1 = vaddq_s16(left_dup, sub1);
  const int16x8_t sum2 = vaddq_s16(left_dup, sub2);
  const int16x8_t sum3 = vaddq_s16(left_dup, sub3);
  const uint8x8_t d0 = vqmovun_s16(sum0);
  const uint8x8_t d1 = vqmovun_s16(sum1);
  const uint8x8_t d2 = vqmovun_s16(sum2);
  const uint8x8_t d3 = vqmovun_s16(sum3);

  vst1q_u8(*dst, vcombine_u8(d0, d1));
  *dst += 16;
  vst1q_u8(*dst, vcombine_u8(d2, d3));
  *dst += stride - 16;
}

void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x16_t top_left = vld1q_dup_u8(above - 1);
  const uint8x16_t above_low = vld1q_u8(above);
  const uint8x16_t above_high = vld1q_u8(above + 16);
  const int16x8_t sub0 = vreinterpretq_s16_u16(
      vsubl_u8(vget_low_u8(above_low), vget_low_u8(top_left)));
  const int16x8_t sub1 = vreinterpretq_s16_u16(
      vsubl_u8(vget_high_u8(above_low), vget_high_u8(top_left)));
  const int16x8_t sub2 = vreinterpretq_s16_u16(
      vsubl_u8(vget_low_u8(above_high), vget_low_u8(top_left)));
  const int16x8_t sub3 = vreinterpretq_s16_u16(
      vsubl_u8(vget_high_u8(above_high), vget_high_u8(top_left)));
  int16x8_t left_dup;
  int i, j;

  for (j = 0; j < 4; j++, left += 8) {
    const uint8x8_t left_u8 = vld1_u8(left);
    const int16x8_t left_s16q = convert_u8_to_s16(left_u8);
    int16x4_t left_s16d = vget_low_s16(left_s16q);
    for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
      left_dup = vdupq_lane_s16(left_s16d, 0);
      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
      left_dup = vdupq_lane_s16(left_s16d, 1);
      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
      left_dup = vdupq_lane_s16(left_s16d, 2);
      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
      left_dup = vdupq_lane_s16(left_s16d, 3);
      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3);
    }
  }
}
#endif  // !HAVE_NEON_ASM