/*
 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>

#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/arm/sum_neon.h"
#include "vpx/vpx_integer.h"

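// Helper idioms used throughout this file, matching the scalar AVG2/AVG3
// macros used by the C reference predictors:
//   AVG2(a, b)    = (a + b + 1) >> 1         -> vrhadd(a, b)
//   AVG3(a, b, c) = (a + 2 * b + c + 2) >> 2 -> vrhadd(vhadd(a, c), b)
// The truncating vhadd followed by the rounding vrhadd matches AVG3 for all
// inputs. horizontal_add_uint16x4/x8 (from sum_neon.h) sum the lanes of a
// vector into a single value.
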
//------------------------------------------------------------------------------
// DC 4x4

static INLINE uint16_t dc_sum_4(const uint16_t *ref) {
  const uint16x4_t ref_u16 = vld1_u16(ref);
  return horizontal_add_uint16x4(ref_u16);
}

static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
                                const uint16x4_t dc) {
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
    vst1_u16(dst, dc);
  }
}

void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  const uint16x4_t a = vld1_u16(above);
  const uint16x4_t l = vld1_u16(left);
  const uint16_t sum = horizontal_add_uint16x4(vadd_u16(a, l));
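  // Four above + four left samples contribute, so the rounded average is
  // (sum + 4) >> 3, which the rounding shift by 3 computes.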
  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 3);
  (void)bd;
  dc_store_4x4(dst, stride, dc);
}

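// The dc_left/dc_top variants average only a single 4-sample edge, so the
// rounding shift drops to 2, i.e. (sum + 2) >> 2.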
void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const uint16_t sum = dc_sum_4(left);
  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2);
  (void)above;
  (void)bd;
  dc_store_4x4(dst, stride, dc);
}

void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16_t sum = dc_sum_4(above);
  const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2);
  (void)left;
  (void)bd;
  dc_store_4x4(dst, stride, dc);
}

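// The dc_128 variant ignores both edges and fills the block with the
// mid-range value for the bit depth, 1 << (bd - 1): 512 at 10 bits, 2048 at
// 12 bits.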
void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16x4_t dc = vdup_n_u16(1 << (bd - 1));
  (void)above;
  (void)left;
  dc_store_4x4(dst, stride, dc);
}

//------------------------------------------------------------------------------
// DC 8x8

static INLINE uint16_t dc_sum_8(const uint16_t *ref) {
  const uint16x8_t ref_u16 = vld1q_u16(ref);
  return horizontal_add_uint16x8(ref_u16);
}

static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride,
                                const uint16x8_t dc) {
  int i;
  for (i = 0; i < 8; ++i, dst += stride) {
    vst1q_u16(dst, dc);
  }
}

void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  const uint16x8_t above_u16 = vld1q_u16(above);
  const uint16x8_t left_u16 = vld1q_u16(left);
  const uint16x8_t p0 = vaddq_u16(above_u16, left_u16);
  const uint16_t sum = horizontal_add_uint16x8(p0);
  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4);
  (void)bd;
  dc_store_8x8(dst, stride, dc);
}

void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const uint16_t sum = dc_sum_8(left);
  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3);
  (void)above;
  (void)bd;
  dc_store_8x8(dst, stride, dc);
}

void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16_t sum = dc_sum_8(above);
  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3);
  (void)left;
  (void)bd;
  dc_store_8x8(dst, stride, dc);
}

void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1));
  (void)above;
  (void)left;
  dc_store_8x8(dst, stride, dc);
}

//------------------------------------------------------------------------------
// DC 16x16

static INLINE uint16_t dc_sum_16(const uint16_t *ref) {
  const uint16x8_t ref_u16_0 = vld1q_u16(ref + 0);
  const uint16x8_t ref_u16_1 = vld1q_u16(ref + 8);
  const uint16x8_t p0 = vaddq_u16(ref_u16_0, ref_u16_1);
  return horizontal_add_uint16x8(p0);
}

static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride,
                                  const uint16x8_t dc) {
  int i;
  for (i = 0; i < 16; ++i, dst += stride) {
    vst1q_u16(dst + 0, dc);
    vst1q_u16(dst + 8, dc);
  }
}

void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const uint16x8_t a0 = vld1q_u16(above + 0);
  const uint16x8_t a1 = vld1q_u16(above + 8);
  const uint16x8_t l0 = vld1q_u16(left + 0);
  const uint16x8_t l1 = vld1q_u16(left + 8);
  const uint16x8_t pa = vaddq_u16(a0, a1);
  const uint16x8_t pl = vaddq_u16(l0, l1);
  const uint16x8_t pal0 = vaddq_u16(pa, pl);
  const uint32_t sum = horizontal_add_uint16x8(pal0);
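  // With 16 above + 16 left samples of up to 12 bits each, the sum can reach
  // 32 * 4095 = 131040 and no longer fits in 16 bits, so compute the rounded
  // average (sum + 16) >> 5 with a 32-bit narrowing shift.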
  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
  (void)bd;
  dc_store_16x16(dst, stride, dc);
}

void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
  const uint16_t sum = dc_sum_16(left);
  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4);
  (void)above;
  (void)bd;
  dc_store_16x16(dst, stride, dc);
}

void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const uint16_t sum = dc_sum_16(above);
  const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4);
  (void)left;
  (void)bd;
  dc_store_16x16(dst, stride, dc);
}

void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1));
  (void)above;
  (void)left;
  dc_store_16x16(dst, stride, dc);
}

//------------------------------------------------------------------------------
// DC 32x32

static INLINE uint32_t dc_sum_32(const uint16_t *ref) {
  const uint16x8_t r0 = vld1q_u16(ref + 0);
  const uint16x8_t r1 = vld1q_u16(ref + 8);
  const uint16x8_t r2 = vld1q_u16(ref + 16);
  const uint16x8_t r3 = vld1q_u16(ref + 24);
  const uint16x8_t p0 = vaddq_u16(r0, r1);
  const uint16x8_t p1 = vaddq_u16(r2, r3);
  const uint16x8_t p2 = vaddq_u16(p0, p1);
  return horizontal_add_uint16x8(p2);
}

static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride,
                                  const uint16x8_t dc) {
  int i;
  for (i = 0; i < 32; ++i) {
    vst1q_u16(dst + 0, dc);
    vst1q_u16(dst + 8, dc);
    vst1q_u16(dst + 16, dc);
    vst1q_u16(dst + 24, dc);
    dst += stride;
  }
}

void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const uint16x8_t a0 = vld1q_u16(above + 0);
  const uint16x8_t a1 = vld1q_u16(above + 8);
  const uint16x8_t a2 = vld1q_u16(above + 16);
  const uint16x8_t a3 = vld1q_u16(above + 24);
  const uint16x8_t l0 = vld1q_u16(left + 0);
  const uint16x8_t l1 = vld1q_u16(left + 8);
  const uint16x8_t l2 = vld1q_u16(left + 16);
  const uint16x8_t l3 = vld1q_u16(left + 24);
  const uint16x8_t pa0 = vaddq_u16(a0, a1);
  const uint16x8_t pa1 = vaddq_u16(a2, a3);
  const uint16x8_t pl0 = vaddq_u16(l0, l1);
  const uint16x8_t pl1 = vaddq_u16(l2, l3);
  const uint16x8_t pa = vaddq_u16(pa0, pa1);
  const uint16x8_t pl = vaddq_u16(pl0, pl1);
  const uint16x8_t pal0 = vaddq_u16(pa, pl);
  const uint32_t sum = horizontal_add_uint16x8(pal0);
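  // As in the 16x16 case the 64 edge samples can overflow 16 bits, so compute
  // the rounded average (sum + 32) >> 6 with a 32-bit narrowing shift.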
  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 6), 0);
  (void)bd;
  dc_store_32x32(dst, stride, dc);
}

void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *above,
                                             const uint16_t *left, int bd) {
  const uint32_t sum = dc_sum_32(left);
  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
  (void)above;
  (void)bd;
  dc_store_32x32(dst, stride, dc);
}

void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const uint32_t sum = dc_sum_32(above);
  const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0);
  (void)left;
  (void)bd;
  dc_store_32x32(dst, stride, dc);
}

void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above,
                                            const uint16_t *left, int bd) {
  const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1));
  (void)above;
  (void)left;
  dc_store_32x32(dst, stride, dc);
}

// -----------------------------------------------------------------------------
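// D45: the prediction is built from the above row only. Roughly, row r,
// column c is AVG3(above[r + c], above[r + c + 1], above[r + c + 2]), i.e.
// each row is the row above it shifted left by one sample, and positions that
// would read past the end of the above row are filled with the right-most
// above sample used by the block.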

void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  uint16x8_t a0, a1, a2, d0;
  uint16_t a7;
  (void)left;
  (void)bd;

  a0 = vld1q_u16(above);
  a7 = above[7];

  // [ above[1], ..., above[6], x, x ]
  a1 = vextq_u16(a0, a0, 1);
  // [ above[2], ..., above[7], x, x ]
  a2 = vextq_u16(a0, a0, 2);

  // d0[0] = AVG3(above[0], above[1], above[2]);
  // ...
  // d0[5] = AVG3(above[5], above[6], above[7]);
  // d0[6] = x (don't care)
  // d0[7] = x (don't care)
  d0 = vrhaddq_u16(vhaddq_u16(a0, a2), a1);

  // We want:
  // stride=0 [ d0[0], d0[1], d0[2], d0[3] ]
  // stride=1 [ d0[1], d0[2], d0[3], d0[4] ]
  // stride=2 [ d0[2], d0[3], d0[4], d0[5] ]
  // stride=3 [ d0[3], d0[4], d0[5], above[7] ]
  vst1_u16(dst + 0 * stride, vget_low_u16(d0));
  vst1_u16(dst + 1 * stride, vget_low_u16(vextq_u16(d0, d0, 1)));
  vst1_u16(dst + 2 * stride, vget_low_u16(vextq_u16(d0, d0, 2)));
  vst1_u16(dst + 3 * stride, vget_low_u16(vextq_u16(d0, d0, 3)));

  // We stored d0[6] above, so fixup into above[7].
  dst[3 * stride + 3] = a7;
}

void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  uint16x8_t ax0, a0, a1, a7, d0;
  (void)left;
  (void)bd;

  a0 = vld1q_u16(above + 0);
  a1 = vld1q_u16(above + 1);
  a7 = vld1q_dup_u16(above + 7);

  // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can
  // shift in above[7] later, so shift a0 across by one to get the right
  // inputs:
  // [ x, above[0], ... , above[6] ]
  ax0 = vextq_u16(a0, a0, 7);

  // d0[0] = x (don't care)
  // d0[1] = AVG3(above[0], above[1], above[2]);
  // ...
  // d0[7] = AVG3(above[6], above[7], above[8]);
  d0 = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);

  // Undo the earlier ext, incrementally shift in duplicates of above[7].
  vst1q_u16(dst + 0 * stride, vextq_u16(d0, a7, 1));
  vst1q_u16(dst + 1 * stride, vextq_u16(d0, a7, 2));
  vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 3));
  vst1q_u16(dst + 3 * stride, vextq_u16(d0, a7, 4));
  vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 5));
  vst1q_u16(dst + 5 * stride, vextq_u16(d0, a7, 6));
  vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 7));
  vst1q_u16(dst + 7 * stride, a7);
}

void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  uint16x8_t ax0, a0, a1, a7, a8, a9, a15, d0[2];
  (void)left;
  (void)bd;

  a0 = vld1q_u16(above + 0);
  a1 = vld1q_u16(above + 1);
  a7 = vld1q_u16(above + 7);
  a8 = vld1q_u16(above + 8);
  a9 = vld1q_u16(above + 9);
  a15 = vld1q_dup_u16(above + 15);

  // [ x, above[0], ... , above[6] ]
  ax0 = vextq_u16(a0, a0, 7);

  // We have one unused lane here to leave room to shift in above[15] in the
  // last lane:
  // d0[0][0] = x (don't care)
  // d0[0][1] = AVG3(above[0], above[1], above[2]);
  // ...
  // d0[0][7] = AVG3(above[6], above[7], above[8]);
  // d0[1][0] = AVG3(above[7], above[8], above[9]);
  // ...
  // d0[1][7] = AVG3(above[14], above[15], above[16]);
  d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);
  d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8);

  // Incrementally shift in duplicates of above[15].
  vst1q_u16(dst + 0 * stride + 0, vextq_u16(d0[0], d0[1], 1));
  vst1q_u16(dst + 0 * stride + 8, vextq_u16(d0[1], a15, 1));
  vst1q_u16(dst + 1 * stride + 0, vextq_u16(d0[0], d0[1], 2));
  vst1q_u16(dst + 1 * stride + 8, vextq_u16(d0[1], a15, 2));
  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 3));
  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], a15, 3));
  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d0[0], d0[1], 4));
  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d0[1], a15, 4));
  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 5));
  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], a15, 5));
  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d0[0], d0[1], 6));
  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d0[1], a15, 6));
  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 7));
  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], a15, 7));
  vst1q_u16(dst + 7 * stride + 0, d0[1]);
  vst1q_u16(dst + 7 * stride + 8, a15);

  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[1], a15, 1));
  vst1q_u16(dst + 8 * stride + 8, a15);
  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d0[1], a15, 2));
  vst1q_u16(dst + 9 * stride + 8, a15);
  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[1], a15, 3));
  vst1q_u16(dst + 10 * stride + 8, a15);
  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d0[1], a15, 4));
  vst1q_u16(dst + 11 * stride + 8, a15);
  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[1], a15, 5));
  vst1q_u16(dst + 12 * stride + 8, a15);
  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d0[1], a15, 6));
  vst1q_u16(dst + 13 * stride + 8, a15);
  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[1], a15, 7));
  vst1q_u16(dst + 14 * stride + 8, a15);
  vst1q_u16(dst + 15 * stride + 0, a15);
  vst1q_u16(dst + 15 * stride + 8, a15);
}

void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  uint16x8_t ax0, a0, a1, a7, a8, a9, a15, a16, a17, a23, a24, a25, a31, d0[4];
  int i;
  (void)left;
  (void)bd;

  a0 = vld1q_u16(above + 0);
  a1 = vld1q_u16(above + 1);
  a7 = vld1q_u16(above + 7);
  a8 = vld1q_u16(above + 8);
  a9 = vld1q_u16(above + 9);
  a15 = vld1q_u16(above + 15);
  a16 = vld1q_u16(above + 16);
  a17 = vld1q_u16(above + 17);
  a23 = vld1q_u16(above + 23);
  a24 = vld1q_u16(above + 24);
  a25 = vld1q_u16(above + 25);
  a31 = vld1q_dup_u16(above + 31);

  // [ x, above[0], ... , above[6] ]
  ax0 = vextq_u16(a0, a0, 7);

  d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0);
  d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8);
  d0[2] = vrhaddq_u16(vhaddq_u16(a15, a17), a16);
  d0[3] = vrhaddq_u16(vhaddq_u16(a23, a25), a24);

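  // Each iteration pulls the row of AVG3 results one lane to the left and
  // shifts a duplicate of above[31] into the vacated last lane, so row i is
  // the filtered above row shifted left by i + 1 with above[31] padding.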
  for (i = 0; i < 32; ++i) {
    d0[0] = vextq_u16(d0[0], d0[1], 1);
    d0[1] = vextq_u16(d0[1], d0[2], 1);
    d0[2] = vextq_u16(d0[2], d0[3], 1);
    d0[3] = vextq_u16(d0[3], a31, 1);
    vst1q_u16(dst + 0, d0[0]);
    vst1q_u16(dst + 8, d0[1]);
    vst1q_u16(dst + 16, d0[2]);
    vst1q_u16(dst + 24, d0[3]);
    dst += stride;
  }
}

// -----------------------------------------------------------------------------
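// D63: also built from the above row only. Even rows hold AVG2 of adjacent
// above samples and odd rows hold AVG3 of three consecutive above samples;
// the pattern steps one sample to the right every two rows, replicating the
// right-most above sample once the step runs off the edge (the 4x4 case is
// slightly different, as noted in the function below).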

void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  uint16x4_t a0, a1, a2, a3, d0, d1, d2, d3;
  (void)left;
  (void)bd;

  a0 = vld1_u16(above + 0);
  a1 = vld1_u16(above + 1);
  a2 = vld1_u16(above + 2);
  a3 = vld1_u16(above + 3);

  d0 = vrhadd_u16(a0, a1);
  d1 = vrhadd_u16(vhadd_u16(a0, a2), a1);
  d2 = vrhadd_u16(a1, a2);
  d3 = vrhadd_u16(vhadd_u16(a1, a3), a2);

  // Note that here we are performing a full avg calculation for the final
  // elements rather than storing a duplicate of above[3], which differs
  // (correctly) from the general scheme employed by the bs={8,16,32}
  // implementations in order to match the original C implementation.
  vst1_u16(dst + 0 * stride, d0);
  vst1_u16(dst + 1 * stride, d1);
  vst1_u16(dst + 2 * stride, d2);
  vst1_u16(dst + 3 * stride, d3);
}

void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                       const uint16_t *above,
                                       const uint16_t *left, int bd) {
  uint16x8_t a0, a1, a2, a7, d0, d1, d0_ext, d1_ext;
  (void)left;
  (void)bd;

  a0 = vld1q_u16(above + 0);
  a1 = vld1q_u16(above + 1);
  a2 = vld1q_u16(above + 2);
  a7 = vld1q_dup_u16(above + 7);

  d0 = vrhaddq_u16(a0, a1);
  d1 = vrhaddq_u16(vhaddq_u16(a0, a2), a1);

  // We want to store:
  // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ]
  // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ]
  // stride=2 [ d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], a[7], a[7] ]
  // stride=3 [ d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], a[7], a[7] ]
  // stride=4 [ d0[2], d0[3], d0[4], d0[5], d0[6], a[7], a[7], a[7] ]
  // stride=5 [ d1[2], d1[3], d1[4], d1[5], d1[6], a[7], a[7], a[7] ]
  // stride=6 [ d0[3], d0[4], d0[5], d0[6], a[7], a[7], a[7], a[7] ]
  // stride=7 [ d1[3], d1[4], d1[5], d1[6], a[7], a[7], a[7], a[7] ]
  // Note in particular that d0[7] and d1[7] are only ever referenced in the
  // stride=0 and stride=1 cases respectively, and in later strides are
  // replaced by a copy of above[7]. These are equivalent if for i>7,
  // above[i]==above[7], however that is not always the case.

  // Strip out d0[7] and d1[7] so that we can replace it with an additional
  // copy of above[7], the first vector here doesn't matter so just reuse
  // d0/d1.
  d0_ext = vextq_u16(d0, d0, 7);
  d1_ext = vextq_u16(d1, d1, 7);

  // Shuffle in duplicates of above[7] and store.
  vst1q_u16(dst + 0 * stride, d0);
  vst1q_u16(dst + 1 * stride, d1);
  vst1q_u16(dst + 2 * stride, vextq_u16(d0_ext, a7, 2));
  vst1q_u16(dst + 3 * stride, vextq_u16(d1_ext, a7, 2));
  vst1q_u16(dst + 4 * stride, vextq_u16(d0_ext, a7, 3));
  vst1q_u16(dst + 5 * stride, vextq_u16(d1_ext, a7, 3));
  vst1q_u16(dst + 6 * stride, vextq_u16(d0_ext, a7, 4));
  vst1q_u16(dst + 7 * stride, vextq_u16(d1_ext, a7, 4));
}

void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation.
  uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0[2], d1[2], d0_ext, d1_ext;
  (void)left;
  (void)bd;

  a0 = vld1q_u16(above + 0);
  a1 = vld1q_u16(above + 1);
  a2 = vld1q_u16(above + 2);
  a8 = vld1q_u16(above + 8);
  a9 = vld1q_u16(above + 9);
  a10 = vld1q_u16(above + 10);
  a15 = vld1q_dup_u16(above + 15);

  d0[0] = vrhaddq_u16(a0, a1);
  d0[1] = vrhaddq_u16(a8, a9);
  d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
  d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9);

  // Strip out the final element of d0/d1 so that we can replace it with an
  // additional copy of above[15], the first vector here doesn't matter so
  // just reuse the same vector.
  d0_ext = vextq_u16(d0[1], d0[1], 7);
  d1_ext = vextq_u16(d1[1], d1[1], 7);

  // Shuffle in duplicates of above[15] and store. Note that cases involving
  // {d0,d1}_ext require an extra shift to undo the shifting out of the final
  // element from above.
  vst1q_u16(dst + 0 * stride + 0, d0[0]);
  vst1q_u16(dst + 0 * stride + 8, d0[1]);
  vst1q_u16(dst + 1 * stride + 0, d1[0]);
  vst1q_u16(dst + 1 * stride + 8, d1[1]);
  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1));
  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_ext, a15, 2));
  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1));
  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_ext, a15, 2));
  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2));
  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_ext, a15, 3));
  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2));
  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_ext, a15, 3));
  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3));
  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_ext, a15, 4));
  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3));
  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_ext, a15, 4));
  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4));
  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_ext, a15, 5));
  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4));
  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_ext, a15, 5));
  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5));
  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_ext, a15, 6));
  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5));
  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_ext, a15, 6));
  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6));
  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_ext, a15, 7));
  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6));
  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_ext, a15, 7));
  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7));
  vst1q_u16(dst + 14 * stride + 8, a15);
  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7));
  vst1q_u16(dst + 15 * stride + 8, a15);
}

void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                         const uint16_t *above,
                                         const uint16_t *left, int bd) {
  // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation.
  uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4],
      d1[4], d0_ext, d1_ext;
  (void)left;
  (void)bd;

  a0 = vld1q_u16(above + 0);
  a1 = vld1q_u16(above + 1);
  a2 = vld1q_u16(above + 2);
  a8 = vld1q_u16(above + 8);
  a9 = vld1q_u16(above + 9);
  a10 = vld1q_u16(above + 10);
  a16 = vld1q_u16(above + 16);
  a17 = vld1q_u16(above + 17);
  a18 = vld1q_u16(above + 18);
  a24 = vld1q_u16(above + 24);
  a25 = vld1q_u16(above + 25);
  a26 = vld1q_u16(above + 26);
  a31 = vld1q_dup_u16(above + 31);

  d0[0] = vrhaddq_u16(a0, a1);
  d0[1] = vrhaddq_u16(a8, a9);
  d0[2] = vrhaddq_u16(a16, a17);
  d0[3] = vrhaddq_u16(a24, a25);
  d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1);
  d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9);
  d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17);
  d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25);

  // Strip out the final element of d0/d1 so that we can replace it with an
  // additional copy of above[31], the first vector here doesn't matter so
  // just reuse the same vector.
  d0_ext = vextq_u16(d0[3], d0[3], 7);
  d1_ext = vextq_u16(d1[3], d1[3], 7);

  // Shuffle in duplicates of above[31] and store. Note that cases involving
  // {d0,d1}_ext require an extra shift to undo the shifting out of the final
  // element from above.

  vst1q_u16(dst + 0 * stride + 0, d0[0]);
  vst1q_u16(dst + 0 * stride + 8, d0[1]);
  vst1q_u16(dst + 0 * stride + 16, d0[2]);
  vst1q_u16(dst + 0 * stride + 24, d0[3]);
  vst1q_u16(dst + 1 * stride + 0, d1[0]);
  vst1q_u16(dst + 1 * stride + 8, d1[1]);
  vst1q_u16(dst + 1 * stride + 16, d1[2]);
  vst1q_u16(dst + 1 * stride + 24, d1[3]);

  vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1));
  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1));
  vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1));
  vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0_ext, a31, 2));
  vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1));
  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1));
  vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1));
  vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1_ext, a31, 2));

  vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2));
  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2));
  vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2));
  vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0_ext, a31, 3));
  vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2));
  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2));
  vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2));
  vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1_ext, a31, 3));

  vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3));
  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3));
  vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3));
  vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0_ext, a31, 4));
  vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3));
  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3));
  vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3));
  vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1_ext, a31, 4));

  vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4));
  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4));
  vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[2], d0[3], 4));
  vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0_ext, a31, 5));
  vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4));
  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4));
  vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4));
  vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1_ext, a31, 5));

  vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5));
  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5));
  vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5));
  vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0_ext, a31, 6));
  vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5));
  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5));
  vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5));
  vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1_ext, a31, 6));

  vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6));
  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6));
  vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6));
  vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0_ext, a31, 7));
  vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6));
  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6));
  vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6));
  vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1_ext, a31, 7));

  vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7));
  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7));
  vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[2], d0[3], 7));
  vst1q_u16(dst + 14 * stride + 24, a31);
  vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7));
  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7));
  vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7));
  vst1q_u16(dst + 15 * stride + 24, a31);

  vst1q_u16(dst + 16 * stride + 0, d0[1]);
  vst1q_u16(dst + 16 * stride + 8, d0[2]);
  vst1q_u16(dst + 16 * stride + 16, vextq_u16(d0_ext, a31, 1));
  vst1q_u16(dst + 16 * stride + 24, a31);
  vst1q_u16(dst + 17 * stride + 0, d1[1]);
  vst1q_u16(dst + 17 * stride + 8, d1[2]);
  vst1q_u16(dst + 17 * stride + 16, vextq_u16(d1_ext, a31, 1));
  vst1q_u16(dst + 17 * stride + 24, a31);

  vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1));
  vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1));
  vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0_ext, a31, 2));
  vst1q_u16(dst + 18 * stride + 24, a31);
  vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1));
  vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1));
  vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1_ext, a31, 2));
  vst1q_u16(dst + 19 * stride + 24, a31);

  vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2));
  vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2));
  vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0_ext, a31, 3));
  vst1q_u16(dst + 20 * stride + 24, a31);
  vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2));
  vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2));
  vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1_ext, a31, 3));
  vst1q_u16(dst + 21 * stride + 24, a31);

  vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3));
  vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3));
  vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0_ext, a31, 4));
  vst1q_u16(dst + 22 * stride + 24, a31);
  vst1q_u16(dst + 23 * stride + 0, vextq_u16(d1[1], d1[2], 3));
  vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3));
  vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1_ext, a31, 4));
  vst1q_u16(dst + 23 * stride + 24, a31);

  vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4));
  vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4));
  vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0_ext, a31, 5));
  vst1q_u16(dst + 24 * stride + 24, a31);
  vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4));
  vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4));
  vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1_ext, a31, 5));
  vst1q_u16(dst + 25 * stride + 24, a31);

  vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5));
  vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5));
  vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0_ext, a31, 6));
  vst1q_u16(dst + 26 * stride + 24, a31);
  vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5));
  vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5));
  vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1_ext, a31, 6));
  vst1q_u16(dst + 27 * stride + 24, a31);

  vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6));
  vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6));
  vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0_ext, a31, 7));
  vst1q_u16(dst + 28 * stride + 24, a31);
  vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6));
  vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6));
  vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1_ext, a31, 7));
  vst1q_u16(dst + 29 * stride + 24, a31);

  vst1q_u16(dst + 30 * stride + 0, vextq_u16(d0[1], d0[2], 7));
  vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7));
  vst1q_u16(dst + 30 * stride + 16, a31);
  vst1q_u16(dst + 30 * stride + 24, a31);
  vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7));
  vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7));
  vst1q_u16(dst + 31 * stride + 16, a31);
  vst1q_u16(dst + 31 * stride + 24, a31);
}

// -----------------------------------------------------------------------------
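// D117: the top two rows are an AVG2 and an AVG3 filter of the above row
// (including above[-1]); every row below that repeats the row two above it
// shifted right by one, with column 0 supplied by AVG3 smoothing down the
// left column.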

void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  uint16x4_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1;
  (void)bd;

  az = vld1_u16(above - 1);
  a0 = vld1_u16(above + 0);
  // [ left[0], above[-1], above[0], above[1] ]
  l0az = vext_u16(vld1_dup_u16(left), az, 3);

  l0 = vld1_u16(left + 0);
  // The last lane here is unused, reading left[4] could cause a buffer
  // over-read, so just fill with a duplicate of left[0] to avoid needing to
  // materialize a zero:
  // [ left[1], left[2], left[3], x ]
  l1 = vext_u16(l0, l0, 1);
  // [ above[-1], left[0], left[1], left[2] ]
  azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3);

  d0 = vrhadd_u16(az, a0);
  d1 = vrhadd_u16(vhadd_u16(l0az, a0), az);

  col0 = vrhadd_u16(vhadd_u16(azl0, l1), l0);
  col0_even = vdup_lane_u16(col0, 0);
  col0_odd = vdup_lane_u16(col0, 1);

  vst1_u16(dst + 0 * stride, d0);
  vst1_u16(dst + 1 * stride, d1);
  vst1_u16(dst + 2 * stride, vext_u16(col0_even, d0, 3));
  vst1_u16(dst + 3 * stride, vext_u16(col0_odd, d1, 3));
}

void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  uint16x8_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1;
  (void)bd;

  az = vld1q_u16(above - 1);
  a0 = vld1q_u16(above + 0);
  // [ left[0], above[-1], ... , above[5] ]
  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);

  l0 = vld1q_u16(left + 0);
  // The last lane here is unused, reading left[8] could cause a buffer
  // over-read, so just fill with a duplicate of left[0] to avoid needing to
  // materialize a zero:
  // [ left[1], ... , left[7], x ]
  l1 = vextq_u16(l0, l0, 1);
  // [ above[-1], left[0], ..., left[6] ]
  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);

  // d0[0] = AVG2(above[-1], above[0])
  // ...
  // d0[7] = AVG2(above[6], above[7])
  d0 = vrhaddq_u16(az, a0);

  // d1[0] = AVG3(left[0], above[-1], above[0])
  // d1[1] = AVG3(above[-1], above[0], above[1])
  // ...
  // d1[7] = AVG3(above[5], above[6], above[7])
  d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az);

  // The ext instruction shifts elements in from the end of the vector rather
  // than the start, so reverse the vector to put the elements to be shifted in
  // at the end:
  // col0[7] = AVG3(above[-1], left[0], left[1])
  // col0[6] = AVG3(left[0], left[1], left[2])
  // ...
  // col0[0] = AVG3(left[6], left[7], left[8])
  col0 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
  col0 = vrev64q_u16(vextq_u16(col0, col0, 4));

  // We don't care about the first parameter to this uzp since we only ever use
  // the high three elements, we just use col0 again since it is already
  // available:
  // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ]
  // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ]
  col0_even = vuzpq_u16(col0, col0).val[1];
  col0_odd = vuzpq_u16(col0, col0).val[0];

  // Incrementally shift more elements from col0 into d0/1:
  // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ]
  // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ]
  // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ]
  // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
  // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ]
  // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ]
  // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ]
  // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ]
  vst1q_u16(dst + 0 * stride, d0);
  vst1q_u16(dst + 1 * stride, d1);
  vst1q_u16(dst + 2 * stride, vextq_u16(col0_even, d0, 7));
  vst1q_u16(dst + 3 * stride, vextq_u16(col0_odd, d1, 7));
  vst1q_u16(dst + 4 * stride, vextq_u16(col0_even, d0, 6));
  vst1q_u16(dst + 5 * stride, vextq_u16(col0_odd, d1, 6));
  vst1q_u16(dst + 6 * stride, vextq_u16(col0_even, d0, 5));
  vst1q_u16(dst + 7 * stride, vextq_u16(col0_odd, d1, 5));
}

void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, col0_lo,
      col0_hi, col0_even, col0_odd, d0_lo, d0_hi, d1_lo, d1_hi;
  (void)bd;

  az = vld1q_u16(above - 1);
  a0 = vld1q_u16(above + 0);
  a6 = vld1q_u16(above + 6);
  a7 = vld1q_u16(above + 7);
  a8 = vld1q_u16(above + 8);
  // [ left[0], above[-1], ... , above[5] ]
  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);

  l0 = vld1q_u16(left + 0);
  l1 = vld1q_u16(left + 1);
  l7 = vld1q_u16(left + 7);
  l8 = vld1q_u16(left + 8);
  // The last lane here is unused, reading left[16] could cause a buffer
  // over-read, so just fill with a duplicate of left[8] to avoid needing to
  // materialize a zero:
  // [ left[9], ... , left[15], x ]
  l9 = vextq_u16(l8, l8, 1);
  // [ above[-1], left[0], ..., left[6] ]
  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);

  d0_lo = vrhaddq_u16(az, a0);
  d0_hi = vrhaddq_u16(a7, a8);
  d1_lo = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
  d1_hi = vrhaddq_u16(vhaddq_u16(a6, a8), a7);

  col0_lo = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
  col0_hi = vrhaddq_u16(vhaddq_u16(l7, l9), l8);

  // Reverse within each vector, then swap the array indices in the uzp to
  // complete the reversal across all 16 elements.
  col0_lo = vrev64q_u16(vextq_u16(col0_lo, col0_lo, 4));
  col0_hi = vrev64q_u16(vextq_u16(col0_hi, col0_hi, 4));
  col0_even = vuzpq_u16(col0_hi, col0_lo).val[1];
  col0_odd = vuzpq_u16(col0_hi, col0_lo).val[0];

  vst1q_u16(dst + 0 * stride + 0, d0_lo);
  vst1q_u16(dst + 0 * stride + 8, d0_hi);
  vst1q_u16(dst + 1 * stride + 0, d1_lo);
  vst1q_u16(dst + 1 * stride + 8, d1_hi);

  vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even, d0_lo, 7));
  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_lo, d0_hi, 7));
  vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd, d1_lo, 7));
  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_lo, d1_hi, 7));

  vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even, d0_lo, 6));
  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_lo, d0_hi, 6));
  vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd, d1_lo, 6));
  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_lo, d1_hi, 6));

  vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even, d0_lo, 5));
  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_lo, d0_hi, 5));
  vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd, d1_lo, 5));
  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_lo, d1_hi, 5));

  vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even, d0_lo, 4));
  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_lo, d0_hi, 4));
  vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd, d1_lo, 4));
  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_lo, d1_hi, 4));

  vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even, d0_lo, 3));
  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_lo, d0_hi, 3));
  vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd, d1_lo, 3));
  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_lo, d1_hi, 3));

  vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even, d0_lo, 2));
  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_lo, d0_hi, 2));
  vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd, d1_lo, 2));
  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_lo, d1_hi, 2));

  vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even, d0_lo, 1));
  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_lo, d0_hi, 1));
  vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd, d1_lo, 1));
  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_lo, d1_hi, 1));
}

void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7,
      l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], col0[4],
      col0_even[2], col0_odd[2];
  (void)bd;

  az = vld1q_u16(above - 1);
  a0 = vld1q_u16(above + 0);
  a6 = vld1q_u16(above + 6);
  a7 = vld1q_u16(above + 7);
  a8 = vld1q_u16(above + 8);
  a14 = vld1q_u16(above + 14);
  a15 = vld1q_u16(above + 15);
  a16 = vld1q_u16(above + 16);
  a22 = vld1q_u16(above + 22);
  a23 = vld1q_u16(above + 23);
  a24 = vld1q_u16(above + 24);
  // [ left[0], above[-1], ... , above[5] ]
  l0az = vextq_u16(vld1q_dup_u16(left), az, 7);

  l0 = vld1q_u16(left + 0);
  l1 = vld1q_u16(left + 1);
  l7 = vld1q_u16(left + 7);
  l8 = vld1q_u16(left + 8);
  l9 = vld1q_u16(left + 9);
  l15 = vld1q_u16(left + 15);
  l16 = vld1q_u16(left + 16);
  l17 = vld1q_u16(left + 17);
  l23 = vld1q_u16(left + 23);
  l24 = vld1q_u16(left + 24);
  // The last lane here is unused, reading left[32] could cause a buffer
  // over-read, so just fill with a duplicate of left[24] to avoid needing to
  // materialize a zero:
  // [ left[25], ... , left[31], x ]
  l25 = vextq_u16(l24, l24, 1);
  // [ above[-1], left[0], ..., left[6] ]
  azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);

  d0[0] = vrhaddq_u16(az, a0);
  d0[1] = vrhaddq_u16(a7, a8);
  d0[2] = vrhaddq_u16(a15, a16);
  d0[3] = vrhaddq_u16(a23, a24);
  d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
  d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
  d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15);
  d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23);

  col0[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
  col0[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
  col0[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16);
  col0[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24);

  // Reverse within each vector, then swap the array indices in both the uzp
  // and the col0_{even,odd} assignment to complete the reversal across all
  // 32-elements.
  col0[0] = vrev64q_u16(vextq_u16(col0[0], col0[0], 4));
  col0[1] = vrev64q_u16(vextq_u16(col0[1], col0[1], 4));
  col0[2] = vrev64q_u16(vextq_u16(col0[2], col0[2], 4));
  col0[3] = vrev64q_u16(vextq_u16(col0[3], col0[3], 4));

  col0_even[1] = vuzpq_u16(col0[1], col0[0]).val[1];
  col0_even[0] = vuzpq_u16(col0[3], col0[2]).val[1];
  col0_odd[1] = vuzpq_u16(col0[1], col0[0]).val[0];
  col0_odd[0] = vuzpq_u16(col0[3], col0[2]).val[0];

  vst1q_u16(dst + 0 * stride + 0, d0[0]);
  vst1q_u16(dst + 0 * stride + 8, d0[1]);
  vst1q_u16(dst + 0 * stride + 16, d0[2]);
  vst1q_u16(dst + 0 * stride + 24, d0[3]);
  vst1q_u16(dst + 1 * stride + 0, d1[0]);
  vst1q_u16(dst + 1 * stride + 8, d1[1]);
  vst1q_u16(dst + 1 * stride + 16, d1[2]);
  vst1q_u16(dst + 1 * stride + 24, d1[3]);

  vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even[1], d0[0], 7));
  vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[0], d0[1], 7));
  vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[1], d0[2], 7));
  vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[2], d0[3], 7));
  vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd[1], d1[0], 7));
  vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 7));
  vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 7));
  vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 7));

  vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even[1], d0[0], 6));
  vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[0], d0[1], 6));
  vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[1], d0[2], 6));
  vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[2], d0[3], 6));
  vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd[1], d1[0], 6));
  vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[0], d1[1], 6));
  vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[1], d1[2], 6));
  vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[2], d1[3], 6));

  vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even[1], d0[0], 5));
  vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[0], d0[1], 5));
  vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[1], d0[2], 5));
  vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[2], d0[3], 5));
  vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd[1], d1[0], 5));
  vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[0], d1[1], 5));
  vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[1], d1[2], 5));
  vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[2], d1[3], 5));

  vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even[1], d0[0], 4));
  vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[0], d0[1], 4));
  vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[1], d0[2], 4));
  vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[2], d0[3], 4));
  vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd[1], d1[0], 4));
  vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[0], d1[1], 4));
  vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[1], d1[2], 4));
  vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[2], d1[3], 4));

  vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even[1], d0[0], 3));
  vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[0], d0[1], 3));
  vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[1], d0[2], 3));
  vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[2], d0[3], 3));
  vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd[1], d1[0], 3));
  vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[0], d1[1], 3));
  vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[1], d1[2], 3));
  vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[2], d1[3], 3));

  vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even[1], d0[0], 2));
  vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[0], d0[1], 2));
  vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[1], d0[2], 2));
  vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[2], d0[3], 2));
  vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd[1], d1[0], 2));
  vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[0], d1[1], 2));
  vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[1], d1[2], 2));
  vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[2], d1[3], 2));

  vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even[1], d0[0], 1));
  vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[0], d0[1], 1));
  vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[1], d0[2], 1));
  vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[2], d0[3], 1));
  vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd[1], d1[0], 1));
  vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[0], d1[1], 1));
  vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[1], d1[2], 1));
  vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[2], d1[3], 1));

  vst1q_u16(dst + 16 * stride + 0, col0_even[1]);
  vst1q_u16(dst + 16 * stride + 8, d0[0]);
  vst1q_u16(dst + 16 * stride + 16, d0[1]);
  vst1q_u16(dst + 16 * stride + 24, d0[2]);
  vst1q_u16(dst + 17 * stride + 0, col0_odd[1]);
  vst1q_u16(dst + 17 * stride + 8, d1[0]);
  vst1q_u16(dst + 17 * stride + 16, d1[1]);
  vst1q_u16(dst + 17 * stride + 24, d1[2]);

  vst1q_u16(dst + 18 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 7));
  vst1q_u16(dst + 18 * stride + 8, vextq_u16(col0_even[1], d0[0], 7));
  vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[0], d0[1], 7));
  vst1q_u16(dst + 18 * stride + 24, vextq_u16(d0[1], d0[2], 7));
  vst1q_u16(dst + 19 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 7));
  vst1q_u16(dst + 19 * stride + 8, vextq_u16(col0_odd[1], d1[0], 7));
  vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[0], d1[1], 7));
  vst1q_u16(dst + 19 * stride + 24, vextq_u16(d1[1], d1[2], 7));

  vst1q_u16(dst + 20 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 6));
  vst1q_u16(dst + 20 * stride + 8, vextq_u16(col0_even[1], d0[0], 6));
  vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[0], d0[1], 6));
  vst1q_u16(dst + 20 * stride + 24, vextq_u16(d0[1], d0[2], 6));
  vst1q_u16(dst + 21 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 6));
  vst1q_u16(dst + 21 * stride + 8, vextq_u16(col0_odd[1], d1[0], 6));
  vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[0], d1[1], 6));
  vst1q_u16(dst + 21 * stride + 24, vextq_u16(d1[1], d1[2], 6));

  vst1q_u16(dst + 22 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 5));
  vst1q_u16(dst + 22 * stride + 8, vextq_u16(col0_even[1], d0[0], 5));
  vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[0], d0[1], 5));
  vst1q_u16(dst + 22 * stride + 24, vextq_u16(d0[1], d0[2], 5));
  vst1q_u16(dst + 23 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 5));
  vst1q_u16(dst + 23 * stride + 8, vextq_u16(col0_odd[1], d1[0], 5));
  vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[0], d1[1], 5));
  vst1q_u16(dst + 23 * stride + 24, vextq_u16(d1[1], d1[2], 5));

  vst1q_u16(dst + 24 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 4));
  vst1q_u16(dst + 24 * stride + 8, vextq_u16(col0_even[1], d0[0], 4));
  vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[0], d0[1], 4));
  vst1q_u16(dst + 24 * stride + 24, vextq_u16(d0[1], d0[2], 4));
  vst1q_u16(dst + 25 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 4));
  vst1q_u16(dst + 25 * stride + 8, vextq_u16(col0_odd[1], d1[0], 4));
  vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[0], d1[1], 4));
  vst1q_u16(dst + 25 * stride + 24, vextq_u16(d1[1], d1[2], 4));

  vst1q_u16(dst + 26 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 3));
  vst1q_u16(dst + 26 * stride + 8, vextq_u16(col0_even[1], d0[0], 3));
  vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[0], d0[1], 3));
  vst1q_u16(dst + 26 * stride + 24, vextq_u16(d0[1], d0[2], 3));
  vst1q_u16(dst + 27 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 3));
  vst1q_u16(dst + 27 * stride + 8, vextq_u16(col0_odd[1], d1[0], 3));
  vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[0], d1[1], 3));
  vst1q_u16(dst + 27 * stride + 24, vextq_u16(d1[1], d1[2], 3));

  vst1q_u16(dst + 28 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 2));
  vst1q_u16(dst + 28 * stride + 8, vextq_u16(col0_even[1], d0[0], 2));
  vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[0], d0[1], 2));
  vst1q_u16(dst + 28 * stride + 24, vextq_u16(d0[1], d0[2], 2));
  vst1q_u16(dst + 29 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 2));
  vst1q_u16(dst + 29 * stride + 8, vextq_u16(col0_odd[1], d1[0], 2));
  vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[0], d1[1], 2));
  vst1q_u16(dst + 29 * stride + 24, vextq_u16(d1[1], d1[2], 2));

  vst1q_u16(dst + 30 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 1));
  vst1q_u16(dst + 30 * stride + 8, vextq_u16(col0_even[1], d0[0], 1));
  vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[0], d0[1], 1));
  vst1q_u16(dst + 30 * stride + 24, vextq_u16(d0[1], d0[2], 1));
  vst1q_u16(dst + 31 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 1));
  vst1q_u16(dst + 31 * stride + 8, vextq_u16(col0_odd[1], d1[0], 1));
  vst1q_u16(dst + 31 * stride + 16, vextq_u16(d1[0], d1[1], 1));
  vst1q_u16(dst + 31 * stride + 24, vextq_u16(d1[1], d1[2], 1));
}

// -----------------------------------------------------------------------------
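// D153: column 0 is AVG2 of neighbouring left samples (seeded with above[-1]),
// column 1 is AVG3 down the left column, the rest of row 0 is an AVG3 filter
// of the above row, and each later row repeats the row above it shifted right
// by two columns.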
1180
vpx_highbd_d153_predictor_4x4_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)1181 void vpx_highbd_d153_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
1182 const uint16_t *above,
1183 const uint16_t *left, int bd) {
1184 // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
1185 uint16x4_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d20_lo, d20_hi;
1186 (void)bd;
1187
1188 az = vld1_u16(above - 1);
1189 a0 = vld1_u16(above + 0);
1190 // [ left[0], above[-1], above[0], above[1] ]
1191 l0az = vext_u16(vld1_dup_u16(left), az, 3);
1192
1193 l0 = vld1_u16(left);
1194 // The last lane here is unused, reading left[4] could cause a buffer
1195 // over-read, so just fill with a duplicate of left[0] to avoid needing to
1196 // materialize a zero:
1197 // [ left[1], left[2], left[3], x ]
1198 l1 = vext_u16(l0, l0, 1);
1199 // [ above[-1], left[0], left[1], left[2] ]
1200 azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3);
1201
1202 d0 = vrhadd_u16(azl0, l0);
1203 d1 = vrhadd_u16(vhadd_u16(l0az, a0), az);
1204 d2 = vrhadd_u16(vhadd_u16(azl0, l1), l0);
1205
1206 d20_lo = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[0];
1207 d20_hi = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[1];
1208
1209 // Incrementally shift more elements from d0/d2 reversed into d1:
1210 // stride=0 [ d0[0], d1[0], d1[1], d1[2] ]
1211 // stride=1 [ d0[1], d2[0], d0[0], d1[0] ]
1212 // stride=2 [ d0[2], d2[1], d0[1], d2[0] ]
1213 // stride=3 [ d0[3], d2[2], d0[2], d2[1] ]
1214 vst1_u16(dst + 0 * stride, vext_u16(d20_hi, d1, 3));
1215 vst1_u16(dst + 1 * stride, vext_u16(d20_hi, d1, 1));
1216 vst1_u16(dst + 2 * stride, vext_u16(d20_lo, d20_hi, 3));
1217 vst1_u16(dst + 3 * stride, vext_u16(d20_lo, d20_hi, 1));
1218 }
1219
vpx_highbd_d153_predictor_8x8_neon(uint16_t * dst,ptrdiff_t stride,const uint16_t * above,const uint16_t * left,int bd)1220 void vpx_highbd_d153_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
1221 const uint16_t *above,
1222 const uint16_t *left, int bd) {
1223 uint16x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d0_rev, d2_rev, d20_lo,
1224 d20_hi;
1225 (void)bd;
1226
1227 az = vld1q_u16(above - 1);
1228 a0 = vld1q_u16(above + 0);
1229 // [ left[0], above[-1], ... , above[5] ]
1230 l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
1231
1232 l0 = vld1q_u16(left);
1233 // The last lane here is unused; reading left[8] could cause a buffer
1234 // over-read, so just fill it with a duplicate of left[0] to avoid needing
1235 // to materialize a zero:
1236 // [ left[1], ... , left[7], x ]
1237 l1 = vextq_u16(l0, l0, 1);
1238 // [ above[-1], left[0], ... , left[6] ]
1239 azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
1240
1241 // d0[0] = AVG2(above[-1], left[0])
1242 // d0[1] = AVG2(left[0], left[1])
1243 // ...
1244 // d0[7] = AVG2(left[6], left[7])
1245 d0 = vrhaddq_u16(azl0, l0);
1246
1247 // d1[0] = AVG3(left[0], above[-1], above[0])
1248 // d1[1] = AVG3(above[-1], above[0], above[1])
1249 // ...
1250 // d1[7] = AVG3(above[5], above[6], above[7])
1251 d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
1252
1253 // d2[0] = AVG3(above[-1], left[0], left[1])
1254 // d2[1] = AVG3(left[0], left[1], left[2])
1255 // ...
1256 // d2[7] = AVG3(left[6], left[7], left[8])
1257 d2 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
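  // AVG2(a, b)    = (a + b + 1) >> 1 maps directly onto vrhaddq_u16.
  // AVG3(a, b, c) = (a + 2 * b + c + 2) >> 2 is split into
  // vrhaddq_u16(vhaddq_u16(a, c), b): the truncating halving add keeps the
  // intermediate within 16 bits, and the truncated bit can never change the
  // final rounded result, so this matches the exact 3-tap average.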
1258
1259 // The ext instruction shifts elements in from the end of the vector rather
1260 // than the start, so reverse the vectors to put the elements to be shifted
1261 // in at the end:
1262 d0_rev = vrev64q_u16(vextq_u16(d0, d0, 4));
1263 d2_rev = vrev64q_u16(vextq_u16(d2, d2, 4));
1264
1265 d20_lo = vzipq_u16(d2_rev, d0_rev).val[0];
1266 d20_hi = vzipq_u16(d2_rev, d0_rev).val[1];
1267
1268 // Incrementally shift more elements from d0/d2 reversed into d1:
1269 // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ]
1270 // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ]
1271 // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ]
1272 // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ]
1273 // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ]
1274 // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ]
1275 // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ]
1276 // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ]
1277 vst1q_u16(dst + 0 * stride, vextq_u16(d20_hi, d1, 7));
1278 vst1q_u16(dst + 1 * stride, vextq_u16(d20_hi, d1, 5));
1279 vst1q_u16(dst + 2 * stride, vextq_u16(d20_hi, d1, 3));
1280 vst1q_u16(dst + 3 * stride, vextq_u16(d20_hi, d1, 1));
1281 vst1q_u16(dst + 4 * stride, vextq_u16(d20_lo, d20_hi, 7));
1282 vst1q_u16(dst + 5 * stride, vextq_u16(d20_lo, d20_hi, 5));
1283 vst1q_u16(dst + 6 * stride, vextq_u16(d20_lo, d20_hi, 3));
1284 vst1q_u16(dst + 7 * stride, vextq_u16(d20_lo, d20_hi, 1));
1285 }
1286
1287 void vpx_highbd_d153_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
1288 const uint16_t *above,
1289 const uint16_t *left, int bd) {
1290 // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
1291 uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, d0[2], d1[2],
1292 d2[2], d20[4];
1293 (void)bd;
1294
1295 az = vld1q_u16(above - 1);
1296 a0 = vld1q_u16(above + 0);
1297 a6 = vld1q_u16(above + 6);
1298 a7 = vld1q_u16(above + 7);
1299 a8 = vld1q_u16(above + 8);
1300 // [ left[0], above[-1], ... , above[5] ]
1301 l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
1302
1303 l0 = vld1q_u16(left + 0);
1304 l1 = vld1q_u16(left + 1);
1305 l7 = vld1q_u16(left + 7);
1306 l8 = vld1q_u16(left + 8);
1307 // The last lane here is unused; reading left[16] could cause a buffer
1308 // over-read, so just fill it with a duplicate of left[8] to avoid needing
1309 // to materialize a zero:
1310 // [ left[9], ... , left[15], x ]
1311 l9 = vextq_u16(l8, l8, 1);
1312 // [ above[-1], left[0], ... , left[6] ]
1313 azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
1314
1315 d0[0] = vrhaddq_u16(azl0, l0);
1316 d0[1] = vrhaddq_u16(l7, l8);
1317 d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
1318 d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
1319 d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
1320 d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
1321
1322 d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4));
1323 d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4));
1324 d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4));
1325 d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4));
1326
1327 d20[0] = vzipq_u16(d2[1], d0[1]).val[0];
1328 d20[1] = vzipq_u16(d2[1], d0[1]).val[1];
1329 d20[2] = vzipq_u16(d2[0], d0[0]).val[0];
1330 d20[3] = vzipq_u16(d2[0], d0[0]).val[1];
1331
1332 vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[3], d1[0], 7));
1333 vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7));
1334 vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[3], d1[0], 5));
1335 vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5));
1336 vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[3], d1[0], 3));
1337 vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3));
1338 vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[3], d1[0], 1));
1339 vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1));
1340
1341 vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[2], d20[3], 7));
1342 vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[3], d1[0], 7));
1343 vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[2], d20[3], 5));
1344 vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[3], d1[0], 5));
1345 vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[2], d20[3], 3));
1346 vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[3], d1[0], 3));
1347 vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[2], d20[3], 1));
1348 vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[3], d1[0], 1));
1349
1350 vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[1], d20[2], 7));
1351 vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[2], d20[3], 7));
1352 vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[1], d20[2], 5));
1353 vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[2], d20[3], 5));
1354 vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[1], d20[2], 3));
1355 vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[2], d20[3], 3));
1356 vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[1], d20[2], 1));
1357 vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[2], d20[3], 1));
1358
1359 vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[0], d20[1], 7));
1360 vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[1], d20[2], 7));
1361 vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[0], d20[1], 5));
1362 vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[1], d20[2], 5));
1363 vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[0], d20[1], 3));
1364 vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[1], d20[2], 3));
1365 vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[0], d20[1], 1));
1366 vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[1], d20[2], 1));
1367 }
1368
1369 void vpx_highbd_d153_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
1370 const uint16_t *above,
1371 const uint16_t *left, int bd) {
1372 // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation.
1373 uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7,
1374 l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], d2[4], d20[8];
1375 (void)bd;
1376
1377 az = vld1q_u16(above - 1);
1378 a0 = vld1q_u16(above + 0);
1379 a6 = vld1q_u16(above + 6);
1380 a7 = vld1q_u16(above + 7);
1381 a8 = vld1q_u16(above + 8);
1382 a14 = vld1q_u16(above + 14);
1383 a15 = vld1q_u16(above + 15);
1384 a16 = vld1q_u16(above + 16);
1385 a22 = vld1q_u16(above + 22);
1386 a23 = vld1q_u16(above + 23);
1387 a24 = vld1q_u16(above + 24);
1388 // [ left[0], above[-1], ... , above[5] ]
1389 l0az = vextq_u16(vld1q_dup_u16(left), az, 7);
1390
1391 l0 = vld1q_u16(left + 0);
1392 l1 = vld1q_u16(left + 1);
1393 l7 = vld1q_u16(left + 7);
1394 l8 = vld1q_u16(left + 8);
1395 l9 = vld1q_u16(left + 9);
1396 l15 = vld1q_u16(left + 15);
1397 l16 = vld1q_u16(left + 16);
1398 l17 = vld1q_u16(left + 17);
1399 l23 = vld1q_u16(left + 23);
1400 l24 = vld1q_u16(left + 24);
1401 // The last lane here is unused; reading left[32] could cause a buffer
1402 // over-read, so just fill it with a duplicate of left[24] to avoid needing
1403 // to materialize a zero:
1404 // [ left[25], ... , left[31], x ]
1405 l25 = vextq_u16(l24, l24, 1);
1406 // [ above[-1], left[0], ... , left[6] ]
1407 azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7);
1408
1409 d0[0] = vrhaddq_u16(azl0, l0);
1410 d0[1] = vrhaddq_u16(l7, l8);
1411 d0[2] = vrhaddq_u16(l15, l16);
1412 d0[3] = vrhaddq_u16(l23, l24);
1413
1414 d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az);
1415 d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7);
1416 d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15);
1417 d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23);
1418
1419 d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0);
1420 d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8);
1421 d2[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16);
1422 d2[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24);
1423
1424 d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4));
1425 d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4));
1426 d0[2] = vrev64q_u16(vextq_u16(d0[2], d0[2], 4));
1427 d0[3] = vrev64q_u16(vextq_u16(d0[3], d0[3], 4));
1428 d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4));
1429 d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4));
1430 d2[2] = vrev64q_u16(vextq_u16(d2[2], d2[2], 4));
1431 d2[3] = vrev64q_u16(vextq_u16(d2[3], d2[3], 4));
1432
1433 d20[0] = vzipq_u16(d2[3], d0[3]).val[0];
1434 d20[1] = vzipq_u16(d2[3], d0[3]).val[1];
1435 d20[2] = vzipq_u16(d2[2], d0[2]).val[0];
1436 d20[3] = vzipq_u16(d2[2], d0[2]).val[1];
1437 d20[4] = vzipq_u16(d2[1], d0[1]).val[0];
1438 d20[5] = vzipq_u16(d2[1], d0[1]).val[1];
1439 d20[6] = vzipq_u16(d2[0], d0[0]).val[0];
1440 d20[7] = vzipq_u16(d2[0], d0[0]).val[1];
1441
1442 vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[7], d1[0], 7));
1443 vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7));
1444 vst1q_u16(dst + 0 * stride + 16, vextq_u16(d1[1], d1[2], 7));
1445 vst1q_u16(dst + 0 * stride + 24, vextq_u16(d1[2], d1[3], 7));
1446 vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[7], d1[0], 5));
1447 vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5));
1448 vst1q_u16(dst + 1 * stride + 16, vextq_u16(d1[1], d1[2], 5));
1449 vst1q_u16(dst + 1 * stride + 24, vextq_u16(d1[2], d1[3], 5));
1450 vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[7], d1[0], 3));
1451 vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3));
1452 vst1q_u16(dst + 2 * stride + 16, vextq_u16(d1[1], d1[2], 3));
1453 vst1q_u16(dst + 2 * stride + 24, vextq_u16(d1[2], d1[3], 3));
1454 vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[7], d1[0], 1));
1455 vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1));
1456 vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 1));
1457 vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 1));
1458
1459 vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[6], d20[7], 7));
1460 vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[7], d1[0], 7));
1461 vst1q_u16(dst + 4 * stride + 16, vextq_u16(d1[0], d1[1], 7));
1462 vst1q_u16(dst + 4 * stride + 24, vextq_u16(d1[1], d1[2], 7));
1463 vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[6], d20[7], 5));
1464 vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[7], d1[0], 5));
1465 vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[0], d1[1], 5));
1466 vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[1], d1[2], 5));
1467 vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[6], d20[7], 3));
1468 vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[7], d1[0], 3));
1469 vst1q_u16(dst + 6 * stride + 16, vextq_u16(d1[0], d1[1], 3));
1470 vst1q_u16(dst + 6 * stride + 24, vextq_u16(d1[1], d1[2], 3));
1471 vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[6], d20[7], 1));
1472 vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[7], d1[0], 1));
1473 vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[0], d1[1], 1));
1474 vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[1], d1[2], 1));
1475
1476 vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[5], d20[6], 7));
1477 vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[6], d20[7], 7));
1478 vst1q_u16(dst + 8 * stride + 16, vextq_u16(d20[7], d1[0], 7));
1479 vst1q_u16(dst + 8 * stride + 24, vextq_u16(d1[0], d1[1], 7));
1480 vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[5], d20[6], 5));
1481 vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[6], d20[7], 5));
1482 vst1q_u16(dst + 9 * stride + 16, vextq_u16(d20[7], d1[0], 5));
1483 vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[0], d1[1], 5));
1484 vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[5], d20[6], 3));
1485 vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[6], d20[7], 3));
1486 vst1q_u16(dst + 10 * stride + 16, vextq_u16(d20[7], d1[0], 3));
1487 vst1q_u16(dst + 10 * stride + 24, vextq_u16(d1[0], d1[1], 3));
1488 vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[5], d20[6], 1));
1489 vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[6], d20[7], 1));
1490 vst1q_u16(dst + 11 * stride + 16, vextq_u16(d20[7], d1[0], 1));
1491 vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[0], d1[1], 1));
1492
1493 vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[4], d20[5], 7));
1494 vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[5], d20[6], 7));
1495 vst1q_u16(dst + 12 * stride + 16, vextq_u16(d20[6], d20[7], 7));
1496 vst1q_u16(dst + 12 * stride + 24, vextq_u16(d20[7], d1[0], 7));
1497 vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[4], d20[5], 5));
1498 vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[5], d20[6], 5));
1499 vst1q_u16(dst + 13 * stride + 16, vextq_u16(d20[6], d20[7], 5));
1500 vst1q_u16(dst + 13 * stride + 24, vextq_u16(d20[7], d1[0], 5));
1501 vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[4], d20[5], 3));
1502 vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[5], d20[6], 3));
1503 vst1q_u16(dst + 14 * stride + 16, vextq_u16(d20[6], d20[7], 3));
1504 vst1q_u16(dst + 14 * stride + 24, vextq_u16(d20[7], d1[0], 3));
1505 vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[4], d20[5], 1));
1506 vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[5], d20[6], 1));
1507 vst1q_u16(dst + 15 * stride + 16, vextq_u16(d20[6], d20[7], 1));
1508 vst1q_u16(dst + 15 * stride + 24, vextq_u16(d20[7], d1[0], 1));
1509
1510 vst1q_u16(dst + 16 * stride + 0, vextq_u16(d20[3], d20[4], 7));
1511 vst1q_u16(dst + 16 * stride + 8, vextq_u16(d20[4], d20[5], 7));
1512 vst1q_u16(dst + 16 * stride + 16, vextq_u16(d20[5], d20[6], 7));
1513 vst1q_u16(dst + 16 * stride + 24, vextq_u16(d20[6], d20[7], 7));
1514 vst1q_u16(dst + 17 * stride + 0, vextq_u16(d20[3], d20[4], 5));
1515 vst1q_u16(dst + 17 * stride + 8, vextq_u16(d20[4], d20[5], 5));
1516 vst1q_u16(dst + 17 * stride + 16, vextq_u16(d20[5], d20[6], 5));
1517 vst1q_u16(dst + 17 * stride + 24, vextq_u16(d20[6], d20[7], 5));
1518 vst1q_u16(dst + 18 * stride + 0, vextq_u16(d20[3], d20[4], 3));
1519 vst1q_u16(dst + 18 * stride + 8, vextq_u16(d20[4], d20[5], 3));
1520 vst1q_u16(dst + 18 * stride + 16, vextq_u16(d20[5], d20[6], 3));
1521 vst1q_u16(dst + 18 * stride + 24, vextq_u16(d20[6], d20[7], 3));
1522 vst1q_u16(dst + 19 * stride + 0, vextq_u16(d20[3], d20[4], 1));
1523 vst1q_u16(dst + 19 * stride + 8, vextq_u16(d20[4], d20[5], 1));
1524 vst1q_u16(dst + 19 * stride + 16, vextq_u16(d20[5], d20[6], 1));
1525 vst1q_u16(dst + 19 * stride + 24, vextq_u16(d20[6], d20[7], 1));
1526
1527 vst1q_u16(dst + 20 * stride + 0, vextq_u16(d20[2], d20[3], 7));
1528 vst1q_u16(dst + 20 * stride + 8, vextq_u16(d20[3], d20[4], 7));
1529 vst1q_u16(dst + 20 * stride + 16, vextq_u16(d20[4], d20[5], 7));
1530 vst1q_u16(dst + 20 * stride + 24, vextq_u16(d20[5], d20[6], 7));
1531 vst1q_u16(dst + 21 * stride + 0, vextq_u16(d20[2], d20[3], 5));
1532 vst1q_u16(dst + 21 * stride + 8, vextq_u16(d20[3], d20[4], 5));
1533 vst1q_u16(dst + 21 * stride + 16, vextq_u16(d20[4], d20[5], 5));
1534 vst1q_u16(dst + 21 * stride + 24, vextq_u16(d20[5], d20[6], 5));
1535 vst1q_u16(dst + 22 * stride + 0, vextq_u16(d20[2], d20[3], 3));
1536 vst1q_u16(dst + 22 * stride + 8, vextq_u16(d20[3], d20[4], 3));
1537 vst1q_u16(dst + 22 * stride + 16, vextq_u16(d20[4], d20[5], 3));
1538 vst1q_u16(dst + 22 * stride + 24, vextq_u16(d20[5], d20[6], 3));
1539 vst1q_u16(dst + 23 * stride + 0, vextq_u16(d20[2], d20[3], 1));
1540 vst1q_u16(dst + 23 * stride + 8, vextq_u16(d20[3], d20[4], 1));
1541 vst1q_u16(dst + 23 * stride + 16, vextq_u16(d20[4], d20[5], 1));
1542 vst1q_u16(dst + 23 * stride + 24, vextq_u16(d20[5], d20[6], 1));
1543
1544 vst1q_u16(dst + 24 * stride + 0, vextq_u16(d20[1], d20[2], 7));
1545 vst1q_u16(dst + 24 * stride + 8, vextq_u16(d20[2], d20[3], 7));
1546 vst1q_u16(dst + 24 * stride + 16, vextq_u16(d20[3], d20[4], 7));
1547 vst1q_u16(dst + 24 * stride + 24, vextq_u16(d20[4], d20[5], 7));
1548 vst1q_u16(dst + 25 * stride + 0, vextq_u16(d20[1], d20[2], 5));
1549 vst1q_u16(dst + 25 * stride + 8, vextq_u16(d20[2], d20[3], 5));
1550 vst1q_u16(dst + 25 * stride + 16, vextq_u16(d20[3], d20[4], 5));
1551 vst1q_u16(dst + 25 * stride + 24, vextq_u16(d20[4], d20[5], 5));
1552 vst1q_u16(dst + 26 * stride + 0, vextq_u16(d20[1], d20[2], 3));
1553 vst1q_u16(dst + 26 * stride + 8, vextq_u16(d20[2], d20[3], 3));
1554 vst1q_u16(dst + 26 * stride + 16, vextq_u16(d20[3], d20[4], 3));
1555 vst1q_u16(dst + 26 * stride + 24, vextq_u16(d20[4], d20[5], 3));
1556 vst1q_u16(dst + 27 * stride + 0, vextq_u16(d20[1], d20[2], 1));
1557 vst1q_u16(dst + 27 * stride + 8, vextq_u16(d20[2], d20[3], 1));
1558 vst1q_u16(dst + 27 * stride + 16, vextq_u16(d20[3], d20[4], 1));
1559 vst1q_u16(dst + 27 * stride + 24, vextq_u16(d20[4], d20[5], 1));
1560
1561 vst1q_u16(dst + 28 * stride + 0, vextq_u16(d20[0], d20[1], 7));
1562 vst1q_u16(dst + 28 * stride + 8, vextq_u16(d20[1], d20[2], 7));
1563 vst1q_u16(dst + 28 * stride + 16, vextq_u16(d20[2], d20[3], 7));
1564 vst1q_u16(dst + 28 * stride + 24, vextq_u16(d20[3], d20[4], 7));
1565 vst1q_u16(dst + 29 * stride + 0, vextq_u16(d20[0], d20[1], 5));
1566 vst1q_u16(dst + 29 * stride + 8, vextq_u16(d20[1], d20[2], 5));
1567 vst1q_u16(dst + 29 * stride + 16, vextq_u16(d20[2], d20[3], 5));
1568 vst1q_u16(dst + 29 * stride + 24, vextq_u16(d20[3], d20[4], 5));
1569 vst1q_u16(dst + 30 * stride + 0, vextq_u16(d20[0], d20[1], 3));
1570 vst1q_u16(dst + 30 * stride + 8, vextq_u16(d20[1], d20[2], 3));
1571 vst1q_u16(dst + 30 * stride + 16, vextq_u16(d20[2], d20[3], 3));
1572 vst1q_u16(dst + 30 * stride + 24, vextq_u16(d20[3], d20[4], 3));
1573 vst1q_u16(dst + 31 * stride + 0, vextq_u16(d20[0], d20[1], 1));
1574 vst1q_u16(dst + 31 * stride + 8, vextq_u16(d20[1], d20[2], 1));
1575 vst1q_u16(dst + 31 * stride + 16, vextq_u16(d20[2], d20[3], 1));
1576 vst1q_u16(dst + 31 * stride + 24, vextq_u16(d20[3], d20[4], 1));
1577 }
1578
1579 // -----------------------------------------------------------------------------
1580
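// D135 prediction (down-and-right diagonal): a single 3-tap filtered border
// running from the bottom of the left column, through above[-1], into the
// above row is computed once; every output row is a 4- or 8-wide window into
// that border, and the window slides one position towards the left-column
// end for each row down the block.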
1581 void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
1582 const uint16_t *above,
1583 const uint16_t *left, int bd) {
1584 const uint16x8_t XA0123___ = vld1q_u16(above - 1);
1585 const uint16x4_t L0123 = vld1_u16(left);
1586 const uint16x4_t L3210 = vrev64_u16(L0123);
1587 const uint16x8_t L____3210 = vcombine_u16(L0123, L3210);
1588 const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___));
1589 const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5);
1590 const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6);
1591 const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_);
1592 const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123);
1593 const uint16x4_t row_0 = vget_low_u16(avg2);
1594 const uint16x4_t row_1 = vget_high_u16(avg2);
1595 const uint16x4_t r0 = vext_u16(row_0, row_1, 3);
1596 const uint16x4_t r1 = vext_u16(row_0, row_1, 2);
1597 const uint16x4_t r2 = vext_u16(row_0, row_1, 1);
1598 (void)bd;
1599 vst1_u16(dst, r0);
1600 dst += stride;
1601 vst1_u16(dst, r1);
1602 dst += stride;
1603 vst1_u16(dst, r2);
1604 dst += stride;
1605 vst1_u16(dst, row_0);
1606 }
1607
1608 void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
1609 const uint16_t *above,
1610 const uint16_t *left, int bd) {
1611 const uint16x8_t XA0123456 = vld1q_u16(above - 1);
1612 const uint16x8_t A01234567 = vld1q_u16(above);
1613 const uint16x8_t A1234567_ = vld1q_u16(above + 1);
1614 const uint16x8_t L01234567 = vld1q_u16(left);
1615 const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
1616 const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
1617 const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
1618 const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
1619 const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
1620 const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0);
1621 const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_);
1622 const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X);
1623 const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567);
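  // Each successive output row is the previous row shifted by one element
  // towards row_0, i.e. an 8-wide window sliding across [ row_0 | row_1 ]
  // from offset 7 down to offset 0.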
1624 const uint16x8_t r0 = vextq_u16(row_0, row_1, 7);
1625 const uint16x8_t r1 = vextq_u16(row_0, row_1, 6);
1626 const uint16x8_t r2 = vextq_u16(row_0, row_1, 5);
1627 const uint16x8_t r3 = vextq_u16(row_0, row_1, 4);
1628 const uint16x8_t r4 = vextq_u16(row_0, row_1, 3);
1629 const uint16x8_t r5 = vextq_u16(row_0, row_1, 2);
1630 const uint16x8_t r6 = vextq_u16(row_0, row_1, 1);
1631 (void)bd;
1632 vst1q_u16(dst, r0);
1633 dst += stride;
1634 vst1q_u16(dst, r1);
1635 dst += stride;
1636 vst1q_u16(dst, r2);
1637 dst += stride;
1638 vst1q_u16(dst, r3);
1639 dst += stride;
1640 vst1q_u16(dst, r4);
1641 dst += stride;
1642 vst1q_u16(dst, r5);
1643 dst += stride;
1644 vst1q_u16(dst, r6);
1645 dst += stride;
1646 vst1q_u16(dst, row_0);
1647 }
1648
1649 static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride,
1650 const uint16x8_t row_0,
1651 const uint16x8_t row_1) {
1652 vst1q_u16(*dst, row_0);
1653 *dst += 8;
1654 vst1q_u16(*dst, row_1);
1655 *dst += stride - 8;
1656 }
1657
1658 void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
1659 const uint16_t *above,
1660 const uint16_t *left, int bd) {
1661 const uint16x8_t L01234567 = vld1q_u16(left);
1662 const uint16x8_t L89abcdef = vld1q_u16(left + 8);
1663 const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567));
1664 const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567));
1665 const uint16x4_t Lba98 = vrev64_u16(vget_low_u16(L89abcdef));
1666 const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef));
1667 const uint16x8_t L76543210 = vcombine_u16(L7654, L3210);
1668 const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98);
1669 const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1);
1670 const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2);
1671 const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876);
1672 const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987);
1673
1674 const uint16x8_t XA0123456 = vld1q_u16(above - 1);
1675 const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1);
1676 const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2);
1677 const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0);
1678 const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X);
1679
1680 const uint16x8_t A01234567 = vld1q_u16(above);
1681 const uint16x8_t A12345678 = vld1q_u16(above + 1);
1682 const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678);
1683 const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567);
1684
1685 const uint16x8_t A789abcde = vld1q_u16(above + 7);
1686 const uint16x8_t A89abcdef = vld1q_u16(above + 8);
1687 const uint16x8_t A9abcdef_ = vld1q_u16(above + 9);
1688 const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_);
1689 const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef);
1690
1691 const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7);
1692 const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7);
1693 const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6);
1694 const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6);
1695 const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5);
1696 const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5);
1697 const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4);
1698 const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4);
1699 const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3);
1700 const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3);
1701 const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2);
1702 const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2);
1703 const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1);
1704 const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1);
1705 const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7);
1706 const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6);
1707 const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5);
1708 const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4);
1709 const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3);
1710 const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2);
1711 const uint16x8_t re_0 = vextq_u16(row_0, row_1, 1);
1712 (void)bd;
1713
1714 d135_store_16(&dst, stride, r0_0, r0_1);
1715 d135_store_16(&dst, stride, r1_0, r1_1);
1716 d135_store_16(&dst, stride, r2_0, r2_1);
1717 d135_store_16(&dst, stride, r3_0, r3_1);
1718 d135_store_16(&dst, stride, r4_0, r4_1);
1719 d135_store_16(&dst, stride, r5_0, r5_1);
1720 d135_store_16(&dst, stride, r6_0, r6_1);
1721 d135_store_16(&dst, stride, row_1, row_2);
1722 d135_store_16(&dst, stride, r8_0, r0_0);
1723 d135_store_16(&dst, stride, r9_0, r1_0);
1724 d135_store_16(&dst, stride, ra_0, r2_0);
1725 d135_store_16(&dst, stride, rb_0, r3_0);
1726 d135_store_16(&dst, stride, rc_0, r4_0);
1727 d135_store_16(&dst, stride, rd_0, r5_0);
1728 d135_store_16(&dst, stride, re_0, r6_0);
1729 vst1q_u16(dst, row_0);
1730 dst += 8;
1731 vst1q_u16(dst, row_1);
1732 }
1733
1734 void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
1735 const uint16_t *above,
1736 const uint16_t *left, int bd) {
1737 const uint16x8_t LL01234567 = vld1q_u16(left + 16);
1738 const uint16x8_t LL89abcdef = vld1q_u16(left + 24);
1739 const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567));
1740 const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567));
1741 const uint16x4_t LLba98 = vrev64_u16(vget_low_u16(LL89abcdef));
1742 const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef));
1743 const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210);
1744 const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98);
1745 const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1);
1746 const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2);
1747 const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876);
1748 uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987);
1749
1750 const uint16x8_t LU01234567 = vld1q_u16(left);
1751 const uint16x8_t LU89abcdef = vld1q_u16(left + 8);
1752 const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567));
1753 const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567));
1754 const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef));
1755 const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef));
1756 const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210);
1757 const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98);
1758 const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1);
1759 const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2);
1760 const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe);
1761 uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf);
1762
1763 const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1);
1764 const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2);
1765 const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876);
1766 uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987);
1767
1768 const uint16x8_t XAL0123456 = vld1q_u16(above - 1);
1769 const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1);
1770 const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2);
1771 const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0);
1772 uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X);
1773
1774 const uint16x8_t AL01234567 = vld1q_u16(above);
1775 const uint16x8_t AL12345678 = vld1q_u16(above + 1);
1776 const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678);
1777 uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567);
1778
1779 const uint16x8_t AL789abcde = vld1q_u16(above + 7);
1780 const uint16x8_t AL89abcdef = vld1q_u16(above + 8);
1781 const uint16x8_t AL9abcdefg = vld1q_u16(above + 9);
1782 const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg);
1783 uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef);
1784
1785 const uint16x8_t ALfR0123456 = vld1q_u16(above + 15);
1786 const uint16x8_t AR01234567 = vld1q_u16(above + 16);
1787 const uint16x8_t AR12345678 = vld1q_u16(above + 17);
1788 const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678);
1789 uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567);
1790
1791 const uint16x8_t AR789abcde = vld1q_u16(above + 23);
1792 const uint16x8_t AR89abcdef = vld1q_u16(above + 24);
1793 const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25);
1794 const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_);
1795 uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef);
1796 int i, j;
1797 (void)bd;
1798
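  // Fill the block from the bottom row upwards: row 31 is the unshifted
  // [ row_0 | row_1 | row_2 | row_3 ] window, and each row above it shifts
  // the 32-wide window one element further along the filtered border,
  // pulling the next element in from row_4 (which is refilled from
  // row_5/row_6/row_7 after every 8 rows).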
1799 dst += 31 * stride;
1800 for (i = 0; i < 4; ++i) {
1801 for (j = 0; j < 8; ++j) {
1802 vst1q_u16(dst, row_0);
1803 dst += 8;
1804 vst1q_u16(dst, row_1);
1805 dst += 8;
1806 vst1q_u16(dst, row_2);
1807 dst += 8;
1808 vst1q_u16(dst, row_3);
1809 dst -= stride + 24;
1810 row_0 = vextq_u16(row_0, row_1, 1);
1811 row_1 = vextq_u16(row_1, row_2, 1);
1812 row_2 = vextq_u16(row_2, row_3, 1);
1813 row_3 = vextq_u16(row_3, row_4, 1);
1814 row_4 = vextq_u16(row_4, row_4, 1);
1815 }
1816 row_4 = row_5;
1817 row_5 = row_6;
1818 row_6 = row_7;
1819 }
1820 }
1821
1822 //------------------------------------------------------------------------------
1823
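// D207 prediction: only the left column is used. c0 holds the 2-tap (AVG2)
// and c1 the 3-tap (AVG3) averages of neighbouring left pixels; the two are
// zipped column-wise and each output row starts one interleaved pair further
// along. Positions past the last left pixel are filled by replicating it.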
1824 void vpx_highbd_d207_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
1825 const uint16_t *above,
1826 const uint16_t *left, int bd) {
1827 uint16x4_t l0, l1, l2, l3, c0, c1, c01_lo, c01_hi;
1828 (void)above;
1829 (void)bd;
1830
1831 l0 = vld1_u16(left + 0);
1832 l3 = vld1_dup_u16(left + 3);
1833
1834 // [ left[1], left[2], left[3], left[3] ]
1835 l1 = vext_u16(l0, l3, 1);
1836 // [ left[2], left[3], left[3], left[3] ]
1837 l2 = vext_u16(l0, l3, 2);
1838
1839 c0 = vrhadd_u16(l0, l1);
1840 c1 = vrhadd_u16(vhadd_u16(l0, l2), l1);
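  // c0[i] = AVG2(left[i], left[i + 1])
  // c1[i] = AVG3(left[i], left[i + 1], left[i + 2])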
1841
1842 c01_lo = vzip_u16(c0, c1).val[0];
1843 c01_hi = vzip_u16(c0, c1).val[1];
1844
1845 // stride=0 [ c0[0], c1[0], c0[1], c1[1] ]
1846 // stride=1 [ c0[1], c1[1], c0[2], c1[2] ]
1847 // stride=2 [ c0[2], c1[2], c0[3], c1[3] ]
1848 // stride=3 [ c0[3], c1[3], left[3], left[3] ]
1849 vst1_u16(dst + 0 * stride, c01_lo);
1850 vst1_u16(dst + 1 * stride, vext_u16(c01_lo, c01_hi, 2));
1851 vst1_u16(dst + 2 * stride, c01_hi);
1852 vst1_u16(dst + 3 * stride, vext_u16(c01_hi, l3, 2));
1853 }
1854
1855 void vpx_highbd_d207_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
1856 const uint16_t *above,
1857 const uint16_t *left, int bd) {
1858 uint16x8_t l0, l1, l2, l7, c0, c1, c01_lo, c01_hi;
1859 (void)above;
1860 (void)bd;
1861
1862 l0 = vld1q_u16(left + 0);
1863 l7 = vld1q_dup_u16(left + 7);
1864
1865 // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ]
1866 l1 = vextq_u16(l0, l7, 1);
1867 // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ]
1868 l2 = vextq_u16(l0, l7, 2);
1869
1870 c0 = vrhaddq_u16(l0, l1);
1871 c1 = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
1872
1873 c01_lo = vzipq_u16(c0, c1).val[0];
1874 c01_hi = vzipq_u16(c0, c1).val[1];
1875
1876 vst1q_u16(dst + 0 * stride, c01_lo);
1877 vst1q_u16(dst + 1 * stride, vextq_u16(c01_lo, c01_hi, 2));
1878 vst1q_u16(dst + 2 * stride, vextq_u16(c01_lo, c01_hi, 4));
1879 vst1q_u16(dst + 3 * stride, vextq_u16(c01_lo, c01_hi, 6));
1880 vst1q_u16(dst + 4 * stride, c01_hi);
1881 vst1q_u16(dst + 5 * stride, vextq_u16(c01_hi, l7, 2));
1882 vst1q_u16(dst + 6 * stride, vextq_u16(c01_hi, l7, 4));
1883 vst1q_u16(dst + 7 * stride, vextq_u16(c01_hi, l7, 6));
1884 }
1885
1886 void vpx_highbd_d207_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
1887 const uint16_t *above,
1888 const uint16_t *left, int bd) {
1889 uint16x8_t l0, l1, l2, l8, l9, l10, l15, c0[2], c1[2], c01[4];
1890 (void)above;
1891 (void)bd;
1892
1893 l0 = vld1q_u16(left + 0);
1894 l1 = vld1q_u16(left + 1);
1895 l2 = vld1q_u16(left + 2);
1896 l8 = vld1q_u16(left + 8);
1897 l15 = vld1q_dup_u16(left + 15);
1898
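  // l9  = [ left[9], ... , left[15], left[15] ]
  // l10 = [ left[10], ... , left[15], left[15], left[15] ]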
1899 l9 = vextq_u16(l8, l15, 1);
1900 l10 = vextq_u16(l8, l15, 2);
1901
1902 c0[0] = vrhaddq_u16(l0, l1);
1903 c0[1] = vrhaddq_u16(l8, l9);
1904 c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
1905 c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9);
1906
1907 c01[0] = vzipq_u16(c0[0], c1[0]).val[0];
1908 c01[1] = vzipq_u16(c0[0], c1[0]).val[1];
1909 c01[2] = vzipq_u16(c0[1], c1[1]).val[0];
1910 c01[3] = vzipq_u16(c0[1], c1[1]).val[1];
1911
1912 vst1q_u16(dst + 0 * stride + 0, c01[0]);
1913 vst1q_u16(dst + 0 * stride + 8, c01[1]);
1914 vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2));
1915 vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2));
1916 vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4));
1917 vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4));
1918 vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6));
1919 vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6));
1920
1921 vst1q_u16(dst + 4 * stride + 0, c01[1]);
1922 vst1q_u16(dst + 4 * stride + 8, c01[2]);
1923 vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2));
1924 vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2));
1925 vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4));
1926 vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4));
1927 vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6));
1928 vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6));
1929
1930 vst1q_u16(dst + 8 * stride + 0, c01[2]);
1931 vst1q_u16(dst + 8 * stride + 8, c01[3]);
1932 vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2));
1933 vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], l15, 2));
1934 vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4));
1935 vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], l15, 4));
1936 vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6));
1937 vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], l15, 6));
1938
1939 vst1q_u16(dst + 12 * stride + 0, c01[3]);
1940 vst1q_u16(dst + 12 * stride + 8, l15);
1941 vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], l15, 2));
1942 vst1q_u16(dst + 13 * stride + 8, l15);
1943 vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], l15, 4));
1944 vst1q_u16(dst + 14 * stride + 8, l15);
1945 vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], l15, 6));
1946 vst1q_u16(dst + 15 * stride + 8, l15);
1947 }
1948
1949 void vpx_highbd_d207_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
1950 const uint16_t *above,
1951 const uint16_t *left, int bd) {
1952 uint16x8_t l0, l1, l2, l8, l9, l10, l16, l17, l18, l24, l25, l26, l31, c0[4],
1953 c1[4], c01[8];
1954 (void)above;
1955 (void)bd;
1956
1957 l0 = vld1q_u16(left + 0);
1958 l1 = vld1q_u16(left + 1);
1959 l2 = vld1q_u16(left + 2);
1960 l8 = vld1q_u16(left + 8);
1961 l9 = vld1q_u16(left + 9);
1962 l10 = vld1q_u16(left + 10);
1963 l16 = vld1q_u16(left + 16);
1964 l17 = vld1q_u16(left + 17);
1965 l18 = vld1q_u16(left + 18);
1966 l24 = vld1q_u16(left + 24);
1967 l31 = vld1q_dup_u16(left + 31);
1968
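  // l25 = [ left[25], ... , left[31], left[31] ]
  // l26 = [ left[26], ... , left[31], left[31], left[31] ]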
1969 l25 = vextq_u16(l24, l31, 1);
1970 l26 = vextq_u16(l24, l31, 2);
1971
1972 c0[0] = vrhaddq_u16(l0, l1);
1973 c0[1] = vrhaddq_u16(l8, l9);
1974 c0[2] = vrhaddq_u16(l16, l17);
1975 c0[3] = vrhaddq_u16(l24, l25);
1976 c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1);
1977 c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9);
1978 c1[2] = vrhaddq_u16(vhaddq_u16(l16, l18), l17);
1979 c1[3] = vrhaddq_u16(vhaddq_u16(l24, l26), l25);
1980
1981 c01[0] = vzipq_u16(c0[0], c1[0]).val[0];
1982 c01[1] = vzipq_u16(c0[0], c1[0]).val[1];
1983 c01[2] = vzipq_u16(c0[1], c1[1]).val[0];
1984 c01[3] = vzipq_u16(c0[1], c1[1]).val[1];
1985 c01[4] = vzipq_u16(c0[2], c1[2]).val[0];
1986 c01[5] = vzipq_u16(c0[2], c1[2]).val[1];
1987 c01[6] = vzipq_u16(c0[3], c1[3]).val[0];
1988 c01[7] = vzipq_u16(c0[3], c1[3]).val[1];
1989
1990 vst1q_u16(dst + 0 * stride + 0, c01[0]);
1991 vst1q_u16(dst + 0 * stride + 8, c01[1]);
1992 vst1q_u16(dst + 0 * stride + 16, c01[2]);
1993 vst1q_u16(dst + 0 * stride + 24, c01[3]);
1994 vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2));
1995 vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2));
1996 vst1q_u16(dst + 1 * stride + 16, vextq_u16(c01[2], c01[3], 2));
1997 vst1q_u16(dst + 1 * stride + 24, vextq_u16(c01[3], c01[4], 2));
1998 vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4));
1999 vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4));
2000 vst1q_u16(dst + 2 * stride + 16, vextq_u16(c01[2], c01[3], 4));
2001 vst1q_u16(dst + 2 * stride + 24, vextq_u16(c01[3], c01[4], 4));
2002 vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6));
2003 vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6));
2004 vst1q_u16(dst + 3 * stride + 16, vextq_u16(c01[2], c01[3], 6));
2005 vst1q_u16(dst + 3 * stride + 24, vextq_u16(c01[3], c01[4], 6));
2006
2007 vst1q_u16(dst + 4 * stride + 0, c01[1]);
2008 vst1q_u16(dst + 4 * stride + 8, c01[2]);
2009 vst1q_u16(dst + 4 * stride + 16, c01[3]);
2010 vst1q_u16(dst + 4 * stride + 24, c01[4]);
2011 vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2));
2012 vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2));
2013 vst1q_u16(dst + 5 * stride + 16, vextq_u16(c01[3], c01[4], 2));
2014 vst1q_u16(dst + 5 * stride + 24, vextq_u16(c01[4], c01[5], 2));
2015 vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4));
2016 vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4));
2017 vst1q_u16(dst + 6 * stride + 16, vextq_u16(c01[3], c01[4], 4));
2018 vst1q_u16(dst + 6 * stride + 24, vextq_u16(c01[4], c01[5], 4));
2019 vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6));
2020 vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6));
2021 vst1q_u16(dst + 7 * stride + 16, vextq_u16(c01[3], c01[4], 6));
2022 vst1q_u16(dst + 7 * stride + 24, vextq_u16(c01[4], c01[5], 6));
2023
2024 vst1q_u16(dst + 8 * stride + 0, c01[2]);
2025 vst1q_u16(dst + 8 * stride + 8, c01[3]);
2026 vst1q_u16(dst + 8 * stride + 16, c01[4]);
2027 vst1q_u16(dst + 8 * stride + 24, c01[5]);
2028 vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2));
2029 vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], c01[4], 2));
2030 vst1q_u16(dst + 9 * stride + 16, vextq_u16(c01[4], c01[5], 2));
2031 vst1q_u16(dst + 9 * stride + 24, vextq_u16(c01[5], c01[6], 2));
2032 vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4));
2033 vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], c01[4], 4));
2034 vst1q_u16(dst + 10 * stride + 16, vextq_u16(c01[4], c01[5], 4));
2035 vst1q_u16(dst + 10 * stride + 24, vextq_u16(c01[5], c01[6], 4));
2036 vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6));
2037 vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], c01[4], 6));
2038 vst1q_u16(dst + 11 * stride + 16, vextq_u16(c01[4], c01[5], 6));
2039 vst1q_u16(dst + 11 * stride + 24, vextq_u16(c01[5], c01[6], 6));
2040
2041 vst1q_u16(dst + 12 * stride + 0, c01[3]);
2042 vst1q_u16(dst + 12 * stride + 8, c01[4]);
2043 vst1q_u16(dst + 12 * stride + 16, c01[5]);
2044 vst1q_u16(dst + 12 * stride + 24, c01[6]);
2045 vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], c01[4], 2));
2046 vst1q_u16(dst + 13 * stride + 8, vextq_u16(c01[4], c01[5], 2));
2047 vst1q_u16(dst + 13 * stride + 16, vextq_u16(c01[5], c01[6], 2));
2048 vst1q_u16(dst + 13 * stride + 24, vextq_u16(c01[6], c01[7], 2));
2049 vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], c01[4], 4));
2050 vst1q_u16(dst + 14 * stride + 8, vextq_u16(c01[4], c01[5], 4));
2051 vst1q_u16(dst + 14 * stride + 16, vextq_u16(c01[5], c01[6], 4));
2052 vst1q_u16(dst + 14 * stride + 24, vextq_u16(c01[6], c01[7], 4));
2053 vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], c01[4], 6));
2054 vst1q_u16(dst + 15 * stride + 8, vextq_u16(c01[4], c01[5], 6));
2055 vst1q_u16(dst + 15 * stride + 16, vextq_u16(c01[5], c01[6], 6));
2056 vst1q_u16(dst + 15 * stride + 24, vextq_u16(c01[6], c01[7], 6));
2057
2058 vst1q_u16(dst + 16 * stride + 0, c01[4]);
2059 vst1q_u16(dst + 16 * stride + 8, c01[5]);
2060 vst1q_u16(dst + 16 * stride + 16, c01[6]);
2061 vst1q_u16(dst + 16 * stride + 24, c01[7]);
2062 vst1q_u16(dst + 17 * stride + 0, vextq_u16(c01[4], c01[5], 2));
2063 vst1q_u16(dst + 17 * stride + 8, vextq_u16(c01[5], c01[6], 2));
2064 vst1q_u16(dst + 17 * stride + 16, vextq_u16(c01[6], c01[7], 2));
2065 vst1q_u16(dst + 17 * stride + 24, vextq_u16(c01[7], l31, 2));
2066 vst1q_u16(dst + 18 * stride + 0, vextq_u16(c01[4], c01[5], 4));
2067 vst1q_u16(dst + 18 * stride + 8, vextq_u16(c01[5], c01[6], 4));
2068 vst1q_u16(dst + 18 * stride + 16, vextq_u16(c01[6], c01[7], 4));
2069 vst1q_u16(dst + 18 * stride + 24, vextq_u16(c01[7], l31, 4));
2070 vst1q_u16(dst + 19 * stride + 0, vextq_u16(c01[4], c01[5], 6));
2071 vst1q_u16(dst + 19 * stride + 8, vextq_u16(c01[5], c01[6], 6));
2072 vst1q_u16(dst + 19 * stride + 16, vextq_u16(c01[6], c01[7], 6));
2073 vst1q_u16(dst + 19 * stride + 24, vextq_u16(c01[7], l31, 6));
2074
2075 vst1q_u16(dst + 20 * stride + 0, c01[5]);
2076 vst1q_u16(dst + 20 * stride + 8, c01[6]);
2077 vst1q_u16(dst + 20 * stride + 16, c01[7]);
2078 vst1q_u16(dst + 20 * stride + 24, l31);
2079 vst1q_u16(dst + 21 * stride + 0, vextq_u16(c01[5], c01[6], 2));
2080 vst1q_u16(dst + 21 * stride + 8, vextq_u16(c01[6], c01[7], 2));
2081 vst1q_u16(dst + 21 * stride + 16, vextq_u16(c01[7], l31, 2));
2082 vst1q_u16(dst + 21 * stride + 24, vextq_u16(l31, l31, 2));
2083 vst1q_u16(dst + 22 * stride + 0, vextq_u16(c01[5], c01[6], 4));
2084 vst1q_u16(dst + 22 * stride + 8, vextq_u16(c01[6], c01[7], 4));
2085 vst1q_u16(dst + 22 * stride + 16, vextq_u16(c01[7], l31, 4));
2086 vst1q_u16(dst + 22 * stride + 24, vextq_u16(l31, l31, 4));
2087 vst1q_u16(dst + 23 * stride + 0, vextq_u16(c01[5], c01[6], 6));
2088 vst1q_u16(dst + 23 * stride + 8, vextq_u16(c01[6], c01[7], 6));
2089 vst1q_u16(dst + 23 * stride + 16, vextq_u16(c01[7], l31, 6));
2090 vst1q_u16(dst + 23 * stride + 24, vextq_u16(l31, l31, 6));
2091
2092 vst1q_u16(dst + 24 * stride + 0, c01[6]);
2093 vst1q_u16(dst + 24 * stride + 8, c01[7]);
2094 vst1q_u16(dst + 24 * stride + 16, l31);
2095 vst1q_u16(dst + 24 * stride + 24, l31);
2096 vst1q_u16(dst + 25 * stride + 0, vextq_u16(c01[6], c01[7], 2));
2097 vst1q_u16(dst + 25 * stride + 8, vextq_u16(c01[7], l31, 2));
2098 vst1q_u16(dst + 25 * stride + 16, vextq_u16(l31, l31, 2));
2099 vst1q_u16(dst + 25 * stride + 24, vextq_u16(l31, l31, 2));
2100 vst1q_u16(dst + 26 * stride + 0, vextq_u16(c01[6], c01[7], 4));
2101 vst1q_u16(dst + 26 * stride + 8, vextq_u16(c01[7], l31, 4));
2102 vst1q_u16(dst + 26 * stride + 16, vextq_u16(l31, l31, 4));
2103 vst1q_u16(dst + 26 * stride + 24, vextq_u16(l31, l31, 4));
2104 vst1q_u16(dst + 27 * stride + 0, vextq_u16(c01[6], c01[7], 6));
2105 vst1q_u16(dst + 27 * stride + 8, vextq_u16(c01[7], l31, 6));
2106 vst1q_u16(dst + 27 * stride + 16, vextq_u16(l31, l31, 6));
2107 vst1q_u16(dst + 27 * stride + 24, vextq_u16(l31, l31, 6));
2108
2109 vst1q_u16(dst + 28 * stride + 0, c01[7]);
2110 vst1q_u16(dst + 28 * stride + 8, l31);
2111 vst1q_u16(dst + 28 * stride + 16, l31);
2112 vst1q_u16(dst + 28 * stride + 24, l31);
2113 vst1q_u16(dst + 29 * stride + 0, vextq_u16(c01[7], l31, 2));
2114 vst1q_u16(dst + 29 * stride + 8, vextq_u16(l31, l31, 2));
2115 vst1q_u16(dst + 29 * stride + 16, vextq_u16(l31, l31, 2));
2116 vst1q_u16(dst + 29 * stride + 24, vextq_u16(l31, l31, 2));
2117 vst1q_u16(dst + 30 * stride + 0, vextq_u16(c01[7], l31, 4));
2118 vst1q_u16(dst + 30 * stride + 8, vextq_u16(l31, l31, 4));
2119 vst1q_u16(dst + 30 * stride + 16, vextq_u16(l31, l31, 4));
2120 vst1q_u16(dst + 30 * stride + 24, vextq_u16(l31, l31, 4));
2121 vst1q_u16(dst + 31 * stride + 0, vextq_u16(c01[7], l31, 6));
2122 vst1q_u16(dst + 31 * stride + 8, vextq_u16(l31, l31, 6));
2123 vst1q_u16(dst + 31 * stride + 16, vextq_u16(l31, l31, 6));
2124 vst1q_u16(dst + 31 * stride + 24, vextq_u16(l31, l31, 6));
2125 }
2126
2127 //------------------------------------------------------------------------------
2128
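// V_PRED: copy the row of pixels directly above the block into every row.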
2129 void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
2130 const uint16_t *above,
2131 const uint16_t *left, int bd) {
2132 const uint16x4_t row = vld1_u16(above);
2133 int i;
2134 (void)left;
2135 (void)bd;
2136
2137 for (i = 0; i < 4; i++, dst += stride) {
2138 vst1_u16(dst, row);
2139 }
2140 }
2141
2142 void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
2143 const uint16_t *above,
2144 const uint16_t *left, int bd) {
2145 const uint16x8_t row = vld1q_u16(above);
2146 int i;
2147 (void)left;
2148 (void)bd;
2149
2150 for (i = 0; i < 8; i++, dst += stride) {
2151 vst1q_u16(dst, row);
2152 }
2153 }
2154
2155 void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
2156 const uint16_t *above,
2157 const uint16_t *left, int bd) {
2158 const uint16x8_t row0 = vld1q_u16(above + 0);
2159 const uint16x8_t row1 = vld1q_u16(above + 8);
2160 int i;
2161 (void)left;
2162 (void)bd;
2163
2164 for (i = 0; i < 16; i++) {
2165 vst1q_u16(dst + 0, row0);
2166 vst1q_u16(dst + 8, row1);
2167 dst += stride;
2168 }
2169 }
2170
2171 void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
2172 const uint16_t *above,
2173 const uint16_t *left, int bd) {
2174 const uint16x8_t row0 = vld1q_u16(above + 0);
2175 const uint16x8_t row1 = vld1q_u16(above + 8);
2176 const uint16x8_t row2 = vld1q_u16(above + 16);
2177 const uint16x8_t row3 = vld1q_u16(above + 24);
2178 int i;
2179 (void)left;
2180 (void)bd;
2181
2182 for (i = 0; i < 32; i++) {
2183 vst1q_u16(dst + 0, row0);
2184 vst1q_u16(dst + 8, row1);
2185 vst1q_u16(dst + 16, row2);
2186 vst1q_u16(dst + 24, row3);
2187 dst += stride;
2188 }
2189 }
2190
2191 // -----------------------------------------------------------------------------
2192
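// H_PRED: replicate each pixel of the left column across its entire row.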
2193 void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
2194 const uint16_t *above,
2195 const uint16_t *left, int bd) {
2196 const uint16x4_t left_u16 = vld1_u16(left);
2197 uint16x4_t row;
2198 (void)above;
2199 (void)bd;
2200
2201 row = vdup_lane_u16(left_u16, 0);
2202 vst1_u16(dst, row);
2203 dst += stride;
2204 row = vdup_lane_u16(left_u16, 1);
2205 vst1_u16(dst, row);
2206 dst += stride;
2207 row = vdup_lane_u16(left_u16, 2);
2208 vst1_u16(dst, row);
2209 dst += stride;
2210 row = vdup_lane_u16(left_u16, 3);
2211 vst1_u16(dst, row);
2212 }
2213
2214 void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
2215 const uint16_t *above,
2216 const uint16_t *left, int bd) {
2217 const uint16x8_t left_u16 = vld1q_u16(left);
2218 const uint16x4_t left_low = vget_low_u16(left_u16);
2219 const uint16x4_t left_high = vget_high_u16(left_u16);
2220 uint16x8_t row;
2221 (void)above;
2222 (void)bd;
2223
2224 row = vdupq_lane_u16(left_low, 0);
2225 vst1q_u16(dst, row);
2226 dst += stride;
2227 row = vdupq_lane_u16(left_low, 1);
2228 vst1q_u16(dst, row);
2229 dst += stride;
2230 row = vdupq_lane_u16(left_low, 2);
2231 vst1q_u16(dst, row);
2232 dst += stride;
2233 row = vdupq_lane_u16(left_low, 3);
2234 vst1q_u16(dst, row);
2235 dst += stride;
2236 row = vdupq_lane_u16(left_high, 0);
2237 vst1q_u16(dst, row);
2238 dst += stride;
2239 row = vdupq_lane_u16(left_high, 1);
2240 vst1q_u16(dst, row);
2241 dst += stride;
2242 row = vdupq_lane_u16(left_high, 2);
2243 vst1q_u16(dst, row);
2244 dst += stride;
2245 row = vdupq_lane_u16(left_high, 3);
2246 vst1q_u16(dst, row);
2247 }
2248
2249 static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride,
2250 const uint16x8_t row) {
2251 // Note: vst1q is faster than vst2q
2252 vst1q_u16(*dst, row);
2253 *dst += 8;
2254 vst1q_u16(*dst, row);
2255 *dst += stride - 8;
2256 }
2257
2258 void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
2259 const uint16_t *above,
2260 const uint16_t *left, int bd) {
2261 int i;
2262 (void)above;
2263 (void)bd;
2264
2265 for (i = 0; i < 2; i++, left += 8) {
2266 const uint16x8_t left_u16q = vld1q_u16(left);
2267 const uint16x4_t left_low = vget_low_u16(left_u16q);
2268 const uint16x4_t left_high = vget_high_u16(left_u16q);
2269 uint16x8_t row;
2270
2271 row = vdupq_lane_u16(left_low, 0);
2272 h_store_16(&dst, stride, row);
2273 row = vdupq_lane_u16(left_low, 1);
2274 h_store_16(&dst, stride, row);
2275 row = vdupq_lane_u16(left_low, 2);
2276 h_store_16(&dst, stride, row);
2277 row = vdupq_lane_u16(left_low, 3);
2278 h_store_16(&dst, stride, row);
2279 row = vdupq_lane_u16(left_high, 0);
2280 h_store_16(&dst, stride, row);
2281 row = vdupq_lane_u16(left_high, 1);
2282 h_store_16(&dst, stride, row);
2283 row = vdupq_lane_u16(left_high, 2);
2284 h_store_16(&dst, stride, row);
2285 row = vdupq_lane_u16(left_high, 3);
2286 h_store_16(&dst, stride, row);
2287 }
2288 }
2289
2290 static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride,
2291 const uint16x8_t row) {
2292 // Note: vst1q is faster than vst2q
2293 vst1q_u16(*dst, row);
2294 *dst += 8;
2295 vst1q_u16(*dst, row);
2296 *dst += 8;
2297 vst1q_u16(*dst, row);
2298 *dst += 8;
2299 vst1q_u16(*dst, row);
2300 *dst += stride - 24;
2301 }
2302
2303 void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
2304 const uint16_t *above,
2305 const uint16_t *left, int bd) {
2306 int i;
2307 (void)above;
2308 (void)bd;
2309
2310 for (i = 0; i < 4; i++, left += 8) {
2311 const uint16x8_t left_u16q = vld1q_u16(left);
2312 const uint16x4_t left_low = vget_low_u16(left_u16q);
2313 const uint16x4_t left_high = vget_high_u16(left_u16q);
2314 uint16x8_t row;
2315
2316 row = vdupq_lane_u16(left_low, 0);
2317 h_store_32(&dst, stride, row);
2318 row = vdupq_lane_u16(left_low, 1);
2319 h_store_32(&dst, stride, row);
2320 row = vdupq_lane_u16(left_low, 2);
2321 h_store_32(&dst, stride, row);
2322 row = vdupq_lane_u16(left_low, 3);
2323 h_store_32(&dst, stride, row);
2324 row = vdupq_lane_u16(left_high, 0);
2325 h_store_32(&dst, stride, row);
2326 row = vdupq_lane_u16(left_high, 1);
2327 h_store_32(&dst, stride, row);
2328 row = vdupq_lane_u16(left_high, 2);
2329 h_store_32(&dst, stride, row);
2330 row = vdupq_lane_u16(left_high, 3);
2331 h_store_32(&dst, stride, row);
2332 }
2333 }
2334
2335 // -----------------------------------------------------------------------------
2336
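// TM (TrueMotion) prediction: pred(r, c) = clip(left[r] + above[c] -
// above[-1], 0, (1 << bd) - 1). The above[c] - above[-1] differences are
// computed once per block; each row then adds its left[r] value and clamps
// the sums to the valid range for the current bit depth.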
2337 void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
2338 const uint16_t *above,
2339 const uint16_t *left, int bd) {
2340 const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
2341 const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
2342 const int16x4_t above_s16d = vld1_s16((const int16_t *)above);
2343 const int16x8_t above_s16 = vcombine_s16(above_s16d, above_s16d);
2344 const int16x4_t left_s16 = vld1_s16((const int16_t *)left);
2345 const int16x8_t sub = vsubq_s16(above_s16, top_left);
2346 int16x8_t sum;
2347 uint16x8_t row;
2348
2349 sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1));
2350 sum = vaddq_s16(sum, sub);
2351 sum = vminq_s16(sum, max);
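  // vqshluq_n_s16(x, 0) saturates negative values to zero while converting
  // the clamped sums to an unsigned vector, completing the clip to
  // [0, (1 << bd) - 1].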
2352 row = vqshluq_n_s16(sum, 0);
2353 vst1_u16(dst, vget_low_u16(row));
2354 dst += stride;
2355 vst1_u16(dst, vget_high_u16(row));
2356 dst += stride;
2357
2358 sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3));
2359 sum = vaddq_s16(sum, sub);
2360 sum = vminq_s16(sum, max);
2361 row = vqshluq_n_s16(sum, 0);
2362 vst1_u16(dst, vget_low_u16(row));
2363 dst += stride;
2364 vst1_u16(dst, vget_high_u16(row));
2365 }
2366
2367 static INLINE void tm_8_kernel(uint16_t **dst, const ptrdiff_t stride,
2368 const int16x8_t left_dup, const int16x8_t sub,
2369 const int16x8_t max) {
2370 uint16x8_t row;
2371 int16x8_t sum = vaddq_s16(left_dup, sub);
2372 sum = vminq_s16(sum, max);
2373 row = vqshluq_n_s16(sum, 0);
2374 vst1q_u16(*dst, row);
2375 *dst += stride;
2376 }
2377
2378 void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
2379 const uint16_t *above,
2380 const uint16_t *left, int bd) {
2381 const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
2382 const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
2383 const int16x8_t above_s16 = vld1q_s16((const int16_t *)above);
2384 const int16x8_t left_s16 = vld1q_s16((const int16_t *)left);
2385 const int16x8_t sub = vsubq_s16(above_s16, top_left);
2386 int16x4_t left_s16d;
2387 int16x8_t left_dup;
2388 int i;
2389
2390 left_s16d = vget_low_s16(left_s16);
2391
2392 for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16)) {
2393 left_dup = vdupq_lane_s16(left_s16d, 0);
2394 tm_8_kernel(&dst, stride, left_dup, sub, max);
2395
2396 left_dup = vdupq_lane_s16(left_s16d, 1);
2397 tm_8_kernel(&dst, stride, left_dup, sub, max);
2398
2399 left_dup = vdupq_lane_s16(left_s16d, 2);
2400 tm_8_kernel(&dst, stride, left_dup, sub, max);
2401
2402 left_dup = vdupq_lane_s16(left_s16d, 3);
2403 tm_8_kernel(&dst, stride, left_dup, sub, max);
2404 }
2405 }
2406
2407 static INLINE void tm_16_kernel(uint16_t **dst, const ptrdiff_t stride,
2408 const int16x8_t left_dup, const int16x8_t sub0,
2409 const int16x8_t sub1, const int16x8_t max) {
2410 uint16x8_t row0, row1;
2411 int16x8_t sum0 = vaddq_s16(left_dup, sub0);
2412 int16x8_t sum1 = vaddq_s16(left_dup, sub1);
2413 sum0 = vminq_s16(sum0, max);
2414 sum1 = vminq_s16(sum1, max);
2415 row0 = vqshluq_n_s16(sum0, 0);
2416 row1 = vqshluq_n_s16(sum1, 0);
2417 vst1q_u16(*dst, row0);
2418 *dst += 8;
2419 vst1q_u16(*dst, row1);
2420 *dst += stride - 8;
2421 }
2422
2423 void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride,
2424 const uint16_t *above,
2425 const uint16_t *left, int bd) {
2426 const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
2427 const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
2428 const int16x8_t above0 = vld1q_s16((const int16_t *)above);
2429 const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
2430 const int16x8_t sub0 = vsubq_s16(above0, top_left);
2431 const int16x8_t sub1 = vsubq_s16(above1, top_left);
2432 int16x8_t left_dup;
2433 int i, j;
2434
2435 for (j = 0; j < 2; j++, left += 8) {
2436 const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
2437 int16x4_t left_s16d = vget_low_s16(left_s16q);
2438 for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) {
2439 left_dup = vdupq_lane_s16(left_s16d, 0);
2440 tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
2441
2442 left_dup = vdupq_lane_s16(left_s16d, 1);
2443 tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
2444
2445 left_dup = vdupq_lane_s16(left_s16d, 2);
2446 tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
2447
2448 left_dup = vdupq_lane_s16(left_s16d, 3);
2449 tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max);
2450 }
2451 }
2452 }
2453
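// Compute and store one 32-pixel row as four 8-lane stores; the final
// increment of (stride - 24) leaves *dst at the start of the next row.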
static INLINE void tm_32_kernel(uint16_t **dst, const ptrdiff_t stride,
                                const int16x8_t left_dup, const int16x8_t sub0,
                                const int16x8_t sub1, const int16x8_t sub2,
                                const int16x8_t sub3, const int16x8_t max) {
  uint16x8_t row0, row1, row2, row3;
  int16x8_t sum0 = vaddq_s16(left_dup, sub0);
  int16x8_t sum1 = vaddq_s16(left_dup, sub1);
  int16x8_t sum2 = vaddq_s16(left_dup, sub2);
  int16x8_t sum3 = vaddq_s16(left_dup, sub3);
  sum0 = vminq_s16(sum0, max);
  sum1 = vminq_s16(sum1, max);
  sum2 = vminq_s16(sum2, max);
  sum3 = vminq_s16(sum3, max);
  row0 = vqshluq_n_s16(sum0, 0);
  row1 = vqshluq_n_s16(sum1, 0);
  row2 = vqshluq_n_s16(sum2, 0);
  row3 = vqshluq_n_s16(sum3, 0);
  vst1q_u16(*dst, row0);
  *dst += 8;
  vst1q_u16(*dst, row1);
  *dst += 8;
  vst1q_u16(*dst, row2);
  *dst += 8;
  vst1q_u16(*dst, row3);
  *dst += stride - 24;
}

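// 32x32: the four (above - top_left) differences are computed once and
// reused for every row; 4 outer iterations * 2 halves * 4 lanes = 32 rows.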
void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const int16x8_t max = vmovq_n_s16((1 << bd) - 1);
  const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1));
  const int16x8_t above0 = vld1q_s16((const int16_t *)above);
  const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8));
  const int16x8_t above2 = vld1q_s16((const int16_t *)(above + 16));
  const int16x8_t above3 = vld1q_s16((const int16_t *)(above + 24));
  const int16x8_t sub0 = vsubq_s16(above0, top_left);
  const int16x8_t sub1 = vsubq_s16(above1, top_left);
  const int16x8_t sub2 = vsubq_s16(above2, top_left);
  const int16x8_t sub3 = vsubq_s16(above3, top_left);
  int16x8_t left_dup;
  int i, j;

  for (i = 0; i < 4; i++, left += 8) {
    const int16x8_t left_s16q = vld1q_s16((const int16_t *)left);
    int16x4_t left_s16d = vget_low_s16(left_s16q);
    for (j = 0; j < 2; j++, left_s16d = vget_high_s16(left_s16q)) {
      left_dup = vdupq_lane_s16(left_s16d, 0);
      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);

      left_dup = vdupq_lane_s16(left_s16d, 1);
      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);

      left_dup = vdupq_lane_s16(left_s16d, 2);
      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);

      left_dup = vdupq_lane_s16(left_s16d, 3);
      tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max);
    }
  }
}