1 /*
2 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #ifndef AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_
13 #define AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_
14
15 #include <arm_neon.h>
16 #include <arm_sve.h>
17
18 #include "aom_dsp/arm/aom_neon_sve_bridge.h"
19 #include "av1/encoder/arm/pickrst_neon.h"
20
21 // Swap each half of the dgd vectors so that we can accumulate the result of
22 // the dot-products directly in the destination matrix.
static inline int16x8x2_t transpose_dgd(int16x8_t dgd0, int16x8_t dgd1) {
  // View each vector as two 64-bit halves and interleave them.
  const int64x2_t a = vreinterpretq_s64_s16(dgd0);
  const int64x2_t b = vreinterpretq_s64_s16(dgd1);

  int16x8x2_t trn;
  trn.val[0] = vreinterpretq_s16_s64(vzip1q_s64(a, b));
  trn.val[1] = vreinterpretq_s16_s64(vzip2q_s64(a, b));
  return trn;
}
31
// Accumulate one row of the cross-correlation matrix M for a 5-tap window:
// column pairs use widening lane dot-products directly into M, the final
// (odd) column is reduced horizontally to a scalar.
static inline void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd[5],
                                          int64_t *M, int row) {
  const int wiener_win = 5;
  int64_t *const m_row = M + row * wiener_win;

  for (int col = 0; col + 1 < wiener_win; col += 2) {
    int64x2_t acc = vld1q_s64(m_row + col);
    const int16x8x2_t dgd_trn = transpose_dgd(dgd[col], dgd[col + 1]);

    acc = aom_svdot_lane_s16(acc, dgd_trn.val[0], src, 0);
    acc = aom_svdot_lane_s16(acc, dgd_trn.val[1], src, 1);
    vst1q_s64(m_row + col, acc);
  }

  // Unpaired last column: full dot-product then horizontal add.
  const int64x2_t m_last = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[4]);
  m_row[4] += vaddvq_s64(m_last);
}
53
// Accumulate one row of the cross-correlation matrix M for a 7-tap window:
// column pairs use widening lane dot-products directly into M, the final
// (odd) column is reduced horizontally to a scalar.
static inline void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd[7],
                                          int64_t *M, int row) {
  const int wiener_win = 7;
  int64_t *const m_row = M + row * wiener_win;

  for (int col = 0; col + 1 < wiener_win; col += 2) {
    int64x2_t acc = vld1q_s64(m_row + col);
    const int16x8x2_t dgd_trn = transpose_dgd(dgd[col], dgd[col + 1]);

    acc = aom_svdot_lane_s16(acc, dgd_trn.val[0], src, 0);
    acc = aom_svdot_lane_s16(acc, dgd_trn.val[1], src, 1);
    vst1q_s64(m_row + col, acc);
  }

  // Unpaired last column: full dot-product then horizontal add.
  const int64x2_t m_last = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[6]);
  m_row[6] += vaddvq_s64(m_last);
}
82
// Accumulate the auto-covariance terms of H for a single window column,
// touching only the upper-triangle pairs (row1 >= row0).
static inline void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H,
                                     const int wiener_win,
                                     const int wiener_win2) {
  const int col_base = col * wiener_win;
  for (int row0 = 0; row0 < wiener_win; row0++) {
    // Base of the H row holding the (col, row0) element's covariances.
    int64_t *const h_row = H + (col_base + row0) * wiener_win2 + col_base;
    for (int row1 = row0; row1 < wiener_win; row1++) {
      const int64x2_t auto_cov =
          aom_sdotq_s16(vdupq_n_s64(0), dgd[row0], dgd[row1]);
      h_row[row1] += vaddvq_s64(auto_cov);
    }
  }
}
96
// Accumulate the H entries coupling window rows row0 and row1 (5-tap case).
// Column pairs of dgd1 use widening lane dot-products; the unpaired fifth
// column is reduced horizontally.
static inline void compute_H_two_rows_win5(int16x8_t *dgd0, int16x8_t *dgd1,
                                           int row0, int row1, int64_t *H) {
  for (int col0 = 0; col0 < 5; col0++) {
    int64_t *const h = H + (row0 * 5 + col0) * 25 + row1 * 5;

    for (int col1 = 0; col1 + 1 < 5; col1 += 2) {
      int64x2_t acc = vld1q_s64(h + col1);
      const int16x8x2_t dgd_trn = transpose_dgd(dgd1[col1], dgd1[col1 + 1]);

      acc = aom_svdot_lane_s16(acc, dgd_trn.val[0], dgd0[col0], 0);
      acc = aom_svdot_lane_s16(acc, dgd_trn.val[1], dgd0[col0], 1);
      vst1q_s64(h + col1, acc);
    }

    const int64x2_t auto_cov =
        aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[4]);
    h[4] += vaddvq_s64(auto_cov);
  }
}
120
// Accumulate the H entries coupling window rows row0 and row1 (7-tap case).
// Column pairs of dgd1 use widening lane dot-products; the unpaired seventh
// column is reduced horizontally.
static inline void compute_H_two_rows_win7(int16x8_t *dgd0, int16x8_t *dgd1,
                                           int row0, int row1, int64_t *H) {
  for (int col0 = 0; col0 < 7; col0++) {
    int64_t *const h = H + (row0 * 7 + col0) * 49 + row1 * 7;

    for (int col1 = 0; col1 + 1 < 7; col1 += 2) {
      int64x2_t acc = vld1q_s64(h + col1);
      const int16x8x2_t dgd_trn = transpose_dgd(dgd1[col1], dgd1[col1 + 1]);

      acc = aom_svdot_lane_s16(acc, dgd_trn.val[0], dgd0[col0], 0);
      acc = aom_svdot_lane_s16(acc, dgd_trn.val[1], dgd0[col0], 1);
      vst1q_s64(h + col1, acc);
    }

    const int64x2_t auto_cov =
        aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[6]);
    h[6] += vaddvq_s64(auto_cov);
  }
}
151
// Accumulate cross-correlation (sum_m) and auto-covariance (sum_h) sums for
// the 5 top-row taps. dgds[2*i] / dgds[2*i+1] hold the low / high 8 lanes of
// tap i.
static inline void stats_top_win5_sve(const int16x8_t src[2],
                                      const int16x8_t dgd[2],
                                      const int16_t *const d,
                                      const int32_t d_stride, int64x2_t *sum_m,
                                      int64x2_t *sum_h) {
  int16x8_t dgds[WIENER_WIN_CHROMA * 2];

  load_s16_8x5(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6],
               &dgds[8]);
  load_s16_8x5(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7],
               &dgds[9]);

  for (int i = 0; i < WIENER_WIN_CHROMA; i++) {
    sum_m[i] = aom_sdotq_s16(sum_m[i], src[0], dgds[2 * i + 0]);
    sum_m[i] = aom_sdotq_s16(sum_m[i], src[1], dgds[2 * i + 1]);
    sum_h[i] = aom_sdotq_s16(sum_h[i], dgd[0], dgds[2 * i + 0]);
    sum_h[i] = aom_sdotq_s16(sum_h[i], dgd[1], dgds[2 * i + 1]);
  }
}
186
// Accumulate the left-edge auto-covariance sums over four taps starting one
// row down. dgds[2*i] / dgds[2*i+1] hold the low / high 8 lanes of tap i.
static inline void stats_left_win5_sve(const int16x8_t src[2], const int16_t *d,
                                       const int32_t d_stride, int64x2_t *sum) {
  int16x8_t dgds[WIN_CHROMA];

  load_s16_8x4(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4],
               &dgds[6]);
  load_s16_8x4(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5],
               &dgds[7]);

  for (int i = 0; i < 4; i++) {
    sum[i] = aom_sdotq_s16(sum[i], src[0], dgds[2 * i + 0]);
    sum[i] = aom_sdotq_s16(sum[i], src[1], dgds[2 * i + 1]);
  }
}
205
// Subtract (via negated dot-products) the outgoing contribution from the
// nine delta accumulators: A[0] against B[0..4], then A[1..4] against B[0].
static inline void sub_deltas_step4_sve(int16x8_t *A, int16x8_t *B,
                                        int64x2_t *deltas) {
  const int16x8_t neg_a0 = vnegq_s16(A[0]);

  for (int i = 0; i < 5; i++) {
    deltas[i] = aom_sdotq_s16(deltas[i], neg_a0, B[i]);
  }
  for (int i = 1; i < 5; i++) {
    deltas[4 + i] = aom_sdotq_s16(deltas[4 + i], vnegq_s16(A[i]), B[0]);
  }
}
218
// Add the incoming contribution to the nine delta accumulators:
// A[0] against B[0..4], then A[1..4] against B[0].
static inline void add_deltas_step4_sve(int16x8_t *A, int16x8_t *B,
                                        int64x2_t *deltas) {
  for (int i = 0; i < 5; i++) {
    deltas[i] = aom_sdotq_s16(deltas[i], A[0], B[i]);
  }
  for (int i = 1; i < 5; i++) {
    deltas[4 + i] = aom_sdotq_s16(deltas[4 + i], A[i], B[0]);
  }
}
231
// Load the four start rows (d_is) and four end rows (d_ie) of di with the
// tail predicates p0/p1 masking the final partial 16-wide column block, and
// the corresponding rows of dj with plain Neon loads.
static inline void load_square_win5_sve(
    const int16_t *const di, const int16_t *const dj, const int32_t d_stride,
    const int32_t height, int16x8_t *d_is, int16x8_t *d_ie, int16x8_t *d_js,
    int16x8_t *d_je, svbool_t p0, svbool_t p1) {
  for (int r = 0; r < 4; r++) {
    d_is[2 * r + 0] = svget_neonq_s16(svld1_s16(p0, di + r * d_stride + 0));
    d_is[2 * r + 1] = svget_neonq_s16(svld1_s16(p1, di + r * d_stride + 8));
    d_ie[2 * r + 0] =
        svget_neonq_s16(svld1_s16(p0, di + (height + r) * d_stride + 0));
    d_ie[2 * r + 1] =
        svget_neonq_s16(svld1_s16(p1, di + (height + r) * d_stride + 8));
  }

  load_s16_8x4(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6]);
  load_s16_8x4(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7]);
  load_s16_8x4(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2],
               &d_je[4], &d_je[6]);
  load_s16_8x4(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3],
               &d_je[5], &d_je[7]);
}
261
// dst[k] = src[k] + delta lanes, for four consecutive 64-bit statistics.
static inline void update_4_stats_sve(const int64_t *const src,
                                      const int64x2_t *delta,
                                      int64_t *const dst) {
  vst1q_s64(dst + 0, vaddq_s64(vld1q_s64(src + 0), delta[0]));
  vst1q_s64(dst + 2, vaddq_s64(vld1q_s64(src + 2), delta[1]));
}
271
// Accumulate the 4x4 square deltas: subtract start-row products and add
// end-row products. Column i is the vector pair d_*[2i] / d_*[2i+1]
// (low / high 8 lanes).
static inline void derive_square_win5_sve(
    int16x8_t *d_is, const int16x8_t *d_ie, const int16x8_t *d_js,
    const int16x8_t *d_je,
    int64x2_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1]) {
  // Negate the start rows in place so the dot-products below subtract them.
  for (int i = 0; i < 8; i++) {
    d_is[i] = vnegq_s16(d_is[i]);
  }

  // Start-row (subtracted) contribution.
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      deltas[i][j] =
          aom_sdotq_s16(deltas[i][j], d_is[2 * i + 0], d_js[2 * j + 0]);
      deltas[i][j] =
          aom_sdotq_s16(deltas[i][j], d_is[2 * i + 1], d_js[2 * j + 1]);
    }
  }

  // End-row (added) contribution.
  for (int i = 0; i < 4; i++) {
    for (int j = 0; j < 4; j++) {
      deltas[i][j] =
          aom_sdotq_s16(deltas[i][j], d_ie[2 * i + 0], d_je[2 * j + 0]);
      deltas[i][j] =
          aom_sdotq_s16(deltas[i][j], d_ie[2 * i + 1], d_je[2 * j + 1]);
    }
  }
}
357
// dst[k] = src[k] + horizontal-sum(deltas[k]) for k = 0..3, using pairwise
// 64-bit adds to fold two accumulators into each output vector.
static inline void hadd_update_4_stats_sve(const int64_t *const src,
                                           const int64x2_t *deltas,
                                           int64_t *const dst) {
  const int64x2_t delta01 = vpaddq_s64(deltas[0], deltas[1]);
  const int64x2_t delta23 = vpaddq_s64(deltas[2], deltas[3]);

  vst1q_s64(dst + 0, vaddq_s64(vld1q_s64(src + 0), delta01));
  vst1q_s64(dst + 2, vaddq_s64(vld1q_s64(src + 2), delta23));
}
366
// Load the four start rows (d_is) and four end rows (d_ie) of di, with the
// tail predicates p0/p1 masking the final partial 16-wide column block.
static inline void load_triangle_win5_sve(const int16_t *const di,
                                          const int32_t d_stride,
                                          const int32_t height, int16x8_t *d_is,
                                          int16x8_t *d_ie, svbool_t p0,
                                          svbool_t p1) {
  for (int r = 0; r < 4; r++) {
    d_is[2 * r + 0] = svget_neonq_s16(svld1_s16(p0, di + r * d_stride + 0));
    d_is[2 * r + 1] = svget_neonq_s16(svld1_s16(p1, di + r * d_stride + 8));
    d_ie[2 * r + 0] =
        svget_neonq_s16(svld1_s16(p0, di + (height + r) * d_stride + 0));
    d_ie[2 * r + 1] =
        svget_neonq_s16(svld1_s16(p1, di + (height + r) * d_stride + 8));
  }
}
389
// Accumulate the triangular deltas: upper-triangle pairs (i, j) with j >= i
// over four columns, packed linearly into deltas[0..9]. Start rows are
// subtracted (negated dot-products), end rows are added. Column i is the
// vector pair d_*[2i] / d_*[2i+1] (low / high 8 lanes).
static inline void derive_triangle_win5_sve(const int16x8_t *d_is,
                                            const int16x8_t *d_ie,
                                            int64x2_t *deltas) {
  int idx = 0;
  for (int i = 0; i < 4; i++) {
    const int16x8_t neg_lo = vnegq_s16(d_is[2 * i + 0]);
    const int16x8_t neg_hi = vnegq_s16(d_is[2 * i + 1]);
    for (int j = i; j < 4; j++) {
      deltas[idx] = aom_sdotq_s16(deltas[idx], neg_lo, d_is[2 * j + 0]);
      deltas[idx] = aom_sdotq_s16(deltas[idx], neg_hi, d_is[2 * j + 1]);
      idx++;
    }
  }

  idx = 0;
  for (int i = 0; i < 4; i++) {
    for (int j = i; j < 4; j++) {
      deltas[idx] =
          aom_sdotq_s16(deltas[idx], d_ie[2 * i + 0], d_ie[2 * j + 0]);
      deltas[idx] =
          aom_sdotq_s16(deltas[idx], d_ie[2 * i + 1], d_ie[2 * j + 1]);
      idx++;
    }
  }
}
435
compute_stats_win5_sve(const int16_t * const d,const int32_t d_stride,const int16_t * const s,const int32_t s_stride,const int32_t width,const int32_t height,int64_t * const M,int64_t * const H)436 static inline void compute_stats_win5_sve(
437 const int16_t *const d, const int32_t d_stride, const int16_t *const s,
438 const int32_t s_stride, const int32_t width, const int32_t height,
439 int64_t *const M, int64_t *const H) {
440 const int32_t wiener_win = WIENER_WIN_CHROMA;
441 const int32_t wiener_win2 = wiener_win * wiener_win;
442 const int32_t h8 = height & ~7;
443 int32_t i, j, x, y;
444
445 // Use a predicate to compute the last columns.
446 svbool_t p0 = svwhilelt_b16_u32(0, width % 16 == 0 ? 16 : width % 16);
447 svbool_t p1 = svwhilelt_b16_u32(8, width % 16 == 0 ? 16 : width % 16);
448
449 // Step 1: Calculate the top edge of the whole matrix, i.e., the top
450 // edge of each triangle and square on the top row.
451 j = 0;
452 do {
453 const int16_t *s_t = s;
454 const int16_t *d_t = d;
455 int64x2_t sum_m[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) };
456 int64x2_t sum_h[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) };
457 int16x8_t src[2], dgd[2];
458
459 y = height;
460 do {
461 x = 0;
462 while (x < width - 16) {
463 src[0] = vld1q_s16(s_t + x + 0);
464 src[1] = vld1q_s16(s_t + x + 8);
465 dgd[0] = vld1q_s16(d_t + x + 0);
466 dgd[1] = vld1q_s16(d_t + x + 8);
467 stats_top_win5_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h);
468 x += 16;
469 }
470
471 src[0] = svget_neonq_s16(svld1_s16(p0, s_t + x + 0));
472 src[1] = svget_neonq_s16(svld1_s16(p1, s_t + x + 8));
473 dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + x + 0));
474 dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + x + 8));
475
476 stats_top_win5_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h);
477
478 s_t += s_stride;
479 d_t += d_stride;
480 } while (--y);
481
482 vst1q_s64(&M[wiener_win * j + 0], vpaddq_s64(sum_m[0], sum_m[1]));
483 vst1q_s64(&M[wiener_win * j + 2], vpaddq_s64(sum_m[2], sum_m[3]));
484 M[wiener_win * j + 4] = vaddvq_s64(sum_m[4]);
485
486 vst1q_s64(&H[wiener_win * j + 0], vpaddq_s64(sum_h[0], sum_h[1]));
487 vst1q_s64(&H[wiener_win * j + 2], vpaddq_s64(sum_h[2], sum_h[3]));
488 H[wiener_win * j + 4] = vaddvq_s64(sum_h[4]);
489 } while (++j < wiener_win);
490
491 // Step 2: Calculate the left edge of each square on the top row.
492 j = 1;
493 do {
494 const int16_t *d_t = d;
495 int64x2_t sum_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s64(0) };
496 int16x8_t dgd[2];
497
498 y = height;
499 do {
500 x = 0;
501 while (x < width - 16) {
502 dgd[0] = vld1q_s16(d_t + j + x + 0);
503 dgd[1] = vld1q_s16(d_t + j + x + 8);
504 stats_left_win5_sve(dgd, d_t + x, d_stride, sum_h);
505 x += 16;
506 }
507
508 dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + j + x + 0));
509 dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + j + x + 8));
510
511 stats_left_win5_sve(dgd, d_t + x, d_stride, sum_h);
512
513 d_t += d_stride;
514 } while (--y);
515
516 int64x2_t sum_h01 = vpaddq_s64(sum_h[0], sum_h[1]);
517 int64x2_t sum_h23 = vpaddq_s64(sum_h[2], sum_h[3]);
518 vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h01));
519 vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h01));
520 vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h23));
521 vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h23));
522
523 } while (++j < wiener_win);
524
525 // Step 3: Derive the top edge of each triangle along the diagonal. No
526 // triangle in top row.
527 {
528 const int16_t *d_t = d;
529
530 if (height % 2) {
531 int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
532 int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
533 int16x8_t ds[WIENER_WIN * 2];
534
535 load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]);
536 load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]);
537 d_t += 4 * d_stride;
538
539 step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas);
540 transpose_arrays_s32_8x8(deltas, deltas_tr);
541
542 update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
543 deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0),
544 H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
545
546 update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
547 deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0),
548 H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
549
550 update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
551 deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0),
552 H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
553
554 update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
555 deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0),
556 H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
557
558 } else {
559 int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) };
560 int16x8_t ds[WIENER_WIN_CHROMA * 2];
561
562 ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
563 ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
564 ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
565 ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
566
567 step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
568
569 transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
570 &deltas[3]);
571
572 update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
573 deltas[0], vgetq_lane_s32(deltas[4], 0),
574 H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
575
576 update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
577 deltas[1], vgetq_lane_s32(deltas[4], 1),
578 H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
579
580 update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
581 deltas[2], vgetq_lane_s32(deltas[4], 2),
582 H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
583
584 update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
585 deltas[3], vgetq_lane_s32(deltas[4], 3),
586 H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
587 }
588 }
589
590 // Step 4: Derive the top and left edge of each square. No square in top and
591 // bottom row.
592 {
593 y = h8;
594
595 int16x4_t d_s[12];
596 int16x4_t d_e[12];
597 const int16_t *d_t = d;
598 int16x4_t zeros = vdup_n_s16(0);
599 load_s16_4x4(d_t, d_stride, &d_s[0], &d_s[1], &d_s[2], &d_s[3]);
600 load_s16_4x4(d_t + width, d_stride, &d_e[0], &d_e[1], &d_e[2], &d_e[3]);
601 int64x2_t deltas[6][18] = { { vdupq_n_s64(0) }, { vdupq_n_s64(0) } };
602
603 while (y >= 8) {
604 load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6],
605 &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]);
606 load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5],
607 &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]);
608
609 int16x8_t s_tr[8], e_tr[8];
610 transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5],
611 d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2],
612 &s_tr[3]);
613 transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros,
614 zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6],
615 &s_tr[7]);
616
617 transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5],
618 d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2],
619 &e_tr[3]);
620 transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros,
621 zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6],
622 &e_tr[7]);
623
624 int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5];
625 start_col0[0] = s_tr[0];
626 start_col0[1] = vextq_s16(s_tr[0], s_tr[4], 1);
627 start_col0[2] = vextq_s16(s_tr[0], s_tr[4], 2);
628 start_col0[3] = vextq_s16(s_tr[0], s_tr[4], 3);
629 start_col0[4] = vextq_s16(s_tr[0], s_tr[4], 4);
630
631 start_col1[0] = s_tr[1];
632 start_col1[1] = vextq_s16(s_tr[1], s_tr[5], 1);
633 start_col1[2] = vextq_s16(s_tr[1], s_tr[5], 2);
634 start_col1[3] = vextq_s16(s_tr[1], s_tr[5], 3);
635 start_col1[4] = vextq_s16(s_tr[1], s_tr[5], 4);
636
637 start_col2[0] = s_tr[2];
638 start_col2[1] = vextq_s16(s_tr[2], s_tr[6], 1);
639 start_col2[2] = vextq_s16(s_tr[2], s_tr[6], 2);
640 start_col2[3] = vextq_s16(s_tr[2], s_tr[6], 3);
641 start_col2[4] = vextq_s16(s_tr[2], s_tr[6], 4);
642
643 start_col3[0] = s_tr[3];
644 start_col3[1] = vextq_s16(s_tr[3], s_tr[7], 1);
645 start_col3[2] = vextq_s16(s_tr[3], s_tr[7], 2);
646 start_col3[3] = vextq_s16(s_tr[3], s_tr[7], 3);
647 start_col3[4] = vextq_s16(s_tr[3], s_tr[7], 4);
648
649 // i = 1, j = 2;
650 sub_deltas_step4_sve(start_col0, start_col1, deltas[0]);
651
652 // i = 1, j = 3;
653 sub_deltas_step4_sve(start_col0, start_col2, deltas[1]);
654
655 // i = 1, j = 4
656 sub_deltas_step4_sve(start_col0, start_col3, deltas[2]);
657
658 // i = 2, j =3
659 sub_deltas_step4_sve(start_col1, start_col2, deltas[3]);
660
661 // i = 2, j = 4
662 sub_deltas_step4_sve(start_col1, start_col3, deltas[4]);
663
664 // i = 3, j = 4
665 sub_deltas_step4_sve(start_col2, start_col3, deltas[5]);
666
667 int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5];
668 end_col0[0] = e_tr[0];
669 end_col0[1] = vextq_s16(e_tr[0], e_tr[4], 1);
670 end_col0[2] = vextq_s16(e_tr[0], e_tr[4], 2);
671 end_col0[3] = vextq_s16(e_tr[0], e_tr[4], 3);
672 end_col0[4] = vextq_s16(e_tr[0], e_tr[4], 4);
673
674 end_col1[0] = e_tr[1];
675 end_col1[1] = vextq_s16(e_tr[1], e_tr[5], 1);
676 end_col1[2] = vextq_s16(e_tr[1], e_tr[5], 2);
677 end_col1[3] = vextq_s16(e_tr[1], e_tr[5], 3);
678 end_col1[4] = vextq_s16(e_tr[1], e_tr[5], 4);
679
680 end_col2[0] = e_tr[2];
681 end_col2[1] = vextq_s16(e_tr[2], e_tr[6], 1);
682 end_col2[2] = vextq_s16(e_tr[2], e_tr[6], 2);
683 end_col2[3] = vextq_s16(e_tr[2], e_tr[6], 3);
684 end_col2[4] = vextq_s16(e_tr[2], e_tr[6], 4);
685
686 end_col3[0] = e_tr[3];
687 end_col3[1] = vextq_s16(e_tr[3], e_tr[7], 1);
688 end_col3[2] = vextq_s16(e_tr[3], e_tr[7], 2);
689 end_col3[3] = vextq_s16(e_tr[3], e_tr[7], 3);
690 end_col3[4] = vextq_s16(e_tr[3], e_tr[7], 4);
691
692 // i = 1, j = 2;
693 add_deltas_step4_sve(end_col0, end_col1, deltas[0]);
694
695 // i = 1, j = 3;
696 add_deltas_step4_sve(end_col0, end_col2, deltas[1]);
697
698 // i = 1, j = 4
699 add_deltas_step4_sve(end_col0, end_col3, deltas[2]);
700
        701 // i = 2, j = 3
702 add_deltas_step4_sve(end_col1, end_col2, deltas[3]);
703
704 // i = 2, j = 4
705 add_deltas_step4_sve(end_col1, end_col3, deltas[4]);
706
707 // i = 3, j = 4
708 add_deltas_step4_sve(end_col2, end_col3, deltas[5]);
709
710 d_s[0] = d_s[8];
711 d_s[1] = d_s[9];
712 d_s[2] = d_s[10];
713 d_s[3] = d_s[11];
714 d_e[0] = d_e[8];
715 d_e[1] = d_e[9];
716 d_e[2] = d_e[10];
717 d_e[3] = d_e[11];
718
719 d_t += 8 * d_stride;
720 y -= 8;
721 }
722
723 if (h8 != height) {
724 const int16x8_t mask_h = vld1q_s16(&mask_16bit[16] - (height % 8));
725
726 load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6],
727 &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]);
728 load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5],
729 &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]);
730 int16x8_t s_tr[8], e_tr[8];
731 transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5],
732 d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2],
733 &s_tr[3]);
734 transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros,
735 zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6],
736 &s_tr[7]);
737 transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5],
738 d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2],
739 &e_tr[3]);
740 transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros,
741 zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6],
742 &e_tr[7]);
743
744 int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5];
745 start_col0[0] = vandq_s16(s_tr[0], mask_h);
746 start_col0[1] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 1), mask_h);
747 start_col0[2] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 2), mask_h);
748 start_col0[3] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 3), mask_h);
749 start_col0[4] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 4), mask_h);
750
751 start_col1[0] = vandq_s16(s_tr[1], mask_h);
752 start_col1[1] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 1), mask_h);
753 start_col1[2] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 2), mask_h);
754 start_col1[3] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 3), mask_h);
755 start_col1[4] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 4), mask_h);
756
757 start_col2[0] = vandq_s16(s_tr[2], mask_h);
758 start_col2[1] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 1), mask_h);
759 start_col2[2] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 2), mask_h);
760 start_col2[3] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 3), mask_h);
761 start_col2[4] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 4), mask_h);
762
763 start_col3[0] = vandq_s16(s_tr[3], mask_h);
764 start_col3[1] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 1), mask_h);
765 start_col3[2] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 2), mask_h);
766 start_col3[3] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 3), mask_h);
767 start_col3[4] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 4), mask_h);
768
769 // i = 1, j = 2;
770 sub_deltas_step4_sve(start_col0, start_col1, deltas[0]);
771
772 // i = 1, j = 3;
773 sub_deltas_step4_sve(start_col0, start_col2, deltas[1]);
774
775 // i = 1, j = 4
776 sub_deltas_step4_sve(start_col0, start_col3, deltas[2]);
777
778 // i = 2, j = 3
779 sub_deltas_step4_sve(start_col1, start_col2, deltas[3]);
780
781 // i = 2, j = 4
782 sub_deltas_step4_sve(start_col1, start_col3, deltas[4]);
783
784 // i = 3, j = 4
785 sub_deltas_step4_sve(start_col2, start_col3, deltas[5]);
786
787 int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5];
788 end_col0[0] = vandq_s16(e_tr[0], mask_h);
789 end_col0[1] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 1), mask_h);
790 end_col0[2] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 2), mask_h);
791 end_col0[3] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 3), mask_h);
792 end_col0[4] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 4), mask_h);
793
794 end_col1[0] = vandq_s16(e_tr[1], mask_h);
795 end_col1[1] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 1), mask_h);
796 end_col1[2] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 2), mask_h);
797 end_col1[3] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 3), mask_h);
798 end_col1[4] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 4), mask_h);
799
800 end_col2[0] = vandq_s16(e_tr[2], mask_h);
801 end_col2[1] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 1), mask_h);
802 end_col2[2] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 2), mask_h);
803 end_col2[3] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 3), mask_h);
804 end_col2[4] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 4), mask_h);
805
806 end_col3[0] = vandq_s16(e_tr[3], mask_h);
807 end_col3[1] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 1), mask_h);
808 end_col3[2] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 2), mask_h);
809 end_col3[3] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 3), mask_h);
810 end_col3[4] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 4), mask_h);
811
812 // i = 1, j = 2;
813 add_deltas_step4_sve(end_col0, end_col1, deltas[0]);
814
815 // i = 1, j = 3;
816 add_deltas_step4_sve(end_col0, end_col2, deltas[1]);
817
818 // i = 1, j = 4
819 add_deltas_step4_sve(end_col0, end_col3, deltas[2]);
820
      821 // i = 2, j = 3
822 add_deltas_step4_sve(end_col1, end_col2, deltas[3]);
823
824 // i = 2, j = 4
825 add_deltas_step4_sve(end_col1, end_col3, deltas[4]);
826
827 // i = 3, j = 4
828 add_deltas_step4_sve(end_col2, end_col3, deltas[5]);
829 }
830
831 int64_t single_delta[6];
832
833 deltas[0][0] = vpaddq_s64(deltas[0][0], deltas[0][1]);
834 deltas[0][1] = vpaddq_s64(deltas[0][2], deltas[0][3]);
835 deltas[1][0] = vpaddq_s64(deltas[1][0], deltas[1][1]);
836 deltas[1][1] = vpaddq_s64(deltas[1][2], deltas[1][3]);
837 deltas[2][0] = vpaddq_s64(deltas[2][0], deltas[2][1]);
838 deltas[2][1] = vpaddq_s64(deltas[2][2], deltas[2][3]);
839 deltas[3][0] = vpaddq_s64(deltas[3][0], deltas[3][1]);
840 deltas[3][1] = vpaddq_s64(deltas[3][2], deltas[3][3]);
841 deltas[4][0] = vpaddq_s64(deltas[4][0], deltas[4][1]);
842 deltas[4][1] = vpaddq_s64(deltas[4][2], deltas[4][3]);
843 deltas[5][0] = vpaddq_s64(deltas[5][0], deltas[5][1]);
844 deltas[5][1] = vpaddq_s64(deltas[5][2], deltas[5][3]);
845
846 deltas[0][5] = vpaddq_s64(deltas[0][5], deltas[0][6]);
847 deltas[0][7] = vpaddq_s64(deltas[0][7], deltas[0][8]);
848 deltas[1][5] = vpaddq_s64(deltas[1][5], deltas[1][6]);
849 deltas[1][7] = vpaddq_s64(deltas[1][7], deltas[1][8]);
850 deltas[2][5] = vpaddq_s64(deltas[2][5], deltas[2][6]);
851 deltas[2][7] = vpaddq_s64(deltas[2][7], deltas[2][8]);
852 deltas[3][5] = vpaddq_s64(deltas[3][5], deltas[3][6]);
853 deltas[3][7] = vpaddq_s64(deltas[3][7], deltas[3][8]);
854 deltas[4][5] = vpaddq_s64(deltas[4][5], deltas[4][6]);
855 deltas[4][7] = vpaddq_s64(deltas[4][7], deltas[4][8]);
856 deltas[5][5] = vpaddq_s64(deltas[5][5], deltas[5][6]);
857 deltas[5][7] = vpaddq_s64(deltas[5][7], deltas[5][8]);
858
859 vst1q_s64(single_delta + 0, vpaddq_s64(deltas[0][4], deltas[1][4]));
860 vst1q_s64(single_delta + 2, vpaddq_s64(deltas[2][4], deltas[3][4]));
861 vst1q_s64(single_delta + 4, vpaddq_s64(deltas[4][4], deltas[5][4]));
862
863 int idx = 0;
864 for (i = 1; i < wiener_win - 1; i++) {
865 for (j = i + 1; j < wiener_win; j++) {
866 update_4_stats_sve(
867 H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win,
868 deltas[idx], H + i * wiener_win * wiener_win2 + j * wiener_win);
869 H[i * wiener_win * wiener_win2 + j * wiener_win + 4] =
870 H[(i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win + 4] +
871 single_delta[idx];
872
873 H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
874 H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] +
875 vgetq_lane_s64(deltas[idx][5], 0);
876 H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
877 H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] +
878 vgetq_lane_s64(deltas[idx][5], 1);
879 H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
880 H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] +
881 vgetq_lane_s64(deltas[idx][7], 0);
882 H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
883 H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] +
884 vgetq_lane_s64(deltas[idx][7], 1);
885
886 idx++;
887 }
888 }
889 }
890
891 // Step 5: Derive other points of each square. No square in bottom row.
892 i = 0;
893 do {
894 const int16_t *const di = d + i;
895
896 j = i + 1;
897 do {
898 const int16_t *const dj = d + j;
899 int64x2_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1] = {
900 { vdupq_n_s64(0) }, { vdupq_n_s64(0) }
901 };
902 int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];
903 int16x8_t d_js[WIN_CHROMA], d_je[WIN_CHROMA];
904
905 x = 0;
906 while (x < width - 16) {
907 load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
908 d_js, d_je);
909 derive_square_win5_sve(d_is, d_ie, d_js, d_je, deltas);
910 x += 16;
911 }
912
913 load_square_win5_sve(di + x, dj + x, d_stride, height, d_is, d_ie, d_js,
914 d_je, p0, p1);
915 derive_square_win5_sve(d_is, d_ie, d_js, d_je, deltas);
916
917 hadd_update_4_stats_sve(
918 H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0],
919 H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
920 hadd_update_4_stats_sve(
921 H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1],
922 H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
923 hadd_update_4_stats_sve(
924 H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2],
925 H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1);
926 hadd_update_4_stats_sve(
927 H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3],
928 H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1);
929 } while (++j < wiener_win);
930 } while (++i < wiener_win - 1);
931
932 // Step 6: Derive other points of each upper triangle along the diagonal.
933 i = 0;
934 do {
935 const int16_t *const di = d + i;
936 int64x2_t deltas[WIENER_WIN_CHROMA * 2 + 1] = { vdupq_n_s64(0) };
937 int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];
938
939 x = 0;
940 while (x < width - 16) {
941 load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie);
942 derive_triangle_win5_sve(d_is, d_ie, deltas);
943 x += 16;
944 }
945
946 load_triangle_win5_sve(di + x, d_stride, height, d_is, d_ie, p0, p1);
947 derive_triangle_win5_sve(d_is, d_ie, deltas);
948
949 // Row 1: 4 points
950 hadd_update_4_stats_sve(
951 H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas,
952 H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
953
954 // Row 2: 3 points
955 int64x2_t src0 =
956 vld1q_s64(H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
957 vst1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2,
958 vaddq_s64(src0, vpaddq_s64(deltas[4], deltas[5])));
959
960 int64x2_t deltas69 = vpaddq_s64(deltas[6], deltas[9]);
961
962 H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 4] =
963 H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 3] +
964 vgetq_lane_s64(deltas69, 0);
965
966 // Row 3: 2 points
967 int64x2_t src1 =
968 vld1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2);
969 vst1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3,
970 vaddq_s64(src1, vpaddq_s64(deltas[7], deltas[8])));
971
972 // Row 4: 1 point
973 H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] =
974 H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3] +
975 vgetq_lane_s64(deltas69, 1);
976 } while (++i < wiener_win);
977 }
978
// Accumulate one column of M (src x dgd dot-products) and one column of H
// (dgd x dgd dot-products) for all 7 rows of a 7x7 Wiener window.
// Each window row spans 16 samples, held as two 8-lane registers, so every
// accumulator takes two dot-product steps.
static inline void stats_top_win7_sve(const int16x8_t src[2],
                                      const int16x8_t dgd[2],
                                      const int16_t *const d,
                                      const int32_t d_stride, int64x2_t *sum_m,
                                      int64x2_t *sum_h) {
  // rows[2*r + 0] holds columns [0, 8) of window row r; rows[2*r + 1] holds
  // columns [8, 16).
  int16x8_t rows[WIENER_WIN * 2];

  load_s16_8x7(d + 0, d_stride, &rows[0], &rows[2], &rows[4], &rows[6],
               &rows[8], &rows[10], &rows[12]);
  load_s16_8x7(d + 8, d_stride, &rows[1], &rows[3], &rows[5], &rows[7],
               &rows[9], &rows[11], &rows[13]);

  for (int r = 0; r < WIENER_WIN; r++) {
    sum_m[r] = aom_sdotq_s16(sum_m[r], src[0], rows[2 * r + 0]);
    sum_m[r] = aom_sdotq_s16(sum_m[r], src[1], rows[2 * r + 1]);
  }

  for (int r = 0; r < WIENER_WIN; r++) {
    sum_h[r] = aom_sdotq_s16(sum_h[r], dgd[0], rows[2 * r + 0]);
    sum_h[r] = aom_sdotq_s16(sum_h[r], dgd[1], rows[2 * r + 1]);
  }
}
1021
// Accumulate src x dgd dot-products for rows 1..6 of the window (row 0 is
// handled by the "top" helper). Each row spans 16 samples in two registers.
static inline void stats_left_win7_sve(const int16x8_t src[2], const int16_t *d,
                                       const int32_t d_stride, int64x2_t *sum) {
  // rows[2*r + 0] / rows[2*r + 1] hold columns [0, 8) / [8, 16) of row r + 1.
  int16x8_t rows[WIN_7];

  load_s16_8x6(d + d_stride + 0, d_stride, &rows[0], &rows[2], &rows[4],
               &rows[6], &rows[8], &rows[10]);
  load_s16_8x6(d + d_stride + 8, d_stride, &rows[1], &rows[3], &rows[5],
               &rows[7], &rows[9], &rows[11]);

  for (int r = 0; r < WIN_7 / 2; r++) {
    sum[r] = aom_sdotq_s16(sum[r], src[0], rows[2 * r + 0]);
    sum[r] = aom_sdotq_s16(sum[r], src[1], rows[2 * r + 1]);
  }
}
1044
// Load the 6 window-start rows (d_is) and 6 window-end rows, `height` rows
// below (d_ie), of the i column using predicated SVE loads, plus the matching
// j-column rows (d_js/d_je) with regular Neon loads. p0 predicates lanes
// [0, 8) and p1 lanes [8, 16); inactive lanes read as zero.
static inline void load_square_win7_sve(
    const int16_t *const di, const int16_t *const dj, const int32_t d_stride,
    const int32_t height, int16x8_t *d_is, int16x8_t *d_ie, int16x8_t *d_js,
    int16x8_t *d_je, svbool_t p0, svbool_t p1) {
  for (int r = 0; r < 6; r++) {
    d_is[2 * r + 0] = svget_neonq_s16(svld1_s16(p0, di + r * d_stride + 0));
    d_is[2 * r + 1] = svget_neonq_s16(svld1_s16(p1, di + r * d_stride + 8));
    d_ie[2 * r + 0] =
        svget_neonq_s16(svld1_s16(p0, di + (height + r) * d_stride + 0));
    d_ie[2 * r + 1] =
        svget_neonq_s16(svld1_s16(p1, di + (height + r) * d_stride + 8));
  }

  // j-column rows are loaded without predication.
  load_s16_8x6(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6],
               &d_js[8], &d_js[10]);
  load_s16_8x6(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7],
               &d_js[9], &d_js[11]);
  load_s16_8x6(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2],
               &d_je[4], &d_je[6], &d_je[8], &d_je[10]);
  load_s16_8x6(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3],
               &d_je[5], &d_je[7], &d_je[9], &d_je[11]);
}
1084
// Accumulate deltas[i][j] += d_ie(i) . d_je(j) - d_is(i) . d_js(j) for the
// 6x6 grid of (i, j) row pairs. The subtraction is realised by negating the
// start samples once up front so every term is a plain accumulating
// dot-product. NOTE: the caller's d_is[] array is clobbered (negated).
static inline void derive_square_win7_sve(int16x8_t *d_is,
                                          const int16x8_t *d_ie,
                                          const int16x8_t *d_js,
                                          const int16x8_t *d_je,
                                          int64x2_t deltas[][WIN_7]) {
  for (int k = 0; k < 12; k++) {
    d_is[k] = vnegq_s16(d_is[k]);
  }

  // Each (i, j) pair needs two dot-products to cover the 16-sample rows:
  // index 2*k + 0 holds lanes [0, 8), index 2*k + 1 holds lanes [8, 16).
  for (int i = 0; i < 6; i++) {
    for (int j = 0; j < 6; j++) {
      deltas[i][j] =
          aom_sdotq_s16(deltas[i][j], d_is[2 * i + 0], d_js[2 * j + 0]);
      deltas[i][j] =
          aom_sdotq_s16(deltas[i][j], d_is[2 * i + 1], d_js[2 * j + 1]);
    }
  }

  for (int i = 0; i < 6; i++) {
    for (int j = 0; j < 6; j++) {
      deltas[i][j] =
          aom_sdotq_s16(deltas[i][j], d_ie[2 * i + 0], d_je[2 * j + 0]);
      deltas[i][j] =
          aom_sdotq_s16(deltas[i][j], d_ie[2 * i + 1], d_je[2 * j + 1]);
    }
  }
}
1259
// Reduce six int64x2_t delta accumulators to scalars via pairwise adds and
// store dst[k] = src[k] + reduced_delta[k] for k in [0, 6).
static inline void hadd_update_6_stats_sve(const int64_t *const src,
                                           const int64x2_t *deltas,
                                           int64_t *const dst) {
  // Each vpaddq folds two accumulators: lane 0 <- sum(deltas[2k]),
  // lane 1 <- sum(deltas[2k + 1]).
  int64x2_t d01 = vpaddq_s64(deltas[0], deltas[1]);
  int64x2_t d23 = vpaddq_s64(deltas[2], deltas[3]);
  int64x2_t d45 = vpaddq_s64(deltas[4], deltas[5]);

  vst1q_s64(dst + 0, vaddq_s64(vld1q_s64(src + 0), d01));
  vst1q_s64(dst + 2, vaddq_s64(vld1q_s64(src + 2), d23));
  vst1q_s64(dst + 4, vaddq_s64(vld1q_s64(src + 4), d45));
}
1275
// Load the 6 window-start rows (d_is) and the 6 window-end rows, `height`
// rows below (d_ie), as pairs of 8-lane registers per 16-sample row.
// p0 predicates lanes [0, 8) and p1 lanes [8, 16); inactive lanes read as
// zero, masking samples beyond the block width.
static inline void load_triangle_win7_sve(const int16_t *const di,
                                          const int32_t d_stride,
                                          const int32_t height, int16x8_t *d_is,
                                          int16x8_t *d_ie, svbool_t p0,
                                          svbool_t p1) {
  for (int r = 0; r < 6; r++) {
    d_is[2 * r + 0] = svget_neonq_s16(svld1_s16(p0, di + r * d_stride + 0));
    d_is[2 * r + 1] = svget_neonq_s16(svld1_s16(p1, di + r * d_stride + 8));
    d_ie[2 * r + 0] =
        svget_neonq_s16(svld1_s16(p0, di + (height + r) * d_stride + 0));
    d_ie[2 * r + 1] =
        svget_neonq_s16(svld1_s16(p1, di + (height + r) * d_stride + 8));
  }
}
1307
// Accumulate the 21 upper-triangle delta terms for a 7x7 Wiener window.
// For each (row, col) pair with col >= row, subtract the dot-product of the
// window-start rows (d_is) and add the dot-product of the window-end rows
// (d_ie). Each logical row spans two 8-lane vectors, stored at indices
// 2 * row and 2 * row + 1. The deltas are laid out triangularly:
// row 0 -> deltas[0..5], row 1 -> deltas[6..10], ..., row 5 -> deltas[20].
static inline void derive_triangle_win7_sve(const int16x8_t *d_is,
                                            const int16x8_t *d_ie,
                                            int64x2_t *deltas) {
  // Subtract the contribution of the samples leaving the window.
  int idx = 0;
  for (int row = 0; row < WIENER_WIN - 1; ++row) {
    for (int col = row; col < WIENER_WIN - 1; ++col) {
      deltas[idx] =
          aom_sdotq_s16(deltas[idx], vnegq_s16(d_is[2 * row]), d_is[2 * col]);
      deltas[idx] = aom_sdotq_s16(deltas[idx], vnegq_s16(d_is[2 * row + 1]),
                                  d_is[2 * col + 1]);
      ++idx;
    }
  }

  // Add the contribution of the samples entering the window.
  idx = 0;
  for (int row = 0; row < WIENER_WIN - 1; ++row) {
    for (int col = row; col < WIENER_WIN - 1; ++col) {
      deltas[idx] = aom_sdotq_s16(deltas[idx], d_ie[2 * row], d_ie[2 * col]);
      deltas[idx] =
          aom_sdotq_s16(deltas[idx], d_ie[2 * row + 1], d_ie[2 * col + 1]);
      ++idx;
    }
  }
}
1407
// Computes the statistics needed to derive 7x7 Wiener filter coefficients:
// the cross-correlation accumulators M (between `s` and the shifted windows
// of `d`) and the auto-correlation accumulators H of `d`. Only the leading
// rows/columns of each triangle/square of H are computed directly (Steps
// 1-2); the remaining entries are derived incrementally from their up-left
// neighbour plus a "delta" term that depends only on the samples entering
// and leaving the sliding window (Steps 3-6).
static inline void compute_stats_win7_sve(
    const int16_t *const d, const int32_t d_stride, const int16_t *const s,
    const int32_t s_stride, const int32_t width, const int32_t height,
    int64_t *const M, int64_t *const H) {
  const int32_t wiener_win = WIENER_WIN;
  const int32_t wiener_win2 = wiener_win * wiener_win;
  // Height rounded down to a multiple of 8 (Step 4's row-unroll factor).
  const int32_t h8 = height & ~7;
  int32_t i, j, x, y;

  // Use a predicate to compute the last columns.
  // p0 covers lanes [0, w16) and p1 lanes [8, w16) of the final 16-wide
  // column block, where w16 = width % 16 (treated as 16 when width is an
  // exact multiple of 16). Inactive lanes load as zero, so they do not
  // perturb the dot-product accumulations.
  svbool_t p0 = svwhilelt_b16_u32(0, width % 16 == 0 ? 16 : width % 16);
  svbool_t p1 = svwhilelt_b16_u32(8, width % 16 == 0 ? 16 : width % 16);

  // Step 1: Calculate the top edge of the whole matrix, i.e., the top
  // edge of each triangle and square on the top row.
  j = 0;
  do {
    const int16_t *s_t = s;
    const int16_t *d_t = d;
    int64x2_t sum_m[WIENER_WIN] = { vdupq_n_s64(0) };
    int64x2_t sum_h[WIENER_WIN] = { vdupq_n_s64(0) };
    int16x8_t src[2], dgd[2];

    y = height;
    do {
      // Full 16-wide column blocks with plain Neon loads.
      x = 0;
      while (x < width - 16) {
        src[0] = vld1q_s16(s_t + x + 0);
        src[1] = vld1q_s16(s_t + x + 8);
        dgd[0] = vld1q_s16(d_t + x + 0);
        dgd[1] = vld1q_s16(d_t + x + 8);
        stats_top_win7_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h);
        x += 16;
      }

      // Last (possibly partial) block via predicated loads.
      src[0] = svget_neonq_s16(svld1_s16(p0, s_t + x + 0));
      src[1] = svget_neonq_s16(svld1_s16(p1, s_t + x + 8));
      dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + x + 0));
      dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + x + 8));
      stats_top_win7_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h);

      s_t += s_stride;
      d_t += d_stride;
    } while (--y);

    // Pairwise-add the 2-lane accumulators down to scalars: one row of M
    // and the corresponding top row of H (7 taps each).
    vst1q_s64(M + wiener_win * j + 0, vpaddq_s64(sum_m[0], sum_m[1]));
    vst1q_s64(M + wiener_win * j + 2, vpaddq_s64(sum_m[2], sum_m[3]));
    vst1q_s64(M + wiener_win * j + 4, vpaddq_s64(sum_m[4], sum_m[5]));
    M[wiener_win * j + 6] = vaddvq_s64(sum_m[6]);

    vst1q_s64(H + wiener_win * j + 0, vpaddq_s64(sum_h[0], sum_h[1]));
    vst1q_s64(H + wiener_win * j + 2, vpaddq_s64(sum_h[2], sum_h[3]));
    vst1q_s64(H + wiener_win * j + 4, vpaddq_s64(sum_h[4], sum_h[5]));
    H[wiener_win * j + 6] = vaddvq_s64(sum_h[6]);
  } while (++j < wiener_win);

  // Step 2: Calculate the left edge of each square on the top row.
  j = 1;
  do {
    const int16_t *d_t = d;
    int64x2_t sum_h[WIENER_WIN - 1] = { vdupq_n_s64(0) };
    int16x8_t dgd[2];

    y = height;
    do {
      x = 0;
      while (x < width - 16) {
        dgd[0] = vld1q_s16(d_t + j + x + 0);
        dgd[1] = vld1q_s16(d_t + j + x + 8);
        stats_left_win7_sve(dgd, d_t + x, d_stride, sum_h);
        x += 16;
      }

      // Last (possibly partial) block via predicated loads.
      dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + j + x + 0));
      dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + j + x + 8));
      stats_left_win7_sve(dgd, d_t + x, d_stride, sum_h);

      d_t += d_stride;
    } while (--y);

    // Scatter the six reduced sums down the left edge (stride wiener_win2
    // between consecutive rows of H).
    int64x2_t sum_h01 = vpaddq_s64(sum_h[0], sum_h[1]);
    int64x2_t sum_h23 = vpaddq_s64(sum_h[2], sum_h[3]);
    int64x2_t sum_h45 = vpaddq_s64(sum_h[4], sum_h[5]);
    vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h01));
    vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h01));
    vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h23));
    vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h23));
    vst1_s64(&H[5 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h45));
    vst1_s64(&H[6 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h45));
  } while (++j < wiener_win);

  // Step 3: Derive the top edge of each triangle along the diagonal. No
  // triangle in top row.
  {
    const int16_t *d_t = d;
    // Pad to call transpose function.
    int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
    int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
    int16x8_t ds[WIENER_WIN * 2];

    // Interleave window-start (column 0) and window-end (column `width`)
    // rows: even ds indices hold starts, odd indices hold ends.
    load_s16_8x6(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6], &ds[8],
                 &ds[10]);
    load_s16_8x6(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7], &ds[9],
                 &ds[11]);

    d_t += 6 * d_stride;

    step3_win7_neon(d_t, d_stride, width, height, ds, deltas);
    transpose_arrays_s32_8x8(deltas, deltas_tr);

    // Each diagonal entry is its up-left neighbour plus the transposed
    // delta column.
    update_8_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
                        deltas_tr[0], deltas_tr[4],
                        H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
    update_8_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
                        deltas_tr[1], deltas_tr[5],
                        H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
    update_8_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
                        deltas_tr[2], deltas_tr[6],
                        H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
    update_8_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
                        deltas_tr[3], deltas_tr[7],
                        H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
    update_8_stats_neon(H + 4 * wiener_win * wiener_win2 + 4 * wiener_win,
                        deltas_tr[8], deltas_tr[12],
                        H + 5 * wiener_win * wiener_win2 + 5 * wiener_win);
    update_8_stats_neon(H + 5 * wiener_win * wiener_win2 + 5 * wiener_win,
                        deltas_tr[9], deltas_tr[13],
                        H + 6 * wiener_win * wiener_win2 + 6 * wiener_win);
  }

  // Step 4: Derive the top and left edge of each square. No square in top and
  // bottom row.

  i = 1;
  do {
    j = i + 1;
    do {
      const int16_t *di = d + i - 1;
      const int16_t *dj = d + j - 1;
      int64x2_t deltas[(2 * WIENER_WIN - 1) * 2] = { vdupq_n_s64(0) };
      int16x8_t dd[WIENER_WIN * 2], ds[WIENER_WIN * 2];

      dd[5] = vdupq_n_s16(0);  // Initialize to avoid warning.
      // First 6 rows of the leading column of each window; lanes 6 and 7 are
      // filled inside the unrolled loop below. Even dd/ds indices hold
      // column-0 (window-start) samples, odd indices column-`width`
      // (window-end) samples.
      const int16_t dd0_values[] = { di[0 * d_stride],
                                     di[1 * d_stride],
                                     di[2 * d_stride],
                                     di[3 * d_stride],
                                     di[4 * d_stride],
                                     di[5 * d_stride],
                                     0,
                                     0 };
      dd[0] = vld1q_s16(dd0_values);
      const int16_t dd1_values[] = { di[0 * d_stride + width],
                                     di[1 * d_stride + width],
                                     di[2 * d_stride + width],
                                     di[3 * d_stride + width],
                                     di[4 * d_stride + width],
                                     di[5 * d_stride + width],
                                     0,
                                     0 };
      dd[1] = vld1q_s16(dd1_values);
      const int16_t ds0_values[] = { dj[0 * d_stride],
                                     dj[1 * d_stride],
                                     dj[2 * d_stride],
                                     dj[3 * d_stride],
                                     dj[4 * d_stride],
                                     dj[5 * d_stride],
                                     0,
                                     0 };
      ds[0] = vld1q_s16(ds0_values);
      int16_t ds1_values[] = { dj[0 * d_stride + width],
                               dj[1 * d_stride + width],
                               dj[2 * d_stride + width],
                               dj[3 * d_stride + width],
                               dj[4 * d_stride + width],
                               dj[5 * d_stride + width],
                               0,
                               0 };
      ds[1] = vld1q_s16(ds1_values);

      // Process 8 rows per iteration.
      y = 0;
      while (y < h8) {
        // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e
        dd[0] = vsetq_lane_s16(di[6 * d_stride], dd[0], 6);
        dd[0] = vsetq_lane_s16(di[7 * d_stride], dd[0], 7);
        dd[1] = vsetq_lane_s16(di[6 * d_stride + width], dd[1], 6);
        dd[1] = vsetq_lane_s16(di[7 * d_stride + width], dd[1], 7);

        // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e
        // 01s 11s 21s 31s 41s 51s 61s 71s 01e 11e 21e 31e 41e 51e 61e 71e
        ds[0] = vsetq_lane_s16(dj[6 * d_stride], ds[0], 6);
        ds[0] = vsetq_lane_s16(dj[7 * d_stride], ds[0], 7);
        ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 6);
        ds[1] = vsetq_lane_s16(dj[7 * d_stride + width], ds[1], 7);

        // Build the shifted-by-one-row copies dd[2..13] / ds[2..13] used to
        // correlate against the other column's base row.
        load_more_16_neon(di + 8 * d_stride, width, &dd[0], &dd[2]);
        load_more_16_neon(dj + 8 * d_stride, width, &ds[0], &ds[2]);
        load_more_16_neon(di + 9 * d_stride, width, &dd[2], &dd[4]);
        load_more_16_neon(dj + 9 * d_stride, width, &ds[2], &ds[4]);
        load_more_16_neon(di + 10 * d_stride, width, &dd[4], &dd[6]);
        load_more_16_neon(dj + 10 * d_stride, width, &ds[4], &ds[6]);
        load_more_16_neon(di + 11 * d_stride, width, &dd[6], &dd[8]);
        load_more_16_neon(dj + 11 * d_stride, width, &ds[6], &ds[8]);
        load_more_16_neon(di + 12 * d_stride, width, &dd[8], &dd[10]);
        load_more_16_neon(dj + 12 * d_stride, width, &ds[8], &ds[10]);
        load_more_16_neon(di + 13 * d_stride, width, &dd[10], &dd[12]);
        load_more_16_neon(dj + 13 * d_stride, width, &ds[10], &ds[12]);

        // deltas[0..13]: dd base row against each ds shift (top edge);
        // deltas[14..25]: each dd shift against the ds base row (left edge).
        deltas[0] = aom_sdotq_s16(deltas[0], dd[0], ds[0]);
        deltas[1] = aom_sdotq_s16(deltas[1], dd[1], ds[1]);
        deltas[2] = aom_sdotq_s16(deltas[2], dd[0], ds[2]);
        deltas[3] = aom_sdotq_s16(deltas[3], dd[1], ds[3]);
        deltas[4] = aom_sdotq_s16(deltas[4], dd[0], ds[4]);
        deltas[5] = aom_sdotq_s16(deltas[5], dd[1], ds[5]);
        deltas[6] = aom_sdotq_s16(deltas[6], dd[0], ds[6]);
        deltas[7] = aom_sdotq_s16(deltas[7], dd[1], ds[7]);
        deltas[8] = aom_sdotq_s16(deltas[8], dd[0], ds[8]);
        deltas[9] = aom_sdotq_s16(deltas[9], dd[1], ds[9]);
        deltas[10] = aom_sdotq_s16(deltas[10], dd[0], ds[10]);
        deltas[11] = aom_sdotq_s16(deltas[11], dd[1], ds[11]);
        deltas[12] = aom_sdotq_s16(deltas[12], dd[0], ds[12]);
        deltas[13] = aom_sdotq_s16(deltas[13], dd[1], ds[13]);
        deltas[14] = aom_sdotq_s16(deltas[14], dd[2], ds[0]);
        deltas[15] = aom_sdotq_s16(deltas[15], dd[3], ds[1]);
        deltas[16] = aom_sdotq_s16(deltas[16], dd[4], ds[0]);
        deltas[17] = aom_sdotq_s16(deltas[17], dd[5], ds[1]);
        deltas[18] = aom_sdotq_s16(deltas[18], dd[6], ds[0]);
        deltas[19] = aom_sdotq_s16(deltas[19], dd[7], ds[1]);
        deltas[20] = aom_sdotq_s16(deltas[20], dd[8], ds[0]);
        deltas[21] = aom_sdotq_s16(deltas[21], dd[9], ds[1]);
        deltas[22] = aom_sdotq_s16(deltas[22], dd[10], ds[0]);
        deltas[23] = aom_sdotq_s16(deltas[23], dd[11], ds[1]);
        deltas[24] = aom_sdotq_s16(deltas[24], dd[12], ds[0]);
        deltas[25] = aom_sdotq_s16(deltas[25], dd[13], ds[1]);

        // Carry the last 6 samples of the deepest shift over as the base
        // rows for the next 8-row iteration.
        dd[0] = vextq_s16(dd[12], vdupq_n_s16(0), 2);
        dd[1] = vextq_s16(dd[13], vdupq_n_s16(0), 2);
        ds[0] = vextq_s16(ds[12], vdupq_n_s16(0), 2);
        ds[1] = vextq_s16(ds[13], vdupq_n_s16(0), 2);

        di += 8 * d_stride;
        dj += 8 * d_stride;
        y += 8;
      }

      // Reduce pairs of accumulators; each delta becomes (window-end sum)
      // minus (window-start sum), since odd-indexed accumulators hold the
      // column-`width` products and even-indexed the column-0 products.
      int64x2_t deltas02 = vpaddq_s64(deltas[0], deltas[2]);
      int64x2_t deltas13 = vpaddq_s64(deltas[1], deltas[3]);
      int64x2_t deltas46 = vpaddq_s64(deltas[4], deltas[6]);
      int64x2_t deltas57 = vpaddq_s64(deltas[5], deltas[7]);
      int64x2_t deltas810 = vpaddq_s64(deltas[8], deltas[10]);
      int64x2_t deltas911 = vpaddq_s64(deltas[9], deltas[11]);
      int64x2_t deltas1212 = vpaddq_s64(deltas[12], deltas[12]);
      int64x2_t deltas1313 = vpaddq_s64(deltas[13], deltas[13]);
      int64x2_t deltas1416 = vpaddq_s64(deltas[14], deltas[16]);
      int64x2_t deltas1820 = vpaddq_s64(deltas[18], deltas[20]);
      int64x2_t deltas1517 = vpaddq_s64(deltas[15], deltas[17]);
      int64x2_t deltas1921 = vpaddq_s64(deltas[19], deltas[21]);
      int64x2_t deltas2224 = vpaddq_s64(deltas[22], deltas[24]);
      int64x2_t deltas2325 = vpaddq_s64(deltas[23], deltas[25]);
      deltas02 = vsubq_s64(deltas13, deltas02);
      deltas46 = vsubq_s64(deltas57, deltas46);
      deltas810 = vsubq_s64(deltas911, deltas810);
      deltas1212 = vsubq_s64(deltas1313, deltas1212);
      deltas1416 = vsubq_s64(deltas1517, deltas1416);
      deltas1820 = vsubq_s64(deltas1921, deltas1820);
      deltas2224 = vsubq_s64(deltas2325, deltas2224);

      // Handle the remaining (height - h8) rows one at a time, folding each
      // row's start/end products into the running deltas. `y` continues from
      // the unrolled loop above.
      if (h8 != height) {
        const int16_t ds0_vals[] = {
          dj[0 * d_stride], dj[0 * d_stride + width],
          dj[1 * d_stride], dj[1 * d_stride + width],
          dj[2 * d_stride], dj[2 * d_stride + width],
          dj[3 * d_stride], dj[3 * d_stride + width]
        };
        ds[0] = vld1q_s16(ds0_vals);

        ds[1] = vsetq_lane_s16(dj[4 * d_stride], ds[1], 0);
        ds[1] = vsetq_lane_s16(dj[4 * d_stride + width], ds[1], 1);
        ds[1] = vsetq_lane_s16(dj[5 * d_stride], ds[1], 2);
        ds[1] = vsetq_lane_s16(dj[5 * d_stride + width], ds[1], 3);
        // Start samples are pre-negated so a single multiply-accumulate
        // computes end - start.
        const int16_t dd4_vals[] = {
          -di[1 * d_stride], di[1 * d_stride + width],
          -di[2 * d_stride], di[2 * d_stride + width],
          -di[3 * d_stride], di[3 * d_stride + width],
          -di[4 * d_stride], di[4 * d_stride + width]
        };
        dd[4] = vld1q_s16(dd4_vals);

        dd[5] = vsetq_lane_s16(-di[5 * d_stride], dd[5], 0);
        dd[5] = vsetq_lane_s16(di[5 * d_stride + width], dd[5], 1);
        do {
          // Broadcast the current row's (-start, end) pair of column i-1 and
          // (start, end) pair of column j-1 into interleaved vectors.
          dd[0] = vdupq_n_s16(-di[0 * d_stride]);
          dd[2] = dd[3] = vdupq_n_s16(di[0 * d_stride + width]);
          dd[0] = dd[1] = vzip1q_s16(dd[0], dd[2]);

          ds[4] = vdupq_n_s16(dj[0 * d_stride]);
          ds[6] = ds[7] = vdupq_n_s16(dj[0 * d_stride + width]);
          ds[4] = ds[5] = vzip1q_s16(ds[4], ds[6]);

          dd[5] = vsetq_lane_s16(-di[6 * d_stride], dd[5], 2);
          dd[5] = vsetq_lane_s16(di[6 * d_stride + width], dd[5], 3);
          ds[1] = vsetq_lane_s16(dj[6 * d_stride], ds[1], 4);
          ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 5);

          const int32x4_t res0 =
              vpaddq_s32(vmull_s16(vget_low_s16(dd[0]), vget_low_s16(ds[0])),
                         vmull_s16(vget_high_s16(dd[0]), vget_high_s16(ds[0])));
          deltas02 = vaddw_s32(deltas02, vget_low_s32(res0));
          deltas46 = vaddw_s32(deltas46, vget_high_s32(res0));
          const int32x4_t res1 =
              vpaddq_s32(vmull_s16(vget_low_s16(dd[1]), vget_low_s16(ds[1])),
                         vmull_s16(vget_high_s16(dd[1]), vget_high_s16(ds[1])));
          deltas810 = vaddw_s32(deltas810, vget_low_s32(res1));
          deltas1212 = vaddw_s32(deltas1212, vget_high_s32(res1));
          const int32x4_t res2 =
              vpaddq_s32(vmull_s16(vget_low_s16(dd[4]), vget_low_s16(ds[4])),
                         vmull_s16(vget_high_s16(dd[4]), vget_high_s16(ds[4])));
          deltas1416 = vaddw_s32(deltas1416, vget_low_s32(res2));
          deltas1820 = vaddw_s32(deltas1820, vget_high_s32(res2));
          const int32x4_t res3 =
              vpaddq_s32(vmull_s16(vget_low_s16(dd[5]), vget_low_s16(ds[5])),
                         vmull_s16(vget_high_s16(dd[5]), vget_high_s16(ds[5])));
          deltas2224 = vaddw_s32(deltas2224, vget_low_s32(res3));

          // Rotate the per-row sample windows down by one row: one (start,
          // end) s16 pair occupies one s32 lane, so shift by two s16 lanes
          // and wrap the evicted pair to the tail of the other vector.
          int32_t tmp0 = vgetq_lane_s32(vreinterpretq_s32_s16(ds[0]), 0);
          ds[0] = vextq_s16(ds[0], ds[1], 2);
          ds[1] = vextq_s16(ds[1], ds[0], 2);
          ds[1] = vreinterpretq_s16_s32(
              vsetq_lane_s32(tmp0, vreinterpretq_s32_s16(ds[1]), 3));
          int32_t tmp1 = vgetq_lane_s32(vreinterpretq_s32_s16(dd[4]), 0);
          dd[4] = vextq_s16(dd[4], dd[5], 2);
          dd[5] = vextq_s16(dd[5], dd[4], 2);
          dd[5] = vreinterpretq_s16_s32(
              vsetq_lane_s32(tmp1, vreinterpretq_s32_s16(dd[5]), 3));
          di += d_stride;
          dj += d_stride;
        } while (++y < height);
      }

      // Writing one more element on the top edge of a square falls to
      // the next square in the same row or the first element in the next
      // row, which will just be overwritten later.
      int64x2_t s0 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 +
                               (j - 1) * wiener_win + 0);
      int64x2_t s1 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 +
                               (j - 1) * wiener_win + 2);
      int64x2_t s2 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 +
                               (j - 1) * wiener_win + 4);
      int64x2_t s3 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 +
                               (j - 1) * wiener_win + 6);

      // Top edge of the square: up-left neighbour + delta.
      vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 0,
                vaddq_s64(s0, deltas02));
      vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 2,
                vaddq_s64(s1, deltas46));
      vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 4,
                vaddq_s64(s2, deltas810));
      vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 6,
                vaddq_s64(s3, deltas1212));

      // Left edge of the square (rows 1..6): same incremental update,
      // one scalar per row of H.
      H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
          H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] +
          vgetq_lane_s64(deltas1416, 0);
      H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
          H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] +
          vgetq_lane_s64(deltas1416, 1);
      H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
          H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] +
          vgetq_lane_s64(deltas1820, 0);
      H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
          H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] +
          vgetq_lane_s64(deltas1820, 1);
      H[(i * wiener_win + 5) * wiener_win2 + j * wiener_win] =
          H[((i - 1) * wiener_win + 5) * wiener_win2 + (j - 1) * wiener_win] +
          vgetq_lane_s64(deltas2224, 0);
      H[(i * wiener_win + 6) * wiener_win2 + j * wiener_win] =
          H[((i - 1) * wiener_win + 6) * wiener_win2 + (j - 1) * wiener_win] +
          vgetq_lane_s64(deltas2224, 1);
    } while (++j < wiener_win);
  } while (++i < wiener_win - 1);

  // Step 5: Derive other points of each square. No square in bottom row.
  i = 0;
  do {
    const int16_t *const di = d + i;

    j = i + 1;
    do {
      const int16_t *const dj = d + j;
      int64x2_t deltas[WIENER_WIN - 1][WIN_7] = { { vdupq_n_s64(0) },
                                                  { vdupq_n_s64(0) } };
      int16x8_t d_is[WIN_7];
      int16x8_t d_ie[WIN_7];
      int16x8_t d_js[WIN_7];
      int16x8_t d_je[WIN_7];

      // Full 16-wide blocks with Neon loads, then a predicated tail.
      x = 0;
      while (x < width - 16) {
        load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
                              d_js, d_je);
        derive_square_win7_sve(d_is, d_ie, d_js, d_je, deltas);
        x += 16;
      }

      load_square_win7_sve(di + x, dj + x, d_stride, height, d_is, d_ie, d_js,
                           d_je, p0, p1);
      derive_square_win7_sve(d_is, d_ie, d_js, d_je, deltas);

      // Each interior row of the square is the row above plus its deltas.
      hadd_update_6_stats_sve(
          H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0],
          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
      hadd_update_6_stats_sve(
          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1],
          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
      hadd_update_6_stats_sve(
          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2],
          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1);
      hadd_update_6_stats_sve(
          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3],
          H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1);
      hadd_update_6_stats_sve(
          H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win, deltas[4],
          H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win + 1);
      hadd_update_6_stats_sve(
          H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win, deltas[5],
          H + (i * wiener_win + 6) * wiener_win2 + j * wiener_win + 1);
    } while (++j < wiener_win);
  } while (++i < wiener_win - 1);

  // Step 6: Derive other points of each upper triangle along the diagonal.
  i = 0;
  do {
    const int16_t *const di = d + i;
    int64x2_t deltas[3 * WIENER_WIN] = { vdupq_n_s64(0) };
    int16x8_t d_is[WIN_7], d_ie[WIN_7];

    // Full 16-wide blocks with Neon loads, then a predicated tail.
    x = 0;
    while (x < width - 16) {
      load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie);
      derive_triangle_win7_sve(d_is, d_ie, deltas);
      x += 16;
    }

    load_triangle_win7_sve(di + x, d_stride, height, d_is, d_ie, p0, p1);
    derive_triangle_win7_sve(d_is, d_ie, deltas);

    // Row 1: 6 points
    hadd_update_6_stats_sve(
        H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas,
        H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);

    int64x2_t deltas1017 = vpaddq_s64(deltas[10], deltas[17]);

    // Row 2: 5 points
    hadd_update_4_stats_sve(
        H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1, deltas + 6,
        H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2);
    H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 6] =
        H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 5] +
        vgetq_lane_s64(deltas1017, 0);

    // Row 3: 4 points
    hadd_update_4_stats_sve(
        H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2,
        deltas + 11,
        H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);

    // Row 4: 3 points
    int64x2_t h0 =
        vld1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);
    vst1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4,
              vaddq_s64(h0, vpaddq_s64(deltas[15], deltas[16])));
    H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 6] =
        H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 5] +
        vgetq_lane_s64(deltas1017, 1);

    // Row 5: 2 points
    int64x2_t h1 =
        vld1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4);
    vst1q_s64(H + (i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5,
              vaddq_s64(h1, vpaddq_s64(deltas[18], deltas[19])));

    // Row 6: 1 points
    H[(i * wiener_win + 6) * wiener_win2 + i * wiener_win + 6] =
        H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] +
        vaddvq_s64(deltas[20]);
  } while (++i < wiener_win);
}
1896
1897 #endif // AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_
1898