/*
 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_
#define AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_

#include <arm_neon.h>
#include <arm_sve.h>

#include "aom_dsp/arm/aom_neon_sve_bridge.h"
#include "av1/encoder/arm/pickrst_neon.h"

// Swap each half of the dgd vectors so that we can accumulate the result of
// the dot-products directly in the destination matrix.
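// For example, viewing dgd0 = { a0 .. a7 } and dgd1 = { b0 .. b7 } as pairs
// of 64-bit halves, the transpose yields:
//   val[0] = { a0 a1 a2 a3 b0 b1 b2 b3 }
//   val[1] = { a4 a5 a6 a7 b4 b5 b6 b7 }
// so one aom_svdot_lane_s16 per half updates two adjacent int64 accumulators.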
static inline int16x8x2_t transpose_dgd(int16x8_t dgd0, int16x8_t dgd1) {
  int16x8_t dgd_trn0 = vreinterpretq_s16_s64(
      vzip1q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1)));
  int16x8_t dgd_trn1 = vreinterpretq_s16_s64(
      vzip2q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1)));

  return (struct int16x8x2_t){ dgd_trn0, dgd_trn1 };
}
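
// Accumulate one row of the 5x5 cross-correlation matrix M. In scalar form,
// for each of the five taps, this computes (over the 8 lanes of src):
//   M[row * 5 + tap] += sum_{k=0}^{7} src[k] * dgd[tap][k]
// Taps are handled in pairs via transpose_dgd() + aom_svdot_lane_s16 so that
// each int64x2 accumulator maps onto two adjacent M entries; the odd fifth
// tap uses a plain dot product plus a horizontal reduction.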
static inline void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd[5],
                                          int64_t *M, int row) {
  const int wiener_win = 5;

  int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0);
  int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]);

  int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0);
  cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1);
  vst1q_s64(M + row * wiener_win + 0, cross_corr01);

  int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2);
  int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]);

  int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0);
  cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1);
  vst1q_s64(M + row * wiener_win + 2, cross_corr23);

  int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[4]);
  M[row * wiener_win + 4] += vaddvq_s64(m4);
}
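
// As compute_M_one_row_win5, but for the 7-tap window: three transposed tap
// pairs feed M[row * 7 + 0 .. 5] and the seventh tap is reduced separately.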
static inline void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd[7],
                                          int64_t *M, int row) {
  const int wiener_win = 7;

  int64x2_t m01 = vld1q_s64(M + row * wiener_win + 0);
  int16x8x2_t dgd01 = transpose_dgd(dgd[0], dgd[1]);

  int64x2_t cross_corr01 = aom_svdot_lane_s16(m01, dgd01.val[0], src, 0);
  cross_corr01 = aom_svdot_lane_s16(cross_corr01, dgd01.val[1], src, 1);
  vst1q_s64(M + row * wiener_win + 0, cross_corr01);

  int64x2_t m23 = vld1q_s64(M + row * wiener_win + 2);
  int16x8x2_t dgd23 = transpose_dgd(dgd[2], dgd[3]);

  int64x2_t cross_corr23 = aom_svdot_lane_s16(m23, dgd23.val[0], src, 0);
  cross_corr23 = aom_svdot_lane_s16(cross_corr23, dgd23.val[1], src, 1);
  vst1q_s64(M + row * wiener_win + 2, cross_corr23);

  int64x2_t m45 = vld1q_s64(M + row * wiener_win + 4);
  int16x8x2_t dgd45 = transpose_dgd(dgd[4], dgd[5]);

  int64x2_t cross_corr45 = aom_svdot_lane_s16(m45, dgd45.val[0], src, 0);
  cross_corr45 = aom_svdot_lane_s16(cross_corr45, dgd45.val[1], src, 1);
  vst1q_s64(M + row * wiener_win + 4, cross_corr45);

  int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), src, dgd[6]);
  M[row * wiener_win + 6] += vaddvq_s64(m6);
}
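
// Accumulate the upper-triangular auto-covariance entries of H for a single
// window column. In scalar form, for row1 >= row0:
//   H[(col * wiener_win + row0) * wiener_win2 + col * wiener_win + row1] +=
//       sum_{k=0}^{7} dgd[row0][k] * dgd[row1][k]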
static inline void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H,
                                     const int wiener_win,
                                     const int wiener_win2) {
  for (int row0 = 0; row0 < wiener_win; row0++) {
    for (int row1 = row0; row1 < wiener_win; row1++) {
      int auto_cov_idx =
          (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1;

      int64x2_t auto_cov = aom_sdotq_s16(vdupq_n_s64(0), dgd[row0], dgd[row1]);
      H[auto_cov_idx] += vaddvq_s64(auto_cov);
    }
  }
}
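
// Accumulate one 5-entry block row of H between window rows row0 and row1:
// for each column col0 of row0, the five dot products against the columns of
// row1 land in H[(row0 * 5 + col0) * 25 + row1 * 5 .. + 4].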
static inline void compute_H_two_rows_win5(int16x8_t *dgd0, int16x8_t *dgd1,
                                           int row0, int row1, int64_t *H) {
  for (int col0 = 0; col0 < 5; col0++) {
    int auto_cov_idx = (row0 * 5 + col0) * 25 + (row1 * 5);

    int64x2_t h01 = vld1q_s64(H + auto_cov_idx);
    int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]);

    int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0);
    auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1);
    vst1q_s64(H + auto_cov_idx, auto_cov01);

    int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2);
    int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]);

    int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0);
    auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1);
    vst1q_s64(H + auto_cov_idx + 2, auto_cov23);

    int64x2_t auto_cov4 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[4]);
    H[auto_cov_idx + 4] += vaddvq_s64(auto_cov4);
  }
}
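
// As compute_H_two_rows_win5, but producing 7-entry block rows for the
// 7-tap window.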
static inline void compute_H_two_rows_win7(int16x8_t *dgd0, int16x8_t *dgd1,
                                           int row0, int row1, int64_t *H) {
  for (int col0 = 0; col0 < 7; col0++) {
    int auto_cov_idx = (row0 * 7 + col0) * 49 + (row1 * 7);

    int64x2_t h01 = vld1q_s64(H + auto_cov_idx);
    int16x8x2_t dgd01 = transpose_dgd(dgd1[0], dgd1[1]);

    int64x2_t auto_cov01 = aom_svdot_lane_s16(h01, dgd01.val[0], dgd0[col0], 0);
    auto_cov01 = aom_svdot_lane_s16(auto_cov01, dgd01.val[1], dgd0[col0], 1);
    vst1q_s64(H + auto_cov_idx, auto_cov01);

    int64x2_t h23 = vld1q_s64(H + auto_cov_idx + 2);
    int16x8x2_t dgd23 = transpose_dgd(dgd1[2], dgd1[3]);

    int64x2_t auto_cov23 = aom_svdot_lane_s16(h23, dgd23.val[0], dgd0[col0], 0);
    auto_cov23 = aom_svdot_lane_s16(auto_cov23, dgd23.val[1], dgd0[col0], 1);
    vst1q_s64(H + auto_cov_idx + 2, auto_cov23);

    int64x2_t h45 = vld1q_s64(H + auto_cov_idx + 4);
    int16x8x2_t dgd45 = transpose_dgd(dgd1[4], dgd1[5]);

    int64x2_t auto_cov45 = aom_svdot_lane_s16(h45, dgd45.val[0], dgd0[col0], 0);
    auto_cov45 = aom_svdot_lane_s16(auto_cov45, dgd45.val[1], dgd0[col0], 1);
    vst1q_s64(H + auto_cov_idx + 4, auto_cov45);

    int64x2_t auto_cov6 = aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[6]);
    H[auto_cov_idx + 6] += vaddvq_s64(auto_cov6);
  }
}
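
// Accumulate the top-edge statistics for one strip of up to 16 pixels:
// sum_m[k] gathers src . d_row_k (feeding M) and sum_h[k] gathers
// dgd . d_row_k (feeding the first row of H), for the five rows starting
// at d.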
static inline void stats_top_win5_sve(const int16x8_t src[2],
                                      const int16x8_t dgd[2],
                                      const int16_t *const d,
                                      const int32_t d_stride, int64x2_t *sum_m,
                                      int64x2_t *sum_h) {
  int16x8_t dgds[WIENER_WIN_CHROMA * 2];

  load_s16_8x5(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6],
               &dgds[8]);
  load_s16_8x5(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7],
               &dgds[9]);

  sum_m[0] = aom_sdotq_s16(sum_m[0], src[0], dgds[0]);
  sum_m[0] = aom_sdotq_s16(sum_m[0], src[1], dgds[1]);
  sum_m[1] = aom_sdotq_s16(sum_m[1], src[0], dgds[2]);
  sum_m[1] = aom_sdotq_s16(sum_m[1], src[1], dgds[3]);
  sum_m[2] = aom_sdotq_s16(sum_m[2], src[0], dgds[4]);
  sum_m[2] = aom_sdotq_s16(sum_m[2], src[1], dgds[5]);
  sum_m[3] = aom_sdotq_s16(sum_m[3], src[0], dgds[6]);
  sum_m[3] = aom_sdotq_s16(sum_m[3], src[1], dgds[7]);
  sum_m[4] = aom_sdotq_s16(sum_m[4], src[0], dgds[8]);
  sum_m[4] = aom_sdotq_s16(sum_m[4], src[1], dgds[9]);

  sum_h[0] = aom_sdotq_s16(sum_h[0], dgd[0], dgds[0]);
  sum_h[0] = aom_sdotq_s16(sum_h[0], dgd[1], dgds[1]);
  sum_h[1] = aom_sdotq_s16(sum_h[1], dgd[0], dgds[2]);
  sum_h[1] = aom_sdotq_s16(sum_h[1], dgd[1], dgds[3]);
  sum_h[2] = aom_sdotq_s16(sum_h[2], dgd[0], dgds[4]);
  sum_h[2] = aom_sdotq_s16(sum_h[2], dgd[1], dgds[5]);
  sum_h[3] = aom_sdotq_s16(sum_h[3], dgd[0], dgds[6]);
  sum_h[3] = aom_sdotq_s16(sum_h[3], dgd[1], dgds[7]);
  sum_h[4] = aom_sdotq_s16(sum_h[4], dgd[0], dgds[8]);
  sum_h[4] = aom_sdotq_s16(sum_h[4], dgd[1], dgds[9]);
}
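
// Accumulate the left-edge statistics: sum[k] gathers src . d_row_(k+1) over
// a 16-wide strip, for the four rows starting one row below d.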
static inline void stats_left_win5_sve(const int16x8_t src[2], const int16_t *d,
                                       const int32_t d_stride, int64x2_t *sum) {
  int16x8_t dgds[WIN_CHROMA];

  load_s16_8x4(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4],
               &dgds[6]);
  load_s16_8x4(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5],
               &dgds[7]);

  sum[0] = aom_sdotq_s16(sum[0], src[0], dgds[0]);
  sum[0] = aom_sdotq_s16(sum[0], src[1], dgds[1]);
  sum[1] = aom_sdotq_s16(sum[1], src[0], dgds[2]);
  sum[1] = aom_sdotq_s16(sum[1], src[1], dgds[3]);
  sum[2] = aom_sdotq_s16(sum[2], src[0], dgds[4]);
  sum[2] = aom_sdotq_s16(sum[2], src[1], dgds[5]);
  sum[3] = aom_sdotq_s16(sum[3], src[0], dgds[6]);
  sum[3] = aom_sdotq_s16(sum[3], src[1], dgds[7]);
}
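
// Step 4 helpers: deltas[0..4] accumulate A[0] . B[0..4] and deltas[5..8]
// accumulate A[1..4] . B[0]. The sub_ variant negates A so that, in effect,
// the leading-edge contribution is subtracted from the sliding sums, while
// the add_ variant adds the trailing-edge contribution.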
static inline void sub_deltas_step4_sve(int16x8_t *A, int16x8_t *B,
                                        int64x2_t *deltas) {
  deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(A[0]), B[0]);
  deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(A[0]), B[1]);
  deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(A[0]), B[2]);
  deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(A[0]), B[3]);
  deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(A[0]), B[4]);
  deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(A[1]), B[0]);
  deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(A[2]), B[0]);
  deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(A[3]), B[0]);
  deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(A[4]), B[0]);
}

static inline void add_deltas_step4_sve(int16x8_t *A, int16x8_t *B,
                                        int64x2_t *deltas) {
  deltas[0] = aom_sdotq_s16(deltas[0], A[0], B[0]);
  deltas[1] = aom_sdotq_s16(deltas[1], A[0], B[1]);
  deltas[2] = aom_sdotq_s16(deltas[2], A[0], B[2]);
  deltas[3] = aom_sdotq_s16(deltas[3], A[0], B[3]);
  deltas[4] = aom_sdotq_s16(deltas[4], A[0], B[4]);
  deltas[5] = aom_sdotq_s16(deltas[5], A[1], B[0]);
  deltas[6] = aom_sdotq_s16(deltas[6], A[2], B[0]);
  deltas[7] = aom_sdotq_s16(deltas[7], A[3], B[0]);
  deltas[8] = aom_sdotq_s16(deltas[8], A[4], B[0]);
}
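
// Tail variant of load_square_win5_neon: the governing predicates p0/p1 zero
// the lanes beyond the image width, so the final (partial) 16-wide strip is
// loaded without reading past the right edge of the frame.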
static inline void load_square_win5_sve(
    const int16_t *const di, const int16_t *const dj, const int32_t d_stride,
    const int32_t height, int16x8_t *d_is, int16x8_t *d_ie, int16x8_t *d_js,
    int16x8_t *d_je, svbool_t p0, svbool_t p1) {
  d_is[0] = svget_neonq_s16(svld1_s16(p0, di + 0 * d_stride + 0));
  d_is[1] = svget_neonq_s16(svld1_s16(p1, di + 0 * d_stride + 8));
  d_is[2] = svget_neonq_s16(svld1_s16(p0, di + 1 * d_stride + 0));
  d_is[3] = svget_neonq_s16(svld1_s16(p1, di + 1 * d_stride + 8));
  d_is[4] = svget_neonq_s16(svld1_s16(p0, di + 2 * d_stride + 0));
  d_is[5] = svget_neonq_s16(svld1_s16(p1, di + 2 * d_stride + 8));
  d_is[6] = svget_neonq_s16(svld1_s16(p0, di + 3 * d_stride + 0));
  d_is[7] = svget_neonq_s16(svld1_s16(p1, di + 3 * d_stride + 8));

  d_ie[0] = svget_neonq_s16(svld1_s16(p0, di + (height + 0) * d_stride + 0));
  d_ie[1] = svget_neonq_s16(svld1_s16(p1, di + (height + 0) * d_stride + 8));
  d_ie[2] = svget_neonq_s16(svld1_s16(p0, di + (height + 1) * d_stride + 0));
  d_ie[3] = svget_neonq_s16(svld1_s16(p1, di + (height + 1) * d_stride + 8));
  d_ie[4] = svget_neonq_s16(svld1_s16(p0, di + (height + 2) * d_stride + 0));
  d_ie[5] = svget_neonq_s16(svld1_s16(p1, di + (height + 2) * d_stride + 8));
  d_ie[6] = svget_neonq_s16(svld1_s16(p0, di + (height + 3) * d_stride + 0));
  d_ie[7] = svget_neonq_s16(svld1_s16(p1, di + (height + 3) * d_stride + 8));

  load_s16_8x4(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6]);
  load_s16_8x4(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7]);
  load_s16_8x4(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2],
               &d_je[4], &d_je[6]);
  load_s16_8x4(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3],
               &d_je[5], &d_je[7]);
}
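
// dst[k] = src[k] + delta[k] for four consecutive 64-bit statistics.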
static inline void update_4_stats_sve(const int64_t *const src,
                                      const int64x2_t *delta,
                                      int64_t *const dst) {
  const int64x2_t s1 = vld1q_s64(src);
  const int64x2_t s2 = vld1q_s64(src + 2);

  vst1q_s64(dst + 0, vaddq_s64(s1, delta[0]));
  vst1q_s64(dst + 2, vaddq_s64(s2, delta[1]));
}
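
// Sliding-window update of the 4x4 delta block between two window columns:
//   deltas[r][c] += d_ie[r] . d_je[c] - d_is[r] . d_js[c]
// where each row is held as a pair of 8-lane vectors (hence the doubled
// indices) and the subtraction is realised by negating d_is up front.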
static inline void derive_square_win5_sve(
    int16x8_t *d_is, const int16x8_t *d_ie, const int16x8_t *d_js,
    const int16x8_t *d_je,
    int64x2_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1]) {
  d_is[0] = vnegq_s16(d_is[0]);
  d_is[1] = vnegq_s16(d_is[1]);
  d_is[2] = vnegq_s16(d_is[2]);
  d_is[3] = vnegq_s16(d_is[3]);
  d_is[4] = vnegq_s16(d_is[4]);
  d_is[5] = vnegq_s16(d_is[5]);
  d_is[6] = vnegq_s16(d_is[6]);
  d_is[7] = vnegq_s16(d_is[7]);

  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_is[0], d_js[0]);
  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_is[1], d_js[1]);
  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_is[0], d_js[2]);
  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_is[1], d_js[3]);
  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_is[0], d_js[4]);
  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_is[1], d_js[5]);
  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_is[0], d_js[6]);
  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_is[1], d_js[7]);

  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_is[2], d_js[0]);
  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_is[3], d_js[1]);
  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_is[2], d_js[2]);
  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_is[3], d_js[3]);
  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_is[2], d_js[4]);
  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_is[3], d_js[5]);
  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_is[2], d_js[6]);
  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_is[3], d_js[7]);

  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_is[4], d_js[0]);
  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_is[5], d_js[1]);
  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_is[4], d_js[2]);
  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_is[5], d_js[3]);
  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_is[4], d_js[4]);
  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_is[5], d_js[5]);
  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_is[4], d_js[6]);
  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_is[5], d_js[7]);

  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_is[6], d_js[0]);
  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_is[7], d_js[1]);
  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_is[6], d_js[2]);
  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_is[7], d_js[3]);
  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_is[6], d_js[4]);
  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_is[7], d_js[5]);
  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_is[6], d_js[6]);
  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_is[7], d_js[7]);

  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_ie[0], d_je[0]);
  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_ie[1], d_je[1]);
  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_ie[0], d_je[2]);
  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_ie[1], d_je[3]);
  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_ie[0], d_je[4]);
  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_ie[1], d_je[5]);
  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_ie[0], d_je[6]);
  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_ie[1], d_je[7]);

  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_ie[2], d_je[0]);
  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_ie[3], d_je[1]);
  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_ie[2], d_je[2]);
  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_ie[3], d_je[3]);
  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_ie[2], d_je[4]);
  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_ie[3], d_je[5]);
  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_ie[2], d_je[6]);
  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_ie[3], d_je[7]);

  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_ie[4], d_je[0]);
  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_ie[5], d_je[1]);
  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_ie[4], d_je[2]);
  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_ie[5], d_je[3]);
  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_ie[4], d_je[4]);
  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_ie[5], d_je[5]);
  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_ie[4], d_je[6]);
  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_ie[5], d_je[7]);

  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_ie[6], d_je[0]);
  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_ie[7], d_je[1]);
  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_ie[6], d_je[2]);
  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_ie[7], d_je[3]);
  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_ie[6], d_je[4]);
  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_ie[7], d_je[5]);
  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_ie[6], d_je[6]);
  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_ie[7], d_je[7]);
}
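
// Reduce each int64x2 delta accumulator horizontally and update four stats:
//   dst[k] = src[k] + (low lane + high lane of deltas[k])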
static inline void hadd_update_4_stats_sve(const int64_t *const src,
                                           const int64x2_t *deltas,
                                           int64_t *const dst) {
  int64x2_t src0 = vld1q_s64(src);
  int64x2_t src1 = vld1q_s64(src + 2);
  vst1q_s64(dst + 0, vaddq_s64(src0, vpaddq_s64(deltas[0], deltas[1])));
  vst1q_s64(dst + 2, vaddq_s64(src1, vpaddq_s64(deltas[2], deltas[3])));
}
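
// Predicated tail counterpart of load_triangle_win5_neon: load the four
// start rows and four end rows of a diagonal block, masked by p0/p1.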
static inline void load_triangle_win5_sve(const int16_t *const di,
                                          const int32_t d_stride,
                                          const int32_t height, int16x8_t *d_is,
                                          int16x8_t *d_ie, svbool_t p0,
                                          svbool_t p1) {
  d_is[0] = svget_neonq_s16(svld1_s16(p0, di + 0 * d_stride + 0));
  d_is[1] = svget_neonq_s16(svld1_s16(p1, di + 0 * d_stride + 8));
  d_is[2] = svget_neonq_s16(svld1_s16(p0, di + 1 * d_stride + 0));
  d_is[3] = svget_neonq_s16(svld1_s16(p1, di + 1 * d_stride + 8));
  d_is[4] = svget_neonq_s16(svld1_s16(p0, di + 2 * d_stride + 0));
  d_is[5] = svget_neonq_s16(svld1_s16(p1, di + 2 * d_stride + 8));
  d_is[6] = svget_neonq_s16(svld1_s16(p0, di + 3 * d_stride + 0));
  d_is[7] = svget_neonq_s16(svld1_s16(p1, di + 3 * d_stride + 8));
  d_ie[0] = svget_neonq_s16(svld1_s16(p0, di + (height + 0) * d_stride + 0));
  d_ie[1] = svget_neonq_s16(svld1_s16(p1, di + (height + 0) * d_stride + 8));
  d_ie[2] = svget_neonq_s16(svld1_s16(p0, di + (height + 1) * d_stride + 0));
  d_ie[3] = svget_neonq_s16(svld1_s16(p1, di + (height + 1) * d_stride + 8));
  d_ie[4] = svget_neonq_s16(svld1_s16(p0, di + (height + 2) * d_stride + 0));
  d_ie[5] = svget_neonq_s16(svld1_s16(p1, di + (height + 2) * d_stride + 8));
  d_ie[6] = svget_neonq_s16(svld1_s16(p0, di + (height + 3) * d_stride + 0));
  d_ie[7] = svget_neonq_s16(svld1_s16(p1, di + (height + 3) * d_stride + 8));
}
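
// Sliding-window update of the 10 unique deltas of the diagonal block:
//   deltas[t] += d_ie[r] . d_ie[c] - d_is[r] . d_is[c]
// for the row pairs (r, c) with r <= c, enumerated in row-major order.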
static inline void derive_triangle_win5_sve(const int16x8_t *d_is,
                                            const int16x8_t *d_ie,
                                            int64x2_t *deltas) {
  deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(d_is[0]), d_is[0]);
  deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(d_is[1]), d_is[1]);
  deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(d_is[0]), d_is[2]);
  deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(d_is[1]), d_is[3]);
  deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(d_is[0]), d_is[4]);
  deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(d_is[1]), d_is[5]);
  deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(d_is[0]), d_is[6]);
  deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(d_is[1]), d_is[7]);
  deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(d_is[2]), d_is[2]);
  deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(d_is[3]), d_is[3]);
  deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(d_is[2]), d_is[4]);
  deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(d_is[3]), d_is[5]);
  deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(d_is[2]), d_is[6]);
  deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(d_is[3]), d_is[7]);
  deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(d_is[4]), d_is[4]);
  deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(d_is[5]), d_is[5]);
  deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(d_is[4]), d_is[6]);
  deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(d_is[5]), d_is[7]);
  deltas[9] = aom_sdotq_s16(deltas[9], vnegq_s16(d_is[6]), d_is[6]);
  deltas[9] = aom_sdotq_s16(deltas[9], vnegq_s16(d_is[7]), d_is[7]);

  deltas[0] = aom_sdotq_s16(deltas[0], d_ie[0], d_ie[0]);
  deltas[0] = aom_sdotq_s16(deltas[0], d_ie[1], d_ie[1]);
  deltas[1] = aom_sdotq_s16(deltas[1], d_ie[0], d_ie[2]);
  deltas[1] = aom_sdotq_s16(deltas[1], d_ie[1], d_ie[3]);
  deltas[2] = aom_sdotq_s16(deltas[2], d_ie[0], d_ie[4]);
  deltas[2] = aom_sdotq_s16(deltas[2], d_ie[1], d_ie[5]);
  deltas[3] = aom_sdotq_s16(deltas[3], d_ie[0], d_ie[6]);
  deltas[3] = aom_sdotq_s16(deltas[3], d_ie[1], d_ie[7]);
  deltas[4] = aom_sdotq_s16(deltas[4], d_ie[2], d_ie[2]);
  deltas[4] = aom_sdotq_s16(deltas[4], d_ie[3], d_ie[3]);
  deltas[5] = aom_sdotq_s16(deltas[5], d_ie[2], d_ie[4]);
  deltas[5] = aom_sdotq_s16(deltas[5], d_ie[3], d_ie[5]);
  deltas[6] = aom_sdotq_s16(deltas[6], d_ie[2], d_ie[6]);
  deltas[6] = aom_sdotq_s16(deltas[6], d_ie[3], d_ie[7]);
  deltas[7] = aom_sdotq_s16(deltas[7], d_ie[4], d_ie[4]);
  deltas[7] = aom_sdotq_s16(deltas[7], d_ie[5], d_ie[5]);
  deltas[8] = aom_sdotq_s16(deltas[8], d_ie[4], d_ie[6]);
  deltas[8] = aom_sdotq_s16(deltas[8], d_ie[5], d_ie[7]);
  deltas[9] = aom_sdotq_s16(deltas[9], d_ie[6], d_ie[6]);
  deltas[9] = aom_sdotq_s16(deltas[9], d_ie[7], d_ie[7]);
}
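
// Compute the Wiener filter statistics for the 5-tap (chroma) window: M
// accumulates the cross-correlation between the source s and the degraded
// frame d, and H the auto-covariance of d, over a width x height region.
// Only the edges of each square/triangle sub-block of H are computed
// directly; the remaining entries are derived incrementally from previously
// computed ones (steps 3 to 6 below).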
static inline void compute_stats_win5_sve(
    const int16_t *const d, const int32_t d_stride, const int16_t *const s,
    const int32_t s_stride, const int32_t width, const int32_t height,
    int64_t *const M, int64_t *const H) {
  const int32_t wiener_win = WIENER_WIN_CHROMA;
  const int32_t wiener_win2 = wiener_win * wiener_win;
  const int32_t h8 = height & ~7;
  int32_t i, j, x, y;

  // Use a predicate to compute the last columns.
  svbool_t p0 = svwhilelt_b16_u32(0, width % 16 == 0 ? 16 : width % 16);
  svbool_t p1 = svwhilelt_b16_u32(8, width % 16 == 0 ? 16 : width % 16);
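  // For example, width == 20 leaves a 4-wide tail: p0 enables lanes 0-3 and
  // p1 none. width == 28 leaves a 12-wide tail: p0 enables all 8 lanes and
  // p1 lanes 0-3. When width is a multiple of 16 both predicates are fully
  // enabled.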

  // Step 1: Calculate the top edge of the whole matrix, i.e., the top
  // edge of each triangle and square on the top row.
  j = 0;
  do {
    const int16_t *s_t = s;
    const int16_t *d_t = d;
    int64x2_t sum_m[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) };
    int64x2_t sum_h[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) };
    int16x8_t src[2], dgd[2];

    y = height;
    do {
      x = 0;
      while (x < width - 16) {
        src[0] = vld1q_s16(s_t + x + 0);
        src[1] = vld1q_s16(s_t + x + 8);
        dgd[0] = vld1q_s16(d_t + x + 0);
        dgd[1] = vld1q_s16(d_t + x + 8);
        stats_top_win5_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h);
        x += 16;
      }

      src[0] = svget_neonq_s16(svld1_s16(p0, s_t + x + 0));
      src[1] = svget_neonq_s16(svld1_s16(p1, s_t + x + 8));
      dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + x + 0));
      dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + x + 8));

      stats_top_win5_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h);

      s_t += s_stride;
      d_t += d_stride;
    } while (--y);

    vst1q_s64(&M[wiener_win * j + 0], vpaddq_s64(sum_m[0], sum_m[1]));
    vst1q_s64(&M[wiener_win * j + 2], vpaddq_s64(sum_m[2], sum_m[3]));
    M[wiener_win * j + 4] = vaddvq_s64(sum_m[4]);

    vst1q_s64(&H[wiener_win * j + 0], vpaddq_s64(sum_h[0], sum_h[1]));
    vst1q_s64(&H[wiener_win * j + 2], vpaddq_s64(sum_h[2], sum_h[3]));
    H[wiener_win * j + 4] = vaddvq_s64(sum_h[4]);
  } while (++j < wiener_win);

  // Step 2: Calculate the left edge of each square on the top row.
  j = 1;
  do {
    const int16_t *d_t = d;
    int64x2_t sum_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s64(0) };
    int16x8_t dgd[2];

    y = height;
    do {
      x = 0;
      while (x < width - 16) {
        dgd[0] = vld1q_s16(d_t + j + x + 0);
        dgd[1] = vld1q_s16(d_t + j + x + 8);
        stats_left_win5_sve(dgd, d_t + x, d_stride, sum_h);
        x += 16;
      }

      dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + j + x + 0));
      dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + j + x + 8));

      stats_left_win5_sve(dgd, d_t + x, d_stride, sum_h);

      d_t += d_stride;
    } while (--y);

    int64x2_t sum_h01 = vpaddq_s64(sum_h[0], sum_h[1]);
    int64x2_t sum_h23 = vpaddq_s64(sum_h[2], sum_h[3]);
    vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h01));
    vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h01));
    vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h23));
    vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h23));

  } while (++j < wiener_win);

  // Step 3: Derive the top edge of each triangle along the diagonal. No
  // triangle in top row.
  {
    const int16_t *d_t = d;

    if (height % 2) {
      int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
      int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
      int16x8_t ds[WIENER_WIN * 2];

      load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]);
      load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]);
      d_t += 4 * d_stride;

      step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas);
      transpose_arrays_s32_8x8(deltas, deltas_tr);

      update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
                          deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0),
                          H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);

      update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
                          deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0),
                          H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);

      update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
                          deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0),
                          H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);

      update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
                          deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0),
                          H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);

    } else {
      int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) };
      int16x8_t ds[WIENER_WIN_CHROMA * 2];

      ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
      ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
      ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
      ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);

      step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);

      transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
                                      &deltas[3]);

      update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
                          deltas[0], vgetq_lane_s32(deltas[4], 0),
                          H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);

      update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
                          deltas[1], vgetq_lane_s32(deltas[4], 1),
                          H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);

      update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
                          deltas[2], vgetq_lane_s32(deltas[4], 2),
                          H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);

      update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
                          deltas[3], vgetq_lane_s32(deltas[4], 3),
                          H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
    }
  }

  // Step 4: Derive the top and left edge of each square. No square in top and
  // bottom row.
  {
    y = h8;

    int16x4_t d_s[12];
    int16x4_t d_e[12];
    const int16_t *d_t = d;
    int16x4_t zeros = vdup_n_s16(0);
    load_s16_4x4(d_t, d_stride, &d_s[0], &d_s[1], &d_s[2], &d_s[3]);
    load_s16_4x4(d_t + width, d_stride, &d_e[0], &d_e[1], &d_e[2], &d_e[3]);
    int64x2_t deltas[6][18] = { { vdupq_n_s64(0) }, { vdupq_n_s64(0) } };

    while (y >= 8) {
      load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6],
                   &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]);
      load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5],
                   &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]);

      int16x8_t s_tr[8], e_tr[8];
      transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5],
                              d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2],
                              &s_tr[3]);
      transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros,
                              zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6],
                              &s_tr[7]);

      transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5],
                              d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2],
                              &e_tr[3]);
      transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros,
                              zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6],
                              &e_tr[7]);

      int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5];
      start_col0[0] = s_tr[0];
      start_col0[1] = vextq_s16(s_tr[0], s_tr[4], 1);
      start_col0[2] = vextq_s16(s_tr[0], s_tr[4], 2);
      start_col0[3] = vextq_s16(s_tr[0], s_tr[4], 3);
      start_col0[4] = vextq_s16(s_tr[0], s_tr[4], 4);

      start_col1[0] = s_tr[1];
      start_col1[1] = vextq_s16(s_tr[1], s_tr[5], 1);
      start_col1[2] = vextq_s16(s_tr[1], s_tr[5], 2);
      start_col1[3] = vextq_s16(s_tr[1], s_tr[5], 3);
      start_col1[4] = vextq_s16(s_tr[1], s_tr[5], 4);

      start_col2[0] = s_tr[2];
      start_col2[1] = vextq_s16(s_tr[2], s_tr[6], 1);
      start_col2[2] = vextq_s16(s_tr[2], s_tr[6], 2);
      start_col2[3] = vextq_s16(s_tr[2], s_tr[6], 3);
      start_col2[4] = vextq_s16(s_tr[2], s_tr[6], 4);

      start_col3[0] = s_tr[3];
      start_col3[1] = vextq_s16(s_tr[3], s_tr[7], 1);
      start_col3[2] = vextq_s16(s_tr[3], s_tr[7], 2);
      start_col3[3] = vextq_s16(s_tr[3], s_tr[7], 3);
      start_col3[4] = vextq_s16(s_tr[3], s_tr[7], 4);

      // i = 1, j = 2
      sub_deltas_step4_sve(start_col0, start_col1, deltas[0]);

      // i = 1, j = 3
      sub_deltas_step4_sve(start_col0, start_col2, deltas[1]);

      // i = 1, j = 4
      sub_deltas_step4_sve(start_col0, start_col3, deltas[2]);

      // i = 2, j = 3
      sub_deltas_step4_sve(start_col1, start_col2, deltas[3]);

      // i = 2, j = 4
      sub_deltas_step4_sve(start_col1, start_col3, deltas[4]);

      // i = 3, j = 4
      sub_deltas_step4_sve(start_col2, start_col3, deltas[5]);

      int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5];
      end_col0[0] = e_tr[0];
      end_col0[1] = vextq_s16(e_tr[0], e_tr[4], 1);
      end_col0[2] = vextq_s16(e_tr[0], e_tr[4], 2);
      end_col0[3] = vextq_s16(e_tr[0], e_tr[4], 3);
      end_col0[4] = vextq_s16(e_tr[0], e_tr[4], 4);

      end_col1[0] = e_tr[1];
      end_col1[1] = vextq_s16(e_tr[1], e_tr[5], 1);
      end_col1[2] = vextq_s16(e_tr[1], e_tr[5], 2);
      end_col1[3] = vextq_s16(e_tr[1], e_tr[5], 3);
      end_col1[4] = vextq_s16(e_tr[1], e_tr[5], 4);

      end_col2[0] = e_tr[2];
      end_col2[1] = vextq_s16(e_tr[2], e_tr[6], 1);
      end_col2[2] = vextq_s16(e_tr[2], e_tr[6], 2);
      end_col2[3] = vextq_s16(e_tr[2], e_tr[6], 3);
      end_col2[4] = vextq_s16(e_tr[2], e_tr[6], 4);

      end_col3[0] = e_tr[3];
      end_col3[1] = vextq_s16(e_tr[3], e_tr[7], 1);
      end_col3[2] = vextq_s16(e_tr[3], e_tr[7], 2);
      end_col3[3] = vextq_s16(e_tr[3], e_tr[7], 3);
      end_col3[4] = vextq_s16(e_tr[3], e_tr[7], 4);

      // i = 1, j = 2
      add_deltas_step4_sve(end_col0, end_col1, deltas[0]);

      // i = 1, j = 3
      add_deltas_step4_sve(end_col0, end_col2, deltas[1]);

      // i = 1, j = 4
      add_deltas_step4_sve(end_col0, end_col3, deltas[2]);

      // i = 2, j = 3
      add_deltas_step4_sve(end_col1, end_col2, deltas[3]);

      // i = 2, j = 4
      add_deltas_step4_sve(end_col1, end_col3, deltas[4]);

      // i = 3, j = 4
      add_deltas_step4_sve(end_col2, end_col3, deltas[5]);

      d_s[0] = d_s[8];
      d_s[1] = d_s[9];
      d_s[2] = d_s[10];
      d_s[3] = d_s[11];
      d_e[0] = d_e[8];
      d_e[1] = d_e[9];
      d_e[2] = d_e[10];
      d_e[3] = d_e[11];

      d_t += 8 * d_stride;
      y -= 8;
    }

    if (h8 != height) {
      const int16x8_t mask_h = vld1q_s16(&mask_16bit[16] - (height % 8));

      load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6],
                   &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]);
      load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5],
                   &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]);
      int16x8_t s_tr[8], e_tr[8];
      transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5],
                              d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2],
                              &s_tr[3]);
      transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros,
                              zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6],
                              &s_tr[7]);
      transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5],
                              d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2],
                              &e_tr[3]);
      transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros,
                              zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6],
                              &e_tr[7]);

      int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5];
      start_col0[0] = vandq_s16(s_tr[0], mask_h);
      start_col0[1] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 1), mask_h);
      start_col0[2] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 2), mask_h);
      start_col0[3] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 3), mask_h);
      start_col0[4] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 4), mask_h);

      start_col1[0] = vandq_s16(s_tr[1], mask_h);
      start_col1[1] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 1), mask_h);
      start_col1[2] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 2), mask_h);
      start_col1[3] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 3), mask_h);
      start_col1[4] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 4), mask_h);

      start_col2[0] = vandq_s16(s_tr[2], mask_h);
      start_col2[1] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 1), mask_h);
      start_col2[2] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 2), mask_h);
      start_col2[3] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 3), mask_h);
      start_col2[4] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 4), mask_h);

      start_col3[0] = vandq_s16(s_tr[3], mask_h);
      start_col3[1] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 1), mask_h);
      start_col3[2] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 2), mask_h);
      start_col3[3] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 3), mask_h);
      start_col3[4] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 4), mask_h);

      // i = 1, j = 2
      sub_deltas_step4_sve(start_col0, start_col1, deltas[0]);

      // i = 1, j = 3
      sub_deltas_step4_sve(start_col0, start_col2, deltas[1]);

      // i = 1, j = 4
      sub_deltas_step4_sve(start_col0, start_col3, deltas[2]);

      // i = 2, j = 3
      sub_deltas_step4_sve(start_col1, start_col2, deltas[3]);

      // i = 2, j = 4
      sub_deltas_step4_sve(start_col1, start_col3, deltas[4]);

      // i = 3, j = 4
      sub_deltas_step4_sve(start_col2, start_col3, deltas[5]);

      int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5];
      end_col0[0] = vandq_s16(e_tr[0], mask_h);
      end_col0[1] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 1), mask_h);
      end_col0[2] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 2), mask_h);
      end_col0[3] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 3), mask_h);
      end_col0[4] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 4), mask_h);

      end_col1[0] = vandq_s16(e_tr[1], mask_h);
      end_col1[1] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 1), mask_h);
      end_col1[2] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 2), mask_h);
      end_col1[3] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 3), mask_h);
      end_col1[4] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 4), mask_h);

      end_col2[0] = vandq_s16(e_tr[2], mask_h);
      end_col2[1] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 1), mask_h);
      end_col2[2] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 2), mask_h);
      end_col2[3] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 3), mask_h);
      end_col2[4] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 4), mask_h);

      end_col3[0] = vandq_s16(e_tr[3], mask_h);
      end_col3[1] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 1), mask_h);
      end_col3[2] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 2), mask_h);
      end_col3[3] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 3), mask_h);
      end_col3[4] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 4), mask_h);

      // i = 1, j = 2
      add_deltas_step4_sve(end_col0, end_col1, deltas[0]);

      // i = 1, j = 3
      add_deltas_step4_sve(end_col0, end_col2, deltas[1]);

      // i = 1, j = 4
      add_deltas_step4_sve(end_col0, end_col3, deltas[2]);

      // i = 2, j = 3
      add_deltas_step4_sve(end_col1, end_col2, deltas[3]);

      // i = 2, j = 4
      add_deltas_step4_sve(end_col1, end_col3, deltas[4]);

      // i = 3, j = 4
      add_deltas_step4_sve(end_col2, end_col3, deltas[5]);
    }

    int64_t single_delta[6];

    deltas[0][0] = vpaddq_s64(deltas[0][0], deltas[0][1]);
    deltas[0][1] = vpaddq_s64(deltas[0][2], deltas[0][3]);
    deltas[1][0] = vpaddq_s64(deltas[1][0], deltas[1][1]);
    deltas[1][1] = vpaddq_s64(deltas[1][2], deltas[1][3]);
    deltas[2][0] = vpaddq_s64(deltas[2][0], deltas[2][1]);
    deltas[2][1] = vpaddq_s64(deltas[2][2], deltas[2][3]);
    deltas[3][0] = vpaddq_s64(deltas[3][0], deltas[3][1]);
    deltas[3][1] = vpaddq_s64(deltas[3][2], deltas[3][3]);
    deltas[4][0] = vpaddq_s64(deltas[4][0], deltas[4][1]);
    deltas[4][1] = vpaddq_s64(deltas[4][2], deltas[4][3]);
    deltas[5][0] = vpaddq_s64(deltas[5][0], deltas[5][1]);
    deltas[5][1] = vpaddq_s64(deltas[5][2], deltas[5][3]);

    deltas[0][5] = vpaddq_s64(deltas[0][5], deltas[0][6]);
    deltas[0][7] = vpaddq_s64(deltas[0][7], deltas[0][8]);
    deltas[1][5] = vpaddq_s64(deltas[1][5], deltas[1][6]);
    deltas[1][7] = vpaddq_s64(deltas[1][7], deltas[1][8]);
    deltas[2][5] = vpaddq_s64(deltas[2][5], deltas[2][6]);
    deltas[2][7] = vpaddq_s64(deltas[2][7], deltas[2][8]);
    deltas[3][5] = vpaddq_s64(deltas[3][5], deltas[3][6]);
    deltas[3][7] = vpaddq_s64(deltas[3][7], deltas[3][8]);
    deltas[4][5] = vpaddq_s64(deltas[4][5], deltas[4][6]);
    deltas[4][7] = vpaddq_s64(deltas[4][7], deltas[4][8]);
    deltas[5][5] = vpaddq_s64(deltas[5][5], deltas[5][6]);
    deltas[5][7] = vpaddq_s64(deltas[5][7], deltas[5][8]);

    vst1q_s64(single_delta + 0, vpaddq_s64(deltas[0][4], deltas[1][4]));
    vst1q_s64(single_delta + 2, vpaddq_s64(deltas[2][4], deltas[3][4]));
    vst1q_s64(single_delta + 4, vpaddq_s64(deltas[4][4], deltas[5][4]));

    int idx = 0;
    for (i = 1; i < wiener_win - 1; i++) {
      for (j = i + 1; j < wiener_win; j++) {
        update_4_stats_sve(
            H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win,
            deltas[idx], H + i * wiener_win * wiener_win2 + j * wiener_win);
        H[i * wiener_win * wiener_win2 + j * wiener_win + 4] =
            H[(i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win + 4] +
            single_delta[idx];

        H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
            H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] +
            vgetq_lane_s64(deltas[idx][5], 0);
        H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
            H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] +
            vgetq_lane_s64(deltas[idx][5], 1);
        H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
            H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] +
            vgetq_lane_s64(deltas[idx][7], 0);
        H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
            H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] +
            vgetq_lane_s64(deltas[idx][7], 1);

        idx++;
      }
    }
  }

  // Step 5: Derive other points of each square. No square in bottom row.
  i = 0;
  do {
    const int16_t *const di = d + i;

    j = i + 1;
    do {
      const int16_t *const dj = d + j;
      int64x2_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1] = {
        { vdupq_n_s64(0) }, { vdupq_n_s64(0) }
      };
      int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];
      int16x8_t d_js[WIN_CHROMA], d_je[WIN_CHROMA];

      x = 0;
      while (x < width - 16) {
        load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
                              d_js, d_je);
        derive_square_win5_sve(d_is, d_ie, d_js, d_je, deltas);
        x += 16;
      }

      load_square_win5_sve(di + x, dj + x, d_stride, height, d_is, d_ie, d_js,
                           d_je, p0, p1);
      derive_square_win5_sve(d_is, d_ie, d_js, d_je, deltas);

      hadd_update_4_stats_sve(
          H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0],
          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
      hadd_update_4_stats_sve(
          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1],
          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
      hadd_update_4_stats_sve(
          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2],
          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1);
      hadd_update_4_stats_sve(
          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3],
          H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1);
    } while (++j < wiener_win);
  } while (++i < wiener_win - 1);

  // Step 6: Derive other points of each upper triangle along the diagonal.
  i = 0;
  do {
    const int16_t *const di = d + i;
    int64x2_t deltas[WIENER_WIN_CHROMA * 2 + 1] = { vdupq_n_s64(0) };
    int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];

    x = 0;
    while (x < width - 16) {
      load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie);
      derive_triangle_win5_sve(d_is, d_ie, deltas);
      x += 16;
    }

    load_triangle_win5_sve(di + x, d_stride, height, d_is, d_ie, p0, p1);
    derive_triangle_win5_sve(d_is, d_ie, deltas);

    // Row 1: 4 points
    hadd_update_4_stats_sve(
        H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas,
        H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);

    // Row 2: 3 points
    int64x2_t src0 =
        vld1q_s64(H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
    vst1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2,
              vaddq_s64(src0, vpaddq_s64(deltas[4], deltas[5])));

    int64x2_t deltas69 = vpaddq_s64(deltas[6], deltas[9]);

    H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 4] =
        H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 3] +
        vgetq_lane_s64(deltas69, 0);

    // Row 3: 2 points
    int64x2_t src1 =
        vld1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2);
    vst1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3,
              vaddq_s64(src1, vpaddq_s64(deltas[7], deltas[8])));

    // Row 4: 1 point
    H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] =
        H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3] +
        vgetq_lane_s64(deltas69, 1);
  } while (++i < wiener_win);
}
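
// As stats_top_win5_sve, but accumulating seven rows for the 7-tap window.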
static inline void stats_top_win7_sve(const int16x8_t src[2],
                                      const int16x8_t dgd[2],
                                      const int16_t *const d,
                                      const int32_t d_stride, int64x2_t *sum_m,
                                      int64x2_t *sum_h) {
  int16x8_t dgds[WIENER_WIN * 2];

  load_s16_8x7(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6],
               &dgds[8], &dgds[10], &dgds[12]);
  load_s16_8x7(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7],
               &dgds[9], &dgds[11], &dgds[13]);

  sum_m[0] = aom_sdotq_s16(sum_m[0], src[0], dgds[0]);
  sum_m[0] = aom_sdotq_s16(sum_m[0], src[1], dgds[1]);
  sum_m[1] = aom_sdotq_s16(sum_m[1], src[0], dgds[2]);
  sum_m[1] = aom_sdotq_s16(sum_m[1], src[1], dgds[3]);
  sum_m[2] = aom_sdotq_s16(sum_m[2], src[0], dgds[4]);
  sum_m[2] = aom_sdotq_s16(sum_m[2], src[1], dgds[5]);
  sum_m[3] = aom_sdotq_s16(sum_m[3], src[0], dgds[6]);
  sum_m[3] = aom_sdotq_s16(sum_m[3], src[1], dgds[7]);
  sum_m[4] = aom_sdotq_s16(sum_m[4], src[0], dgds[8]);
  sum_m[4] = aom_sdotq_s16(sum_m[4], src[1], dgds[9]);
  sum_m[5] = aom_sdotq_s16(sum_m[5], src[0], dgds[10]);
  sum_m[5] = aom_sdotq_s16(sum_m[5], src[1], dgds[11]);
  sum_m[6] = aom_sdotq_s16(sum_m[6], src[0], dgds[12]);
  sum_m[6] = aom_sdotq_s16(sum_m[6], src[1], dgds[13]);

  sum_h[0] = aom_sdotq_s16(sum_h[0], dgd[0], dgds[0]);
  sum_h[0] = aom_sdotq_s16(sum_h[0], dgd[1], dgds[1]);
  sum_h[1] = aom_sdotq_s16(sum_h[1], dgd[0], dgds[2]);
  sum_h[1] = aom_sdotq_s16(sum_h[1], dgd[1], dgds[3]);
  sum_h[2] = aom_sdotq_s16(sum_h[2], dgd[0], dgds[4]);
  sum_h[2] = aom_sdotq_s16(sum_h[2], dgd[1], dgds[5]);
  sum_h[3] = aom_sdotq_s16(sum_h[3], dgd[0], dgds[6]);
  sum_h[3] = aom_sdotq_s16(sum_h[3], dgd[1], dgds[7]);
  sum_h[4] = aom_sdotq_s16(sum_h[4], dgd[0], dgds[8]);
  sum_h[4] = aom_sdotq_s16(sum_h[4], dgd[1], dgds[9]);
  sum_h[5] = aom_sdotq_s16(sum_h[5], dgd[0], dgds[10]);
  sum_h[5] = aom_sdotq_s16(sum_h[5], dgd[1], dgds[11]);
  sum_h[6] = aom_sdotq_s16(sum_h[6], dgd[0], dgds[12]);
  sum_h[6] = aom_sdotq_s16(sum_h[6], dgd[1], dgds[13]);
}
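
// As stats_left_win5_sve, but covering the six rows below d needed by the
// 7-tap window.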
static inline void stats_left_win7_sve(const int16x8_t src[2], const int16_t *d,
                                       const int32_t d_stride, int64x2_t *sum) {
  int16x8_t dgds[WIN_7];

  load_s16_8x6(d + d_stride + 0, d_stride, &dgds[0], &dgds[2], &dgds[4],
               &dgds[6], &dgds[8], &dgds[10]);
  load_s16_8x6(d + d_stride + 8, d_stride, &dgds[1], &dgds[3], &dgds[5],
               &dgds[7], &dgds[9], &dgds[11]);

  sum[0] = aom_sdotq_s16(sum[0], src[0], dgds[0]);
  sum[0] = aom_sdotq_s16(sum[0], src[1], dgds[1]);
  sum[1] = aom_sdotq_s16(sum[1], src[0], dgds[2]);
  sum[1] = aom_sdotq_s16(sum[1], src[1], dgds[3]);
  sum[2] = aom_sdotq_s16(sum[2], src[0], dgds[4]);
  sum[2] = aom_sdotq_s16(sum[2], src[1], dgds[5]);
  sum[3] = aom_sdotq_s16(sum[3], src[0], dgds[6]);
  sum[3] = aom_sdotq_s16(sum[3], src[1], dgds[7]);
  sum[4] = aom_sdotq_s16(sum[4], src[0], dgds[8]);
  sum[4] = aom_sdotq_s16(sum[4], src[1], dgds[9]);
  sum[5] = aom_sdotq_s16(sum[5], src[0], dgds[10]);
  sum[5] = aom_sdotq_s16(sum[5], src[1], dgds[11]);
}
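
// As load_square_win5_sve, but loading the six row pairs of the 7-tap
// window.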
static inline void load_square_win7_sve(
    const int16_t *const di, const int16_t *const dj, const int32_t d_stride,
    const int32_t height, int16x8_t *d_is, int16x8_t *d_ie, int16x8_t *d_js,
    int16x8_t *d_je, svbool_t p0, svbool_t p1) {
  d_is[0] = svget_neonq_s16(svld1_s16(p0, di + 0 * d_stride + 0));
  d_is[1] = svget_neonq_s16(svld1_s16(p1, di + 0 * d_stride + 8));
  d_is[2] = svget_neonq_s16(svld1_s16(p0, di + 1 * d_stride + 0));
  d_is[3] = svget_neonq_s16(svld1_s16(p1, di + 1 * d_stride + 8));
  d_is[4] = svget_neonq_s16(svld1_s16(p0, di + 2 * d_stride + 0));
  d_is[5] = svget_neonq_s16(svld1_s16(p1, di + 2 * d_stride + 8));
  d_is[6] = svget_neonq_s16(svld1_s16(p0, di + 3 * d_stride + 0));
  d_is[7] = svget_neonq_s16(svld1_s16(p1, di + 3 * d_stride + 8));
  d_is[8] = svget_neonq_s16(svld1_s16(p0, di + 4 * d_stride + 0));
  d_is[9] = svget_neonq_s16(svld1_s16(p1, di + 4 * d_stride + 8));
  d_is[10] = svget_neonq_s16(svld1_s16(p0, di + 5 * d_stride + 0));
  d_is[11] = svget_neonq_s16(svld1_s16(p1, di + 5 * d_stride + 8));

  d_ie[0] = svget_neonq_s16(svld1_s16(p0, di + (height + 0) * d_stride + 0));
  d_ie[1] = svget_neonq_s16(svld1_s16(p1, di + (height + 0) * d_stride + 8));
  d_ie[2] = svget_neonq_s16(svld1_s16(p0, di + (height + 1) * d_stride + 0));
  d_ie[3] = svget_neonq_s16(svld1_s16(p1, di + (height + 1) * d_stride + 8));
  d_ie[4] = svget_neonq_s16(svld1_s16(p0, di + (height + 2) * d_stride + 0));
  d_ie[5] = svget_neonq_s16(svld1_s16(p1, di + (height + 2) * d_stride + 8));
  d_ie[6] = svget_neonq_s16(svld1_s16(p0, di + (height + 3) * d_stride + 0));
  d_ie[7] = svget_neonq_s16(svld1_s16(p1, di + (height + 3) * d_stride + 8));
  d_ie[8] = svget_neonq_s16(svld1_s16(p0, di + (height + 4) * d_stride + 0));
  d_ie[9] = svget_neonq_s16(svld1_s16(p1, di + (height + 4) * d_stride + 8));
  d_ie[10] = svget_neonq_s16(svld1_s16(p0, di + (height + 5) * d_stride + 0));
  d_ie[11] = svget_neonq_s16(svld1_s16(p1, di + (height + 5) * d_stride + 8));

  load_s16_8x6(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6],
               &d_js[8], &d_js[10]);
  load_s16_8x6(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7],
               &d_js[9], &d_js[11]);
  load_s16_8x6(dj + height * d_stride + 0, d_stride, &d_je[0], &d_je[2],
               &d_je[4], &d_je[6], &d_je[8], &d_je[10]);
  load_s16_8x6(dj + height * d_stride + 8, d_stride, &d_je[1], &d_je[3],
               &d_je[5], &d_je[7], &d_je[9], &d_je[11]);
}
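
// As derive_square_win5_sve, but updating the delta block between the six
// row pairs of two 7-tap window columns.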
1084 
static inline void derive_square_win7_sve(int16x8_t *d_is,
                                          const int16x8_t *d_ie,
                                          const int16x8_t *d_js,
                                          const int16x8_t *d_je,
                                          int64x2_t deltas[][WIN_7]) {
  d_is[0] = vnegq_s16(d_is[0]);
  d_is[1] = vnegq_s16(d_is[1]);
  d_is[2] = vnegq_s16(d_is[2]);
  d_is[3] = vnegq_s16(d_is[3]);
  d_is[4] = vnegq_s16(d_is[4]);
  d_is[5] = vnegq_s16(d_is[5]);
  d_is[6] = vnegq_s16(d_is[6]);
  d_is[7] = vnegq_s16(d_is[7]);
  d_is[8] = vnegq_s16(d_is[8]);
  d_is[9] = vnegq_s16(d_is[9]);
  d_is[10] = vnegq_s16(d_is[10]);
  d_is[11] = vnegq_s16(d_is[11]);

  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_is[0], d_js[0]);
  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_is[1], d_js[1]);
  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_is[0], d_js[2]);
  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_is[1], d_js[3]);
  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_is[0], d_js[4]);
  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_is[1], d_js[5]);
  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_is[0], d_js[6]);
  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_is[1], d_js[7]);
  deltas[0][4] = aom_sdotq_s16(deltas[0][4], d_is[0], d_js[8]);
  deltas[0][4] = aom_sdotq_s16(deltas[0][4], d_is[1], d_js[9]);
  deltas[0][5] = aom_sdotq_s16(deltas[0][5], d_is[0], d_js[10]);
  deltas[0][5] = aom_sdotq_s16(deltas[0][5], d_is[1], d_js[11]);

  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_is[2], d_js[0]);
  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_is[3], d_js[1]);
  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_is[2], d_js[2]);
  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_is[3], d_js[3]);
  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_is[2], d_js[4]);
  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_is[3], d_js[5]);
  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_is[2], d_js[6]);
  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_is[3], d_js[7]);
  deltas[1][4] = aom_sdotq_s16(deltas[1][4], d_is[2], d_js[8]);
  deltas[1][4] = aom_sdotq_s16(deltas[1][4], d_is[3], d_js[9]);
  deltas[1][5] = aom_sdotq_s16(deltas[1][5], d_is[2], d_js[10]);
  deltas[1][5] = aom_sdotq_s16(deltas[1][5], d_is[3], d_js[11]);

  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_is[4], d_js[0]);
  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_is[5], d_js[1]);
  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_is[4], d_js[2]);
  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_is[5], d_js[3]);
  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_is[4], d_js[4]);
  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_is[5], d_js[5]);
  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_is[4], d_js[6]);
  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_is[5], d_js[7]);
  deltas[2][4] = aom_sdotq_s16(deltas[2][4], d_is[4], d_js[8]);
  deltas[2][4] = aom_sdotq_s16(deltas[2][4], d_is[5], d_js[9]);
  deltas[2][5] = aom_sdotq_s16(deltas[2][5], d_is[4], d_js[10]);
  deltas[2][5] = aom_sdotq_s16(deltas[2][5], d_is[5], d_js[11]);

  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_is[6], d_js[0]);
  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_is[7], d_js[1]);
  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_is[6], d_js[2]);
  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_is[7], d_js[3]);
  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_is[6], d_js[4]);
  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_is[7], d_js[5]);
  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_is[6], d_js[6]);
  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_is[7], d_js[7]);
  deltas[3][4] = aom_sdotq_s16(deltas[3][4], d_is[6], d_js[8]);
  deltas[3][4] = aom_sdotq_s16(deltas[3][4], d_is[7], d_js[9]);
  deltas[3][5] = aom_sdotq_s16(deltas[3][5], d_is[6], d_js[10]);
  deltas[3][5] = aom_sdotq_s16(deltas[3][5], d_is[7], d_js[11]);

  deltas[4][0] = aom_sdotq_s16(deltas[4][0], d_is[8], d_js[0]);
  deltas[4][0] = aom_sdotq_s16(deltas[4][0], d_is[9], d_js[1]);
  deltas[4][1] = aom_sdotq_s16(deltas[4][1], d_is[8], d_js[2]);
  deltas[4][1] = aom_sdotq_s16(deltas[4][1], d_is[9], d_js[3]);
  deltas[4][2] = aom_sdotq_s16(deltas[4][2], d_is[8], d_js[4]);
  deltas[4][2] = aom_sdotq_s16(deltas[4][2], d_is[9], d_js[5]);
  deltas[4][3] = aom_sdotq_s16(deltas[4][3], d_is[8], d_js[6]);
  deltas[4][3] = aom_sdotq_s16(deltas[4][3], d_is[9], d_js[7]);
  deltas[4][4] = aom_sdotq_s16(deltas[4][4], d_is[8], d_js[8]);
  deltas[4][4] = aom_sdotq_s16(deltas[4][4], d_is[9], d_js[9]);
  deltas[4][5] = aom_sdotq_s16(deltas[4][5], d_is[8], d_js[10]);
  deltas[4][5] = aom_sdotq_s16(deltas[4][5], d_is[9], d_js[11]);

  deltas[5][0] = aom_sdotq_s16(deltas[5][0], d_is[10], d_js[0]);
  deltas[5][0] = aom_sdotq_s16(deltas[5][0], d_is[11], d_js[1]);
  deltas[5][1] = aom_sdotq_s16(deltas[5][1], d_is[10], d_js[2]);
  deltas[5][1] = aom_sdotq_s16(deltas[5][1], d_is[11], d_js[3]);
  deltas[5][2] = aom_sdotq_s16(deltas[5][2], d_is[10], d_js[4]);
  deltas[5][2] = aom_sdotq_s16(deltas[5][2], d_is[11], d_js[5]);
  deltas[5][3] = aom_sdotq_s16(deltas[5][3], d_is[10], d_js[6]);
  deltas[5][3] = aom_sdotq_s16(deltas[5][3], d_is[11], d_js[7]);
  deltas[5][4] = aom_sdotq_s16(deltas[5][4], d_is[10], d_js[8]);
  deltas[5][4] = aom_sdotq_s16(deltas[5][4], d_is[11], d_js[9]);
  deltas[5][5] = aom_sdotq_s16(deltas[5][5], d_is[10], d_js[10]);
  deltas[5][5] = aom_sdotq_s16(deltas[5][5], d_is[11], d_js[11]);

  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_ie[0], d_je[0]);
  deltas[0][0] = aom_sdotq_s16(deltas[0][0], d_ie[1], d_je[1]);
  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_ie[0], d_je[2]);
  deltas[0][1] = aom_sdotq_s16(deltas[0][1], d_ie[1], d_je[3]);
  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_ie[0], d_je[4]);
  deltas[0][2] = aom_sdotq_s16(deltas[0][2], d_ie[1], d_je[5]);
  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_ie[0], d_je[6]);
  deltas[0][3] = aom_sdotq_s16(deltas[0][3], d_ie[1], d_je[7]);
  deltas[0][4] = aom_sdotq_s16(deltas[0][4], d_ie[0], d_je[8]);
  deltas[0][4] = aom_sdotq_s16(deltas[0][4], d_ie[1], d_je[9]);
  deltas[0][5] = aom_sdotq_s16(deltas[0][5], d_ie[0], d_je[10]);
  deltas[0][5] = aom_sdotq_s16(deltas[0][5], d_ie[1], d_je[11]);

  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_ie[2], d_je[0]);
  deltas[1][0] = aom_sdotq_s16(deltas[1][0], d_ie[3], d_je[1]);
  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_ie[2], d_je[2]);
  deltas[1][1] = aom_sdotq_s16(deltas[1][1], d_ie[3], d_je[3]);
  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_ie[2], d_je[4]);
  deltas[1][2] = aom_sdotq_s16(deltas[1][2], d_ie[3], d_je[5]);
  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_ie[2], d_je[6]);
  deltas[1][3] = aom_sdotq_s16(deltas[1][3], d_ie[3], d_je[7]);
  deltas[1][4] = aom_sdotq_s16(deltas[1][4], d_ie[2], d_je[8]);
  deltas[1][4] = aom_sdotq_s16(deltas[1][4], d_ie[3], d_je[9]);
  deltas[1][5] = aom_sdotq_s16(deltas[1][5], d_ie[2], d_je[10]);
  deltas[1][5] = aom_sdotq_s16(deltas[1][5], d_ie[3], d_je[11]);

  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_ie[4], d_je[0]);
  deltas[2][0] = aom_sdotq_s16(deltas[2][0], d_ie[5], d_je[1]);
  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_ie[4], d_je[2]);
  deltas[2][1] = aom_sdotq_s16(deltas[2][1], d_ie[5], d_je[3]);
  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_ie[4], d_je[4]);
  deltas[2][2] = aom_sdotq_s16(deltas[2][2], d_ie[5], d_je[5]);
  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_ie[4], d_je[6]);
  deltas[2][3] = aom_sdotq_s16(deltas[2][3], d_ie[5], d_je[7]);
  deltas[2][4] = aom_sdotq_s16(deltas[2][4], d_ie[4], d_je[8]);
  deltas[2][4] = aom_sdotq_s16(deltas[2][4], d_ie[5], d_je[9]);
  deltas[2][5] = aom_sdotq_s16(deltas[2][5], d_ie[4], d_je[10]);
  deltas[2][5] = aom_sdotq_s16(deltas[2][5], d_ie[5], d_je[11]);

  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_ie[6], d_je[0]);
  deltas[3][0] = aom_sdotq_s16(deltas[3][0], d_ie[7], d_je[1]);
  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_ie[6], d_je[2]);
  deltas[3][1] = aom_sdotq_s16(deltas[3][1], d_ie[7], d_je[3]);
  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_ie[6], d_je[4]);
  deltas[3][2] = aom_sdotq_s16(deltas[3][2], d_ie[7], d_je[5]);
  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_ie[6], d_je[6]);
  deltas[3][3] = aom_sdotq_s16(deltas[3][3], d_ie[7], d_je[7]);
  deltas[3][4] = aom_sdotq_s16(deltas[3][4], d_ie[6], d_je[8]);
  deltas[3][4] = aom_sdotq_s16(deltas[3][4], d_ie[7], d_je[9]);
  deltas[3][5] = aom_sdotq_s16(deltas[3][5], d_ie[6], d_je[10]);
  deltas[3][5] = aom_sdotq_s16(deltas[3][5], d_ie[7], d_je[11]);

  deltas[4][0] = aom_sdotq_s16(deltas[4][0], d_ie[8], d_je[0]);
  deltas[4][0] = aom_sdotq_s16(deltas[4][0], d_ie[9], d_je[1]);
  deltas[4][1] = aom_sdotq_s16(deltas[4][1], d_ie[8], d_je[2]);
  deltas[4][1] = aom_sdotq_s16(deltas[4][1], d_ie[9], d_je[3]);
  deltas[4][2] = aom_sdotq_s16(deltas[4][2], d_ie[8], d_je[4]);
  deltas[4][2] = aom_sdotq_s16(deltas[4][2], d_ie[9], d_je[5]);
  deltas[4][3] = aom_sdotq_s16(deltas[4][3], d_ie[8], d_je[6]);
  deltas[4][3] = aom_sdotq_s16(deltas[4][3], d_ie[9], d_je[7]);
  deltas[4][4] = aom_sdotq_s16(deltas[4][4], d_ie[8], d_je[8]);
  deltas[4][4] = aom_sdotq_s16(deltas[4][4], d_ie[9], d_je[9]);
  deltas[4][5] = aom_sdotq_s16(deltas[4][5], d_ie[8], d_je[10]);
  deltas[4][5] = aom_sdotq_s16(deltas[4][5], d_ie[9], d_je[11]);

  deltas[5][0] = aom_sdotq_s16(deltas[5][0], d_ie[10], d_je[0]);
  deltas[5][0] = aom_sdotq_s16(deltas[5][0], d_ie[11], d_je[1]);
  deltas[5][1] = aom_sdotq_s16(deltas[5][1], d_ie[10], d_je[2]);
  deltas[5][1] = aom_sdotq_s16(deltas[5][1], d_ie[11], d_je[3]);
  deltas[5][2] = aom_sdotq_s16(deltas[5][2], d_ie[10], d_je[4]);
  deltas[5][2] = aom_sdotq_s16(deltas[5][2], d_ie[11], d_je[5]);
  deltas[5][3] = aom_sdotq_s16(deltas[5][3], d_ie[10], d_je[6]);
  deltas[5][3] = aom_sdotq_s16(deltas[5][3], d_ie[11], d_je[7]);
  deltas[5][4] = aom_sdotq_s16(deltas[5][4], d_ie[10], d_je[8]);
  deltas[5][4] = aom_sdotq_s16(deltas[5][4], d_ie[11], d_je[9]);
  deltas[5][5] = aom_sdotq_s16(deltas[5][5], d_ie[10], d_je[10]);
  deltas[5][5] = aom_sdotq_s16(deltas[5][5], d_ie[11], d_je[11]);
}

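// Horizontally add each pair of delta accumulators down to scalars and add
// the results to the six statistics loaded from src, storing them to dst.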
static inline void hadd_update_6_stats_sve(const int64_t *const src,
                                           const int64x2_t *deltas,
                                           int64_t *const dst) {
  int64x2_t src0 = vld1q_s64(src + 0);
  int64x2_t src1 = vld1q_s64(src + 2);
  int64x2_t src2 = vld1q_s64(src + 4);

  int64x2_t deltas01 = vpaddq_s64(deltas[0], deltas[1]);
  int64x2_t deltas23 = vpaddq_s64(deltas[2], deltas[3]);
  int64x2_t deltas45 = vpaddq_s64(deltas[4], deltas[5]);

  vst1q_s64(dst + 0, vaddq_s64(src0, deltas01));
  vst1q_s64(dst + 2, vaddq_s64(src1, deltas23));
  vst1q_s64(dst + 4, vaddq_s64(src2, deltas45));
}

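// Triangle variant of the window load: only the 'di' column is needed, again
// loading the first and last rows with predicated SVE loads.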
static inline void load_triangle_win7_sve(const int16_t *const di,
                                          const int32_t d_stride,
                                          const int32_t height, int16x8_t *d_is,
                                          int16x8_t *d_ie, svbool_t p0,
                                          svbool_t p1) {
  d_is[0] = svget_neonq_s16(svld1_s16(p0, di + 0 * d_stride + 0));
  d_is[1] = svget_neonq_s16(svld1_s16(p1, di + 0 * d_stride + 8));
  d_is[2] = svget_neonq_s16(svld1_s16(p0, di + 1 * d_stride + 0));
  d_is[3] = svget_neonq_s16(svld1_s16(p1, di + 1 * d_stride + 8));
  d_is[4] = svget_neonq_s16(svld1_s16(p0, di + 2 * d_stride + 0));
  d_is[5] = svget_neonq_s16(svld1_s16(p1, di + 2 * d_stride + 8));
  d_is[6] = svget_neonq_s16(svld1_s16(p0, di + 3 * d_stride + 0));
  d_is[7] = svget_neonq_s16(svld1_s16(p1, di + 3 * d_stride + 8));
  d_is[8] = svget_neonq_s16(svld1_s16(p0, di + 4 * d_stride + 0));
  d_is[9] = svget_neonq_s16(svld1_s16(p1, di + 4 * d_stride + 8));
  d_is[10] = svget_neonq_s16(svld1_s16(p0, di + 5 * d_stride + 0));
  d_is[11] = svget_neonq_s16(svld1_s16(p1, di + 5 * d_stride + 8));

  d_ie[0] = svget_neonq_s16(svld1_s16(p0, di + (height + 0) * d_stride + 0));
  d_ie[1] = svget_neonq_s16(svld1_s16(p1, di + (height + 0) * d_stride + 8));
  d_ie[2] = svget_neonq_s16(svld1_s16(p0, di + (height + 1) * d_stride + 0));
  d_ie[3] = svget_neonq_s16(svld1_s16(p1, di + (height + 1) * d_stride + 8));
  d_ie[4] = svget_neonq_s16(svld1_s16(p0, di + (height + 2) * d_stride + 0));
  d_ie[5] = svget_neonq_s16(svld1_s16(p1, di + (height + 2) * d_stride + 8));
  d_ie[6] = svget_neonq_s16(svld1_s16(p0, di + (height + 3) * d_stride + 0));
  d_ie[7] = svget_neonq_s16(svld1_s16(p1, di + (height + 3) * d_stride + 8));
  d_ie[8] = svget_neonq_s16(svld1_s16(p0, di + (height + 4) * d_stride + 0));
  d_ie[9] = svget_neonq_s16(svld1_s16(p1, di + (height + 4) * d_stride + 8));
  d_ie[10] = svget_neonq_s16(svld1_s16(p0, di + (height + 5) * d_stride + 0));
  d_ie[11] = svget_neonq_s16(svld1_s16(p1, di + (height + 5) * d_stride + 8));
}

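// Accumulate the 21 upper-triangle delta terms for the 7x7 window: start-row
// products are subtracted (via negation) and end-row products added.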
static inline void derive_triangle_win7_sve(const int16x8_t *d_is,
                                            const int16x8_t *d_ie,
                                            int64x2_t *deltas) {
  deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(d_is[0]), d_is[0]);
  deltas[0] = aom_sdotq_s16(deltas[0], vnegq_s16(d_is[1]), d_is[1]);
  deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(d_is[0]), d_is[2]);
  deltas[1] = aom_sdotq_s16(deltas[1], vnegq_s16(d_is[1]), d_is[3]);
  deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(d_is[0]), d_is[4]);
  deltas[2] = aom_sdotq_s16(deltas[2], vnegq_s16(d_is[1]), d_is[5]);
  deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(d_is[0]), d_is[6]);
  deltas[3] = aom_sdotq_s16(deltas[3], vnegq_s16(d_is[1]), d_is[7]);
  deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(d_is[0]), d_is[8]);
  deltas[4] = aom_sdotq_s16(deltas[4], vnegq_s16(d_is[1]), d_is[9]);
  deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(d_is[0]), d_is[10]);
  deltas[5] = aom_sdotq_s16(deltas[5], vnegq_s16(d_is[1]), d_is[11]);

  deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(d_is[2]), d_is[2]);
  deltas[6] = aom_sdotq_s16(deltas[6], vnegq_s16(d_is[3]), d_is[3]);
  deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(d_is[2]), d_is[4]);
  deltas[7] = aom_sdotq_s16(deltas[7], vnegq_s16(d_is[3]), d_is[5]);
  deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(d_is[2]), d_is[6]);
  deltas[8] = aom_sdotq_s16(deltas[8], vnegq_s16(d_is[3]), d_is[7]);
  deltas[9] = aom_sdotq_s16(deltas[9], vnegq_s16(d_is[2]), d_is[8]);
  deltas[9] = aom_sdotq_s16(deltas[9], vnegq_s16(d_is[3]), d_is[9]);
  deltas[10] = aom_sdotq_s16(deltas[10], vnegq_s16(d_is[2]), d_is[10]);
  deltas[10] = aom_sdotq_s16(deltas[10], vnegq_s16(d_is[3]), d_is[11]);

  deltas[11] = aom_sdotq_s16(deltas[11], vnegq_s16(d_is[4]), d_is[4]);
  deltas[11] = aom_sdotq_s16(deltas[11], vnegq_s16(d_is[5]), d_is[5]);
  deltas[12] = aom_sdotq_s16(deltas[12], vnegq_s16(d_is[4]), d_is[6]);
  deltas[12] = aom_sdotq_s16(deltas[12], vnegq_s16(d_is[5]), d_is[7]);
  deltas[13] = aom_sdotq_s16(deltas[13], vnegq_s16(d_is[4]), d_is[8]);
  deltas[13] = aom_sdotq_s16(deltas[13], vnegq_s16(d_is[5]), d_is[9]);
  deltas[14] = aom_sdotq_s16(deltas[14], vnegq_s16(d_is[4]), d_is[10]);
  deltas[14] = aom_sdotq_s16(deltas[14], vnegq_s16(d_is[5]), d_is[11]);

  deltas[15] = aom_sdotq_s16(deltas[15], vnegq_s16(d_is[6]), d_is[6]);
  deltas[15] = aom_sdotq_s16(deltas[15], vnegq_s16(d_is[7]), d_is[7]);
  deltas[16] = aom_sdotq_s16(deltas[16], vnegq_s16(d_is[6]), d_is[8]);
  deltas[16] = aom_sdotq_s16(deltas[16], vnegq_s16(d_is[7]), d_is[9]);
  deltas[17] = aom_sdotq_s16(deltas[17], vnegq_s16(d_is[6]), d_is[10]);
  deltas[17] = aom_sdotq_s16(deltas[17], vnegq_s16(d_is[7]), d_is[11]);

  deltas[18] = aom_sdotq_s16(deltas[18], vnegq_s16(d_is[8]), d_is[8]);
  deltas[18] = aom_sdotq_s16(deltas[18], vnegq_s16(d_is[9]), d_is[9]);
  deltas[19] = aom_sdotq_s16(deltas[19], vnegq_s16(d_is[8]), d_is[10]);
  deltas[19] = aom_sdotq_s16(deltas[19], vnegq_s16(d_is[9]), d_is[11]);

  deltas[20] = aom_sdotq_s16(deltas[20], vnegq_s16(d_is[10]), d_is[10]);
  deltas[20] = aom_sdotq_s16(deltas[20], vnegq_s16(d_is[11]), d_is[11]);

  deltas[0] = aom_sdotq_s16(deltas[0], d_ie[0], d_ie[0]);
  deltas[0] = aom_sdotq_s16(deltas[0], d_ie[1], d_ie[1]);
  deltas[1] = aom_sdotq_s16(deltas[1], d_ie[0], d_ie[2]);
  deltas[1] = aom_sdotq_s16(deltas[1], d_ie[1], d_ie[3]);
  deltas[2] = aom_sdotq_s16(deltas[2], d_ie[0], d_ie[4]);
  deltas[2] = aom_sdotq_s16(deltas[2], d_ie[1], d_ie[5]);
  deltas[3] = aom_sdotq_s16(deltas[3], d_ie[0], d_ie[6]);
  deltas[3] = aom_sdotq_s16(deltas[3], d_ie[1], d_ie[7]);
  deltas[4] = aom_sdotq_s16(deltas[4], d_ie[0], d_ie[8]);
  deltas[4] = aom_sdotq_s16(deltas[4], d_ie[1], d_ie[9]);
  deltas[5] = aom_sdotq_s16(deltas[5], d_ie[0], d_ie[10]);
  deltas[5] = aom_sdotq_s16(deltas[5], d_ie[1], d_ie[11]);

  deltas[6] = aom_sdotq_s16(deltas[6], d_ie[2], d_ie[2]);
  deltas[6] = aom_sdotq_s16(deltas[6], d_ie[3], d_ie[3]);
  deltas[7] = aom_sdotq_s16(deltas[7], d_ie[2], d_ie[4]);
  deltas[7] = aom_sdotq_s16(deltas[7], d_ie[3], d_ie[5]);
  deltas[8] = aom_sdotq_s16(deltas[8], d_ie[2], d_ie[6]);
  deltas[8] = aom_sdotq_s16(deltas[8], d_ie[3], d_ie[7]);
  deltas[9] = aom_sdotq_s16(deltas[9], d_ie[2], d_ie[8]);
  deltas[9] = aom_sdotq_s16(deltas[9], d_ie[3], d_ie[9]);
  deltas[10] = aom_sdotq_s16(deltas[10], d_ie[2], d_ie[10]);
  deltas[10] = aom_sdotq_s16(deltas[10], d_ie[3], d_ie[11]);

  deltas[11] = aom_sdotq_s16(deltas[11], d_ie[4], d_ie[4]);
  deltas[11] = aom_sdotq_s16(deltas[11], d_ie[5], d_ie[5]);
  deltas[12] = aom_sdotq_s16(deltas[12], d_ie[4], d_ie[6]);
  deltas[12] = aom_sdotq_s16(deltas[12], d_ie[5], d_ie[7]);
  deltas[13] = aom_sdotq_s16(deltas[13], d_ie[4], d_ie[8]);
  deltas[13] = aom_sdotq_s16(deltas[13], d_ie[5], d_ie[9]);
  deltas[14] = aom_sdotq_s16(deltas[14], d_ie[4], d_ie[10]);
  deltas[14] = aom_sdotq_s16(deltas[14], d_ie[5], d_ie[11]);

  deltas[15] = aom_sdotq_s16(deltas[15], d_ie[6], d_ie[6]);
  deltas[15] = aom_sdotq_s16(deltas[15], d_ie[7], d_ie[7]);
  deltas[16] = aom_sdotq_s16(deltas[16], d_ie[6], d_ie[8]);
  deltas[16] = aom_sdotq_s16(deltas[16], d_ie[7], d_ie[9]);
  deltas[17] = aom_sdotq_s16(deltas[17], d_ie[6], d_ie[10]);
  deltas[17] = aom_sdotq_s16(deltas[17], d_ie[7], d_ie[11]);

  deltas[18] = aom_sdotq_s16(deltas[18], d_ie[8], d_ie[8]);
  deltas[18] = aom_sdotq_s16(deltas[18], d_ie[9], d_ie[9]);
  deltas[19] = aom_sdotq_s16(deltas[19], d_ie[8], d_ie[10]);
  deltas[19] = aom_sdotq_s16(deltas[19], d_ie[9], d_ie[11]);

  deltas[20] = aom_sdotq_s16(deltas[20], d_ie[10], d_ie[10]);
  deltas[20] = aom_sdotq_s16(deltas[20], d_ie[11], d_ie[11]);
}

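// Compute the Wiener filter statistics for a 7x7 window: M accumulates the
// cross correlation between the source and degraded blocks, H the auto
// covariance of the degraded block. Only the matrix edges are computed
// directly; interior points are derived incrementally from previously
// computed entries.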
static inline void compute_stats_win7_sve(
    const int16_t *const d, const int32_t d_stride, const int16_t *const s,
    const int32_t s_stride, const int32_t width, const int32_t height,
    int64_t *const M, int64_t *const H) {
  const int32_t wiener_win = WIENER_WIN;
  const int32_t wiener_win2 = wiener_win * wiener_win;
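  // Height rounded down to a multiple of 8 rows; the remainder is handled
  // separately in step 4.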
  const int32_t h8 = height & ~7;
  int32_t i, j, x, y;

  // Use predicates to load the last (up to 16) columns: p0 covers the first
  // eight tail columns and p1 any beyond that.
  svbool_t p0 = svwhilelt_b16_u32(0, width % 16 == 0 ? 16 : width % 16);
  svbool_t p1 = svwhilelt_b16_u32(8, width % 16 == 0 ? 16 : width % 16);

  // Step 1: Calculate the top edge of the whole matrix, i.e., the top
  // edge of each triangle and square on the top row.
  j = 0;
  do {
    const int16_t *s_t = s;
    const int16_t *d_t = d;
    int64x2_t sum_m[WIENER_WIN] = { vdupq_n_s64(0) };
    int64x2_t sum_h[WIENER_WIN] = { vdupq_n_s64(0) };
    int16x8_t src[2], dgd[2];

    y = height;
    do {
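      // Process whole 16-column blocks with full-width loads, then handle the
      // tail columns with the predicated loads.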
      x = 0;
      while (x < width - 16) {
        src[0] = vld1q_s16(s_t + x + 0);
        src[1] = vld1q_s16(s_t + x + 8);
        dgd[0] = vld1q_s16(d_t + x + 0);
        dgd[1] = vld1q_s16(d_t + x + 8);
        stats_top_win7_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h);
        x += 16;
      }

      src[0] = svget_neonq_s16(svld1_s16(p0, s_t + x + 0));
      src[1] = svget_neonq_s16(svld1_s16(p1, s_t + x + 8));
      dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + x + 0));
      dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + x + 8));
      stats_top_win7_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h);

      s_t += s_stride;
      d_t += d_stride;
    } while (--y);

    vst1q_s64(M + wiener_win * j + 0, vpaddq_s64(sum_m[0], sum_m[1]));
    vst1q_s64(M + wiener_win * j + 2, vpaddq_s64(sum_m[2], sum_m[3]));
    vst1q_s64(M + wiener_win * j + 4, vpaddq_s64(sum_m[4], sum_m[5]));
    M[wiener_win * j + 6] = vaddvq_s64(sum_m[6]);

    vst1q_s64(H + wiener_win * j + 0, vpaddq_s64(sum_h[0], sum_h[1]));
    vst1q_s64(H + wiener_win * j + 2, vpaddq_s64(sum_h[2], sum_h[3]));
    vst1q_s64(H + wiener_win * j + 4, vpaddq_s64(sum_h[4], sum_h[5]));
    H[wiener_win * j + 6] = vaddvq_s64(sum_h[6]);
  } while (++j < wiener_win);

  // Step 2: Calculate the left edge of each square on the top row.
  j = 1;
  do {
    const int16_t *d_t = d;
    int64x2_t sum_h[WIENER_WIN - 1] = { vdupq_n_s64(0) };
    int16x8_t dgd[2];

    y = height;
    do {
      x = 0;
      while (x < width - 16) {
        dgd[0] = vld1q_s16(d_t + j + x + 0);
        dgd[1] = vld1q_s16(d_t + j + x + 8);
        stats_left_win7_sve(dgd, d_t + x, d_stride, sum_h);
        x += 16;
      }

      dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + j + x + 0));
      dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + j + x + 8));
      stats_left_win7_sve(dgd, d_t + x, d_stride, sum_h);

      d_t += d_stride;
    } while (--y);

    int64x2_t sum_h01 = vpaddq_s64(sum_h[0], sum_h[1]);
    int64x2_t sum_h23 = vpaddq_s64(sum_h[2], sum_h[3]);
    int64x2_t sum_h45 = vpaddq_s64(sum_h[4], sum_h[5]);
    vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h01));
    vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h01));
    vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h23));
    vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h23));
    vst1_s64(&H[5 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h45));
    vst1_s64(&H[6 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h45));
  } while (++j < wiener_win);

  // Step 3: Derive the top edge of each triangle along the diagonal. No
  // triangle in top row.
  {
    const int16_t *d_t = d;
    // Pad to call transpose function.
    int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
    int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
    int16x8_t ds[WIENER_WIN * 2];

    load_s16_8x6(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6], &ds[8],
                 &ds[10]);
    load_s16_8x6(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7], &ds[9],
                 &ds[11]);

    d_t += 6 * d_stride;

    step3_win7_neon(d_t, d_stride, width, height, ds, deltas);
    transpose_arrays_s32_8x8(deltas, deltas_tr);

    update_8_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
                        deltas_tr[0], deltas_tr[4],
                        H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
    update_8_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
                        deltas_tr[1], deltas_tr[5],
                        H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
    update_8_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
                        deltas_tr[2], deltas_tr[6],
                        H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
    update_8_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
                        deltas_tr[3], deltas_tr[7],
                        H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
    update_8_stats_neon(H + 4 * wiener_win * wiener_win2 + 4 * wiener_win,
                        deltas_tr[8], deltas_tr[12],
                        H + 5 * wiener_win * wiener_win2 + 5 * wiener_win);
    update_8_stats_neon(H + 5 * wiener_win * wiener_win2 + 5 * wiener_win,
                        deltas_tr[9], deltas_tr[13],
                        H + 6 * wiener_win * wiener_win2 + 6 * wiener_win);
  }

  // Step 4: Derive the top and left edge of each square. No square in the top
  // or bottom row.

  i = 1;
  do {
    j = i + 1;
    do {
      const int16_t *di = d + i - 1;
      const int16_t *dj = d + j - 1;
      int64x2_t deltas[(2 * WIENER_WIN - 1) * 2] = { vdupq_n_s64(0) };
      int16x8_t dd[WIENER_WIN * 2], ds[WIENER_WIN * 2];

      dd[5] = vdupq_n_s16(0);  // Initialize to avoid warning.
      const int16_t dd0_values[] = { di[0 * d_stride],
                                     di[1 * d_stride],
                                     di[2 * d_stride],
                                     di[3 * d_stride],
                                     di[4 * d_stride],
                                     di[5 * d_stride],
                                     0,
                                     0 };
      dd[0] = vld1q_s16(dd0_values);
      const int16_t dd1_values[] = { di[0 * d_stride + width],
                                     di[1 * d_stride + width],
                                     di[2 * d_stride + width],
                                     di[3 * d_stride + width],
                                     di[4 * d_stride + width],
                                     di[5 * d_stride + width],
                                     0,
                                     0 };
      dd[1] = vld1q_s16(dd1_values);
      const int16_t ds0_values[] = { dj[0 * d_stride],
                                     dj[1 * d_stride],
                                     dj[2 * d_stride],
                                     dj[3 * d_stride],
                                     dj[4 * d_stride],
                                     dj[5 * d_stride],
                                     0,
                                     0 };
      ds[0] = vld1q_s16(ds0_values);
      int16_t ds1_values[] = { dj[0 * d_stride + width],
                               dj[1 * d_stride + width],
                               dj[2 * d_stride + width],
                               dj[3 * d_stride + width],
                               dj[4 * d_stride + width],
                               dj[5 * d_stride + width],
                               0,
                               0 };
      ds[1] = vld1q_s16(ds1_values);

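      // Process rows in blocks of 8, extending each column register with the
      // next rows and accumulating the cross terms with 16-bit dot products.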
      y = 0;
      while (y < h8) {
        // 00s 10s 20s 30s 40s 50s 60s 70s  00e 10e 20e 30e 40e 50e 60e 70e
        dd[0] = vsetq_lane_s16(di[6 * d_stride], dd[0], 6);
        dd[0] = vsetq_lane_s16(di[7 * d_stride], dd[0], 7);
        dd[1] = vsetq_lane_s16(di[6 * d_stride + width], dd[1], 6);
        dd[1] = vsetq_lane_s16(di[7 * d_stride + width], dd[1], 7);

        // 00s 10s 20s 30s 40s 50s 60s 70s  00e 10e 20e 30e 40e 50e 60e 70e
        // 01s 11s 21s 31s 41s 51s 61s 71s  01e 11e 21e 31e 41e 51e 61e 71e
        ds[0] = vsetq_lane_s16(dj[6 * d_stride], ds[0], 6);
        ds[0] = vsetq_lane_s16(dj[7 * d_stride], ds[0], 7);
        ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 6);
        ds[1] = vsetq_lane_s16(dj[7 * d_stride + width], ds[1], 7);

        load_more_16_neon(di + 8 * d_stride, width, &dd[0], &dd[2]);
        load_more_16_neon(dj + 8 * d_stride, width, &ds[0], &ds[2]);
        load_more_16_neon(di + 9 * d_stride, width, &dd[2], &dd[4]);
        load_more_16_neon(dj + 9 * d_stride, width, &ds[2], &ds[4]);
        load_more_16_neon(di + 10 * d_stride, width, &dd[4], &dd[6]);
        load_more_16_neon(dj + 10 * d_stride, width, &ds[4], &ds[6]);
        load_more_16_neon(di + 11 * d_stride, width, &dd[6], &dd[8]);
        load_more_16_neon(dj + 11 * d_stride, width, &ds[6], &ds[8]);
        load_more_16_neon(di + 12 * d_stride, width, &dd[8], &dd[10]);
        load_more_16_neon(dj + 12 * d_stride, width, &ds[8], &ds[10]);
        load_more_16_neon(di + 13 * d_stride, width, &dd[10], &dd[12]);
        load_more_16_neon(dj + 13 * d_stride, width, &ds[10], &ds[12]);

        deltas[0] = aom_sdotq_s16(deltas[0], dd[0], ds[0]);
        deltas[1] = aom_sdotq_s16(deltas[1], dd[1], ds[1]);
        deltas[2] = aom_sdotq_s16(deltas[2], dd[0], ds[2]);
        deltas[3] = aom_sdotq_s16(deltas[3], dd[1], ds[3]);
        deltas[4] = aom_sdotq_s16(deltas[4], dd[0], ds[4]);
        deltas[5] = aom_sdotq_s16(deltas[5], dd[1], ds[5]);
        deltas[6] = aom_sdotq_s16(deltas[6], dd[0], ds[6]);
        deltas[7] = aom_sdotq_s16(deltas[7], dd[1], ds[7]);
        deltas[8] = aom_sdotq_s16(deltas[8], dd[0], ds[8]);
        deltas[9] = aom_sdotq_s16(deltas[9], dd[1], ds[9]);
        deltas[10] = aom_sdotq_s16(deltas[10], dd[0], ds[10]);
        deltas[11] = aom_sdotq_s16(deltas[11], dd[1], ds[11]);
        deltas[12] = aom_sdotq_s16(deltas[12], dd[0], ds[12]);
        deltas[13] = aom_sdotq_s16(deltas[13], dd[1], ds[13]);
        deltas[14] = aom_sdotq_s16(deltas[14], dd[2], ds[0]);
        deltas[15] = aom_sdotq_s16(deltas[15], dd[3], ds[1]);
        deltas[16] = aom_sdotq_s16(deltas[16], dd[4], ds[0]);
        deltas[17] = aom_sdotq_s16(deltas[17], dd[5], ds[1]);
        deltas[18] = aom_sdotq_s16(deltas[18], dd[6], ds[0]);
        deltas[19] = aom_sdotq_s16(deltas[19], dd[7], ds[1]);
        deltas[20] = aom_sdotq_s16(deltas[20], dd[8], ds[0]);
        deltas[21] = aom_sdotq_s16(deltas[21], dd[9], ds[1]);
        deltas[22] = aom_sdotq_s16(deltas[22], dd[10], ds[0]);
        deltas[23] = aom_sdotq_s16(deltas[23], dd[11], ds[1]);
        deltas[24] = aom_sdotq_s16(deltas[24], dd[12], ds[0]);
        deltas[25] = aom_sdotq_s16(deltas[25], dd[13], ds[1]);

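        // Shift the two oldest rows out of the working registers so that rows
        // y+8..y+13 sit in lanes 0-5 for the next iteration.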
        dd[0] = vextq_s16(dd[12], vdupq_n_s16(0), 2);
        dd[1] = vextq_s16(dd[13], vdupq_n_s16(0), 2);
        ds[0] = vextq_s16(ds[12], vdupq_n_s16(0), 2);
        ds[1] = vextq_s16(ds[13], vdupq_n_s16(0), 2);

        di += 8 * d_stride;
        dj += 8 * d_stride;
        y += 8;
      }

      int64x2_t deltas02 = vpaddq_s64(deltas[0], deltas[2]);
      int64x2_t deltas13 = vpaddq_s64(deltas[1], deltas[3]);
      int64x2_t deltas46 = vpaddq_s64(deltas[4], deltas[6]);
      int64x2_t deltas57 = vpaddq_s64(deltas[5], deltas[7]);
      int64x2_t deltas810 = vpaddq_s64(deltas[8], deltas[10]);
      int64x2_t deltas911 = vpaddq_s64(deltas[9], deltas[11]);
      int64x2_t deltas1212 = vpaddq_s64(deltas[12], deltas[12]);
      int64x2_t deltas1313 = vpaddq_s64(deltas[13], deltas[13]);
      int64x2_t deltas1416 = vpaddq_s64(deltas[14], deltas[16]);
      int64x2_t deltas1820 = vpaddq_s64(deltas[18], deltas[20]);
      int64x2_t deltas1517 = vpaddq_s64(deltas[15], deltas[17]);
      int64x2_t deltas1921 = vpaddq_s64(deltas[19], deltas[21]);
      int64x2_t deltas2224 = vpaddq_s64(deltas[22], deltas[24]);
      int64x2_t deltas2325 = vpaddq_s64(deltas[23], deltas[25]);
      deltas02 = vsubq_s64(deltas13, deltas02);
      deltas46 = vsubq_s64(deltas57, deltas46);
      deltas810 = vsubq_s64(deltas911, deltas810);
      deltas1212 = vsubq_s64(deltas1313, deltas1212);
      deltas1416 = vsubq_s64(deltas1517, deltas1416);
      deltas1820 = vsubq_s64(deltas1921, deltas1820);
      deltas2224 = vsubq_s64(deltas2325, deltas2224);

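      // Handle the remaining height % 8 rows one at a time, using widening
      // 16-bit multiplies instead of full dot products.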
      if (h8 != height) {
        const int16_t ds0_vals[] = {
          dj[0 * d_stride], dj[0 * d_stride + width],
          dj[1 * d_stride], dj[1 * d_stride + width],
          dj[2 * d_stride], dj[2 * d_stride + width],
          dj[3 * d_stride], dj[3 * d_stride + width]
        };
        ds[0] = vld1q_s16(ds0_vals);

        ds[1] = vsetq_lane_s16(dj[4 * d_stride], ds[1], 0);
        ds[1] = vsetq_lane_s16(dj[4 * d_stride + width], ds[1], 1);
        ds[1] = vsetq_lane_s16(dj[5 * d_stride], ds[1], 2);
        ds[1] = vsetq_lane_s16(dj[5 * d_stride + width], ds[1], 3);
        const int16_t dd4_vals[] = {
          -di[1 * d_stride], di[1 * d_stride + width],
          -di[2 * d_stride], di[2 * d_stride + width],
          -di[3 * d_stride], di[3 * d_stride + width],
          -di[4 * d_stride], di[4 * d_stride + width]
        };
        dd[4] = vld1q_s16(dd4_vals);

        dd[5] = vsetq_lane_s16(-di[5 * d_stride], dd[5], 0);
        dd[5] = vsetq_lane_s16(di[5 * d_stride + width], dd[5], 1);
        do {
          dd[0] = vdupq_n_s16(-di[0 * d_stride]);
          dd[2] = dd[3] = vdupq_n_s16(di[0 * d_stride + width]);
          dd[0] = dd[1] = vzip1q_s16(dd[0], dd[2]);

          ds[4] = vdupq_n_s16(dj[0 * d_stride]);
          ds[6] = ds[7] = vdupq_n_s16(dj[0 * d_stride + width]);
          ds[4] = ds[5] = vzip1q_s16(ds[4], ds[6]);

          dd[5] = vsetq_lane_s16(-di[6 * d_stride], dd[5], 2);
          dd[5] = vsetq_lane_s16(di[6 * d_stride + width], dd[5], 3);
          ds[1] = vsetq_lane_s16(dj[6 * d_stride], ds[1], 4);
          ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 5);

          const int32x4_t res0 =
              vpaddq_s32(vmull_s16(vget_low_s16(dd[0]), vget_low_s16(ds[0])),
                         vmull_s16(vget_high_s16(dd[0]), vget_high_s16(ds[0])));
          deltas02 = vaddw_s32(deltas02, vget_low_s32(res0));
          deltas46 = vaddw_s32(deltas46, vget_high_s32(res0));
          const int32x4_t res1 =
              vpaddq_s32(vmull_s16(vget_low_s16(dd[1]), vget_low_s16(ds[1])),
                         vmull_s16(vget_high_s16(dd[1]), vget_high_s16(ds[1])));
          deltas810 = vaddw_s32(deltas810, vget_low_s32(res1));
          deltas1212 = vaddw_s32(deltas1212, vget_high_s32(res1));
          const int32x4_t res2 =
              vpaddq_s32(vmull_s16(vget_low_s16(dd[4]), vget_low_s16(ds[4])),
                         vmull_s16(vget_high_s16(dd[4]), vget_high_s16(ds[4])));
          deltas1416 = vaddw_s32(deltas1416, vget_low_s32(res2));
          deltas1820 = vaddw_s32(deltas1820, vget_high_s32(res2));
          const int32x4_t res3 =
              vpaddq_s32(vmull_s16(vget_low_s16(dd[5]), vget_low_s16(ds[5])),
                         vmull_s16(vget_high_s16(dd[5]), vget_high_s16(ds[5])));
          deltas2224 = vaddw_s32(deltas2224, vget_low_s32(res3));

          int32_t tmp0 = vgetq_lane_s32(vreinterpretq_s32_s16(ds[0]), 0);
          ds[0] = vextq_s16(ds[0], ds[1], 2);
          ds[1] = vextq_s16(ds[1], ds[0], 2);
          ds[1] = vreinterpretq_s16_s32(
              vsetq_lane_s32(tmp0, vreinterpretq_s32_s16(ds[1]), 3));
          int32_t tmp1 = vgetq_lane_s32(vreinterpretq_s32_s16(dd[4]), 0);
          dd[4] = vextq_s16(dd[4], dd[5], 2);
          dd[5] = vextq_s16(dd[5], dd[4], 2);
          dd[5] = vreinterpretq_s16_s32(
              vsetq_lane_s32(tmp1, vreinterpretq_s32_s16(dd[5]), 3));
          di += d_stride;
          dj += d_stride;
        } while (++y < height);
      }

      // Each vector store below writes one element past the top edge of a
      // square; the extra element falls into the next square in the same row
      // (or the first element of the next row) and is overwritten later.
      int64x2_t s0 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 +
                               (j - 1) * wiener_win + 0);
      int64x2_t s1 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 +
                               (j - 1) * wiener_win + 2);
      int64x2_t s2 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 +
                               (j - 1) * wiener_win + 4);
      int64x2_t s3 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 +
                               (j - 1) * wiener_win + 6);

      vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 0,
                vaddq_s64(s0, deltas02));
      vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 2,
                vaddq_s64(s1, deltas46));
      vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 4,
                vaddq_s64(s2, deltas810));
      vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 6,
                vaddq_s64(s3, deltas1212));

      H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
          H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] +
          vgetq_lane_s64(deltas1416, 0);
      H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
          H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] +
          vgetq_lane_s64(deltas1416, 1);
      H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
          H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] +
          vgetq_lane_s64(deltas1820, 0);
      H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
          H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] +
          vgetq_lane_s64(deltas1820, 1);
      H[(i * wiener_win + 5) * wiener_win2 + j * wiener_win] =
          H[((i - 1) * wiener_win + 5) * wiener_win2 + (j - 1) * wiener_win] +
          vgetq_lane_s64(deltas2224, 0);
      H[(i * wiener_win + 6) * wiener_win2 + j * wiener_win] =
          H[((i - 1) * wiener_win + 6) * wiener_win2 + (j - 1) * wiener_win] +
          vgetq_lane_s64(deltas2224, 1);
    } while (++j < wiener_win);
  } while (++i < wiener_win - 1);

  // Step 5: Derive other points of each square. No square in bottom row.
  i = 0;
  do {
    const int16_t *const di = d + i;

    j = i + 1;
    do {
      const int16_t *const dj = d + j;
      int64x2_t deltas[WIENER_WIN - 1][WIN_7] = { { vdupq_n_s64(0) },
                                                  { vdupq_n_s64(0) } };
      int16x8_t d_is[WIN_7];
      int16x8_t d_ie[WIN_7];
      int16x8_t d_js[WIN_7];
      int16x8_t d_je[WIN_7];

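      // Full 16-wide blocks take the NEON load path; the predicated SVE load
      // handles the tail columns.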
      x = 0;
      while (x < width - 16) {
        load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
                              d_js, d_je);
        derive_square_win7_sve(d_is, d_ie, d_js, d_je, deltas);
        x += 16;
      }

      load_square_win7_sve(di + x, dj + x, d_stride, height, d_is, d_ie, d_js,
                           d_je, p0, p1);
      derive_square_win7_sve(d_is, d_ie, d_js, d_je, deltas);

      hadd_update_6_stats_sve(
          H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0],
          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
      hadd_update_6_stats_sve(
          H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1],
          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
      hadd_update_6_stats_sve(
          H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2],
          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1);
      hadd_update_6_stats_sve(
          H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3],
          H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1);
      hadd_update_6_stats_sve(
          H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win, deltas[4],
          H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win + 1);
      hadd_update_6_stats_sve(
          H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win, deltas[5],
          H + (i * wiener_win + 6) * wiener_win2 + j * wiener_win + 1);
    } while (++j < wiener_win);
  } while (++i < wiener_win - 1);

  // Step 6: Derive other points of each upper triangle along the diagonal.
  i = 0;
  do {
    const int16_t *const di = d + i;
    int64x2_t deltas[3 * WIENER_WIN] = { vdupq_n_s64(0) };
    int16x8_t d_is[WIN_7], d_ie[WIN_7];

    x = 0;
    while (x < width - 16) {
      load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie);
      derive_triangle_win7_sve(d_is, d_ie, deltas);
      x += 16;
    }

    load_triangle_win7_sve(di + x, d_stride, height, d_is, d_ie, p0, p1);
    derive_triangle_win7_sve(d_is, d_ie, deltas);

    // Row 1: 6 points
    hadd_update_6_stats_sve(
        H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas,
        H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);

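    // deltas[10] and deltas[17] carry the last (column 6) terms for rows 2
    // and 4 of the triangle; pair them so each can be read out as a scalar
    // lane below.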
    int64x2_t deltas1017 = vpaddq_s64(deltas[10], deltas[17]);

    // Row 2: 5 points
    hadd_update_4_stats_sve(
        H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1, deltas + 6,
        H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2);
    H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 6] =
        H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 5] +
        vgetq_lane_s64(deltas1017, 0);

    // Row 3: 4 points
    hadd_update_4_stats_sve(
        H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2,
        deltas + 11,
        H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);

    // Row 4: 3 points
    int64x2_t h0 =
        vld1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);
    vst1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4,
              vaddq_s64(h0, vpaddq_s64(deltas[15], deltas[16])));
    H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 6] =
        H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 5] +
        vgetq_lane_s64(deltas1017, 1);

    // Row 5: 2 points
    int64x2_t h1 =
        vld1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4);
    vst1q_s64(H + (i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5,
              vaddq_s64(h1, vpaddq_s64(deltas[18], deltas[19])));

    // Row 6: 1 point
    H[(i * wiener_win + 6) * wiener_win2 + i * wiener_win + 6] =
        H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] +
        vaddvq_s64(deltas[20]);
  } while (++i < wiener_win);
}


#endif  // AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_