/*
 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker
12*77c1e3ccSAndroid Build Coastguard Worker #ifndef AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_
13*77c1e3ccSAndroid Build Coastguard Worker #define AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_
14*77c1e3ccSAndroid Build Coastguard Worker
15*77c1e3ccSAndroid Build Coastguard Worker #include <arm_neon.h>
16*77c1e3ccSAndroid Build Coastguard Worker #include <arm_sve.h>
17*77c1e3ccSAndroid Build Coastguard Worker
18*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/aom_neon_sve_bridge.h"
19*77c1e3ccSAndroid Build Coastguard Worker #include "av1/encoder/arm/pickrst_neon.h"
20*77c1e3ccSAndroid Build Coastguard Worker
21*77c1e3ccSAndroid Build Coastguard Worker // Swap each half of the dgd vectors so that we can accumulate the result of
22*77c1e3ccSAndroid Build Coastguard Worker // the dot-products directly in the destination matrix.
transpose_dgd(int16x8_t dgd0,int16x8_t dgd1)23*77c1e3ccSAndroid Build Coastguard Worker static inline int16x8x2_t transpose_dgd(int16x8_t dgd0, int16x8_t dgd1) {
24*77c1e3ccSAndroid Build Coastguard Worker int16x8_t dgd_trn0 = vreinterpretq_s16_s64(
25*77c1e3ccSAndroid Build Coastguard Worker vzip1q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1)));
26*77c1e3ccSAndroid Build Coastguard Worker int16x8_t dgd_trn1 = vreinterpretq_s16_s64(
27*77c1e3ccSAndroid Build Coastguard Worker vzip2q_s64(vreinterpretq_s64_s16(dgd0), vreinterpretq_s64_s16(dgd1)));
28*77c1e3ccSAndroid Build Coastguard Worker
29*77c1e3ccSAndroid Build Coastguard Worker return (struct int16x8x2_t){ dgd_trn0, dgd_trn1 };
30*77c1e3ccSAndroid Build Coastguard Worker }
31*77c1e3ccSAndroid Build Coastguard Worker
// Update the five entries M[row * 5 + 0 .. row * 5 + 4] in place.
// Entries 0..3 are updated two at a time: the current pair is loaded, the
// transposed dgd pair is run through the lane-indexed dot-product against src
// (lane 0, then lane 1), and the pair is stored back. Entry 4 has no partner,
// so its dot-product is reduced horizontally and added as a scalar.
static inline void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd[5],
                                          int64_t *M, int row) {
  const int wiener_win = 5;
  int64_t *const m_row = M + row * wiener_win;

  for (int col = 0; col < wiener_win - 1; col += 2) {
    const int16x8x2_t dgd_pair = transpose_dgd(dgd[col], dgd[col + 1]);

    int64x2_t acc = vld1q_s64(m_row + col);
    acc = aom_svdot_lane_s16(acc, dgd_pair.val[0], src, 0);
    acc = aom_svdot_lane_s16(acc, dgd_pair.val[1], src, 1);
    vst1q_s64(m_row + col, acc);
  }

  // Trailing odd entry.
  const int64x2_t last =
      aom_sdotq_s16(vdupq_n_s64(0), src, dgd[wiener_win - 1]);
  m_row[wiener_win - 1] += vaddvq_s64(last);
}
53*77c1e3ccSAndroid Build Coastguard Worker
// Update the seven entries M[row * 7 + 0 .. row * 7 + 6] in place.
// Entries 0..5 are updated two at a time: the current pair is loaded, the
// transposed dgd pair is run through the lane-indexed dot-product against src
// (lane 0, then lane 1), and the pair is stored back. Entry 6 has no partner,
// so its dot-product is reduced horizontally and added as a scalar.
static inline void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd[7],
                                          int64_t *M, int row) {
  const int wiener_win = 7;
  int64_t *const m_row = M + row * wiener_win;

  for (int col = 0; col < wiener_win - 1; col += 2) {
    const int16x8x2_t dgd_pair = transpose_dgd(dgd[col], dgd[col + 1]);

    int64x2_t acc = vld1q_s64(m_row + col);
    acc = aom_svdot_lane_s16(acc, dgd_pair.val[0], src, 0);
    acc = aom_svdot_lane_s16(acc, dgd_pair.val[1], src, 1);
    vst1q_s64(m_row + col, acc);
  }

  // Trailing odd entry.
  const int64x2_t last =
      aom_sdotq_s16(vdupq_n_s64(0), src, dgd[wiener_win - 1]);
  m_row[wiener_win - 1] += vaddvq_s64(last);
}
82*77c1e3ccSAndroid Build Coastguard Worker
// For every pair (row0, row1) with row0 <= row1 < wiener_win, add the
// horizontally-reduced dot-product of dgd[row0] and dgd[row1] to
//   H[(col * wiener_win + row0) * wiener_win2 + col * wiener_win + row1].
// Only entries with row1 >= row0 are touched by this function.
static inline void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H,
                                     const int wiener_win,
                                     const int wiener_win2) {
  const int64x2_t zero = vdupq_n_s64(0);
  const int col_base = col * wiener_win;

  for (int row0 = 0; row0 < wiener_win; ++row0) {
    int64_t *const h_row = H + (col_base + row0) * wiener_win2 + col_base;

    for (int row1 = row0; row1 < wiener_win; ++row1) {
      const int64x2_t prod = aom_sdotq_s16(zero, dgd[row0], dgd[row1]);
      h_row[row1] += vaddvq_s64(prod);
    }
  }
}
96*77c1e3ccSAndroid Build Coastguard Worker
// For each col0 in 0..4, update the five consecutive entries of H starting at
//   (row0 * 5 + col0) * 25 + row1 * 5
// using dgd0[col0] against dgd1[0..4]. The first four entries are updated in
// pairs through the lane-indexed dot-product on transposed dgd1 pairs. The
// fifth is updated with a plain dot-product reduced to a scalar.
static inline void compute_H_two_rows_win5(int16x8_t *dgd0, int16x8_t *dgd1,
                                           int row0, int row1, int64_t *H) {
  const int wiener_win = 5;
  const int wiener_win2 = 25;

  for (int col0 = 0; col0 < wiener_win; ++col0) {
    int64_t *const h =
        H + (row0 * wiener_win + col0) * wiener_win2 + (row1 * wiener_win);

    for (int col1 = 0; col1 < wiener_win - 1; col1 += 2) {
      int64x2_t acc = vld1q_s64(h + col1);
      const int16x8x2_t dgd1_pair = transpose_dgd(dgd1[col1], dgd1[col1 + 1]);

      acc = aom_svdot_lane_s16(acc, dgd1_pair.val[0], dgd0[col0], 0);
      acc = aom_svdot_lane_s16(acc, dgd1_pair.val[1], dgd0[col0], 1);
      vst1q_s64(h + col1, acc);
    }

    // Trailing odd entry.
    const int64x2_t last =
        aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[wiener_win - 1]);
    h[wiener_win - 1] += vaddvq_s64(last);
  }
}
120*77c1e3ccSAndroid Build Coastguard Worker
// For each col0 in 0..6, update the seven consecutive entries of H starting
// at
//   (row0 * 7 + col0) * 49 + row1 * 7
// using dgd0[col0] against dgd1[0..6]. The first six entries are updated in
// pairs through the lane-indexed dot-product on transposed dgd1 pairs. The
// seventh is updated with a plain dot-product reduced to a scalar.
static inline void compute_H_two_rows_win7(int16x8_t *dgd0, int16x8_t *dgd1,
                                           int row0, int row1, int64_t *H) {
  const int wiener_win = 7;
  const int wiener_win2 = 49;

  for (int col0 = 0; col0 < wiener_win; ++col0) {
    int64_t *const h =
        H + (row0 * wiener_win + col0) * wiener_win2 + (row1 * wiener_win);

    for (int col1 = 0; col1 < wiener_win - 1; col1 += 2) {
      int64x2_t acc = vld1q_s64(h + col1);
      const int16x8x2_t dgd1_pair = transpose_dgd(dgd1[col1], dgd1[col1 + 1]);

      acc = aom_svdot_lane_s16(acc, dgd1_pair.val[0], dgd0[col0], 0);
      acc = aom_svdot_lane_s16(acc, dgd1_pair.val[1], dgd0[col0], 1);
      vst1q_s64(h + col1, acc);
    }

    // Trailing odd entry.
    const int64x2_t last =
        aom_sdotq_s16(vdupq_n_s64(0), dgd0[col0], dgd1[wiener_win - 1]);
    h[wiener_win - 1] += vaddvq_s64(last);
  }
}
151*77c1e3ccSAndroid Build Coastguard Worker
// Load five rows of 16 samples from d (two 8-lane vectors per row, rows
// d_stride apart) and, for each row i in 0..4, accumulate
//   sum_m[i] += src[0] . row_lo + src[1] . row_hi
//   sum_h[i] += dgd[0] . row_lo + dgd[1] . row_hi
// The accumulators stay as 2-lane vectors; no horizontal reduction is done
// here.
static inline void stats_top_win5_sve(const int16x8_t src[2],
                                      const int16x8_t dgd[2],
                                      const int16_t *const d,
                                      const int32_t d_stride, int64x2_t *sum_m,
                                      int64x2_t *sum_h) {
  const int num_rows = 5;
  // Row i occupies dgds[2 * i] (samples 0..7) and dgds[2 * i + 1] (8..15).
  int16x8_t dgds[WIENER_WIN_CHROMA * 2];

  load_s16_8x5(d + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6],
               &dgds[8]);
  load_s16_8x5(d + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7],
               &dgds[9]);

  for (int i = 0; i < num_rows; ++i) {
    int64x2_t acc_m = sum_m[i];
    acc_m = aom_sdotq_s16(acc_m, src[0], dgds[2 * i + 0]);
    acc_m = aom_sdotq_s16(acc_m, src[1], dgds[2 * i + 1]);
    sum_m[i] = acc_m;
  }

  for (int i = 0; i < num_rows; ++i) {
    int64x2_t acc_h = sum_h[i];
    acc_h = aom_sdotq_s16(acc_h, dgd[0], dgds[2 * i + 0]);
    acc_h = aom_sdotq_s16(acc_h, dgd[1], dgds[2 * i + 1]);
    sum_h[i] = acc_h;
  }
}
186*77c1e3ccSAndroid Build Coastguard Worker
// Load four rows of 16 samples starting one row below d (i.e. from
// d + d_stride) and, for each row i in 0..3, accumulate
//   sum[i] += src[0] . row_lo + src[1] . row_hi
// The accumulators stay as 2-lane vectors; no horizontal reduction is done
// here.
static inline void stats_left_win5_sve(const int16x8_t src[2], const int16_t *d,
                                       const int32_t d_stride, int64x2_t *sum) {
  const int num_rows = 4;
  const int16_t *const d_next = d + d_stride;
  // Row i occupies dgds[2 * i] (samples 0..7) and dgds[2 * i + 1] (8..15).
  int16x8_t dgds[WIN_CHROMA];

  load_s16_8x4(d_next + 0, d_stride, &dgds[0], &dgds[2], &dgds[4], &dgds[6]);
  load_s16_8x4(d_next + 8, d_stride, &dgds[1], &dgds[3], &dgds[5], &dgds[7]);

  for (int i = 0; i < num_rows; ++i) {
    int64x2_t acc = sum[i];
    acc = aom_sdotq_s16(acc, src[0], dgds[2 * i + 0]);
    acc = aom_sdotq_s16(acc, src[1], dgds[2 * i + 1]);
    sum[i] = acc;
  }
}
205*77c1e3ccSAndroid Build Coastguard Worker
// Subtract dot-products from nine delta accumulators by negating the A
// operand before accumulating:
//   deltas[0..4] -= A[0] . B[0..4]
//   deltas[5..8] -= A[1..4] . B[0]
static inline void sub_deltas_step4_sve(int16x8_t *A, int16x8_t *B,
                                        int64x2_t *deltas) {
  const int16x8_t neg_a0 = vnegq_s16(A[0]);

  for (int i = 0; i < 5; ++i) {
    deltas[i] = aom_sdotq_s16(deltas[i], neg_a0, B[i]);
  }
  for (int i = 1; i < 5; ++i) {
    deltas[4 + i] = aom_sdotq_s16(deltas[4 + i], vnegq_s16(A[i]), B[0]);
  }
}
218*77c1e3ccSAndroid Build Coastguard Worker
// Add dot-products to nine delta accumulators:
//   deltas[0..4] += A[0] . B[0..4]
//   deltas[5..8] += A[1..4] . B[0]
static inline void add_deltas_step4_sve(int16x8_t *A, int16x8_t *B,
                                        int64x2_t *deltas) {
  for (int i = 0; i < 5; ++i) {
    deltas[i] = aom_sdotq_s16(deltas[i], A[0], B[i]);
  }
  for (int i = 1; i < 5; ++i) {
    deltas[4 + i] = aom_sdotq_s16(deltas[4 + i], A[i], B[0]);
  }
}
231*77c1e3ccSAndroid Build Coastguard Worker
// Fill four 8-vector arrays, each holding four rows as (low, high) vector
// pairs at indices 2 * r and 2 * r + 1:
//   d_is: rows 0..3 of di                  (predicated SVE loads)
//   d_ie: rows height..height + 3 of di    (predicated SVE loads)
//   d_js: rows 0..3 of dj                  (plain NEON loads)
//   d_je: rows height..height + 3 of dj    (plain NEON loads)
// p0 governs the load of samples 0..7 of a di row and p1 the load of samples
// 8..15. The dj loads are unpredicated.
static inline void load_square_win5_sve(
    const int16_t *const di, const int16_t *const dj, const int32_t d_stride,
    const int32_t height, int16x8_t *d_is, int16x8_t *d_ie, int16x8_t *d_js,
    int16x8_t *d_je, svbool_t p0, svbool_t p1) {
  const int num_rows = 4;

  for (int r = 0; r < num_rows; ++r) {
    const int16_t *const row = di + r * d_stride;
    d_is[2 * r + 0] = svget_neonq_s16(svld1_s16(p0, row + 0));
    d_is[2 * r + 1] = svget_neonq_s16(svld1_s16(p1, row + 8));
  }

  for (int r = 0; r < num_rows; ++r) {
    const int16_t *const row = di + (height + r) * d_stride;
    d_ie[2 * r + 0] = svget_neonq_s16(svld1_s16(p0, row + 0));
    d_ie[2 * r + 1] = svget_neonq_s16(svld1_s16(p1, row + 8));
  }

  const int16_t *const dj_end = dj + height * d_stride;
  load_s16_8x4(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6]);
  load_s16_8x4(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7]);
  load_s16_8x4(dj_end + 0, d_stride, &d_je[0], &d_je[2], &d_je[4], &d_je[6]);
  load_s16_8x4(dj_end + 8, d_stride, &d_je[1], &d_je[3], &d_je[5], &d_je[7]);
}
261*77c1e3ccSAndroid Build Coastguard Worker
// dst[0..3] = src[0..3] + the four 64-bit lanes of delta[0] and delta[1].
// Both halves of src are read before anything is written to dst.
static inline void update_4_stats_sve(const int64_t *const src,
                                      const int64x2_t *delta,
                                      int64_t *const dst) {
  const int64x2_t src_lo = vld1q_s64(src + 0);
  const int64x2_t src_hi = vld1q_s64(src + 2);

  const int64x2_t sum_lo = vaddq_s64(src_lo, delta[0]);
  const int64x2_t sum_hi = vaddq_s64(src_hi, delta[1]);

  vst1q_s64(dst + 0, sum_lo);
  vst1q_s64(dst + 2, sum_hi);
}
271*77c1e3ccSAndroid Build Coastguard Worker
// Update the 4x4 grid of delta accumulators. For each (i, j) in 0..3 x 0..3:
//   deltas[i][j] += d_ie[2i] . d_je[2j] + d_ie[2i + 1] . d_je[2j + 1]
//                 - d_is[2i] . d_js[2j] - d_is[2i + 1] . d_js[2j + 1]
// where d_is refers to the values on entry.
// Side effect: d_is[0..7] is negated in place and left negated on return.
static inline void derive_square_win5_sve(
    int16x8_t *d_is, const int16x8_t *d_ie, const int16x8_t *d_js,
    const int16x8_t *d_je,
    int64x2_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1]) {
  const int num_rows = 4;

  // Negating once up front turns the start-row subtraction into an
  // accumulate.
  for (int k = 0; k < 2 * num_rows; ++k) {
    d_is[k] = vnegq_s16(d_is[k]);
  }

  for (int i = 0; i < num_rows; ++i) {
    for (int j = 0; j < num_rows; ++j) {
      int64x2_t acc = deltas[i][j];

      acc = aom_sdotq_s16(acc, d_is[2 * i + 0], d_js[2 * j + 0]);
      acc = aom_sdotq_s16(acc, d_is[2 * i + 1], d_js[2 * j + 1]);
      acc = aom_sdotq_s16(acc, d_ie[2 * i + 0], d_je[2 * j + 0]);
      acc = aom_sdotq_s16(acc, d_ie[2 * i + 1], d_je[2 * j + 1]);

      deltas[i][j] = acc;
    }
  }
}
357*77c1e3ccSAndroid Build Coastguard Worker
// Reduce four 64-bit delta accumulators horizontally and add the results to
// four consecutive statistics.
//   src:    four int64_t statistics to update (read only).
//   deltas: four int64x2_t accumulators; the two lanes of each are summed.
//   dst:    receives the four updated statistics, dst[k] = src[k] + sum of
//           the lanes of deltas[k].
static inline void hadd_update_4_stats_sve(const int64_t *const src,
                                           const int64x2_t *deltas,
                                           int64_t *const dst) {
  // Read all inputs before the first store.
  const int64x2_t stats01 = vld1q_s64(&src[0]);
  const int64x2_t stats23 = vld1q_s64(&src[2]);

  // A pairwise add folds each accumulator into a single lane and packs two
  // reduced deltas into one vector.
  const int64x2_t delta01 = vpaddq_s64(deltas[0], deltas[1]);
  const int64x2_t delta23 = vpaddq_s64(deltas[2], deltas[3]);

  vst1q_s64(&dst[0], vaddq_s64(stats01, delta01));
  vst1q_s64(&dst[2], vaddq_s64(stats23, delta23));
}
366*77c1e3ccSAndroid Build Coastguard Worker
// Load two groups of four rows, 16 columns each, as pairs of 8-lane vectors.
//   di:       pointer to the first column of the first start row.
//   d_stride: distance between consecutive rows, in elements.
//   height:   row offset of the second ("end") group relative to di.
//   d_is:     output, 8 vectors: rows 0..3 starting at di.
//   d_ie:     output, 8 vectors: rows height..height + 3.
//   p0, p1:   predicates applied to the loads at column offsets 0 and 8
//             respectively.
// For every row r, index 2 * r holds columns 0-7 and index 2 * r + 1 holds
// columns 8-15.
static inline void load_triangle_win5_sve(const int16_t *const di,
                                          const int32_t d_stride,
                                          const int32_t height, int16x8_t *d_is,
                                          int16x8_t *d_ie, svbool_t p0,
                                          svbool_t p1) {
  int32_t row;

  // Start rows.
  for (row = 0; row < 4; ++row) {
    const int16_t *const row_ptr = di + row * d_stride;
    d_is[2 * row + 0] = svget_neonq_s16(svld1_s16(p0, row_ptr + 0));
    d_is[2 * row + 1] = svget_neonq_s16(svld1_s16(p1, row_ptr + 8));
  }

  // End rows, `height` rows further down.
  for (row = 0; row < 4; ++row) {
    const int16_t *const row_ptr = di + (height + row) * d_stride;
    d_ie[2 * row + 0] = svget_neonq_s16(svld1_s16(p0, row_ptr + 0));
    d_ie[2 * row + 1] = svget_neonq_s16(svld1_s16(p1, row_ptr + 8));
  }
}
389*77c1e3ccSAndroid Build Coastguard Worker
// Update the ten upper-triangular accumulators of a 4x4 row-pair product
// table: subtract the contribution of the start rows and add the contribution
// of the end rows.
//   d_is:   8 vectors, two per start row (index 2 * r and 2 * r + 1).
//   d_ie:   8 vectors, two per end row, same layout as d_is.
//   deltas: 10 accumulators, updated in place. Row pairs (i, j) with
//           0 <= i <= j <= 3 are numbered row-major:
//           (0,0) (0,1) (0,2) (0,3) (1,1) (1,2) (1,3) (2,2) (2,3) (3,3).
// Products are accumulated through aom_sdotq_s16(); the subtraction is done
// by negating one operand before the accumulation.
static inline void derive_triangle_win5_sve(const int16x8_t *d_is,
                                            const int16x8_t *d_ie,
                                            int64x2_t *deltas) {
  int32_t i, j, k;

  // Pass 1: remove the start-row products.
  k = 0;
  for (i = 0; i < 4; ++i) {
    const int16x8_t neg_lo = vnegq_s16(d_is[2 * i + 0]);
    const int16x8_t neg_hi = vnegq_s16(d_is[2 * i + 1]);
    for (j = i; j < 4; ++j, ++k) {
      deltas[k] = aom_sdotq_s16(deltas[k], neg_lo, d_is[2 * j + 0]);
      deltas[k] = aom_sdotq_s16(deltas[k], neg_hi, d_is[2 * j + 1]);
    }
  }

  // Pass 2: add the end-row products.
  k = 0;
  for (i = 0; i < 4; ++i) {
    const int16x8_t end_lo = d_ie[2 * i + 0];
    const int16x8_t end_hi = d_ie[2 * i + 1];
    for (j = i; j < 4; ++j, ++k) {
      deltas[k] = aom_sdotq_s16(deltas[k], end_lo, d_ie[2 * j + 0]);
      deltas[k] = aom_sdotq_s16(deltas[k], end_hi, d_ie[2 * j + 1]);
    }
  }
}
435*77c1e3ccSAndroid Build Coastguard Worker
compute_stats_win5_sve(const int16_t * const d,const int32_t d_stride,const int16_t * const s,const int32_t s_stride,const int32_t width,const int32_t height,int64_t * const M,int64_t * const H)436*77c1e3ccSAndroid Build Coastguard Worker static inline void compute_stats_win5_sve(
437*77c1e3ccSAndroid Build Coastguard Worker const int16_t *const d, const int32_t d_stride, const int16_t *const s,
438*77c1e3ccSAndroid Build Coastguard Worker const int32_t s_stride, const int32_t width, const int32_t height,
439*77c1e3ccSAndroid Build Coastguard Worker int64_t *const M, int64_t *const H) {
440*77c1e3ccSAndroid Build Coastguard Worker const int32_t wiener_win = WIENER_WIN_CHROMA;
441*77c1e3ccSAndroid Build Coastguard Worker const int32_t wiener_win2 = wiener_win * wiener_win;
442*77c1e3ccSAndroid Build Coastguard Worker const int32_t h8 = height & ~7;
443*77c1e3ccSAndroid Build Coastguard Worker int32_t i, j, x, y;
444*77c1e3ccSAndroid Build Coastguard Worker
445*77c1e3ccSAndroid Build Coastguard Worker // Use a predicate to compute the last columns.
446*77c1e3ccSAndroid Build Coastguard Worker svbool_t p0 = svwhilelt_b16_u32(0, width % 16 == 0 ? 16 : width % 16);
447*77c1e3ccSAndroid Build Coastguard Worker svbool_t p1 = svwhilelt_b16_u32(8, width % 16 == 0 ? 16 : width % 16);
448*77c1e3ccSAndroid Build Coastguard Worker
449*77c1e3ccSAndroid Build Coastguard Worker // Step 1: Calculate the top edge of the whole matrix, i.e., the top
450*77c1e3ccSAndroid Build Coastguard Worker // edge of each triangle and square on the top row.
451*77c1e3ccSAndroid Build Coastguard Worker j = 0;
452*77c1e3ccSAndroid Build Coastguard Worker do {
453*77c1e3ccSAndroid Build Coastguard Worker const int16_t *s_t = s;
454*77c1e3ccSAndroid Build Coastguard Worker const int16_t *d_t = d;
455*77c1e3ccSAndroid Build Coastguard Worker int64x2_t sum_m[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) };
456*77c1e3ccSAndroid Build Coastguard Worker int64x2_t sum_h[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) };
457*77c1e3ccSAndroid Build Coastguard Worker int16x8_t src[2], dgd[2];
458*77c1e3ccSAndroid Build Coastguard Worker
459*77c1e3ccSAndroid Build Coastguard Worker y = height;
460*77c1e3ccSAndroid Build Coastguard Worker do {
461*77c1e3ccSAndroid Build Coastguard Worker x = 0;
462*77c1e3ccSAndroid Build Coastguard Worker while (x < width - 16) {
463*77c1e3ccSAndroid Build Coastguard Worker src[0] = vld1q_s16(s_t + x + 0);
464*77c1e3ccSAndroid Build Coastguard Worker src[1] = vld1q_s16(s_t + x + 8);
465*77c1e3ccSAndroid Build Coastguard Worker dgd[0] = vld1q_s16(d_t + x + 0);
466*77c1e3ccSAndroid Build Coastguard Worker dgd[1] = vld1q_s16(d_t + x + 8);
467*77c1e3ccSAndroid Build Coastguard Worker stats_top_win5_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h);
468*77c1e3ccSAndroid Build Coastguard Worker x += 16;
469*77c1e3ccSAndroid Build Coastguard Worker }
470*77c1e3ccSAndroid Build Coastguard Worker
471*77c1e3ccSAndroid Build Coastguard Worker src[0] = svget_neonq_s16(svld1_s16(p0, s_t + x + 0));
472*77c1e3ccSAndroid Build Coastguard Worker src[1] = svget_neonq_s16(svld1_s16(p1, s_t + x + 8));
473*77c1e3ccSAndroid Build Coastguard Worker dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + x + 0));
474*77c1e3ccSAndroid Build Coastguard Worker dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + x + 8));
475*77c1e3ccSAndroid Build Coastguard Worker
476*77c1e3ccSAndroid Build Coastguard Worker stats_top_win5_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h);
477*77c1e3ccSAndroid Build Coastguard Worker
478*77c1e3ccSAndroid Build Coastguard Worker s_t += s_stride;
479*77c1e3ccSAndroid Build Coastguard Worker d_t += d_stride;
480*77c1e3ccSAndroid Build Coastguard Worker } while (--y);
481*77c1e3ccSAndroid Build Coastguard Worker
482*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(&M[wiener_win * j + 0], vpaddq_s64(sum_m[0], sum_m[1]));
483*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(&M[wiener_win * j + 2], vpaddq_s64(sum_m[2], sum_m[3]));
484*77c1e3ccSAndroid Build Coastguard Worker M[wiener_win * j + 4] = vaddvq_s64(sum_m[4]);
485*77c1e3ccSAndroid Build Coastguard Worker
486*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(&H[wiener_win * j + 0], vpaddq_s64(sum_h[0], sum_h[1]));
487*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(&H[wiener_win * j + 2], vpaddq_s64(sum_h[2], sum_h[3]));
488*77c1e3ccSAndroid Build Coastguard Worker H[wiener_win * j + 4] = vaddvq_s64(sum_h[4]);
489*77c1e3ccSAndroid Build Coastguard Worker } while (++j < wiener_win);
490*77c1e3ccSAndroid Build Coastguard Worker
491*77c1e3ccSAndroid Build Coastguard Worker // Step 2: Calculate the left edge of each square on the top row.
492*77c1e3ccSAndroid Build Coastguard Worker j = 1;
493*77c1e3ccSAndroid Build Coastguard Worker do {
494*77c1e3ccSAndroid Build Coastguard Worker const int16_t *d_t = d;
495*77c1e3ccSAndroid Build Coastguard Worker int64x2_t sum_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s64(0) };
496*77c1e3ccSAndroid Build Coastguard Worker int16x8_t dgd[2];
497*77c1e3ccSAndroid Build Coastguard Worker
498*77c1e3ccSAndroid Build Coastguard Worker y = height;
499*77c1e3ccSAndroid Build Coastguard Worker do {
500*77c1e3ccSAndroid Build Coastguard Worker x = 0;
501*77c1e3ccSAndroid Build Coastguard Worker while (x < width - 16) {
502*77c1e3ccSAndroid Build Coastguard Worker dgd[0] = vld1q_s16(d_t + j + x + 0);
503*77c1e3ccSAndroid Build Coastguard Worker dgd[1] = vld1q_s16(d_t + j + x + 8);
504*77c1e3ccSAndroid Build Coastguard Worker stats_left_win5_sve(dgd, d_t + x, d_stride, sum_h);
505*77c1e3ccSAndroid Build Coastguard Worker x += 16;
506*77c1e3ccSAndroid Build Coastguard Worker }
507*77c1e3ccSAndroid Build Coastguard Worker
508*77c1e3ccSAndroid Build Coastguard Worker dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + j + x + 0));
509*77c1e3ccSAndroid Build Coastguard Worker dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + j + x + 8));
510*77c1e3ccSAndroid Build Coastguard Worker
511*77c1e3ccSAndroid Build Coastguard Worker stats_left_win5_sve(dgd, d_t + x, d_stride, sum_h);
512*77c1e3ccSAndroid Build Coastguard Worker
513*77c1e3ccSAndroid Build Coastguard Worker d_t += d_stride;
514*77c1e3ccSAndroid Build Coastguard Worker } while (--y);
515*77c1e3ccSAndroid Build Coastguard Worker
516*77c1e3ccSAndroid Build Coastguard Worker int64x2_t sum_h01 = vpaddq_s64(sum_h[0], sum_h[1]);
517*77c1e3ccSAndroid Build Coastguard Worker int64x2_t sum_h23 = vpaddq_s64(sum_h[2], sum_h[3]);
518*77c1e3ccSAndroid Build Coastguard Worker vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h01));
519*77c1e3ccSAndroid Build Coastguard Worker vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h01));
520*77c1e3ccSAndroid Build Coastguard Worker vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h23));
521*77c1e3ccSAndroid Build Coastguard Worker vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h23));
522*77c1e3ccSAndroid Build Coastguard Worker
523*77c1e3ccSAndroid Build Coastguard Worker } while (++j < wiener_win);
524*77c1e3ccSAndroid Build Coastguard Worker
525*77c1e3ccSAndroid Build Coastguard Worker // Step 3: Derive the top edge of each triangle along the diagonal. No
526*77c1e3ccSAndroid Build Coastguard Worker // triangle in top row.
527*77c1e3ccSAndroid Build Coastguard Worker {
528*77c1e3ccSAndroid Build Coastguard Worker const int16_t *d_t = d;
529*77c1e3ccSAndroid Build Coastguard Worker
530*77c1e3ccSAndroid Build Coastguard Worker if (height % 2) {
531*77c1e3ccSAndroid Build Coastguard Worker int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
532*77c1e3ccSAndroid Build Coastguard Worker int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
533*77c1e3ccSAndroid Build Coastguard Worker int16x8_t ds[WIENER_WIN * 2];
534*77c1e3ccSAndroid Build Coastguard Worker
535*77c1e3ccSAndroid Build Coastguard Worker load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]);
536*77c1e3ccSAndroid Build Coastguard Worker load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]);
537*77c1e3ccSAndroid Build Coastguard Worker d_t += 4 * d_stride;
538*77c1e3ccSAndroid Build Coastguard Worker
539*77c1e3ccSAndroid Build Coastguard Worker step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas);
540*77c1e3ccSAndroid Build Coastguard Worker transpose_arrays_s32_8x8(deltas, deltas_tr);
541*77c1e3ccSAndroid Build Coastguard Worker
542*77c1e3ccSAndroid Build Coastguard Worker update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
543*77c1e3ccSAndroid Build Coastguard Worker deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0),
544*77c1e3ccSAndroid Build Coastguard Worker H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
545*77c1e3ccSAndroid Build Coastguard Worker
546*77c1e3ccSAndroid Build Coastguard Worker update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
547*77c1e3ccSAndroid Build Coastguard Worker deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0),
548*77c1e3ccSAndroid Build Coastguard Worker H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
549*77c1e3ccSAndroid Build Coastguard Worker
550*77c1e3ccSAndroid Build Coastguard Worker update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
551*77c1e3ccSAndroid Build Coastguard Worker deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0),
552*77c1e3ccSAndroid Build Coastguard Worker H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
553*77c1e3ccSAndroid Build Coastguard Worker
554*77c1e3ccSAndroid Build Coastguard Worker update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
555*77c1e3ccSAndroid Build Coastguard Worker deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0),
556*77c1e3ccSAndroid Build Coastguard Worker H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
557*77c1e3ccSAndroid Build Coastguard Worker
558*77c1e3ccSAndroid Build Coastguard Worker } else {
559*77c1e3ccSAndroid Build Coastguard Worker int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) };
560*77c1e3ccSAndroid Build Coastguard Worker int16x8_t ds[WIENER_WIN_CHROMA * 2];
561*77c1e3ccSAndroid Build Coastguard Worker
562*77c1e3ccSAndroid Build Coastguard Worker ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
563*77c1e3ccSAndroid Build Coastguard Worker ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
564*77c1e3ccSAndroid Build Coastguard Worker ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
565*77c1e3ccSAndroid Build Coastguard Worker ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
566*77c1e3ccSAndroid Build Coastguard Worker
567*77c1e3ccSAndroid Build Coastguard Worker step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
568*77c1e3ccSAndroid Build Coastguard Worker
569*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
570*77c1e3ccSAndroid Build Coastguard Worker &deltas[3]);
571*77c1e3ccSAndroid Build Coastguard Worker
572*77c1e3ccSAndroid Build Coastguard Worker update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
573*77c1e3ccSAndroid Build Coastguard Worker deltas[0], vgetq_lane_s32(deltas[4], 0),
574*77c1e3ccSAndroid Build Coastguard Worker H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
575*77c1e3ccSAndroid Build Coastguard Worker
576*77c1e3ccSAndroid Build Coastguard Worker update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
577*77c1e3ccSAndroid Build Coastguard Worker deltas[1], vgetq_lane_s32(deltas[4], 1),
578*77c1e3ccSAndroid Build Coastguard Worker H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
579*77c1e3ccSAndroid Build Coastguard Worker
580*77c1e3ccSAndroid Build Coastguard Worker update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
581*77c1e3ccSAndroid Build Coastguard Worker deltas[2], vgetq_lane_s32(deltas[4], 2),
582*77c1e3ccSAndroid Build Coastguard Worker H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
583*77c1e3ccSAndroid Build Coastguard Worker
584*77c1e3ccSAndroid Build Coastguard Worker update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
585*77c1e3ccSAndroid Build Coastguard Worker deltas[3], vgetq_lane_s32(deltas[4], 3),
586*77c1e3ccSAndroid Build Coastguard Worker H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
587*77c1e3ccSAndroid Build Coastguard Worker }
588*77c1e3ccSAndroid Build Coastguard Worker }
589*77c1e3ccSAndroid Build Coastguard Worker
590*77c1e3ccSAndroid Build Coastguard Worker // Step 4: Derive the top and left edge of each square. No square in top and
591*77c1e3ccSAndroid Build Coastguard Worker // bottom row.
592*77c1e3ccSAndroid Build Coastguard Worker {
593*77c1e3ccSAndroid Build Coastguard Worker y = h8;
594*77c1e3ccSAndroid Build Coastguard Worker
595*77c1e3ccSAndroid Build Coastguard Worker int16x4_t d_s[12];
596*77c1e3ccSAndroid Build Coastguard Worker int16x4_t d_e[12];
597*77c1e3ccSAndroid Build Coastguard Worker const int16_t *d_t = d;
598*77c1e3ccSAndroid Build Coastguard Worker int16x4_t zeros = vdup_n_s16(0);
599*77c1e3ccSAndroid Build Coastguard Worker load_s16_4x4(d_t, d_stride, &d_s[0], &d_s[1], &d_s[2], &d_s[3]);
600*77c1e3ccSAndroid Build Coastguard Worker load_s16_4x4(d_t + width, d_stride, &d_e[0], &d_e[1], &d_e[2], &d_e[3]);
601*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas[6][18] = { { vdupq_n_s64(0) }, { vdupq_n_s64(0) } };
602*77c1e3ccSAndroid Build Coastguard Worker
603*77c1e3ccSAndroid Build Coastguard Worker while (y >= 8) {
604*77c1e3ccSAndroid Build Coastguard Worker load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6],
605*77c1e3ccSAndroid Build Coastguard Worker &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]);
606*77c1e3ccSAndroid Build Coastguard Worker load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5],
607*77c1e3ccSAndroid Build Coastguard Worker &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]);
608*77c1e3ccSAndroid Build Coastguard Worker
609*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s_tr[8], e_tr[8];
610*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5],
611*77c1e3ccSAndroid Build Coastguard Worker d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2],
612*77c1e3ccSAndroid Build Coastguard Worker &s_tr[3]);
613*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros,
614*77c1e3ccSAndroid Build Coastguard Worker zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6],
615*77c1e3ccSAndroid Build Coastguard Worker &s_tr[7]);
616*77c1e3ccSAndroid Build Coastguard Worker
617*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5],
618*77c1e3ccSAndroid Build Coastguard Worker d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2],
619*77c1e3ccSAndroid Build Coastguard Worker &e_tr[3]);
620*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros,
621*77c1e3ccSAndroid Build Coastguard Worker zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6],
622*77c1e3ccSAndroid Build Coastguard Worker &e_tr[7]);
623*77c1e3ccSAndroid Build Coastguard Worker
624*77c1e3ccSAndroid Build Coastguard Worker int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5];
625*77c1e3ccSAndroid Build Coastguard Worker start_col0[0] = s_tr[0];
626*77c1e3ccSAndroid Build Coastguard Worker start_col0[1] = vextq_s16(s_tr[0], s_tr[4], 1);
627*77c1e3ccSAndroid Build Coastguard Worker start_col0[2] = vextq_s16(s_tr[0], s_tr[4], 2);
628*77c1e3ccSAndroid Build Coastguard Worker start_col0[3] = vextq_s16(s_tr[0], s_tr[4], 3);
629*77c1e3ccSAndroid Build Coastguard Worker start_col0[4] = vextq_s16(s_tr[0], s_tr[4], 4);
630*77c1e3ccSAndroid Build Coastguard Worker
631*77c1e3ccSAndroid Build Coastguard Worker start_col1[0] = s_tr[1];
632*77c1e3ccSAndroid Build Coastguard Worker start_col1[1] = vextq_s16(s_tr[1], s_tr[5], 1);
633*77c1e3ccSAndroid Build Coastguard Worker start_col1[2] = vextq_s16(s_tr[1], s_tr[5], 2);
634*77c1e3ccSAndroid Build Coastguard Worker start_col1[3] = vextq_s16(s_tr[1], s_tr[5], 3);
635*77c1e3ccSAndroid Build Coastguard Worker start_col1[4] = vextq_s16(s_tr[1], s_tr[5], 4);
636*77c1e3ccSAndroid Build Coastguard Worker
637*77c1e3ccSAndroid Build Coastguard Worker start_col2[0] = s_tr[2];
638*77c1e3ccSAndroid Build Coastguard Worker start_col2[1] = vextq_s16(s_tr[2], s_tr[6], 1);
639*77c1e3ccSAndroid Build Coastguard Worker start_col2[2] = vextq_s16(s_tr[2], s_tr[6], 2);
640*77c1e3ccSAndroid Build Coastguard Worker start_col2[3] = vextq_s16(s_tr[2], s_tr[6], 3);
641*77c1e3ccSAndroid Build Coastguard Worker start_col2[4] = vextq_s16(s_tr[2], s_tr[6], 4);
642*77c1e3ccSAndroid Build Coastguard Worker
643*77c1e3ccSAndroid Build Coastguard Worker start_col3[0] = s_tr[3];
644*77c1e3ccSAndroid Build Coastguard Worker start_col3[1] = vextq_s16(s_tr[3], s_tr[7], 1);
645*77c1e3ccSAndroid Build Coastguard Worker start_col3[2] = vextq_s16(s_tr[3], s_tr[7], 2);
646*77c1e3ccSAndroid Build Coastguard Worker start_col3[3] = vextq_s16(s_tr[3], s_tr[7], 3);
647*77c1e3ccSAndroid Build Coastguard Worker start_col3[4] = vextq_s16(s_tr[3], s_tr[7], 4);
648*77c1e3ccSAndroid Build Coastguard Worker
649*77c1e3ccSAndroid Build Coastguard Worker // i = 1, j = 2;
650*77c1e3ccSAndroid Build Coastguard Worker sub_deltas_step4_sve(start_col0, start_col1, deltas[0]);
651*77c1e3ccSAndroid Build Coastguard Worker
652*77c1e3ccSAndroid Build Coastguard Worker // i = 1, j = 3;
653*77c1e3ccSAndroid Build Coastguard Worker sub_deltas_step4_sve(start_col0, start_col2, deltas[1]);
654*77c1e3ccSAndroid Build Coastguard Worker
655*77c1e3ccSAndroid Build Coastguard Worker // i = 1, j = 4
656*77c1e3ccSAndroid Build Coastguard Worker sub_deltas_step4_sve(start_col0, start_col3, deltas[2]);
657*77c1e3ccSAndroid Build Coastguard Worker
658*77c1e3ccSAndroid Build Coastguard Worker // i = 2, j =3
659*77c1e3ccSAndroid Build Coastguard Worker sub_deltas_step4_sve(start_col1, start_col2, deltas[3]);
660*77c1e3ccSAndroid Build Coastguard Worker
661*77c1e3ccSAndroid Build Coastguard Worker // i = 2, j = 4
662*77c1e3ccSAndroid Build Coastguard Worker sub_deltas_step4_sve(start_col1, start_col3, deltas[4]);
663*77c1e3ccSAndroid Build Coastguard Worker
664*77c1e3ccSAndroid Build Coastguard Worker // i = 3, j = 4
665*77c1e3ccSAndroid Build Coastguard Worker sub_deltas_step4_sve(start_col2, start_col3, deltas[5]);
666*77c1e3ccSAndroid Build Coastguard Worker
667*77c1e3ccSAndroid Build Coastguard Worker int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5];
668*77c1e3ccSAndroid Build Coastguard Worker end_col0[0] = e_tr[0];
669*77c1e3ccSAndroid Build Coastguard Worker end_col0[1] = vextq_s16(e_tr[0], e_tr[4], 1);
670*77c1e3ccSAndroid Build Coastguard Worker end_col0[2] = vextq_s16(e_tr[0], e_tr[4], 2);
671*77c1e3ccSAndroid Build Coastguard Worker end_col0[3] = vextq_s16(e_tr[0], e_tr[4], 3);
672*77c1e3ccSAndroid Build Coastguard Worker end_col0[4] = vextq_s16(e_tr[0], e_tr[4], 4);
673*77c1e3ccSAndroid Build Coastguard Worker
674*77c1e3ccSAndroid Build Coastguard Worker end_col1[0] = e_tr[1];
675*77c1e3ccSAndroid Build Coastguard Worker end_col1[1] = vextq_s16(e_tr[1], e_tr[5], 1);
676*77c1e3ccSAndroid Build Coastguard Worker end_col1[2] = vextq_s16(e_tr[1], e_tr[5], 2);
677*77c1e3ccSAndroid Build Coastguard Worker end_col1[3] = vextq_s16(e_tr[1], e_tr[5], 3);
678*77c1e3ccSAndroid Build Coastguard Worker end_col1[4] = vextq_s16(e_tr[1], e_tr[5], 4);
679*77c1e3ccSAndroid Build Coastguard Worker
680*77c1e3ccSAndroid Build Coastguard Worker end_col2[0] = e_tr[2];
681*77c1e3ccSAndroid Build Coastguard Worker end_col2[1] = vextq_s16(e_tr[2], e_tr[6], 1);
682*77c1e3ccSAndroid Build Coastguard Worker end_col2[2] = vextq_s16(e_tr[2], e_tr[6], 2);
683*77c1e3ccSAndroid Build Coastguard Worker end_col2[3] = vextq_s16(e_tr[2], e_tr[6], 3);
684*77c1e3ccSAndroid Build Coastguard Worker end_col2[4] = vextq_s16(e_tr[2], e_tr[6], 4);
685*77c1e3ccSAndroid Build Coastguard Worker
686*77c1e3ccSAndroid Build Coastguard Worker end_col3[0] = e_tr[3];
687*77c1e3ccSAndroid Build Coastguard Worker end_col3[1] = vextq_s16(e_tr[3], e_tr[7], 1);
688*77c1e3ccSAndroid Build Coastguard Worker end_col3[2] = vextq_s16(e_tr[3], e_tr[7], 2);
689*77c1e3ccSAndroid Build Coastguard Worker end_col3[3] = vextq_s16(e_tr[3], e_tr[7], 3);
690*77c1e3ccSAndroid Build Coastguard Worker end_col3[4] = vextq_s16(e_tr[3], e_tr[7], 4);
691*77c1e3ccSAndroid Build Coastguard Worker
692*77c1e3ccSAndroid Build Coastguard Worker // i = 1, j = 2;
693*77c1e3ccSAndroid Build Coastguard Worker add_deltas_step4_sve(end_col0, end_col1, deltas[0]);
694*77c1e3ccSAndroid Build Coastguard Worker
695*77c1e3ccSAndroid Build Coastguard Worker // i = 1, j = 3;
696*77c1e3ccSAndroid Build Coastguard Worker add_deltas_step4_sve(end_col0, end_col2, deltas[1]);
697*77c1e3ccSAndroid Build Coastguard Worker
698*77c1e3ccSAndroid Build Coastguard Worker // i = 1, j = 4
699*77c1e3ccSAndroid Build Coastguard Worker add_deltas_step4_sve(end_col0, end_col3, deltas[2]);
700*77c1e3ccSAndroid Build Coastguard Worker
701*77c1e3ccSAndroid Build Coastguard Worker // i = 2, j =3
702*77c1e3ccSAndroid Build Coastguard Worker add_deltas_step4_sve(end_col1, end_col2, deltas[3]);
703*77c1e3ccSAndroid Build Coastguard Worker
704*77c1e3ccSAndroid Build Coastguard Worker // i = 2, j = 4
705*77c1e3ccSAndroid Build Coastguard Worker add_deltas_step4_sve(end_col1, end_col3, deltas[4]);
706*77c1e3ccSAndroid Build Coastguard Worker
707*77c1e3ccSAndroid Build Coastguard Worker // i = 3, j = 4
708*77c1e3ccSAndroid Build Coastguard Worker add_deltas_step4_sve(end_col2, end_col3, deltas[5]);
709*77c1e3ccSAndroid Build Coastguard Worker
710*77c1e3ccSAndroid Build Coastguard Worker d_s[0] = d_s[8];
711*77c1e3ccSAndroid Build Coastguard Worker d_s[1] = d_s[9];
712*77c1e3ccSAndroid Build Coastguard Worker d_s[2] = d_s[10];
713*77c1e3ccSAndroid Build Coastguard Worker d_s[3] = d_s[11];
714*77c1e3ccSAndroid Build Coastguard Worker d_e[0] = d_e[8];
715*77c1e3ccSAndroid Build Coastguard Worker d_e[1] = d_e[9];
716*77c1e3ccSAndroid Build Coastguard Worker d_e[2] = d_e[10];
717*77c1e3ccSAndroid Build Coastguard Worker d_e[3] = d_e[11];
718*77c1e3ccSAndroid Build Coastguard Worker
719*77c1e3ccSAndroid Build Coastguard Worker d_t += 8 * d_stride;
720*77c1e3ccSAndroid Build Coastguard Worker y -= 8;
721*77c1e3ccSAndroid Build Coastguard Worker }
722*77c1e3ccSAndroid Build Coastguard Worker
723*77c1e3ccSAndroid Build Coastguard Worker if (h8 != height) {
724*77c1e3ccSAndroid Build Coastguard Worker const int16x8_t mask_h = vld1q_s16(&mask_16bit[16] - (height % 8));
725*77c1e3ccSAndroid Build Coastguard Worker
726*77c1e3ccSAndroid Build Coastguard Worker load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6],
727*77c1e3ccSAndroid Build Coastguard Worker &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]);
728*77c1e3ccSAndroid Build Coastguard Worker load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5],
729*77c1e3ccSAndroid Build Coastguard Worker &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]);
730*77c1e3ccSAndroid Build Coastguard Worker int16x8_t s_tr[8], e_tr[8];
731*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5],
732*77c1e3ccSAndroid Build Coastguard Worker d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2],
733*77c1e3ccSAndroid Build Coastguard Worker &s_tr[3]);
734*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros,
735*77c1e3ccSAndroid Build Coastguard Worker zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6],
736*77c1e3ccSAndroid Build Coastguard Worker &s_tr[7]);
737*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5],
738*77c1e3ccSAndroid Build Coastguard Worker d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2],
739*77c1e3ccSAndroid Build Coastguard Worker &e_tr[3]);
740*77c1e3ccSAndroid Build Coastguard Worker transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros,
741*77c1e3ccSAndroid Build Coastguard Worker zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6],
742*77c1e3ccSAndroid Build Coastguard Worker &e_tr[7]);
743*77c1e3ccSAndroid Build Coastguard Worker
744*77c1e3ccSAndroid Build Coastguard Worker int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5];
745*77c1e3ccSAndroid Build Coastguard Worker start_col0[0] = vandq_s16(s_tr[0], mask_h);
746*77c1e3ccSAndroid Build Coastguard Worker start_col0[1] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 1), mask_h);
747*77c1e3ccSAndroid Build Coastguard Worker start_col0[2] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 2), mask_h);
748*77c1e3ccSAndroid Build Coastguard Worker start_col0[3] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 3), mask_h);
749*77c1e3ccSAndroid Build Coastguard Worker start_col0[4] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 4), mask_h);
750*77c1e3ccSAndroid Build Coastguard Worker
751*77c1e3ccSAndroid Build Coastguard Worker start_col1[0] = vandq_s16(s_tr[1], mask_h);
752*77c1e3ccSAndroid Build Coastguard Worker start_col1[1] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 1), mask_h);
753*77c1e3ccSAndroid Build Coastguard Worker start_col1[2] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 2), mask_h);
754*77c1e3ccSAndroid Build Coastguard Worker start_col1[3] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 3), mask_h);
755*77c1e3ccSAndroid Build Coastguard Worker start_col1[4] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 4), mask_h);
756*77c1e3ccSAndroid Build Coastguard Worker
757*77c1e3ccSAndroid Build Coastguard Worker start_col2[0] = vandq_s16(s_tr[2], mask_h);
758*77c1e3ccSAndroid Build Coastguard Worker start_col2[1] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 1), mask_h);
759*77c1e3ccSAndroid Build Coastguard Worker start_col2[2] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 2), mask_h);
760*77c1e3ccSAndroid Build Coastguard Worker start_col2[3] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 3), mask_h);
761*77c1e3ccSAndroid Build Coastguard Worker start_col2[4] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 4), mask_h);
762*77c1e3ccSAndroid Build Coastguard Worker
763*77c1e3ccSAndroid Build Coastguard Worker start_col3[0] = vandq_s16(s_tr[3], mask_h);
764*77c1e3ccSAndroid Build Coastguard Worker start_col3[1] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 1), mask_h);
765*77c1e3ccSAndroid Build Coastguard Worker start_col3[2] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 2), mask_h);
766*77c1e3ccSAndroid Build Coastguard Worker start_col3[3] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 3), mask_h);
767*77c1e3ccSAndroid Build Coastguard Worker start_col3[4] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 4), mask_h);
768*77c1e3ccSAndroid Build Coastguard Worker
769*77c1e3ccSAndroid Build Coastguard Worker // i = 1, j = 2;
770*77c1e3ccSAndroid Build Coastguard Worker sub_deltas_step4_sve(start_col0, start_col1, deltas[0]);
771*77c1e3ccSAndroid Build Coastguard Worker
772*77c1e3ccSAndroid Build Coastguard Worker // i = 1, j = 3;
773*77c1e3ccSAndroid Build Coastguard Worker sub_deltas_step4_sve(start_col0, start_col2, deltas[1]);
774*77c1e3ccSAndroid Build Coastguard Worker
775*77c1e3ccSAndroid Build Coastguard Worker // i = 1, j = 4
776*77c1e3ccSAndroid Build Coastguard Worker sub_deltas_step4_sve(start_col0, start_col3, deltas[2]);
777*77c1e3ccSAndroid Build Coastguard Worker
778*77c1e3ccSAndroid Build Coastguard Worker // i = 2, j = 3
779*77c1e3ccSAndroid Build Coastguard Worker sub_deltas_step4_sve(start_col1, start_col2, deltas[3]);
780*77c1e3ccSAndroid Build Coastguard Worker
781*77c1e3ccSAndroid Build Coastguard Worker // i = 2, j = 4
782*77c1e3ccSAndroid Build Coastguard Worker sub_deltas_step4_sve(start_col1, start_col3, deltas[4]);
783*77c1e3ccSAndroid Build Coastguard Worker
784*77c1e3ccSAndroid Build Coastguard Worker // i = 3, j = 4
785*77c1e3ccSAndroid Build Coastguard Worker sub_deltas_step4_sve(start_col2, start_col3, deltas[5]);
786*77c1e3ccSAndroid Build Coastguard Worker
787*77c1e3ccSAndroid Build Coastguard Worker int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5];
788*77c1e3ccSAndroid Build Coastguard Worker end_col0[0] = vandq_s16(e_tr[0], mask_h);
789*77c1e3ccSAndroid Build Coastguard Worker end_col0[1] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 1), mask_h);
790*77c1e3ccSAndroid Build Coastguard Worker end_col0[2] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 2), mask_h);
791*77c1e3ccSAndroid Build Coastguard Worker end_col0[3] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 3), mask_h);
792*77c1e3ccSAndroid Build Coastguard Worker end_col0[4] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 4), mask_h);
793*77c1e3ccSAndroid Build Coastguard Worker
794*77c1e3ccSAndroid Build Coastguard Worker end_col1[0] = vandq_s16(e_tr[1], mask_h);
795*77c1e3ccSAndroid Build Coastguard Worker end_col1[1] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 1), mask_h);
796*77c1e3ccSAndroid Build Coastguard Worker end_col1[2] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 2), mask_h);
797*77c1e3ccSAndroid Build Coastguard Worker end_col1[3] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 3), mask_h);
798*77c1e3ccSAndroid Build Coastguard Worker end_col1[4] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 4), mask_h);
799*77c1e3ccSAndroid Build Coastguard Worker
800*77c1e3ccSAndroid Build Coastguard Worker end_col2[0] = vandq_s16(e_tr[2], mask_h);
801*77c1e3ccSAndroid Build Coastguard Worker end_col2[1] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 1), mask_h);
802*77c1e3ccSAndroid Build Coastguard Worker end_col2[2] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 2), mask_h);
803*77c1e3ccSAndroid Build Coastguard Worker end_col2[3] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 3), mask_h);
804*77c1e3ccSAndroid Build Coastguard Worker end_col2[4] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 4), mask_h);
805*77c1e3ccSAndroid Build Coastguard Worker
806*77c1e3ccSAndroid Build Coastguard Worker end_col3[0] = vandq_s16(e_tr[3], mask_h);
807*77c1e3ccSAndroid Build Coastguard Worker end_col3[1] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 1), mask_h);
808*77c1e3ccSAndroid Build Coastguard Worker end_col3[2] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 2), mask_h);
809*77c1e3ccSAndroid Build Coastguard Worker end_col3[3] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 3), mask_h);
810*77c1e3ccSAndroid Build Coastguard Worker end_col3[4] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 4), mask_h);
811*77c1e3ccSAndroid Build Coastguard Worker
812*77c1e3ccSAndroid Build Coastguard Worker // i = 1, j = 2;
813*77c1e3ccSAndroid Build Coastguard Worker add_deltas_step4_sve(end_col0, end_col1, deltas[0]);
814*77c1e3ccSAndroid Build Coastguard Worker
815*77c1e3ccSAndroid Build Coastguard Worker // i = 1, j = 3;
816*77c1e3ccSAndroid Build Coastguard Worker add_deltas_step4_sve(end_col0, end_col2, deltas[1]);
817*77c1e3ccSAndroid Build Coastguard Worker
818*77c1e3ccSAndroid Build Coastguard Worker // i = 1, j = 4
819*77c1e3ccSAndroid Build Coastguard Worker add_deltas_step4_sve(end_col0, end_col3, deltas[2]);
820*77c1e3ccSAndroid Build Coastguard Worker
821*77c1e3ccSAndroid Build Coastguard Worker // i = 2, j = 3
822*77c1e3ccSAndroid Build Coastguard Worker add_deltas_step4_sve(end_col1, end_col2, deltas[3]);
823*77c1e3ccSAndroid Build Coastguard Worker
824*77c1e3ccSAndroid Build Coastguard Worker // i = 2, j = 4
825*77c1e3ccSAndroid Build Coastguard Worker add_deltas_step4_sve(end_col1, end_col3, deltas[4]);
826*77c1e3ccSAndroid Build Coastguard Worker
827*77c1e3ccSAndroid Build Coastguard Worker // i = 3, j = 4
828*77c1e3ccSAndroid Build Coastguard Worker add_deltas_step4_sve(end_col2, end_col3, deltas[5]);
829*77c1e3ccSAndroid Build Coastguard Worker }
830*77c1e3ccSAndroid Build Coastguard Worker
831*77c1e3ccSAndroid Build Coastguard Worker int64_t single_delta[6];
832*77c1e3ccSAndroid Build Coastguard Worker
833*77c1e3ccSAndroid Build Coastguard Worker deltas[0][0] = vpaddq_s64(deltas[0][0], deltas[0][1]);
834*77c1e3ccSAndroid Build Coastguard Worker deltas[0][1] = vpaddq_s64(deltas[0][2], deltas[0][3]);
835*77c1e3ccSAndroid Build Coastguard Worker deltas[1][0] = vpaddq_s64(deltas[1][0], deltas[1][1]);
836*77c1e3ccSAndroid Build Coastguard Worker deltas[1][1] = vpaddq_s64(deltas[1][2], deltas[1][3]);
837*77c1e3ccSAndroid Build Coastguard Worker deltas[2][0] = vpaddq_s64(deltas[2][0], deltas[2][1]);
838*77c1e3ccSAndroid Build Coastguard Worker deltas[2][1] = vpaddq_s64(deltas[2][2], deltas[2][3]);
839*77c1e3ccSAndroid Build Coastguard Worker deltas[3][0] = vpaddq_s64(deltas[3][0], deltas[3][1]);
840*77c1e3ccSAndroid Build Coastguard Worker deltas[3][1] = vpaddq_s64(deltas[3][2], deltas[3][3]);
841*77c1e3ccSAndroid Build Coastguard Worker deltas[4][0] = vpaddq_s64(deltas[4][0], deltas[4][1]);
842*77c1e3ccSAndroid Build Coastguard Worker deltas[4][1] = vpaddq_s64(deltas[4][2], deltas[4][3]);
843*77c1e3ccSAndroid Build Coastguard Worker deltas[5][0] = vpaddq_s64(deltas[5][0], deltas[5][1]);
844*77c1e3ccSAndroid Build Coastguard Worker deltas[5][1] = vpaddq_s64(deltas[5][2], deltas[5][3]);
845*77c1e3ccSAndroid Build Coastguard Worker
846*77c1e3ccSAndroid Build Coastguard Worker deltas[0][5] = vpaddq_s64(deltas[0][5], deltas[0][6]);
847*77c1e3ccSAndroid Build Coastguard Worker deltas[0][7] = vpaddq_s64(deltas[0][7], deltas[0][8]);
848*77c1e3ccSAndroid Build Coastguard Worker deltas[1][5] = vpaddq_s64(deltas[1][5], deltas[1][6]);
849*77c1e3ccSAndroid Build Coastguard Worker deltas[1][7] = vpaddq_s64(deltas[1][7], deltas[1][8]);
850*77c1e3ccSAndroid Build Coastguard Worker deltas[2][5] = vpaddq_s64(deltas[2][5], deltas[2][6]);
851*77c1e3ccSAndroid Build Coastguard Worker deltas[2][7] = vpaddq_s64(deltas[2][7], deltas[2][8]);
852*77c1e3ccSAndroid Build Coastguard Worker deltas[3][5] = vpaddq_s64(deltas[3][5], deltas[3][6]);
853*77c1e3ccSAndroid Build Coastguard Worker deltas[3][7] = vpaddq_s64(deltas[3][7], deltas[3][8]);
854*77c1e3ccSAndroid Build Coastguard Worker deltas[4][5] = vpaddq_s64(deltas[4][5], deltas[4][6]);
855*77c1e3ccSAndroid Build Coastguard Worker deltas[4][7] = vpaddq_s64(deltas[4][7], deltas[4][8]);
856*77c1e3ccSAndroid Build Coastguard Worker deltas[5][5] = vpaddq_s64(deltas[5][5], deltas[5][6]);
857*77c1e3ccSAndroid Build Coastguard Worker deltas[5][7] = vpaddq_s64(deltas[5][7], deltas[5][8]);
858*77c1e3ccSAndroid Build Coastguard Worker
859*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(single_delta + 0, vpaddq_s64(deltas[0][4], deltas[1][4]));
860*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(single_delta + 2, vpaddq_s64(deltas[2][4], deltas[3][4]));
861*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(single_delta + 4, vpaddq_s64(deltas[4][4], deltas[5][4]));
862*77c1e3ccSAndroid Build Coastguard Worker
863*77c1e3ccSAndroid Build Coastguard Worker int idx = 0;
864*77c1e3ccSAndroid Build Coastguard Worker for (i = 1; i < wiener_win - 1; i++) {
865*77c1e3ccSAndroid Build Coastguard Worker for (j = i + 1; j < wiener_win; j++) {
866*77c1e3ccSAndroid Build Coastguard Worker update_4_stats_sve(
867*77c1e3ccSAndroid Build Coastguard Worker H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win,
868*77c1e3ccSAndroid Build Coastguard Worker deltas[idx], H + i * wiener_win * wiener_win2 + j * wiener_win);
869*77c1e3ccSAndroid Build Coastguard Worker H[i * wiener_win * wiener_win2 + j * wiener_win + 4] =
870*77c1e3ccSAndroid Build Coastguard Worker H[(i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win + 4] +
871*77c1e3ccSAndroid Build Coastguard Worker single_delta[idx];
872*77c1e3ccSAndroid Build Coastguard Worker
873*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
874*77c1e3ccSAndroid Build Coastguard Worker H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] +
875*77c1e3ccSAndroid Build Coastguard Worker vgetq_lane_s64(deltas[idx][5], 0);
876*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
877*77c1e3ccSAndroid Build Coastguard Worker H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] +
878*77c1e3ccSAndroid Build Coastguard Worker vgetq_lane_s64(deltas[idx][5], 1);
879*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
880*77c1e3ccSAndroid Build Coastguard Worker H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] +
881*77c1e3ccSAndroid Build Coastguard Worker vgetq_lane_s64(deltas[idx][7], 0);
882*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
883*77c1e3ccSAndroid Build Coastguard Worker H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] +
884*77c1e3ccSAndroid Build Coastguard Worker vgetq_lane_s64(deltas[idx][7], 1);
885*77c1e3ccSAndroid Build Coastguard Worker
886*77c1e3ccSAndroid Build Coastguard Worker idx++;
887*77c1e3ccSAndroid Build Coastguard Worker }
888*77c1e3ccSAndroid Build Coastguard Worker }
889*77c1e3ccSAndroid Build Coastguard Worker }
890*77c1e3ccSAndroid Build Coastguard Worker
891*77c1e3ccSAndroid Build Coastguard Worker // Step 5: Derive other points of each square. No square in bottom row.
892*77c1e3ccSAndroid Build Coastguard Worker i = 0;
893*77c1e3ccSAndroid Build Coastguard Worker do {
894*77c1e3ccSAndroid Build Coastguard Worker const int16_t *const di = d + i;
895*77c1e3ccSAndroid Build Coastguard Worker
896*77c1e3ccSAndroid Build Coastguard Worker j = i + 1;
897*77c1e3ccSAndroid Build Coastguard Worker do {
898*77c1e3ccSAndroid Build Coastguard Worker const int16_t *const dj = d + j;
899*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1] = {
900*77c1e3ccSAndroid Build Coastguard Worker { vdupq_n_s64(0) }, { vdupq_n_s64(0) }
901*77c1e3ccSAndroid Build Coastguard Worker };
902*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];
903*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d_js[WIN_CHROMA], d_je[WIN_CHROMA];
904*77c1e3ccSAndroid Build Coastguard Worker
905*77c1e3ccSAndroid Build Coastguard Worker x = 0;
906*77c1e3ccSAndroid Build Coastguard Worker while (x < width - 16) {
907*77c1e3ccSAndroid Build Coastguard Worker load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
908*77c1e3ccSAndroid Build Coastguard Worker d_js, d_je);
909*77c1e3ccSAndroid Build Coastguard Worker derive_square_win5_sve(d_is, d_ie, d_js, d_je, deltas);
910*77c1e3ccSAndroid Build Coastguard Worker x += 16;
911*77c1e3ccSAndroid Build Coastguard Worker }
912*77c1e3ccSAndroid Build Coastguard Worker
913*77c1e3ccSAndroid Build Coastguard Worker load_square_win5_sve(di + x, dj + x, d_stride, height, d_is, d_ie, d_js,
914*77c1e3ccSAndroid Build Coastguard Worker d_je, p0, p1);
915*77c1e3ccSAndroid Build Coastguard Worker derive_square_win5_sve(d_is, d_ie, d_js, d_je, deltas);
916*77c1e3ccSAndroid Build Coastguard Worker
917*77c1e3ccSAndroid Build Coastguard Worker hadd_update_4_stats_sve(
918*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0],
919*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
920*77c1e3ccSAndroid Build Coastguard Worker hadd_update_4_stats_sve(
921*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1],
922*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
923*77c1e3ccSAndroid Build Coastguard Worker hadd_update_4_stats_sve(
924*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2],
925*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1);
926*77c1e3ccSAndroid Build Coastguard Worker hadd_update_4_stats_sve(
927*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3],
928*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1);
929*77c1e3ccSAndroid Build Coastguard Worker } while (++j < wiener_win);
930*77c1e3ccSAndroid Build Coastguard Worker } while (++i < wiener_win - 1);
931*77c1e3ccSAndroid Build Coastguard Worker
932*77c1e3ccSAndroid Build Coastguard Worker // Step 6: Derive other points of each upper triangle along the diagonal.
933*77c1e3ccSAndroid Build Coastguard Worker i = 0;
934*77c1e3ccSAndroid Build Coastguard Worker do {
935*77c1e3ccSAndroid Build Coastguard Worker const int16_t *const di = d + i;
936*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas[WIENER_WIN_CHROMA * 2 + 1] = { vdupq_n_s64(0) };
937*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];
938*77c1e3ccSAndroid Build Coastguard Worker
939*77c1e3ccSAndroid Build Coastguard Worker x = 0;
940*77c1e3ccSAndroid Build Coastguard Worker while (x < width - 16) {
941*77c1e3ccSAndroid Build Coastguard Worker load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie);
942*77c1e3ccSAndroid Build Coastguard Worker derive_triangle_win5_sve(d_is, d_ie, deltas);
943*77c1e3ccSAndroid Build Coastguard Worker x += 16;
944*77c1e3ccSAndroid Build Coastguard Worker }
945*77c1e3ccSAndroid Build Coastguard Worker
946*77c1e3ccSAndroid Build Coastguard Worker load_triangle_win5_sve(di + x, d_stride, height, d_is, d_ie, p0, p1);
947*77c1e3ccSAndroid Build Coastguard Worker derive_triangle_win5_sve(d_is, d_ie, deltas);
948*77c1e3ccSAndroid Build Coastguard Worker
949*77c1e3ccSAndroid Build Coastguard Worker // Row 1: 4 points
950*77c1e3ccSAndroid Build Coastguard Worker hadd_update_4_stats_sve(
951*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas,
952*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
953*77c1e3ccSAndroid Build Coastguard Worker
954*77c1e3ccSAndroid Build Coastguard Worker // Row 2: 3 points
955*77c1e3ccSAndroid Build Coastguard Worker int64x2_t src0 =
956*77c1e3ccSAndroid Build Coastguard Worker vld1q_s64(H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
957*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2,
958*77c1e3ccSAndroid Build Coastguard Worker vaddq_s64(src0, vpaddq_s64(deltas[4], deltas[5])));
959*77c1e3ccSAndroid Build Coastguard Worker
960*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas69 = vpaddq_s64(deltas[6], deltas[9]);
961*77c1e3ccSAndroid Build Coastguard Worker
962*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 4] =
963*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 3] +
964*77c1e3ccSAndroid Build Coastguard Worker vgetq_lane_s64(deltas69, 0);
965*77c1e3ccSAndroid Build Coastguard Worker
966*77c1e3ccSAndroid Build Coastguard Worker // Row 3: 2 points
967*77c1e3ccSAndroid Build Coastguard Worker int64x2_t src1 =
968*77c1e3ccSAndroid Build Coastguard Worker vld1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2);
969*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3,
970*77c1e3ccSAndroid Build Coastguard Worker vaddq_s64(src1, vpaddq_s64(deltas[7], deltas[8])));
971*77c1e3ccSAndroid Build Coastguard Worker
972*77c1e3ccSAndroid Build Coastguard Worker // Row 4: 1 point
973*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] =
974*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3] +
975*77c1e3ccSAndroid Build Coastguard Worker vgetq_lane_s64(deltas69, 1);
976*77c1e3ccSAndroid Build Coastguard Worker } while (++i < wiener_win);
977*77c1e3ccSAndroid Build Coastguard Worker }
978*77c1e3ccSAndroid Build Coastguard Worker
// Accumulates, for a 16-column strip, the dot products of `src` and of `dgd`
// with each of seven rows read from `d`.
//
//   src, dgd  Two 8-lane vectors each: [0] is paired with columns 0-7 of a
//             row, [1] with columns 8-15.
//   d         Start of the first row; the rows are fetched by load_s16_8x7()
//             with a pitch of d_stride (assumes that helper loads seven
//             consecutive rows of eight int16_t -- confirm in its definition).
//   sum_m[r]  Updated with src  . row r, for r = 0..6.
//   sum_h[r]  Updated with dgd  . row r, for r = 0..6.
//
// aom_sdotq_s16(acc, a, b) is taken to return `acc` plus the widened dot
// product of `a` and `b`; see aom_neon_sve_bridge.h for the exact definition.
static inline void stats_top_win7_sve(const int16x8_t src[2],
                                      const int16x8_t dgd[2],
                                      const int16_t *const d,
                                      const int32_t d_stride, int64x2_t *sum_m,
                                      int64x2_t *sum_h) {
  // Row r lives in rows[2 * r] (columns 0-7) and rows[2 * r + 1] (columns
  // 8-15), so the two halves of a row sit next to each other.
  int16x8_t rows[WIENER_WIN * 2];

  load_s16_8x7(d + 0, d_stride, &rows[0], &rows[2], &rows[4], &rows[6],
               &rows[8], &rows[10], &rows[12]);
  load_s16_8x7(d + 8, d_stride, &rows[1], &rows[3], &rows[5], &rows[7],
               &rows[9], &rows[11], &rows[13]);

  // All sum_m updates are issued before any sum_h update, low half of each
  // row first.
  for (int r = 0; r < 7; r++) {
    sum_m[r] = aom_sdotq_s16(sum_m[r], src[0], rows[2 * r + 0]);
    sum_m[r] = aom_sdotq_s16(sum_m[r], src[1], rows[2 * r + 1]);
  }

  for (int r = 0; r < 7; r++) {
    sum_h[r] = aom_sdotq_s16(sum_h[r], dgd[0], rows[2 * r + 0]);
    sum_h[r] = aom_sdotq_s16(sum_h[r], dgd[1], rows[2 * r + 1]);
  }
}
1021*77c1e3ccSAndroid Build Coastguard Worker
// Accumulates, for a 16-column strip, the dot products of `src` with each of
// six rows read from `d`, starting one row below `d` (at d + d_stride).
//
//   src     Two 8-lane vectors: [0] is paired with columns 0-7 of a row, [1]
//           with columns 8-15.
//   d       Base pointer; the first row used is d + d_stride. Rows are
//           fetched by load_s16_8x6() with a pitch of d_stride (assumes that
//           helper loads six consecutive rows of eight int16_t -- confirm in
//           its definition).
//   sum[r]  Updated with src . row r, for r = 0..5.
static inline void stats_left_win7_sve(const int16x8_t src[2], const int16_t *d,
                                       const int32_t d_stride, int64x2_t *sum) {
  // Row r lives in rows[2 * r] (columns 0-7) and rows[2 * r + 1] (columns
  // 8-15). Indices 0..11 are used; WIN_7 is defined outside this block.
  int16x8_t rows[WIN_7];
  const int16_t *const first_row = d + d_stride;

  load_s16_8x6(first_row + 0, d_stride, &rows[0], &rows[2], &rows[4], &rows[6],
               &rows[8], &rows[10]);
  load_s16_8x6(first_row + 8, d_stride, &rows[1], &rows[3], &rows[5], &rows[7],
               &rows[9], &rows[11]);

  for (int r = 0; r < 6; r++) {
    sum[r] = aom_sdotq_s16(sum[r], src[0], rows[2 * r + 0]);
    sum[r] = aom_sdotq_s16(sum[r], src[1], rows[2 * r + 1]);
  }
}
1044*77c1e3ccSAndroid Build Coastguard Worker
// Loads four groups of six 16-column rows:
//
//   d_is  rows 0..5 at `di`                      (predicated SVE loads)
//   d_ie  rows height..height+5 at `di`          (predicated SVE loads)
//   d_js  rows 0..5 at `dj`                      (load_s16_8x6)
//   d_je  rows height..height+5 at `dj`          (load_s16_8x6)
//
// In every output array, row r is stored as element 2 * r (columns 0-7)
// followed by element 2 * r + 1 (columns 8-15).
//
// The `di` loads use svld1_s16 with predicate p0 for columns 0-7 and p1 for
// columns 8-15, so lanes that are inactive in the predicate come back as zero
// rather than being read from memory. The `dj` loads go through
// load_s16_8x6() and take no predicate here.
// NOTE(review): presumably p0/p1 mask off columns beyond the valid width --
// confirm against the caller.
static inline void load_square_win7_sve(
    const int16_t *const di, const int16_t *const dj, const int32_t d_stride,
    const int32_t height, int16x8_t *d_is, int16x8_t *d_ie, int16x8_t *d_js,
    int16x8_t *d_je, svbool_t p0, svbool_t p1) {
  // Start rows of the `di` column.
  for (int row = 0; row < 6; row++) {
    const int16_t *const top = di + row * d_stride;
    d_is[2 * row + 0] = svget_neonq_s16(svld1_s16(p0, top + 0));
    d_is[2 * row + 1] = svget_neonq_s16(svld1_s16(p1, top + 8));
  }

  // End rows of the `di` column, `height` rows further down.
  for (int row = 0; row < 6; row++) {
    const int16_t *const bottom = di + (height + row) * d_stride;
    d_ie[2 * row + 0] = svget_neonq_s16(svld1_s16(p0, bottom + 0));
    d_ie[2 * row + 1] = svget_neonq_s16(svld1_s16(p1, bottom + 8));
  }

  // Start and end rows of the `dj` column.
  const int16_t *const dj_end = dj + height * d_stride;

  load_s16_8x6(dj + 0, d_stride, &d_js[0], &d_js[2], &d_js[4], &d_js[6],
               &d_js[8], &d_js[10]);
  load_s16_8x6(dj + 8, d_stride, &d_js[1], &d_js[3], &d_js[5], &d_js[7],
               &d_js[9], &d_js[11]);
  load_s16_8x6(dj_end + 0, d_stride, &d_je[0], &d_je[2], &d_je[4], &d_je[6],
               &d_je[8], &d_je[10]);
  load_s16_8x6(dj_end + 8, d_stride, &d_je[1], &d_je[3], &d_je[5], &d_je[7],
               &d_je[9], &d_je[11]);
}
1084*77c1e3ccSAndroid Build Coastguard Worker
// Accumulates the start-row and end-row contributions for a 6x6 block of
// statistics.
//
// d_is, d_ie, d_js and d_je each hold 12 vectors; entries 2 * r and 2 * r + 1
// are the two 8-lane halves belonging to row r.
//
// Every d_is vector is negated in place (the caller's array is modified).
// Afterwards, for all r, c in [0, 6), deltas[r][c] is updated through
// aom_sdotq_s16 with the pairs (negated d_is row r, d_js row c) and then
// (d_ie row r, d_je row c), so the start rows are subtracted and the end rows
// are added.
static inline void derive_square_win7_sve(int16x8_t *d_is,
                                          const int16x8_t *d_ie,
                                          const int16x8_t *d_js,
                                          const int16x8_t *d_je,
                                          int64x2_t deltas[][WIN_7]) {
  enum { kRows = 6, kVecs = 2 * kRows };

  // Flip the sign of the start rows once so that the accumulation below
  // subtracts their products.
  for (int v = 0; v < kVecs; ++v) {
    d_is[v] = vnegq_s16(d_is[v]);
  }

  // First pass: negated start rows of i against start rows of j.
  for (int r = 0; r < kRows; ++r) {
    for (int c = 0; c < kRows; ++c) {
      int64x2_t *const cell = &deltas[r][c];
      *cell = aom_sdotq_s16(*cell, d_is[2 * r + 0], d_js[2 * c + 0]);
      *cell = aom_sdotq_s16(*cell, d_is[2 * r + 1], d_js[2 * c + 1]);
    }
  }

  // Second pass: end rows of i against end rows of j.
  for (int r = 0; r < kRows; ++r) {
    for (int c = 0; c < kRows; ++c) {
      int64x2_t *const cell = &deltas[r][c];
      *cell = aom_sdotq_s16(*cell, d_ie[2 * r + 0], d_je[2 * c + 0]);
      *cell = aom_sdotq_s16(*cell, d_ie[2 * r + 1], d_je[2 * c + 1]);
    }
  }
}
1259*77c1e3ccSAndroid Build Coastguard Worker
hadd_update_6_stats_sve(const int64_t * const src,const int64x2_t * deltas,int64_t * const dst)1260*77c1e3ccSAndroid Build Coastguard Worker static inline void hadd_update_6_stats_sve(const int64_t *const src,
1261*77c1e3ccSAndroid Build Coastguard Worker const int64x2_t *deltas,
1262*77c1e3ccSAndroid Build Coastguard Worker int64_t *const dst) {
1263*77c1e3ccSAndroid Build Coastguard Worker int64x2_t src0 = vld1q_s64(src + 0);
1264*77c1e3ccSAndroid Build Coastguard Worker int64x2_t src1 = vld1q_s64(src + 2);
1265*77c1e3ccSAndroid Build Coastguard Worker int64x2_t src2 = vld1q_s64(src + 4);
1266*77c1e3ccSAndroid Build Coastguard Worker
1267*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas01 = vpaddq_s64(deltas[0], deltas[1]);
1268*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas23 = vpaddq_s64(deltas[2], deltas[3]);
1269*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas45 = vpaddq_s64(deltas[4], deltas[5]);
1270*77c1e3ccSAndroid Build Coastguard Worker
1271*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(dst + 0, vaddq_s64(src0, deltas01));
1272*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(dst + 2, vaddq_s64(src1, deltas23));
1273*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(dst + 4, vaddq_s64(src2, deltas45));
1274*77c1e3ccSAndroid Build Coastguard Worker }
1275*77c1e3ccSAndroid Build Coastguard Worker
// Loads six start rows and six end rows of up to 16 int16 samples each.
//
// Row r of d_is is read from di + r * d_stride and row r of d_ie from
// di + (height + r) * d_stride, for r in [0, 6). Each row is split into two
// vectors: entry 2 * r holds columns 0..7 (loaded under predicate p0) and
// entry 2 * r + 1 holds columns 8..15 (loaded under predicate p1). Lanes that
// a predicate disables are not read from memory and load as zero.
static inline void load_triangle_win7_sve(const int16_t *const di,
                                          const int32_t d_stride,
                                          const int32_t height, int16x8_t *d_is,
                                          int16x8_t *d_ie, svbool_t p0,
                                          svbool_t p1) {
  enum { kRows = 6 };

  // Start rows.
  for (int32_t r = 0; r < kRows; ++r) {
    const int16_t *const row = di + r * d_stride;
    d_is[2 * r + 0] = svget_neonq_s16(svld1_s16(p0, row + 0));
    d_is[2 * r + 1] = svget_neonq_s16(svld1_s16(p1, row + 8));
  }

  // End rows, located `height` rows below the start rows.
  for (int32_t r = 0; r < kRows; ++r) {
    const int16_t *const row = di + (height + r) * d_stride;
    d_ie[2 * r + 0] = svget_neonq_s16(svld1_s16(p0, row + 0));
    d_ie[2 * r + 1] = svget_neonq_s16(svld1_s16(p1, row + 8));
  }
}
1307*77c1e3ccSAndroid Build Coastguard Worker
// Accumulates the start-row and end-row contributions for the upper triangle
// (diagonal included) of a 6x6 block of statistics.
//
// d_is and d_ie each hold 12 vectors; entries 2 * r and 2 * r + 1 are the two
// 8-lane halves belonging to row r. The 21 outputs are stored row by row:
// deltas[0..5] pair row 0 with rows 0..5, deltas[6..10] pair row 1 with rows
// 1..5, and so on down to deltas[20], which pairs row 5 with itself.
//
// Each output is updated through aom_sdotq_s16 with (negated d_is row r,
// d_is row c) and then with (d_ie row r, d_ie row c). Unlike the square
// variant, d_is itself is left untouched; the negation is applied to a copy.
static inline void derive_triangle_win7_sve(const int16x8_t *d_is,
                                            const int16x8_t *d_ie,
                                            int64x2_t *deltas) {
  enum { kRows = 6 };
  int idx;

  // First pass: subtract the products of the start rows.
  idx = 0;
  for (int r = 0; r < kRows; ++r) {
    const int16x8_t neg_lo = vnegq_s16(d_is[2 * r + 0]);
    const int16x8_t neg_hi = vnegq_s16(d_is[2 * r + 1]);
    for (int c = r; c < kRows; ++c, ++idx) {
      deltas[idx] = aom_sdotq_s16(deltas[idx], neg_lo, d_is[2 * c + 0]);
      deltas[idx] = aom_sdotq_s16(deltas[idx], neg_hi, d_is[2 * c + 1]);
    }
  }

  // Second pass: add the products of the end rows.
  idx = 0;
  for (int r = 0; r < kRows; ++r) {
    const int16x8_t end_lo = d_ie[2 * r + 0];
    const int16x8_t end_hi = d_ie[2 * r + 1];
    for (int c = r; c < kRows; ++c, ++idx) {
      deltas[idx] = aom_sdotq_s16(deltas[idx], end_lo, d_ie[2 * c + 0]);
      deltas[idx] = aom_sdotq_s16(deltas[idx], end_hi, d_ie[2 * c + 1]);
    }
  }
}
1407*77c1e3ccSAndroid Build Coastguard Worker
compute_stats_win7_sve(const int16_t * const d,const int32_t d_stride,const int16_t * const s,const int32_t s_stride,const int32_t width,const int32_t height,int64_t * const M,int64_t * const H)1408*77c1e3ccSAndroid Build Coastguard Worker static inline void compute_stats_win7_sve(
1409*77c1e3ccSAndroid Build Coastguard Worker const int16_t *const d, const int32_t d_stride, const int16_t *const s,
1410*77c1e3ccSAndroid Build Coastguard Worker const int32_t s_stride, const int32_t width, const int32_t height,
1411*77c1e3ccSAndroid Build Coastguard Worker int64_t *const M, int64_t *const H) {
1412*77c1e3ccSAndroid Build Coastguard Worker const int32_t wiener_win = WIENER_WIN;
1413*77c1e3ccSAndroid Build Coastguard Worker const int32_t wiener_win2 = wiener_win * wiener_win;
1414*77c1e3ccSAndroid Build Coastguard Worker const int32_t h8 = height & ~7;
1415*77c1e3ccSAndroid Build Coastguard Worker int32_t i, j, x, y;
1416*77c1e3ccSAndroid Build Coastguard Worker
1417*77c1e3ccSAndroid Build Coastguard Worker // Use a predicate to compute the last columns.
1418*77c1e3ccSAndroid Build Coastguard Worker svbool_t p0 = svwhilelt_b16_u32(0, width % 16 == 0 ? 16 : width % 16);
1419*77c1e3ccSAndroid Build Coastguard Worker svbool_t p1 = svwhilelt_b16_u32(8, width % 16 == 0 ? 16 : width % 16);
1420*77c1e3ccSAndroid Build Coastguard Worker
1421*77c1e3ccSAndroid Build Coastguard Worker // Step 1: Calculate the top edge of the whole matrix, i.e., the top
1422*77c1e3ccSAndroid Build Coastguard Worker // edge of each triangle and square on the top row.
1423*77c1e3ccSAndroid Build Coastguard Worker j = 0;
1424*77c1e3ccSAndroid Build Coastguard Worker do {
1425*77c1e3ccSAndroid Build Coastguard Worker const int16_t *s_t = s;
1426*77c1e3ccSAndroid Build Coastguard Worker const int16_t *d_t = d;
1427*77c1e3ccSAndroid Build Coastguard Worker int64x2_t sum_m[WIENER_WIN] = { vdupq_n_s64(0) };
1428*77c1e3ccSAndroid Build Coastguard Worker int64x2_t sum_h[WIENER_WIN] = { vdupq_n_s64(0) };
1429*77c1e3ccSAndroid Build Coastguard Worker int16x8_t src[2], dgd[2];
1430*77c1e3ccSAndroid Build Coastguard Worker
1431*77c1e3ccSAndroid Build Coastguard Worker y = height;
1432*77c1e3ccSAndroid Build Coastguard Worker do {
1433*77c1e3ccSAndroid Build Coastguard Worker x = 0;
1434*77c1e3ccSAndroid Build Coastguard Worker while (x < width - 16) {
1435*77c1e3ccSAndroid Build Coastguard Worker src[0] = vld1q_s16(s_t + x + 0);
1436*77c1e3ccSAndroid Build Coastguard Worker src[1] = vld1q_s16(s_t + x + 8);
1437*77c1e3ccSAndroid Build Coastguard Worker dgd[0] = vld1q_s16(d_t + x + 0);
1438*77c1e3ccSAndroid Build Coastguard Worker dgd[1] = vld1q_s16(d_t + x + 8);
1439*77c1e3ccSAndroid Build Coastguard Worker stats_top_win7_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h);
1440*77c1e3ccSAndroid Build Coastguard Worker x += 16;
1441*77c1e3ccSAndroid Build Coastguard Worker }
1442*77c1e3ccSAndroid Build Coastguard Worker
1443*77c1e3ccSAndroid Build Coastguard Worker src[0] = svget_neonq_s16(svld1_s16(p0, s_t + x + 0));
1444*77c1e3ccSAndroid Build Coastguard Worker src[1] = svget_neonq_s16(svld1_s16(p1, s_t + x + 8));
1445*77c1e3ccSAndroid Build Coastguard Worker dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + x + 0));
1446*77c1e3ccSAndroid Build Coastguard Worker dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + x + 8));
1447*77c1e3ccSAndroid Build Coastguard Worker stats_top_win7_sve(src, dgd, d_t + j + x, d_stride, sum_m, sum_h);
1448*77c1e3ccSAndroid Build Coastguard Worker
1449*77c1e3ccSAndroid Build Coastguard Worker s_t += s_stride;
1450*77c1e3ccSAndroid Build Coastguard Worker d_t += d_stride;
1451*77c1e3ccSAndroid Build Coastguard Worker } while (--y);
1452*77c1e3ccSAndroid Build Coastguard Worker
1453*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(M + wiener_win * j + 0, vpaddq_s64(sum_m[0], sum_m[1]));
1454*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(M + wiener_win * j + 2, vpaddq_s64(sum_m[2], sum_m[3]));
1455*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(M + wiener_win * j + 4, vpaddq_s64(sum_m[4], sum_m[5]));
1456*77c1e3ccSAndroid Build Coastguard Worker M[wiener_win * j + 6] = vaddvq_s64(sum_m[6]);
1457*77c1e3ccSAndroid Build Coastguard Worker
1458*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(H + wiener_win * j + 0, vpaddq_s64(sum_h[0], sum_h[1]));
1459*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(H + wiener_win * j + 2, vpaddq_s64(sum_h[2], sum_h[3]));
1460*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(H + wiener_win * j + 4, vpaddq_s64(sum_h[4], sum_h[5]));
1461*77c1e3ccSAndroid Build Coastguard Worker H[wiener_win * j + 6] = vaddvq_s64(sum_h[6]);
1462*77c1e3ccSAndroid Build Coastguard Worker } while (++j < wiener_win);
1463*77c1e3ccSAndroid Build Coastguard Worker
1464*77c1e3ccSAndroid Build Coastguard Worker // Step 2: Calculate the left edge of each square on the top row.
1465*77c1e3ccSAndroid Build Coastguard Worker j = 1;
1466*77c1e3ccSAndroid Build Coastguard Worker do {
1467*77c1e3ccSAndroid Build Coastguard Worker const int16_t *d_t = d;
1468*77c1e3ccSAndroid Build Coastguard Worker int64x2_t sum_h[WIENER_WIN - 1] = { vdupq_n_s64(0) };
1469*77c1e3ccSAndroid Build Coastguard Worker int16x8_t dgd[2];
1470*77c1e3ccSAndroid Build Coastguard Worker
1471*77c1e3ccSAndroid Build Coastguard Worker y = height;
1472*77c1e3ccSAndroid Build Coastguard Worker do {
1473*77c1e3ccSAndroid Build Coastguard Worker x = 0;
1474*77c1e3ccSAndroid Build Coastguard Worker while (x < width - 16) {
1475*77c1e3ccSAndroid Build Coastguard Worker dgd[0] = vld1q_s16(d_t + j + x + 0);
1476*77c1e3ccSAndroid Build Coastguard Worker dgd[1] = vld1q_s16(d_t + j + x + 8);
1477*77c1e3ccSAndroid Build Coastguard Worker stats_left_win7_sve(dgd, d_t + x, d_stride, sum_h);
1478*77c1e3ccSAndroid Build Coastguard Worker x += 16;
1479*77c1e3ccSAndroid Build Coastguard Worker }
1480*77c1e3ccSAndroid Build Coastguard Worker
1481*77c1e3ccSAndroid Build Coastguard Worker dgd[0] = svget_neonq_s16(svld1_s16(p0, d_t + j + x + 0));
1482*77c1e3ccSAndroid Build Coastguard Worker dgd[1] = svget_neonq_s16(svld1_s16(p1, d_t + j + x + 8));
1483*77c1e3ccSAndroid Build Coastguard Worker stats_left_win7_sve(dgd, d_t + x, d_stride, sum_h);
1484*77c1e3ccSAndroid Build Coastguard Worker
1485*77c1e3ccSAndroid Build Coastguard Worker d_t += d_stride;
1486*77c1e3ccSAndroid Build Coastguard Worker } while (--y);
1487*77c1e3ccSAndroid Build Coastguard Worker
1488*77c1e3ccSAndroid Build Coastguard Worker int64x2_t sum_h01 = vpaddq_s64(sum_h[0], sum_h[1]);
1489*77c1e3ccSAndroid Build Coastguard Worker int64x2_t sum_h23 = vpaddq_s64(sum_h[2], sum_h[3]);
1490*77c1e3ccSAndroid Build Coastguard Worker int64x2_t sum_h45 = vpaddq_s64(sum_h[4], sum_h[5]);
1491*77c1e3ccSAndroid Build Coastguard Worker vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h01));
1492*77c1e3ccSAndroid Build Coastguard Worker vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h01));
1493*77c1e3ccSAndroid Build Coastguard Worker vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h23));
1494*77c1e3ccSAndroid Build Coastguard Worker vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h23));
1495*77c1e3ccSAndroid Build Coastguard Worker vst1_s64(&H[5 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h45));
1496*77c1e3ccSAndroid Build Coastguard Worker vst1_s64(&H[6 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h45));
1497*77c1e3ccSAndroid Build Coastguard Worker } while (++j < wiener_win);
1498*77c1e3ccSAndroid Build Coastguard Worker
1499*77c1e3ccSAndroid Build Coastguard Worker // Step 3: Derive the top edge of each triangle along the diagonal. No
1500*77c1e3ccSAndroid Build Coastguard Worker // triangle in top row.
1501*77c1e3ccSAndroid Build Coastguard Worker {
1502*77c1e3ccSAndroid Build Coastguard Worker const int16_t *d_t = d;
1503*77c1e3ccSAndroid Build Coastguard Worker // Pad to call transpose function.
1504*77c1e3ccSAndroid Build Coastguard Worker int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
1505*77c1e3ccSAndroid Build Coastguard Worker int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
1506*77c1e3ccSAndroid Build Coastguard Worker int16x8_t ds[WIENER_WIN * 2];
1507*77c1e3ccSAndroid Build Coastguard Worker
1508*77c1e3ccSAndroid Build Coastguard Worker load_s16_8x6(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6], &ds[8],
1509*77c1e3ccSAndroid Build Coastguard Worker &ds[10]);
1510*77c1e3ccSAndroid Build Coastguard Worker load_s16_8x6(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7], &ds[9],
1511*77c1e3ccSAndroid Build Coastguard Worker &ds[11]);
1512*77c1e3ccSAndroid Build Coastguard Worker
1513*77c1e3ccSAndroid Build Coastguard Worker d_t += 6 * d_stride;
1514*77c1e3ccSAndroid Build Coastguard Worker
1515*77c1e3ccSAndroid Build Coastguard Worker step3_win7_neon(d_t, d_stride, width, height, ds, deltas);
1516*77c1e3ccSAndroid Build Coastguard Worker transpose_arrays_s32_8x8(deltas, deltas_tr);
1517*77c1e3ccSAndroid Build Coastguard Worker
1518*77c1e3ccSAndroid Build Coastguard Worker update_8_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
1519*77c1e3ccSAndroid Build Coastguard Worker deltas_tr[0], deltas_tr[4],
1520*77c1e3ccSAndroid Build Coastguard Worker H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
1521*77c1e3ccSAndroid Build Coastguard Worker update_8_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
1522*77c1e3ccSAndroid Build Coastguard Worker deltas_tr[1], deltas_tr[5],
1523*77c1e3ccSAndroid Build Coastguard Worker H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
1524*77c1e3ccSAndroid Build Coastguard Worker update_8_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
1525*77c1e3ccSAndroid Build Coastguard Worker deltas_tr[2], deltas_tr[6],
1526*77c1e3ccSAndroid Build Coastguard Worker H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
1527*77c1e3ccSAndroid Build Coastguard Worker update_8_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
1528*77c1e3ccSAndroid Build Coastguard Worker deltas_tr[3], deltas_tr[7],
1529*77c1e3ccSAndroid Build Coastguard Worker H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
1530*77c1e3ccSAndroid Build Coastguard Worker update_8_stats_neon(H + 4 * wiener_win * wiener_win2 + 4 * wiener_win,
1531*77c1e3ccSAndroid Build Coastguard Worker deltas_tr[8], deltas_tr[12],
1532*77c1e3ccSAndroid Build Coastguard Worker H + 5 * wiener_win * wiener_win2 + 5 * wiener_win);
1533*77c1e3ccSAndroid Build Coastguard Worker update_8_stats_neon(H + 5 * wiener_win * wiener_win2 + 5 * wiener_win,
1534*77c1e3ccSAndroid Build Coastguard Worker deltas_tr[9], deltas_tr[13],
1535*77c1e3ccSAndroid Build Coastguard Worker H + 6 * wiener_win * wiener_win2 + 6 * wiener_win);
1536*77c1e3ccSAndroid Build Coastguard Worker }
1537*77c1e3ccSAndroid Build Coastguard Worker
1538*77c1e3ccSAndroid Build Coastguard Worker // Step 4: Derive the top and left edge of each square. No square in top and
1539*77c1e3ccSAndroid Build Coastguard Worker // bottom row.
1540*77c1e3ccSAndroid Build Coastguard Worker
1541*77c1e3ccSAndroid Build Coastguard Worker i = 1;
1542*77c1e3ccSAndroid Build Coastguard Worker do {
1543*77c1e3ccSAndroid Build Coastguard Worker j = i + 1;
1544*77c1e3ccSAndroid Build Coastguard Worker do {
1545*77c1e3ccSAndroid Build Coastguard Worker const int16_t *di = d + i - 1;
1546*77c1e3ccSAndroid Build Coastguard Worker const int16_t *dj = d + j - 1;
1547*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas[(2 * WIENER_WIN - 1) * 2] = { vdupq_n_s64(0) };
1548*77c1e3ccSAndroid Build Coastguard Worker int16x8_t dd[WIENER_WIN * 2], ds[WIENER_WIN * 2];
1549*77c1e3ccSAndroid Build Coastguard Worker
1550*77c1e3ccSAndroid Build Coastguard Worker dd[5] = vdupq_n_s16(0); // Initialize to avoid warning.
1551*77c1e3ccSAndroid Build Coastguard Worker const int16_t dd0_values[] = { di[0 * d_stride],
1552*77c1e3ccSAndroid Build Coastguard Worker di[1 * d_stride],
1553*77c1e3ccSAndroid Build Coastguard Worker di[2 * d_stride],
1554*77c1e3ccSAndroid Build Coastguard Worker di[3 * d_stride],
1555*77c1e3ccSAndroid Build Coastguard Worker di[4 * d_stride],
1556*77c1e3ccSAndroid Build Coastguard Worker di[5 * d_stride],
1557*77c1e3ccSAndroid Build Coastguard Worker 0,
1558*77c1e3ccSAndroid Build Coastguard Worker 0 };
1559*77c1e3ccSAndroid Build Coastguard Worker dd[0] = vld1q_s16(dd0_values);
1560*77c1e3ccSAndroid Build Coastguard Worker const int16_t dd1_values[] = { di[0 * d_stride + width],
1561*77c1e3ccSAndroid Build Coastguard Worker di[1 * d_stride + width],
1562*77c1e3ccSAndroid Build Coastguard Worker di[2 * d_stride + width],
1563*77c1e3ccSAndroid Build Coastguard Worker di[3 * d_stride + width],
1564*77c1e3ccSAndroid Build Coastguard Worker di[4 * d_stride + width],
1565*77c1e3ccSAndroid Build Coastguard Worker di[5 * d_stride + width],
1566*77c1e3ccSAndroid Build Coastguard Worker 0,
1567*77c1e3ccSAndroid Build Coastguard Worker 0 };
1568*77c1e3ccSAndroid Build Coastguard Worker dd[1] = vld1q_s16(dd1_values);
1569*77c1e3ccSAndroid Build Coastguard Worker const int16_t ds0_values[] = { dj[0 * d_stride],
1570*77c1e3ccSAndroid Build Coastguard Worker dj[1 * d_stride],
1571*77c1e3ccSAndroid Build Coastguard Worker dj[2 * d_stride],
1572*77c1e3ccSAndroid Build Coastguard Worker dj[3 * d_stride],
1573*77c1e3ccSAndroid Build Coastguard Worker dj[4 * d_stride],
1574*77c1e3ccSAndroid Build Coastguard Worker dj[5 * d_stride],
1575*77c1e3ccSAndroid Build Coastguard Worker 0,
1576*77c1e3ccSAndroid Build Coastguard Worker 0 };
1577*77c1e3ccSAndroid Build Coastguard Worker ds[0] = vld1q_s16(ds0_values);
1578*77c1e3ccSAndroid Build Coastguard Worker int16_t ds1_values[] = { dj[0 * d_stride + width],
1579*77c1e3ccSAndroid Build Coastguard Worker dj[1 * d_stride + width],
1580*77c1e3ccSAndroid Build Coastguard Worker dj[2 * d_stride + width],
1581*77c1e3ccSAndroid Build Coastguard Worker dj[3 * d_stride + width],
1582*77c1e3ccSAndroid Build Coastguard Worker dj[4 * d_stride + width],
1583*77c1e3ccSAndroid Build Coastguard Worker dj[5 * d_stride + width],
1584*77c1e3ccSAndroid Build Coastguard Worker 0,
1585*77c1e3ccSAndroid Build Coastguard Worker 0 };
1586*77c1e3ccSAndroid Build Coastguard Worker ds[1] = vld1q_s16(ds1_values);
1587*77c1e3ccSAndroid Build Coastguard Worker
1588*77c1e3ccSAndroid Build Coastguard Worker y = 0;
1589*77c1e3ccSAndroid Build Coastguard Worker while (y < h8) {
1590*77c1e3ccSAndroid Build Coastguard Worker // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e
1591*77c1e3ccSAndroid Build Coastguard Worker dd[0] = vsetq_lane_s16(di[6 * d_stride], dd[0], 6);
1592*77c1e3ccSAndroid Build Coastguard Worker dd[0] = vsetq_lane_s16(di[7 * d_stride], dd[0], 7);
1593*77c1e3ccSAndroid Build Coastguard Worker dd[1] = vsetq_lane_s16(di[6 * d_stride + width], dd[1], 6);
1594*77c1e3ccSAndroid Build Coastguard Worker dd[1] = vsetq_lane_s16(di[7 * d_stride + width], dd[1], 7);
1595*77c1e3ccSAndroid Build Coastguard Worker
1596*77c1e3ccSAndroid Build Coastguard Worker // 00s 10s 20s 30s 40s 50s 60s 70s 00e 10e 20e 30e 40e 50e 60e 70e
1597*77c1e3ccSAndroid Build Coastguard Worker // 01s 11s 21s 31s 41s 51s 61s 71s 01e 11e 21e 31e 41e 51e 61e 71e
1598*77c1e3ccSAndroid Build Coastguard Worker ds[0] = vsetq_lane_s16(dj[6 * d_stride], ds[0], 6);
1599*77c1e3ccSAndroid Build Coastguard Worker ds[0] = vsetq_lane_s16(dj[7 * d_stride], ds[0], 7);
1600*77c1e3ccSAndroid Build Coastguard Worker ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 6);
1601*77c1e3ccSAndroid Build Coastguard Worker ds[1] = vsetq_lane_s16(dj[7 * d_stride + width], ds[1], 7);
1602*77c1e3ccSAndroid Build Coastguard Worker
1603*77c1e3ccSAndroid Build Coastguard Worker load_more_16_neon(di + 8 * d_stride, width, &dd[0], &dd[2]);
1604*77c1e3ccSAndroid Build Coastguard Worker load_more_16_neon(dj + 8 * d_stride, width, &ds[0], &ds[2]);
1605*77c1e3ccSAndroid Build Coastguard Worker load_more_16_neon(di + 9 * d_stride, width, &dd[2], &dd[4]);
1606*77c1e3ccSAndroid Build Coastguard Worker load_more_16_neon(dj + 9 * d_stride, width, &ds[2], &ds[4]);
1607*77c1e3ccSAndroid Build Coastguard Worker load_more_16_neon(di + 10 * d_stride, width, &dd[4], &dd[6]);
1608*77c1e3ccSAndroid Build Coastguard Worker load_more_16_neon(dj + 10 * d_stride, width, &ds[4], &ds[6]);
1609*77c1e3ccSAndroid Build Coastguard Worker load_more_16_neon(di + 11 * d_stride, width, &dd[6], &dd[8]);
1610*77c1e3ccSAndroid Build Coastguard Worker load_more_16_neon(dj + 11 * d_stride, width, &ds[6], &ds[8]);
1611*77c1e3ccSAndroid Build Coastguard Worker load_more_16_neon(di + 12 * d_stride, width, &dd[8], &dd[10]);
1612*77c1e3ccSAndroid Build Coastguard Worker load_more_16_neon(dj + 12 * d_stride, width, &ds[8], &ds[10]);
1613*77c1e3ccSAndroid Build Coastguard Worker load_more_16_neon(di + 13 * d_stride, width, &dd[10], &dd[12]);
1614*77c1e3ccSAndroid Build Coastguard Worker load_more_16_neon(dj + 13 * d_stride, width, &ds[10], &ds[12]);
1615*77c1e3ccSAndroid Build Coastguard Worker
1616*77c1e3ccSAndroid Build Coastguard Worker deltas[0] = aom_sdotq_s16(deltas[0], dd[0], ds[0]);
1617*77c1e3ccSAndroid Build Coastguard Worker deltas[1] = aom_sdotq_s16(deltas[1], dd[1], ds[1]);
1618*77c1e3ccSAndroid Build Coastguard Worker deltas[2] = aom_sdotq_s16(deltas[2], dd[0], ds[2]);
1619*77c1e3ccSAndroid Build Coastguard Worker deltas[3] = aom_sdotq_s16(deltas[3], dd[1], ds[3]);
1620*77c1e3ccSAndroid Build Coastguard Worker deltas[4] = aom_sdotq_s16(deltas[4], dd[0], ds[4]);
1621*77c1e3ccSAndroid Build Coastguard Worker deltas[5] = aom_sdotq_s16(deltas[5], dd[1], ds[5]);
1622*77c1e3ccSAndroid Build Coastguard Worker deltas[6] = aom_sdotq_s16(deltas[6], dd[0], ds[6]);
1623*77c1e3ccSAndroid Build Coastguard Worker deltas[7] = aom_sdotq_s16(deltas[7], dd[1], ds[7]);
1624*77c1e3ccSAndroid Build Coastguard Worker deltas[8] = aom_sdotq_s16(deltas[8], dd[0], ds[8]);
1625*77c1e3ccSAndroid Build Coastguard Worker deltas[9] = aom_sdotq_s16(deltas[9], dd[1], ds[9]);
1626*77c1e3ccSAndroid Build Coastguard Worker deltas[10] = aom_sdotq_s16(deltas[10], dd[0], ds[10]);
1627*77c1e3ccSAndroid Build Coastguard Worker deltas[11] = aom_sdotq_s16(deltas[11], dd[1], ds[11]);
1628*77c1e3ccSAndroid Build Coastguard Worker deltas[12] = aom_sdotq_s16(deltas[12], dd[0], ds[12]);
1629*77c1e3ccSAndroid Build Coastguard Worker deltas[13] = aom_sdotq_s16(deltas[13], dd[1], ds[13]);
1630*77c1e3ccSAndroid Build Coastguard Worker deltas[14] = aom_sdotq_s16(deltas[14], dd[2], ds[0]);
1631*77c1e3ccSAndroid Build Coastguard Worker deltas[15] = aom_sdotq_s16(deltas[15], dd[3], ds[1]);
1632*77c1e3ccSAndroid Build Coastguard Worker deltas[16] = aom_sdotq_s16(deltas[16], dd[4], ds[0]);
1633*77c1e3ccSAndroid Build Coastguard Worker deltas[17] = aom_sdotq_s16(deltas[17], dd[5], ds[1]);
1634*77c1e3ccSAndroid Build Coastguard Worker deltas[18] = aom_sdotq_s16(deltas[18], dd[6], ds[0]);
1635*77c1e3ccSAndroid Build Coastguard Worker deltas[19] = aom_sdotq_s16(deltas[19], dd[7], ds[1]);
1636*77c1e3ccSAndroid Build Coastguard Worker deltas[20] = aom_sdotq_s16(deltas[20], dd[8], ds[0]);
1637*77c1e3ccSAndroid Build Coastguard Worker deltas[21] = aom_sdotq_s16(deltas[21], dd[9], ds[1]);
1638*77c1e3ccSAndroid Build Coastguard Worker deltas[22] = aom_sdotq_s16(deltas[22], dd[10], ds[0]);
1639*77c1e3ccSAndroid Build Coastguard Worker deltas[23] = aom_sdotq_s16(deltas[23], dd[11], ds[1]);
1640*77c1e3ccSAndroid Build Coastguard Worker deltas[24] = aom_sdotq_s16(deltas[24], dd[12], ds[0]);
1641*77c1e3ccSAndroid Build Coastguard Worker deltas[25] = aom_sdotq_s16(deltas[25], dd[13], ds[1]);
1642*77c1e3ccSAndroid Build Coastguard Worker
1643*77c1e3ccSAndroid Build Coastguard Worker dd[0] = vextq_s16(dd[12], vdupq_n_s16(0), 2);
1644*77c1e3ccSAndroid Build Coastguard Worker dd[1] = vextq_s16(dd[13], vdupq_n_s16(0), 2);
1645*77c1e3ccSAndroid Build Coastguard Worker ds[0] = vextq_s16(ds[12], vdupq_n_s16(0), 2);
1646*77c1e3ccSAndroid Build Coastguard Worker ds[1] = vextq_s16(ds[13], vdupq_n_s16(0), 2);
1647*77c1e3ccSAndroid Build Coastguard Worker
1648*77c1e3ccSAndroid Build Coastguard Worker di += 8 * d_stride;
1649*77c1e3ccSAndroid Build Coastguard Worker dj += 8 * d_stride;
1650*77c1e3ccSAndroid Build Coastguard Worker y += 8;
1651*77c1e3ccSAndroid Build Coastguard Worker }
1652*77c1e3ccSAndroid Build Coastguard Worker
1653*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas02 = vpaddq_s64(deltas[0], deltas[2]);
1654*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas13 = vpaddq_s64(deltas[1], deltas[3]);
1655*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas46 = vpaddq_s64(deltas[4], deltas[6]);
1656*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas57 = vpaddq_s64(deltas[5], deltas[7]);
1657*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas810 = vpaddq_s64(deltas[8], deltas[10]);
1658*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas911 = vpaddq_s64(deltas[9], deltas[11]);
1659*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas1212 = vpaddq_s64(deltas[12], deltas[12]);
1660*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas1313 = vpaddq_s64(deltas[13], deltas[13]);
1661*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas1416 = vpaddq_s64(deltas[14], deltas[16]);
1662*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas1820 = vpaddq_s64(deltas[18], deltas[20]);
1663*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas1517 = vpaddq_s64(deltas[15], deltas[17]);
1664*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas1921 = vpaddq_s64(deltas[19], deltas[21]);
1665*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas2224 = vpaddq_s64(deltas[22], deltas[24]);
1666*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas2325 = vpaddq_s64(deltas[23], deltas[25]);
1667*77c1e3ccSAndroid Build Coastguard Worker deltas02 = vsubq_s64(deltas13, deltas02);
1668*77c1e3ccSAndroid Build Coastguard Worker deltas46 = vsubq_s64(deltas57, deltas46);
1669*77c1e3ccSAndroid Build Coastguard Worker deltas810 = vsubq_s64(deltas911, deltas810);
1670*77c1e3ccSAndroid Build Coastguard Worker deltas1212 = vsubq_s64(deltas1313, deltas1212);
1671*77c1e3ccSAndroid Build Coastguard Worker deltas1416 = vsubq_s64(deltas1517, deltas1416);
1672*77c1e3ccSAndroid Build Coastguard Worker deltas1820 = vsubq_s64(deltas1921, deltas1820);
1673*77c1e3ccSAndroid Build Coastguard Worker deltas2224 = vsubq_s64(deltas2325, deltas2224);
1674*77c1e3ccSAndroid Build Coastguard Worker
1675*77c1e3ccSAndroid Build Coastguard Worker if (h8 != height) {
1676*77c1e3ccSAndroid Build Coastguard Worker const int16_t ds0_vals[] = {
1677*77c1e3ccSAndroid Build Coastguard Worker dj[0 * d_stride], dj[0 * d_stride + width],
1678*77c1e3ccSAndroid Build Coastguard Worker dj[1 * d_stride], dj[1 * d_stride + width],
1679*77c1e3ccSAndroid Build Coastguard Worker dj[2 * d_stride], dj[2 * d_stride + width],
1680*77c1e3ccSAndroid Build Coastguard Worker dj[3 * d_stride], dj[3 * d_stride + width]
1681*77c1e3ccSAndroid Build Coastguard Worker };
1682*77c1e3ccSAndroid Build Coastguard Worker ds[0] = vld1q_s16(ds0_vals);
1683*77c1e3ccSAndroid Build Coastguard Worker
1684*77c1e3ccSAndroid Build Coastguard Worker ds[1] = vsetq_lane_s16(dj[4 * d_stride], ds[1], 0);
1685*77c1e3ccSAndroid Build Coastguard Worker ds[1] = vsetq_lane_s16(dj[4 * d_stride + width], ds[1], 1);
1686*77c1e3ccSAndroid Build Coastguard Worker ds[1] = vsetq_lane_s16(dj[5 * d_stride], ds[1], 2);
1687*77c1e3ccSAndroid Build Coastguard Worker ds[1] = vsetq_lane_s16(dj[5 * d_stride + width], ds[1], 3);
1688*77c1e3ccSAndroid Build Coastguard Worker const int16_t dd4_vals[] = {
1689*77c1e3ccSAndroid Build Coastguard Worker -di[1 * d_stride], di[1 * d_stride + width],
1690*77c1e3ccSAndroid Build Coastguard Worker -di[2 * d_stride], di[2 * d_stride + width],
1691*77c1e3ccSAndroid Build Coastguard Worker -di[3 * d_stride], di[3 * d_stride + width],
1692*77c1e3ccSAndroid Build Coastguard Worker -di[4 * d_stride], di[4 * d_stride + width]
1693*77c1e3ccSAndroid Build Coastguard Worker };
1694*77c1e3ccSAndroid Build Coastguard Worker dd[4] = vld1q_s16(dd4_vals);
1695*77c1e3ccSAndroid Build Coastguard Worker
1696*77c1e3ccSAndroid Build Coastguard Worker dd[5] = vsetq_lane_s16(-di[5 * d_stride], dd[5], 0);
1697*77c1e3ccSAndroid Build Coastguard Worker dd[5] = vsetq_lane_s16(di[5 * d_stride + width], dd[5], 1);
1698*77c1e3ccSAndroid Build Coastguard Worker do {
1699*77c1e3ccSAndroid Build Coastguard Worker dd[0] = vdupq_n_s16(-di[0 * d_stride]);
1700*77c1e3ccSAndroid Build Coastguard Worker dd[2] = dd[3] = vdupq_n_s16(di[0 * d_stride + width]);
1701*77c1e3ccSAndroid Build Coastguard Worker dd[0] = dd[1] = vzip1q_s16(dd[0], dd[2]);
1702*77c1e3ccSAndroid Build Coastguard Worker
1703*77c1e3ccSAndroid Build Coastguard Worker ds[4] = vdupq_n_s16(dj[0 * d_stride]);
1704*77c1e3ccSAndroid Build Coastguard Worker ds[6] = ds[7] = vdupq_n_s16(dj[0 * d_stride + width]);
1705*77c1e3ccSAndroid Build Coastguard Worker ds[4] = ds[5] = vzip1q_s16(ds[4], ds[6]);
1706*77c1e3ccSAndroid Build Coastguard Worker
1707*77c1e3ccSAndroid Build Coastguard Worker dd[5] = vsetq_lane_s16(-di[6 * d_stride], dd[5], 2);
1708*77c1e3ccSAndroid Build Coastguard Worker dd[5] = vsetq_lane_s16(di[6 * d_stride + width], dd[5], 3);
1709*77c1e3ccSAndroid Build Coastguard Worker ds[1] = vsetq_lane_s16(dj[6 * d_stride], ds[1], 4);
1710*77c1e3ccSAndroid Build Coastguard Worker ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 5);
1711*77c1e3ccSAndroid Build Coastguard Worker
1712*77c1e3ccSAndroid Build Coastguard Worker const int32x4_t res0 =
1713*77c1e3ccSAndroid Build Coastguard Worker vpaddq_s32(vmull_s16(vget_low_s16(dd[0]), vget_low_s16(ds[0])),
1714*77c1e3ccSAndroid Build Coastguard Worker vmull_s16(vget_high_s16(dd[0]), vget_high_s16(ds[0])));
1715*77c1e3ccSAndroid Build Coastguard Worker deltas02 = vaddw_s32(deltas02, vget_low_s32(res0));
1716*77c1e3ccSAndroid Build Coastguard Worker deltas46 = vaddw_s32(deltas46, vget_high_s32(res0));
1717*77c1e3ccSAndroid Build Coastguard Worker const int32x4_t res1 =
1718*77c1e3ccSAndroid Build Coastguard Worker vpaddq_s32(vmull_s16(vget_low_s16(dd[1]), vget_low_s16(ds[1])),
1719*77c1e3ccSAndroid Build Coastguard Worker vmull_s16(vget_high_s16(dd[1]), vget_high_s16(ds[1])));
1720*77c1e3ccSAndroid Build Coastguard Worker deltas810 = vaddw_s32(deltas810, vget_low_s32(res1));
1721*77c1e3ccSAndroid Build Coastguard Worker deltas1212 = vaddw_s32(deltas1212, vget_high_s32(res1));
1722*77c1e3ccSAndroid Build Coastguard Worker const int32x4_t res2 =
1723*77c1e3ccSAndroid Build Coastguard Worker vpaddq_s32(vmull_s16(vget_low_s16(dd[4]), vget_low_s16(ds[4])),
1724*77c1e3ccSAndroid Build Coastguard Worker vmull_s16(vget_high_s16(dd[4]), vget_high_s16(ds[4])));
1725*77c1e3ccSAndroid Build Coastguard Worker deltas1416 = vaddw_s32(deltas1416, vget_low_s32(res2));
1726*77c1e3ccSAndroid Build Coastguard Worker deltas1820 = vaddw_s32(deltas1820, vget_high_s32(res2));
1727*77c1e3ccSAndroid Build Coastguard Worker const int32x4_t res3 =
1728*77c1e3ccSAndroid Build Coastguard Worker vpaddq_s32(vmull_s16(vget_low_s16(dd[5]), vget_low_s16(ds[5])),
1729*77c1e3ccSAndroid Build Coastguard Worker vmull_s16(vget_high_s16(dd[5]), vget_high_s16(ds[5])));
1730*77c1e3ccSAndroid Build Coastguard Worker deltas2224 = vaddw_s32(deltas2224, vget_low_s32(res3));
1731*77c1e3ccSAndroid Build Coastguard Worker
1732*77c1e3ccSAndroid Build Coastguard Worker int32_t tmp0 = vgetq_lane_s32(vreinterpretq_s32_s16(ds[0]), 0);
1733*77c1e3ccSAndroid Build Coastguard Worker ds[0] = vextq_s16(ds[0], ds[1], 2);
1734*77c1e3ccSAndroid Build Coastguard Worker ds[1] = vextq_s16(ds[1], ds[0], 2);
1735*77c1e3ccSAndroid Build Coastguard Worker ds[1] = vreinterpretq_s16_s32(
1736*77c1e3ccSAndroid Build Coastguard Worker vsetq_lane_s32(tmp0, vreinterpretq_s32_s16(ds[1]), 3));
1737*77c1e3ccSAndroid Build Coastguard Worker int32_t tmp1 = vgetq_lane_s32(vreinterpretq_s32_s16(dd[4]), 0);
1738*77c1e3ccSAndroid Build Coastguard Worker dd[4] = vextq_s16(dd[4], dd[5], 2);
1739*77c1e3ccSAndroid Build Coastguard Worker dd[5] = vextq_s16(dd[5], dd[4], 2);
1740*77c1e3ccSAndroid Build Coastguard Worker dd[5] = vreinterpretq_s16_s32(
1741*77c1e3ccSAndroid Build Coastguard Worker vsetq_lane_s32(tmp1, vreinterpretq_s32_s16(dd[5]), 3));
1742*77c1e3ccSAndroid Build Coastguard Worker di += d_stride;
1743*77c1e3ccSAndroid Build Coastguard Worker dj += d_stride;
1744*77c1e3ccSAndroid Build Coastguard Worker } while (++y < height);
1745*77c1e3ccSAndroid Build Coastguard Worker }
1746*77c1e3ccSAndroid Build Coastguard Worker
1747*77c1e3ccSAndroid Build Coastguard Worker // Writing one more element on the top edge of a square falls to
1748*77c1e3ccSAndroid Build Coastguard Worker // the next square in the same row or the first element in the next
1749*77c1e3ccSAndroid Build Coastguard Worker // row, which will just be overwritten later.
1750*77c1e3ccSAndroid Build Coastguard Worker int64x2_t s0 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 +
1751*77c1e3ccSAndroid Build Coastguard Worker (j - 1) * wiener_win + 0);
1752*77c1e3ccSAndroid Build Coastguard Worker int64x2_t s1 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 +
1753*77c1e3ccSAndroid Build Coastguard Worker (j - 1) * wiener_win + 2);
1754*77c1e3ccSAndroid Build Coastguard Worker int64x2_t s2 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 +
1755*77c1e3ccSAndroid Build Coastguard Worker (j - 1) * wiener_win + 4);
1756*77c1e3ccSAndroid Build Coastguard Worker int64x2_t s3 = vld1q_s64(H + (i - 1) * wiener_win * wiener_win2 +
1757*77c1e3ccSAndroid Build Coastguard Worker (j - 1) * wiener_win + 6);
1758*77c1e3ccSAndroid Build Coastguard Worker
1759*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 0,
1760*77c1e3ccSAndroid Build Coastguard Worker vaddq_s64(s0, deltas02));
1761*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 2,
1762*77c1e3ccSAndroid Build Coastguard Worker vaddq_s64(s1, deltas46));
1763*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 4,
1764*77c1e3ccSAndroid Build Coastguard Worker vaddq_s64(s2, deltas810));
1765*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(H + i * wiener_win * wiener_win2 + j * wiener_win + 6,
1766*77c1e3ccSAndroid Build Coastguard Worker vaddq_s64(s3, deltas1212));
1767*77c1e3ccSAndroid Build Coastguard Worker
1768*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
1769*77c1e3ccSAndroid Build Coastguard Worker H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] +
1770*77c1e3ccSAndroid Build Coastguard Worker vgetq_lane_s64(deltas1416, 0);
1771*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
1772*77c1e3ccSAndroid Build Coastguard Worker H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] +
1773*77c1e3ccSAndroid Build Coastguard Worker vgetq_lane_s64(deltas1416, 1);
1774*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
1775*77c1e3ccSAndroid Build Coastguard Worker H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] +
1776*77c1e3ccSAndroid Build Coastguard Worker vgetq_lane_s64(deltas1820, 0);
1777*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
1778*77c1e3ccSAndroid Build Coastguard Worker H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] +
1779*77c1e3ccSAndroid Build Coastguard Worker vgetq_lane_s64(deltas1820, 1);
1780*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 5) * wiener_win2 + j * wiener_win] =
1781*77c1e3ccSAndroid Build Coastguard Worker H[((i - 1) * wiener_win + 5) * wiener_win2 + (j - 1) * wiener_win] +
1782*77c1e3ccSAndroid Build Coastguard Worker vgetq_lane_s64(deltas2224, 0);
1783*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 6) * wiener_win2 + j * wiener_win] =
1784*77c1e3ccSAndroid Build Coastguard Worker H[((i - 1) * wiener_win + 6) * wiener_win2 + (j - 1) * wiener_win] +
1785*77c1e3ccSAndroid Build Coastguard Worker vgetq_lane_s64(deltas2224, 1);
1786*77c1e3ccSAndroid Build Coastguard Worker } while (++j < wiener_win);
1787*77c1e3ccSAndroid Build Coastguard Worker } while (++i < wiener_win - 1);
1788*77c1e3ccSAndroid Build Coastguard Worker
1789*77c1e3ccSAndroid Build Coastguard Worker // Step 5: Derive other points of each square. No square in bottom row.
1790*77c1e3ccSAndroid Build Coastguard Worker i = 0;
1791*77c1e3ccSAndroid Build Coastguard Worker do {
1792*77c1e3ccSAndroid Build Coastguard Worker const int16_t *const di = d + i;
1793*77c1e3ccSAndroid Build Coastguard Worker
1794*77c1e3ccSAndroid Build Coastguard Worker j = i + 1;
1795*77c1e3ccSAndroid Build Coastguard Worker do {
1796*77c1e3ccSAndroid Build Coastguard Worker const int16_t *const dj = d + j;
1797*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas[WIENER_WIN - 1][WIN_7] = { { vdupq_n_s64(0) },
1798*77c1e3ccSAndroid Build Coastguard Worker { vdupq_n_s64(0) } };
1799*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d_is[WIN_7];
1800*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d_ie[WIN_7];
1801*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d_js[WIN_7];
1802*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d_je[WIN_7];
1803*77c1e3ccSAndroid Build Coastguard Worker
1804*77c1e3ccSAndroid Build Coastguard Worker x = 0;
1805*77c1e3ccSAndroid Build Coastguard Worker while (x < width - 16) {
1806*77c1e3ccSAndroid Build Coastguard Worker load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
1807*77c1e3ccSAndroid Build Coastguard Worker d_js, d_je);
1808*77c1e3ccSAndroid Build Coastguard Worker derive_square_win7_sve(d_is, d_ie, d_js, d_je, deltas);
1809*77c1e3ccSAndroid Build Coastguard Worker x += 16;
1810*77c1e3ccSAndroid Build Coastguard Worker }
1811*77c1e3ccSAndroid Build Coastguard Worker
1812*77c1e3ccSAndroid Build Coastguard Worker load_square_win7_sve(di + x, dj + x, d_stride, height, d_is, d_ie, d_js,
1813*77c1e3ccSAndroid Build Coastguard Worker d_je, p0, p1);
1814*77c1e3ccSAndroid Build Coastguard Worker derive_square_win7_sve(d_is, d_ie, d_js, d_je, deltas);
1815*77c1e3ccSAndroid Build Coastguard Worker
1816*77c1e3ccSAndroid Build Coastguard Worker hadd_update_6_stats_sve(
1817*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0],
1818*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
1819*77c1e3ccSAndroid Build Coastguard Worker hadd_update_6_stats_sve(
1820*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1],
1821*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
1822*77c1e3ccSAndroid Build Coastguard Worker hadd_update_6_stats_sve(
1823*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2],
1824*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1);
1825*77c1e3ccSAndroid Build Coastguard Worker hadd_update_6_stats_sve(
1826*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3],
1827*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1);
1828*77c1e3ccSAndroid Build Coastguard Worker hadd_update_6_stats_sve(
1829*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win, deltas[4],
1830*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win + 1);
1831*77c1e3ccSAndroid Build Coastguard Worker hadd_update_6_stats_sve(
1832*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win, deltas[5],
1833*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 6) * wiener_win2 + j * wiener_win + 1);
1834*77c1e3ccSAndroid Build Coastguard Worker } while (++j < wiener_win);
1835*77c1e3ccSAndroid Build Coastguard Worker } while (++i < wiener_win - 1);
1836*77c1e3ccSAndroid Build Coastguard Worker
1837*77c1e3ccSAndroid Build Coastguard Worker // Step 6: Derive other points of each upper triangle along the diagonal.
1838*77c1e3ccSAndroid Build Coastguard Worker i = 0;
1839*77c1e3ccSAndroid Build Coastguard Worker do {
1840*77c1e3ccSAndroid Build Coastguard Worker const int16_t *const di = d + i;
1841*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas[3 * WIENER_WIN] = { vdupq_n_s64(0) };
1842*77c1e3ccSAndroid Build Coastguard Worker int16x8_t d_is[WIN_7], d_ie[WIN_7];
1843*77c1e3ccSAndroid Build Coastguard Worker
1844*77c1e3ccSAndroid Build Coastguard Worker x = 0;
1845*77c1e3ccSAndroid Build Coastguard Worker while (x < width - 16) {
1846*77c1e3ccSAndroid Build Coastguard Worker load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie);
1847*77c1e3ccSAndroid Build Coastguard Worker derive_triangle_win7_sve(d_is, d_ie, deltas);
1848*77c1e3ccSAndroid Build Coastguard Worker x += 16;
1849*77c1e3ccSAndroid Build Coastguard Worker }
1850*77c1e3ccSAndroid Build Coastguard Worker
1851*77c1e3ccSAndroid Build Coastguard Worker load_triangle_win7_sve(di + x, d_stride, height, d_is, d_ie, p0, p1);
1852*77c1e3ccSAndroid Build Coastguard Worker derive_triangle_win7_sve(d_is, d_ie, deltas);
1853*77c1e3ccSAndroid Build Coastguard Worker
1854*77c1e3ccSAndroid Build Coastguard Worker // Row 1: 6 points
1855*77c1e3ccSAndroid Build Coastguard Worker hadd_update_6_stats_sve(
1856*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas,
1857*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
1858*77c1e3ccSAndroid Build Coastguard Worker
1859*77c1e3ccSAndroid Build Coastguard Worker int64x2_t deltas1017 = vpaddq_s64(deltas[10], deltas[17]);
1860*77c1e3ccSAndroid Build Coastguard Worker
1861*77c1e3ccSAndroid Build Coastguard Worker // Row 2: 5 points
1862*77c1e3ccSAndroid Build Coastguard Worker hadd_update_4_stats_sve(
1863*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1, deltas + 6,
1864*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2);
1865*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 6] =
1866*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 5] +
1867*77c1e3ccSAndroid Build Coastguard Worker vgetq_lane_s64(deltas1017, 0);
1868*77c1e3ccSAndroid Build Coastguard Worker
1869*77c1e3ccSAndroid Build Coastguard Worker // Row 3: 4 points
1870*77c1e3ccSAndroid Build Coastguard Worker hadd_update_4_stats_sve(
1871*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2,
1872*77c1e3ccSAndroid Build Coastguard Worker deltas + 11,
1873*77c1e3ccSAndroid Build Coastguard Worker H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);
1874*77c1e3ccSAndroid Build Coastguard Worker
1875*77c1e3ccSAndroid Build Coastguard Worker // Row 4: 3 points
1876*77c1e3ccSAndroid Build Coastguard Worker int64x2_t h0 =
1877*77c1e3ccSAndroid Build Coastguard Worker vld1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);
1878*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4,
1879*77c1e3ccSAndroid Build Coastguard Worker vaddq_s64(h0, vpaddq_s64(deltas[15], deltas[16])));
1880*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 6] =
1881*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 5] +
1882*77c1e3ccSAndroid Build Coastguard Worker vgetq_lane_s64(deltas1017, 1);
1883*77c1e3ccSAndroid Build Coastguard Worker
1884*77c1e3ccSAndroid Build Coastguard Worker // Row 5: 2 points
1885*77c1e3ccSAndroid Build Coastguard Worker int64x2_t h1 =
1886*77c1e3ccSAndroid Build Coastguard Worker vld1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4);
1887*77c1e3ccSAndroid Build Coastguard Worker vst1q_s64(H + (i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5,
1888*77c1e3ccSAndroid Build Coastguard Worker vaddq_s64(h1, vpaddq_s64(deltas[18], deltas[19])));
1889*77c1e3ccSAndroid Build Coastguard Worker
1890*77c1e3ccSAndroid Build Coastguard Worker // Row 6: 1 point
1891*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 6) * wiener_win2 + i * wiener_win + 6] =
1892*77c1e3ccSAndroid Build Coastguard Worker H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] +
1893*77c1e3ccSAndroid Build Coastguard Worker vaddvq_s64(deltas[20]);
1894*77c1e3ccSAndroid Build Coastguard Worker } while (++i < wiener_win);
1895*77c1e3ccSAndroid Build Coastguard Worker }
1896*77c1e3ccSAndroid Build Coastguard Worker
1897*77c1e3ccSAndroid Build Coastguard Worker #endif // AOM_AV1_ENCODER_ARM_PICKRST_SVE_H_
1898