xref: /aosp_15_r20/external/libaom/av1/encoder/arm/pickrst_neon.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1*77c1e3ccSAndroid Build Coastguard Worker /*
2*77c1e3ccSAndroid Build Coastguard Worker  * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
3*77c1e3ccSAndroid Build Coastguard Worker  *
4*77c1e3ccSAndroid Build Coastguard Worker  * This source code is subject to the terms of the BSD 2 Clause License and
5*77c1e3ccSAndroid Build Coastguard Worker  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6*77c1e3ccSAndroid Build Coastguard Worker  * was not distributed with this source code in the LICENSE file, you can
7*77c1e3ccSAndroid Build Coastguard Worker  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8*77c1e3ccSAndroid Build Coastguard Worker  * Media Patent License 1.0 was not distributed with this source code in the
9*77c1e3ccSAndroid Build Coastguard Worker  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10*77c1e3ccSAndroid Build Coastguard Worker  */
11*77c1e3ccSAndroid Build Coastguard Worker 
12*77c1e3ccSAndroid Build Coastguard Worker #include <arm_neon.h>
13*77c1e3ccSAndroid Build Coastguard Worker 
14*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_config.h"
15*77c1e3ccSAndroid Build Coastguard Worker #include "config/av1_rtcd.h"
16*77c1e3ccSAndroid Build Coastguard Worker 
17*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/mem_neon.h"
18*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/sum_neon.h"
19*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/transpose_neon.h"
20*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/restoration.h"
21*77c1e3ccSAndroid Build Coastguard Worker #include "av1/encoder/arm/pickrst_neon.h"
22*77c1e3ccSAndroid Build Coastguard Worker #include "av1/encoder/pickrst.h"
23*77c1e3ccSAndroid Build Coastguard Worker 
// Returns the sum of squared per-pixel errors over a `width` x `height` block.
//
// For every pixel the error is `e = proj + dat - src`. With
//   u     = dat << SGRPROJ_RST_BITS
//   shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS
//   rnd   = 1 << (shift - 1)
// `proj` depends on which of params->r[0] / params->r[1] are positive:
//   both    : (rnd + xq[0] * flt0 + xq[1] * flt1 - (xq[0] + xq[1]) * u) >> shift
//   one     : (rnd + xq[n] * (flt_n - u)) >> shift, n being the positive one
//   neither : 0, i.e. a plain SSE between dat and src
//
// src/dat/flt0/flt1 are advanced by their respective strides after each row.
// Rows are processed 8 (or 16, in the "neither" case) pixels at a time with
// NEON; any remaining pixels of the row go through the scalar tail loop. The
// vector loops are pre-tested so that rows narrower than one vector are handled
// entirely by the scalar tail instead of reading and summing pixels beyond
// `width`. A non-positive `width` or `height` yields 0.
int64_t av1_lowbd_pixel_proj_error_neon(
    const uint8_t *src, int width, int height, int src_stride,
    const uint8_t *dat, int dat_stride, int32_t *flt0, int flt0_stride,
    int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
  int64_t sse = 0;
  int64x2_t sse_s64 = vdupq_n_s64(0);
  // Rounding term added before the final right shift (loop invariant).
  const int32x4_t offset =
      vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1));

  if (params->r[0] > 0 && params->r[1] > 0) {
    int32x2_t xq_v = vld1_s32(xq);
    // Both lanes hold (xq[0] + xq[1]) << SGRPROJ_RST_BITS.
    int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), SGRPROJ_RST_BITS);

    for (int i = 0; i < height; ++i) {
      int j = 0;
      // Per-row 32-bit accumulator, widened into sse_s64 after each row.
      int32x4_t sse_s32 = vdupq_n_s32(0);

      for (; j <= width - 8; j += 8) {
        const uint8x8_t d = vld1_u8(&dat[j]);
        const uint8x8_t s = vld1_u8(&src[j]);
        int32x4_t flt0_0 = vld1q_s32(&flt0[j]);
        int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]);
        int32x4_t flt1_0 = vld1q_s32(&flt1[j]);
        int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]);

        int32x4_t v0 = vmlaq_lane_s32(offset, flt0_0, xq_v, 0);
        int32x4_t v1 = vmlaq_lane_s32(offset, flt0_1, xq_v, 0);

        v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1);
        v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1);

        // Subtract dat * ((xq[0] + xq[1]) << SGRPROJ_RST_BITS) using a 16-bit
        // widening multiply on the first 16-bit lane of xq_sum_v.
        // NOTE(review): assumes the shifted sum fits in int16 - confirm.
        int16x8_t d_s16 = vreinterpretq_s16_u16(vmovl_u8(d));
        v0 = vmlsl_lane_s16(v0, vget_low_s16(d_s16),
                            vreinterpret_s16_s32(xq_sum_v), 0);
        v1 = vmlsl_lane_s16(v1, vget_high_s16(d_s16),
                            vreinterpret_s16_s32(xq_sum_v), 0);

        int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
        int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);

        int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s));
        int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff);
        int16x4_t e_lo = vget_low_s16(e);
        int16x4_t e_hi = vget_high_s16(e);

        sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
        sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
      }

      // Scalar tail for the pixels that do not fill a whole vector.
      for (int k = j; k < width; ++k) {
        int32_t u = (dat[k] << SGRPROJ_RST_BITS);
        int32_t v = (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)) +
                    xq[0] * flt0[k] + xq[1] * flt1[k] - u * (xq[0] + xq[1]);
        int32_t e =
            (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k];
        sse += e * e;
      }

      sse_s64 = vpadalq_s32(sse_s64, sse_s32);

      dat += dat_stride;
      src += src_stride;
      flt0 += flt0_stride;
      flt1 += flt1_stride;
    }
  } else if (params->r[0] > 0 || params->r[1] > 0) {
    // Exactly one filter is active: pick its weight, buffer and stride.
    int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
    int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
    int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
    int32x2_t xq_v = vdup_n_s32(xq_active);

    for (int i = 0; i < height; ++i) {
      // Per-row 32-bit accumulator, widened into sse_s64 after each row.
      int32x4_t sse_s32 = vdupq_n_s32(0);
      int j = 0;

      for (; j <= width - 8; j += 8) {
        const uint8x8_t d = vld1_u8(&dat[j]);
        const uint8x8_t s = vld1_u8(&src[j]);
        int32x4_t flt_0 = vld1q_s32(&flt[j]);
        int32x4_t flt_1 = vld1q_s32(&flt[j + 4]);
        // u = dat << SGRPROJ_RST_BITS, widened to 16 bits.
        int16x8_t d_s16 =
            vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));

        int32x4_t sub_0 = vsubw_s16(flt_0, vget_low_s16(d_s16));
        int32x4_t sub_1 = vsubw_s16(flt_1, vget_high_s16(d_s16));

        int32x4_t v0 = vmlaq_lane_s32(offset, sub_0, xq_v, 0);
        int32x4_t v1 = vmlaq_lane_s32(offset, sub_1, xq_v, 0);

        int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);
        int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS);

        int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s));
        int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff);
        int16x4_t e_lo = vget_low_s16(e);
        int16x4_t e_hi = vget_high_s16(e);

        sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo);
        sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi);
      }

      // Scalar tail for the pixels that do not fill a whole vector.
      for (int k = j; k < width; ++k) {
        int32_t u = dat[k] << SGRPROJ_RST_BITS;
        int32_t v = xq_active * (flt[k] - u);
        int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) +
                    dat[k] - src[k];
        sse += e * e;
      }

      sse_s64 = vpadalq_s32(sse_s64, sse_s32);

      dat += dat_stride;
      src += src_stride;
      flt += flt_stride;
    }
  } else {
    // No filter active: plain sum of squared differences.
    // NOTE(review): this 32-bit accumulator spans the whole block and is only
    // widened once at the end; presumably block sizes keep it in range -
    // confirm against callers.
    uint32x4_t sse_s32 = vdupq_n_u32(0);

    for (int i = 0; i < height; ++i) {
      int j = 0;

      for (; j <= width - 16; j += 16) {
        const uint8x16_t d = vld1q_u8(&dat[j]);
        const uint8x16_t s = vld1q_u8(&src[j]);

        uint8x16_t diff = vabdq_u8(d, s);
        uint8x8_t diff_lo = vget_low_u8(diff);
        uint8x8_t diff_hi = vget_high_u8(diff);

        sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_lo, diff_lo));
        sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_hi, diff_hi));
      }

      // Scalar tail for the pixels that do not fill a whole vector.
      for (int k = j; k < width; ++k) {
        int32_t e = dat[k] - src[k];
        sse += e * e;
      }

      dat += dat_stride;
      src += src_stride;
    }

    sse_s64 = vreinterpretq_s64_u64(vpaddlq_u32(sse_s32));
  }

  // Combine the vector partial sums with the scalar tail contribution.
  sse += horizontal_add_s64x2(sse_s64);
  return sse;
}
180*77c1e3ccSAndroid Build Coastguard Worker 
// We can accumulate up to 32768 8-bit multiplication results in a signed
// 32-bit integer (255 * 255 * 32768 = 2130739200, which is below 2^31). We are
// processing 2 pixels at a time, so the accumulator max can be as high as
// 16384 for the compute stats.
#define STAT_ACCUMULATOR_MAX 16384
185*77c1e3ccSAndroid Build Coastguard Worker 
// Byte table lookup over the 32-byte table formed by `a` followed by `b`:
// each of the 8 bytes of `idx` selects one table byte for the result.
// AArch64 has a native two-register lookup; otherwise the table is split into
// four 64-bit halves for the 4-register lookup.
static inline uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) {
#if AOM_ARCH_AARCH64
  uint8x16x2_t lut;
  lut.val[0] = a;
  lut.val[1] = b;
  return vqtbl2_u8(lut, idx);
#else
  uint8x8x4_t lut;
  lut.val[0] = vget_low_u8(a);
  lut.val[1] = vget_high_u8(a);
  lut.val[2] = vget_low_u8(b);
  lut.val[3] = vget_high_u8(b);
  return vtbl4_u8(lut, idx);
#endif
}
196*77c1e3ccSAndroid Build Coastguard Worker 
// 16-byte variant of tbl2(): each of the 16 bytes of `idx` selects one byte
// from the 32-byte table formed by `a` followed by `b`. Without the AArch64
// two-register lookup, the result is assembled from two 8-byte lookups over
// the same table, one per half of `idx`.
static inline uint8x16_t tbl2q(uint8x16_t a, uint8x16_t b, uint8x16_t idx) {
#if AOM_ARCH_AARCH64
  uint8x16x2_t lut;
  lut.val[0] = a;
  lut.val[1] = b;
  return vqtbl2q_u8(lut, idx);
#else
  uint8x8x4_t lut;
  lut.val[0] = vget_low_u8(a);
  lut.val[1] = vget_high_u8(a);
  lut.val[2] = vget_low_u8(b);
  lut.val[3] = vget_high_u8(b);
  const uint8x8_t picked_lo = vtbl4_u8(lut, vget_low_u8(idx));
  const uint8x8_t picked_hi = vtbl4_u8(lut, vget_high_u8(idx));
  return vcombine_u8(picked_lo, picked_hi);
#endif
}
208*77c1e3ccSAndroid Build Coastguard Worker 
// Folds the two partial M accumulators into the final M matrix.
//
// M is accumulated in STAT_ACCUMULATOR_MAX steps: `src_s64` holds the part
// already widened to 64 bits and `src_s32` the 32-bit residual. For every
// element the two parts are summed, multiplied by `scale` and added to `dst`.
// `dst` is written row by row while the sources are read column by column,
// i.e. the result is the transpose of the `wiener_win` x `wiener_win` source
// layout (the output needs to be column-major).
static inline void acc_transpose_M(int64_t *dst, const int64_t *src_s64,
                                   const int32_t *src_s32, const int wiener_win,
                                   int scale) {
  for (int out_row = 0; out_row < wiener_win; ++out_row) {
    int64_t *const out = dst + out_row * wiener_win;
    // Source column `out_row`: consecutive elements are `wiener_win` apart.
    const int64_t *const col_s64 = src_s64 + out_row;
    const int32_t *const col_s32 = src_s32 + out_row;

    for (int out_col = 0; out_col < wiener_win; ++out_col) {
      const int src_off = out_col * wiener_win;
      const int64_t total = col_s64[src_off] + col_s32[src_off];
      out[out_col] += total * scale;
    }
  }
}
223*77c1e3ccSAndroid Build Coastguard Worker 
// Folds the two partial H accumulators into the final H matrix.
//
// H is accumulated as a column-major matrix from the transposed (column-major)
// samples of the filter kernel (5x5 or 7x7) viewed as a single vector; for the
// 7x7 case H(49x49) = [49 x 1] x [1 x 49]. This function maps it back to the
// originally expected layout (double transpose). H is accumulated in
// STAT_ACCUMULATOR_MAX steps, so the final value of each element is the sum of
// the 64-bit part (src_s64) and the 32-bit residual (src_s32), multiplied by
// `scale` and added to `dst`. Only the upper triangle of the accumulated
// matrix is read; the lower triangle of the result is filled from its mirror
// element. `stride` is the row pitch of the source matrices, `dst` is written
// densely as a wiener_win2 x wiener_win2 matrix.
static void update_H(int64_t *dst, const int64_t *src_s64,
                     const int32_t *src_s32, const int wiener_win, int stride,
                     int scale) {
  // For a simplified theoretical 3x3 case where `wiener_win` is 3 and
  // `wiener_win2` is 9, the M matrix is 3x3:
  // 0, 3, 6
  // 1, 4, 7
  // 2, 5, 8
  //
  // This is viewed as a vector to compute H (9x9) by vector outer product:
  // 0, 3, 6, 1, 4, 7, 2, 5, 8
  //
  // Source index read for every output element (double transpose plus upper
  // triangle remapping) in the 3x3 -> 9x9 case:
  // 0,    3,    6,    1,    4,    7,    2,    5,    8,
  // 3,   30,   33,   12,   31,   34,   21,   32,   35,
  // 6,   33,   60,   15,   42,   61,   24,   51,   62,
  // 1,   12,   15,   10,   13,   16,   11,   14,   17,
  // 4,   31,   42,   13,   40,   43,   22,   41,   44,
  // 7,   34,   61,   16,   43,   70,   25,   52,   71,
  // 2,   21,   24,   11,   22,   25,   20,   23,   26,
  // 5,   32,   51,   14,   41,   52,   23,   50,   53,
  // 8,   35,   62,   17,   44,   71,   26,   53,   80,
  const int wiener_win2 = wiener_win * wiener_win;
  // Position of the next output element; dst is filled sequentially.
  int out_pos = 0;

  // Output rows visit the source indices in the order
  // 0, wiener_win, 2 * wiener_win, ..., 1, 1 + wiener_win, ...,
  // wiener_win - 1, wiener_win - 1 + wiener_win, ...
  // For the 3x3 case `j` will be: 0, 3, 6, 1, 4, 7, 2, 5, 8.
  for (int row_phase = 0; row_phase < wiener_win; ++row_phase) {
    for (int j = row_phase; j < wiener_win2; j += wiener_win) {
      // Output columns follow exactly the same visiting order, so for the 3x3
      // case `l` will also be: 0, 3, 6, 1, 4, 7, 2, 5, 8.
      for (int col_phase = 0; col_phase < wiener_win; ++col_phase) {
        for (int l = col_phase; l < wiener_win2; l += wiener_win) {
          // The nominal double transpose index would be `stride * j + l`.
          // Ordering the pair so the smaller index selects the row keeps the
          // read inside the upper triangle.
          const int lo = (j < l) ? j : l;
          const int hi = (j < l) ? l : j;
          const int tr_idx = stride * lo + hi;

          // Combine the 64-bit part with the 32-bit residual, then scale.
          const int64_t total = src_s64[tr_idx] + src_s32[tr_idx];
          dst[out_pos] += total * scale;
          ++out_pos;
        }
      }
    }
  }
}
283*77c1e3ccSAndroid Build Coastguard Worker 
// Loads 7 rows of 8 bytes, `stride` bytes apart, into four 128-bit vectors.
// dst[0..2] each hold two consecutive rows (rows 0-1, 2-3 and 4-5). dst[3]
// holds the 7th row in its low half and zeros in its high half. The 7th row
// is loaded from one byte before its start, so its first 7 pixels sit in
// bytes 1..7 of dst[3]; the load address is offset this way to prevent an
// out-of-bounds access.
static inline void load_and_pack_u8_8x7(uint8x16_t dst[4], const uint8_t *src,
                                        ptrdiff_t stride) {
  const uint8x8_t row0 = vld1_u8(src + 0 * stride);
  const uint8x8_t row1 = vld1_u8(src + 1 * stride);
  const uint8x8_t row2 = vld1_u8(src + 2 * stride);
  const uint8x8_t row3 = vld1_u8(src + 3 * stride);
  const uint8x8_t row4 = vld1_u8(src + 4 * stride);
  const uint8x8_t row5 = vld1_u8(src + 5 * stride);
  // Shifted back by one byte: see the function comment.
  const uint8x8_t row6 = vld1_u8(src + 6 * stride - 1);

  dst[0] = vcombine_u8(row0, row1);
  dst[1] = vcombine_u8(row2, row3);
  dst[2] = vcombine_u8(row4, row5);
  dst[3] = vcombine_u8(row6, vdup_n_u8(0));
}
296*77c1e3ccSAndroid Build Coastguard Worker 
compute_stats_win7_downsampled_neon(const uint8_t * dgd,const uint8_t * src,int width,int height,int dgd_stride,int src_stride,int avg,int64_t * M,int64_t * H,int downsample_factor)297*77c1e3ccSAndroid Build Coastguard Worker static inline void compute_stats_win7_downsampled_neon(
298*77c1e3ccSAndroid Build Coastguard Worker     const uint8_t *dgd, const uint8_t *src, int width, int height,
299*77c1e3ccSAndroid Build Coastguard Worker     int dgd_stride, int src_stride, int avg, int64_t *M, int64_t *H,
300*77c1e3ccSAndroid Build Coastguard Worker     int downsample_factor) {
301*77c1e3ccSAndroid Build Coastguard Worker   // Matrix names are capitalized to help readability.
302*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]);
303*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]);
304*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]);
305*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]);
306*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
307*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]);
308*77c1e3ccSAndroid Build Coastguard Worker 
309*77c1e3ccSAndroid Build Coastguard Worker   memset(M_s32, 0, sizeof(M_s32));
310*77c1e3ccSAndroid Build Coastguard Worker   memset(M_s64, 0, sizeof(M_s64));
311*77c1e3ccSAndroid Build Coastguard Worker   memset(H_s32, 0, sizeof(H_s32));
312*77c1e3ccSAndroid Build Coastguard Worker   memset(H_s64, 0, sizeof(H_s64));
313*77c1e3ccSAndroid Build Coastguard Worker 
314*77c1e3ccSAndroid Build Coastguard Worker   // Look-up tables to create 8x6 matrix with consecutive elements from two 7x7
315*77c1e3ccSAndroid Build Coastguard Worker   // matrices.
316*77c1e3ccSAndroid Build Coastguard Worker   // clang-format off
317*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7[96]) = {
318*77c1e3ccSAndroid Build Coastguard Worker     0,  1,  2,  3,  4,  5,  6,  8,  9, 10, 11, 12, 13, 14, 16, 17,
319*77c1e3ccSAndroid Build Coastguard Worker     2,  3,  4,  5,  6,  8,  9, 10, 11, 12, 13, 14, 16, 17, 18, 19,
320*77c1e3ccSAndroid Build Coastguard Worker     4,  5,  6,  8,  9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22,
321*77c1e3ccSAndroid Build Coastguard Worker     1,  2,  3,  4,  5,  6,  7,  9, 10, 11, 12, 13, 14, 15, 17, 18,
322*77c1e3ccSAndroid Build Coastguard Worker     3,  4,  5,  6,  7,  9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20,
323*77c1e3ccSAndroid Build Coastguard Worker     5,  6,  7,  9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23,
324*77c1e3ccSAndroid Build Coastguard Worker   };
325*77c1e3ccSAndroid Build Coastguard Worker   // clang-format on
326*77c1e3ccSAndroid Build Coastguard Worker 
327*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t lut0 = vld1q_u8(shuffle_stats7 + 0);
328*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t lut1 = vld1q_u8(shuffle_stats7 + 16);
329*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t lut2 = vld1q_u8(shuffle_stats7 + 32);
330*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t lut3 = vld1q_u8(shuffle_stats7 + 48);
331*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t lut4 = vld1q_u8(shuffle_stats7 + 64);
332*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t lut5 = vld1q_u8(shuffle_stats7 + 80);
333*77c1e3ccSAndroid Build Coastguard Worker 
334*77c1e3ccSAndroid Build Coastguard Worker   int acc_cnt = STAT_ACCUMULATOR_MAX;
335*77c1e3ccSAndroid Build Coastguard Worker   const int src_next = downsample_factor * src_stride - width;
336*77c1e3ccSAndroid Build Coastguard Worker   const int dgd_next = downsample_factor * dgd_stride - width;
337*77c1e3ccSAndroid Build Coastguard Worker   const uint8x8_t avg_u8 = vdup_n_u8(avg);
338*77c1e3ccSAndroid Build Coastguard Worker 
339*77c1e3ccSAndroid Build Coastguard Worker   do {
340*77c1e3ccSAndroid Build Coastguard Worker     int j = width;
341*77c1e3ccSAndroid Build Coastguard Worker     while (j >= 2) {
342*77c1e3ccSAndroid Build Coastguard Worker       // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the
343*77c1e3ccSAndroid Build Coastguard Worker       // middle 6x7 elements being shared.
344*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_rows[4];
345*77c1e3ccSAndroid Build Coastguard Worker       load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride);
346*77c1e3ccSAndroid Build Coastguard Worker 
347*77c1e3ccSAndroid Build Coastguard Worker       const uint8_t *dgd_ptr = dgd + dgd_stride * 6;
348*77c1e3ccSAndroid Build Coastguard Worker       dgd += 2;
349*77c1e3ccSAndroid Build Coastguard Worker 
350*77c1e3ccSAndroid Build Coastguard Worker       // Re-arrange (and widen) the combined 8x7 matrix to have the 2 whole 7x7
351*77c1e3ccSAndroid Build Coastguard Worker       // matrices (1 for each of the 2 pixels) separated into distinct
352*77c1e3ccSAndroid Build Coastguard Worker       // int16x8_t[6] arrays. These arrays contain 48 elements of the 49 (7x7).
353*77c1e3ccSAndroid Build Coastguard Worker       // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 49
354*77c1e3ccSAndroid Build Coastguard Worker       // consecutive elements.
355*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t dgd_avg0[6];
356*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t dgd_avg1[6];
357*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
358*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_shuf3 = tbl2q(dgd_rows[0], dgd_rows[1], lut3);
359*77c1e3ccSAndroid Build Coastguard Worker 
360*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[0] =
361*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
362*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[1] =
363*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
364*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg1[0] =
365*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf3), avg_u8));
366*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg1[1] =
367*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf3), avg_u8));
368*77c1e3ccSAndroid Build Coastguard Worker 
369*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0, dgd_avg0[0]);
370*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
371*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG1, dgd_avg1[0]);
372*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);
373*77c1e3ccSAndroid Build Coastguard Worker 
374*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1);
375*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_shuf4 = tbl2q(dgd_rows[1], dgd_rows[2], lut4);
376*77c1e3ccSAndroid Build Coastguard Worker 
377*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[2] =
378*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8));
379*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[3] =
380*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8));
381*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg1[2] =
382*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf4), avg_u8));
383*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg1[3] =
384*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf4), avg_u8));
385*77c1e3ccSAndroid Build Coastguard Worker 
386*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
387*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);
388*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);
389*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG1 + 24, dgd_avg1[3]);
390*77c1e3ccSAndroid Build Coastguard Worker 
391*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2);
392*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_shuf5 = tbl2q(dgd_rows[2], dgd_rows[3], lut5);
393*77c1e3ccSAndroid Build Coastguard Worker 
394*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[4] =
395*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8));
396*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[5] =
397*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8));
398*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg1[4] =
399*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf5), avg_u8));
400*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg1[5] =
401*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf5), avg_u8));
402*77c1e3ccSAndroid Build Coastguard Worker 
403*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
404*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);
405*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG1 + 32, dgd_avg1[4]);
406*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG1 + 40, dgd_avg1[5]);
407*77c1e3ccSAndroid Build Coastguard Worker 
408*77c1e3ccSAndroid Build Coastguard Worker       // The remaining last (49th) elements of `dgd - avg`.
409*77c1e3ccSAndroid Build Coastguard Worker       DGD_AVG0[48] = dgd_ptr[6] - avg;
410*77c1e3ccSAndroid Build Coastguard Worker       DGD_AVG1[48] = dgd_ptr[7] - avg;
411*77c1e3ccSAndroid Build Coastguard Worker 
412*77c1e3ccSAndroid Build Coastguard Worker       // Accumulate into row-major variant of matrix M (cross-correlation) for 2
413*77c1e3ccSAndroid Build Coastguard Worker       // output pixels at a time. M is of size 7 * 7. It needs to be filled such
414*77c1e3ccSAndroid Build Coastguard Worker       // that multiplying one element from src with each element of a row of the
415*77c1e3ccSAndroid Build Coastguard Worker       // wiener window will fill one column of M. However this is not very
416*77c1e3ccSAndroid Build Coastguard Worker       // convenient in terms of memory access, as it means we do contiguous
417*77c1e3ccSAndroid Build Coastguard Worker       // loads of dgd but strided stores to M. As a result, we use an
418*77c1e3ccSAndroid Build Coastguard Worker       // intermediate matrix M_s32 which is instead filled such that one row of
419*77c1e3ccSAndroid Build Coastguard Worker       // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
420*77c1e3ccSAndroid Build Coastguard Worker       // then transposed to return M.
421*77c1e3ccSAndroid Build Coastguard Worker       int src_avg0 = *src++ - avg;
422*77c1e3ccSAndroid Build Coastguard Worker       int src_avg1 = *src++ - avg;
423*77c1e3ccSAndroid Build Coastguard Worker       int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
424*77c1e3ccSAndroid Build Coastguard Worker       int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
425*77c1e3ccSAndroid Build Coastguard Worker       update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
426*77c1e3ccSAndroid Build Coastguard Worker                        dgd_avg1[0]);
427*77c1e3ccSAndroid Build Coastguard Worker       update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
428*77c1e3ccSAndroid Build Coastguard Worker                        dgd_avg1[1]);
429*77c1e3ccSAndroid Build Coastguard Worker       update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
430*77c1e3ccSAndroid Build Coastguard Worker                        dgd_avg1[2]);
431*77c1e3ccSAndroid Build Coastguard Worker       update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3],
432*77c1e3ccSAndroid Build Coastguard Worker                        dgd_avg1[3]);
433*77c1e3ccSAndroid Build Coastguard Worker       update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4],
434*77c1e3ccSAndroid Build Coastguard Worker                        dgd_avg1[4]);
435*77c1e3ccSAndroid Build Coastguard Worker       update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5],
436*77c1e3ccSAndroid Build Coastguard Worker                        dgd_avg1[5]);
437*77c1e3ccSAndroid Build Coastguard Worker 
438*77c1e3ccSAndroid Build Coastguard Worker       // Last (49th) element of M_s32 can be computed as scalar more efficiently
439*77c1e3ccSAndroid Build Coastguard Worker       // for 2 output pixels.
440*77c1e3ccSAndroid Build Coastguard Worker       M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1;
441*77c1e3ccSAndroid Build Coastguard Worker 
442*77c1e3ccSAndroid Build Coastguard Worker       // Start accumulating into row-major version of matrix H
443*77c1e3ccSAndroid Build Coastguard Worker       // (auto-covariance), it expects the DGD_AVG[01] matrices to also be
444*77c1e3ccSAndroid Build Coastguard Worker       // row-major. H is of size 49 * 49. It is filled by multiplying every pair
445*77c1e3ccSAndroid Build Coastguard Worker       // of elements of the wiener window together (vector outer product). Since
446*77c1e3ccSAndroid Build Coastguard Worker       // it is a symmetric matrix, we only compute the upper-right triangle, and
447*77c1e3ccSAndroid Build Coastguard Worker       // then copy it down to the lower-left later. The upper triangle is
448*77c1e3ccSAndroid Build Coastguard Worker       // covered by 4x4 tiles. The original algorithm assumes the M matrix is
449*77c1e3ccSAndroid Build Coastguard Worker       // column-major and the resulting H matrix is also expected to be
450*77c1e3ccSAndroid Build Coastguard Worker       // column-major. It is not efficient to work with column-major matrices,
451*77c1e3ccSAndroid Build Coastguard Worker       // so we accumulate into a row-major matrix H_s32. At the end of the
452*77c1e3ccSAndroid Build Coastguard Worker       // algorithm a double transpose transformation will convert H_s32 back to
453*77c1e3ccSAndroid Build Coastguard Worker       // the expected output layout.
454*77c1e3ccSAndroid Build Coastguard Worker       update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
455*77c1e3ccSAndroid Build Coastguard Worker 
456*77c1e3ccSAndroid Build Coastguard Worker       // The last element of the triangle of H_s32 matrix can be computed as a
457*77c1e3ccSAndroid Build Coastguard Worker       // scalar more efficiently.
458*77c1e3ccSAndroid Build Coastguard Worker       H_s32[48 * WIENER_WIN2_ALIGN2 + 48] +=
459*77c1e3ccSAndroid Build Coastguard Worker           DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48];
460*77c1e3ccSAndroid Build Coastguard Worker 
461*77c1e3ccSAndroid Build Coastguard Worker       // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent
462*77c1e3ccSAndroid Build Coastguard Worker       // overflow.
463*77c1e3ccSAndroid Build Coastguard Worker       if (--acc_cnt == 0) {
464*77c1e3ccSAndroid Build Coastguard Worker         acc_cnt = STAT_ACCUMULATOR_MAX;
465*77c1e3ccSAndroid Build Coastguard Worker 
466*77c1e3ccSAndroid Build Coastguard Worker         accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2);
467*77c1e3ccSAndroid Build Coastguard Worker 
468*77c1e3ccSAndroid Build Coastguard Worker         // The widening accumulation is only needed for the upper triangle part
469*77c1e3ccSAndroid Build Coastguard Worker         // of the matrix.
470*77c1e3ccSAndroid Build Coastguard Worker         int64_t *lh = H_s64;
471*77c1e3ccSAndroid Build Coastguard Worker         int32_t *lh32 = H_s32;
472*77c1e3ccSAndroid Build Coastguard Worker         for (int k = 0; k < WIENER_WIN2; ++k) {
473*77c1e3ccSAndroid Build Coastguard Worker           // The widening accumulation is only run for the relevant parts
474*77c1e3ccSAndroid Build Coastguard Worker           // (upper-right triangle) in a row 4-element aligned.
475*77c1e3ccSAndroid Build Coastguard Worker           int k4 = k / 4 * 4;
476*77c1e3ccSAndroid Build Coastguard Worker           accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4);
477*77c1e3ccSAndroid Build Coastguard Worker 
478*77c1e3ccSAndroid Build Coastguard Worker           // Last element of the row is computed separately.
479*77c1e3ccSAndroid Build Coastguard Worker           lh[48] += lh32[48];
480*77c1e3ccSAndroid Build Coastguard Worker           lh32[48] = 0;
481*77c1e3ccSAndroid Build Coastguard Worker 
482*77c1e3ccSAndroid Build Coastguard Worker           lh += WIENER_WIN2_ALIGN2;
483*77c1e3ccSAndroid Build Coastguard Worker           lh32 += WIENER_WIN2_ALIGN2;
484*77c1e3ccSAndroid Build Coastguard Worker         }
485*77c1e3ccSAndroid Build Coastguard Worker       }
486*77c1e3ccSAndroid Build Coastguard Worker 
487*77c1e3ccSAndroid Build Coastguard Worker       j -= 2;
488*77c1e3ccSAndroid Build Coastguard Worker     }
489*77c1e3ccSAndroid Build Coastguard Worker 
490*77c1e3ccSAndroid Build Coastguard Worker     // Computations for odd pixel in the row.
491*77c1e3ccSAndroid Build Coastguard Worker     if (width & 1) {
492*77c1e3ccSAndroid Build Coastguard Worker       // Load two adjacent, overlapping 7x7 matrices: an 8x7 matrix with the
493*77c1e3ccSAndroid Build Coastguard Worker       // middle 6x7 elements being shared.
494*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_rows[4];
495*77c1e3ccSAndroid Build Coastguard Worker       load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride);
496*77c1e3ccSAndroid Build Coastguard Worker 
497*77c1e3ccSAndroid Build Coastguard Worker       const uint8_t *dgd_ptr = dgd + dgd_stride * 6;
498*77c1e3ccSAndroid Build Coastguard Worker       ++dgd;
499*77c1e3ccSAndroid Build Coastguard Worker 
500*77c1e3ccSAndroid Build Coastguard Worker       // Re-arrange (and widen) the combined 8x7 matrix to have a whole 7x7
501*77c1e3ccSAndroid Build Coastguard Worker       // matrix tightly packed into a int16x8_t[6] array. This array contains
502*77c1e3ccSAndroid Build Coastguard Worker       // 48 elements of the 49 (7x7). Compute `dgd - avg` for the whole buffer.
503*77c1e3ccSAndroid Build Coastguard Worker       // The DGD_AVG buffer contains 49 consecutive elements.
504*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t dgd_avg0[6];
505*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
506*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[0] =
507*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
508*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[1] =
509*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
510*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0, dgd_avg0[0]);
511*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
512*77c1e3ccSAndroid Build Coastguard Worker 
513*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1);
514*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[2] =
515*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8));
516*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[3] =
517*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8));
518*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
519*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]);
520*77c1e3ccSAndroid Build Coastguard Worker 
521*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2);
522*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[4] =
523*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8));
524*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[5] =
525*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8));
526*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]);
527*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]);
528*77c1e3ccSAndroid Build Coastguard Worker 
529*77c1e3ccSAndroid Build Coastguard Worker       // The remaining last (49th) element of `dgd - avg`.
530*77c1e3ccSAndroid Build Coastguard Worker       DGD_AVG0[48] = dgd_ptr[6] - avg;
531*77c1e3ccSAndroid Build Coastguard Worker 
532*77c1e3ccSAndroid Build Coastguard Worker       // Accumulate into row-major order variant of matrix M (cross-correlation)
533*77c1e3ccSAndroid Build Coastguard Worker       // for 1 output pixel at a time. M is of size 7 * 7. It needs to be filled
534*77c1e3ccSAndroid Build Coastguard Worker       // such that multiplying one element from src with each element of a row
535*77c1e3ccSAndroid Build Coastguard Worker       // of the wiener window will fill one column of M. However this is not
536*77c1e3ccSAndroid Build Coastguard Worker       // very convenient in terms of memory access, as it means we do
537*77c1e3ccSAndroid Build Coastguard Worker       // contiguous loads of dgd but strided stores to M. As a result, we use an
538*77c1e3ccSAndroid Build Coastguard Worker       // intermediate matrix M_s32 which is instead filled such that one row of
539*77c1e3ccSAndroid Build Coastguard Worker       // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
540*77c1e3ccSAndroid Build Coastguard Worker       // then transposed to return M.
541*77c1e3ccSAndroid Build Coastguard Worker       int src_avg0 = *src++ - avg;
542*77c1e3ccSAndroid Build Coastguard Worker       int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
543*77c1e3ccSAndroid Build Coastguard Worker       update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
544*77c1e3ccSAndroid Build Coastguard Worker       update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
545*77c1e3ccSAndroid Build Coastguard Worker       update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
546*77c1e3ccSAndroid Build Coastguard Worker       update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]);
547*77c1e3ccSAndroid Build Coastguard Worker       update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]);
548*77c1e3ccSAndroid Build Coastguard Worker       update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]);
549*77c1e3ccSAndroid Build Coastguard Worker 
550*77c1e3ccSAndroid Build Coastguard Worker       // Last (49th) element of M_s32 can be computed as scalar more efficiently
551*77c1e3ccSAndroid Build Coastguard Worker       // for 1 output pixel.
552*77c1e3ccSAndroid Build Coastguard Worker       M_s32[48] += DGD_AVG0[48] * src_avg0;
553*77c1e3ccSAndroid Build Coastguard Worker 
554*77c1e3ccSAndroid Build Coastguard Worker       // Start accumulating into row-major order version of matrix H
555*77c1e3ccSAndroid Build Coastguard Worker       // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
556*77c1e3ccSAndroid Build Coastguard Worker       // H is of size 49 * 49. It is filled by multiplying every pair of
557*77c1e3ccSAndroid Build Coastguard Worker       // elements of the wiener window together (vector outer product). Since it
558*77c1e3ccSAndroid Build Coastguard Worker       // is a symmetric matrix, we only compute the upper-right triangle, and
559*77c1e3ccSAndroid Build Coastguard Worker       // then copy it down to the lower-left later. The upper triangle is
560*77c1e3ccSAndroid Build Coastguard Worker       // covered by 4x4 tiles. The original algorithm assumes the M matrix is
561*77c1e3ccSAndroid Build Coastguard Worker       // column-major and the resulting H matrix is also expected to be
562*77c1e3ccSAndroid Build Coastguard Worker       // column-major. It is not efficient to work with column-major matrices,
563*77c1e3ccSAndroid Build Coastguard Worker       // so we accumulate into a row-major matrix H_s32. At the end of the
564*77c1e3ccSAndroid Build Coastguard Worker       // algorithm a double transpose transformation will convert H_s32 back to
565*77c1e3ccSAndroid Build Coastguard Worker       // the expected output layout.
566*77c1e3ccSAndroid Build Coastguard Worker       update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48);
567*77c1e3ccSAndroid Build Coastguard Worker 
568*77c1e3ccSAndroid Build Coastguard Worker       // The last element of the triangle of H_s32 matrix can be computed as
569*77c1e3ccSAndroid Build Coastguard Worker       // scalar more efficiently.
570*77c1e3ccSAndroid Build Coastguard Worker       H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48];
571*77c1e3ccSAndroid Build Coastguard Worker     }
572*77c1e3ccSAndroid Build Coastguard Worker 
573*77c1e3ccSAndroid Build Coastguard Worker     src += src_next;
574*77c1e3ccSAndroid Build Coastguard Worker     dgd += dgd_next;
575*77c1e3ccSAndroid Build Coastguard Worker   } while (--height != 0);
576*77c1e3ccSAndroid Build Coastguard Worker 
577*77c1e3ccSAndroid Build Coastguard Worker   acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, downsample_factor);
578*77c1e3ccSAndroid Build Coastguard Worker 
579*77c1e3ccSAndroid Build Coastguard Worker   update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, downsample_factor);
580*77c1e3ccSAndroid Build Coastguard Worker }
581*77c1e3ccSAndroid Build Coastguard Worker 
582*77c1e3ccSAndroid Build Coastguard Worker // Load 5x5 matrix into 2 and a half 128-bit vectors from consecutive rows, the
583*77c1e3ccSAndroid Build Coastguard Worker // last load address is offset to prevent out-of-bounds access.
load_and_pack_u8_6x5(uint8x16_t dst[3],const uint8_t * src,ptrdiff_t stride)584*77c1e3ccSAndroid Build Coastguard Worker static inline void load_and_pack_u8_6x5(uint8x16_t dst[3], const uint8_t *src,
585*77c1e3ccSAndroid Build Coastguard Worker                                         ptrdiff_t stride) {
586*77c1e3ccSAndroid Build Coastguard Worker   dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
587*77c1e3ccSAndroid Build Coastguard Worker   src += 2 * stride;
588*77c1e3ccSAndroid Build Coastguard Worker   dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride));
589*77c1e3ccSAndroid Build Coastguard Worker   src += 2 * stride;
590*77c1e3ccSAndroid Build Coastguard Worker   dst[2] = vcombine_u8(vld1_u8(src - 3), vdup_n_u8(0));
591*77c1e3ccSAndroid Build Coastguard Worker }
592*77c1e3ccSAndroid Build Coastguard Worker 
compute_stats_win5_downsampled_neon(const uint8_t * dgd,const uint8_t * src,int width,int height,int dgd_stride,int src_stride,int avg,int64_t * M,int64_t * H,int downsample_factor)593*77c1e3ccSAndroid Build Coastguard Worker static inline void compute_stats_win5_downsampled_neon(
594*77c1e3ccSAndroid Build Coastguard Worker     const uint8_t *dgd, const uint8_t *src, int width, int height,
595*77c1e3ccSAndroid Build Coastguard Worker     int dgd_stride, int src_stride, int avg, int64_t *M, int64_t *H,
596*77c1e3ccSAndroid Build Coastguard Worker     int downsample_factor) {
597*77c1e3ccSAndroid Build Coastguard Worker   // Matrix names are capitalized to help readability.
598*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]);
599*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]);
600*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]);
601*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]);
602*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(64, int32_t,
603*77c1e3ccSAndroid Build Coastguard Worker                   H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
604*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(64, int64_t,
605*77c1e3ccSAndroid Build Coastguard Worker                   H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]);
606*77c1e3ccSAndroid Build Coastguard Worker 
607*77c1e3ccSAndroid Build Coastguard Worker   memset(M_s32, 0, sizeof(M_s32));
608*77c1e3ccSAndroid Build Coastguard Worker   memset(M_s64, 0, sizeof(M_s64));
609*77c1e3ccSAndroid Build Coastguard Worker   memset(H_s32, 0, sizeof(H_s32));
610*77c1e3ccSAndroid Build Coastguard Worker   memset(H_s64, 0, sizeof(H_s64));
611*77c1e3ccSAndroid Build Coastguard Worker 
612*77c1e3ccSAndroid Build Coastguard Worker   // Look-up tables to create 8x3 matrix with consecutive elements from two 5x5
613*77c1e3ccSAndroid Build Coastguard Worker   // matrices.
614*77c1e3ccSAndroid Build Coastguard Worker   // clang-format off
615*77c1e3ccSAndroid Build Coastguard Worker   DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5[48]) = {
616*77c1e3ccSAndroid Build Coastguard Worker     0,  1,  2,  3,  4,  8,  9, 10, 11, 12, 16, 17, 18, 19, 20, 24,
617*77c1e3ccSAndroid Build Coastguard Worker     1,  2,  3,  4,  5,  9, 10, 11, 12, 13, 17, 18, 19, 20, 21, 25,
618*77c1e3ccSAndroid Build Coastguard Worker     9, 10, 11, 12, 19, 20, 21, 22, 10, 11, 12, 13, 20, 21, 22, 23,
619*77c1e3ccSAndroid Build Coastguard Worker   };
620*77c1e3ccSAndroid Build Coastguard Worker   // clang-format on
621*77c1e3ccSAndroid Build Coastguard Worker 
622*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t lut0 = vld1q_u8(shuffle_stats5 + 0);
623*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t lut1 = vld1q_u8(shuffle_stats5 + 16);
624*77c1e3ccSAndroid Build Coastguard Worker   const uint8x16_t lut2 = vld1q_u8(shuffle_stats5 + 32);
625*77c1e3ccSAndroid Build Coastguard Worker 
626*77c1e3ccSAndroid Build Coastguard Worker   int acc_cnt = STAT_ACCUMULATOR_MAX;
627*77c1e3ccSAndroid Build Coastguard Worker   const int src_next = downsample_factor * src_stride - width;
628*77c1e3ccSAndroid Build Coastguard Worker   const int dgd_next = downsample_factor * dgd_stride - width;
629*77c1e3ccSAndroid Build Coastguard Worker   const uint8x8_t avg_u8 = vdup_n_u8(avg);
630*77c1e3ccSAndroid Build Coastguard Worker 
631*77c1e3ccSAndroid Build Coastguard Worker   do {
632*77c1e3ccSAndroid Build Coastguard Worker     int j = width;
633*77c1e3ccSAndroid Build Coastguard Worker     while (j >= 2) {
634*77c1e3ccSAndroid Build Coastguard Worker       // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
635*77c1e3ccSAndroid Build Coastguard Worker       // middle 4x5 elements being shared.
636*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_rows[3];
637*77c1e3ccSAndroid Build Coastguard Worker       load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride);
638*77c1e3ccSAndroid Build Coastguard Worker 
639*77c1e3ccSAndroid Build Coastguard Worker       const uint8_t *dgd_ptr = dgd + dgd_stride * 4;
640*77c1e3ccSAndroid Build Coastguard Worker       dgd += 2;
641*77c1e3ccSAndroid Build Coastguard Worker 
642*77c1e3ccSAndroid Build Coastguard Worker       // Re-arrange (and widen) the combined 6x5 matrix to have the 2 whole 5x5
643*77c1e3ccSAndroid Build Coastguard Worker       // matrices (1 for each of the 2 pixels) separated into distinct
644*77c1e3ccSAndroid Build Coastguard Worker       // int16x8_t[3] arrays. These arrays contain 24 elements of the 25 (5x5).
645*77c1e3ccSAndroid Build Coastguard Worker       // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 25
646*77c1e3ccSAndroid Build Coastguard Worker       // consecutive elements.
647*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t dgd_avg0[3];
648*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t dgd_avg1[3];
649*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
650*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[0], dgd_rows[1], lut1);
651*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[1], dgd_rows[2], lut2);
652*77c1e3ccSAndroid Build Coastguard Worker 
653*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[0] =
654*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
655*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[1] =
656*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
657*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[2] =
658*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8));
659*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg1[0] =
660*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8));
661*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg1[1] =
662*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8));
663*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg1[2] =
664*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8));
665*77c1e3ccSAndroid Build Coastguard Worker 
666*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]);
667*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
668*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
669*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG1 + 0, dgd_avg1[0]);
670*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]);
671*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]);
672*77c1e3ccSAndroid Build Coastguard Worker 
673*77c1e3ccSAndroid Build Coastguard Worker       // The remaining last (25th) elements of `dgd - avg`.
674*77c1e3ccSAndroid Build Coastguard Worker       DGD_AVG0[24] = dgd_ptr[4] - avg;
675*77c1e3ccSAndroid Build Coastguard Worker       DGD_AVG1[24] = dgd_ptr[5] - avg;
676*77c1e3ccSAndroid Build Coastguard Worker 
677*77c1e3ccSAndroid Build Coastguard Worker       // Accumulate into row-major variant of matrix M (cross-correlation) for 2
678*77c1e3ccSAndroid Build Coastguard Worker       // output pixels at a time. M is of size 5 * 5. It needs to be filled such
679*77c1e3ccSAndroid Build Coastguard Worker       // that multiplying one element from src with each element of a row of the
680*77c1e3ccSAndroid Build Coastguard Worker       // wiener window will fill one column of M. However this is not very
681*77c1e3ccSAndroid Build Coastguard Worker       // convenient in terms of memory access, as it means we do contiguous
682*77c1e3ccSAndroid Build Coastguard Worker       // loads of dgd but strided stores to M. As a result, we use an
683*77c1e3ccSAndroid Build Coastguard Worker       // intermediate matrix M_s32 which is instead filled such that one row of
684*77c1e3ccSAndroid Build Coastguard Worker       // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
685*77c1e3ccSAndroid Build Coastguard Worker       // then transposed to return M.
686*77c1e3ccSAndroid Build Coastguard Worker       int src_avg0 = *src++ - avg;
687*77c1e3ccSAndroid Build Coastguard Worker       int src_avg1 = *src++ - avg;
688*77c1e3ccSAndroid Build Coastguard Worker       int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
689*77c1e3ccSAndroid Build Coastguard Worker       int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1);
690*77c1e3ccSAndroid Build Coastguard Worker       update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0],
691*77c1e3ccSAndroid Build Coastguard Worker                        dgd_avg1[0]);
692*77c1e3ccSAndroid Build Coastguard Worker       update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1],
693*77c1e3ccSAndroid Build Coastguard Worker                        dgd_avg1[1]);
694*77c1e3ccSAndroid Build Coastguard Worker       update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2],
695*77c1e3ccSAndroid Build Coastguard Worker                        dgd_avg1[2]);
696*77c1e3ccSAndroid Build Coastguard Worker 
697*77c1e3ccSAndroid Build Coastguard Worker       // Last (25th) element of M_s32 can be computed as scalar more efficiently
698*77c1e3ccSAndroid Build Coastguard Worker       // for 2 output pixels.
699*77c1e3ccSAndroid Build Coastguard Worker       M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1;
700*77c1e3ccSAndroid Build Coastguard Worker 
701*77c1e3ccSAndroid Build Coastguard Worker       // Start accumulating into row-major version of matrix H
702*77c1e3ccSAndroid Build Coastguard Worker       // (auto-covariance), it expects the DGD_AVG[01] matrices to also be
703*77c1e3ccSAndroid Build Coastguard Worker       // row-major. H is of size 25 * 25. It is filled by multiplying every pair
704*77c1e3ccSAndroid Build Coastguard Worker       // of elements of the wiener window together (vector outer product). Since
705*77c1e3ccSAndroid Build Coastguard Worker       // it is a symmetric matrix, we only compute the upper-right triangle, and
706*77c1e3ccSAndroid Build Coastguard Worker       // then copy it down to the lower-left later. The upper triangle is
707*77c1e3ccSAndroid Build Coastguard Worker       // covered by 4x4 tiles. The original algorithm assumes the M matrix is
708*77c1e3ccSAndroid Build Coastguard Worker       // column-major and the resulting H matrix is also expected to be
709*77c1e3ccSAndroid Build Coastguard Worker       // column-major. It is not efficient to work with column-major matrices,
710*77c1e3ccSAndroid Build Coastguard Worker       // so we accumulate into a row-major matrix H_s32. At the end of the
711*77c1e3ccSAndroid Build Coastguard Worker       // algorithm a double transpose transformation will convert H_s32 back to
712*77c1e3ccSAndroid Build Coastguard Worker       // the expected output layout.
713*77c1e3ccSAndroid Build Coastguard Worker       update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1);
714*77c1e3ccSAndroid Build Coastguard Worker 
715*77c1e3ccSAndroid Build Coastguard Worker       // The last element of the triangle of H_s32 matrix can be computed as a
716*77c1e3ccSAndroid Build Coastguard Worker       // scalar more efficiently.
717*77c1e3ccSAndroid Build Coastguard Worker       H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
718*77c1e3ccSAndroid Build Coastguard Worker           DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24];
719*77c1e3ccSAndroid Build Coastguard Worker 
720*77c1e3ccSAndroid Build Coastguard Worker       // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent
721*77c1e3ccSAndroid Build Coastguard Worker       // overflow.
722*77c1e3ccSAndroid Build Coastguard Worker       if (--acc_cnt == 0) {
723*77c1e3ccSAndroid Build Coastguard Worker         acc_cnt = STAT_ACCUMULATOR_MAX;
724*77c1e3ccSAndroid Build Coastguard Worker 
725*77c1e3ccSAndroid Build Coastguard Worker         accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2);
726*77c1e3ccSAndroid Build Coastguard Worker 
727*77c1e3ccSAndroid Build Coastguard Worker         // The widening accumulation is only needed for the upper triangle part
728*77c1e3ccSAndroid Build Coastguard Worker         // of the matrix.
729*77c1e3ccSAndroid Build Coastguard Worker         int64_t *lh = H_s64;
730*77c1e3ccSAndroid Build Coastguard Worker         int32_t *lh32 = H_s32;
731*77c1e3ccSAndroid Build Coastguard Worker         for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) {
732*77c1e3ccSAndroid Build Coastguard Worker           // The widening accumulation is only run for the relevant parts
733*77c1e3ccSAndroid Build Coastguard Worker           // (upper-right triangle) in a row 4-element aligned.
734*77c1e3ccSAndroid Build Coastguard Worker           int k4 = k / 4 * 4;
735*77c1e3ccSAndroid Build Coastguard Worker           accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4);
736*77c1e3ccSAndroid Build Coastguard Worker 
737*77c1e3ccSAndroid Build Coastguard Worker           // Last element of the row is computed separately.
738*77c1e3ccSAndroid Build Coastguard Worker           lh[24] += lh32[24];
739*77c1e3ccSAndroid Build Coastguard Worker           lh32[24] = 0;
740*77c1e3ccSAndroid Build Coastguard Worker 
741*77c1e3ccSAndroid Build Coastguard Worker           lh += WIENER_WIN2_REDUCED_ALIGN2;
742*77c1e3ccSAndroid Build Coastguard Worker           lh32 += WIENER_WIN2_REDUCED_ALIGN2;
743*77c1e3ccSAndroid Build Coastguard Worker         }
744*77c1e3ccSAndroid Build Coastguard Worker       }
745*77c1e3ccSAndroid Build Coastguard Worker 
746*77c1e3ccSAndroid Build Coastguard Worker       j -= 2;
747*77c1e3ccSAndroid Build Coastguard Worker     }
748*77c1e3ccSAndroid Build Coastguard Worker 
749*77c1e3ccSAndroid Build Coastguard Worker     // Computations for odd pixel in the row.
750*77c1e3ccSAndroid Build Coastguard Worker     if (width & 1) {
751*77c1e3ccSAndroid Build Coastguard Worker       // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the
752*77c1e3ccSAndroid Build Coastguard Worker       // middle 4x5 elements being shared.
753*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_rows[3];
754*77c1e3ccSAndroid Build Coastguard Worker       load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride);
755*77c1e3ccSAndroid Build Coastguard Worker 
756*77c1e3ccSAndroid Build Coastguard Worker       const uint8_t *dgd_ptr = dgd + dgd_stride * 4;
757*77c1e3ccSAndroid Build Coastguard Worker       ++dgd;
758*77c1e3ccSAndroid Build Coastguard Worker 
759*77c1e3ccSAndroid Build Coastguard Worker       // Re-arrange (and widen) the combined 6x5 matrix to have a whole 5x5
760*77c1e3ccSAndroid Build Coastguard Worker       // matrix tightly packed into a int16x8_t[3] array. This array contains
761*77c1e3ccSAndroid Build Coastguard Worker       // 24 elements of the 25 (5x5). Compute `dgd - avg` for the whole buffer.
762*77c1e3ccSAndroid Build Coastguard Worker       // The DGD_AVG buffer contains 25 consecutive elements.
763*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t dgd_avg0[3];
764*77c1e3ccSAndroid Build Coastguard Worker       uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0);
765*77c1e3ccSAndroid Build Coastguard Worker       uint8x8_t dgd_shuf1 = tbl2(dgd_rows[1], dgd_rows[2], vget_low_u8(lut2));
766*77c1e3ccSAndroid Build Coastguard Worker 
767*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[0] =
768*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8));
769*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[1] =
770*77c1e3ccSAndroid Build Coastguard Worker           vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8));
771*77c1e3ccSAndroid Build Coastguard Worker       dgd_avg0[2] = vreinterpretq_s16_u16(vsubl_u8(dgd_shuf1, avg_u8));
772*77c1e3ccSAndroid Build Coastguard Worker 
773*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]);
774*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]);
775*77c1e3ccSAndroid Build Coastguard Worker       vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]);
776*77c1e3ccSAndroid Build Coastguard Worker 
777*77c1e3ccSAndroid Build Coastguard Worker       // The remaining last (25th) element of `dgd - avg`.
778*77c1e3ccSAndroid Build Coastguard Worker       DGD_AVG0[24] = dgd_ptr[4] - avg;
779*77c1e3ccSAndroid Build Coastguard Worker 
780*77c1e3ccSAndroid Build Coastguard Worker       // Accumulate into row-major order variant of matrix M (cross-correlation)
781*77c1e3ccSAndroid Build Coastguard Worker       // for 1 output pixel at a time. M is of size 5 * 5. It needs to be filled
782*77c1e3ccSAndroid Build Coastguard Worker       // such that multiplying one element from src with each element of a row
783*77c1e3ccSAndroid Build Coastguard Worker       // of the wiener window will fill one column of M. However this is not
784*77c1e3ccSAndroid Build Coastguard Worker       // very convenient in terms of memory access, as it means we do
785*77c1e3ccSAndroid Build Coastguard Worker       // contiguous loads of dgd but strided stores to M. As a result, we use an
786*77c1e3ccSAndroid Build Coastguard Worker       // intermediate matrix M_s32 which is instead filled such that one row of
787*77c1e3ccSAndroid Build Coastguard Worker       // the wiener window gives one row of M_s32. Once fully computed, M_s32 is
788*77c1e3ccSAndroid Build Coastguard Worker       // then transposed to return M.
789*77c1e3ccSAndroid Build Coastguard Worker       int src_avg0 = *src++ - avg;
790*77c1e3ccSAndroid Build Coastguard Worker       int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0);
791*77c1e3ccSAndroid Build Coastguard Worker       update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]);
792*77c1e3ccSAndroid Build Coastguard Worker       update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]);
793*77c1e3ccSAndroid Build Coastguard Worker       update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]);
794*77c1e3ccSAndroid Build Coastguard Worker 
795*77c1e3ccSAndroid Build Coastguard Worker       // Last (25th) element of M_s32 can be computed as scalar more efficiently
796*77c1e3ccSAndroid Build Coastguard Worker       // for 1 output pixel.
797*77c1e3ccSAndroid Build Coastguard Worker       M_s32[24] += DGD_AVG0[24] * src_avg0;
798*77c1e3ccSAndroid Build Coastguard Worker 
799*77c1e3ccSAndroid Build Coastguard Worker       // Start accumulating into row-major order version of matrix H
800*77c1e3ccSAndroid Build Coastguard Worker       // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major.
801*77c1e3ccSAndroid Build Coastguard Worker       // H is of size 25 * 25. It is filled by multiplying every pair of
802*77c1e3ccSAndroid Build Coastguard Worker       // elements of the wiener window together (vector outer product). Since it
803*77c1e3ccSAndroid Build Coastguard Worker       // is a symmetric matrix, we only compute the upper-right triangle, and
804*77c1e3ccSAndroid Build Coastguard Worker       // then copy it down to the lower-left later. The upper triangle is
805*77c1e3ccSAndroid Build Coastguard Worker       // covered by 4x4 tiles. The original algorithm assumes the M matrix is
806*77c1e3ccSAndroid Build Coastguard Worker       // column-major and the resulting H matrix is also expected to be
807*77c1e3ccSAndroid Build Coastguard Worker       // column-major. It is not efficient to work with column-major matrices,
808*77c1e3ccSAndroid Build Coastguard Worker       // so we accumulate into a row-major matrix H_s32. At the end of the
809*77c1e3ccSAndroid Build Coastguard Worker       // algorithm a double transpose transformation will convert H_s32 back to
810*77c1e3ccSAndroid Build Coastguard Worker       // the expected output layout.
811*77c1e3ccSAndroid Build Coastguard Worker       update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24);
812*77c1e3ccSAndroid Build Coastguard Worker 
813*77c1e3ccSAndroid Build Coastguard Worker       // The last element of the triangle of H_s32 matrix can be computed as a
814*77c1e3ccSAndroid Build Coastguard Worker       // scalar more efficiently.
815*77c1e3ccSAndroid Build Coastguard Worker       H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] +=
816*77c1e3ccSAndroid Build Coastguard Worker           DGD_AVG0[24] * DGD_AVG0[24];
817*77c1e3ccSAndroid Build Coastguard Worker     }
818*77c1e3ccSAndroid Build Coastguard Worker 
819*77c1e3ccSAndroid Build Coastguard Worker     src += src_next;
820*77c1e3ccSAndroid Build Coastguard Worker     dgd += dgd_next;
821*77c1e3ccSAndroid Build Coastguard Worker   } while (--height != 0);
822*77c1e3ccSAndroid Build Coastguard Worker 
823*77c1e3ccSAndroid Build Coastguard Worker   acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, downsample_factor);
824*77c1e3ccSAndroid Build Coastguard Worker 
825*77c1e3ccSAndroid Build Coastguard Worker   update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2,
826*77c1e3ccSAndroid Build Coastguard Worker            downsample_factor);
827*77c1e3ccSAndroid Build Coastguard Worker }
828*77c1e3ccSAndroid Build Coastguard Worker 
hadd_update_6_stats_neon(const int64_t * const src,const int32x4_t * deltas,int64_t * const dst)829*77c1e3ccSAndroid Build Coastguard Worker static inline void hadd_update_6_stats_neon(const int64_t *const src,
830*77c1e3ccSAndroid Build Coastguard Worker                                             const int32x4_t *deltas,
831*77c1e3ccSAndroid Build Coastguard Worker                                             int64_t *const dst) {
832*77c1e3ccSAndroid Build Coastguard Worker   int32x4_t delta01 = horizontal_add_2d_s32(deltas[0], deltas[1]);
833*77c1e3ccSAndroid Build Coastguard Worker   int32x4_t delta23 = horizontal_add_2d_s32(deltas[2], deltas[3]);
834*77c1e3ccSAndroid Build Coastguard Worker   int32x4_t delta45 = horizontal_add_2d_s32(deltas[4], deltas[5]);
835*77c1e3ccSAndroid Build Coastguard Worker 
836*77c1e3ccSAndroid Build Coastguard Worker   int64x2_t delta01_s64 = vpaddlq_s32(delta01);
837*77c1e3ccSAndroid Build Coastguard Worker   int64x2_t delta23_s64 = vpaddlq_s32(delta23);
838*77c1e3ccSAndroid Build Coastguard Worker   int64x2_t delta45_s64 = vpaddlq_s32(delta45);
839*77c1e3ccSAndroid Build Coastguard Worker 
840*77c1e3ccSAndroid Build Coastguard Worker   int64x2_t src0 = vld1q_s64(src);
841*77c1e3ccSAndroid Build Coastguard Worker   int64x2_t src1 = vld1q_s64(src + 2);
842*77c1e3ccSAndroid Build Coastguard Worker   int64x2_t src2 = vld1q_s64(src + 4);
843*77c1e3ccSAndroid Build Coastguard Worker 
844*77c1e3ccSAndroid Build Coastguard Worker   vst1q_s64(dst, vaddq_s64(src0, delta01_s64));
845*77c1e3ccSAndroid Build Coastguard Worker   vst1q_s64(dst + 2, vaddq_s64(src1, delta23_s64));
846*77c1e3ccSAndroid Build Coastguard Worker   vst1q_s64(dst + 4, vaddq_s64(src2, delta45_s64));
847*77c1e3ccSAndroid Build Coastguard Worker }
848*77c1e3ccSAndroid Build Coastguard Worker 
hadd_update_4_stats_neon(const int64_t * const src,const int32x4_t * deltas,int64_t * const dst)849*77c1e3ccSAndroid Build Coastguard Worker static inline void hadd_update_4_stats_neon(const int64_t *const src,
850*77c1e3ccSAndroid Build Coastguard Worker                                             const int32x4_t *deltas,
851*77c1e3ccSAndroid Build Coastguard Worker                                             int64_t *const dst) {
852*77c1e3ccSAndroid Build Coastguard Worker   int32x4_t delta01 = horizontal_add_2d_s32(deltas[0], deltas[1]);
853*77c1e3ccSAndroid Build Coastguard Worker   int32x4_t delta23 = horizontal_add_2d_s32(deltas[2], deltas[3]);
854*77c1e3ccSAndroid Build Coastguard Worker   int64x2_t delta01_s64 = vpaddlq_s32(delta01);
855*77c1e3ccSAndroid Build Coastguard Worker   int64x2_t delta23_s64 = vpaddlq_s32(delta23);
856*77c1e3ccSAndroid Build Coastguard Worker 
857*77c1e3ccSAndroid Build Coastguard Worker   int64x2_t src0 = vld1q_s64(src);
858*77c1e3ccSAndroid Build Coastguard Worker   int64x2_t src1 = vld1q_s64(src + 2);
859*77c1e3ccSAndroid Build Coastguard Worker   vst1q_s64(dst, vaddq_s64(src0, delta01_s64));
860*77c1e3ccSAndroid Build Coastguard Worker   vst1q_s64(dst + 2, vaddq_s64(src1, delta23_s64));
861*77c1e3ccSAndroid Build Coastguard Worker }
862*77c1e3ccSAndroid Build Coastguard Worker 
compute_stats_win5_neon(const int16_t * const d,const int32_t d_stride,const int16_t * const s,const int32_t s_stride,const int32_t width,const int32_t height,int64_t * const M,int64_t * const H)863*77c1e3ccSAndroid Build Coastguard Worker static inline void compute_stats_win5_neon(
864*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *const d, const int32_t d_stride, const int16_t *const s,
865*77c1e3ccSAndroid Build Coastguard Worker     const int32_t s_stride, const int32_t width, const int32_t height,
866*77c1e3ccSAndroid Build Coastguard Worker     int64_t *const M, int64_t *const H) {
867*77c1e3ccSAndroid Build Coastguard Worker   const int32_t wiener_win = WIENER_WIN_CHROMA;
868*77c1e3ccSAndroid Build Coastguard Worker   const int32_t wiener_win2 = wiener_win * wiener_win;
869*77c1e3ccSAndroid Build Coastguard Worker   const int32_t w16 = width & ~15;
870*77c1e3ccSAndroid Build Coastguard Worker   const int32_t h8 = height & ~7;
871*77c1e3ccSAndroid Build Coastguard Worker   int16x8_t mask[2];
872*77c1e3ccSAndroid Build Coastguard Worker   mask[0] = vld1q_s16(&(mask_16bit[16]) - width % 16);
873*77c1e3ccSAndroid Build Coastguard Worker   mask[1] = vld1q_s16(&(mask_16bit[16]) - width % 16 + 8);
874*77c1e3ccSAndroid Build Coastguard Worker   const int bit_depth = 8;
875*77c1e3ccSAndroid Build Coastguard Worker   int32_t i, j, x, y;
876*77c1e3ccSAndroid Build Coastguard Worker 
877*77c1e3ccSAndroid Build Coastguard Worker   const int32_t num_bit_left =
878*77c1e3ccSAndroid Build Coastguard Worker       32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 2 /* SIMD */;
879*77c1e3ccSAndroid Build Coastguard Worker   const int32_t h_allowed =
880*77c1e3ccSAndroid Build Coastguard Worker       (1 << num_bit_left) / (w16 + ((w16 != width) ? 16 : 0));
881*77c1e3ccSAndroid Build Coastguard Worker 
882*77c1e3ccSAndroid Build Coastguard Worker   // Step 1: Calculate the top edge of the whole matrix, i.e., the top
883*77c1e3ccSAndroid Build Coastguard Worker   // edge of each triangle and square on the top row.
884*77c1e3ccSAndroid Build Coastguard Worker   j = 0;
885*77c1e3ccSAndroid Build Coastguard Worker   do {
886*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *s_t = s;
887*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *d_t = d;
888*77c1e3ccSAndroid Build Coastguard Worker     int32_t height_t = 0;
889*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t sum_m[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) };
890*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t sum_h[WIENER_WIN_CHROMA] = { vdupq_n_s64(0) };
891*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t src[2], dgd[2];
892*77c1e3ccSAndroid Build Coastguard Worker 
893*77c1e3ccSAndroid Build Coastguard Worker     do {
894*77c1e3ccSAndroid Build Coastguard Worker       const int32_t h_t =
895*77c1e3ccSAndroid Build Coastguard Worker           ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed;
896*77c1e3ccSAndroid Build Coastguard Worker       int32x4_t row_m[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) };
897*77c1e3ccSAndroid Build Coastguard Worker       int32x4_t row_h[WIENER_WIN_CHROMA] = { vdupq_n_s32(0) };
898*77c1e3ccSAndroid Build Coastguard Worker 
899*77c1e3ccSAndroid Build Coastguard Worker       y = h_t;
900*77c1e3ccSAndroid Build Coastguard Worker       do {
901*77c1e3ccSAndroid Build Coastguard Worker         x = 0;
902*77c1e3ccSAndroid Build Coastguard Worker         while (x < w16) {
903*77c1e3ccSAndroid Build Coastguard Worker           src[0] = vld1q_s16(s_t + x + 0);
904*77c1e3ccSAndroid Build Coastguard Worker           src[1] = vld1q_s16(s_t + x + 8);
905*77c1e3ccSAndroid Build Coastguard Worker           dgd[0] = vld1q_s16(d_t + x + 0);
906*77c1e3ccSAndroid Build Coastguard Worker           dgd[1] = vld1q_s16(d_t + x + 8);
907*77c1e3ccSAndroid Build Coastguard Worker           stats_top_win5_neon(src, dgd, d_t + j + x, d_stride, row_m, row_h);
908*77c1e3ccSAndroid Build Coastguard Worker           x += 16;
909*77c1e3ccSAndroid Build Coastguard Worker         }
910*77c1e3ccSAndroid Build Coastguard Worker 
911*77c1e3ccSAndroid Build Coastguard Worker         if (w16 != width) {
912*77c1e3ccSAndroid Build Coastguard Worker           src[0] = vld1q_s16(s_t + w16 + 0);
913*77c1e3ccSAndroid Build Coastguard Worker           src[1] = vld1q_s16(s_t + w16 + 8);
914*77c1e3ccSAndroid Build Coastguard Worker           dgd[0] = vld1q_s16(d_t + w16 + 0);
915*77c1e3ccSAndroid Build Coastguard Worker           dgd[1] = vld1q_s16(d_t + w16 + 8);
916*77c1e3ccSAndroid Build Coastguard Worker           src[0] = vandq_s16(src[0], mask[0]);
917*77c1e3ccSAndroid Build Coastguard Worker           src[1] = vandq_s16(src[1], mask[1]);
918*77c1e3ccSAndroid Build Coastguard Worker           dgd[0] = vandq_s16(dgd[0], mask[0]);
919*77c1e3ccSAndroid Build Coastguard Worker           dgd[1] = vandq_s16(dgd[1], mask[1]);
920*77c1e3ccSAndroid Build Coastguard Worker           stats_top_win5_neon(src, dgd, d_t + j + w16, d_stride, row_m, row_h);
921*77c1e3ccSAndroid Build Coastguard Worker         }
922*77c1e3ccSAndroid Build Coastguard Worker 
923*77c1e3ccSAndroid Build Coastguard Worker         s_t += s_stride;
924*77c1e3ccSAndroid Build Coastguard Worker         d_t += d_stride;
925*77c1e3ccSAndroid Build Coastguard Worker       } while (--y);
926*77c1e3ccSAndroid Build Coastguard Worker 
927*77c1e3ccSAndroid Build Coastguard Worker       sum_m[0] = vpadalq_s32(sum_m[0], row_m[0]);
928*77c1e3ccSAndroid Build Coastguard Worker       sum_m[1] = vpadalq_s32(sum_m[1], row_m[1]);
929*77c1e3ccSAndroid Build Coastguard Worker       sum_m[2] = vpadalq_s32(sum_m[2], row_m[2]);
930*77c1e3ccSAndroid Build Coastguard Worker       sum_m[3] = vpadalq_s32(sum_m[3], row_m[3]);
931*77c1e3ccSAndroid Build Coastguard Worker       sum_m[4] = vpadalq_s32(sum_m[4], row_m[4]);
932*77c1e3ccSAndroid Build Coastguard Worker       sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]);
933*77c1e3ccSAndroid Build Coastguard Worker       sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]);
934*77c1e3ccSAndroid Build Coastguard Worker       sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]);
935*77c1e3ccSAndroid Build Coastguard Worker       sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]);
936*77c1e3ccSAndroid Build Coastguard Worker       sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]);
937*77c1e3ccSAndroid Build Coastguard Worker 
938*77c1e3ccSAndroid Build Coastguard Worker       height_t += h_t;
939*77c1e3ccSAndroid Build Coastguard Worker     } while (height_t < height);
940*77c1e3ccSAndroid Build Coastguard Worker 
941*77c1e3ccSAndroid Build Coastguard Worker #if AOM_ARCH_AARCH64
942*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t sum_m0 = vpaddq_s64(sum_m[0], sum_m[1]);
943*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t sum_m2 = vpaddq_s64(sum_m[2], sum_m[3]);
944*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(&M[wiener_win * j + 0], sum_m0);
945*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(&M[wiener_win * j + 2], sum_m2);
946*77c1e3ccSAndroid Build Coastguard Worker     M[wiener_win * j + 4] = vaddvq_s64(sum_m[4]);
947*77c1e3ccSAndroid Build Coastguard Worker 
948*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]);
949*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t sum_h2 = vpaddq_s64(sum_h[2], sum_h[3]);
950*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(&H[wiener_win * j + 0], sum_h0);
951*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(&H[wiener_win * j + 2], sum_h2);
952*77c1e3ccSAndroid Build Coastguard Worker     H[wiener_win * j + 4] = vaddvq_s64(sum_h[4]);
953*77c1e3ccSAndroid Build Coastguard Worker #else
954*77c1e3ccSAndroid Build Coastguard Worker     M[wiener_win * j + 0] = horizontal_add_s64x2(sum_m[0]);
955*77c1e3ccSAndroid Build Coastguard Worker     M[wiener_win * j + 1] = horizontal_add_s64x2(sum_m[1]);
956*77c1e3ccSAndroid Build Coastguard Worker     M[wiener_win * j + 2] = horizontal_add_s64x2(sum_m[2]);
957*77c1e3ccSAndroid Build Coastguard Worker     M[wiener_win * j + 3] = horizontal_add_s64x2(sum_m[3]);
958*77c1e3ccSAndroid Build Coastguard Worker     M[wiener_win * j + 4] = horizontal_add_s64x2(sum_m[4]);
959*77c1e3ccSAndroid Build Coastguard Worker 
960*77c1e3ccSAndroid Build Coastguard Worker     H[wiener_win * j + 0] = horizontal_add_s64x2(sum_h[0]);
961*77c1e3ccSAndroid Build Coastguard Worker     H[wiener_win * j + 1] = horizontal_add_s64x2(sum_h[1]);
962*77c1e3ccSAndroid Build Coastguard Worker     H[wiener_win * j + 2] = horizontal_add_s64x2(sum_h[2]);
963*77c1e3ccSAndroid Build Coastguard Worker     H[wiener_win * j + 3] = horizontal_add_s64x2(sum_h[3]);
964*77c1e3ccSAndroid Build Coastguard Worker     H[wiener_win * j + 4] = horizontal_add_s64x2(sum_h[4]);
965*77c1e3ccSAndroid Build Coastguard Worker #endif  // AOM_ARCH_AARCH64
966*77c1e3ccSAndroid Build Coastguard Worker   } while (++j < wiener_win);
967*77c1e3ccSAndroid Build Coastguard Worker 
968*77c1e3ccSAndroid Build Coastguard Worker   // Step 2: Calculate the left edge of each square on the top row.
969*77c1e3ccSAndroid Build Coastguard Worker   j = 1;
970*77c1e3ccSAndroid Build Coastguard Worker   do {
971*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *d_t = d;
972*77c1e3ccSAndroid Build Coastguard Worker     int32_t height_t = 0;
973*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t sum_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s64(0) };
974*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t dgd[2];
975*77c1e3ccSAndroid Build Coastguard Worker 
976*77c1e3ccSAndroid Build Coastguard Worker     do {
977*77c1e3ccSAndroid Build Coastguard Worker       const int32_t h_t =
978*77c1e3ccSAndroid Build Coastguard Worker           ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed;
979*77c1e3ccSAndroid Build Coastguard Worker       int32x4_t row_h[WIENER_WIN_CHROMA - 1] = { vdupq_n_s32(0) };
980*77c1e3ccSAndroid Build Coastguard Worker 
981*77c1e3ccSAndroid Build Coastguard Worker       y = h_t;
982*77c1e3ccSAndroid Build Coastguard Worker       do {
983*77c1e3ccSAndroid Build Coastguard Worker         x = 0;
984*77c1e3ccSAndroid Build Coastguard Worker         while (x < w16) {
985*77c1e3ccSAndroid Build Coastguard Worker           dgd[0] = vld1q_s16(d_t + j + x + 0);
986*77c1e3ccSAndroid Build Coastguard Worker           dgd[1] = vld1q_s16(d_t + j + x + 8);
987*77c1e3ccSAndroid Build Coastguard Worker           stats_left_win5_neon(dgd, d_t + x, d_stride, row_h);
988*77c1e3ccSAndroid Build Coastguard Worker           x += 16;
989*77c1e3ccSAndroid Build Coastguard Worker         }
990*77c1e3ccSAndroid Build Coastguard Worker 
991*77c1e3ccSAndroid Build Coastguard Worker         if (w16 != width) {
992*77c1e3ccSAndroid Build Coastguard Worker           dgd[0] = vld1q_s16(d_t + j + x + 0);
993*77c1e3ccSAndroid Build Coastguard Worker           dgd[1] = vld1q_s16(d_t + j + x + 8);
994*77c1e3ccSAndroid Build Coastguard Worker           dgd[0] = vandq_s16(dgd[0], mask[0]);
995*77c1e3ccSAndroid Build Coastguard Worker           dgd[1] = vandq_s16(dgd[1], mask[1]);
996*77c1e3ccSAndroid Build Coastguard Worker           stats_left_win5_neon(dgd, d_t + x, d_stride, row_h);
997*77c1e3ccSAndroid Build Coastguard Worker         }
998*77c1e3ccSAndroid Build Coastguard Worker 
999*77c1e3ccSAndroid Build Coastguard Worker         d_t += d_stride;
1000*77c1e3ccSAndroid Build Coastguard Worker       } while (--y);
1001*77c1e3ccSAndroid Build Coastguard Worker 
1002*77c1e3ccSAndroid Build Coastguard Worker       sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]);
1003*77c1e3ccSAndroid Build Coastguard Worker       sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]);
1004*77c1e3ccSAndroid Build Coastguard Worker       sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]);
1005*77c1e3ccSAndroid Build Coastguard Worker       sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]);
1006*77c1e3ccSAndroid Build Coastguard Worker 
1007*77c1e3ccSAndroid Build Coastguard Worker       height_t += h_t;
1008*77c1e3ccSAndroid Build Coastguard Worker     } while (height_t < height);
1009*77c1e3ccSAndroid Build Coastguard Worker 
1010*77c1e3ccSAndroid Build Coastguard Worker #if AOM_ARCH_AARCH64
1011*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]);
1012*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t sum_h1 = vpaddq_s64(sum_h[2], sum_h[3]);
1013*77c1e3ccSAndroid Build Coastguard Worker     vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h0));
1014*77c1e3ccSAndroid Build Coastguard Worker     vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h0));
1015*77c1e3ccSAndroid Build Coastguard Worker     vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h1));
1016*77c1e3ccSAndroid Build Coastguard Worker     vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h1));
1017*77c1e3ccSAndroid Build Coastguard Worker #else
1018*77c1e3ccSAndroid Build Coastguard Worker     H[1 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[0]);
1019*77c1e3ccSAndroid Build Coastguard Worker     H[2 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[1]);
1020*77c1e3ccSAndroid Build Coastguard Worker     H[3 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[2]);
1021*77c1e3ccSAndroid Build Coastguard Worker     H[4 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[3]);
1022*77c1e3ccSAndroid Build Coastguard Worker #endif  // AOM_ARCH_AARCH64
1023*77c1e3ccSAndroid Build Coastguard Worker   } while (++j < wiener_win);
1024*77c1e3ccSAndroid Build Coastguard Worker 
1025*77c1e3ccSAndroid Build Coastguard Worker   // Step 3: Derive the top edge of each triangle along the diagonal. No
1026*77c1e3ccSAndroid Build Coastguard Worker   // triangle in top row.
1027*77c1e3ccSAndroid Build Coastguard Worker   {
1028*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *d_t = d;
1029*77c1e3ccSAndroid Build Coastguard Worker 
1030*77c1e3ccSAndroid Build Coastguard Worker     if (height % 2) {
1031*77c1e3ccSAndroid Build Coastguard Worker       int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
1032*77c1e3ccSAndroid Build Coastguard Worker       int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
1033*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t ds[WIENER_WIN * 2];
1034*77c1e3ccSAndroid Build Coastguard Worker 
1035*77c1e3ccSAndroid Build Coastguard Worker       load_s16_8x4(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6]);
1036*77c1e3ccSAndroid Build Coastguard Worker       load_s16_8x4(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7]);
1037*77c1e3ccSAndroid Build Coastguard Worker       d_t += 4 * d_stride;
1038*77c1e3ccSAndroid Build Coastguard Worker 
1039*77c1e3ccSAndroid Build Coastguard Worker       step3_win5_oneline_neon(&d_t, d_stride, width, height, ds, deltas);
1040*77c1e3ccSAndroid Build Coastguard Worker       transpose_arrays_s32_8x8(deltas, deltas_tr);
1041*77c1e3ccSAndroid Build Coastguard Worker 
1042*77c1e3ccSAndroid Build Coastguard Worker       update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
1043*77c1e3ccSAndroid Build Coastguard Worker                           deltas_tr[0], vgetq_lane_s32(deltas_tr[4], 0),
1044*77c1e3ccSAndroid Build Coastguard Worker                           H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
1045*77c1e3ccSAndroid Build Coastguard Worker 
1046*77c1e3ccSAndroid Build Coastguard Worker       update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
1047*77c1e3ccSAndroid Build Coastguard Worker                           deltas_tr[1], vgetq_lane_s32(deltas_tr[5], 0),
1048*77c1e3ccSAndroid Build Coastguard Worker                           H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
1049*77c1e3ccSAndroid Build Coastguard Worker 
1050*77c1e3ccSAndroid Build Coastguard Worker       update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
1051*77c1e3ccSAndroid Build Coastguard Worker                           deltas_tr[2], vgetq_lane_s32(deltas_tr[6], 0),
1052*77c1e3ccSAndroid Build Coastguard Worker                           H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
1053*77c1e3ccSAndroid Build Coastguard Worker 
1054*77c1e3ccSAndroid Build Coastguard Worker       update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
1055*77c1e3ccSAndroid Build Coastguard Worker                           deltas_tr[3], vgetq_lane_s32(deltas_tr[7], 0),
1056*77c1e3ccSAndroid Build Coastguard Worker                           H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
1057*77c1e3ccSAndroid Build Coastguard Worker 
1058*77c1e3ccSAndroid Build Coastguard Worker     } else {
1059*77c1e3ccSAndroid Build Coastguard Worker       int32x4_t deltas[WIENER_WIN_CHROMA * 2] = { vdupq_n_s32(0) };
1060*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t ds[WIENER_WIN_CHROMA * 2];
1061*77c1e3ccSAndroid Build Coastguard Worker 
1062*77c1e3ccSAndroid Build Coastguard Worker       ds[0] = load_unaligned_s16_4x2(d_t + 0 * d_stride, width);
1063*77c1e3ccSAndroid Build Coastguard Worker       ds[1] = load_unaligned_s16_4x2(d_t + 1 * d_stride, width);
1064*77c1e3ccSAndroid Build Coastguard Worker       ds[2] = load_unaligned_s16_4x2(d_t + 2 * d_stride, width);
1065*77c1e3ccSAndroid Build Coastguard Worker       ds[3] = load_unaligned_s16_4x2(d_t + 3 * d_stride, width);
1066*77c1e3ccSAndroid Build Coastguard Worker 
1067*77c1e3ccSAndroid Build Coastguard Worker       step3_win5_neon(d_t + 4 * d_stride, d_stride, width, height, ds, deltas);
1068*77c1e3ccSAndroid Build Coastguard Worker 
1069*77c1e3ccSAndroid Build Coastguard Worker       transpose_elems_inplace_s32_4x4(&deltas[0], &deltas[1], &deltas[2],
1070*77c1e3ccSAndroid Build Coastguard Worker                                       &deltas[3]);
1071*77c1e3ccSAndroid Build Coastguard Worker 
1072*77c1e3ccSAndroid Build Coastguard Worker       update_5_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
1073*77c1e3ccSAndroid Build Coastguard Worker                           deltas[0], vgetq_lane_s32(deltas[4], 0),
1074*77c1e3ccSAndroid Build Coastguard Worker                           H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
1075*77c1e3ccSAndroid Build Coastguard Worker 
1076*77c1e3ccSAndroid Build Coastguard Worker       update_5_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
1077*77c1e3ccSAndroid Build Coastguard Worker                           deltas[1], vgetq_lane_s32(deltas[4], 1),
1078*77c1e3ccSAndroid Build Coastguard Worker                           H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
1079*77c1e3ccSAndroid Build Coastguard Worker 
1080*77c1e3ccSAndroid Build Coastguard Worker       update_5_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
1081*77c1e3ccSAndroid Build Coastguard Worker                           deltas[2], vgetq_lane_s32(deltas[4], 2),
1082*77c1e3ccSAndroid Build Coastguard Worker                           H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
1083*77c1e3ccSAndroid Build Coastguard Worker 
1084*77c1e3ccSAndroid Build Coastguard Worker       update_5_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
1085*77c1e3ccSAndroid Build Coastguard Worker                           deltas[3], vgetq_lane_s32(deltas[4], 3),
1086*77c1e3ccSAndroid Build Coastguard Worker                           H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
1087*77c1e3ccSAndroid Build Coastguard Worker     }
1088*77c1e3ccSAndroid Build Coastguard Worker   }
1089*77c1e3ccSAndroid Build Coastguard Worker 
1090*77c1e3ccSAndroid Build Coastguard Worker   // Step 4: Derive the top and left edge of each square. No square in top and
1091*77c1e3ccSAndroid Build Coastguard Worker   // bottom row.
1092*77c1e3ccSAndroid Build Coastguard Worker 
1093*77c1e3ccSAndroid Build Coastguard Worker   {
1094*77c1e3ccSAndroid Build Coastguard Worker     y = h8;
1095*77c1e3ccSAndroid Build Coastguard Worker 
1096*77c1e3ccSAndroid Build Coastguard Worker     int16x4_t d_s[12];
1097*77c1e3ccSAndroid Build Coastguard Worker     int16x4_t d_e[12];
1098*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *d_t = d;
1099*77c1e3ccSAndroid Build Coastguard Worker     int16x4_t zeros = vdup_n_s16(0);
1100*77c1e3ccSAndroid Build Coastguard Worker     load_s16_4x4(d_t, d_stride, &d_s[0], &d_s[1], &d_s[2], &d_s[3]);
1101*77c1e3ccSAndroid Build Coastguard Worker     load_s16_4x4(d_t + width, d_stride, &d_e[0], &d_e[1], &d_e[2], &d_e[3]);
1102*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t deltas[6][18] = { { vdupq_n_s32(0) }, { vdupq_n_s32(0) } };
1103*77c1e3ccSAndroid Build Coastguard Worker 
1104*77c1e3ccSAndroid Build Coastguard Worker     while (y >= 8) {
1105*77c1e3ccSAndroid Build Coastguard Worker       load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6],
1106*77c1e3ccSAndroid Build Coastguard Worker                    &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]);
1107*77c1e3ccSAndroid Build Coastguard Worker       load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5],
1108*77c1e3ccSAndroid Build Coastguard Worker                    &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]);
1109*77c1e3ccSAndroid Build Coastguard Worker 
1110*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t s_tr[8], e_tr[8];
1111*77c1e3ccSAndroid Build Coastguard Worker       transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5],
1112*77c1e3ccSAndroid Build Coastguard Worker                               d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2],
1113*77c1e3ccSAndroid Build Coastguard Worker                               &s_tr[3]);
1114*77c1e3ccSAndroid Build Coastguard Worker       transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros,
1115*77c1e3ccSAndroid Build Coastguard Worker                               zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6],
1116*77c1e3ccSAndroid Build Coastguard Worker                               &s_tr[7]);
1117*77c1e3ccSAndroid Build Coastguard Worker 
1118*77c1e3ccSAndroid Build Coastguard Worker       transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5],
1119*77c1e3ccSAndroid Build Coastguard Worker                               d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2],
1120*77c1e3ccSAndroid Build Coastguard Worker                               &e_tr[3]);
1121*77c1e3ccSAndroid Build Coastguard Worker       transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros,
1122*77c1e3ccSAndroid Build Coastguard Worker                               zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6],
1123*77c1e3ccSAndroid Build Coastguard Worker                               &e_tr[7]);
1124*77c1e3ccSAndroid Build Coastguard Worker 
1125*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5];
1126*77c1e3ccSAndroid Build Coastguard Worker       start_col0[0] = s_tr[0];
1127*77c1e3ccSAndroid Build Coastguard Worker       start_col0[1] = vextq_s16(s_tr[0], s_tr[4], 1);
1128*77c1e3ccSAndroid Build Coastguard Worker       start_col0[2] = vextq_s16(s_tr[0], s_tr[4], 2);
1129*77c1e3ccSAndroid Build Coastguard Worker       start_col0[3] = vextq_s16(s_tr[0], s_tr[4], 3);
1130*77c1e3ccSAndroid Build Coastguard Worker       start_col0[4] = vextq_s16(s_tr[0], s_tr[4], 4);
1131*77c1e3ccSAndroid Build Coastguard Worker 
1132*77c1e3ccSAndroid Build Coastguard Worker       start_col1[0] = s_tr[1];
1133*77c1e3ccSAndroid Build Coastguard Worker       start_col1[1] = vextq_s16(s_tr[1], s_tr[5], 1);
1134*77c1e3ccSAndroid Build Coastguard Worker       start_col1[2] = vextq_s16(s_tr[1], s_tr[5], 2);
1135*77c1e3ccSAndroid Build Coastguard Worker       start_col1[3] = vextq_s16(s_tr[1], s_tr[5], 3);
1136*77c1e3ccSAndroid Build Coastguard Worker       start_col1[4] = vextq_s16(s_tr[1], s_tr[5], 4);
1137*77c1e3ccSAndroid Build Coastguard Worker 
1138*77c1e3ccSAndroid Build Coastguard Worker       start_col2[0] = s_tr[2];
1139*77c1e3ccSAndroid Build Coastguard Worker       start_col2[1] = vextq_s16(s_tr[2], s_tr[6], 1);
1140*77c1e3ccSAndroid Build Coastguard Worker       start_col2[2] = vextq_s16(s_tr[2], s_tr[6], 2);
1141*77c1e3ccSAndroid Build Coastguard Worker       start_col2[3] = vextq_s16(s_tr[2], s_tr[6], 3);
1142*77c1e3ccSAndroid Build Coastguard Worker       start_col2[4] = vextq_s16(s_tr[2], s_tr[6], 4);
1143*77c1e3ccSAndroid Build Coastguard Worker 
1144*77c1e3ccSAndroid Build Coastguard Worker       start_col3[0] = s_tr[3];
1145*77c1e3ccSAndroid Build Coastguard Worker       start_col3[1] = vextq_s16(s_tr[3], s_tr[7], 1);
1146*77c1e3ccSAndroid Build Coastguard Worker       start_col3[2] = vextq_s16(s_tr[3], s_tr[7], 2);
1147*77c1e3ccSAndroid Build Coastguard Worker       start_col3[3] = vextq_s16(s_tr[3], s_tr[7], 3);
1148*77c1e3ccSAndroid Build Coastguard Worker       start_col3[4] = vextq_s16(s_tr[3], s_tr[7], 4);
1149*77c1e3ccSAndroid Build Coastguard Worker 
1150*77c1e3ccSAndroid Build Coastguard Worker       // i = 1, j = 2;
1151*77c1e3ccSAndroid Build Coastguard Worker       sub_deltas_step4(start_col0, start_col1, deltas[0]);
1152*77c1e3ccSAndroid Build Coastguard Worker 
1153*77c1e3ccSAndroid Build Coastguard Worker       // i = 1, j = 3;
1154*77c1e3ccSAndroid Build Coastguard Worker       sub_deltas_step4(start_col0, start_col2, deltas[1]);
1155*77c1e3ccSAndroid Build Coastguard Worker 
1156*77c1e3ccSAndroid Build Coastguard Worker       // i = 1, j = 4
1157*77c1e3ccSAndroid Build Coastguard Worker       sub_deltas_step4(start_col0, start_col3, deltas[2]);
1158*77c1e3ccSAndroid Build Coastguard Worker 
1159*77c1e3ccSAndroid Build Coastguard Worker       // i = 2, j =3
1160*77c1e3ccSAndroid Build Coastguard Worker       sub_deltas_step4(start_col1, start_col2, deltas[3]);
1161*77c1e3ccSAndroid Build Coastguard Worker 
1162*77c1e3ccSAndroid Build Coastguard Worker       // i = 2, j = 4
1163*77c1e3ccSAndroid Build Coastguard Worker       sub_deltas_step4(start_col1, start_col3, deltas[4]);
1164*77c1e3ccSAndroid Build Coastguard Worker 
1165*77c1e3ccSAndroid Build Coastguard Worker       // i = 3, j = 4
1166*77c1e3ccSAndroid Build Coastguard Worker       sub_deltas_step4(start_col2, start_col3, deltas[5]);
1167*77c1e3ccSAndroid Build Coastguard Worker 
1168*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5];
1169*77c1e3ccSAndroid Build Coastguard Worker       end_col0[0] = e_tr[0];
1170*77c1e3ccSAndroid Build Coastguard Worker       end_col0[1] = vextq_s16(e_tr[0], e_tr[4], 1);
1171*77c1e3ccSAndroid Build Coastguard Worker       end_col0[2] = vextq_s16(e_tr[0], e_tr[4], 2);
1172*77c1e3ccSAndroid Build Coastguard Worker       end_col0[3] = vextq_s16(e_tr[0], e_tr[4], 3);
1173*77c1e3ccSAndroid Build Coastguard Worker       end_col0[4] = vextq_s16(e_tr[0], e_tr[4], 4);
1174*77c1e3ccSAndroid Build Coastguard Worker 
1175*77c1e3ccSAndroid Build Coastguard Worker       end_col1[0] = e_tr[1];
1176*77c1e3ccSAndroid Build Coastguard Worker       end_col1[1] = vextq_s16(e_tr[1], e_tr[5], 1);
1177*77c1e3ccSAndroid Build Coastguard Worker       end_col1[2] = vextq_s16(e_tr[1], e_tr[5], 2);
1178*77c1e3ccSAndroid Build Coastguard Worker       end_col1[3] = vextq_s16(e_tr[1], e_tr[5], 3);
1179*77c1e3ccSAndroid Build Coastguard Worker       end_col1[4] = vextq_s16(e_tr[1], e_tr[5], 4);
1180*77c1e3ccSAndroid Build Coastguard Worker 
1181*77c1e3ccSAndroid Build Coastguard Worker       end_col2[0] = e_tr[2];
1182*77c1e3ccSAndroid Build Coastguard Worker       end_col2[1] = vextq_s16(e_tr[2], e_tr[6], 1);
1183*77c1e3ccSAndroid Build Coastguard Worker       end_col2[2] = vextq_s16(e_tr[2], e_tr[6], 2);
1184*77c1e3ccSAndroid Build Coastguard Worker       end_col2[3] = vextq_s16(e_tr[2], e_tr[6], 3);
1185*77c1e3ccSAndroid Build Coastguard Worker       end_col2[4] = vextq_s16(e_tr[2], e_tr[6], 4);
1186*77c1e3ccSAndroid Build Coastguard Worker 
1187*77c1e3ccSAndroid Build Coastguard Worker       end_col3[0] = e_tr[3];
1188*77c1e3ccSAndroid Build Coastguard Worker       end_col3[1] = vextq_s16(e_tr[3], e_tr[7], 1);
1189*77c1e3ccSAndroid Build Coastguard Worker       end_col3[2] = vextq_s16(e_tr[3], e_tr[7], 2);
1190*77c1e3ccSAndroid Build Coastguard Worker       end_col3[3] = vextq_s16(e_tr[3], e_tr[7], 3);
1191*77c1e3ccSAndroid Build Coastguard Worker       end_col3[4] = vextq_s16(e_tr[3], e_tr[7], 4);
1192*77c1e3ccSAndroid Build Coastguard Worker 
1193*77c1e3ccSAndroid Build Coastguard Worker       // i = 1, j = 2;
1194*77c1e3ccSAndroid Build Coastguard Worker       add_deltas_step4(end_col0, end_col1, deltas[0]);
1195*77c1e3ccSAndroid Build Coastguard Worker 
1196*77c1e3ccSAndroid Build Coastguard Worker       // i = 1, j = 3;
1197*77c1e3ccSAndroid Build Coastguard Worker       add_deltas_step4(end_col0, end_col2, deltas[1]);
1198*77c1e3ccSAndroid Build Coastguard Worker 
1199*77c1e3ccSAndroid Build Coastguard Worker       // i = 1, j = 4
1200*77c1e3ccSAndroid Build Coastguard Worker       add_deltas_step4(end_col0, end_col3, deltas[2]);
1201*77c1e3ccSAndroid Build Coastguard Worker 
1202*77c1e3ccSAndroid Build Coastguard Worker       // i = 2, j =3
1203*77c1e3ccSAndroid Build Coastguard Worker       add_deltas_step4(end_col1, end_col2, deltas[3]);
1204*77c1e3ccSAndroid Build Coastguard Worker 
1205*77c1e3ccSAndroid Build Coastguard Worker       // i = 2, j = 4
1206*77c1e3ccSAndroid Build Coastguard Worker       add_deltas_step4(end_col1, end_col3, deltas[4]);
1207*77c1e3ccSAndroid Build Coastguard Worker 
1208*77c1e3ccSAndroid Build Coastguard Worker       // i = 3, j = 4
1209*77c1e3ccSAndroid Build Coastguard Worker       add_deltas_step4(end_col2, end_col3, deltas[5]);
1210*77c1e3ccSAndroid Build Coastguard Worker 
1211*77c1e3ccSAndroid Build Coastguard Worker       d_s[0] = d_s[8];
1212*77c1e3ccSAndroid Build Coastguard Worker       d_s[1] = d_s[9];
1213*77c1e3ccSAndroid Build Coastguard Worker       d_s[2] = d_s[10];
1214*77c1e3ccSAndroid Build Coastguard Worker       d_s[3] = d_s[11];
1215*77c1e3ccSAndroid Build Coastguard Worker       d_e[0] = d_e[8];
1216*77c1e3ccSAndroid Build Coastguard Worker       d_e[1] = d_e[9];
1217*77c1e3ccSAndroid Build Coastguard Worker       d_e[2] = d_e[10];
1218*77c1e3ccSAndroid Build Coastguard Worker       d_e[3] = d_e[11];
1219*77c1e3ccSAndroid Build Coastguard Worker 
1220*77c1e3ccSAndroid Build Coastguard Worker       d_t += 8 * d_stride;
1221*77c1e3ccSAndroid Build Coastguard Worker       y -= 8;
1222*77c1e3ccSAndroid Build Coastguard Worker     }
1223*77c1e3ccSAndroid Build Coastguard Worker 
1224*77c1e3ccSAndroid Build Coastguard Worker     if (h8 != height) {
1225*77c1e3ccSAndroid Build Coastguard Worker       const int16x8_t mask_h = vld1q_s16(&mask_16bit[16] - (height % 8));
1226*77c1e3ccSAndroid Build Coastguard Worker 
1227*77c1e3ccSAndroid Build Coastguard Worker       load_s16_4x8(d_t + 4 * d_stride, d_stride, &d_s[4], &d_s[5], &d_s[6],
1228*77c1e3ccSAndroid Build Coastguard Worker                    &d_s[7], &d_s[8], &d_s[9], &d_s[10], &d_s[11]);
1229*77c1e3ccSAndroid Build Coastguard Worker       load_s16_4x8(d_t + width + 4 * d_stride, d_stride, &d_e[4], &d_e[5],
1230*77c1e3ccSAndroid Build Coastguard Worker                    &d_e[6], &d_e[7], &d_e[8], &d_e[9], &d_e[10], &d_e[11]);
1231*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t s_tr[8], e_tr[8];
1232*77c1e3ccSAndroid Build Coastguard Worker       transpose_elems_s16_4x8(d_s[0], d_s[1], d_s[2], d_s[3], d_s[4], d_s[5],
1233*77c1e3ccSAndroid Build Coastguard Worker                               d_s[6], d_s[7], &s_tr[0], &s_tr[1], &s_tr[2],
1234*77c1e3ccSAndroid Build Coastguard Worker                               &s_tr[3]);
1235*77c1e3ccSAndroid Build Coastguard Worker       transpose_elems_s16_4x8(d_s[8], d_s[9], d_s[10], d_s[11], zeros, zeros,
1236*77c1e3ccSAndroid Build Coastguard Worker                               zeros, zeros, &s_tr[4], &s_tr[5], &s_tr[6],
1237*77c1e3ccSAndroid Build Coastguard Worker                               &s_tr[7]);
1238*77c1e3ccSAndroid Build Coastguard Worker       transpose_elems_s16_4x8(d_e[0], d_e[1], d_e[2], d_e[3], d_e[4], d_e[5],
1239*77c1e3ccSAndroid Build Coastguard Worker                               d_e[6], d_e[7], &e_tr[0], &e_tr[1], &e_tr[2],
1240*77c1e3ccSAndroid Build Coastguard Worker                               &e_tr[3]);
1241*77c1e3ccSAndroid Build Coastguard Worker       transpose_elems_s16_4x8(d_e[8], d_e[9], d_e[10], d_e[11], zeros, zeros,
1242*77c1e3ccSAndroid Build Coastguard Worker                               zeros, zeros, &e_tr[4], &e_tr[5], &e_tr[6],
1243*77c1e3ccSAndroid Build Coastguard Worker                               &e_tr[7]);
1244*77c1e3ccSAndroid Build Coastguard Worker 
1245*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t start_col0[5], start_col1[5], start_col2[5], start_col3[5];
1246*77c1e3ccSAndroid Build Coastguard Worker       start_col0[0] = vandq_s16(s_tr[0], mask_h);
1247*77c1e3ccSAndroid Build Coastguard Worker       start_col0[1] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 1), mask_h);
1248*77c1e3ccSAndroid Build Coastguard Worker       start_col0[2] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 2), mask_h);
1249*77c1e3ccSAndroid Build Coastguard Worker       start_col0[3] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 3), mask_h);
1250*77c1e3ccSAndroid Build Coastguard Worker       start_col0[4] = vandq_s16(vextq_s16(s_tr[0], s_tr[4], 4), mask_h);
1251*77c1e3ccSAndroid Build Coastguard Worker 
1252*77c1e3ccSAndroid Build Coastguard Worker       start_col1[0] = vandq_s16(s_tr[1], mask_h);
1253*77c1e3ccSAndroid Build Coastguard Worker       start_col1[1] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 1), mask_h);
1254*77c1e3ccSAndroid Build Coastguard Worker       start_col1[2] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 2), mask_h);
1255*77c1e3ccSAndroid Build Coastguard Worker       start_col1[3] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 3), mask_h);
1256*77c1e3ccSAndroid Build Coastguard Worker       start_col1[4] = vandq_s16(vextq_s16(s_tr[1], s_tr[5], 4), mask_h);
1257*77c1e3ccSAndroid Build Coastguard Worker 
1258*77c1e3ccSAndroid Build Coastguard Worker       start_col2[0] = vandq_s16(s_tr[2], mask_h);
1259*77c1e3ccSAndroid Build Coastguard Worker       start_col2[1] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 1), mask_h);
1260*77c1e3ccSAndroid Build Coastguard Worker       start_col2[2] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 2), mask_h);
1261*77c1e3ccSAndroid Build Coastguard Worker       start_col2[3] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 3), mask_h);
1262*77c1e3ccSAndroid Build Coastguard Worker       start_col2[4] = vandq_s16(vextq_s16(s_tr[2], s_tr[6], 4), mask_h);
1263*77c1e3ccSAndroid Build Coastguard Worker 
1264*77c1e3ccSAndroid Build Coastguard Worker       start_col3[0] = vandq_s16(s_tr[3], mask_h);
1265*77c1e3ccSAndroid Build Coastguard Worker       start_col3[1] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 1), mask_h);
1266*77c1e3ccSAndroid Build Coastguard Worker       start_col3[2] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 2), mask_h);
1267*77c1e3ccSAndroid Build Coastguard Worker       start_col3[3] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 3), mask_h);
1268*77c1e3ccSAndroid Build Coastguard Worker       start_col3[4] = vandq_s16(vextq_s16(s_tr[3], s_tr[7], 4), mask_h);
1269*77c1e3ccSAndroid Build Coastguard Worker 
1270*77c1e3ccSAndroid Build Coastguard Worker       // i = 1, j = 2;
1271*77c1e3ccSAndroid Build Coastguard Worker       sub_deltas_step4(start_col0, start_col1, deltas[0]);
1272*77c1e3ccSAndroid Build Coastguard Worker 
1273*77c1e3ccSAndroid Build Coastguard Worker       // i = 1, j = 3;
1274*77c1e3ccSAndroid Build Coastguard Worker       sub_deltas_step4(start_col0, start_col2, deltas[1]);
1275*77c1e3ccSAndroid Build Coastguard Worker 
1276*77c1e3ccSAndroid Build Coastguard Worker       // i = 1, j = 4
1277*77c1e3ccSAndroid Build Coastguard Worker       sub_deltas_step4(start_col0, start_col3, deltas[2]);
1278*77c1e3ccSAndroid Build Coastguard Worker 
1279*77c1e3ccSAndroid Build Coastguard Worker       // i = 2, j = 3
1280*77c1e3ccSAndroid Build Coastguard Worker       sub_deltas_step4(start_col1, start_col2, deltas[3]);
1281*77c1e3ccSAndroid Build Coastguard Worker 
1282*77c1e3ccSAndroid Build Coastguard Worker       // i = 2, j = 4
1283*77c1e3ccSAndroid Build Coastguard Worker       sub_deltas_step4(start_col1, start_col3, deltas[4]);
1284*77c1e3ccSAndroid Build Coastguard Worker 
1285*77c1e3ccSAndroid Build Coastguard Worker       // i = 3, j = 4
1286*77c1e3ccSAndroid Build Coastguard Worker       sub_deltas_step4(start_col2, start_col3, deltas[5]);
1287*77c1e3ccSAndroid Build Coastguard Worker 
1288*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t end_col0[5], end_col1[5], end_col2[5], end_col3[5];
1289*77c1e3ccSAndroid Build Coastguard Worker       end_col0[0] = vandq_s16(e_tr[0], mask_h);
1290*77c1e3ccSAndroid Build Coastguard Worker       end_col0[1] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 1), mask_h);
1291*77c1e3ccSAndroid Build Coastguard Worker       end_col0[2] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 2), mask_h);
1292*77c1e3ccSAndroid Build Coastguard Worker       end_col0[3] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 3), mask_h);
1293*77c1e3ccSAndroid Build Coastguard Worker       end_col0[4] = vandq_s16(vextq_s16(e_tr[0], e_tr[4], 4), mask_h);
1294*77c1e3ccSAndroid Build Coastguard Worker 
1295*77c1e3ccSAndroid Build Coastguard Worker       end_col1[0] = vandq_s16(e_tr[1], mask_h);
1296*77c1e3ccSAndroid Build Coastguard Worker       end_col1[1] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 1), mask_h);
1297*77c1e3ccSAndroid Build Coastguard Worker       end_col1[2] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 2), mask_h);
1298*77c1e3ccSAndroid Build Coastguard Worker       end_col1[3] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 3), mask_h);
1299*77c1e3ccSAndroid Build Coastguard Worker       end_col1[4] = vandq_s16(vextq_s16(e_tr[1], e_tr[5], 4), mask_h);
1300*77c1e3ccSAndroid Build Coastguard Worker 
1301*77c1e3ccSAndroid Build Coastguard Worker       end_col2[0] = vandq_s16(e_tr[2], mask_h);
1302*77c1e3ccSAndroid Build Coastguard Worker       end_col2[1] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 1), mask_h);
1303*77c1e3ccSAndroid Build Coastguard Worker       end_col2[2] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 2), mask_h);
1304*77c1e3ccSAndroid Build Coastguard Worker       end_col2[3] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 3), mask_h);
1305*77c1e3ccSAndroid Build Coastguard Worker       end_col2[4] = vandq_s16(vextq_s16(e_tr[2], e_tr[6], 4), mask_h);
1306*77c1e3ccSAndroid Build Coastguard Worker 
1307*77c1e3ccSAndroid Build Coastguard Worker       end_col3[0] = vandq_s16(e_tr[3], mask_h);
1308*77c1e3ccSAndroid Build Coastguard Worker       end_col3[1] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 1), mask_h);
1309*77c1e3ccSAndroid Build Coastguard Worker       end_col3[2] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 2), mask_h);
1310*77c1e3ccSAndroid Build Coastguard Worker       end_col3[3] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 3), mask_h);
1311*77c1e3ccSAndroid Build Coastguard Worker       end_col3[4] = vandq_s16(vextq_s16(e_tr[3], e_tr[7], 4), mask_h);
1312*77c1e3ccSAndroid Build Coastguard Worker 
1313*77c1e3ccSAndroid Build Coastguard Worker       // i = 1, j = 2;
1314*77c1e3ccSAndroid Build Coastguard Worker       add_deltas_step4(end_col0, end_col1, deltas[0]);
1315*77c1e3ccSAndroid Build Coastguard Worker 
1316*77c1e3ccSAndroid Build Coastguard Worker       // i = 1, j = 3;
1317*77c1e3ccSAndroid Build Coastguard Worker       add_deltas_step4(end_col0, end_col2, deltas[1]);
1318*77c1e3ccSAndroid Build Coastguard Worker 
1319*77c1e3ccSAndroid Build Coastguard Worker       // i = 1, j = 4
1320*77c1e3ccSAndroid Build Coastguard Worker       add_deltas_step4(end_col0, end_col3, deltas[2]);
1321*77c1e3ccSAndroid Build Coastguard Worker 
1322*77c1e3ccSAndroid Build Coastguard Worker       // i = 2, j =3
1323*77c1e3ccSAndroid Build Coastguard Worker       add_deltas_step4(end_col1, end_col2, deltas[3]);
1324*77c1e3ccSAndroid Build Coastguard Worker 
1325*77c1e3ccSAndroid Build Coastguard Worker       // i = 2, j = 4
1326*77c1e3ccSAndroid Build Coastguard Worker       add_deltas_step4(end_col1, end_col3, deltas[4]);
1327*77c1e3ccSAndroid Build Coastguard Worker 
1328*77c1e3ccSAndroid Build Coastguard Worker       // i = 3, j = 4
1329*77c1e3ccSAndroid Build Coastguard Worker       add_deltas_step4(end_col2, end_col3, deltas[5]);
1330*77c1e3ccSAndroid Build Coastguard Worker     }
1331*77c1e3ccSAndroid Build Coastguard Worker 
1332*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t delta[6][2];
1333*77c1e3ccSAndroid Build Coastguard Worker     int32_t single_delta[6];
1334*77c1e3ccSAndroid Build Coastguard Worker 
1335*77c1e3ccSAndroid Build Coastguard Worker     delta[0][0] = horizontal_add_4d_s32x4(&deltas[0][0]);
1336*77c1e3ccSAndroid Build Coastguard Worker     delta[1][0] = horizontal_add_4d_s32x4(&deltas[1][0]);
1337*77c1e3ccSAndroid Build Coastguard Worker     delta[2][0] = horizontal_add_4d_s32x4(&deltas[2][0]);
1338*77c1e3ccSAndroid Build Coastguard Worker     delta[3][0] = horizontal_add_4d_s32x4(&deltas[3][0]);
1339*77c1e3ccSAndroid Build Coastguard Worker     delta[4][0] = horizontal_add_4d_s32x4(&deltas[4][0]);
1340*77c1e3ccSAndroid Build Coastguard Worker     delta[5][0] = horizontal_add_4d_s32x4(&deltas[5][0]);
1341*77c1e3ccSAndroid Build Coastguard Worker 
1342*77c1e3ccSAndroid Build Coastguard Worker     delta[0][1] = horizontal_add_4d_s32x4(&deltas[0][5]);
1343*77c1e3ccSAndroid Build Coastguard Worker     delta[1][1] = horizontal_add_4d_s32x4(&deltas[1][5]);
1344*77c1e3ccSAndroid Build Coastguard Worker     delta[2][1] = horizontal_add_4d_s32x4(&deltas[2][5]);
1345*77c1e3ccSAndroid Build Coastguard Worker     delta[3][1] = horizontal_add_4d_s32x4(&deltas[3][5]);
1346*77c1e3ccSAndroid Build Coastguard Worker     delta[4][1] = horizontal_add_4d_s32x4(&deltas[4][5]);
1347*77c1e3ccSAndroid Build Coastguard Worker     delta[5][1] = horizontal_add_4d_s32x4(&deltas[5][5]);
1348*77c1e3ccSAndroid Build Coastguard Worker 
1349*77c1e3ccSAndroid Build Coastguard Worker     single_delta[0] = horizontal_add_s32x4(deltas[0][4]);
1350*77c1e3ccSAndroid Build Coastguard Worker     single_delta[1] = horizontal_add_s32x4(deltas[1][4]);
1351*77c1e3ccSAndroid Build Coastguard Worker     single_delta[2] = horizontal_add_s32x4(deltas[2][4]);
1352*77c1e3ccSAndroid Build Coastguard Worker     single_delta[3] = horizontal_add_s32x4(deltas[3][4]);
1353*77c1e3ccSAndroid Build Coastguard Worker     single_delta[4] = horizontal_add_s32x4(deltas[4][4]);
1354*77c1e3ccSAndroid Build Coastguard Worker     single_delta[5] = horizontal_add_s32x4(deltas[5][4]);
1355*77c1e3ccSAndroid Build Coastguard Worker 
1356*77c1e3ccSAndroid Build Coastguard Worker     int idx = 0;
1357*77c1e3ccSAndroid Build Coastguard Worker     for (i = 1; i < wiener_win - 1; i++) {
1358*77c1e3ccSAndroid Build Coastguard Worker       for (j = i + 1; j < wiener_win; j++) {
1359*77c1e3ccSAndroid Build Coastguard Worker         update_4_stats_neon(
1360*77c1e3ccSAndroid Build Coastguard Worker             H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win,
1361*77c1e3ccSAndroid Build Coastguard Worker             delta[idx][0], H + i * wiener_win * wiener_win2 + j * wiener_win);
1362*77c1e3ccSAndroid Build Coastguard Worker         H[i * wiener_win * wiener_win2 + j * wiener_win + 4] =
1363*77c1e3ccSAndroid Build Coastguard Worker             H[(i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win + 4] +
1364*77c1e3ccSAndroid Build Coastguard Worker             single_delta[idx];
1365*77c1e3ccSAndroid Build Coastguard Worker 
1366*77c1e3ccSAndroid Build Coastguard Worker         H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
1367*77c1e3ccSAndroid Build Coastguard Worker             H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] +
1368*77c1e3ccSAndroid Build Coastguard Worker             vgetq_lane_s32(delta[idx][1], 0);
1369*77c1e3ccSAndroid Build Coastguard Worker         H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
1370*77c1e3ccSAndroid Build Coastguard Worker             H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] +
1371*77c1e3ccSAndroid Build Coastguard Worker             vgetq_lane_s32(delta[idx][1], 1);
1372*77c1e3ccSAndroid Build Coastguard Worker         H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
1373*77c1e3ccSAndroid Build Coastguard Worker             H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] +
1374*77c1e3ccSAndroid Build Coastguard Worker             vgetq_lane_s32(delta[idx][1], 2);
1375*77c1e3ccSAndroid Build Coastguard Worker         H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
1376*77c1e3ccSAndroid Build Coastguard Worker             H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] +
1377*77c1e3ccSAndroid Build Coastguard Worker             vgetq_lane_s32(delta[idx][1], 3);
1378*77c1e3ccSAndroid Build Coastguard Worker 
1379*77c1e3ccSAndroid Build Coastguard Worker         idx++;
1380*77c1e3ccSAndroid Build Coastguard Worker       }
1381*77c1e3ccSAndroid Build Coastguard Worker     }
1382*77c1e3ccSAndroid Build Coastguard Worker   }
1383*77c1e3ccSAndroid Build Coastguard Worker 
1384*77c1e3ccSAndroid Build Coastguard Worker   // Step 5: Derive other points of each square. No square in bottom row.
1385*77c1e3ccSAndroid Build Coastguard Worker   i = 0;
1386*77c1e3ccSAndroid Build Coastguard Worker   do {
1387*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *const di = d + i;
1388*77c1e3ccSAndroid Build Coastguard Worker 
1389*77c1e3ccSAndroid Build Coastguard Worker     j = i + 1;
1390*77c1e3ccSAndroid Build Coastguard Worker     do {
1391*77c1e3ccSAndroid Build Coastguard Worker       const int16_t *const dj = d + j;
1392*77c1e3ccSAndroid Build Coastguard Worker       int32x4_t deltas[WIENER_WIN_CHROMA - 1][WIENER_WIN_CHROMA - 1] = {
1393*77c1e3ccSAndroid Build Coastguard Worker         { vdupq_n_s32(0) }, { vdupq_n_s32(0) }
1394*77c1e3ccSAndroid Build Coastguard Worker       };
1395*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];
1396*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t d_js[WIN_CHROMA], d_je[WIN_CHROMA];
1397*77c1e3ccSAndroid Build Coastguard Worker 
1398*77c1e3ccSAndroid Build Coastguard Worker       x = 0;
1399*77c1e3ccSAndroid Build Coastguard Worker       while (x < w16) {
1400*77c1e3ccSAndroid Build Coastguard Worker         load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
1401*77c1e3ccSAndroid Build Coastguard Worker                               d_js, d_je);
1402*77c1e3ccSAndroid Build Coastguard Worker         derive_square_win5_neon(d_is, d_ie, d_js, d_je, deltas);
1403*77c1e3ccSAndroid Build Coastguard Worker         x += 16;
1404*77c1e3ccSAndroid Build Coastguard Worker       }
1405*77c1e3ccSAndroid Build Coastguard Worker 
1406*77c1e3ccSAndroid Build Coastguard Worker       if (w16 != width) {
1407*77c1e3ccSAndroid Build Coastguard Worker         load_square_win5_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
1408*77c1e3ccSAndroid Build Coastguard Worker                               d_js, d_je);
1409*77c1e3ccSAndroid Build Coastguard Worker         d_is[0] = vandq_s16(d_is[0], mask[0]);
1410*77c1e3ccSAndroid Build Coastguard Worker         d_is[1] = vandq_s16(d_is[1], mask[1]);
1411*77c1e3ccSAndroid Build Coastguard Worker         d_is[2] = vandq_s16(d_is[2], mask[0]);
1412*77c1e3ccSAndroid Build Coastguard Worker         d_is[3] = vandq_s16(d_is[3], mask[1]);
1413*77c1e3ccSAndroid Build Coastguard Worker         d_is[4] = vandq_s16(d_is[4], mask[0]);
1414*77c1e3ccSAndroid Build Coastguard Worker         d_is[5] = vandq_s16(d_is[5], mask[1]);
1415*77c1e3ccSAndroid Build Coastguard Worker         d_is[6] = vandq_s16(d_is[6], mask[0]);
1416*77c1e3ccSAndroid Build Coastguard Worker         d_is[7] = vandq_s16(d_is[7], mask[1]);
1417*77c1e3ccSAndroid Build Coastguard Worker         d_ie[0] = vandq_s16(d_ie[0], mask[0]);
1418*77c1e3ccSAndroid Build Coastguard Worker         d_ie[1] = vandq_s16(d_ie[1], mask[1]);
1419*77c1e3ccSAndroid Build Coastguard Worker         d_ie[2] = vandq_s16(d_ie[2], mask[0]);
1420*77c1e3ccSAndroid Build Coastguard Worker         d_ie[3] = vandq_s16(d_ie[3], mask[1]);
1421*77c1e3ccSAndroid Build Coastguard Worker         d_ie[4] = vandq_s16(d_ie[4], mask[0]);
1422*77c1e3ccSAndroid Build Coastguard Worker         d_ie[5] = vandq_s16(d_ie[5], mask[1]);
1423*77c1e3ccSAndroid Build Coastguard Worker         d_ie[6] = vandq_s16(d_ie[6], mask[0]);
1424*77c1e3ccSAndroid Build Coastguard Worker         d_ie[7] = vandq_s16(d_ie[7], mask[1]);
1425*77c1e3ccSAndroid Build Coastguard Worker         derive_square_win5_neon(d_is, d_ie, d_js, d_je, deltas);
1426*77c1e3ccSAndroid Build Coastguard Worker       }
1427*77c1e3ccSAndroid Build Coastguard Worker 
1428*77c1e3ccSAndroid Build Coastguard Worker       hadd_update_4_stats_neon(
1429*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0],
1430*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
1431*77c1e3ccSAndroid Build Coastguard Worker       hadd_update_4_stats_neon(
1432*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1],
1433*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
1434*77c1e3ccSAndroid Build Coastguard Worker       hadd_update_4_stats_neon(
1435*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2],
1436*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1);
1437*77c1e3ccSAndroid Build Coastguard Worker       hadd_update_4_stats_neon(
1438*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3],
1439*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1);
1440*77c1e3ccSAndroid Build Coastguard Worker     } while (++j < wiener_win);
1441*77c1e3ccSAndroid Build Coastguard Worker   } while (++i < wiener_win - 1);
1442*77c1e3ccSAndroid Build Coastguard Worker 
1443*77c1e3ccSAndroid Build Coastguard Worker   // Step 6: Derive other points of each upper triangle along the diagonal.
1444*77c1e3ccSAndroid Build Coastguard Worker   i = 0;
1445*77c1e3ccSAndroid Build Coastguard Worker   do {
1446*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *const di = d + i;
1447*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t deltas[WIENER_WIN_CHROMA * 2 + 1] = { vdupq_n_s32(0) };
1448*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t d_is[WIN_CHROMA], d_ie[WIN_CHROMA];
1449*77c1e3ccSAndroid Build Coastguard Worker 
1450*77c1e3ccSAndroid Build Coastguard Worker     x = 0;
1451*77c1e3ccSAndroid Build Coastguard Worker     while (x < w16) {
1452*77c1e3ccSAndroid Build Coastguard Worker       load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie);
1453*77c1e3ccSAndroid Build Coastguard Worker       derive_triangle_win5_neon(d_is, d_ie, deltas);
1454*77c1e3ccSAndroid Build Coastguard Worker       x += 16;
1455*77c1e3ccSAndroid Build Coastguard Worker     }
1456*77c1e3ccSAndroid Build Coastguard Worker 
1457*77c1e3ccSAndroid Build Coastguard Worker     if (w16 != width) {
1458*77c1e3ccSAndroid Build Coastguard Worker       load_triangle_win5_neon(di + x, d_stride, height, d_is, d_ie);
1459*77c1e3ccSAndroid Build Coastguard Worker       d_is[0] = vandq_s16(d_is[0], mask[0]);
1460*77c1e3ccSAndroid Build Coastguard Worker       d_is[1] = vandq_s16(d_is[1], mask[1]);
1461*77c1e3ccSAndroid Build Coastguard Worker       d_is[2] = vandq_s16(d_is[2], mask[0]);
1462*77c1e3ccSAndroid Build Coastguard Worker       d_is[3] = vandq_s16(d_is[3], mask[1]);
1463*77c1e3ccSAndroid Build Coastguard Worker       d_is[4] = vandq_s16(d_is[4], mask[0]);
1464*77c1e3ccSAndroid Build Coastguard Worker       d_is[5] = vandq_s16(d_is[5], mask[1]);
1465*77c1e3ccSAndroid Build Coastguard Worker       d_is[6] = vandq_s16(d_is[6], mask[0]);
1466*77c1e3ccSAndroid Build Coastguard Worker       d_is[7] = vandq_s16(d_is[7], mask[1]);
1467*77c1e3ccSAndroid Build Coastguard Worker       d_ie[0] = vandq_s16(d_ie[0], mask[0]);
1468*77c1e3ccSAndroid Build Coastguard Worker       d_ie[1] = vandq_s16(d_ie[1], mask[1]);
1469*77c1e3ccSAndroid Build Coastguard Worker       d_ie[2] = vandq_s16(d_ie[2], mask[0]);
1470*77c1e3ccSAndroid Build Coastguard Worker       d_ie[3] = vandq_s16(d_ie[3], mask[1]);
1471*77c1e3ccSAndroid Build Coastguard Worker       d_ie[4] = vandq_s16(d_ie[4], mask[0]);
1472*77c1e3ccSAndroid Build Coastguard Worker       d_ie[5] = vandq_s16(d_ie[5], mask[1]);
1473*77c1e3ccSAndroid Build Coastguard Worker       d_ie[6] = vandq_s16(d_ie[6], mask[0]);
1474*77c1e3ccSAndroid Build Coastguard Worker       d_ie[7] = vandq_s16(d_ie[7], mask[1]);
1475*77c1e3ccSAndroid Build Coastguard Worker       derive_triangle_win5_neon(d_is, d_ie, deltas);
1476*77c1e3ccSAndroid Build Coastguard Worker     }
1477*77c1e3ccSAndroid Build Coastguard Worker 
1478*77c1e3ccSAndroid Build Coastguard Worker     // Row 1: 4 points
1479*77c1e3ccSAndroid Build Coastguard Worker     hadd_update_4_stats_neon(
1480*77c1e3ccSAndroid Build Coastguard Worker         H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas,
1481*77c1e3ccSAndroid Build Coastguard Worker         H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
1482*77c1e3ccSAndroid Build Coastguard Worker 
1483*77c1e3ccSAndroid Build Coastguard Worker     // Row 2: 3 points
1484*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t deltas45 = horizontal_add_2d_s32(deltas[4], deltas[5]);
1485*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t deltas78 = horizontal_add_2d_s32(deltas[7], deltas[8]);
1486*77c1e3ccSAndroid Build Coastguard Worker 
1487*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t deltas45_s64 = vpaddlq_s32(deltas45);
1488*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t deltas78_s64 = vpaddlq_s32(deltas78);
1489*77c1e3ccSAndroid Build Coastguard Worker 
1490*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t src =
1491*77c1e3ccSAndroid Build Coastguard Worker         vld1q_s64(H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
1492*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t dst = vaddq_s64(src, deltas45_s64);
1493*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2, dst);
1494*77c1e3ccSAndroid Build Coastguard Worker 
1495*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t delta69 = horizontal_add_2d_s32(deltas[6], deltas[9]);
1496*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t delta69_s64 = vpaddlq_s32(delta69);
1497*77c1e3ccSAndroid Build Coastguard Worker     H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 4] =
1498*77c1e3ccSAndroid Build Coastguard Worker         H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 3] +
1499*77c1e3ccSAndroid Build Coastguard Worker         vgetq_lane_s64(delta69_s64, 0);
1500*77c1e3ccSAndroid Build Coastguard Worker 
1501*77c1e3ccSAndroid Build Coastguard Worker     // Row 3: 2 points
1502*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3,
1503*77c1e3ccSAndroid Build Coastguard Worker               vaddq_s64(dst, deltas78_s64));
1504*77c1e3ccSAndroid Build Coastguard Worker 
1505*77c1e3ccSAndroid Build Coastguard Worker     // Row 4: 1 point
1506*77c1e3ccSAndroid Build Coastguard Worker     H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4] =
1507*77c1e3ccSAndroid Build Coastguard Worker         H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3] +
1508*77c1e3ccSAndroid Build Coastguard Worker         vgetq_lane_s64(delta69_s64, 1);
1509*77c1e3ccSAndroid Build Coastguard Worker   } while (++i < wiener_win);
1510*77c1e3ccSAndroid Build Coastguard Worker }
1511*77c1e3ccSAndroid Build Coastguard Worker 
compute_stats_win7_neon(const int16_t * const d,const int32_t d_stride,const int16_t * const s,const int32_t s_stride,const int32_t width,const int32_t height,int64_t * const M,int64_t * const H)1512*77c1e3ccSAndroid Build Coastguard Worker static inline void compute_stats_win7_neon(
1513*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *const d, const int32_t d_stride, const int16_t *const s,
1514*77c1e3ccSAndroid Build Coastguard Worker     const int32_t s_stride, const int32_t width, const int32_t height,
1515*77c1e3ccSAndroid Build Coastguard Worker     int64_t *const M, int64_t *const H) {
1516*77c1e3ccSAndroid Build Coastguard Worker   const int32_t wiener_win = WIENER_WIN;
1517*77c1e3ccSAndroid Build Coastguard Worker   const int32_t wiener_win2 = wiener_win * wiener_win;
1518*77c1e3ccSAndroid Build Coastguard Worker   const int32_t w16 = width & ~15;
1519*77c1e3ccSAndroid Build Coastguard Worker   const int32_t h8 = height & ~7;
1520*77c1e3ccSAndroid Build Coastguard Worker   int16x8_t mask[2];
1521*77c1e3ccSAndroid Build Coastguard Worker   mask[0] = vld1q_s16(&(mask_16bit[16]) - width % 16);
1522*77c1e3ccSAndroid Build Coastguard Worker   mask[1] = vld1q_s16(&(mask_16bit[16]) - width % 16 + 8);
1523*77c1e3ccSAndroid Build Coastguard Worker   const int bit_depth = 8;
1524*77c1e3ccSAndroid Build Coastguard Worker   int32_t i, j, x, y;
1525*77c1e3ccSAndroid Build Coastguard Worker 
1526*77c1e3ccSAndroid Build Coastguard Worker   const int32_t num_bit_left =
1527*77c1e3ccSAndroid Build Coastguard Worker       32 - 1 /* sign */ - 2 * bit_depth /* energy */ + 2 /* SIMD */;
1528*77c1e3ccSAndroid Build Coastguard Worker   const int32_t h_allowed =
1529*77c1e3ccSAndroid Build Coastguard Worker       (1 << num_bit_left) / (w16 + ((w16 != width) ? 16 : 0));
1530*77c1e3ccSAndroid Build Coastguard Worker 
1531*77c1e3ccSAndroid Build Coastguard Worker   // Step 1: Calculate the top edge of the whole matrix, i.e., the top
1532*77c1e3ccSAndroid Build Coastguard Worker   // edge of each triangle and square on the top row.
1533*77c1e3ccSAndroid Build Coastguard Worker   j = 0;
1534*77c1e3ccSAndroid Build Coastguard Worker   do {
1535*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *s_t = s;
1536*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *d_t = d;
1537*77c1e3ccSAndroid Build Coastguard Worker     int32_t height_t = 0;
1538*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t sum_m[WIENER_WIN] = { vdupq_n_s64(0) };
1539*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t sum_h[WIENER_WIN] = { vdupq_n_s64(0) };
1540*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t src[2], dgd[2];
1541*77c1e3ccSAndroid Build Coastguard Worker 
1542*77c1e3ccSAndroid Build Coastguard Worker     do {
1543*77c1e3ccSAndroid Build Coastguard Worker       const int32_t h_t =
1544*77c1e3ccSAndroid Build Coastguard Worker           ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed;
1545*77c1e3ccSAndroid Build Coastguard Worker       int32x4_t row_m[WIENER_WIN * 2] = { vdupq_n_s32(0) };
1546*77c1e3ccSAndroid Build Coastguard Worker       int32x4_t row_h[WIENER_WIN * 2] = { vdupq_n_s32(0) };
1547*77c1e3ccSAndroid Build Coastguard Worker 
1548*77c1e3ccSAndroid Build Coastguard Worker       y = h_t;
1549*77c1e3ccSAndroid Build Coastguard Worker       do {
1550*77c1e3ccSAndroid Build Coastguard Worker         x = 0;
1551*77c1e3ccSAndroid Build Coastguard Worker         while (x < w16) {
1552*77c1e3ccSAndroid Build Coastguard Worker           src[0] = vld1q_s16(s_t + x);
1553*77c1e3ccSAndroid Build Coastguard Worker           src[1] = vld1q_s16(s_t + x + 8);
1554*77c1e3ccSAndroid Build Coastguard Worker           dgd[0] = vld1q_s16(d_t + x);
1555*77c1e3ccSAndroid Build Coastguard Worker           dgd[1] = vld1q_s16(d_t + x + 8);
1556*77c1e3ccSAndroid Build Coastguard Worker           stats_top_win7_neon(src, dgd, d_t + j + x, d_stride, row_m, row_h);
1557*77c1e3ccSAndroid Build Coastguard Worker           x += 16;
1558*77c1e3ccSAndroid Build Coastguard Worker         }
1559*77c1e3ccSAndroid Build Coastguard Worker 
1560*77c1e3ccSAndroid Build Coastguard Worker         if (w16 != width) {
1561*77c1e3ccSAndroid Build Coastguard Worker           src[0] = vld1q_s16(s_t + w16);
1562*77c1e3ccSAndroid Build Coastguard Worker           src[1] = vld1q_s16(s_t + w16 + 8);
1563*77c1e3ccSAndroid Build Coastguard Worker           dgd[0] = vld1q_s16(d_t + w16);
1564*77c1e3ccSAndroid Build Coastguard Worker           dgd[1] = vld1q_s16(d_t + w16 + 8);
1565*77c1e3ccSAndroid Build Coastguard Worker           src[0] = vandq_s16(src[0], mask[0]);
1566*77c1e3ccSAndroid Build Coastguard Worker           src[1] = vandq_s16(src[1], mask[1]);
1567*77c1e3ccSAndroid Build Coastguard Worker           dgd[0] = vandq_s16(dgd[0], mask[0]);
1568*77c1e3ccSAndroid Build Coastguard Worker           dgd[1] = vandq_s16(dgd[1], mask[1]);
1569*77c1e3ccSAndroid Build Coastguard Worker           stats_top_win7_neon(src, dgd, d_t + j + w16, d_stride, row_m, row_h);
1570*77c1e3ccSAndroid Build Coastguard Worker         }
1571*77c1e3ccSAndroid Build Coastguard Worker 
1572*77c1e3ccSAndroid Build Coastguard Worker         s_t += s_stride;
1573*77c1e3ccSAndroid Build Coastguard Worker         d_t += d_stride;
1574*77c1e3ccSAndroid Build Coastguard Worker       } while (--y);
1575*77c1e3ccSAndroid Build Coastguard Worker 
1576*77c1e3ccSAndroid Build Coastguard Worker       sum_m[0] = vpadalq_s32(sum_m[0], row_m[0]);
1577*77c1e3ccSAndroid Build Coastguard Worker       sum_m[1] = vpadalq_s32(sum_m[1], row_m[1]);
1578*77c1e3ccSAndroid Build Coastguard Worker       sum_m[2] = vpadalq_s32(sum_m[2], row_m[2]);
1579*77c1e3ccSAndroid Build Coastguard Worker       sum_m[3] = vpadalq_s32(sum_m[3], row_m[3]);
1580*77c1e3ccSAndroid Build Coastguard Worker       sum_m[4] = vpadalq_s32(sum_m[4], row_m[4]);
1581*77c1e3ccSAndroid Build Coastguard Worker       sum_m[5] = vpadalq_s32(sum_m[5], row_m[5]);
1582*77c1e3ccSAndroid Build Coastguard Worker       sum_m[6] = vpadalq_s32(sum_m[6], row_m[6]);
1583*77c1e3ccSAndroid Build Coastguard Worker 
1584*77c1e3ccSAndroid Build Coastguard Worker       sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]);
1585*77c1e3ccSAndroid Build Coastguard Worker       sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]);
1586*77c1e3ccSAndroid Build Coastguard Worker       sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]);
1587*77c1e3ccSAndroid Build Coastguard Worker       sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]);
1588*77c1e3ccSAndroid Build Coastguard Worker       sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]);
1589*77c1e3ccSAndroid Build Coastguard Worker       sum_h[5] = vpadalq_s32(sum_h[5], row_h[5]);
1590*77c1e3ccSAndroid Build Coastguard Worker       sum_h[6] = vpadalq_s32(sum_h[6], row_h[6]);
1591*77c1e3ccSAndroid Build Coastguard Worker 
1592*77c1e3ccSAndroid Build Coastguard Worker       height_t += h_t;
1593*77c1e3ccSAndroid Build Coastguard Worker     } while (height_t < height);
1594*77c1e3ccSAndroid Build Coastguard Worker 
1595*77c1e3ccSAndroid Build Coastguard Worker #if AOM_ARCH_AARCH64
1596*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(M + wiener_win * j + 0, vpaddq_s64(sum_m[0], sum_m[1]));
1597*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(M + wiener_win * j + 2, vpaddq_s64(sum_m[2], sum_m[3]));
1598*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(M + wiener_win * j + 4, vpaddq_s64(sum_m[4], sum_m[5]));
1599*77c1e3ccSAndroid Build Coastguard Worker     M[wiener_win * j + 6] = vaddvq_s64(sum_m[6]);
1600*77c1e3ccSAndroid Build Coastguard Worker 
1601*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + wiener_win * j + 0, vpaddq_s64(sum_h[0], sum_h[1]));
1602*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + wiener_win * j + 2, vpaddq_s64(sum_h[2], sum_h[3]));
1603*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + wiener_win * j + 4, vpaddq_s64(sum_h[4], sum_h[5]));
1604*77c1e3ccSAndroid Build Coastguard Worker     H[wiener_win * j + 6] = vaddvq_s64(sum_h[6]);
1605*77c1e3ccSAndroid Build Coastguard Worker #else
1606*77c1e3ccSAndroid Build Coastguard Worker     M[wiener_win * j + 0] = horizontal_add_s64x2(sum_m[0]);
1607*77c1e3ccSAndroid Build Coastguard Worker     M[wiener_win * j + 1] = horizontal_add_s64x2(sum_m[1]);
1608*77c1e3ccSAndroid Build Coastguard Worker     M[wiener_win * j + 2] = horizontal_add_s64x2(sum_m[2]);
1609*77c1e3ccSAndroid Build Coastguard Worker     M[wiener_win * j + 3] = horizontal_add_s64x2(sum_m[3]);
1610*77c1e3ccSAndroid Build Coastguard Worker     M[wiener_win * j + 4] = horizontal_add_s64x2(sum_m[4]);
1611*77c1e3ccSAndroid Build Coastguard Worker     M[wiener_win * j + 5] = horizontal_add_s64x2(sum_m[5]);
1612*77c1e3ccSAndroid Build Coastguard Worker     M[wiener_win * j + 6] = horizontal_add_s64x2(sum_m[6]);
1613*77c1e3ccSAndroid Build Coastguard Worker 
1614*77c1e3ccSAndroid Build Coastguard Worker     H[wiener_win * j + 0] = horizontal_add_s64x2(sum_h[0]);
1615*77c1e3ccSAndroid Build Coastguard Worker     H[wiener_win * j + 1] = horizontal_add_s64x2(sum_h[1]);
1616*77c1e3ccSAndroid Build Coastguard Worker     H[wiener_win * j + 2] = horizontal_add_s64x2(sum_h[2]);
1617*77c1e3ccSAndroid Build Coastguard Worker     H[wiener_win * j + 3] = horizontal_add_s64x2(sum_h[3]);
1618*77c1e3ccSAndroid Build Coastguard Worker     H[wiener_win * j + 4] = horizontal_add_s64x2(sum_h[4]);
1619*77c1e3ccSAndroid Build Coastguard Worker     H[wiener_win * j + 5] = horizontal_add_s64x2(sum_h[5]);
1620*77c1e3ccSAndroid Build Coastguard Worker     H[wiener_win * j + 6] = horizontal_add_s64x2(sum_h[6]);
1621*77c1e3ccSAndroid Build Coastguard Worker #endif  // AOM_ARCH_AARCH64
1622*77c1e3ccSAndroid Build Coastguard Worker   } while (++j < wiener_win);
1623*77c1e3ccSAndroid Build Coastguard Worker 
1624*77c1e3ccSAndroid Build Coastguard Worker   // Step 2: Calculate the left edge of each square on the top row.
1625*77c1e3ccSAndroid Build Coastguard Worker   j = 1;
1626*77c1e3ccSAndroid Build Coastguard Worker   do {
1627*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *d_t = d;
1628*77c1e3ccSAndroid Build Coastguard Worker     int32_t height_t = 0;
1629*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t sum_h[WIENER_WIN - 1] = { vdupq_n_s64(0) };
1630*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t dgd[2];
1631*77c1e3ccSAndroid Build Coastguard Worker 
1632*77c1e3ccSAndroid Build Coastguard Worker     do {
1633*77c1e3ccSAndroid Build Coastguard Worker       const int32_t h_t =
1634*77c1e3ccSAndroid Build Coastguard Worker           ((height - height_t) < h_allowed) ? (height - height_t) : h_allowed;
1635*77c1e3ccSAndroid Build Coastguard Worker       int32x4_t row_h[WIENER_WIN - 1] = { vdupq_n_s32(0) };
1636*77c1e3ccSAndroid Build Coastguard Worker 
1637*77c1e3ccSAndroid Build Coastguard Worker       y = h_t;
1638*77c1e3ccSAndroid Build Coastguard Worker       do {
1639*77c1e3ccSAndroid Build Coastguard Worker         x = 0;
1640*77c1e3ccSAndroid Build Coastguard Worker         while (x < w16) {
1641*77c1e3ccSAndroid Build Coastguard Worker           dgd[0] = vld1q_s16(d_t + j + x + 0);
1642*77c1e3ccSAndroid Build Coastguard Worker           dgd[1] = vld1q_s16(d_t + j + x + 8);
1643*77c1e3ccSAndroid Build Coastguard Worker           stats_left_win7_neon(dgd, d_t + x, d_stride, row_h);
1644*77c1e3ccSAndroid Build Coastguard Worker           x += 16;
1645*77c1e3ccSAndroid Build Coastguard Worker         }
1646*77c1e3ccSAndroid Build Coastguard Worker 
1647*77c1e3ccSAndroid Build Coastguard Worker         if (w16 != width) {
1648*77c1e3ccSAndroid Build Coastguard Worker           dgd[0] = vld1q_s16(d_t + j + x + 0);
1649*77c1e3ccSAndroid Build Coastguard Worker           dgd[1] = vld1q_s16(d_t + j + x + 8);
1650*77c1e3ccSAndroid Build Coastguard Worker           dgd[0] = vandq_s16(dgd[0], mask[0]);
1651*77c1e3ccSAndroid Build Coastguard Worker           dgd[1] = vandq_s16(dgd[1], mask[1]);
1652*77c1e3ccSAndroid Build Coastguard Worker           stats_left_win7_neon(dgd, d_t + x, d_stride, row_h);
1653*77c1e3ccSAndroid Build Coastguard Worker         }
1654*77c1e3ccSAndroid Build Coastguard Worker 
1655*77c1e3ccSAndroid Build Coastguard Worker         d_t += d_stride;
1656*77c1e3ccSAndroid Build Coastguard Worker       } while (--y);
1657*77c1e3ccSAndroid Build Coastguard Worker 
1658*77c1e3ccSAndroid Build Coastguard Worker       sum_h[0] = vpadalq_s32(sum_h[0], row_h[0]);
1659*77c1e3ccSAndroid Build Coastguard Worker       sum_h[1] = vpadalq_s32(sum_h[1], row_h[1]);
1660*77c1e3ccSAndroid Build Coastguard Worker       sum_h[2] = vpadalq_s32(sum_h[2], row_h[2]);
1661*77c1e3ccSAndroid Build Coastguard Worker       sum_h[3] = vpadalq_s32(sum_h[3], row_h[3]);
1662*77c1e3ccSAndroid Build Coastguard Worker       sum_h[4] = vpadalq_s32(sum_h[4], row_h[4]);
1663*77c1e3ccSAndroid Build Coastguard Worker       sum_h[5] = vpadalq_s32(sum_h[5], row_h[5]);
1664*77c1e3ccSAndroid Build Coastguard Worker 
1665*77c1e3ccSAndroid Build Coastguard Worker       height_t += h_t;
1666*77c1e3ccSAndroid Build Coastguard Worker     } while (height_t < height);
1667*77c1e3ccSAndroid Build Coastguard Worker 
1668*77c1e3ccSAndroid Build Coastguard Worker #if AOM_ARCH_AARCH64
1669*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t sum_h0 = vpaddq_s64(sum_h[0], sum_h[1]);
1670*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t sum_h2 = vpaddq_s64(sum_h[2], sum_h[3]);
1671*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t sum_h4 = vpaddq_s64(sum_h[4], sum_h[5]);
1672*77c1e3ccSAndroid Build Coastguard Worker     vst1_s64(&H[1 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h0));
1673*77c1e3ccSAndroid Build Coastguard Worker     vst1_s64(&H[2 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h0));
1674*77c1e3ccSAndroid Build Coastguard Worker     vst1_s64(&H[3 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h2));
1675*77c1e3ccSAndroid Build Coastguard Worker     vst1_s64(&H[4 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h2));
1676*77c1e3ccSAndroid Build Coastguard Worker     vst1_s64(&H[5 * wiener_win2 + j * wiener_win], vget_low_s64(sum_h4));
1677*77c1e3ccSAndroid Build Coastguard Worker     vst1_s64(&H[6 * wiener_win2 + j * wiener_win], vget_high_s64(sum_h4));
1678*77c1e3ccSAndroid Build Coastguard Worker #else
1679*77c1e3ccSAndroid Build Coastguard Worker     H[1 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[0]);
1680*77c1e3ccSAndroid Build Coastguard Worker     H[2 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[1]);
1681*77c1e3ccSAndroid Build Coastguard Worker     H[3 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[2]);
1682*77c1e3ccSAndroid Build Coastguard Worker     H[4 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[3]);
1683*77c1e3ccSAndroid Build Coastguard Worker     H[5 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[4]);
1684*77c1e3ccSAndroid Build Coastguard Worker     H[6 * wiener_win2 + j * wiener_win] = horizontal_add_s64x2(sum_h[5]);
1685*77c1e3ccSAndroid Build Coastguard Worker #endif  // AOM_ARCH_AARCH64
1686*77c1e3ccSAndroid Build Coastguard Worker   } while (++j < wiener_win);
1687*77c1e3ccSAndroid Build Coastguard Worker 
1688*77c1e3ccSAndroid Build Coastguard Worker   // Step 3: Derive the top edge of each triangle along the diagonal. No
1689*77c1e3ccSAndroid Build Coastguard Worker   // triangle in top row.
1690*77c1e3ccSAndroid Build Coastguard Worker   {
1691*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *d_t = d;
1692*77c1e3ccSAndroid Build Coastguard Worker     // Pad to (WIENER_WIN + 1) * 2 vectors for transpose_arrays_s32_8x8().
1693*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t deltas[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
1694*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t deltas_tr[(WIENER_WIN + 1) * 2] = { vdupq_n_s32(0) };
1695*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t ds[WIENER_WIN * 2];
1696*77c1e3ccSAndroid Build Coastguard Worker 
1697*77c1e3ccSAndroid Build Coastguard Worker     load_s16_8x6(d_t, d_stride, &ds[0], &ds[2], &ds[4], &ds[6], &ds[8],
1698*77c1e3ccSAndroid Build Coastguard Worker                  &ds[10]);
1699*77c1e3ccSAndroid Build Coastguard Worker     load_s16_8x6(d_t + width, d_stride, &ds[1], &ds[3], &ds[5], &ds[7], &ds[9],
1700*77c1e3ccSAndroid Build Coastguard Worker                  &ds[11]);
1701*77c1e3ccSAndroid Build Coastguard Worker 
1702*77c1e3ccSAndroid Build Coastguard Worker     d_t += 6 * d_stride;
1703*77c1e3ccSAndroid Build Coastguard Worker 
1704*77c1e3ccSAndroid Build Coastguard Worker     step3_win7_neon(d_t, d_stride, width, height, ds, deltas);
1705*77c1e3ccSAndroid Build Coastguard Worker     transpose_arrays_s32_8x8(deltas, deltas_tr);
1706*77c1e3ccSAndroid Build Coastguard Worker 
1707*77c1e3ccSAndroid Build Coastguard Worker     update_8_stats_neon(H + 0 * wiener_win * wiener_win2 + 0 * wiener_win,
1708*77c1e3ccSAndroid Build Coastguard Worker                         deltas_tr[0], deltas_tr[4],
1709*77c1e3ccSAndroid Build Coastguard Worker                         H + 1 * wiener_win * wiener_win2 + 1 * wiener_win);
1710*77c1e3ccSAndroid Build Coastguard Worker     update_8_stats_neon(H + 1 * wiener_win * wiener_win2 + 1 * wiener_win,
1711*77c1e3ccSAndroid Build Coastguard Worker                         deltas_tr[1], deltas_tr[5],
1712*77c1e3ccSAndroid Build Coastguard Worker                         H + 2 * wiener_win * wiener_win2 + 2 * wiener_win);
1713*77c1e3ccSAndroid Build Coastguard Worker     update_8_stats_neon(H + 2 * wiener_win * wiener_win2 + 2 * wiener_win,
1714*77c1e3ccSAndroid Build Coastguard Worker                         deltas_tr[2], deltas_tr[6],
1715*77c1e3ccSAndroid Build Coastguard Worker                         H + 3 * wiener_win * wiener_win2 + 3 * wiener_win);
1716*77c1e3ccSAndroid Build Coastguard Worker     update_8_stats_neon(H + 3 * wiener_win * wiener_win2 + 3 * wiener_win,
1717*77c1e3ccSAndroid Build Coastguard Worker                         deltas_tr[3], deltas_tr[7],
1718*77c1e3ccSAndroid Build Coastguard Worker                         H + 4 * wiener_win * wiener_win2 + 4 * wiener_win);
1719*77c1e3ccSAndroid Build Coastguard Worker     update_8_stats_neon(H + 4 * wiener_win * wiener_win2 + 4 * wiener_win,
1720*77c1e3ccSAndroid Build Coastguard Worker                         deltas_tr[8], deltas_tr[12],
1721*77c1e3ccSAndroid Build Coastguard Worker                         H + 5 * wiener_win * wiener_win2 + 5 * wiener_win);
1722*77c1e3ccSAndroid Build Coastguard Worker     update_8_stats_neon(H + 5 * wiener_win * wiener_win2 + 5 * wiener_win,
1723*77c1e3ccSAndroid Build Coastguard Worker                         deltas_tr[9], deltas_tr[13],
1724*77c1e3ccSAndroid Build Coastguard Worker                         H + 6 * wiener_win * wiener_win2 + 6 * wiener_win);
1725*77c1e3ccSAndroid Build Coastguard Worker   }
1726*77c1e3ccSAndroid Build Coastguard Worker 
1727*77c1e3ccSAndroid Build Coastguard Worker   // Step 4: Derive the top and left edge of each square. No square in top and
1728*77c1e3ccSAndroid Build Coastguard Worker   // bottom row.
1729*77c1e3ccSAndroid Build Coastguard Worker 
1730*77c1e3ccSAndroid Build Coastguard Worker   i = 1;
1731*77c1e3ccSAndroid Build Coastguard Worker   do {
1732*77c1e3ccSAndroid Build Coastguard Worker     j = i + 1;
1733*77c1e3ccSAndroid Build Coastguard Worker     do {
1734*77c1e3ccSAndroid Build Coastguard Worker       const int16_t *di = d + i - 1;
1735*77c1e3ccSAndroid Build Coastguard Worker       const int16_t *dj = d + j - 1;
1736*77c1e3ccSAndroid Build Coastguard Worker       int32x4_t deltas[(2 * WIENER_WIN - 1) * 2] = { vdupq_n_s32(0) };
1737*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t dd[WIENER_WIN * 2], ds[WIENER_WIN * 2];
1738*77c1e3ccSAndroid Build Coastguard Worker 
1739*77c1e3ccSAndroid Build Coastguard Worker       dd[5] = vdupq_n_s16(0);  // Initialize to avoid warning.
1740*77c1e3ccSAndroid Build Coastguard Worker       const int16_t dd0_values[] = { di[0 * d_stride],
1741*77c1e3ccSAndroid Build Coastguard Worker                                      di[1 * d_stride],
1742*77c1e3ccSAndroid Build Coastguard Worker                                      di[2 * d_stride],
1743*77c1e3ccSAndroid Build Coastguard Worker                                      di[3 * d_stride],
1744*77c1e3ccSAndroid Build Coastguard Worker                                      di[4 * d_stride],
1745*77c1e3ccSAndroid Build Coastguard Worker                                      di[5 * d_stride],
1746*77c1e3ccSAndroid Build Coastguard Worker                                      0,
1747*77c1e3ccSAndroid Build Coastguard Worker                                      0 };
1748*77c1e3ccSAndroid Build Coastguard Worker       dd[0] = vld1q_s16(dd0_values);
1749*77c1e3ccSAndroid Build Coastguard Worker       const int16_t dd1_values[] = { di[0 * d_stride + width],
1750*77c1e3ccSAndroid Build Coastguard Worker                                      di[1 * d_stride + width],
1751*77c1e3ccSAndroid Build Coastguard Worker                                      di[2 * d_stride + width],
1752*77c1e3ccSAndroid Build Coastguard Worker                                      di[3 * d_stride + width],
1753*77c1e3ccSAndroid Build Coastguard Worker                                      di[4 * d_stride + width],
1754*77c1e3ccSAndroid Build Coastguard Worker                                      di[5 * d_stride + width],
1755*77c1e3ccSAndroid Build Coastguard Worker                                      0,
1756*77c1e3ccSAndroid Build Coastguard Worker                                      0 };
1757*77c1e3ccSAndroid Build Coastguard Worker       dd[1] = vld1q_s16(dd1_values);
1758*77c1e3ccSAndroid Build Coastguard Worker       const int16_t ds0_values[] = { dj[0 * d_stride],
1759*77c1e3ccSAndroid Build Coastguard Worker                                      dj[1 * d_stride],
1760*77c1e3ccSAndroid Build Coastguard Worker                                      dj[2 * d_stride],
1761*77c1e3ccSAndroid Build Coastguard Worker                                      dj[3 * d_stride],
1762*77c1e3ccSAndroid Build Coastguard Worker                                      dj[4 * d_stride],
1763*77c1e3ccSAndroid Build Coastguard Worker                                      dj[5 * d_stride],
1764*77c1e3ccSAndroid Build Coastguard Worker                                      0,
1765*77c1e3ccSAndroid Build Coastguard Worker                                      0 };
1766*77c1e3ccSAndroid Build Coastguard Worker       ds[0] = vld1q_s16(ds0_values);
1767*77c1e3ccSAndroid Build Coastguard Worker       int16_t ds1_values[] = { dj[0 * d_stride + width],
1768*77c1e3ccSAndroid Build Coastguard Worker                                dj[1 * d_stride + width],
1769*77c1e3ccSAndroid Build Coastguard Worker                                dj[2 * d_stride + width],
1770*77c1e3ccSAndroid Build Coastguard Worker                                dj[3 * d_stride + width],
1771*77c1e3ccSAndroid Build Coastguard Worker                                dj[4 * d_stride + width],
1772*77c1e3ccSAndroid Build Coastguard Worker                                dj[5 * d_stride + width],
1773*77c1e3ccSAndroid Build Coastguard Worker                                0,
1774*77c1e3ccSAndroid Build Coastguard Worker                                0 };
1775*77c1e3ccSAndroid Build Coastguard Worker       ds[1] = vld1q_s16(ds1_values);
1776*77c1e3ccSAndroid Build Coastguard Worker 
1777*77c1e3ccSAndroid Build Coastguard Worker       y = 0;
1778*77c1e3ccSAndroid Build Coastguard Worker       while (y < h8) {
1779*77c1e3ccSAndroid Build Coastguard Worker         // 00s 10s 20s 30s 40s 50s 60s 70s  00e 10e 20e 30e 40e 50e 60e 70e
1780*77c1e3ccSAndroid Build Coastguard Worker         dd[0] = vsetq_lane_s16(di[6 * d_stride], dd[0], 6);
1781*77c1e3ccSAndroid Build Coastguard Worker         dd[0] = vsetq_lane_s16(di[7 * d_stride], dd[0], 7);
1782*77c1e3ccSAndroid Build Coastguard Worker         dd[1] = vsetq_lane_s16(di[6 * d_stride + width], dd[1], 6);
1783*77c1e3ccSAndroid Build Coastguard Worker         dd[1] = vsetq_lane_s16(di[7 * d_stride + width], dd[1], 7);
1784*77c1e3ccSAndroid Build Coastguard Worker 
1785*77c1e3ccSAndroid Build Coastguard Worker         // 00s 10s 20s 30s 40s 50s 60s 70s  00e 10e 20e 30e 40e 50e 60e 70e
1786*77c1e3ccSAndroid Build Coastguard Worker         // 01s 11s 21s 31s 41s 51s 61s 71s  01e 11e 21e 31e 41e 51e 61e 71e
1787*77c1e3ccSAndroid Build Coastguard Worker         ds[0] = vsetq_lane_s16(dj[6 * d_stride], ds[0], 6);
1788*77c1e3ccSAndroid Build Coastguard Worker         ds[0] = vsetq_lane_s16(dj[7 * d_stride], ds[0], 7);
1789*77c1e3ccSAndroid Build Coastguard Worker         ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 6);
1790*77c1e3ccSAndroid Build Coastguard Worker         ds[1] = vsetq_lane_s16(dj[7 * d_stride + width], ds[1], 7);
1791*77c1e3ccSAndroid Build Coastguard Worker 
1792*77c1e3ccSAndroid Build Coastguard Worker         load_more_16_neon(di + 8 * d_stride, width, &dd[0], &dd[2]);
1793*77c1e3ccSAndroid Build Coastguard Worker         load_more_16_neon(dj + 8 * d_stride, width, &ds[0], &ds[2]);
1794*77c1e3ccSAndroid Build Coastguard Worker         load_more_16_neon(di + 9 * d_stride, width, &dd[2], &dd[4]);
1795*77c1e3ccSAndroid Build Coastguard Worker         load_more_16_neon(dj + 9 * d_stride, width, &ds[2], &ds[4]);
1796*77c1e3ccSAndroid Build Coastguard Worker         load_more_16_neon(di + 10 * d_stride, width, &dd[4], &dd[6]);
1797*77c1e3ccSAndroid Build Coastguard Worker         load_more_16_neon(dj + 10 * d_stride, width, &ds[4], &ds[6]);
1798*77c1e3ccSAndroid Build Coastguard Worker         load_more_16_neon(di + 11 * d_stride, width, &dd[6], &dd[8]);
1799*77c1e3ccSAndroid Build Coastguard Worker         load_more_16_neon(dj + 11 * d_stride, width, &ds[6], &ds[8]);
1800*77c1e3ccSAndroid Build Coastguard Worker         load_more_16_neon(di + 12 * d_stride, width, &dd[8], &dd[10]);
1801*77c1e3ccSAndroid Build Coastguard Worker         load_more_16_neon(dj + 12 * d_stride, width, &ds[8], &ds[10]);
1802*77c1e3ccSAndroid Build Coastguard Worker         load_more_16_neon(di + 13 * d_stride, width, &dd[10], &dd[12]);
1803*77c1e3ccSAndroid Build Coastguard Worker         load_more_16_neon(dj + 13 * d_stride, width, &ds[10], &ds[12]);
1804*77c1e3ccSAndroid Build Coastguard Worker 
1805*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[0], dd[0], ds[0]);
1806*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[1], dd[1], ds[1]);
1807*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[2], dd[0], ds[2]);
1808*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[3], dd[1], ds[3]);
1809*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[4], dd[0], ds[4]);
1810*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[5], dd[1], ds[5]);
1811*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[6], dd[0], ds[6]);
1812*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[7], dd[1], ds[7]);
1813*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[8], dd[0], ds[8]);
1814*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[9], dd[1], ds[9]);
1815*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[10], dd[0], ds[10]);
1816*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[11], dd[1], ds[11]);
1817*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[12], dd[0], ds[12]);
1818*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[13], dd[1], ds[13]);
1819*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[14], dd[2], ds[0]);
1820*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[15], dd[3], ds[1]);
1821*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[16], dd[4], ds[0]);
1822*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[17], dd[5], ds[1]);
1823*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[18], dd[6], ds[0]);
1824*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[19], dd[7], ds[1]);
1825*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[20], dd[8], ds[0]);
1826*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[21], dd[9], ds[1]);
1827*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[22], dd[10], ds[0]);
1828*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[23], dd[11], ds[1]);
1829*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[24], dd[12], ds[0]);
1830*77c1e3ccSAndroid Build Coastguard Worker         madd_neon(&deltas[25], dd[13], ds[1]);
1831*77c1e3ccSAndroid Build Coastguard Worker 
1832*77c1e3ccSAndroid Build Coastguard Worker         dd[0] = vextq_s16(dd[12], vdupq_n_s16(0), 2);
1833*77c1e3ccSAndroid Build Coastguard Worker         dd[1] = vextq_s16(dd[13], vdupq_n_s16(0), 2);
1834*77c1e3ccSAndroid Build Coastguard Worker         ds[0] = vextq_s16(ds[12], vdupq_n_s16(0), 2);
1835*77c1e3ccSAndroid Build Coastguard Worker         ds[1] = vextq_s16(ds[13], vdupq_n_s16(0), 2);
1836*77c1e3ccSAndroid Build Coastguard Worker 
1837*77c1e3ccSAndroid Build Coastguard Worker         di += 8 * d_stride;
1838*77c1e3ccSAndroid Build Coastguard Worker         dj += 8 * d_stride;
1839*77c1e3ccSAndroid Build Coastguard Worker         y += 8;
1840*77c1e3ccSAndroid Build Coastguard Worker       }
1841*77c1e3ccSAndroid Build Coastguard Worker 
1842*77c1e3ccSAndroid Build Coastguard Worker       deltas[0] = hadd_four_32_neon(deltas[0], deltas[2], deltas[4], deltas[6]);
1843*77c1e3ccSAndroid Build Coastguard Worker       deltas[1] = hadd_four_32_neon(deltas[1], deltas[3], deltas[5], deltas[7]);
1844*77c1e3ccSAndroid Build Coastguard Worker       deltas[2] =
1845*77c1e3ccSAndroid Build Coastguard Worker           hadd_four_32_neon(deltas[8], deltas[10], deltas[12], deltas[12]);
1846*77c1e3ccSAndroid Build Coastguard Worker       deltas[3] =
1847*77c1e3ccSAndroid Build Coastguard Worker           hadd_four_32_neon(deltas[9], deltas[11], deltas[13], deltas[13]);
1848*77c1e3ccSAndroid Build Coastguard Worker       deltas[4] =
1849*77c1e3ccSAndroid Build Coastguard Worker           hadd_four_32_neon(deltas[14], deltas[16], deltas[18], deltas[20]);
1850*77c1e3ccSAndroid Build Coastguard Worker       deltas[5] =
1851*77c1e3ccSAndroid Build Coastguard Worker           hadd_four_32_neon(deltas[15], deltas[17], deltas[19], deltas[21]);
1852*77c1e3ccSAndroid Build Coastguard Worker       deltas[6] =
1853*77c1e3ccSAndroid Build Coastguard Worker           hadd_four_32_neon(deltas[22], deltas[24], deltas[22], deltas[24]);
1854*77c1e3ccSAndroid Build Coastguard Worker       deltas[7] =
1855*77c1e3ccSAndroid Build Coastguard Worker           hadd_four_32_neon(deltas[23], deltas[25], deltas[23], deltas[25]);
1856*77c1e3ccSAndroid Build Coastguard Worker       deltas[0] = vsubq_s32(deltas[1], deltas[0]);
1857*77c1e3ccSAndroid Build Coastguard Worker       deltas[1] = vsubq_s32(deltas[3], deltas[2]);
1858*77c1e3ccSAndroid Build Coastguard Worker       deltas[2] = vsubq_s32(deltas[5], deltas[4]);
1859*77c1e3ccSAndroid Build Coastguard Worker       deltas[3] = vsubq_s32(deltas[7], deltas[6]);
1860*77c1e3ccSAndroid Build Coastguard Worker 
1861*77c1e3ccSAndroid Build Coastguard Worker       if (h8 != height) {
1862*77c1e3ccSAndroid Build Coastguard Worker         const int16_t ds0_vals[] = {
1863*77c1e3ccSAndroid Build Coastguard Worker           dj[0 * d_stride], dj[0 * d_stride + width],
1864*77c1e3ccSAndroid Build Coastguard Worker           dj[1 * d_stride], dj[1 * d_stride + width],
1865*77c1e3ccSAndroid Build Coastguard Worker           dj[2 * d_stride], dj[2 * d_stride + width],
1866*77c1e3ccSAndroid Build Coastguard Worker           dj[3 * d_stride], dj[3 * d_stride + width]
1867*77c1e3ccSAndroid Build Coastguard Worker         };
1868*77c1e3ccSAndroid Build Coastguard Worker         ds[0] = vld1q_s16(ds0_vals);
1869*77c1e3ccSAndroid Build Coastguard Worker 
1870*77c1e3ccSAndroid Build Coastguard Worker         ds[1] = vsetq_lane_s16(dj[4 * d_stride], ds[1], 0);
1871*77c1e3ccSAndroid Build Coastguard Worker         ds[1] = vsetq_lane_s16(dj[4 * d_stride + width], ds[1], 1);
1872*77c1e3ccSAndroid Build Coastguard Worker         ds[1] = vsetq_lane_s16(dj[5 * d_stride], ds[1], 2);
1873*77c1e3ccSAndroid Build Coastguard Worker         ds[1] = vsetq_lane_s16(dj[5 * d_stride + width], ds[1], 3);
1874*77c1e3ccSAndroid Build Coastguard Worker         const int16_t dd4_vals[] = {
1875*77c1e3ccSAndroid Build Coastguard Worker           -di[1 * d_stride], di[1 * d_stride + width],
1876*77c1e3ccSAndroid Build Coastguard Worker           -di[2 * d_stride], di[2 * d_stride + width],
1877*77c1e3ccSAndroid Build Coastguard Worker           -di[3 * d_stride], di[3 * d_stride + width],
1878*77c1e3ccSAndroid Build Coastguard Worker           -di[4 * d_stride], di[4 * d_stride + width]
1879*77c1e3ccSAndroid Build Coastguard Worker         };
1880*77c1e3ccSAndroid Build Coastguard Worker         dd[4] = vld1q_s16(dd4_vals);
1881*77c1e3ccSAndroid Build Coastguard Worker 
1882*77c1e3ccSAndroid Build Coastguard Worker         dd[5] = vsetq_lane_s16(-di[5 * d_stride], dd[5], 0);
1883*77c1e3ccSAndroid Build Coastguard Worker         dd[5] = vsetq_lane_s16(di[5 * d_stride + width], dd[5], 1);
1884*77c1e3ccSAndroid Build Coastguard Worker         do {
1885*77c1e3ccSAndroid Build Coastguard Worker           dd[0] = vdupq_n_s16(-di[0 * d_stride]);
1886*77c1e3ccSAndroid Build Coastguard Worker           dd[2] = dd[3] = vdupq_n_s16(di[0 * d_stride + width]);
1887*77c1e3ccSAndroid Build Coastguard Worker           dd[0] = dd[1] = vzipq_s16(dd[0], dd[2]).val[0];
1888*77c1e3ccSAndroid Build Coastguard Worker 
1889*77c1e3ccSAndroid Build Coastguard Worker           ds[4] = vdupq_n_s16(dj[0 * d_stride]);
1890*77c1e3ccSAndroid Build Coastguard Worker           ds[6] = ds[7] = vdupq_n_s16(dj[0 * d_stride + width]);
1891*77c1e3ccSAndroid Build Coastguard Worker           ds[4] = ds[5] = vzipq_s16(ds[4], ds[6]).val[0];
1892*77c1e3ccSAndroid Build Coastguard Worker 
1893*77c1e3ccSAndroid Build Coastguard Worker           dd[5] = vsetq_lane_s16(-di[6 * d_stride], dd[5], 2);
1894*77c1e3ccSAndroid Build Coastguard Worker           dd[5] = vsetq_lane_s16(di[6 * d_stride + width], dd[5], 3);
1895*77c1e3ccSAndroid Build Coastguard Worker           ds[1] = vsetq_lane_s16(dj[6 * d_stride], ds[1], 4);
1896*77c1e3ccSAndroid Build Coastguard Worker           ds[1] = vsetq_lane_s16(dj[6 * d_stride + width], ds[1], 5);
1897*77c1e3ccSAndroid Build Coastguard Worker 
1898*77c1e3ccSAndroid Build Coastguard Worker           madd_neon_pairwise(&deltas[0], dd[0], ds[0]);
1899*77c1e3ccSAndroid Build Coastguard Worker           madd_neon_pairwise(&deltas[1], dd[1], ds[1]);
1900*77c1e3ccSAndroid Build Coastguard Worker           madd_neon_pairwise(&deltas[2], dd[4], ds[4]);
1901*77c1e3ccSAndroid Build Coastguard Worker           madd_neon_pairwise(&deltas[3], dd[5], ds[5]);
1902*77c1e3ccSAndroid Build Coastguard Worker 
1903*77c1e3ccSAndroid Build Coastguard Worker           int32_t tmp0 = vgetq_lane_s32(vreinterpretq_s32_s16(ds[0]), 0);
1904*77c1e3ccSAndroid Build Coastguard Worker           ds[0] = vextq_s16(ds[0], ds[1], 2);
1905*77c1e3ccSAndroid Build Coastguard Worker           ds[1] = vextq_s16(ds[1], ds[0], 2);
1906*77c1e3ccSAndroid Build Coastguard Worker           ds[1] = vreinterpretq_s16_s32(
1907*77c1e3ccSAndroid Build Coastguard Worker               vsetq_lane_s32(tmp0, vreinterpretq_s32_s16(ds[1]), 3));
1908*77c1e3ccSAndroid Build Coastguard Worker           int32_t tmp1 = vgetq_lane_s32(vreinterpretq_s32_s16(dd[4]), 0);
1909*77c1e3ccSAndroid Build Coastguard Worker           dd[4] = vextq_s16(dd[4], dd[5], 2);
1910*77c1e3ccSAndroid Build Coastguard Worker           dd[5] = vextq_s16(dd[5], dd[4], 2);
1911*77c1e3ccSAndroid Build Coastguard Worker           dd[5] = vreinterpretq_s16_s32(
1912*77c1e3ccSAndroid Build Coastguard Worker               vsetq_lane_s32(tmp1, vreinterpretq_s32_s16(dd[5]), 3));
1913*77c1e3ccSAndroid Build Coastguard Worker           di += d_stride;
1914*77c1e3ccSAndroid Build Coastguard Worker           dj += d_stride;
1915*77c1e3ccSAndroid Build Coastguard Worker         } while (++y < height);
1916*77c1e3ccSAndroid Build Coastguard Worker       }
1917*77c1e3ccSAndroid Build Coastguard Worker 
1918*77c1e3ccSAndroid Build Coastguard Worker       // Writing one more element on the top edge of a square falls to
1919*77c1e3ccSAndroid Build Coastguard Worker       // the next square in the same row or the first element in the next
1920*77c1e3ccSAndroid Build Coastguard Worker       // row, which will just be overwritten later.
1921*77c1e3ccSAndroid Build Coastguard Worker       update_8_stats_neon(
1922*77c1e3ccSAndroid Build Coastguard Worker           H + (i - 1) * wiener_win * wiener_win2 + (j - 1) * wiener_win,
1923*77c1e3ccSAndroid Build Coastguard Worker           deltas[0], deltas[1],
1924*77c1e3ccSAndroid Build Coastguard Worker           H + i * wiener_win * wiener_win2 + j * wiener_win);
1925*77c1e3ccSAndroid Build Coastguard Worker 
1926*77c1e3ccSAndroid Build Coastguard Worker       H[(i * wiener_win + 1) * wiener_win2 + j * wiener_win] =
1927*77c1e3ccSAndroid Build Coastguard Worker           H[((i - 1) * wiener_win + 1) * wiener_win2 + (j - 1) * wiener_win] +
1928*77c1e3ccSAndroid Build Coastguard Worker           vgetq_lane_s32(deltas[2], 0);
1929*77c1e3ccSAndroid Build Coastguard Worker       H[(i * wiener_win + 2) * wiener_win2 + j * wiener_win] =
1930*77c1e3ccSAndroid Build Coastguard Worker           H[((i - 1) * wiener_win + 2) * wiener_win2 + (j - 1) * wiener_win] +
1931*77c1e3ccSAndroid Build Coastguard Worker           vgetq_lane_s32(deltas[2], 1);
1932*77c1e3ccSAndroid Build Coastguard Worker       H[(i * wiener_win + 3) * wiener_win2 + j * wiener_win] =
1933*77c1e3ccSAndroid Build Coastguard Worker           H[((i - 1) * wiener_win + 3) * wiener_win2 + (j - 1) * wiener_win] +
1934*77c1e3ccSAndroid Build Coastguard Worker           vgetq_lane_s32(deltas[2], 2);
1935*77c1e3ccSAndroid Build Coastguard Worker       H[(i * wiener_win + 4) * wiener_win2 + j * wiener_win] =
1936*77c1e3ccSAndroid Build Coastguard Worker           H[((i - 1) * wiener_win + 4) * wiener_win2 + (j - 1) * wiener_win] +
1937*77c1e3ccSAndroid Build Coastguard Worker           vgetq_lane_s32(deltas[2], 3);
1938*77c1e3ccSAndroid Build Coastguard Worker       H[(i * wiener_win + 5) * wiener_win2 + j * wiener_win] =
1939*77c1e3ccSAndroid Build Coastguard Worker           H[((i - 1) * wiener_win + 5) * wiener_win2 + (j - 1) * wiener_win] +
1940*77c1e3ccSAndroid Build Coastguard Worker           vgetq_lane_s32(deltas[3], 0);
1941*77c1e3ccSAndroid Build Coastguard Worker       H[(i * wiener_win + 6) * wiener_win2 + j * wiener_win] =
1942*77c1e3ccSAndroid Build Coastguard Worker           H[((i - 1) * wiener_win + 6) * wiener_win2 + (j - 1) * wiener_win] +
1943*77c1e3ccSAndroid Build Coastguard Worker           vgetq_lane_s32(deltas[3], 1);
1944*77c1e3ccSAndroid Build Coastguard Worker     } while (++j < wiener_win);
1945*77c1e3ccSAndroid Build Coastguard Worker   } while (++i < wiener_win - 1);
1946*77c1e3ccSAndroid Build Coastguard Worker 
1947*77c1e3ccSAndroid Build Coastguard Worker   // Step 5: Derive other points of each square. No square in bottom row.
1948*77c1e3ccSAndroid Build Coastguard Worker   i = 0;
1949*77c1e3ccSAndroid Build Coastguard Worker   do {
1950*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *const di = d + i;
1951*77c1e3ccSAndroid Build Coastguard Worker 
1952*77c1e3ccSAndroid Build Coastguard Worker     j = i + 1;
1953*77c1e3ccSAndroid Build Coastguard Worker     do {
1954*77c1e3ccSAndroid Build Coastguard Worker       const int16_t *const dj = d + j;
1955*77c1e3ccSAndroid Build Coastguard Worker       int32x4_t deltas[WIENER_WIN - 1][WIN_7] = { { vdupq_n_s32(0) },
1956*77c1e3ccSAndroid Build Coastguard Worker                                                   { vdupq_n_s32(0) } };
1957*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t d_is[WIN_7];
1958*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t d_ie[WIN_7];
1959*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t d_js[WIN_7];
1960*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t d_je[WIN_7];
1961*77c1e3ccSAndroid Build Coastguard Worker 
1962*77c1e3ccSAndroid Build Coastguard Worker       x = 0;
1963*77c1e3ccSAndroid Build Coastguard Worker       while (x < w16) {
1964*77c1e3ccSAndroid Build Coastguard Worker         load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
1965*77c1e3ccSAndroid Build Coastguard Worker                               d_js, d_je);
1966*77c1e3ccSAndroid Build Coastguard Worker         derive_square_win7_neon(d_is, d_ie, d_js, d_je, deltas);
1967*77c1e3ccSAndroid Build Coastguard Worker         x += 16;
1968*77c1e3ccSAndroid Build Coastguard Worker       }
1969*77c1e3ccSAndroid Build Coastguard Worker 
1970*77c1e3ccSAndroid Build Coastguard Worker       if (w16 != width) {
1971*77c1e3ccSAndroid Build Coastguard Worker         load_square_win7_neon(di + x, dj + x, d_stride, height, d_is, d_ie,
1972*77c1e3ccSAndroid Build Coastguard Worker                               d_js, d_je);
1973*77c1e3ccSAndroid Build Coastguard Worker         d_is[0] = vandq_s16(d_is[0], mask[0]);
1974*77c1e3ccSAndroid Build Coastguard Worker         d_is[1] = vandq_s16(d_is[1], mask[1]);
1975*77c1e3ccSAndroid Build Coastguard Worker         d_is[2] = vandq_s16(d_is[2], mask[0]);
1976*77c1e3ccSAndroid Build Coastguard Worker         d_is[3] = vandq_s16(d_is[3], mask[1]);
1977*77c1e3ccSAndroid Build Coastguard Worker         d_is[4] = vandq_s16(d_is[4], mask[0]);
1978*77c1e3ccSAndroid Build Coastguard Worker         d_is[5] = vandq_s16(d_is[5], mask[1]);
1979*77c1e3ccSAndroid Build Coastguard Worker         d_is[6] = vandq_s16(d_is[6], mask[0]);
1980*77c1e3ccSAndroid Build Coastguard Worker         d_is[7] = vandq_s16(d_is[7], mask[1]);
1981*77c1e3ccSAndroid Build Coastguard Worker         d_is[8] = vandq_s16(d_is[8], mask[0]);
1982*77c1e3ccSAndroid Build Coastguard Worker         d_is[9] = vandq_s16(d_is[9], mask[1]);
1983*77c1e3ccSAndroid Build Coastguard Worker         d_is[10] = vandq_s16(d_is[10], mask[0]);
1984*77c1e3ccSAndroid Build Coastguard Worker         d_is[11] = vandq_s16(d_is[11], mask[1]);
1985*77c1e3ccSAndroid Build Coastguard Worker         d_ie[0] = vandq_s16(d_ie[0], mask[0]);
1986*77c1e3ccSAndroid Build Coastguard Worker         d_ie[1] = vandq_s16(d_ie[1], mask[1]);
1987*77c1e3ccSAndroid Build Coastguard Worker         d_ie[2] = vandq_s16(d_ie[2], mask[0]);
1988*77c1e3ccSAndroid Build Coastguard Worker         d_ie[3] = vandq_s16(d_ie[3], mask[1]);
1989*77c1e3ccSAndroid Build Coastguard Worker         d_ie[4] = vandq_s16(d_ie[4], mask[0]);
1990*77c1e3ccSAndroid Build Coastguard Worker         d_ie[5] = vandq_s16(d_ie[5], mask[1]);
1991*77c1e3ccSAndroid Build Coastguard Worker         d_ie[6] = vandq_s16(d_ie[6], mask[0]);
1992*77c1e3ccSAndroid Build Coastguard Worker         d_ie[7] = vandq_s16(d_ie[7], mask[1]);
1993*77c1e3ccSAndroid Build Coastguard Worker         d_ie[8] = vandq_s16(d_ie[8], mask[0]);
1994*77c1e3ccSAndroid Build Coastguard Worker         d_ie[9] = vandq_s16(d_ie[9], mask[1]);
1995*77c1e3ccSAndroid Build Coastguard Worker         d_ie[10] = vandq_s16(d_ie[10], mask[0]);
1996*77c1e3ccSAndroid Build Coastguard Worker         d_ie[11] = vandq_s16(d_ie[11], mask[1]);
1997*77c1e3ccSAndroid Build Coastguard Worker         derive_square_win7_neon(d_is, d_ie, d_js, d_je, deltas);
1998*77c1e3ccSAndroid Build Coastguard Worker       }
1999*77c1e3ccSAndroid Build Coastguard Worker 
2000*77c1e3ccSAndroid Build Coastguard Worker       hadd_update_6_stats_neon(
2001*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 0) * wiener_win2 + j * wiener_win, deltas[0],
2002*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win + 1);
2003*77c1e3ccSAndroid Build Coastguard Worker       hadd_update_6_stats_neon(
2004*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 1) * wiener_win2 + j * wiener_win, deltas[1],
2005*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win + 1);
2006*77c1e3ccSAndroid Build Coastguard Worker       hadd_update_6_stats_neon(
2007*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 2) * wiener_win2 + j * wiener_win, deltas[2],
2008*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win + 1);
2009*77c1e3ccSAndroid Build Coastguard Worker       hadd_update_6_stats_neon(
2010*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 3) * wiener_win2 + j * wiener_win, deltas[3],
2011*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win + 1);
2012*77c1e3ccSAndroid Build Coastguard Worker       hadd_update_6_stats_neon(
2013*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 4) * wiener_win2 + j * wiener_win, deltas[4],
2014*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win + 1);
2015*77c1e3ccSAndroid Build Coastguard Worker       hadd_update_6_stats_neon(
2016*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 5) * wiener_win2 + j * wiener_win, deltas[5],
2017*77c1e3ccSAndroid Build Coastguard Worker           H + (i * wiener_win + 6) * wiener_win2 + j * wiener_win + 1);
2018*77c1e3ccSAndroid Build Coastguard Worker     } while (++j < wiener_win);
2019*77c1e3ccSAndroid Build Coastguard Worker   } while (++i < wiener_win - 1);
2020*77c1e3ccSAndroid Build Coastguard Worker 
2021*77c1e3ccSAndroid Build Coastguard Worker   // Step 6: Derive other points of each upper triangle along the diagonal.
2022*77c1e3ccSAndroid Build Coastguard Worker   i = 0;
2023*77c1e3ccSAndroid Build Coastguard Worker   do {
2024*77c1e3ccSAndroid Build Coastguard Worker     const int16_t *const di = d + i;
2025*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t deltas[WIENER_WIN * (WIENER_WIN - 1)] = { vdupq_n_s32(0) };
2026*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t d_is[WIN_7], d_ie[WIN_7];
2027*77c1e3ccSAndroid Build Coastguard Worker 
2028*77c1e3ccSAndroid Build Coastguard Worker     x = 0;
2029*77c1e3ccSAndroid Build Coastguard Worker     while (x < w16) {
2030*77c1e3ccSAndroid Build Coastguard Worker       load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie);
2031*77c1e3ccSAndroid Build Coastguard Worker       derive_triangle_win7_neon(d_is, d_ie, deltas);
2032*77c1e3ccSAndroid Build Coastguard Worker       x += 16;
2033*77c1e3ccSAndroid Build Coastguard Worker     }
2034*77c1e3ccSAndroid Build Coastguard Worker 
2035*77c1e3ccSAndroid Build Coastguard Worker     if (w16 != width) {
2036*77c1e3ccSAndroid Build Coastguard Worker       load_triangle_win7_neon(di + x, d_stride, height, d_is, d_ie);
2037*77c1e3ccSAndroid Build Coastguard Worker       d_is[0] = vandq_s16(d_is[0], mask[0]);
2038*77c1e3ccSAndroid Build Coastguard Worker       d_is[1] = vandq_s16(d_is[1], mask[1]);
2039*77c1e3ccSAndroid Build Coastguard Worker       d_is[2] = vandq_s16(d_is[2], mask[0]);
2040*77c1e3ccSAndroid Build Coastguard Worker       d_is[3] = vandq_s16(d_is[3], mask[1]);
2041*77c1e3ccSAndroid Build Coastguard Worker       d_is[4] = vandq_s16(d_is[4], mask[0]);
2042*77c1e3ccSAndroid Build Coastguard Worker       d_is[5] = vandq_s16(d_is[5], mask[1]);
2043*77c1e3ccSAndroid Build Coastguard Worker       d_is[6] = vandq_s16(d_is[6], mask[0]);
2044*77c1e3ccSAndroid Build Coastguard Worker       d_is[7] = vandq_s16(d_is[7], mask[1]);
2045*77c1e3ccSAndroid Build Coastguard Worker       d_is[8] = vandq_s16(d_is[8], mask[0]);
2046*77c1e3ccSAndroid Build Coastguard Worker       d_is[9] = vandq_s16(d_is[9], mask[1]);
2047*77c1e3ccSAndroid Build Coastguard Worker       d_is[10] = vandq_s16(d_is[10], mask[0]);
2048*77c1e3ccSAndroid Build Coastguard Worker       d_is[11] = vandq_s16(d_is[11], mask[1]);
2049*77c1e3ccSAndroid Build Coastguard Worker       d_ie[0] = vandq_s16(d_ie[0], mask[0]);
2050*77c1e3ccSAndroid Build Coastguard Worker       d_ie[1] = vandq_s16(d_ie[1], mask[1]);
2051*77c1e3ccSAndroid Build Coastguard Worker       d_ie[2] = vandq_s16(d_ie[2], mask[0]);
2052*77c1e3ccSAndroid Build Coastguard Worker       d_ie[3] = vandq_s16(d_ie[3], mask[1]);
2053*77c1e3ccSAndroid Build Coastguard Worker       d_ie[4] = vandq_s16(d_ie[4], mask[0]);
2054*77c1e3ccSAndroid Build Coastguard Worker       d_ie[5] = vandq_s16(d_ie[5], mask[1]);
2055*77c1e3ccSAndroid Build Coastguard Worker       d_ie[6] = vandq_s16(d_ie[6], mask[0]);
2056*77c1e3ccSAndroid Build Coastguard Worker       d_ie[7] = vandq_s16(d_ie[7], mask[1]);
2057*77c1e3ccSAndroid Build Coastguard Worker       d_ie[8] = vandq_s16(d_ie[8], mask[0]);
2058*77c1e3ccSAndroid Build Coastguard Worker       d_ie[9] = vandq_s16(d_ie[9], mask[1]);
2059*77c1e3ccSAndroid Build Coastguard Worker       d_ie[10] = vandq_s16(d_ie[10], mask[0]);
2060*77c1e3ccSAndroid Build Coastguard Worker       d_ie[11] = vandq_s16(d_ie[11], mask[1]);
2061*77c1e3ccSAndroid Build Coastguard Worker       derive_triangle_win7_neon(d_is, d_ie, deltas);
2062*77c1e3ccSAndroid Build Coastguard Worker     }
2063*77c1e3ccSAndroid Build Coastguard Worker 
2064*77c1e3ccSAndroid Build Coastguard Worker     // Row 1: 6 points
2065*77c1e3ccSAndroid Build Coastguard Worker     hadd_update_6_stats_neon(
2066*77c1e3ccSAndroid Build Coastguard Worker         H + (i * wiener_win + 0) * wiener_win2 + i * wiener_win, deltas,
2067*77c1e3ccSAndroid Build Coastguard Worker         H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1);
2068*77c1e3ccSAndroid Build Coastguard Worker 
2069*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t delta1710 = horizontal_add_2d_s32(deltas[17], deltas[10]);
2070*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t delta1516 = horizontal_add_2d_s32(deltas[15], deltas[16]);
2071*77c1e3ccSAndroid Build Coastguard Worker 
2072*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t delta1710_s64 = vpaddlq_s32(delta1710);
2073*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t delta1516_s64 = vpaddlq_s32(delta1516);
2074*77c1e3ccSAndroid Build Coastguard Worker 
2075*77c1e3ccSAndroid Build Coastguard Worker     // Row 2: 5 points
2076*77c1e3ccSAndroid Build Coastguard Worker     hadd_update_4_stats_neon(
2077*77c1e3ccSAndroid Build Coastguard Worker         H + (i * wiener_win + 1) * wiener_win2 + i * wiener_win + 1, deltas + 6,
2078*77c1e3ccSAndroid Build Coastguard Worker         H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2);
2079*77c1e3ccSAndroid Build Coastguard Worker     H[(i * wiener_win + 2) * wiener_win2 + i * wiener_win + 6] =
2080*77c1e3ccSAndroid Build Coastguard Worker         H[(i * wiener_win + 1) * wiener_win2 + i * wiener_win + 5] +
2081*77c1e3ccSAndroid Build Coastguard Worker         vgetq_lane_s64(delta1710_s64, 1);
2082*77c1e3ccSAndroid Build Coastguard Worker 
2083*77c1e3ccSAndroid Build Coastguard Worker     // Row 3: 4 points
2084*77c1e3ccSAndroid Build Coastguard Worker     hadd_update_4_stats_neon(
2085*77c1e3ccSAndroid Build Coastguard Worker         H + (i * wiener_win + 2) * wiener_win2 + i * wiener_win + 2,
2086*77c1e3ccSAndroid Build Coastguard Worker         deltas + 11,
2087*77c1e3ccSAndroid Build Coastguard Worker         H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);
2088*77c1e3ccSAndroid Build Coastguard Worker 
2089*77c1e3ccSAndroid Build Coastguard Worker     // Row 4: 3 points
2090*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t h0 =
2091*77c1e3ccSAndroid Build Coastguard Worker         vld1q_s64(H + (i * wiener_win + 3) * wiener_win2 + i * wiener_win + 3);
2092*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4,
2093*77c1e3ccSAndroid Build Coastguard Worker               vaddq_s64(h0, delta1516_s64));
2094*77c1e3ccSAndroid Build Coastguard Worker     H[(i * wiener_win + 4) * wiener_win2 + i * wiener_win + 6] =
2095*77c1e3ccSAndroid Build Coastguard Worker         H[(i * wiener_win + 3) * wiener_win2 + i * wiener_win + 5] +
2096*77c1e3ccSAndroid Build Coastguard Worker         vgetq_lane_s64(delta1710_s64, 0);
2097*77c1e3ccSAndroid Build Coastguard Worker 
2098*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t delta1819 = horizontal_add_2d_s32(deltas[18], deltas[19]);
2099*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t delta1819_s64 = vpaddlq_s32(delta1819);
2100*77c1e3ccSAndroid Build Coastguard Worker 
2101*77c1e3ccSAndroid Build Coastguard Worker     // Row 5: 2 points
2102*77c1e3ccSAndroid Build Coastguard Worker     int64x2_t h1 =
2103*77c1e3ccSAndroid Build Coastguard Worker         vld1q_s64(H + (i * wiener_win + 4) * wiener_win2 + i * wiener_win + 4);
2104*77c1e3ccSAndroid Build Coastguard Worker     vst1q_s64(H + (i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5,
2105*77c1e3ccSAndroid Build Coastguard Worker               vaddq_s64(h1, delta1819_s64));
2106*77c1e3ccSAndroid Build Coastguard Worker 
2107*77c1e3ccSAndroid Build Coastguard Worker     // Row 6: 1 points
2108*77c1e3ccSAndroid Build Coastguard Worker     H[(i * wiener_win + 6) * wiener_win2 + i * wiener_win + 6] =
2109*77c1e3ccSAndroid Build Coastguard Worker         H[(i * wiener_win + 5) * wiener_win2 + i * wiener_win + 5] +
2110*77c1e3ccSAndroid Build Coastguard Worker         horizontal_long_add_s32x4(deltas[20]);
2111*77c1e3ccSAndroid Build Coastguard Worker   } while (++i < wiener_win);
2112*77c1e3ccSAndroid Build Coastguard Worker }
2113*77c1e3ccSAndroid Build Coastguard Worker 
find_average_neon(const uint8_t * src,int src_stride,int width,int height)2114*77c1e3ccSAndroid Build Coastguard Worker static inline uint8_t find_average_neon(const uint8_t *src, int src_stride,
2115*77c1e3ccSAndroid Build Coastguard Worker                                         int width, int height) {
2116*77c1e3ccSAndroid Build Coastguard Worker   uint64_t sum = 0;
2117*77c1e3ccSAndroid Build Coastguard Worker 
2118*77c1e3ccSAndroid Build Coastguard Worker   if (width >= 16) {
2119*77c1e3ccSAndroid Build Coastguard Worker     int h = 0;
2120*77c1e3ccSAndroid Build Coastguard Worker     // We can accumulate up to 257 8-bit values in a 16-bit value, given
2121*77c1e3ccSAndroid Build Coastguard Worker     // that each 16-bit vector has 8 elements, that means we can process up to
2122*77c1e3ccSAndroid Build Coastguard Worker     // int(257*8/width) rows before we need to widen to 32-bit vector
2123*77c1e3ccSAndroid Build Coastguard Worker     // elements.
2124*77c1e3ccSAndroid Build Coastguard Worker     int h_overflow = 257 * 8 / width;
2125*77c1e3ccSAndroid Build Coastguard Worker     int h_limit = height > h_overflow ? h_overflow : height;
2126*77c1e3ccSAndroid Build Coastguard Worker     uint32x4_t avg_u32 = vdupq_n_u32(0);
2127*77c1e3ccSAndroid Build Coastguard Worker     do {
2128*77c1e3ccSAndroid Build Coastguard Worker       uint16x8_t avg_u16 = vdupq_n_u16(0);
2129*77c1e3ccSAndroid Build Coastguard Worker       do {
2130*77c1e3ccSAndroid Build Coastguard Worker         int j = width;
2131*77c1e3ccSAndroid Build Coastguard Worker         const uint8_t *src_ptr = src;
2132*77c1e3ccSAndroid Build Coastguard Worker         do {
2133*77c1e3ccSAndroid Build Coastguard Worker           uint8x16_t s = vld1q_u8(src_ptr);
2134*77c1e3ccSAndroid Build Coastguard Worker           avg_u16 = vpadalq_u8(avg_u16, s);
2135*77c1e3ccSAndroid Build Coastguard Worker           j -= 16;
2136*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 16;
2137*77c1e3ccSAndroid Build Coastguard Worker         } while (j >= 16);
2138*77c1e3ccSAndroid Build Coastguard Worker         if (j >= 8) {
2139*77c1e3ccSAndroid Build Coastguard Worker           uint8x8_t s = vld1_u8(src_ptr);
2140*77c1e3ccSAndroid Build Coastguard Worker           avg_u16 = vaddw_u8(avg_u16, s);
2141*77c1e3ccSAndroid Build Coastguard Worker           j -= 8;
2142*77c1e3ccSAndroid Build Coastguard Worker           src_ptr += 8;
2143*77c1e3ccSAndroid Build Coastguard Worker         }
2144*77c1e3ccSAndroid Build Coastguard Worker         // Scalar tail case.
2145*77c1e3ccSAndroid Build Coastguard Worker         while (j > 0) {
2146*77c1e3ccSAndroid Build Coastguard Worker           sum += src[width - j];
2147*77c1e3ccSAndroid Build Coastguard Worker           j--;
2148*77c1e3ccSAndroid Build Coastguard Worker         }
2149*77c1e3ccSAndroid Build Coastguard Worker         src += src_stride;
2150*77c1e3ccSAndroid Build Coastguard Worker       } while (++h < h_limit);
2151*77c1e3ccSAndroid Build Coastguard Worker       avg_u32 = vpadalq_u16(avg_u32, avg_u16);
2152*77c1e3ccSAndroid Build Coastguard Worker 
2153*77c1e3ccSAndroid Build Coastguard Worker       h_limit += h_overflow;
2154*77c1e3ccSAndroid Build Coastguard Worker       h_limit = height > h_overflow ? h_overflow : height;
2155*77c1e3ccSAndroid Build Coastguard Worker     } while (h < height);
2156*77c1e3ccSAndroid Build Coastguard Worker     return (uint8_t)((horizontal_long_add_u32x4(avg_u32) + sum) /
2157*77c1e3ccSAndroid Build Coastguard Worker                      (width * height));
2158*77c1e3ccSAndroid Build Coastguard Worker   }
2159*77c1e3ccSAndroid Build Coastguard Worker   if (width >= 8) {
2160*77c1e3ccSAndroid Build Coastguard Worker     int h = 0;
2161*77c1e3ccSAndroid Build Coastguard Worker     // We can accumulate up to 257 8-bit values in a 16-bit value, given
2162*77c1e3ccSAndroid Build Coastguard Worker     // that each 16-bit vector has 4 elements, that means we can process up to
2163*77c1e3ccSAndroid Build Coastguard Worker     // int(257*4/width) rows before we need to widen to 32-bit vector
2164*77c1e3ccSAndroid Build Coastguard Worker     // elements.
2165*77c1e3ccSAndroid Build Coastguard Worker     int h_overflow = 257 * 4 / width;
2166*77c1e3ccSAndroid Build Coastguard Worker     int h_limit = height > h_overflow ? h_overflow : height;
2167*77c1e3ccSAndroid Build Coastguard Worker     uint32x2_t avg_u32 = vdup_n_u32(0);
2168*77c1e3ccSAndroid Build Coastguard Worker     do {
2169*77c1e3ccSAndroid Build Coastguard Worker       uint16x4_t avg_u16 = vdup_n_u16(0);
2170*77c1e3ccSAndroid Build Coastguard Worker       do {
2171*77c1e3ccSAndroid Build Coastguard Worker         int j = width;
2172*77c1e3ccSAndroid Build Coastguard Worker         const uint8_t *src_ptr = src;
2173*77c1e3ccSAndroid Build Coastguard Worker         uint8x8_t s = vld1_u8(src_ptr);
2174*77c1e3ccSAndroid Build Coastguard Worker         avg_u16 = vpadal_u8(avg_u16, s);
2175*77c1e3ccSAndroid Build Coastguard Worker         j -= 8;
2176*77c1e3ccSAndroid Build Coastguard Worker         src_ptr += 8;
2177*77c1e3ccSAndroid Build Coastguard Worker         // Scalar tail case.
2178*77c1e3ccSAndroid Build Coastguard Worker         while (j > 0) {
2179*77c1e3ccSAndroid Build Coastguard Worker           sum += src[width - j];
2180*77c1e3ccSAndroid Build Coastguard Worker           j--;
2181*77c1e3ccSAndroid Build Coastguard Worker         }
2182*77c1e3ccSAndroid Build Coastguard Worker         src += src_stride;
2183*77c1e3ccSAndroid Build Coastguard Worker       } while (++h < h_limit);
2184*77c1e3ccSAndroid Build Coastguard Worker       avg_u32 = vpadal_u16(avg_u32, avg_u16);
2185*77c1e3ccSAndroid Build Coastguard Worker 
2186*77c1e3ccSAndroid Build Coastguard Worker       h_limit += h_overflow;
2187*77c1e3ccSAndroid Build Coastguard Worker       h_limit = height > h_overflow ? h_overflow : height;
2188*77c1e3ccSAndroid Build Coastguard Worker     } while (h < height);
2189*77c1e3ccSAndroid Build Coastguard Worker     return (uint8_t)((horizontal_long_add_u32x2(avg_u32) + sum) /
2190*77c1e3ccSAndroid Build Coastguard Worker                      (width * height));
2191*77c1e3ccSAndroid Build Coastguard Worker   }
2192*77c1e3ccSAndroid Build Coastguard Worker   int i = height;
2193*77c1e3ccSAndroid Build Coastguard Worker   do {
2194*77c1e3ccSAndroid Build Coastguard Worker     int j = 0;
2195*77c1e3ccSAndroid Build Coastguard Worker     do {
2196*77c1e3ccSAndroid Build Coastguard Worker       sum += src[j];
2197*77c1e3ccSAndroid Build Coastguard Worker     } while (++j < width);
2198*77c1e3ccSAndroid Build Coastguard Worker     src += src_stride;
2199*77c1e3ccSAndroid Build Coastguard Worker   } while (--i != 0);
2200*77c1e3ccSAndroid Build Coastguard Worker   return (uint8_t)(sum / (width * height));
2201*77c1e3ccSAndroid Build Coastguard Worker }
2202*77c1e3ccSAndroid Build Coastguard Worker 
// Subtracts the scalar `avg` from every sample of an 8-bit block and stores
// the signed 16-bit differences in `buf_avg`.
//
// After each processed row `buf` and `buf_avg` advance by `buf_stride` and
// `buf_avg_stride` respectively, while the row counter advances by
// `downsample_factor` until it reaches `height`.
static inline void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg,
                                   int16_t *buf_avg, int buf_avg_stride,
                                   int width, int height,
                                   int downsample_factor) {
  if (width <= 8) {
    // Blocks of width 8 or less are handled without Neon.
    for (int row = 0; row < height; row += downsample_factor) {
      for (int col = 0; col < width; ++col) {
        buf_avg[col] = (int16_t)buf[col] - (int16_t)avg;
      }
      buf += buf_stride;
      buf_avg += buf_avg_stride;
    }
    return;
  }

  const uint8x8_t avg_u8 = vdup_n_u8(avg);
  int row = 0;
  do {
    int col = 0;
    // Vector body: 8 samples per iteration. The widening subtraction wraps in
    // unsigned 16-bit arithmetic; reinterpreting the bits as signed yields the
    // signed difference.
    do {
      const uint8x8_t pixels = vld1_u8(buf + col);
      const uint16x8_t diff = vsubl_u8(pixels, avg_u8);
      vst1q_s16(buf_avg + col, vreinterpretq_s16_u16(diff));
      col += 8;
    } while (width - col >= 8);

    // Scalar tail for the remaining (fewer than 8) columns.
    for (; col < width; ++col) {
      buf_avg[col] = (int16_t)buf[col] - (int16_t)avg;
    }

    buf += buf_stride;
    buf_avg += buf_avg_stride;
    row += downsample_factor;
  } while (row < height);
}
2243*77c1e3ccSAndroid Build Coastguard Worker 
av1_compute_stats_downsampled_neon(int wiener_win,const uint8_t * dgd,const uint8_t * src,int16_t * dgd_avg,int16_t * src_avg,int h_start,int h_end,int v_start,int v_end,int dgd_stride,int src_stride,int64_t * M,int64_t * H,int use_downsampled_wiener_stats)2244*77c1e3ccSAndroid Build Coastguard Worker static inline void av1_compute_stats_downsampled_neon(
2245*77c1e3ccSAndroid Build Coastguard Worker     int wiener_win, const uint8_t *dgd, const uint8_t *src, int16_t *dgd_avg,
2246*77c1e3ccSAndroid Build Coastguard Worker     int16_t *src_avg, int h_start, int h_end, int v_start, int v_end,
2247*77c1e3ccSAndroid Build Coastguard Worker     int dgd_stride, int src_stride, int64_t *M, int64_t *H,
2248*77c1e3ccSAndroid Build Coastguard Worker     int use_downsampled_wiener_stats) {
2249*77c1e3ccSAndroid Build Coastguard Worker   assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);
2250*77c1e3ccSAndroid Build Coastguard Worker   assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4);
2251*77c1e3ccSAndroid Build Coastguard Worker   (void)dgd_avg;
2252*77c1e3ccSAndroid Build Coastguard Worker   (void)src_avg;
2253*77c1e3ccSAndroid Build Coastguard Worker 
2254*77c1e3ccSAndroid Build Coastguard Worker   const int wiener_win2 = wiener_win * wiener_win;
2255*77c1e3ccSAndroid Build Coastguard Worker   const int wiener_halfwin = wiener_win >> 1;
2256*77c1e3ccSAndroid Build Coastguard Worker   const int width = h_end - h_start;
2257*77c1e3ccSAndroid Build Coastguard Worker   const int height = v_end - v_start;
2258*77c1e3ccSAndroid Build Coastguard Worker 
2259*77c1e3ccSAndroid Build Coastguard Worker   const uint8_t *dgd_start = dgd + h_start + v_start * dgd_stride;
2260*77c1e3ccSAndroid Build Coastguard Worker   const uint8_t *src_start = src + h_start + v_start * src_stride;
2261*77c1e3ccSAndroid Build Coastguard Worker 
2262*77c1e3ccSAndroid Build Coastguard Worker   // The wiener window will slide along the dgd frame, centered on each pixel.
2263*77c1e3ccSAndroid Build Coastguard Worker   // For the top left pixel and all the pixels on the side of the frame this
2264*77c1e3ccSAndroid Build Coastguard Worker   // means half of the window will be outside of the frame. As such the actual
2265*77c1e3ccSAndroid Build Coastguard Worker   // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
2266*77c1e3ccSAndroid Build Coastguard Worker   // wider and 2 * wiener_halfwin higher than the original dgd buffer.
2267*77c1e3ccSAndroid Build Coastguard Worker   const int vert_offset = v_start - wiener_halfwin;
2268*77c1e3ccSAndroid Build Coastguard Worker   const int horiz_offset = h_start - wiener_halfwin;
2269*77c1e3ccSAndroid Build Coastguard Worker   const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
2270*77c1e3ccSAndroid Build Coastguard Worker 
2271*77c1e3ccSAndroid Build Coastguard Worker   uint8_t avg = find_average_neon(dgd_start, dgd_stride, width, height);
2272*77c1e3ccSAndroid Build Coastguard Worker 
2273*77c1e3ccSAndroid Build Coastguard Worker   // Since the height is not necessarily a multiple of the downsample factor,
2274*77c1e3ccSAndroid Build Coastguard Worker   // the last line of src will be scaled according to how many rows remain.
2275*77c1e3ccSAndroid Build Coastguard Worker   int downsample_factor =
2276*77c1e3ccSAndroid Build Coastguard Worker       use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
2277*77c1e3ccSAndroid Build Coastguard Worker 
2278*77c1e3ccSAndroid Build Coastguard Worker   int downsampled_height = height / downsample_factor;
2279*77c1e3ccSAndroid Build Coastguard Worker   int downsample_remainder = height % downsample_factor;
2280*77c1e3ccSAndroid Build Coastguard Worker 
2281*77c1e3ccSAndroid Build Coastguard Worker   memset(M, 0, wiener_win2 * sizeof(*M));
2282*77c1e3ccSAndroid Build Coastguard Worker   memset(H, 0, wiener_win2 * wiener_win2 * sizeof(*H));
2283*77c1e3ccSAndroid Build Coastguard Worker 
2284*77c1e3ccSAndroid Build Coastguard Worker   // Calculate the M and H matrices for the normal and downsampled cases.
2285*77c1e3ccSAndroid Build Coastguard Worker   if (downsampled_height > 0) {
2286*77c1e3ccSAndroid Build Coastguard Worker     if (wiener_win == WIENER_WIN) {
2287*77c1e3ccSAndroid Build Coastguard Worker       compute_stats_win7_downsampled_neon(
2288*77c1e3ccSAndroid Build Coastguard Worker           dgd_win, src_start, width, downsampled_height, dgd_stride, src_stride,
2289*77c1e3ccSAndroid Build Coastguard Worker           avg, M, H, downsample_factor);
2290*77c1e3ccSAndroid Build Coastguard Worker     } else {
2291*77c1e3ccSAndroid Build Coastguard Worker       compute_stats_win5_downsampled_neon(
2292*77c1e3ccSAndroid Build Coastguard Worker           dgd_win, src_start, width, downsampled_height, dgd_stride, src_stride,
2293*77c1e3ccSAndroid Build Coastguard Worker           avg, M, H, downsample_factor);
2294*77c1e3ccSAndroid Build Coastguard Worker     }
2295*77c1e3ccSAndroid Build Coastguard Worker   }
2296*77c1e3ccSAndroid Build Coastguard Worker 
2297*77c1e3ccSAndroid Build Coastguard Worker   // Accumulate the remaining last rows in the downsampled case.
2298*77c1e3ccSAndroid Build Coastguard Worker   if (downsample_remainder > 0) {
2299*77c1e3ccSAndroid Build Coastguard Worker     int remainder_offset = height - downsample_remainder;
2300*77c1e3ccSAndroid Build Coastguard Worker     if (wiener_win == WIENER_WIN) {
2301*77c1e3ccSAndroid Build Coastguard Worker       compute_stats_win7_downsampled_neon(
2302*77c1e3ccSAndroid Build Coastguard Worker           dgd_win + remainder_offset * dgd_stride,
2303*77c1e3ccSAndroid Build Coastguard Worker           src_start + remainder_offset * src_stride, width, 1, dgd_stride,
2304*77c1e3ccSAndroid Build Coastguard Worker           src_stride, avg, M, H, downsample_remainder);
2305*77c1e3ccSAndroid Build Coastguard Worker     } else {
2306*77c1e3ccSAndroid Build Coastguard Worker       compute_stats_win5_downsampled_neon(
2307*77c1e3ccSAndroid Build Coastguard Worker           dgd_win + remainder_offset * dgd_stride,
2308*77c1e3ccSAndroid Build Coastguard Worker           src_start + remainder_offset * src_stride, width, 1, dgd_stride,
2309*77c1e3ccSAndroid Build Coastguard Worker           src_stride, avg, M, H, downsample_remainder);
2310*77c1e3ccSAndroid Build Coastguard Worker     }
2311*77c1e3ccSAndroid Build Coastguard Worker   }
2312*77c1e3ccSAndroid Build Coastguard Worker }
2313*77c1e3ccSAndroid Build Coastguard Worker 
av1_compute_stats_neon(int32_t wiener_win,const uint8_t * dgd,const uint8_t * src,int16_t * dgd_avg,int16_t * src_avg,int32_t h_start,int32_t h_end,int32_t v_start,int32_t v_end,int32_t dgd_stride,int32_t src_stride,int64_t * M,int64_t * H,int use_downsampled_wiener_stats)2314*77c1e3ccSAndroid Build Coastguard Worker void av1_compute_stats_neon(int32_t wiener_win, const uint8_t *dgd,
2315*77c1e3ccSAndroid Build Coastguard Worker                             const uint8_t *src, int16_t *dgd_avg,
2316*77c1e3ccSAndroid Build Coastguard Worker                             int16_t *src_avg, int32_t h_start, int32_t h_end,
2317*77c1e3ccSAndroid Build Coastguard Worker                             int32_t v_start, int32_t v_end, int32_t dgd_stride,
2318*77c1e3ccSAndroid Build Coastguard Worker                             int32_t src_stride, int64_t *M, int64_t *H,
2319*77c1e3ccSAndroid Build Coastguard Worker                             int use_downsampled_wiener_stats) {
2320*77c1e3ccSAndroid Build Coastguard Worker   assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4);
2321*77c1e3ccSAndroid Build Coastguard Worker   if (use_downsampled_wiener_stats) {
2322*77c1e3ccSAndroid Build Coastguard Worker     av1_compute_stats_downsampled_neon(
2323*77c1e3ccSAndroid Build Coastguard Worker         wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end, v_start, v_end,
2324*77c1e3ccSAndroid Build Coastguard Worker         dgd_stride, src_stride, M, H, use_downsampled_wiener_stats);
2325*77c1e3ccSAndroid Build Coastguard Worker     return;
2326*77c1e3ccSAndroid Build Coastguard Worker   }
2327*77c1e3ccSAndroid Build Coastguard Worker 
2328*77c1e3ccSAndroid Build Coastguard Worker   const int32_t wiener_win2 = wiener_win * wiener_win;
2329*77c1e3ccSAndroid Build Coastguard Worker   const int32_t wiener_halfwin = (wiener_win >> 1);
2330*77c1e3ccSAndroid Build Coastguard Worker   const int32_t width = h_end - h_start;
2331*77c1e3ccSAndroid Build Coastguard Worker   const int32_t height = v_end - v_start;
2332*77c1e3ccSAndroid Build Coastguard Worker   const uint8_t *dgd_start = dgd + h_start + v_start * dgd_stride;
2333*77c1e3ccSAndroid Build Coastguard Worker   const uint8_t avg = find_average_neon(dgd_start, dgd_stride, width, height);
2334*77c1e3ccSAndroid Build Coastguard Worker   const int32_t d_stride = (width + 2 * wiener_halfwin + 15) & ~15;
2335*77c1e3ccSAndroid Build Coastguard Worker   const int32_t s_stride = (width + 15) & ~15;
2336*77c1e3ccSAndroid Build Coastguard Worker 
2337*77c1e3ccSAndroid Build Coastguard Worker   compute_sub_avg(src + v_start * src_stride + h_start, src_stride, avg,
2338*77c1e3ccSAndroid Build Coastguard Worker                   src_avg, s_stride, width, height, 1);
2339*77c1e3ccSAndroid Build Coastguard Worker   compute_sub_avg(
2340*77c1e3ccSAndroid Build Coastguard Worker       dgd + (v_start - wiener_halfwin) * dgd_stride + h_start - wiener_halfwin,
2341*77c1e3ccSAndroid Build Coastguard Worker       dgd_stride, avg, dgd_avg, d_stride, width + 2 * wiener_halfwin,
2342*77c1e3ccSAndroid Build Coastguard Worker       height + 2 * wiener_halfwin, 1);
2343*77c1e3ccSAndroid Build Coastguard Worker 
2344*77c1e3ccSAndroid Build Coastguard Worker   if (wiener_win == WIENER_WIN) {
2345*77c1e3ccSAndroid Build Coastguard Worker     compute_stats_win7_neon(dgd_avg, d_stride, src_avg, s_stride, width, height,
2346*77c1e3ccSAndroid Build Coastguard Worker                             M, H);
2347*77c1e3ccSAndroid Build Coastguard Worker   } else if (wiener_win == WIENER_WIN_CHROMA) {
2348*77c1e3ccSAndroid Build Coastguard Worker     compute_stats_win5_neon(dgd_avg, d_stride, src_avg, s_stride, width, height,
2349*77c1e3ccSAndroid Build Coastguard Worker                             M, H);
2350*77c1e3ccSAndroid Build Coastguard Worker   }
2351*77c1e3ccSAndroid Build Coastguard Worker 
2352*77c1e3ccSAndroid Build Coastguard Worker   // H is a symmetric matrix, so we only need to fill out the upper triangle.
2353*77c1e3ccSAndroid Build Coastguard Worker   // We can copy it down to the lower triangle outside the (i, j) loops.
2354*77c1e3ccSAndroid Build Coastguard Worker   diagonal_copy_stats_neon(wiener_win2, H);
2355*77c1e3ccSAndroid Build Coastguard Worker }
2356*77c1e3ccSAndroid Build Coastguard Worker 
// Accumulates the projection statistics when both filter outputs are used.
//
// With u = dat << SGRPROJ_RST_BITS, s = (src << SGRPROJ_RST_BITS) - u,
// f0 = flt0 - u and f1 = flt1 - u, per pixel, this writes:
//   H[0][0] = sum(f0 * f0) / size    H[0][1] = H[1][0] = sum(f0 * f1) / size
//   H[1][1] = sum(f1 * f1) / size
//   C[0]    = sum(f0 * s) / size     C[1]    = sum(f1 * s) / size
// where size = width * height. `width` must be a multiple of 8.
static inline void calc_proj_params_r0_r1_neon(
    const uint8_t *src8, int width, int height, int src_stride,
    const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
    int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
  assert(width % 8 == 0);
  const int size = width * height;

  // Two 64-bit accumulators per statistic: one fed by pixels 0..3 and one by
  // pixels 4..7 of every group of 8.
  int64x2_t sum_f0f0_lo = vdupq_n_s64(0);
  int64x2_t sum_f0f0_hi = vdupq_n_s64(0);
  int64x2_t sum_f1f1_lo = vdupq_n_s64(0);
  int64x2_t sum_f1f1_hi = vdupq_n_s64(0);
  int64x2_t sum_f0f1_lo = vdupq_n_s64(0);
  int64x2_t sum_f0f1_hi = vdupq_n_s64(0);
  int64x2_t sum_f0s_lo = vdupq_n_s64(0);
  int64x2_t sum_f0s_hi = vdupq_n_s64(0);
  int64x2_t sum_f1s_lo = vdupq_n_s64(0);
  int64x2_t sum_f1s_hi = vdupq_n_s64(0);

  int rows_left = height;
  do {
    int x = 0;
    do {
      // Upscale the 8-bit source and degraded pixels.
      const int16x8_t s_s16 = vreinterpretq_s16_u16(
          vshll_n_u8(vld1_u8(src8 + x), SGRPROJ_RST_BITS));
      const int16x8_t u = vreinterpretq_s16_u16(
          vshll_n_u8(vld1_u8(dat8 + x), SGRPROJ_RST_BITS));
      const int16x4_t u_lo = vget_low_s16(u);
      const int16x4_t u_hi = vget_high_s16(u);

      // Differences against the upscaled degraded pixels, widened to 32 bits.
      const int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), u_lo);
      const int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), u_hi);
      const int32x4_t f0_lo = vsubw_s16(vld1q_s32(flt0 + x), u_lo);
      const int32x4_t f0_hi = vsubw_s16(vld1q_s32(flt0 + x + 4), u_hi);
      const int32x4_t f1_lo = vsubw_s16(vld1q_s32(flt1 + x), u_lo);
      const int32x4_t f1_hi = vsubw_s16(vld1q_s32(flt1 + x + 4), u_hi);

      // Split into 64-bit halves once; each half feeds a widening
      // multiply-accumulate.
      const int32x2_t f0_a = vget_low_s32(f0_lo);
      const int32x2_t f0_b = vget_high_s32(f0_lo);
      const int32x2_t f0_c = vget_low_s32(f0_hi);
      const int32x2_t f0_d = vget_high_s32(f0_hi);
      const int32x2_t f1_a = vget_low_s32(f1_lo);
      const int32x2_t f1_b = vget_high_s32(f1_lo);
      const int32x2_t f1_c = vget_low_s32(f1_hi);
      const int32x2_t f1_d = vget_high_s32(f1_hi);
      const int32x2_t s_a = vget_low_s32(s_lo);
      const int32x2_t s_b = vget_high_s32(s_lo);
      const int32x2_t s_c = vget_low_s32(s_hi);
      const int32x2_t s_d = vget_high_s32(s_hi);

      sum_f0f0_lo = vmlal_s32(sum_f0f0_lo, f0_a, f0_a);
      sum_f0f0_lo = vmlal_s32(sum_f0f0_lo, f0_b, f0_b);
      sum_f0f0_hi = vmlal_s32(sum_f0f0_hi, f0_c, f0_c);
      sum_f0f0_hi = vmlal_s32(sum_f0f0_hi, f0_d, f0_d);

      sum_f1f1_lo = vmlal_s32(sum_f1f1_lo, f1_a, f1_a);
      sum_f1f1_lo = vmlal_s32(sum_f1f1_lo, f1_b, f1_b);
      sum_f1f1_hi = vmlal_s32(sum_f1f1_hi, f1_c, f1_c);
      sum_f1f1_hi = vmlal_s32(sum_f1f1_hi, f1_d, f1_d);

      sum_f0f1_lo = vmlal_s32(sum_f0f1_lo, f0_a, f1_a);
      sum_f0f1_lo = vmlal_s32(sum_f0f1_lo, f0_b, f1_b);
      sum_f0f1_hi = vmlal_s32(sum_f0f1_hi, f0_c, f1_c);
      sum_f0f1_hi = vmlal_s32(sum_f0f1_hi, f0_d, f1_d);

      sum_f0s_lo = vmlal_s32(sum_f0s_lo, f0_a, s_a);
      sum_f0s_lo = vmlal_s32(sum_f0s_lo, f0_b, s_b);
      sum_f0s_hi = vmlal_s32(sum_f0s_hi, f0_c, s_c);
      sum_f0s_hi = vmlal_s32(sum_f0s_hi, f0_d, s_d);

      sum_f1s_lo = vmlal_s32(sum_f1s_lo, f1_a, s_a);
      sum_f1s_lo = vmlal_s32(sum_f1s_lo, f1_b, s_b);
      sum_f1s_hi = vmlal_s32(sum_f1s_hi, f1_c, s_c);
      sum_f1s_hi = vmlal_s32(sum_f1s_hi, f1_d, s_d);

      x += 8;
    } while (x != width);

    src8 += src_stride;
    dat8 += dat_stride;
    flt0 += flt0_stride;
    flt1 += flt1_stride;
  } while (--rows_left != 0);

  // Reduce each accumulator pair to a scalar and normalise by the pixel count.
  H[0][0] = horizontal_add_s64x2(vaddq_s64(sum_f0f0_lo, sum_f0f0_hi)) / size;
  H[0][1] = horizontal_add_s64x2(vaddq_s64(sum_f0f1_lo, sum_f0f1_hi)) / size;
  H[1][1] = horizontal_add_s64x2(vaddq_s64(sum_f1f1_lo, sum_f1f1_hi)) / size;
  H[1][0] = H[0][1];
  C[0] = horizontal_add_s64x2(vaddq_s64(sum_f0s_lo, sum_f0s_hi)) / size;
  C[1] = horizontal_add_s64x2(vaddq_s64(sum_f1s_lo, sum_f1s_hi)) / size;
}
2445*77c1e3ccSAndroid Build Coastguard Worker 
// Accumulates the projection statistics when only the first filter output is
// used.
//
// With u = dat << SGRPROJ_RST_BITS, s = (src << SGRPROJ_RST_BITS) - u and
// f0 = flt0 - u, per pixel, this writes:
//   H[0][0] = sum(f0 * f0) / size
//   C[0]    = sum(f0 * s) / size
// where size = width * height. No other entries of H or C are written.
// `width` must be a multiple of 8.
static inline void calc_proj_params_r0_neon(const uint8_t *src8, int width,
                                            int height, int src_stride,
                                            const uint8_t *dat8, int dat_stride,
                                            int32_t *flt0, int flt0_stride,
                                            int64_t H[2][2], int64_t C[2]) {
  assert(width % 8 == 0);
  const int size = width * height;

  // Two 64-bit accumulators per statistic: one fed by pixels 0..3 and one by
  // pixels 4..7 of every group of 8.
  int64x2_t sum_f0f0_lo = vdupq_n_s64(0);
  int64x2_t sum_f0f0_hi = vdupq_n_s64(0);
  int64x2_t sum_f0s_lo = vdupq_n_s64(0);
  int64x2_t sum_f0s_hi = vdupq_n_s64(0);

  int rows_left = height;
  do {
    int x = 0;
    do {
      // Upscale the 8-bit source and degraded pixels.
      const int16x8_t s_s16 = vreinterpretq_s16_u16(
          vshll_n_u8(vld1_u8(src8 + x), SGRPROJ_RST_BITS));
      const int16x8_t u = vreinterpretq_s16_u16(
          vshll_n_u8(vld1_u8(dat8 + x), SGRPROJ_RST_BITS));
      const int16x4_t u_lo = vget_low_s16(u);
      const int16x4_t u_hi = vget_high_s16(u);

      // Differences against the upscaled degraded pixels, widened to 32 bits.
      const int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), u_lo);
      const int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), u_hi);
      const int32x4_t f0_lo = vsubw_s16(vld1q_s32(flt0 + x), u_lo);
      const int32x4_t f0_hi = vsubw_s16(vld1q_s32(flt0 + x + 4), u_hi);

      // Split into 64-bit halves once; each half feeds a widening
      // multiply-accumulate.
      const int32x2_t f0_a = vget_low_s32(f0_lo);
      const int32x2_t f0_b = vget_high_s32(f0_lo);
      const int32x2_t f0_c = vget_low_s32(f0_hi);
      const int32x2_t f0_d = vget_high_s32(f0_hi);

      sum_f0f0_lo = vmlal_s32(sum_f0f0_lo, f0_a, f0_a);
      sum_f0f0_lo = vmlal_s32(sum_f0f0_lo, f0_b, f0_b);
      sum_f0f0_hi = vmlal_s32(sum_f0f0_hi, f0_c, f0_c);
      sum_f0f0_hi = vmlal_s32(sum_f0f0_hi, f0_d, f0_d);

      sum_f0s_lo = vmlal_s32(sum_f0s_lo, f0_a, vget_low_s32(s_lo));
      sum_f0s_lo = vmlal_s32(sum_f0s_lo, f0_b, vget_high_s32(s_lo));
      sum_f0s_hi = vmlal_s32(sum_f0s_hi, f0_c, vget_low_s32(s_hi));
      sum_f0s_hi = vmlal_s32(sum_f0s_hi, f0_d, vget_high_s32(s_hi));

      x += 8;
    } while (x != width);

    src8 += src_stride;
    dat8 += dat_stride;
    flt0 += flt0_stride;
  } while (--rows_left != 0);

  // Reduce each accumulator pair to a scalar and normalise by the pixel count.
  H[0][0] = horizontal_add_s64x2(vaddq_s64(sum_f0f0_lo, sum_f0f0_hi)) / size;
  C[0] = horizontal_add_s64x2(vaddq_s64(sum_f0s_lo, sum_f0s_hi)) / size;
}
2503*77c1e3ccSAndroid Build Coastguard Worker 
calc_proj_params_r1_neon(const uint8_t * src8,int width,int height,int src_stride,const uint8_t * dat8,int dat_stride,int32_t * flt1,int flt1_stride,int64_t H[2][2],int64_t C[2])2504*77c1e3ccSAndroid Build Coastguard Worker static inline void calc_proj_params_r1_neon(const uint8_t *src8, int width,
2505*77c1e3ccSAndroid Build Coastguard Worker                                             int height, int src_stride,
2506*77c1e3ccSAndroid Build Coastguard Worker                                             const uint8_t *dat8, int dat_stride,
2507*77c1e3ccSAndroid Build Coastguard Worker                                             int32_t *flt1, int flt1_stride,
2508*77c1e3ccSAndroid Build Coastguard Worker                                             int64_t H[2][2], int64_t C[2]) {
2509*77c1e3ccSAndroid Build Coastguard Worker   assert(width % 8 == 0);
2510*77c1e3ccSAndroid Build Coastguard Worker   const int size = width * height;
2511*77c1e3ccSAndroid Build Coastguard Worker 
2512*77c1e3ccSAndroid Build Coastguard Worker   int64x2_t h11_lo = vdupq_n_s64(0);
2513*77c1e3ccSAndroid Build Coastguard Worker   int64x2_t h11_hi = vdupq_n_s64(0);
2514*77c1e3ccSAndroid Build Coastguard Worker   int64x2_t c1_lo = vdupq_n_s64(0);
2515*77c1e3ccSAndroid Build Coastguard Worker   int64x2_t c1_hi = vdupq_n_s64(0);
2516*77c1e3ccSAndroid Build Coastguard Worker 
2517*77c1e3ccSAndroid Build Coastguard Worker   do {
2518*77c1e3ccSAndroid Build Coastguard Worker     const uint8_t *src_ptr = src8;
2519*77c1e3ccSAndroid Build Coastguard Worker     const uint8_t *dat_ptr = dat8;
2520*77c1e3ccSAndroid Build Coastguard Worker     int32_t *flt1_ptr = flt1;
2521*77c1e3ccSAndroid Build Coastguard Worker     int w = width;
2522*77c1e3ccSAndroid Build Coastguard Worker 
2523*77c1e3ccSAndroid Build Coastguard Worker     do {
2524*77c1e3ccSAndroid Build Coastguard Worker       uint8x8_t s = vld1_u8(src_ptr);
2525*77c1e3ccSAndroid Build Coastguard Worker       uint8x8_t d = vld1_u8(dat_ptr);
2526*77c1e3ccSAndroid Build Coastguard Worker       int32x4_t f1_lo = vld1q_s32(flt1_ptr);
2527*77c1e3ccSAndroid Build Coastguard Worker       int32x4_t f1_hi = vld1q_s32(flt1_ptr + 4);
2528*77c1e3ccSAndroid Build Coastguard Worker 
2529*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t u = vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS));
2530*77c1e3ccSAndroid Build Coastguard Worker       int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, SGRPROJ_RST_BITS));
2531*77c1e3ccSAndroid Build Coastguard Worker 
2532*77c1e3ccSAndroid Build Coastguard Worker       int32x4_t s_lo = vsubl_s16(vget_low_s16(s_s16), vget_low_s16(u));
2533*77c1e3ccSAndroid Build Coastguard Worker       int32x4_t s_hi = vsubl_s16(vget_high_s16(s_s16), vget_high_s16(u));
2534*77c1e3ccSAndroid Build Coastguard Worker       f1_lo = vsubw_s16(f1_lo, vget_low_s16(u));
2535*77c1e3ccSAndroid Build Coastguard Worker       f1_hi = vsubw_s16(f1_hi, vget_high_s16(u));
2536*77c1e3ccSAndroid Build Coastguard Worker 
2537*77c1e3ccSAndroid Build Coastguard Worker       h11_lo = vmlal_s32(h11_lo, vget_low_s32(f1_lo), vget_low_s32(f1_lo));
2538*77c1e3ccSAndroid Build Coastguard Worker       h11_lo = vmlal_s32(h11_lo, vget_high_s32(f1_lo), vget_high_s32(f1_lo));
2539*77c1e3ccSAndroid Build Coastguard Worker       h11_hi = vmlal_s32(h11_hi, vget_low_s32(f1_hi), vget_low_s32(f1_hi));
2540*77c1e3ccSAndroid Build Coastguard Worker       h11_hi = vmlal_s32(h11_hi, vget_high_s32(f1_hi), vget_high_s32(f1_hi));
2541*77c1e3ccSAndroid Build Coastguard Worker 
2542*77c1e3ccSAndroid Build Coastguard Worker       c1_lo = vmlal_s32(c1_lo, vget_low_s32(f1_lo), vget_low_s32(s_lo));
2543*77c1e3ccSAndroid Build Coastguard Worker       c1_lo = vmlal_s32(c1_lo, vget_high_s32(f1_lo), vget_high_s32(s_lo));
2544*77c1e3ccSAndroid Build Coastguard Worker       c1_hi = vmlal_s32(c1_hi, vget_low_s32(f1_hi), vget_low_s32(s_hi));
2545*77c1e3ccSAndroid Build Coastguard Worker       c1_hi = vmlal_s32(c1_hi, vget_high_s32(f1_hi), vget_high_s32(s_hi));
2546*77c1e3ccSAndroid Build Coastguard Worker 
2547*77c1e3ccSAndroid Build Coastguard Worker       src_ptr += 8;
2548*77c1e3ccSAndroid Build Coastguard Worker       dat_ptr += 8;
2549*77c1e3ccSAndroid Build Coastguard Worker       flt1_ptr += 8;
2550*77c1e3ccSAndroid Build Coastguard Worker       w -= 8;
2551*77c1e3ccSAndroid Build Coastguard Worker     } while (w != 0);
2552*77c1e3ccSAndroid Build Coastguard Worker 
2553*77c1e3ccSAndroid Build Coastguard Worker     src8 += src_stride;
2554*77c1e3ccSAndroid Build Coastguard Worker     dat8 += dat_stride;
2555*77c1e3ccSAndroid Build Coastguard Worker     flt1 += flt1_stride;
2556*77c1e3ccSAndroid Build Coastguard Worker   } while (--height != 0);
2557*77c1e3ccSAndroid Build Coastguard Worker 
2558*77c1e3ccSAndroid Build Coastguard Worker   H[1][1] = horizontal_add_s64x2(vaddq_s64(h11_lo, h11_hi)) / size;
2559*77c1e3ccSAndroid Build Coastguard Worker   C[1] = horizontal_add_s64x2(vaddq_s64(c1_lo, c1_hi)) / size;
2560*77c1e3ccSAndroid Build Coastguard Worker }
2561*77c1e3ccSAndroid Build Coastguard Worker 
2562*77c1e3ccSAndroid Build Coastguard Worker // The function calls 3 subfunctions for the following cases :
2563*77c1e3ccSAndroid Build Coastguard Worker // 1) When params->r[0] > 0 and params->r[1] > 0. In this case all elements
2564*77c1e3ccSAndroid Build Coastguard Worker //    of C and H need to be computed.
2565*77c1e3ccSAndroid Build Coastguard Worker // 2) When only params->r[0] > 0. In this case only H[0][0] and C[0] are
2566*77c1e3ccSAndroid Build Coastguard Worker //    non-zero and need to be computed.
2567*77c1e3ccSAndroid Build Coastguard Worker // 3) When only params->r[1] > 0. In this case only H[1][1] and C[1] are
2568*77c1e3ccSAndroid Build Coastguard Worker //    non-zero and need to be computed.
av1_calc_proj_params_neon(const uint8_t * src8,int width,int height,int src_stride,const uint8_t * dat8,int dat_stride,int32_t * flt0,int flt0_stride,int32_t * flt1,int flt1_stride,int64_t H[2][2],int64_t C[2],const sgr_params_type * params)2569*77c1e3ccSAndroid Build Coastguard Worker void av1_calc_proj_params_neon(const uint8_t *src8, int width, int height,
2570*77c1e3ccSAndroid Build Coastguard Worker                                int src_stride, const uint8_t *dat8,
2571*77c1e3ccSAndroid Build Coastguard Worker                                int dat_stride, int32_t *flt0, int flt0_stride,
2572*77c1e3ccSAndroid Build Coastguard Worker                                int32_t *flt1, int flt1_stride, int64_t H[2][2],
2573*77c1e3ccSAndroid Build Coastguard Worker                                int64_t C[2], const sgr_params_type *params) {
2574*77c1e3ccSAndroid Build Coastguard Worker   if ((params->r[0] > 0) && (params->r[1] > 0)) {
2575*77c1e3ccSAndroid Build Coastguard Worker     calc_proj_params_r0_r1_neon(src8, width, height, src_stride, dat8,
2576*77c1e3ccSAndroid Build Coastguard Worker                                 dat_stride, flt0, flt0_stride, flt1,
2577*77c1e3ccSAndroid Build Coastguard Worker                                 flt1_stride, H, C);
2578*77c1e3ccSAndroid Build Coastguard Worker   } else if (params->r[0] > 0) {
2579*77c1e3ccSAndroid Build Coastguard Worker     calc_proj_params_r0_neon(src8, width, height, src_stride, dat8, dat_stride,
2580*77c1e3ccSAndroid Build Coastguard Worker                              flt0, flt0_stride, H, C);
2581*77c1e3ccSAndroid Build Coastguard Worker   } else if (params->r[1] > 0) {
2582*77c1e3ccSAndroid Build Coastguard Worker     calc_proj_params_r1_neon(src8, width, height, src_stride, dat8, dat_stride,
2583*77c1e3ccSAndroid Build Coastguard Worker                              flt1, flt1_stride, H, C);
2584*77c1e3ccSAndroid Build Coastguard Worker   }
2585*77c1e3ccSAndroid Build Coastguard Worker }
2586