xref: /aosp_15_r20/external/libaom/av1/common/arm/selfguided_neon.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker 
12*77c1e3ccSAndroid Build Coastguard Worker #include <arm_neon.h>
13*77c1e3ccSAndroid Build Coastguard Worker #include <assert.h>
14*77c1e3ccSAndroid Build Coastguard Worker 
15*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_config.h"
16*77c1e3ccSAndroid Build Coastguard Worker #include "config/av1_rtcd.h"
17*77c1e3ccSAndroid Build Coastguard Worker 
18*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/aom_dsp_common.h"
19*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/txfm_common.h"
20*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/mem_neon.h"
21*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/transpose_neon.h"
22*77c1e3ccSAndroid Build Coastguard Worker #include "aom_mem/aom_mem.h"
23*77c1e3ccSAndroid Build Coastguard Worker #include "aom_ports/mem.h"
24*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/av1_common_int.h"
25*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/common.h"
26*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/resize.h"
27*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/restoration.h"
28*77c1e3ccSAndroid Build Coastguard Worker 
// Right-shift amounts used in the final_filter calculation.
// NOTE(review): presumably NB_EVEN applies to even rows and NB_ODD to odd
// rows -- confirm at the use site.
#define NB_EVEN 5
#define NB_ODD 4
32*77c1e3ccSAndroid Build Coastguard Worker 
calc_ab_fast_internal_common(uint32x4_t s0,uint32x4_t s1,uint32x4_t s2,uint32x4_t s3,uint32x4_t s4,uint32x4_t s5,uint32x4_t s6,uint32x4_t s7,int32x4_t sr4,int32x4_t sr5,int32x4_t sr6,int32x4_t sr7,uint32x4_t const_n_val,uint32x4_t s_vec,uint32x4_t const_val,uint32x4_t one_by_n_minus_1_vec,uint16x4_t sgrproj_sgr,int32_t * src1,uint16_t * dst_A16,int32_t * src2,const int buf_stride)33*77c1e3ccSAndroid Build Coastguard Worker static inline void calc_ab_fast_internal_common(
34*77c1e3ccSAndroid Build Coastguard Worker     uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
35*77c1e3ccSAndroid Build Coastguard Worker     uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, int32x4_t sr4, int32x4_t sr5,
36*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t sr6, int32x4_t sr7, uint32x4_t const_n_val, uint32x4_t s_vec,
37*77c1e3ccSAndroid Build Coastguard Worker     uint32x4_t const_val, uint32x4_t one_by_n_minus_1_vec,
38*77c1e3ccSAndroid Build Coastguard Worker     uint16x4_t sgrproj_sgr, int32_t *src1, uint16_t *dst_A16, int32_t *src2,
39*77c1e3ccSAndroid Build Coastguard Worker     const int buf_stride) {
40*77c1e3ccSAndroid Build Coastguard Worker   uint32x4_t q0, q1, q2, q3;
41*77c1e3ccSAndroid Build Coastguard Worker   uint32x4_t p0, p1, p2, p3;
42*77c1e3ccSAndroid Build Coastguard Worker   uint16x4_t d0, d1, d2, d3;
43*77c1e3ccSAndroid Build Coastguard Worker 
44*77c1e3ccSAndroid Build Coastguard Worker   s0 = vmulq_u32(s0, const_n_val);
45*77c1e3ccSAndroid Build Coastguard Worker   s1 = vmulq_u32(s1, const_n_val);
46*77c1e3ccSAndroid Build Coastguard Worker   s2 = vmulq_u32(s2, const_n_val);
47*77c1e3ccSAndroid Build Coastguard Worker   s3 = vmulq_u32(s3, const_n_val);
48*77c1e3ccSAndroid Build Coastguard Worker 
49*77c1e3ccSAndroid Build Coastguard Worker   q0 = vmulq_u32(s4, s4);
50*77c1e3ccSAndroid Build Coastguard Worker   q1 = vmulq_u32(s5, s5);
51*77c1e3ccSAndroid Build Coastguard Worker   q2 = vmulq_u32(s6, s6);
52*77c1e3ccSAndroid Build Coastguard Worker   q3 = vmulq_u32(s7, s7);
53*77c1e3ccSAndroid Build Coastguard Worker 
54*77c1e3ccSAndroid Build Coastguard Worker   p0 = vcleq_u32(q0, s0);
55*77c1e3ccSAndroid Build Coastguard Worker   p1 = vcleq_u32(q1, s1);
56*77c1e3ccSAndroid Build Coastguard Worker   p2 = vcleq_u32(q2, s2);
57*77c1e3ccSAndroid Build Coastguard Worker   p3 = vcleq_u32(q3, s3);
58*77c1e3ccSAndroid Build Coastguard Worker 
59*77c1e3ccSAndroid Build Coastguard Worker   q0 = vsubq_u32(s0, q0);
60*77c1e3ccSAndroid Build Coastguard Worker   q1 = vsubq_u32(s1, q1);
61*77c1e3ccSAndroid Build Coastguard Worker   q2 = vsubq_u32(s2, q2);
62*77c1e3ccSAndroid Build Coastguard Worker   q3 = vsubq_u32(s3, q3);
63*77c1e3ccSAndroid Build Coastguard Worker 
64*77c1e3ccSAndroid Build Coastguard Worker   p0 = vandq_u32(p0, q0);
65*77c1e3ccSAndroid Build Coastguard Worker   p1 = vandq_u32(p1, q1);
66*77c1e3ccSAndroid Build Coastguard Worker   p2 = vandq_u32(p2, q2);
67*77c1e3ccSAndroid Build Coastguard Worker   p3 = vandq_u32(p3, q3);
68*77c1e3ccSAndroid Build Coastguard Worker 
69*77c1e3ccSAndroid Build Coastguard Worker   p0 = vmulq_u32(p0, s_vec);
70*77c1e3ccSAndroid Build Coastguard Worker   p1 = vmulq_u32(p1, s_vec);
71*77c1e3ccSAndroid Build Coastguard Worker   p2 = vmulq_u32(p2, s_vec);
72*77c1e3ccSAndroid Build Coastguard Worker   p3 = vmulq_u32(p3, s_vec);
73*77c1e3ccSAndroid Build Coastguard Worker 
74*77c1e3ccSAndroid Build Coastguard Worker   p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS);
75*77c1e3ccSAndroid Build Coastguard Worker   p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS);
76*77c1e3ccSAndroid Build Coastguard Worker   p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS);
77*77c1e3ccSAndroid Build Coastguard Worker   p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS);
78*77c1e3ccSAndroid Build Coastguard Worker 
79*77c1e3ccSAndroid Build Coastguard Worker   p0 = vminq_u32(p0, const_val);
80*77c1e3ccSAndroid Build Coastguard Worker   p1 = vminq_u32(p1, const_val);
81*77c1e3ccSAndroid Build Coastguard Worker   p2 = vminq_u32(p2, const_val);
82*77c1e3ccSAndroid Build Coastguard Worker   p3 = vminq_u32(p3, const_val);
83*77c1e3ccSAndroid Build Coastguard Worker 
84*77c1e3ccSAndroid Build Coastguard Worker   {
85*77c1e3ccSAndroid Build Coastguard Worker     store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3);
86*77c1e3ccSAndroid Build Coastguard Worker 
87*77c1e3ccSAndroid Build Coastguard Worker     for (int x = 0; x < 4; x++) {
88*77c1e3ccSAndroid Build Coastguard Worker       for (int y = 0; y < 4; y++) {
89*77c1e3ccSAndroid Build Coastguard Worker         dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]];
90*77c1e3ccSAndroid Build Coastguard Worker       }
91*77c1e3ccSAndroid Build Coastguard Worker     }
92*77c1e3ccSAndroid Build Coastguard Worker     load_u16_4x4(dst_A16, buf_stride, &d0, &d1, &d2, &d3);
93*77c1e3ccSAndroid Build Coastguard Worker   }
94*77c1e3ccSAndroid Build Coastguard Worker   p0 = vsubl_u16(sgrproj_sgr, d0);
95*77c1e3ccSAndroid Build Coastguard Worker   p1 = vsubl_u16(sgrproj_sgr, d1);
96*77c1e3ccSAndroid Build Coastguard Worker   p2 = vsubl_u16(sgrproj_sgr, d2);
97*77c1e3ccSAndroid Build Coastguard Worker   p3 = vsubl_u16(sgrproj_sgr, d3);
98*77c1e3ccSAndroid Build Coastguard Worker 
99*77c1e3ccSAndroid Build Coastguard Worker   s4 = vmulq_u32(vreinterpretq_u32_s32(sr4), one_by_n_minus_1_vec);
100*77c1e3ccSAndroid Build Coastguard Worker   s5 = vmulq_u32(vreinterpretq_u32_s32(sr5), one_by_n_minus_1_vec);
101*77c1e3ccSAndroid Build Coastguard Worker   s6 = vmulq_u32(vreinterpretq_u32_s32(sr6), one_by_n_minus_1_vec);
102*77c1e3ccSAndroid Build Coastguard Worker   s7 = vmulq_u32(vreinterpretq_u32_s32(sr7), one_by_n_minus_1_vec);
103*77c1e3ccSAndroid Build Coastguard Worker 
104*77c1e3ccSAndroid Build Coastguard Worker   s4 = vmulq_u32(s4, p0);
105*77c1e3ccSAndroid Build Coastguard Worker   s5 = vmulq_u32(s5, p1);
106*77c1e3ccSAndroid Build Coastguard Worker   s6 = vmulq_u32(s6, p2);
107*77c1e3ccSAndroid Build Coastguard Worker   s7 = vmulq_u32(s7, p3);
108*77c1e3ccSAndroid Build Coastguard Worker 
109*77c1e3ccSAndroid Build Coastguard Worker   p0 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS);
110*77c1e3ccSAndroid Build Coastguard Worker   p1 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS);
111*77c1e3ccSAndroid Build Coastguard Worker   p2 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS);
112*77c1e3ccSAndroid Build Coastguard Worker   p3 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS);
113*77c1e3ccSAndroid Build Coastguard Worker 
114*77c1e3ccSAndroid Build Coastguard Worker   store_s32_4x4(src2, buf_stride, vreinterpretq_s32_u32(p0),
115*77c1e3ccSAndroid Build Coastguard Worker                 vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2),
116*77c1e3ccSAndroid Build Coastguard Worker                 vreinterpretq_s32_u32(p3));
117*77c1e3ccSAndroid Build Coastguard Worker }
calc_ab_internal_common(uint32x4_t s0,uint32x4_t s1,uint32x4_t s2,uint32x4_t s3,uint32x4_t s4,uint32x4_t s5,uint32x4_t s6,uint32x4_t s7,uint16x8_t s16_0,uint16x8_t s16_1,uint16x8_t s16_2,uint16x8_t s16_3,uint16x8_t s16_4,uint16x8_t s16_5,uint16x8_t s16_6,uint16x8_t s16_7,uint32x4_t const_n_val,uint32x4_t s_vec,uint32x4_t const_val,uint16x4_t one_by_n_minus_1_vec,uint16x8_t sgrproj_sgr,int32_t * src1,uint16_t * dst_A16,int32_t * dst2,const int buf_stride)118*77c1e3ccSAndroid Build Coastguard Worker static inline void calc_ab_internal_common(
119*77c1e3ccSAndroid Build Coastguard Worker     uint32x4_t s0, uint32x4_t s1, uint32x4_t s2, uint32x4_t s3, uint32x4_t s4,
120*77c1e3ccSAndroid Build Coastguard Worker     uint32x4_t s5, uint32x4_t s6, uint32x4_t s7, uint16x8_t s16_0,
121*77c1e3ccSAndroid Build Coastguard Worker     uint16x8_t s16_1, uint16x8_t s16_2, uint16x8_t s16_3, uint16x8_t s16_4,
122*77c1e3ccSAndroid Build Coastguard Worker     uint16x8_t s16_5, uint16x8_t s16_6, uint16x8_t s16_7,
123*77c1e3ccSAndroid Build Coastguard Worker     uint32x4_t const_n_val, uint32x4_t s_vec, uint32x4_t const_val,
124*77c1e3ccSAndroid Build Coastguard Worker     uint16x4_t one_by_n_minus_1_vec, uint16x8_t sgrproj_sgr, int32_t *src1,
125*77c1e3ccSAndroid Build Coastguard Worker     uint16_t *dst_A16, int32_t *dst2, const int buf_stride) {
126*77c1e3ccSAndroid Build Coastguard Worker   uint16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
127*77c1e3ccSAndroid Build Coastguard Worker   uint32x4_t q0, q1, q2, q3, q4, q5, q6, q7;
128*77c1e3ccSAndroid Build Coastguard Worker   uint32x4_t p0, p1, p2, p3, p4, p5, p6, p7;
129*77c1e3ccSAndroid Build Coastguard Worker 
130*77c1e3ccSAndroid Build Coastguard Worker   s0 = vmulq_u32(s0, const_n_val);
131*77c1e3ccSAndroid Build Coastguard Worker   s1 = vmulq_u32(s1, const_n_val);
132*77c1e3ccSAndroid Build Coastguard Worker   s2 = vmulq_u32(s2, const_n_val);
133*77c1e3ccSAndroid Build Coastguard Worker   s3 = vmulq_u32(s3, const_n_val);
134*77c1e3ccSAndroid Build Coastguard Worker   s4 = vmulq_u32(s4, const_n_val);
135*77c1e3ccSAndroid Build Coastguard Worker   s5 = vmulq_u32(s5, const_n_val);
136*77c1e3ccSAndroid Build Coastguard Worker   s6 = vmulq_u32(s6, const_n_val);
137*77c1e3ccSAndroid Build Coastguard Worker   s7 = vmulq_u32(s7, const_n_val);
138*77c1e3ccSAndroid Build Coastguard Worker 
139*77c1e3ccSAndroid Build Coastguard Worker   d0 = vget_low_u16(s16_4);
140*77c1e3ccSAndroid Build Coastguard Worker   d1 = vget_low_u16(s16_5);
141*77c1e3ccSAndroid Build Coastguard Worker   d2 = vget_low_u16(s16_6);
142*77c1e3ccSAndroid Build Coastguard Worker   d3 = vget_low_u16(s16_7);
143*77c1e3ccSAndroid Build Coastguard Worker   d4 = vget_high_u16(s16_4);
144*77c1e3ccSAndroid Build Coastguard Worker   d5 = vget_high_u16(s16_5);
145*77c1e3ccSAndroid Build Coastguard Worker   d6 = vget_high_u16(s16_6);
146*77c1e3ccSAndroid Build Coastguard Worker   d7 = vget_high_u16(s16_7);
147*77c1e3ccSAndroid Build Coastguard Worker 
148*77c1e3ccSAndroid Build Coastguard Worker   q0 = vmull_u16(d0, d0);
149*77c1e3ccSAndroid Build Coastguard Worker   q1 = vmull_u16(d1, d1);
150*77c1e3ccSAndroid Build Coastguard Worker   q2 = vmull_u16(d2, d2);
151*77c1e3ccSAndroid Build Coastguard Worker   q3 = vmull_u16(d3, d3);
152*77c1e3ccSAndroid Build Coastguard Worker   q4 = vmull_u16(d4, d4);
153*77c1e3ccSAndroid Build Coastguard Worker   q5 = vmull_u16(d5, d5);
154*77c1e3ccSAndroid Build Coastguard Worker   q6 = vmull_u16(d6, d6);
155*77c1e3ccSAndroid Build Coastguard Worker   q7 = vmull_u16(d7, d7);
156*77c1e3ccSAndroid Build Coastguard Worker 
157*77c1e3ccSAndroid Build Coastguard Worker   p0 = vcleq_u32(q0, s0);
158*77c1e3ccSAndroid Build Coastguard Worker   p1 = vcleq_u32(q1, s1);
159*77c1e3ccSAndroid Build Coastguard Worker   p2 = vcleq_u32(q2, s2);
160*77c1e3ccSAndroid Build Coastguard Worker   p3 = vcleq_u32(q3, s3);
161*77c1e3ccSAndroid Build Coastguard Worker   p4 = vcleq_u32(q4, s4);
162*77c1e3ccSAndroid Build Coastguard Worker   p5 = vcleq_u32(q5, s5);
163*77c1e3ccSAndroid Build Coastguard Worker   p6 = vcleq_u32(q6, s6);
164*77c1e3ccSAndroid Build Coastguard Worker   p7 = vcleq_u32(q7, s7);
165*77c1e3ccSAndroid Build Coastguard Worker 
166*77c1e3ccSAndroid Build Coastguard Worker   q0 = vsubq_u32(s0, q0);
167*77c1e3ccSAndroid Build Coastguard Worker   q1 = vsubq_u32(s1, q1);
168*77c1e3ccSAndroid Build Coastguard Worker   q2 = vsubq_u32(s2, q2);
169*77c1e3ccSAndroid Build Coastguard Worker   q3 = vsubq_u32(s3, q3);
170*77c1e3ccSAndroid Build Coastguard Worker   q4 = vsubq_u32(s4, q4);
171*77c1e3ccSAndroid Build Coastguard Worker   q5 = vsubq_u32(s5, q5);
172*77c1e3ccSAndroid Build Coastguard Worker   q6 = vsubq_u32(s6, q6);
173*77c1e3ccSAndroid Build Coastguard Worker   q7 = vsubq_u32(s7, q7);
174*77c1e3ccSAndroid Build Coastguard Worker 
175*77c1e3ccSAndroid Build Coastguard Worker   p0 = vandq_u32(p0, q0);
176*77c1e3ccSAndroid Build Coastguard Worker   p1 = vandq_u32(p1, q1);
177*77c1e3ccSAndroid Build Coastguard Worker   p2 = vandq_u32(p2, q2);
178*77c1e3ccSAndroid Build Coastguard Worker   p3 = vandq_u32(p3, q3);
179*77c1e3ccSAndroid Build Coastguard Worker   p4 = vandq_u32(p4, q4);
180*77c1e3ccSAndroid Build Coastguard Worker   p5 = vandq_u32(p5, q5);
181*77c1e3ccSAndroid Build Coastguard Worker   p6 = vandq_u32(p6, q6);
182*77c1e3ccSAndroid Build Coastguard Worker   p7 = vandq_u32(p7, q7);
183*77c1e3ccSAndroid Build Coastguard Worker 
184*77c1e3ccSAndroid Build Coastguard Worker   p0 = vmulq_u32(p0, s_vec);
185*77c1e3ccSAndroid Build Coastguard Worker   p1 = vmulq_u32(p1, s_vec);
186*77c1e3ccSAndroid Build Coastguard Worker   p2 = vmulq_u32(p2, s_vec);
187*77c1e3ccSAndroid Build Coastguard Worker   p3 = vmulq_u32(p3, s_vec);
188*77c1e3ccSAndroid Build Coastguard Worker   p4 = vmulq_u32(p4, s_vec);
189*77c1e3ccSAndroid Build Coastguard Worker   p5 = vmulq_u32(p5, s_vec);
190*77c1e3ccSAndroid Build Coastguard Worker   p6 = vmulq_u32(p6, s_vec);
191*77c1e3ccSAndroid Build Coastguard Worker   p7 = vmulq_u32(p7, s_vec);
192*77c1e3ccSAndroid Build Coastguard Worker 
193*77c1e3ccSAndroid Build Coastguard Worker   p0 = vrshrq_n_u32(p0, SGRPROJ_MTABLE_BITS);
194*77c1e3ccSAndroid Build Coastguard Worker   p1 = vrshrq_n_u32(p1, SGRPROJ_MTABLE_BITS);
195*77c1e3ccSAndroid Build Coastguard Worker   p2 = vrshrq_n_u32(p2, SGRPROJ_MTABLE_BITS);
196*77c1e3ccSAndroid Build Coastguard Worker   p3 = vrshrq_n_u32(p3, SGRPROJ_MTABLE_BITS);
197*77c1e3ccSAndroid Build Coastguard Worker   p4 = vrshrq_n_u32(p4, SGRPROJ_MTABLE_BITS);
198*77c1e3ccSAndroid Build Coastguard Worker   p5 = vrshrq_n_u32(p5, SGRPROJ_MTABLE_BITS);
199*77c1e3ccSAndroid Build Coastguard Worker   p6 = vrshrq_n_u32(p6, SGRPROJ_MTABLE_BITS);
200*77c1e3ccSAndroid Build Coastguard Worker   p7 = vrshrq_n_u32(p7, SGRPROJ_MTABLE_BITS);
201*77c1e3ccSAndroid Build Coastguard Worker 
202*77c1e3ccSAndroid Build Coastguard Worker   p0 = vminq_u32(p0, const_val);
203*77c1e3ccSAndroid Build Coastguard Worker   p1 = vminq_u32(p1, const_val);
204*77c1e3ccSAndroid Build Coastguard Worker   p2 = vminq_u32(p2, const_val);
205*77c1e3ccSAndroid Build Coastguard Worker   p3 = vminq_u32(p3, const_val);
206*77c1e3ccSAndroid Build Coastguard Worker   p4 = vminq_u32(p4, const_val);
207*77c1e3ccSAndroid Build Coastguard Worker   p5 = vminq_u32(p5, const_val);
208*77c1e3ccSAndroid Build Coastguard Worker   p6 = vminq_u32(p6, const_val);
209*77c1e3ccSAndroid Build Coastguard Worker   p7 = vminq_u32(p7, const_val);
210*77c1e3ccSAndroid Build Coastguard Worker 
211*77c1e3ccSAndroid Build Coastguard Worker   {
212*77c1e3ccSAndroid Build Coastguard Worker     store_u32_4x4((uint32_t *)src1, buf_stride, p0, p1, p2, p3);
213*77c1e3ccSAndroid Build Coastguard Worker     store_u32_4x4((uint32_t *)src1 + 4, buf_stride, p4, p5, p6, p7);
214*77c1e3ccSAndroid Build Coastguard Worker 
215*77c1e3ccSAndroid Build Coastguard Worker     for (int x = 0; x < 4; x++) {
216*77c1e3ccSAndroid Build Coastguard Worker       for (int y = 0; y < 8; y++) {
217*77c1e3ccSAndroid Build Coastguard Worker         dst_A16[x * buf_stride + y] = av1_x_by_xplus1[src1[x * buf_stride + y]];
218*77c1e3ccSAndroid Build Coastguard Worker       }
219*77c1e3ccSAndroid Build Coastguard Worker     }
220*77c1e3ccSAndroid Build Coastguard Worker     load_u16_8x4(dst_A16, buf_stride, &s16_4, &s16_5, &s16_6, &s16_7);
221*77c1e3ccSAndroid Build Coastguard Worker   }
222*77c1e3ccSAndroid Build Coastguard Worker 
223*77c1e3ccSAndroid Build Coastguard Worker   s16_4 = vsubq_u16(sgrproj_sgr, s16_4);
224*77c1e3ccSAndroid Build Coastguard Worker   s16_5 = vsubq_u16(sgrproj_sgr, s16_5);
225*77c1e3ccSAndroid Build Coastguard Worker   s16_6 = vsubq_u16(sgrproj_sgr, s16_6);
226*77c1e3ccSAndroid Build Coastguard Worker   s16_7 = vsubq_u16(sgrproj_sgr, s16_7);
227*77c1e3ccSAndroid Build Coastguard Worker 
228*77c1e3ccSAndroid Build Coastguard Worker   s0 = vmull_u16(vget_low_u16(s16_0), one_by_n_minus_1_vec);
229*77c1e3ccSAndroid Build Coastguard Worker   s1 = vmull_u16(vget_low_u16(s16_1), one_by_n_minus_1_vec);
230*77c1e3ccSAndroid Build Coastguard Worker   s2 = vmull_u16(vget_low_u16(s16_2), one_by_n_minus_1_vec);
231*77c1e3ccSAndroid Build Coastguard Worker   s3 = vmull_u16(vget_low_u16(s16_3), one_by_n_minus_1_vec);
232*77c1e3ccSAndroid Build Coastguard Worker   s4 = vmull_u16(vget_high_u16(s16_0), one_by_n_minus_1_vec);
233*77c1e3ccSAndroid Build Coastguard Worker   s5 = vmull_u16(vget_high_u16(s16_1), one_by_n_minus_1_vec);
234*77c1e3ccSAndroid Build Coastguard Worker   s6 = vmull_u16(vget_high_u16(s16_2), one_by_n_minus_1_vec);
235*77c1e3ccSAndroid Build Coastguard Worker   s7 = vmull_u16(vget_high_u16(s16_3), one_by_n_minus_1_vec);
236*77c1e3ccSAndroid Build Coastguard Worker 
237*77c1e3ccSAndroid Build Coastguard Worker   s0 = vmulq_u32(s0, vmovl_u16(vget_low_u16(s16_4)));
238*77c1e3ccSAndroid Build Coastguard Worker   s1 = vmulq_u32(s1, vmovl_u16(vget_low_u16(s16_5)));
239*77c1e3ccSAndroid Build Coastguard Worker   s2 = vmulq_u32(s2, vmovl_u16(vget_low_u16(s16_6)));
240*77c1e3ccSAndroid Build Coastguard Worker   s3 = vmulq_u32(s3, vmovl_u16(vget_low_u16(s16_7)));
241*77c1e3ccSAndroid Build Coastguard Worker   s4 = vmulq_u32(s4, vmovl_u16(vget_high_u16(s16_4)));
242*77c1e3ccSAndroid Build Coastguard Worker   s5 = vmulq_u32(s5, vmovl_u16(vget_high_u16(s16_5)));
243*77c1e3ccSAndroid Build Coastguard Worker   s6 = vmulq_u32(s6, vmovl_u16(vget_high_u16(s16_6)));
244*77c1e3ccSAndroid Build Coastguard Worker   s7 = vmulq_u32(s7, vmovl_u16(vget_high_u16(s16_7)));
245*77c1e3ccSAndroid Build Coastguard Worker 
246*77c1e3ccSAndroid Build Coastguard Worker   p0 = vrshrq_n_u32(s0, SGRPROJ_RECIP_BITS);
247*77c1e3ccSAndroid Build Coastguard Worker   p1 = vrshrq_n_u32(s1, SGRPROJ_RECIP_BITS);
248*77c1e3ccSAndroid Build Coastguard Worker   p2 = vrshrq_n_u32(s2, SGRPROJ_RECIP_BITS);
249*77c1e3ccSAndroid Build Coastguard Worker   p3 = vrshrq_n_u32(s3, SGRPROJ_RECIP_BITS);
250*77c1e3ccSAndroid Build Coastguard Worker   p4 = vrshrq_n_u32(s4, SGRPROJ_RECIP_BITS);
251*77c1e3ccSAndroid Build Coastguard Worker   p5 = vrshrq_n_u32(s5, SGRPROJ_RECIP_BITS);
252*77c1e3ccSAndroid Build Coastguard Worker   p6 = vrshrq_n_u32(s6, SGRPROJ_RECIP_BITS);
253*77c1e3ccSAndroid Build Coastguard Worker   p7 = vrshrq_n_u32(s7, SGRPROJ_RECIP_BITS);
254*77c1e3ccSAndroid Build Coastguard Worker 
255*77c1e3ccSAndroid Build Coastguard Worker   store_s32_4x4(dst2, buf_stride, vreinterpretq_s32_u32(p0),
256*77c1e3ccSAndroid Build Coastguard Worker                 vreinterpretq_s32_u32(p1), vreinterpretq_s32_u32(p2),
257*77c1e3ccSAndroid Build Coastguard Worker                 vreinterpretq_s32_u32(p3));
258*77c1e3ccSAndroid Build Coastguard Worker   store_s32_4x4(dst2 + 4, buf_stride, vreinterpretq_s32_u32(p4),
259*77c1e3ccSAndroid Build Coastguard Worker                 vreinterpretq_s32_u32(p5), vreinterpretq_s32_u32(p6),
260*77c1e3ccSAndroid Build Coastguard Worker                 vreinterpretq_s32_u32(p7));
261*77c1e3ccSAndroid Build Coastguard Worker }
262*77c1e3ccSAndroid Build Coastguard Worker 
// Squares each of t1..t11 (widening 16-bit lanes to 32 bits) and writes four
// overlapping five-term sums of those squares:
//   *r0 = t1^2 + t2^2 + t3^2 + t4^2  + t5^2
//   *r1 = t3^2 + t4^2 + t5^2 + t6^2  + t7^2
//   *r2 = t5^2 + t6^2 + t7^2 + t8^2  + t9^2
//   *r3 = t7^2 + t8^2 + t9^2 + t10^2 + t11^2
// Partial sums are shared between the outputs to avoid repeated additions.
static inline void boxsum2_square_sum_calc(
    int16x4_t t1, int16x4_t t2, int16x4_t t3, int16x4_t t4, int16x4_t t5,
    int16x4_t t6, int16x4_t t7, int16x4_t t8, int16x4_t t9, int16x4_t t10,
    int16x4_t t11, int32x4_t *r0, int32x4_t *r1, int32x4_t *r2, int32x4_t *r3) {
  const int32x4_t sq1 = vmull_s16(t1, t1);
  const int32x4_t sq2 = vmull_s16(t2, t2);
  const int32x4_t sq3 = vmull_s16(t3, t3);
  const int32x4_t sq4 = vmull_s16(t4, t4);
  const int32x4_t sq5 = vmull_s16(t5, t5);
  const int32x4_t sq6 = vmull_s16(t6, t6);
  const int32x4_t sq7 = vmull_s16(t7, t7);
  const int32x4_t sq8 = vmull_s16(t8, t8);
  const int32x4_t sq9 = vmull_s16(t9, t9);
  const int32x4_t sq10 = vmull_s16(t10, t10);
  const int32x4_t sq11 = vmull_s16(t11, t11);

  // Partial sums reused by more than one output.
  const int32x4_t sum_3to5 = vaddq_s32(vaddq_s32(sq3, sq4), sq5);
  const int32x4_t sum_6to7 = vaddq_s32(sq6, sq7);
  const int32x4_t sum_6to9 = vaddq_s32(sum_6to7, vaddq_s32(sq8, sq9));
  // Rows 7..9 are obtained by dropping row 6 from the 6..9 sum.
  const int32x4_t sum_7to9 = vsubq_s32(sum_6to9, sq6);

  *r0 = vaddq_s32(vaddq_s32(sq1, sq2), sum_3to5);
  *r1 = vaddq_s32(sum_6to7, sum_3to5);
  *r2 = vaddq_s32(sq5, sum_6to9);
  *r3 = vaddq_s32(sum_7to9, vaddq_s32(sq10, sq11));
}
296*77c1e3ccSAndroid Build Coastguard Worker 
boxsum2(int16_t * src,const int src_stride,int16_t * dst16,int32_t * dst32,int32_t * dst2,const int dst_stride,const int width,const int height)297*77c1e3ccSAndroid Build Coastguard Worker static inline void boxsum2(int16_t *src, const int src_stride, int16_t *dst16,
298*77c1e3ccSAndroid Build Coastguard Worker                            int32_t *dst32, int32_t *dst2, const int dst_stride,
299*77c1e3ccSAndroid Build Coastguard Worker                            const int width, const int height) {
300*77c1e3ccSAndroid Build Coastguard Worker   assert(width > 2 * SGRPROJ_BORDER_HORZ);
301*77c1e3ccSAndroid Build Coastguard Worker   assert(height > 2 * SGRPROJ_BORDER_VERT);
302*77c1e3ccSAndroid Build Coastguard Worker 
303*77c1e3ccSAndroid Build Coastguard Worker   int16_t *dst1_16_ptr, *src_ptr;
304*77c1e3ccSAndroid Build Coastguard Worker   int32_t *dst2_ptr;
305*77c1e3ccSAndroid Build Coastguard Worker   int h, w, count = 0;
306*77c1e3ccSAndroid Build Coastguard Worker   const int dst_stride_2 = (dst_stride << 1);
307*77c1e3ccSAndroid Build Coastguard Worker   const int dst_stride_8 = (dst_stride << 3);
308*77c1e3ccSAndroid Build Coastguard Worker 
309*77c1e3ccSAndroid Build Coastguard Worker   dst1_16_ptr = dst16;
310*77c1e3ccSAndroid Build Coastguard Worker   dst2_ptr = dst2;
311*77c1e3ccSAndroid Build Coastguard Worker   src_ptr = src;
312*77c1e3ccSAndroid Build Coastguard Worker   w = width;
313*77c1e3ccSAndroid Build Coastguard Worker   {
314*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t t1, t2, t3, t4, t5, t6, t7;
315*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t t8, t9, t10, t11, t12;
316*77c1e3ccSAndroid Build Coastguard Worker 
317*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t q12345, q56789, q34567, q7891011;
318*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t q12, q34, q67, q89, q1011;
319*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t q345, q6789, q789;
320*77c1e3ccSAndroid Build Coastguard Worker 
321*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t r12345, r56789, r34567, r7891011;
322*77c1e3ccSAndroid Build Coastguard Worker 
323*77c1e3ccSAndroid Build Coastguard Worker     do {
324*77c1e3ccSAndroid Build Coastguard Worker       h = height;
325*77c1e3ccSAndroid Build Coastguard Worker       dst1_16_ptr = dst16 + (count << 3);
326*77c1e3ccSAndroid Build Coastguard Worker       dst2_ptr = dst2 + (count << 3);
327*77c1e3ccSAndroid Build Coastguard Worker       src_ptr = src + (count << 3);
328*77c1e3ccSAndroid Build Coastguard Worker 
329*77c1e3ccSAndroid Build Coastguard Worker       dst1_16_ptr += dst_stride_2;
330*77c1e3ccSAndroid Build Coastguard Worker       dst2_ptr += dst_stride_2;
331*77c1e3ccSAndroid Build Coastguard Worker       do {
332*77c1e3ccSAndroid Build Coastguard Worker         load_s16_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4);
333*77c1e3ccSAndroid Build Coastguard Worker         src_ptr += 4 * src_stride;
334*77c1e3ccSAndroid Build Coastguard Worker         load_s16_8x4(src_ptr, src_stride, &t5, &t6, &t7, &t8);
335*77c1e3ccSAndroid Build Coastguard Worker         src_ptr += 4 * src_stride;
336*77c1e3ccSAndroid Build Coastguard Worker         load_s16_8x4(src_ptr, src_stride, &t9, &t10, &t11, &t12);
337*77c1e3ccSAndroid Build Coastguard Worker 
338*77c1e3ccSAndroid Build Coastguard Worker         q12 = vaddq_s16(t1, t2);
339*77c1e3ccSAndroid Build Coastguard Worker         q34 = vaddq_s16(t3, t4);
340*77c1e3ccSAndroid Build Coastguard Worker         q67 = vaddq_s16(t6, t7);
341*77c1e3ccSAndroid Build Coastguard Worker         q89 = vaddq_s16(t8, t9);
342*77c1e3ccSAndroid Build Coastguard Worker         q1011 = vaddq_s16(t10, t11);
343*77c1e3ccSAndroid Build Coastguard Worker         q345 = vaddq_s16(q34, t5);
344*77c1e3ccSAndroid Build Coastguard Worker         q6789 = vaddq_s16(q67, q89);
345*77c1e3ccSAndroid Build Coastguard Worker         q789 = vaddq_s16(q89, t7);
346*77c1e3ccSAndroid Build Coastguard Worker         q12345 = vaddq_s16(q12, q345);
347*77c1e3ccSAndroid Build Coastguard Worker         q34567 = vaddq_s16(q67, q345);
348*77c1e3ccSAndroid Build Coastguard Worker         q56789 = vaddq_s16(t5, q6789);
349*77c1e3ccSAndroid Build Coastguard Worker         q7891011 = vaddq_s16(q789, q1011);
350*77c1e3ccSAndroid Build Coastguard Worker 
351*77c1e3ccSAndroid Build Coastguard Worker         store_s16_8x4(dst1_16_ptr, dst_stride_2, q12345, q34567, q56789,
352*77c1e3ccSAndroid Build Coastguard Worker                       q7891011);
353*77c1e3ccSAndroid Build Coastguard Worker         dst1_16_ptr += dst_stride_8;
354*77c1e3ccSAndroid Build Coastguard Worker 
355*77c1e3ccSAndroid Build Coastguard Worker         boxsum2_square_sum_calc(
356*77c1e3ccSAndroid Build Coastguard Worker             vget_low_s16(t1), vget_low_s16(t2), vget_low_s16(t3),
357*77c1e3ccSAndroid Build Coastguard Worker             vget_low_s16(t4), vget_low_s16(t5), vget_low_s16(t6),
358*77c1e3ccSAndroid Build Coastguard Worker             vget_low_s16(t7), vget_low_s16(t8), vget_low_s16(t9),
359*77c1e3ccSAndroid Build Coastguard Worker             vget_low_s16(t10), vget_low_s16(t11), &r12345, &r34567, &r56789,
360*77c1e3ccSAndroid Build Coastguard Worker             &r7891011);
361*77c1e3ccSAndroid Build Coastguard Worker 
362*77c1e3ccSAndroid Build Coastguard Worker         store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r34567, r56789, r7891011);
363*77c1e3ccSAndroid Build Coastguard Worker 
364*77c1e3ccSAndroid Build Coastguard Worker         boxsum2_square_sum_calc(
365*77c1e3ccSAndroid Build Coastguard Worker             vget_high_s16(t1), vget_high_s16(t2), vget_high_s16(t3),
366*77c1e3ccSAndroid Build Coastguard Worker             vget_high_s16(t4), vget_high_s16(t5), vget_high_s16(t6),
367*77c1e3ccSAndroid Build Coastguard Worker             vget_high_s16(t7), vget_high_s16(t8), vget_high_s16(t9),
368*77c1e3ccSAndroid Build Coastguard Worker             vget_high_s16(t10), vget_high_s16(t11), &r12345, &r34567, &r56789,
369*77c1e3ccSAndroid Build Coastguard Worker             &r7891011);
370*77c1e3ccSAndroid Build Coastguard Worker 
371*77c1e3ccSAndroid Build Coastguard Worker         store_s32_4x4(dst2_ptr + 4, dst_stride_2, r12345, r34567, r56789,
372*77c1e3ccSAndroid Build Coastguard Worker                       r7891011);
373*77c1e3ccSAndroid Build Coastguard Worker         dst2_ptr += (dst_stride_8);
374*77c1e3ccSAndroid Build Coastguard Worker         h -= 8;
375*77c1e3ccSAndroid Build Coastguard Worker       } while (h > 0);
376*77c1e3ccSAndroid Build Coastguard Worker       w -= 8;
377*77c1e3ccSAndroid Build Coastguard Worker       count++;
378*77c1e3ccSAndroid Build Coastguard Worker     } while (w > 0);
379*77c1e3ccSAndroid Build Coastguard Worker 
380*77c1e3ccSAndroid Build Coastguard Worker     // memset needed for row pixels as 2nd stage of boxsum filter uses
381*77c1e3ccSAndroid Build Coastguard Worker     // first 2 rows of dst16, dst2 buffer which is not filled in first stage.
382*77c1e3ccSAndroid Build Coastguard Worker     for (int x = 0; x < 2; x++) {
383*77c1e3ccSAndroid Build Coastguard Worker       memset(dst16 + x * dst_stride, 0, (width + 4) * sizeof(*dst16));
384*77c1e3ccSAndroid Build Coastguard Worker       memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2));
385*77c1e3ccSAndroid Build Coastguard Worker     }
386*77c1e3ccSAndroid Build Coastguard Worker 
387*77c1e3ccSAndroid Build Coastguard Worker     // memset needed for extra columns as 2nd stage of boxsum filter uses
388*77c1e3ccSAndroid Build Coastguard Worker     // last 2 columns of dst16, dst2 buffer which is not filled in first stage.
389*77c1e3ccSAndroid Build Coastguard Worker     for (int x = 2; x < height + 2; x++) {
390*77c1e3ccSAndroid Build Coastguard Worker       int dst_offset = x * dst_stride + width + 2;
391*77c1e3ccSAndroid Build Coastguard Worker       memset(dst16 + dst_offset, 0, 3 * sizeof(*dst16));
392*77c1e3ccSAndroid Build Coastguard Worker       memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2));
393*77c1e3ccSAndroid Build Coastguard Worker     }
394*77c1e3ccSAndroid Build Coastguard Worker   }
395*77c1e3ccSAndroid Build Coastguard Worker 
396*77c1e3ccSAndroid Build Coastguard Worker   {
397*77c1e3ccSAndroid Build Coastguard Worker     int16x4_t s1, s2, s3, s4, s5, s6, s7, s8;
398*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t d1, d2, d3, d4, d5, d6, d7, d8;
399*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t q12345, q34567, q23456, q45678;
400*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t q23, q45, q67;
401*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t q2345, q4567;
402*77c1e3ccSAndroid Build Coastguard Worker 
403*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t r12345, r34567, r23456, r45678;
404*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t r23, r45, r67;
405*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t r2345, r4567;
406*77c1e3ccSAndroid Build Coastguard Worker 
407*77c1e3ccSAndroid Build Coastguard Worker     int32_t *src2_ptr, *dst1_32_ptr;
408*77c1e3ccSAndroid Build Coastguard Worker     int16_t *src1_ptr;
409*77c1e3ccSAndroid Build Coastguard Worker     count = 0;
410*77c1e3ccSAndroid Build Coastguard Worker     h = height;
411*77c1e3ccSAndroid Build Coastguard Worker     do {
412*77c1e3ccSAndroid Build Coastguard Worker       dst1_32_ptr = dst32 + count * dst_stride_8 + (dst_stride_2);
413*77c1e3ccSAndroid Build Coastguard Worker       dst2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2);
414*77c1e3ccSAndroid Build Coastguard Worker       src1_ptr = dst16 + count * dst_stride_8 + (dst_stride_2);
415*77c1e3ccSAndroid Build Coastguard Worker       src2_ptr = dst2 + count * dst_stride_8 + (dst_stride_2);
416*77c1e3ccSAndroid Build Coastguard Worker       w = width;
417*77c1e3ccSAndroid Build Coastguard Worker 
418*77c1e3ccSAndroid Build Coastguard Worker       dst1_32_ptr += 2;
419*77c1e3ccSAndroid Build Coastguard Worker       dst2_ptr += 2;
420*77c1e3ccSAndroid Build Coastguard Worker       load_s16_4x4(src1_ptr, dst_stride_2, &s1, &s2, &s3, &s4);
421*77c1e3ccSAndroid Build Coastguard Worker       transpose_elems_inplace_s16_4x4(&s1, &s2, &s3, &s4);
422*77c1e3ccSAndroid Build Coastguard Worker       load_s32_4x4(src2_ptr, dst_stride_2, &d1, &d2, &d3, &d4);
423*77c1e3ccSAndroid Build Coastguard Worker       transpose_elems_inplace_s32_4x4(&d1, &d2, &d3, &d4);
424*77c1e3ccSAndroid Build Coastguard Worker       do {
425*77c1e3ccSAndroid Build Coastguard Worker         src1_ptr += 4;
426*77c1e3ccSAndroid Build Coastguard Worker         src2_ptr += 4;
427*77c1e3ccSAndroid Build Coastguard Worker         load_s16_4x4(src1_ptr, dst_stride_2, &s5, &s6, &s7, &s8);
428*77c1e3ccSAndroid Build Coastguard Worker         transpose_elems_inplace_s16_4x4(&s5, &s6, &s7, &s8);
429*77c1e3ccSAndroid Build Coastguard Worker         load_s32_4x4(src2_ptr, dst_stride_2, &d5, &d6, &d7, &d8);
430*77c1e3ccSAndroid Build Coastguard Worker         transpose_elems_inplace_s32_4x4(&d5, &d6, &d7, &d8);
431*77c1e3ccSAndroid Build Coastguard Worker         q23 = vaddl_s16(s2, s3);
432*77c1e3ccSAndroid Build Coastguard Worker         q45 = vaddl_s16(s4, s5);
433*77c1e3ccSAndroid Build Coastguard Worker         q67 = vaddl_s16(s6, s7);
434*77c1e3ccSAndroid Build Coastguard Worker         q2345 = vaddq_s32(q23, q45);
435*77c1e3ccSAndroid Build Coastguard Worker         q4567 = vaddq_s32(q45, q67);
436*77c1e3ccSAndroid Build Coastguard Worker         q12345 = vaddq_s32(vmovl_s16(s1), q2345);
437*77c1e3ccSAndroid Build Coastguard Worker         q23456 = vaddq_s32(q2345, vmovl_s16(s6));
438*77c1e3ccSAndroid Build Coastguard Worker         q34567 = vaddq_s32(q4567, vmovl_s16(s3));
439*77c1e3ccSAndroid Build Coastguard Worker         q45678 = vaddq_s32(q4567, vmovl_s16(s8));
440*77c1e3ccSAndroid Build Coastguard Worker 
441*77c1e3ccSAndroid Build Coastguard Worker         transpose_elems_inplace_s32_4x4(&q12345, &q23456, &q34567, &q45678);
442*77c1e3ccSAndroid Build Coastguard Worker         store_s32_4x4(dst1_32_ptr, dst_stride_2, q12345, q23456, q34567,
443*77c1e3ccSAndroid Build Coastguard Worker                       q45678);
444*77c1e3ccSAndroid Build Coastguard Worker         dst1_32_ptr += 4;
445*77c1e3ccSAndroid Build Coastguard Worker         s1 = s5;
446*77c1e3ccSAndroid Build Coastguard Worker         s2 = s6;
447*77c1e3ccSAndroid Build Coastguard Worker         s3 = s7;
448*77c1e3ccSAndroid Build Coastguard Worker         s4 = s8;
449*77c1e3ccSAndroid Build Coastguard Worker 
450*77c1e3ccSAndroid Build Coastguard Worker         r23 = vaddq_s32(d2, d3);
451*77c1e3ccSAndroid Build Coastguard Worker         r45 = vaddq_s32(d4, d5);
452*77c1e3ccSAndroid Build Coastguard Worker         r67 = vaddq_s32(d6, d7);
453*77c1e3ccSAndroid Build Coastguard Worker         r2345 = vaddq_s32(r23, r45);
454*77c1e3ccSAndroid Build Coastguard Worker         r4567 = vaddq_s32(r45, r67);
455*77c1e3ccSAndroid Build Coastguard Worker         r12345 = vaddq_s32(d1, r2345);
456*77c1e3ccSAndroid Build Coastguard Worker         r23456 = vaddq_s32(r2345, d6);
457*77c1e3ccSAndroid Build Coastguard Worker         r34567 = vaddq_s32(r4567, d3);
458*77c1e3ccSAndroid Build Coastguard Worker         r45678 = vaddq_s32(r4567, d8);
459*77c1e3ccSAndroid Build Coastguard Worker 
460*77c1e3ccSAndroid Build Coastguard Worker         transpose_elems_inplace_s32_4x4(&r12345, &r23456, &r34567, &r45678);
461*77c1e3ccSAndroid Build Coastguard Worker         store_s32_4x4(dst2_ptr, dst_stride_2, r12345, r23456, r34567, r45678);
462*77c1e3ccSAndroid Build Coastguard Worker         dst2_ptr += 4;
463*77c1e3ccSAndroid Build Coastguard Worker         d1 = d5;
464*77c1e3ccSAndroid Build Coastguard Worker         d2 = d6;
465*77c1e3ccSAndroid Build Coastguard Worker         d3 = d7;
466*77c1e3ccSAndroid Build Coastguard Worker         d4 = d8;
467*77c1e3ccSAndroid Build Coastguard Worker         w -= 4;
468*77c1e3ccSAndroid Build Coastguard Worker       } while (w > 0);
469*77c1e3ccSAndroid Build Coastguard Worker       h -= 8;
470*77c1e3ccSAndroid Build Coastguard Worker       count++;
471*77c1e3ccSAndroid Build Coastguard Worker     } while (h > 0);
472*77c1e3ccSAndroid Build Coastguard Worker   }
473*77c1e3ccSAndroid Build Coastguard Worker }
474*77c1e3ccSAndroid Build Coastguard Worker 
// Low-bitdepth driver for calc_ab_internal_common().
//
// Walks the buffers in tiles of 4 rows x 8 columns. For each tile it loads
// eight 4-lane 32-bit vectors from A (two 4x4 groups, columns 0-3 and 4-7)
// and four 8-lane 16-bit vectors from B16, then hands them to
// calc_ab_internal_common() together with the current tile pointers into
// A (src1), A16 (dst_A16) and B (dst2).
//
//   A, B16     : inputs read here.
//                NOTE(review): presumably box sums of squares / box sums
//                produced by the boxsum stage - confirm against the caller.
//   A16, B     : forwarded to the common helper as destination pointers.
//   buf_stride : element distance between consecutive rows that this
//                function touches (applied to all four buffers).
//   width      : columns to cover; consumed 8 at a time, so the last tile may
//                run past `width` when it is not a multiple of 8.
//   height     : rows to cover; each band of 4 rows subtracts ht_inc * 4.
//   r          : box radius; the window holds n = (2r + 1)^2 samples.
//   s          : scalar broadcast to every lane and forwarded to the helper.
//   ht_inc     : row step multiplier used only for the height countdown.
//                NOTE(review): the pointer advance per band is always
//                4 * buf_stride, so callers using ht_inc > 1 presumably pass
//                a pre-multiplied stride - confirm.
static inline void calc_ab_internal_lbd(int32_t *A, uint16_t *A16,
                                        uint16_t *B16, int32_t *B,
                                        const int buf_stride, const int width,
                                        const int height, const int r,
                                        const int s, const int ht_inc) {
  int32_t *src1, *dst2, count = 0;
  uint16_t *dst_A16, *src2;
  // Number of samples in the (2r + 1) x (2r + 1) window.
  const uint32_t n = (2 * r + 1) * (2 * r + 1);
  const uint32x4_t const_n_val = vdupq_n_u32(n);
  const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR);
  // Table entry for n, broadcast to all lanes (table is indexed by n - 1).
  const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]);
  const uint32x4_t const_val = vdupq_n_u32(255);

  uint16x8_t s16_0, s16_1, s16_2, s16_3, s16_4, s16_5, s16_6, s16_7;

  uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;

  const uint32x4_t s_vec = vdupq_n_u32(s);
  int w, h = height;

  do {
    // Start of the current band of 4 rows in each buffer.
    dst_A16 = A16 + (count << 2) * buf_stride;
    src1 = A + (count << 2) * buf_stride;
    src2 = B16 + (count << 2) * buf_stride;
    dst2 = B + (count << 2) * buf_stride;
    w = width;
    do {
      // 4 rows x 8 columns of 32-bit values, as two 4x4 groups.
      load_u32_4x4((uint32_t *)src1, buf_stride, &s0, &s1, &s2, &s3);
      load_u32_4x4((uint32_t *)src1 + 4, buf_stride, &s4, &s5, &s6, &s7);
      // 4 rows x 8 columns of 16-bit values.
      load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3);

      // The helper takes a second set of 16-bit rows; the high-bitdepth
      // variant passes bit-depth-scaled values there. No scaling is applied
      // in this variant, so the same rows are passed twice.
      s16_4 = s16_0;
      s16_5 = s16_1;
      s16_6 = s16_2;
      s16_7 = s16_3;

      calc_ab_internal_common(
          s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4,
          s16_5, s16_6, s16_7, const_n_val, s_vec, const_val,
          one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride);

      // Advance to the next 8-column tile.
      w -= 8;
      dst2 += 8;
      src1 += 8;
      src2 += 8;
      dst_A16 += 8;
    } while (w > 0);
    count++;
    h -= (ht_inc * 4);
  } while (h > 0);
}
526*77c1e3ccSAndroid Build Coastguard Worker 
#if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth driver for calc_ab_internal_common().
//
// Same 4-row x 8-column tiling as calc_ab_internal_lbd(), but the loaded
// values are first scaled down according to bit_depth:
//   - the 32-bit values from A get a rounding right shift by
//     2 * (bit_depth - 8);
//   - the 16-bit values from B16 get a rounding right shift by
//     (bit_depth - 8); both the unshifted rows (s16_0..3) and the shifted
//     rows (s16_4..7) are forwarded to the helper.
// The shifts are expressed as vrshlq_* with a negative per-lane count, which
// is a rounding shift right. For bit_depth == 8 both counts are zero.
//
//   A, B16     : inputs read here.
//                NOTE(review): presumably box sums of squares / box sums
//                produced by the boxsum stage - confirm against the caller.
//   A16, B     : forwarded to the common helper as destination pointers.
//   buf_stride : element distance between consecutive rows touched here.
//   width      : columns to cover; consumed 8 at a time.
//   height     : rows to cover; each band of 4 rows subtracts ht_inc * 4.
//   bit_depth  : sample bit depth; only (bit_depth - 8) is used.
//   r          : box radius; the window holds n = (2r + 1)^2 samples.
//   s          : scalar broadcast to every lane and forwarded to the helper.
//   ht_inc     : row step multiplier used only for the height countdown.
static inline void calc_ab_internal_hbd(int32_t *A, uint16_t *A16,
                                        uint16_t *B16, int32_t *B,
                                        const int buf_stride, const int width,
                                        const int height, const int bit_depth,
                                        const int r, const int s,
                                        const int ht_inc) {
  int32_t *src1, *dst2, count = 0;
  uint16_t *dst_A16, *src2;
  // Number of samples in the (2r + 1) x (2r + 1) window.
  const uint32_t n = (2 * r + 1) * (2 * r + 1);
  // Negative shift counts: vrshlq_* then performs a rounding right shift.
  const int16x8_t bd_min_2_vec = vdupq_n_s16(-(bit_depth - 8));
  const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1));
  const uint32x4_t const_n_val = vdupq_n_u32(n);
  const uint16x8_t sgrproj_sgr = vdupq_n_u16(SGRPROJ_SGR);
  // Table entry for n, broadcast to all lanes (table is indexed by n - 1).
  const uint16x4_t one_by_n_minus_1_vec = vdup_n_u16(av1_one_by_x[n - 1]);
  const uint32x4_t const_val = vdupq_n_u32(255);

  int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
  uint16x8_t s16_0, s16_1, s16_2, s16_3;
  uint16x8_t s16_4, s16_5, s16_6, s16_7;
  uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;

  const uint32x4_t s_vec = vdupq_n_u32(s);
  int w, h = height;

  do {
    // Start of the current band of 4 rows in each buffer.
    src1 = A + (count << 2) * buf_stride;
    src2 = B16 + (count << 2) * buf_stride;
    dst2 = B + (count << 2) * buf_stride;
    dst_A16 = A16 + (count << 2) * buf_stride;
    w = width;
    do {
      // 4 rows x 8 columns of 32-bit values, as two 4x4 groups.
      load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3);
      load_s32_4x4(src1 + 4, buf_stride, &sr4, &sr5, &sr6, &sr7);
      // 4 rows x 8 columns of 16-bit values.
      load_u16_8x4(src2, buf_stride, &s16_0, &s16_1, &s16_2, &s16_3);

      // Rounding right shift of the 32-bit values by 2 * (bit_depth - 8).
      s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec);
      s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec);
      s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec);
      s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec);
      s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_1_vec);
      s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_1_vec);
      s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_1_vec);
      s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_1_vec);

      // Rounding right shift of the 16-bit values by (bit_depth - 8); the
      // unshifted rows are kept and passed alongside.
      s16_4 = vrshlq_u16(s16_0, bd_min_2_vec);
      s16_5 = vrshlq_u16(s16_1, bd_min_2_vec);
      s16_6 = vrshlq_u16(s16_2, bd_min_2_vec);
      s16_7 = vrshlq_u16(s16_3, bd_min_2_vec);

      calc_ab_internal_common(
          s0, s1, s2, s3, s4, s5, s6, s7, s16_0, s16_1, s16_2, s16_3, s16_4,
          s16_5, s16_6, s16_7, const_n_val, s_vec, const_val,
          one_by_n_minus_1_vec, sgrproj_sgr, src1, dst_A16, dst2, buf_stride);

      // Advance to the next 8-column tile.
      w -= 8;
      dst2 += 8;
      src1 += 8;
      src2 += 8;
      dst_A16 += 8;
    } while (w > 0);
    count++;
    h -= (ht_inc * 4);
  } while (h > 0);
}
#endif  // CONFIG_AV1_HIGHBITDEPTH
593*77c1e3ccSAndroid Build Coastguard Worker 
// Low-bitdepth driver for calc_ab_fast_internal_common().
//
// Walks the buffers in tiles of 4 rows x 4 columns. For each tile it loads a
// 4x4 group of 32-bit values from A and another from B, reinterprets them as
// unsigned without changing any bits, and forwards them to
// calc_ab_fast_internal_common() together with the current tile pointers
// into A (src1), A16 (dst_A16) and B (src2). The signed B rows (sr4..sr7)
// are passed as well.
//
//   A, B       : inputs read here; their tile pointers are also forwarded,
//                so the helper may write through them.
//                NOTE(review): presumably box sums of squares / box sums
//                produced by the boxsum stage - confirm against the caller.
//   A16        : forwarded to the common helper as a destination pointer.
//   buf_stride : element distance between consecutive rows touched here.
//   width      : columns to cover; consumed 4 at a time.
//   height     : rows to cover; each band of 4 rows subtracts ht_inc * 4.
//   r          : box radius; the window holds n = (2r + 1)^2 samples.
//   s          : scalar broadcast to every lane and forwarded to the helper.
//   ht_inc     : row step multiplier used only for the height countdown.
//                NOTE(review): the pointer advance per band is always
//                4 * buf_stride, so callers using ht_inc > 1 presumably pass
//                a pre-multiplied stride - confirm.
static inline void calc_ab_fast_internal_lbd(int32_t *A, uint16_t *A16,
                                             int32_t *B, const int buf_stride,
                                             const int width, const int height,
                                             const int r, const int s,
                                             const int ht_inc) {
  int32_t *src1, *src2, count = 0;
  uint16_t *dst_A16;
  // Number of samples in the (2r + 1) x (2r + 1) window.
  const uint32_t n = (2 * r + 1) * (2 * r + 1);
  const uint32x4_t const_n_val = vdupq_n_u32(n);
  const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR);
  // Table entry for n, broadcast to all lanes (table is indexed by n - 1).
  const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]);
  const uint32x4_t const_val = vdupq_n_u32(255);

  int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
  uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;

  const uint32x4_t s_vec = vdupq_n_u32(s);
  int w, h = height;

  do {
    // Start of the current band of 4 rows in each buffer.
    src1 = A + (count << 2) * buf_stride;
    src2 = B + (count << 2) * buf_stride;
    dst_A16 = A16 + (count << 2) * buf_stride;
    w = width;
    do {
      load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3);
      load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7);

      // Bit-for-bit reinterpretation only; no scaling in this variant.
      s0 = vreinterpretq_u32_s32(sr0);
      s1 = vreinterpretq_u32_s32(sr1);
      s2 = vreinterpretq_u32_s32(sr2);
      s3 = vreinterpretq_u32_s32(sr3);
      s4 = vreinterpretq_u32_s32(sr4);
      s5 = vreinterpretq_u32_s32(sr5);
      s6 = vreinterpretq_u32_s32(sr6);
      s7 = vreinterpretq_u32_s32(sr7);

      calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5,
                                   sr6, sr7, const_n_val, s_vec, const_val,
                                   one_by_n_minus_1_vec, sgrproj_sgr, src1,
                                   dst_A16, src2, buf_stride);

      // Advance to the next 4-column tile.
      w -= 4;
      src1 += 4;
      src2 += 4;
      dst_A16 += 4;
    } while (w > 0);
    count++;
    h -= (ht_inc * 4);
  } while (h > 0);
}
645*77c1e3ccSAndroid Build Coastguard Worker 
#if CONFIG_AV1_HIGHBITDEPTH
// High-bitdepth driver for calc_ab_fast_internal_common().
//
// Same 4-row x 4-column tiling as calc_ab_fast_internal_lbd(), but the
// loaded values are scaled down according to bit_depth before use:
//   - the 32-bit values from A get a rounding right shift by
//     2 * (bit_depth - 8);
//   - the 32-bit values from B get a rounding right shift by
//     (bit_depth - 8); the unshifted signed B rows (sr4..sr7) are forwarded
//     to the helper as well.
// The shifts are expressed as vrshlq_u32 with a negative per-lane count,
// which is a rounding shift right. For bit_depth == 8 both counts are zero.
//
//   A, B       : inputs read here; their tile pointers are also forwarded,
//                so the helper may write through them.
//                NOTE(review): presumably box sums of squares / box sums
//                produced by the boxsum stage - confirm against the caller.
//   A16        : forwarded to the common helper as a destination pointer.
//   buf_stride : element distance between consecutive rows touched here.
//   width      : columns to cover; consumed 4 at a time.
//   height     : rows to cover; each band of 4 rows subtracts ht_inc * 4.
//   bit_depth  : sample bit depth; only (bit_depth - 8) is used.
//   r          : box radius; the window holds n = (2r + 1)^2 samples.
//   s          : scalar broadcast to every lane and forwarded to the helper.
//   ht_inc     : row step multiplier used only for the height countdown.
static inline void calc_ab_fast_internal_hbd(int32_t *A, uint16_t *A16,
                                             int32_t *B, const int buf_stride,
                                             const int width, const int height,
                                             const int bit_depth, const int r,
                                             const int s, const int ht_inc) {
  int32_t *src1, *src2, count = 0;
  uint16_t *dst_A16;
  // Number of samples in the (2r + 1) x (2r + 1) window.
  const uint32_t n = (2 * r + 1) * (2 * r + 1);
  // Negative shift counts: vrshlq_u32 then performs a rounding right shift.
  const int32x4_t bd_min_2_vec = vdupq_n_s32(-(bit_depth - 8));
  const int32x4_t bd_min_1_vec = vdupq_n_s32(-((bit_depth - 8) << 1));
  const uint32x4_t const_n_val = vdupq_n_u32(n);
  const uint16x4_t sgrproj_sgr = vdup_n_u16(SGRPROJ_SGR);
  // Table entry for n, broadcast to all lanes (table is indexed by n - 1).
  const uint32x4_t one_by_n_minus_1_vec = vdupq_n_u32(av1_one_by_x[n - 1]);
  const uint32x4_t const_val = vdupq_n_u32(255);

  int32x4_t sr0, sr1, sr2, sr3, sr4, sr5, sr6, sr7;
  uint32x4_t s0, s1, s2, s3, s4, s5, s6, s7;

  const uint32x4_t s_vec = vdupq_n_u32(s);
  int w, h = height;

  do {
    // Start of the current band of 4 rows in each buffer.
    src1 = A + (count << 2) * buf_stride;
    src2 = B + (count << 2) * buf_stride;
    dst_A16 = A16 + (count << 2) * buf_stride;
    w = width;
    do {
      load_s32_4x4(src1, buf_stride, &sr0, &sr1, &sr2, &sr3);
      load_s32_4x4(src2, buf_stride, &sr4, &sr5, &sr6, &sr7);

      // Values from A: rounding right shift by 2 * (bit_depth - 8).
      s0 = vrshlq_u32(vreinterpretq_u32_s32(sr0), bd_min_1_vec);
      s1 = vrshlq_u32(vreinterpretq_u32_s32(sr1), bd_min_1_vec);
      s2 = vrshlq_u32(vreinterpretq_u32_s32(sr2), bd_min_1_vec);
      s3 = vrshlq_u32(vreinterpretq_u32_s32(sr3), bd_min_1_vec);
      // Values from B: rounding right shift by (bit_depth - 8).
      s4 = vrshlq_u32(vreinterpretq_u32_s32(sr4), bd_min_2_vec);
      s5 = vrshlq_u32(vreinterpretq_u32_s32(sr5), bd_min_2_vec);
      s6 = vrshlq_u32(vreinterpretq_u32_s32(sr6), bd_min_2_vec);
      s7 = vrshlq_u32(vreinterpretq_u32_s32(sr7), bd_min_2_vec);

      calc_ab_fast_internal_common(s0, s1, s2, s3, s4, s5, s6, s7, sr4, sr5,
                                   sr6, sr7, const_n_val, s_vec, const_val,
                                   one_by_n_minus_1_vec, sgrproj_sgr, src1,
                                   dst_A16, src2, buf_stride);

      // Advance to the next 4-column tile.
      w -= 4;
      src1 += 4;
      src2 += 4;
      dst_A16 += 4;
    } while (w > 0);
    count++;
    h -= (ht_inc * 4);
  } while (h > 0);
}
#endif  // CONFIG_AV1_HIGHBITDEPTH
701*77c1e3ccSAndroid Build Coastguard Worker 
boxsum1(int16_t * src,const int src_stride,uint16_t * dst1,int32_t * dst2,const int dst_stride,const int width,const int height)702*77c1e3ccSAndroid Build Coastguard Worker static inline void boxsum1(int16_t *src, const int src_stride, uint16_t *dst1,
703*77c1e3ccSAndroid Build Coastguard Worker                            int32_t *dst2, const int dst_stride, const int width,
704*77c1e3ccSAndroid Build Coastguard Worker                            const int height) {
705*77c1e3ccSAndroid Build Coastguard Worker   assert(width > 2 * SGRPROJ_BORDER_HORZ);
706*77c1e3ccSAndroid Build Coastguard Worker   assert(height > 2 * SGRPROJ_BORDER_VERT);
707*77c1e3ccSAndroid Build Coastguard Worker 
708*77c1e3ccSAndroid Build Coastguard Worker   int16_t *src_ptr;
709*77c1e3ccSAndroid Build Coastguard Worker   int32_t *dst2_ptr;
710*77c1e3ccSAndroid Build Coastguard Worker   uint16_t *dst1_ptr;
711*77c1e3ccSAndroid Build Coastguard Worker   int h, w, count = 0;
712*77c1e3ccSAndroid Build Coastguard Worker 
713*77c1e3ccSAndroid Build Coastguard Worker   w = width;
714*77c1e3ccSAndroid Build Coastguard Worker   {
715*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t s1, s2, s3, s4, s5, s6, s7, s8;
716*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t q23, q34, q56, q234, q345, q456, q567;
717*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t r23, r56, r345, r456, r567, r78, r678;
718*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t r4_low, r4_high, r34_low, r34_high, r234_low, r234_high;
719*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t r2, r3, r5, r6, r7, r8;
720*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t q678, q78;
721*77c1e3ccSAndroid Build Coastguard Worker 
722*77c1e3ccSAndroid Build Coastguard Worker     do {
723*77c1e3ccSAndroid Build Coastguard Worker       dst1_ptr = dst1 + (count << 3);
724*77c1e3ccSAndroid Build Coastguard Worker       dst2_ptr = dst2 + (count << 3);
725*77c1e3ccSAndroid Build Coastguard Worker       src_ptr = src + (count << 3);
726*77c1e3ccSAndroid Build Coastguard Worker       h = height;
727*77c1e3ccSAndroid Build Coastguard Worker 
728*77c1e3ccSAndroid Build Coastguard Worker       load_s16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4);
729*77c1e3ccSAndroid Build Coastguard Worker       src_ptr += 4 * src_stride;
730*77c1e3ccSAndroid Build Coastguard Worker 
731*77c1e3ccSAndroid Build Coastguard Worker       q23 = vaddq_s16(s2, s3);
732*77c1e3ccSAndroid Build Coastguard Worker       q234 = vaddq_s16(q23, s4);
733*77c1e3ccSAndroid Build Coastguard Worker       q34 = vaddq_s16(s3, s4);
734*77c1e3ccSAndroid Build Coastguard Worker       dst1_ptr += (dst_stride << 1);
735*77c1e3ccSAndroid Build Coastguard Worker 
736*77c1e3ccSAndroid Build Coastguard Worker       r2 = vmull_s16(vget_low_s16(s2), vget_low_s16(s2));
737*77c1e3ccSAndroid Build Coastguard Worker       r3 = vmull_s16(vget_low_s16(s3), vget_low_s16(s3));
738*77c1e3ccSAndroid Build Coastguard Worker       r4_low = vmull_s16(vget_low_s16(s4), vget_low_s16(s4));
739*77c1e3ccSAndroid Build Coastguard Worker       r23 = vaddq_s32(r2, r3);
740*77c1e3ccSAndroid Build Coastguard Worker       r234_low = vaddq_s32(r23, r4_low);
741*77c1e3ccSAndroid Build Coastguard Worker       r34_low = vaddq_s32(r3, r4_low);
742*77c1e3ccSAndroid Build Coastguard Worker 
743*77c1e3ccSAndroid Build Coastguard Worker       r2 = vmull_s16(vget_high_s16(s2), vget_high_s16(s2));
744*77c1e3ccSAndroid Build Coastguard Worker       r3 = vmull_s16(vget_high_s16(s3), vget_high_s16(s3));
745*77c1e3ccSAndroid Build Coastguard Worker       r4_high = vmull_s16(vget_high_s16(s4), vget_high_s16(s4));
746*77c1e3ccSAndroid Build Coastguard Worker       r23 = vaddq_s32(r2, r3);
747*77c1e3ccSAndroid Build Coastguard Worker       r234_high = vaddq_s32(r23, r4_high);
748*77c1e3ccSAndroid Build Coastguard Worker       r34_high = vaddq_s32(r3, r4_high);
749*77c1e3ccSAndroid Build Coastguard Worker 
750*77c1e3ccSAndroid Build Coastguard Worker       dst2_ptr += (dst_stride << 1);
751*77c1e3ccSAndroid Build Coastguard Worker 
752*77c1e3ccSAndroid Build Coastguard Worker       do {
753*77c1e3ccSAndroid Build Coastguard Worker         load_s16_8x4(src_ptr, src_stride, &s5, &s6, &s7, &s8);
754*77c1e3ccSAndroid Build Coastguard Worker         src_ptr += 4 * src_stride;
755*77c1e3ccSAndroid Build Coastguard Worker 
756*77c1e3ccSAndroid Build Coastguard Worker         q345 = vaddq_s16(s5, q34);
757*77c1e3ccSAndroid Build Coastguard Worker         q56 = vaddq_s16(s5, s6);
758*77c1e3ccSAndroid Build Coastguard Worker         q456 = vaddq_s16(s4, q56);
759*77c1e3ccSAndroid Build Coastguard Worker         q567 = vaddq_s16(s7, q56);
760*77c1e3ccSAndroid Build Coastguard Worker         q78 = vaddq_s16(s7, s8);
761*77c1e3ccSAndroid Build Coastguard Worker         q678 = vaddq_s16(s6, q78);
762*77c1e3ccSAndroid Build Coastguard Worker 
763*77c1e3ccSAndroid Build Coastguard Worker         store_s16_8x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
764*77c1e3ccSAndroid Build Coastguard Worker         dst1_ptr += (dst_stride << 2);
765*77c1e3ccSAndroid Build Coastguard Worker 
766*77c1e3ccSAndroid Build Coastguard Worker         s4 = s8;
767*77c1e3ccSAndroid Build Coastguard Worker         q34 = q78;
768*77c1e3ccSAndroid Build Coastguard Worker         q234 = q678;
769*77c1e3ccSAndroid Build Coastguard Worker 
770*77c1e3ccSAndroid Build Coastguard Worker         r5 = vmull_s16(vget_low_s16(s5), vget_low_s16(s5));
771*77c1e3ccSAndroid Build Coastguard Worker         r6 = vmull_s16(vget_low_s16(s6), vget_low_s16(s6));
772*77c1e3ccSAndroid Build Coastguard Worker         r7 = vmull_s16(vget_low_s16(s7), vget_low_s16(s7));
773*77c1e3ccSAndroid Build Coastguard Worker         r8 = vmull_s16(vget_low_s16(s8), vget_low_s16(s8));
774*77c1e3ccSAndroid Build Coastguard Worker 
775*77c1e3ccSAndroid Build Coastguard Worker         r345 = vaddq_s32(r5, r34_low);
776*77c1e3ccSAndroid Build Coastguard Worker         r56 = vaddq_s32(r5, r6);
777*77c1e3ccSAndroid Build Coastguard Worker         r456 = vaddq_s32(r4_low, r56);
778*77c1e3ccSAndroid Build Coastguard Worker         r567 = vaddq_s32(r7, r56);
779*77c1e3ccSAndroid Build Coastguard Worker         r78 = vaddq_s32(r7, r8);
780*77c1e3ccSAndroid Build Coastguard Worker         r678 = vaddq_s32(r6, r78);
781*77c1e3ccSAndroid Build Coastguard Worker         store_s32_4x4(dst2_ptr, dst_stride, r234_low, r345, r456, r567);
782*77c1e3ccSAndroid Build Coastguard Worker 
783*77c1e3ccSAndroid Build Coastguard Worker         r4_low = r8;
784*77c1e3ccSAndroid Build Coastguard Worker         r34_low = r78;
785*77c1e3ccSAndroid Build Coastguard Worker         r234_low = r678;
786*77c1e3ccSAndroid Build Coastguard Worker 
787*77c1e3ccSAndroid Build Coastguard Worker         r5 = vmull_s16(vget_high_s16(s5), vget_high_s16(s5));
788*77c1e3ccSAndroid Build Coastguard Worker         r6 = vmull_s16(vget_high_s16(s6), vget_high_s16(s6));
789*77c1e3ccSAndroid Build Coastguard Worker         r7 = vmull_s16(vget_high_s16(s7), vget_high_s16(s7));
790*77c1e3ccSAndroid Build Coastguard Worker         r8 = vmull_s16(vget_high_s16(s8), vget_high_s16(s8));
791*77c1e3ccSAndroid Build Coastguard Worker 
792*77c1e3ccSAndroid Build Coastguard Worker         r345 = vaddq_s32(r5, r34_high);
793*77c1e3ccSAndroid Build Coastguard Worker         r56 = vaddq_s32(r5, r6);
794*77c1e3ccSAndroid Build Coastguard Worker         r456 = vaddq_s32(r4_high, r56);
795*77c1e3ccSAndroid Build Coastguard Worker         r567 = vaddq_s32(r7, r56);
796*77c1e3ccSAndroid Build Coastguard Worker         r78 = vaddq_s32(r7, r8);
797*77c1e3ccSAndroid Build Coastguard Worker         r678 = vaddq_s32(r6, r78);
798*77c1e3ccSAndroid Build Coastguard Worker         store_s32_4x4((dst2_ptr + 4), dst_stride, r234_high, r345, r456, r567);
799*77c1e3ccSAndroid Build Coastguard Worker         dst2_ptr += (dst_stride << 2);
800*77c1e3ccSAndroid Build Coastguard Worker 
801*77c1e3ccSAndroid Build Coastguard Worker         r4_high = r8;
802*77c1e3ccSAndroid Build Coastguard Worker         r34_high = r78;
803*77c1e3ccSAndroid Build Coastguard Worker         r234_high = r678;
804*77c1e3ccSAndroid Build Coastguard Worker 
805*77c1e3ccSAndroid Build Coastguard Worker         h -= 4;
806*77c1e3ccSAndroid Build Coastguard Worker       } while (h > 0);
807*77c1e3ccSAndroid Build Coastguard Worker       w -= 8;
808*77c1e3ccSAndroid Build Coastguard Worker       count++;
809*77c1e3ccSAndroid Build Coastguard Worker     } while (w > 0);
810*77c1e3ccSAndroid Build Coastguard Worker 
811*77c1e3ccSAndroid Build Coastguard Worker     // memset needed for row pixels as 2nd stage of boxsum filter uses
812*77c1e3ccSAndroid Build Coastguard Worker     // first 2 rows of dst1, dst2 buffer which is not filled in first stage.
813*77c1e3ccSAndroid Build Coastguard Worker     for (int x = 0; x < 2; x++) {
814*77c1e3ccSAndroid Build Coastguard Worker       memset(dst1 + x * dst_stride, 0, (width + 4) * sizeof(*dst1));
815*77c1e3ccSAndroid Build Coastguard Worker       memset(dst2 + x * dst_stride, 0, (width + 4) * sizeof(*dst2));
816*77c1e3ccSAndroid Build Coastguard Worker     }
817*77c1e3ccSAndroid Build Coastguard Worker 
818*77c1e3ccSAndroid Build Coastguard Worker     // memset needed for extra columns as 2nd stage of boxsum filter uses
819*77c1e3ccSAndroid Build Coastguard Worker     // last 2 columns of dst1, dst2 buffer which is not filled in first stage.
820*77c1e3ccSAndroid Build Coastguard Worker     for (int x = 2; x < height + 2; x++) {
821*77c1e3ccSAndroid Build Coastguard Worker       int dst_offset = x * dst_stride + width + 2;
822*77c1e3ccSAndroid Build Coastguard Worker       memset(dst1 + dst_offset, 0, 3 * sizeof(*dst1));
823*77c1e3ccSAndroid Build Coastguard Worker       memset(dst2 + dst_offset, 0, 3 * sizeof(*dst2));
824*77c1e3ccSAndroid Build Coastguard Worker     }
825*77c1e3ccSAndroid Build Coastguard Worker   }
826*77c1e3ccSAndroid Build Coastguard Worker 
827*77c1e3ccSAndroid Build Coastguard Worker   {
828*77c1e3ccSAndroid Build Coastguard Worker     int16x4_t d1, d2, d3, d4, d5, d6, d7, d8;
829*77c1e3ccSAndroid Build Coastguard Worker     int16x4_t q23, q34, q56, q234, q345, q456, q567;
830*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t r23, r56, r234, r345, r456, r567, r34, r78, r678;
831*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t r1, r2, r3, r4, r5, r6, r7, r8;
832*77c1e3ccSAndroid Build Coastguard Worker     int16x4_t q678, q78;
833*77c1e3ccSAndroid Build Coastguard Worker 
834*77c1e3ccSAndroid Build Coastguard Worker     int32_t *src2_ptr;
835*77c1e3ccSAndroid Build Coastguard Worker     uint16_t *src1_ptr;
836*77c1e3ccSAndroid Build Coastguard Worker     count = 0;
837*77c1e3ccSAndroid Build Coastguard Worker     h = height;
838*77c1e3ccSAndroid Build Coastguard Worker     w = width;
839*77c1e3ccSAndroid Build Coastguard Worker     do {
840*77c1e3ccSAndroid Build Coastguard Worker       dst1_ptr = dst1 + (count << 2) * dst_stride;
841*77c1e3ccSAndroid Build Coastguard Worker       dst2_ptr = dst2 + (count << 2) * dst_stride;
842*77c1e3ccSAndroid Build Coastguard Worker       src1_ptr = dst1 + (count << 2) * dst_stride;
843*77c1e3ccSAndroid Build Coastguard Worker       src2_ptr = dst2 + (count << 2) * dst_stride;
844*77c1e3ccSAndroid Build Coastguard Worker       w = width;
845*77c1e3ccSAndroid Build Coastguard Worker 
846*77c1e3ccSAndroid Build Coastguard Worker       load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d1, &d2, &d3, &d4);
847*77c1e3ccSAndroid Build Coastguard Worker       transpose_elems_inplace_s16_4x4(&d1, &d2, &d3, &d4);
848*77c1e3ccSAndroid Build Coastguard Worker       load_s32_4x4(src2_ptr, dst_stride, &r1, &r2, &r3, &r4);
849*77c1e3ccSAndroid Build Coastguard Worker       transpose_elems_inplace_s32_4x4(&r1, &r2, &r3, &r4);
850*77c1e3ccSAndroid Build Coastguard Worker       src1_ptr += 4;
851*77c1e3ccSAndroid Build Coastguard Worker       src2_ptr += 4;
852*77c1e3ccSAndroid Build Coastguard Worker 
853*77c1e3ccSAndroid Build Coastguard Worker       q23 = vadd_s16(d2, d3);
854*77c1e3ccSAndroid Build Coastguard Worker       q234 = vadd_s16(q23, d4);
855*77c1e3ccSAndroid Build Coastguard Worker       q34 = vadd_s16(d3, d4);
856*77c1e3ccSAndroid Build Coastguard Worker       dst1_ptr += 2;
857*77c1e3ccSAndroid Build Coastguard Worker       r23 = vaddq_s32(r2, r3);
858*77c1e3ccSAndroid Build Coastguard Worker       r234 = vaddq_s32(r23, r4);
859*77c1e3ccSAndroid Build Coastguard Worker       r34 = vaddq_s32(r3, r4);
860*77c1e3ccSAndroid Build Coastguard Worker       dst2_ptr += 2;
861*77c1e3ccSAndroid Build Coastguard Worker 
862*77c1e3ccSAndroid Build Coastguard Worker       do {
863*77c1e3ccSAndroid Build Coastguard Worker         load_s16_4x4((int16_t *)src1_ptr, dst_stride, &d5, &d6, &d7, &d8);
864*77c1e3ccSAndroid Build Coastguard Worker         transpose_elems_inplace_s16_4x4(&d5, &d6, &d7, &d8);
865*77c1e3ccSAndroid Build Coastguard Worker         load_s32_4x4(src2_ptr, dst_stride, &r5, &r6, &r7, &r8);
866*77c1e3ccSAndroid Build Coastguard Worker         transpose_elems_inplace_s32_4x4(&r5, &r6, &r7, &r8);
867*77c1e3ccSAndroid Build Coastguard Worker         src1_ptr += 4;
868*77c1e3ccSAndroid Build Coastguard Worker         src2_ptr += 4;
869*77c1e3ccSAndroid Build Coastguard Worker 
870*77c1e3ccSAndroid Build Coastguard Worker         q345 = vadd_s16(d5, q34);
871*77c1e3ccSAndroid Build Coastguard Worker         q56 = vadd_s16(d5, d6);
872*77c1e3ccSAndroid Build Coastguard Worker         q456 = vadd_s16(d4, q56);
873*77c1e3ccSAndroid Build Coastguard Worker         q567 = vadd_s16(d7, q56);
874*77c1e3ccSAndroid Build Coastguard Worker         q78 = vadd_s16(d7, d8);
875*77c1e3ccSAndroid Build Coastguard Worker         q678 = vadd_s16(d6, q78);
876*77c1e3ccSAndroid Build Coastguard Worker         transpose_elems_inplace_s16_4x4(&q234, &q345, &q456, &q567);
877*77c1e3ccSAndroid Build Coastguard Worker         store_s16_4x4((int16_t *)dst1_ptr, dst_stride, q234, q345, q456, q567);
878*77c1e3ccSAndroid Build Coastguard Worker         dst1_ptr += 4;
879*77c1e3ccSAndroid Build Coastguard Worker 
880*77c1e3ccSAndroid Build Coastguard Worker         d4 = d8;
881*77c1e3ccSAndroid Build Coastguard Worker         q34 = q78;
882*77c1e3ccSAndroid Build Coastguard Worker         q234 = q678;
883*77c1e3ccSAndroid Build Coastguard Worker 
884*77c1e3ccSAndroid Build Coastguard Worker         r345 = vaddq_s32(r5, r34);
885*77c1e3ccSAndroid Build Coastguard Worker         r56 = vaddq_s32(r5, r6);
886*77c1e3ccSAndroid Build Coastguard Worker         r456 = vaddq_s32(r4, r56);
887*77c1e3ccSAndroid Build Coastguard Worker         r567 = vaddq_s32(r7, r56);
888*77c1e3ccSAndroid Build Coastguard Worker         r78 = vaddq_s32(r7, r8);
889*77c1e3ccSAndroid Build Coastguard Worker         r678 = vaddq_s32(r6, r78);
890*77c1e3ccSAndroid Build Coastguard Worker         transpose_elems_inplace_s32_4x4(&r234, &r345, &r456, &r567);
891*77c1e3ccSAndroid Build Coastguard Worker         store_s32_4x4(dst2_ptr, dst_stride, r234, r345, r456, r567);
892*77c1e3ccSAndroid Build Coastguard Worker         dst2_ptr += 4;
893*77c1e3ccSAndroid Build Coastguard Worker 
894*77c1e3ccSAndroid Build Coastguard Worker         r4 = r8;
895*77c1e3ccSAndroid Build Coastguard Worker         r34 = r78;
896*77c1e3ccSAndroid Build Coastguard Worker         r234 = r678;
897*77c1e3ccSAndroid Build Coastguard Worker         w -= 4;
898*77c1e3ccSAndroid Build Coastguard Worker       } while (w > 0);
899*77c1e3ccSAndroid Build Coastguard Worker       h -= 4;
900*77c1e3ccSAndroid Build Coastguard Worker       count++;
901*77c1e3ccSAndroid Build Coastguard Worker     } while (h > 0);
902*77c1e3ccSAndroid Build Coastguard Worker   }
903*77c1e3ccSAndroid Build Coastguard Worker }
904*77c1e3ccSAndroid Build Coastguard Worker 
// Weighted 3x3 neighbourhood sum for the four adjacent int32 columns that
// start at buf. The centre sample and its left/right/top/bottom neighbours
// are weighted by 4; the four diagonal neighbours are weighted by 3.
// buf_stride is the distance, in elements, between vertically adjacent rows.
// Reads one element to the left and five elements to the right of buf on the
// rows above, at and below buf.
static inline int32x4_t cross_sum_inp_s32(int32_t *buf, int buf_stride) {
  const int32_t *const above = buf - buf_stride;
  const int32_t *const below = buf + buf_stride;

  const int32x4_t top_left = vld1q_s32(above - 1);
  const int32x4_t top = vld1q_s32(above);
  const int32x4_t top_right = vld1q_s32(above + 1);
  const int32x4_t left = vld1q_s32(buf - 1);
  const int32x4_t centre = vld1q_s32(buf);
  const int32x4_t right = vld1q_s32(buf + 1);
  const int32x4_t bottom_left = vld1q_s32(below - 1);
  const int32x4_t bottom = vld1q_s32(below);
  const int32x4_t bottom_right = vld1q_s32(below + 1);

  // Samples that receive weight 4: the centre plus its four edge neighbours.
  int32x4_t plus_sum = vaddq_s32(vaddq_s32(left, right), vaddq_s32(top, bottom));
  plus_sum = vaddq_s32(plus_sum, centre);

  // Samples that receive weight 3: the four diagonal neighbours.
  const int32x4_t diag_sum = vaddq_s32(vaddq_s32(top_left, top_right),
                                       vaddq_s32(bottom_left, bottom_right));

  // 4 * (plus + diag) - diag == 4 * plus + 3 * diag.
  return vsubq_s32(vshlq_n_s32(vaddq_s32(plus_sum, diag_sum), 2), diag_sum);
}
924*77c1e3ccSAndroid Build Coastguard Worker 
// Weighted 3x3 neighbourhood sum for the eight adjacent uint16 columns that
// start at buf, using the same weights as cross_sum_inp_s32(): 4 for the
// centre and its edge neighbours, 3 for the diagonal neighbours.
// The weighting is performed in 16-bit lanes (wrapping modulo 2^16) and only
// the final addition is widened to 32 bits. *a0 receives the results for
// columns 0..3 and *a1 the results for columns 4..7.
static inline void cross_sum_inp_u16(uint16_t *buf, int buf_stride,
                                     int32x4_t *a0, int32x4_t *a1) {
  const uint16_t *const above = buf - buf_stride;
  const uint16_t *const below = buf + buf_stride;

  const uint16x8_t top_left = vld1q_u16(above - 1);
  const uint16x8_t top = vld1q_u16(above);
  const uint16x8_t top_right = vld1q_u16(above + 1);
  const uint16x8_t left = vld1q_u16(buf - 1);
  const uint16x8_t centre = vld1q_u16(buf);
  const uint16x8_t right = vld1q_u16(buf + 1);
  const uint16x8_t bottom_left = vld1q_u16(below - 1);
  const uint16x8_t bottom = vld1q_u16(below);
  const uint16x8_t bottom_right = vld1q_u16(below + 1);

  // 4 * (centre + left + right + top + bottom).
  uint16x8_t plus_sum = vaddq_u16(vaddq_u16(bottom, centre), vaddq_u16(top, right));
  plus_sum = vaddq_u16(plus_sum, left);
  const uint16x8_t plus_x4 = vshlq_n_u16(plus_sum, 2);

  // 3 * (sum of the four diagonals), formed as 4 * d - d.
  const uint16x8_t diag_sum = vaddq_u16(vaddq_u16(top_left, top_right),
                                        vaddq_u16(bottom_left, bottom_right));
  const uint16x8_t diag_x3 = vsubq_u16(vshlq_n_u16(diag_sum, 2), diag_sum);

  // Widening add of the two partial results, low half then high half.
  *a0 = vreinterpretq_s32_u32(
      vaddl_u16(vget_low_u16(plus_x4), vget_low_u16(diag_x3)));
  *a1 = vreinterpretq_s32_u32(
      vaddl_u16(vget_high_u16(plus_x4), vget_high_u16(diag_x3)));
}
959*77c1e3ccSAndroid Build Coastguard Worker 
// Weighted sum of the rows directly above and below buf for four adjacent
// int32 columns. The samples straight above and below are weighted by 6 and
// the four diagonal samples by 5; the row containing buf itself is not read.
// buf_stride is the distance, in elements, between vertically adjacent rows.
static inline int32x4_t cross_sum_fast_even_row(int32_t *buf, int buf_stride) {
  const int32_t *const above = buf - buf_stride;
  const int32_t *const below = buf + buf_stride;

  // Diagonal neighbours (weight 5).
  const int32x4_t diag_sum =
      vaddq_s32(vaddq_s32(vld1q_s32(above - 1), vld1q_s32(above + 1)),
                vaddq_s32(vld1q_s32(below - 1), vld1q_s32(below + 1)));

  // Vertical neighbours (weight 6).
  const int32x4_t vert_sum = vaddq_s32(vld1q_s32(above), vld1q_s32(below));

  const int32x4_t all_sum = vaddq_s32(diag_sum, vert_sum);

  // 5 * (diag + vert) + vert == 5 * diag + 6 * vert.
  const int32x4_t all_x5 = vaddq_s32(vshlq_n_s32(all_sum, 2), all_sum);
  return vaddq_s32(all_x5, vert_sum);
}
978*77c1e3ccSAndroid Build Coastguard Worker 
// 16-bit input counterpart of cross_sum_fast_even_row(), covering the eight
// adjacent uint16 columns that start at buf. Samples straight above and below
// are weighted by 6, diagonal samples by 5. The weighting is done in 16-bit
// lanes (wrapping modulo 2^16); only the final addition is widened.
// *a0 receives columns 0..3 and *a1 receives columns 4..7.
static inline void cross_sum_fast_even_row_inp16(uint16_t *buf, int buf_stride,
                                                 int32x4_t *a0, int32x4_t *a1) {
  const uint16_t *const above = buf - buf_stride;
  const uint16_t *const below = buf + buf_stride;

  const uint16x8_t top_left = vld1q_u16(above - 1);
  const uint16x8_t top = vld1q_u16(above);
  const uint16x8_t top_right = vld1q_u16(above + 1);
  const uint16x8_t bottom_left = vld1q_u16(below - 1);
  const uint16x8_t bottom = vld1q_u16(below);
  const uint16x8_t bottom_right = vld1q_u16(below + 1);

  // 5 * diagonals, formed as 4 * d + d.
  const uint16x8_t diag_sum = vaddq_u16(vaddq_u16(bottom_right, bottom_left),
                                        vaddq_u16(top_right, top_left));
  const uint16x8_t diag_x5 = vaddq_u16(vshlq_n_u16(diag_sum, 2), diag_sum);

  // 6 * verticals, formed as 4 * v + 2 * v.
  const uint16x8_t vert_sum = vaddq_u16(bottom, top);
  const uint16x8_t vert_x6 =
      vaddq_u16(vshlq_n_u16(vert_sum, 2), vshlq_n_u16(vert_sum, 1));

  // Widening add of the two partial results, low half then high half.
  *a0 = vreinterpretq_s32_u32(
      vaddl_u16(vget_low_u16(diag_x5), vget_low_u16(vert_x6)));
  *a1 = vreinterpretq_s32_u32(
      vaddl_u16(vget_high_u16(diag_x5), vget_high_u16(vert_x6)));
}
1006*77c1e3ccSAndroid Build Coastguard Worker 
// Horizontal three-tap weighted sum for four adjacent int32 columns starting
// at buf: the centre sample is weighted by 6 and its left and right
// neighbours by 5. Only the row containing buf is read.
static inline int32x4_t cross_sum_fast_odd_row(int32_t *buf) {
  const int32x4_t centre = vld1q_s32(buf);
  const int32x4_t side_sum = vaddq_s32(vld1q_s32(buf - 1), vld1q_s32(buf + 1));
  const int32x4_t all_sum = vaddq_s32(side_sum, centre);

  // 5 * (sides + centre) + centre == 5 * sides + 6 * centre.
  const int32x4_t all_x5 = vaddq_s32(vshlq_n_s32(all_sum, 2), all_sum);
  return vaddq_s32(all_x5, centre);
}
1021*77c1e3ccSAndroid Build Coastguard Worker 
// 16-bit input counterpart of cross_sum_fast_odd_row(), covering the eight
// adjacent uint16 columns that start at buf: the centre sample is weighted by
// 6 and its left and right neighbours by 5. The weighting is done in 16-bit
// lanes (wrapping modulo 2^16); only the final addition is widened.
// *a0 receives columns 0..3 and *a1 receives columns 4..7.
static inline void cross_sum_fast_odd_row_inp16(uint16_t *buf, int32x4_t *a0,
                                                int32x4_t *a1) {
  const uint16x8_t left = vld1q_u16(buf - 1);
  const uint16x8_t centre = vld1q_u16(buf);
  const uint16x8_t right = vld1q_u16(buf + 1);

  // 5 * (left + right), formed as s + 4 * s.
  const uint16x8_t side_sum = vaddq_u16(left, right);
  const uint16x8_t side_x5 = vaddq_u16(side_sum, vshlq_n_u16(side_sum, 2));

  // 6 * centre, formed as 4 * c + 2 * c.
  const uint16x8_t centre_x6 =
      vaddq_u16(vshlq_n_u16(centre, 2), vshlq_n_u16(centre, 1));

  // Widening add of the two partial results, low half then high half.
  *a0 = vreinterpretq_s32_u32(
      vaddl_u16(vget_low_u16(side_x5), vget_low_u16(centre_x6)));
  *a1 = vreinterpretq_s32_u32(
      vaddl_u16(vget_high_u16(side_x5), vget_high_u16(centre_x6)));
}
1043*77c1e3ccSAndroid Build Coastguard Worker 
// Final stage of the "fast" filter path. For every output sample it computes
//   dst = round_shift(a * src + b, shift)
// where a and b are weighted neighbourhood sums of A and B around the sample.
// Even-numbered rows (counting from 0) use the two-row cross sums and a shift
// of SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS; odd-numbered rows use the
// single-row cross sums and a shift of
// SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS.
//
// A and B share buf_stride; src and dst have their own strides. All strides
// are in elements. Columns are handled eight at a time, so when width is not
// a multiple of 8 the last group reads and writes past column width - 1.
// At least one row and one group of eight columns is always processed.
static void final_filter_fast_internal(uint16_t *A, int32_t *B,
                                       const int buf_stride, int16_t *src,
                                       const int src_stride, int32_t *dst,
                                       const int dst_stride, const int width,
                                       const int height) {
  assert(SGRPROJ_SGR_BITS == 8);
  assert(SGRPROJ_RST_BITS == 4);

  int row = 0;
  int rows_left = height;
  do {
    uint16_t *const a_row = A + row * buf_stride;
    int32_t *const b_row = B + row * buf_stride;
    int16_t *const src_row = src + row * src_stride;
    int32_t *const dst_row = dst + row * dst_stride;

    if (row & 1) {
      // Odd row: only the current row of A and B contributes.
      int x = 0;
      do {
        const int16x8_t s = vld1q_s16(src_row + x);
        int32x4_t a_lo, a_hi;
        cross_sum_fast_odd_row_inp16(a_row + x, &a_lo, &a_hi);
        a_lo = vmulq_s32(vmovl_s16(vget_low_s16(s)), a_lo);
        a_hi = vmulq_s32(vmovl_s16(vget_high_s16(s)), a_hi);

        const int32x4_t b_lo = cross_sum_fast_odd_row(b_row + x);
        const int32x4_t b_hi = cross_sum_fast_odd_row(b_row + x + 4);
        const int32x4_t v_lo = vaddq_s32(a_lo, b_lo);
        const int32x4_t v_hi = vaddq_s32(a_hi, b_hi);

        vst1q_s32(dst_row + x,
                  vrshrq_n_s32(v_lo,
                               SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS));
        vst1q_s32(dst_row + x + 4,
                  vrshrq_n_s32(v_hi,
                               SGRPROJ_SGR_BITS + NB_ODD - SGRPROJ_RST_BITS));
        x += 8;
      } while (x < width);
    } else {
      // Even row: the rows above and below in A and B contribute.
      int x = 0;
      do {
        const int16x8_t s = vld1q_s16(src_row + x);
        int32x4_t a_lo, a_hi;
        cross_sum_fast_even_row_inp16(a_row + x, buf_stride, &a_lo, &a_hi);
        a_lo = vmulq_s32(vmovl_s16(vget_low_s16(s)), a_lo);
        a_hi = vmulq_s32(vmovl_s16(vget_high_s16(s)), a_hi);

        const int32x4_t b_lo = cross_sum_fast_even_row(b_row + x, buf_stride);
        const int32x4_t b_hi =
            cross_sum_fast_even_row(b_row + x + 4, buf_stride);
        const int32x4_t v_lo = vaddq_s32(a_lo, b_lo);
        const int32x4_t v_hi = vaddq_s32(a_hi, b_hi);

        vst1q_s32(dst_row + x,
                  vrshrq_n_s32(v_lo,
                               SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS));
        vst1q_s32(dst_row + x + 4,
                  vrshrq_n_s32(v_hi,
                               SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS));
        x += 8;
      } while (x < width);
    }

    ++row;
    --rows_left;
  } while (rows_left > 0);
}
1126*77c1e3ccSAndroid Build Coastguard Worker 
// Final stage of the full (non-fast) filter path. For every output sample it
// computes
//   dst = round_shift(a * src + b,
//                     SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS)
// where a and b are the 3x3 weighted cross sums of A and B around the sample
// (see cross_sum_inp_u16() and cross_sum_inp_s32()). Every row is treated the
// same way.
//
// A and B share buf_stride; src and dst have their own strides. All strides
// are in elements. Columns are handled eight at a time, so when width is not
// a multiple of 8 the last group reads and writes past column width - 1.
// At least one row and one group of eight columns is always processed.
static void final_filter_internal(uint16_t *A, int32_t *B, const int buf_stride,
                                  int16_t *src, const int src_stride,
                                  int32_t *dst, const int dst_stride,
                                  const int width, const int height) {
  assert(SGRPROJ_SGR_BITS == 8);
  assert(SGRPROJ_RST_BITS == 4);

  int row = 0;
  int rows_left = height;
  do {
    uint16_t *const a_row = A + row * buf_stride;
    int32_t *const b_row = B + row * buf_stride;
    int16_t *const src_row = src + row * src_stride;
    int32_t *const dst_row = dst + row * dst_stride;

    int x = 0;
    do {
      const int16x8_t s = vld1q_s16(src_row + x);

      // a * src, with a taken from the 16-bit A plane.
      int32x4_t a_lo, a_hi;
      cross_sum_inp_u16(a_row + x, buf_stride, &a_lo, &a_hi);
      a_lo = vmulq_s32(vmovl_s16(vget_low_s16(s)), a_lo);
      a_hi = vmulq_s32(vmovl_s16(vget_high_s16(s)), a_hi);

      // + b, taken from the 32-bit B plane, four columns per call.
      const int32x4_t b_lo = cross_sum_inp_s32(b_row + x, buf_stride);
      const int32x4_t b_hi = cross_sum_inp_s32(b_row + x + 4, buf_stride);
      const int32x4_t v_lo = vaddq_s32(a_lo, b_lo);
      const int32x4_t v_hi = vaddq_s32(a_hi, b_hi);

      vst1q_s32(dst_row + x,
                vrshrq_n_s32(v_lo,
                             SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS));
      vst1q_s32(dst_row + x + 4,
                vrshrq_n_s32(v_hi,
                             SGRPROJ_SGR_BITS + NB_EVEN - SGRPROJ_RST_BITS));
      x += 8;
    } while (x < width);

    ++row;
    --rows_left;
  } while (rows_left > 0);
}
1176*77c1e3ccSAndroid Build Coastguard Worker 
restoration_fast_internal(uint16_t * dgd16,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int sgr_params_idx,int radius_idx)1177*77c1e3ccSAndroid Build Coastguard Worker static inline int restoration_fast_internal(uint16_t *dgd16, int width,
1178*77c1e3ccSAndroid Build Coastguard Worker                                             int height, int dgd_stride,
1179*77c1e3ccSAndroid Build Coastguard Worker                                             int32_t *dst, int dst_stride,
1180*77c1e3ccSAndroid Build Coastguard Worker                                             int bit_depth, int sgr_params_idx,
1181*77c1e3ccSAndroid Build Coastguard Worker                                             int radius_idx) {
1182*77c1e3ccSAndroid Build Coastguard Worker   const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
1183*77c1e3ccSAndroid Build Coastguard Worker   const int r = params->r[radius_idx];
1184*77c1e3ccSAndroid Build Coastguard Worker   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
1185*77c1e3ccSAndroid Build Coastguard Worker   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
1186*77c1e3ccSAndroid Build Coastguard Worker   const int buf_stride = ((width_ext + 3) & ~3) + 16;
1187*77c1e3ccSAndroid Build Coastguard Worker 
1188*77c1e3ccSAndroid Build Coastguard Worker   const size_t buf_size = 3 * sizeof(int32_t) * RESTORATION_PROC_UNIT_PELS;
1189*77c1e3ccSAndroid Build Coastguard Worker   int32_t *buf = aom_memalign(8, buf_size);
1190*77c1e3ccSAndroid Build Coastguard Worker   if (!buf) return -1;
1191*77c1e3ccSAndroid Build Coastguard Worker 
1192*77c1e3ccSAndroid Build Coastguard Worker   int32_t *square_sum_buf = buf;
1193*77c1e3ccSAndroid Build Coastguard Worker   int32_t *sum_buf = square_sum_buf + RESTORATION_PROC_UNIT_PELS;
1194*77c1e3ccSAndroid Build Coastguard Worker   uint16_t *tmp16_buf = (uint16_t *)(sum_buf + RESTORATION_PROC_UNIT_PELS);
1195*77c1e3ccSAndroid Build Coastguard Worker   assert((char *)(sum_buf + RESTORATION_PROC_UNIT_PELS) <=
1196*77c1e3ccSAndroid Build Coastguard Worker              (char *)buf + buf_size &&
1197*77c1e3ccSAndroid Build Coastguard Worker          "Allocated buffer is too small. Resize the buffer.");
1198*77c1e3ccSAndroid Build Coastguard Worker 
1199*77c1e3ccSAndroid Build Coastguard Worker   assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
1200*77c1e3ccSAndroid Build Coastguard Worker   assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
1201*77c1e3ccSAndroid Build Coastguard Worker          "Need SGRPROJ_BORDER_* >= r+1");
1202*77c1e3ccSAndroid Build Coastguard Worker 
1203*77c1e3ccSAndroid Build Coastguard Worker   assert(radius_idx == 0);
1204*77c1e3ccSAndroid Build Coastguard Worker   assert(r == 2);
1205*77c1e3ccSAndroid Build Coastguard Worker 
1206*77c1e3ccSAndroid Build Coastguard Worker   // input(dgd16) is 16bit.
1207*77c1e3ccSAndroid Build Coastguard Worker   // sum of pixels 1st stage output will be in 16bit(tmp16_buf). End output is
1208*77c1e3ccSAndroid Build Coastguard Worker   // kept in 32bit [sum_buf]. sum of squares output is kept in 32bit
1209*77c1e3ccSAndroid Build Coastguard Worker   // buffer(square_sum_buf).
1210*77c1e3ccSAndroid Build Coastguard Worker   boxsum2((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT -
1211*77c1e3ccSAndroid Build Coastguard Worker                       SGRPROJ_BORDER_HORZ),
1212*77c1e3ccSAndroid Build Coastguard Worker           dgd_stride, (int16_t *)tmp16_buf, sum_buf, square_sum_buf, buf_stride,
1213*77c1e3ccSAndroid Build Coastguard Worker           width_ext, height_ext);
1214*77c1e3ccSAndroid Build Coastguard Worker 
1215*77c1e3ccSAndroid Build Coastguard Worker   square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
1216*77c1e3ccSAndroid Build Coastguard Worker   sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
1217*77c1e3ccSAndroid Build Coastguard Worker   tmp16_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
1218*77c1e3ccSAndroid Build Coastguard Worker 
1219*77c1e3ccSAndroid Build Coastguard Worker   // Calculation of a, b. a output is in 16bit tmp_buf which is in range of
1220*77c1e3ccSAndroid Build Coastguard Worker   // [1, 256] for all bit depths. b output is kept in 32bit buffer.
1221*77c1e3ccSAndroid Build Coastguard Worker 
1222*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
1223*77c1e3ccSAndroid Build Coastguard Worker   if (bit_depth > 8) {
1224*77c1e3ccSAndroid Build Coastguard Worker     calc_ab_fast_internal_hbd(
1225*77c1e3ccSAndroid Build Coastguard Worker         (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
1226*77c1e3ccSAndroid Build Coastguard Worker         (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2,
1227*77c1e3ccSAndroid Build Coastguard Worker         bit_depth, r, params->s[radius_idx], 2);
1228*77c1e3ccSAndroid Build Coastguard Worker   } else {
1229*77c1e3ccSAndroid Build Coastguard Worker     calc_ab_fast_internal_lbd(
1230*77c1e3ccSAndroid Build Coastguard Worker         (square_sum_buf - buf_stride - 1), (tmp16_buf - buf_stride - 1),
1231*77c1e3ccSAndroid Build Coastguard Worker         (sum_buf - buf_stride - 1), buf_stride * 2, width + 2, height + 2, r,
1232*77c1e3ccSAndroid Build Coastguard Worker         params->s[radius_idx], 2);
1233*77c1e3ccSAndroid Build Coastguard Worker   }
1234*77c1e3ccSAndroid Build Coastguard Worker #else
1235*77c1e3ccSAndroid Build Coastguard Worker   (void)bit_depth;
1236*77c1e3ccSAndroid Build Coastguard Worker   calc_ab_fast_internal_lbd((square_sum_buf - buf_stride - 1),
1237*77c1e3ccSAndroid Build Coastguard Worker                             (tmp16_buf - buf_stride - 1),
1238*77c1e3ccSAndroid Build Coastguard Worker                             (sum_buf - buf_stride - 1), buf_stride * 2,
1239*77c1e3ccSAndroid Build Coastguard Worker                             width + 2, height + 2, r, params->s[radius_idx], 2);
1240*77c1e3ccSAndroid Build Coastguard Worker #endif
1241*77c1e3ccSAndroid Build Coastguard Worker   final_filter_fast_internal(tmp16_buf, sum_buf, buf_stride, (int16_t *)dgd16,
1242*77c1e3ccSAndroid Build Coastguard Worker                              dgd_stride, dst, dst_stride, width, height);
1243*77c1e3ccSAndroid Build Coastguard Worker   aom_free(buf);
1244*77c1e3ccSAndroid Build Coastguard Worker   return 0;
1245*77c1e3ccSAndroid Build Coastguard Worker }
1246*77c1e3ccSAndroid Build Coastguard Worker 
restoration_internal(uint16_t * dgd16,int width,int height,int dgd_stride,int32_t * dst,int dst_stride,int bit_depth,int sgr_params_idx,int radius_idx)1247*77c1e3ccSAndroid Build Coastguard Worker static inline int restoration_internal(uint16_t *dgd16, int width, int height,
1248*77c1e3ccSAndroid Build Coastguard Worker                                        int dgd_stride, int32_t *dst,
1249*77c1e3ccSAndroid Build Coastguard Worker                                        int dst_stride, int bit_depth,
1250*77c1e3ccSAndroid Build Coastguard Worker                                        int sgr_params_idx, int radius_idx) {
1251*77c1e3ccSAndroid Build Coastguard Worker   const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
1252*77c1e3ccSAndroid Build Coastguard Worker   const int r = params->r[radius_idx];
1253*77c1e3ccSAndroid Build Coastguard Worker   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
1254*77c1e3ccSAndroid Build Coastguard Worker   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
1255*77c1e3ccSAndroid Build Coastguard Worker   const int buf_stride = ((width_ext + 3) & ~3) + 16;
1256*77c1e3ccSAndroid Build Coastguard Worker 
1257*77c1e3ccSAndroid Build Coastguard Worker   const size_t buf_size = 3 * sizeof(int32_t) * RESTORATION_PROC_UNIT_PELS;
1258*77c1e3ccSAndroid Build Coastguard Worker   int32_t *buf = aom_memalign(8, buf_size);
1259*77c1e3ccSAndroid Build Coastguard Worker   if (!buf) return -1;
1260*77c1e3ccSAndroid Build Coastguard Worker 
1261*77c1e3ccSAndroid Build Coastguard Worker   int32_t *square_sum_buf = buf;
1262*77c1e3ccSAndroid Build Coastguard Worker   int32_t *B = square_sum_buf + RESTORATION_PROC_UNIT_PELS;
1263*77c1e3ccSAndroid Build Coastguard Worker   uint16_t *A16 = (uint16_t *)(B + RESTORATION_PROC_UNIT_PELS);
1264*77c1e3ccSAndroid Build Coastguard Worker   uint16_t *sum_buf = A16 + RESTORATION_PROC_UNIT_PELS;
1265*77c1e3ccSAndroid Build Coastguard Worker 
1266*77c1e3ccSAndroid Build Coastguard Worker   assert((char *)(sum_buf + RESTORATION_PROC_UNIT_PELS) <=
1267*77c1e3ccSAndroid Build Coastguard Worker              (char *)buf + buf_size &&
1268*77c1e3ccSAndroid Build Coastguard Worker          "Allocated buffer is too small. Resize the buffer.");
1269*77c1e3ccSAndroid Build Coastguard Worker 
1270*77c1e3ccSAndroid Build Coastguard Worker   assert(r <= MAX_RADIUS && "Need MAX_RADIUS >= r");
1271*77c1e3ccSAndroid Build Coastguard Worker   assert(r <= SGRPROJ_BORDER_VERT - 1 && r <= SGRPROJ_BORDER_HORZ - 1 &&
1272*77c1e3ccSAndroid Build Coastguard Worker          "Need SGRPROJ_BORDER_* >= r+1");
1273*77c1e3ccSAndroid Build Coastguard Worker 
1274*77c1e3ccSAndroid Build Coastguard Worker   assert(radius_idx == 1);
1275*77c1e3ccSAndroid Build Coastguard Worker   assert(r == 1);
1276*77c1e3ccSAndroid Build Coastguard Worker 
1277*77c1e3ccSAndroid Build Coastguard Worker   // input(dgd16) is 16bit.
1278*77c1e3ccSAndroid Build Coastguard Worker   // sum of pixels output will be in 16bit(sum_buf).
1279*77c1e3ccSAndroid Build Coastguard Worker   // sum of squares output is kept in 32bit buffer(square_sum_buf).
1280*77c1e3ccSAndroid Build Coastguard Worker   boxsum1((int16_t *)(dgd16 - dgd_stride * SGRPROJ_BORDER_VERT -
1281*77c1e3ccSAndroid Build Coastguard Worker                       SGRPROJ_BORDER_HORZ),
1282*77c1e3ccSAndroid Build Coastguard Worker           dgd_stride, sum_buf, square_sum_buf, buf_stride, width_ext,
1283*77c1e3ccSAndroid Build Coastguard Worker           height_ext);
1284*77c1e3ccSAndroid Build Coastguard Worker 
1285*77c1e3ccSAndroid Build Coastguard Worker   square_sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
1286*77c1e3ccSAndroid Build Coastguard Worker   B += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
1287*77c1e3ccSAndroid Build Coastguard Worker   A16 += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
1288*77c1e3ccSAndroid Build Coastguard Worker   sum_buf += SGRPROJ_BORDER_VERT * buf_stride + SGRPROJ_BORDER_HORZ;
1289*77c1e3ccSAndroid Build Coastguard Worker 
1290*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
1291*77c1e3ccSAndroid Build Coastguard Worker   // Calculation of a, b. a output is in 16bit tmp_buf which is in range of
1292*77c1e3ccSAndroid Build Coastguard Worker   // [1, 256] for all bit depths. b output is kept in 32bit buffer.
1293*77c1e3ccSAndroid Build Coastguard Worker   if (bit_depth > 8) {
1294*77c1e3ccSAndroid Build Coastguard Worker     calc_ab_internal_hbd((square_sum_buf - buf_stride - 1),
1295*77c1e3ccSAndroid Build Coastguard Worker                          (A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
1296*77c1e3ccSAndroid Build Coastguard Worker                          (B - buf_stride - 1), buf_stride, width + 2,
1297*77c1e3ccSAndroid Build Coastguard Worker                          height + 2, bit_depth, r, params->s[radius_idx], 1);
1298*77c1e3ccSAndroid Build Coastguard Worker   } else {
1299*77c1e3ccSAndroid Build Coastguard Worker     calc_ab_internal_lbd((square_sum_buf - buf_stride - 1),
1300*77c1e3ccSAndroid Build Coastguard Worker                          (A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
1301*77c1e3ccSAndroid Build Coastguard Worker                          (B - buf_stride - 1), buf_stride, width + 2,
1302*77c1e3ccSAndroid Build Coastguard Worker                          height + 2, r, params->s[radius_idx], 1);
1303*77c1e3ccSAndroid Build Coastguard Worker   }
1304*77c1e3ccSAndroid Build Coastguard Worker #else
1305*77c1e3ccSAndroid Build Coastguard Worker   (void)bit_depth;
1306*77c1e3ccSAndroid Build Coastguard Worker   calc_ab_internal_lbd((square_sum_buf - buf_stride - 1),
1307*77c1e3ccSAndroid Build Coastguard Worker                        (A16 - buf_stride - 1), (sum_buf - buf_stride - 1),
1308*77c1e3ccSAndroid Build Coastguard Worker                        (B - buf_stride - 1), buf_stride, width + 2, height + 2,
1309*77c1e3ccSAndroid Build Coastguard Worker                        r, params->s[radius_idx], 1);
1310*77c1e3ccSAndroid Build Coastguard Worker #endif
1311*77c1e3ccSAndroid Build Coastguard Worker   final_filter_internal(A16, B, buf_stride, (int16_t *)dgd16, dgd_stride, dst,
1312*77c1e3ccSAndroid Build Coastguard Worker                         dst_stride, width, height);
1313*77c1e3ccSAndroid Build Coastguard Worker   aom_free(buf);
1314*77c1e3ccSAndroid Build Coastguard Worker   return 0;
1315*77c1e3ccSAndroid Build Coastguard Worker }
1316*77c1e3ccSAndroid Build Coastguard Worker 
src_convert_u8_to_u16(const uint8_t * src,const int src_stride,uint16_t * dst,const int dst_stride,const int width,const int height)1317*77c1e3ccSAndroid Build Coastguard Worker static inline void src_convert_u8_to_u16(const uint8_t *src,
1318*77c1e3ccSAndroid Build Coastguard Worker                                          const int src_stride, uint16_t *dst,
1319*77c1e3ccSAndroid Build Coastguard Worker                                          const int dst_stride, const int width,
1320*77c1e3ccSAndroid Build Coastguard Worker                                          const int height) {
1321*77c1e3ccSAndroid Build Coastguard Worker   const uint8_t *src_ptr;
1322*77c1e3ccSAndroid Build Coastguard Worker   uint16_t *dst_ptr;
1323*77c1e3ccSAndroid Build Coastguard Worker   int h, w, count = 0;
1324*77c1e3ccSAndroid Build Coastguard Worker 
1325*77c1e3ccSAndroid Build Coastguard Worker   uint8x8_t t1, t2, t3, t4;
1326*77c1e3ccSAndroid Build Coastguard Worker   uint16x8_t s1, s2, s3, s4;
1327*77c1e3ccSAndroid Build Coastguard Worker   h = height;
1328*77c1e3ccSAndroid Build Coastguard Worker   do {
1329*77c1e3ccSAndroid Build Coastguard Worker     src_ptr = src + (count << 2) * src_stride;
1330*77c1e3ccSAndroid Build Coastguard Worker     dst_ptr = dst + (count << 2) * dst_stride;
1331*77c1e3ccSAndroid Build Coastguard Worker     w = width;
1332*77c1e3ccSAndroid Build Coastguard Worker     if (w >= 7) {
1333*77c1e3ccSAndroid Build Coastguard Worker       do {
1334*77c1e3ccSAndroid Build Coastguard Worker         load_u8_8x4(src_ptr, src_stride, &t1, &t2, &t3, &t4);
1335*77c1e3ccSAndroid Build Coastguard Worker         s1 = vmovl_u8(t1);
1336*77c1e3ccSAndroid Build Coastguard Worker         s2 = vmovl_u8(t2);
1337*77c1e3ccSAndroid Build Coastguard Worker         s3 = vmovl_u8(t3);
1338*77c1e3ccSAndroid Build Coastguard Worker         s4 = vmovl_u8(t4);
1339*77c1e3ccSAndroid Build Coastguard Worker         store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4);
1340*77c1e3ccSAndroid Build Coastguard Worker 
1341*77c1e3ccSAndroid Build Coastguard Worker         src_ptr += 8;
1342*77c1e3ccSAndroid Build Coastguard Worker         dst_ptr += 8;
1343*77c1e3ccSAndroid Build Coastguard Worker         w -= 8;
1344*77c1e3ccSAndroid Build Coastguard Worker       } while (w > 7);
1345*77c1e3ccSAndroid Build Coastguard Worker     }
1346*77c1e3ccSAndroid Build Coastguard Worker 
1347*77c1e3ccSAndroid Build Coastguard Worker     for (int y = 0; y < w; y++) {
1348*77c1e3ccSAndroid Build Coastguard Worker       dst_ptr[y] = src_ptr[y];
1349*77c1e3ccSAndroid Build Coastguard Worker       dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride];
1350*77c1e3ccSAndroid Build Coastguard Worker       dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride];
1351*77c1e3ccSAndroid Build Coastguard Worker       dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride];
1352*77c1e3ccSAndroid Build Coastguard Worker     }
1353*77c1e3ccSAndroid Build Coastguard Worker     count++;
1354*77c1e3ccSAndroid Build Coastguard Worker     h -= 4;
1355*77c1e3ccSAndroid Build Coastguard Worker   } while (h > 3);
1356*77c1e3ccSAndroid Build Coastguard Worker 
1357*77c1e3ccSAndroid Build Coastguard Worker   src_ptr = src + (count << 2) * src_stride;
1358*77c1e3ccSAndroid Build Coastguard Worker   dst_ptr = dst + (count << 2) * dst_stride;
1359*77c1e3ccSAndroid Build Coastguard Worker   for (int x = 0; x < h; x++) {
1360*77c1e3ccSAndroid Build Coastguard Worker     for (int y = 0; y < width; y++) {
1361*77c1e3ccSAndroid Build Coastguard Worker       dst_ptr[y + x * dst_stride] = src_ptr[y + x * src_stride];
1362*77c1e3ccSAndroid Build Coastguard Worker     }
1363*77c1e3ccSAndroid Build Coastguard Worker   }
1364*77c1e3ccSAndroid Build Coastguard Worker 
1365*77c1e3ccSAndroid Build Coastguard Worker   // memset uninitialized rows of src buffer as they are needed for the
1366*77c1e3ccSAndroid Build Coastguard Worker   // boxsum filter calculation.
1367*77c1e3ccSAndroid Build Coastguard Worker   for (int x = height; x < height + 5; x++)
1368*77c1e3ccSAndroid Build Coastguard Worker     memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst));
1369*77c1e3ccSAndroid Build Coastguard Worker }
1370*77c1e3ccSAndroid Build Coastguard Worker 
1371*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
src_convert_hbd_copy(const uint16_t * src,int src_stride,uint16_t * dst,const int dst_stride,int width,int height)1372*77c1e3ccSAndroid Build Coastguard Worker static inline void src_convert_hbd_copy(const uint16_t *src, int src_stride,
1373*77c1e3ccSAndroid Build Coastguard Worker                                         uint16_t *dst, const int dst_stride,
1374*77c1e3ccSAndroid Build Coastguard Worker                                         int width, int height) {
1375*77c1e3ccSAndroid Build Coastguard Worker   const uint16_t *src_ptr;
1376*77c1e3ccSAndroid Build Coastguard Worker   uint16_t *dst_ptr;
1377*77c1e3ccSAndroid Build Coastguard Worker   int h, w, count = 0;
1378*77c1e3ccSAndroid Build Coastguard Worker   uint16x8_t s1, s2, s3, s4;
1379*77c1e3ccSAndroid Build Coastguard Worker 
1380*77c1e3ccSAndroid Build Coastguard Worker   h = height;
1381*77c1e3ccSAndroid Build Coastguard Worker   do {
1382*77c1e3ccSAndroid Build Coastguard Worker     src_ptr = src + (count << 2) * src_stride;
1383*77c1e3ccSAndroid Build Coastguard Worker     dst_ptr = dst + (count << 2) * dst_stride;
1384*77c1e3ccSAndroid Build Coastguard Worker     w = width;
1385*77c1e3ccSAndroid Build Coastguard Worker     do {
1386*77c1e3ccSAndroid Build Coastguard Worker       load_u16_8x4(src_ptr, src_stride, &s1, &s2, &s3, &s4);
1387*77c1e3ccSAndroid Build Coastguard Worker       store_u16_8x4(dst_ptr, dst_stride, s1, s2, s3, s4);
1388*77c1e3ccSAndroid Build Coastguard Worker       src_ptr += 8;
1389*77c1e3ccSAndroid Build Coastguard Worker       dst_ptr += 8;
1390*77c1e3ccSAndroid Build Coastguard Worker       w -= 8;
1391*77c1e3ccSAndroid Build Coastguard Worker     } while (w > 7);
1392*77c1e3ccSAndroid Build Coastguard Worker 
1393*77c1e3ccSAndroid Build Coastguard Worker     for (int y = 0; y < w; y++) {
1394*77c1e3ccSAndroid Build Coastguard Worker       dst_ptr[y] = src_ptr[y];
1395*77c1e3ccSAndroid Build Coastguard Worker       dst_ptr[y + 1 * dst_stride] = src_ptr[y + 1 * src_stride];
1396*77c1e3ccSAndroid Build Coastguard Worker       dst_ptr[y + 2 * dst_stride] = src_ptr[y + 2 * src_stride];
1397*77c1e3ccSAndroid Build Coastguard Worker       dst_ptr[y + 3 * dst_stride] = src_ptr[y + 3 * src_stride];
1398*77c1e3ccSAndroid Build Coastguard Worker     }
1399*77c1e3ccSAndroid Build Coastguard Worker     count++;
1400*77c1e3ccSAndroid Build Coastguard Worker     h -= 4;
1401*77c1e3ccSAndroid Build Coastguard Worker   } while (h > 3);
1402*77c1e3ccSAndroid Build Coastguard Worker 
1403*77c1e3ccSAndroid Build Coastguard Worker   src_ptr = src + (count << 2) * src_stride;
1404*77c1e3ccSAndroid Build Coastguard Worker   dst_ptr = dst + (count << 2) * dst_stride;
1405*77c1e3ccSAndroid Build Coastguard Worker 
1406*77c1e3ccSAndroid Build Coastguard Worker   for (int x = 0; x < h; x++) {
1407*77c1e3ccSAndroid Build Coastguard Worker     memcpy((dst_ptr + x * dst_stride), (src_ptr + x * src_stride),
1408*77c1e3ccSAndroid Build Coastguard Worker            sizeof(uint16_t) * width);
1409*77c1e3ccSAndroid Build Coastguard Worker   }
1410*77c1e3ccSAndroid Build Coastguard Worker   // memset uninitialized rows of src buffer as they are needed for the
1411*77c1e3ccSAndroid Build Coastguard Worker   // boxsum filter calculation.
1412*77c1e3ccSAndroid Build Coastguard Worker   for (int x = height; x < height + 5; x++)
1413*77c1e3ccSAndroid Build Coastguard Worker     memset(dst + x * dst_stride, 0, (width + 2) * sizeof(*dst));
1414*77c1e3ccSAndroid Build Coastguard Worker }
1415*77c1e3ccSAndroid Build Coastguard Worker #endif  // CONFIG_AV1_HIGHBITDEPTH
1416*77c1e3ccSAndroid Build Coastguard Worker 
av1_selfguided_restoration_neon(const uint8_t * dat8,int width,int height,int stride,int32_t * flt0,int32_t * flt1,int flt_stride,int sgr_params_idx,int bit_depth,int highbd)1417*77c1e3ccSAndroid Build Coastguard Worker int av1_selfguided_restoration_neon(const uint8_t *dat8, int width, int height,
1418*77c1e3ccSAndroid Build Coastguard Worker                                     int stride, int32_t *flt0, int32_t *flt1,
1419*77c1e3ccSAndroid Build Coastguard Worker                                     int flt_stride, int sgr_params_idx,
1420*77c1e3ccSAndroid Build Coastguard Worker                                     int bit_depth, int highbd) {
1421*77c1e3ccSAndroid Build Coastguard Worker   const sgr_params_type *const params = &av1_sgr_params[sgr_params_idx];
1422*77c1e3ccSAndroid Build Coastguard Worker   assert(!(params->r[0] == 0 && params->r[1] == 0));
1423*77c1e3ccSAndroid Build Coastguard Worker 
1424*77c1e3ccSAndroid Build Coastguard Worker   uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
1425*77c1e3ccSAndroid Build Coastguard Worker   const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ;
1426*77c1e3ccSAndroid Build Coastguard Worker   uint16_t *dgd16 =
1427*77c1e3ccSAndroid Build Coastguard Worker       dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
1428*77c1e3ccSAndroid Build Coastguard Worker   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
1429*77c1e3ccSAndroid Build Coastguard Worker   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
1430*77c1e3ccSAndroid Build Coastguard Worker   const int dgd_stride = stride;
1431*77c1e3ccSAndroid Build Coastguard Worker 
1432*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
1433*77c1e3ccSAndroid Build Coastguard Worker   if (highbd) {
1434*77c1e3ccSAndroid Build Coastguard Worker     const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
1435*77c1e3ccSAndroid Build Coastguard Worker     src_convert_hbd_copy(
1436*77c1e3ccSAndroid Build Coastguard Worker         dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
1437*77c1e3ccSAndroid Build Coastguard Worker         dgd_stride,
1438*77c1e3ccSAndroid Build Coastguard Worker         dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
1439*77c1e3ccSAndroid Build Coastguard Worker         dgd16_stride, width_ext, height_ext);
1440*77c1e3ccSAndroid Build Coastguard Worker   } else {
1441*77c1e3ccSAndroid Build Coastguard Worker     src_convert_u8_to_u16(
1442*77c1e3ccSAndroid Build Coastguard Worker         dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
1443*77c1e3ccSAndroid Build Coastguard Worker         dgd_stride,
1444*77c1e3ccSAndroid Build Coastguard Worker         dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
1445*77c1e3ccSAndroid Build Coastguard Worker         dgd16_stride, width_ext, height_ext);
1446*77c1e3ccSAndroid Build Coastguard Worker   }
1447*77c1e3ccSAndroid Build Coastguard Worker #else
1448*77c1e3ccSAndroid Build Coastguard Worker   (void)highbd;
1449*77c1e3ccSAndroid Build Coastguard Worker   src_convert_u8_to_u16(
1450*77c1e3ccSAndroid Build Coastguard Worker       dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride,
1451*77c1e3ccSAndroid Build Coastguard Worker       dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
1452*77c1e3ccSAndroid Build Coastguard Worker       dgd16_stride, width_ext, height_ext);
1453*77c1e3ccSAndroid Build Coastguard Worker #endif
1454*77c1e3ccSAndroid Build Coastguard Worker 
1455*77c1e3ccSAndroid Build Coastguard Worker   if (params->r[0] > 0) {
1456*77c1e3ccSAndroid Build Coastguard Worker     int ret =
1457*77c1e3ccSAndroid Build Coastguard Worker         restoration_fast_internal(dgd16, width, height, dgd16_stride, flt0,
1458*77c1e3ccSAndroid Build Coastguard Worker                                   flt_stride, bit_depth, sgr_params_idx, 0);
1459*77c1e3ccSAndroid Build Coastguard Worker     if (ret != 0) return ret;
1460*77c1e3ccSAndroid Build Coastguard Worker   }
1461*77c1e3ccSAndroid Build Coastguard Worker   if (params->r[1] > 0) {
1462*77c1e3ccSAndroid Build Coastguard Worker     int ret = restoration_internal(dgd16, width, height, dgd16_stride, flt1,
1463*77c1e3ccSAndroid Build Coastguard Worker                                    flt_stride, bit_depth, sgr_params_idx, 1);
1464*77c1e3ccSAndroid Build Coastguard Worker     if (ret != 0) return ret;
1465*77c1e3ccSAndroid Build Coastguard Worker   }
1466*77c1e3ccSAndroid Build Coastguard Worker   return 0;
1467*77c1e3ccSAndroid Build Coastguard Worker }
1468*77c1e3ccSAndroid Build Coastguard Worker 
av1_apply_selfguided_restoration_neon(const uint8_t * dat8,int width,int height,int stride,int eps,const int * xqd,uint8_t * dst8,int dst_stride,int32_t * tmpbuf,int bit_depth,int highbd)1469*77c1e3ccSAndroid Build Coastguard Worker int av1_apply_selfguided_restoration_neon(const uint8_t *dat8, int width,
1470*77c1e3ccSAndroid Build Coastguard Worker                                           int height, int stride, int eps,
1471*77c1e3ccSAndroid Build Coastguard Worker                                           const int *xqd, uint8_t *dst8,
1472*77c1e3ccSAndroid Build Coastguard Worker                                           int dst_stride, int32_t *tmpbuf,
1473*77c1e3ccSAndroid Build Coastguard Worker                                           int bit_depth, int highbd) {
1474*77c1e3ccSAndroid Build Coastguard Worker   int32_t *flt0 = tmpbuf;
1475*77c1e3ccSAndroid Build Coastguard Worker   int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
1476*77c1e3ccSAndroid Build Coastguard Worker   assert(width * height <= RESTORATION_UNITPELS_MAX);
1477*77c1e3ccSAndroid Build Coastguard Worker   uint16_t dgd16_[RESTORATION_PROC_UNIT_PELS];
1478*77c1e3ccSAndroid Build Coastguard Worker   const int dgd16_stride = width + 2 * SGRPROJ_BORDER_HORZ;
1479*77c1e3ccSAndroid Build Coastguard Worker   uint16_t *dgd16 =
1480*77c1e3ccSAndroid Build Coastguard Worker       dgd16_ + dgd16_stride * SGRPROJ_BORDER_VERT + SGRPROJ_BORDER_HORZ;
1481*77c1e3ccSAndroid Build Coastguard Worker   const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
1482*77c1e3ccSAndroid Build Coastguard Worker   const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;
1483*77c1e3ccSAndroid Build Coastguard Worker   const int dgd_stride = stride;
1484*77c1e3ccSAndroid Build Coastguard Worker   const sgr_params_type *const params = &av1_sgr_params[eps];
1485*77c1e3ccSAndroid Build Coastguard Worker   int xq[2];
1486*77c1e3ccSAndroid Build Coastguard Worker 
1487*77c1e3ccSAndroid Build Coastguard Worker   assert(!(params->r[0] == 0 && params->r[1] == 0));
1488*77c1e3ccSAndroid Build Coastguard Worker 
1489*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
1490*77c1e3ccSAndroid Build Coastguard Worker   if (highbd) {
1491*77c1e3ccSAndroid Build Coastguard Worker     const uint16_t *dgd16_tmp = CONVERT_TO_SHORTPTR(dat8);
1492*77c1e3ccSAndroid Build Coastguard Worker     src_convert_hbd_copy(
1493*77c1e3ccSAndroid Build Coastguard Worker         dgd16_tmp - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
1494*77c1e3ccSAndroid Build Coastguard Worker         dgd_stride,
1495*77c1e3ccSAndroid Build Coastguard Worker         dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
1496*77c1e3ccSAndroid Build Coastguard Worker         dgd16_stride, width_ext, height_ext);
1497*77c1e3ccSAndroid Build Coastguard Worker   } else {
1498*77c1e3ccSAndroid Build Coastguard Worker     src_convert_u8_to_u16(
1499*77c1e3ccSAndroid Build Coastguard Worker         dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ,
1500*77c1e3ccSAndroid Build Coastguard Worker         dgd_stride,
1501*77c1e3ccSAndroid Build Coastguard Worker         dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
1502*77c1e3ccSAndroid Build Coastguard Worker         dgd16_stride, width_ext, height_ext);
1503*77c1e3ccSAndroid Build Coastguard Worker   }
1504*77c1e3ccSAndroid Build Coastguard Worker #else
1505*77c1e3ccSAndroid Build Coastguard Worker   (void)highbd;
1506*77c1e3ccSAndroid Build Coastguard Worker   src_convert_u8_to_u16(
1507*77c1e3ccSAndroid Build Coastguard Worker       dat8 - SGRPROJ_BORDER_VERT * dgd_stride - SGRPROJ_BORDER_HORZ, dgd_stride,
1508*77c1e3ccSAndroid Build Coastguard Worker       dgd16 - SGRPROJ_BORDER_VERT * dgd16_stride - SGRPROJ_BORDER_HORZ,
1509*77c1e3ccSAndroid Build Coastguard Worker       dgd16_stride, width_ext, height_ext);
1510*77c1e3ccSAndroid Build Coastguard Worker #endif
1511*77c1e3ccSAndroid Build Coastguard Worker   if (params->r[0] > 0) {
1512*77c1e3ccSAndroid Build Coastguard Worker     int ret = restoration_fast_internal(dgd16, width, height, dgd16_stride,
1513*77c1e3ccSAndroid Build Coastguard Worker                                         flt0, width, bit_depth, eps, 0);
1514*77c1e3ccSAndroid Build Coastguard Worker     if (ret != 0) return ret;
1515*77c1e3ccSAndroid Build Coastguard Worker   }
1516*77c1e3ccSAndroid Build Coastguard Worker   if (params->r[1] > 0) {
1517*77c1e3ccSAndroid Build Coastguard Worker     int ret = restoration_internal(dgd16, width, height, dgd16_stride, flt1,
1518*77c1e3ccSAndroid Build Coastguard Worker                                    width, bit_depth, eps, 1);
1519*77c1e3ccSAndroid Build Coastguard Worker     if (ret != 0) return ret;
1520*77c1e3ccSAndroid Build Coastguard Worker   }
1521*77c1e3ccSAndroid Build Coastguard Worker 
1522*77c1e3ccSAndroid Build Coastguard Worker   av1_decode_xq(xqd, xq, params);
1523*77c1e3ccSAndroid Build Coastguard Worker 
1524*77c1e3ccSAndroid Build Coastguard Worker   {
1525*77c1e3ccSAndroid Build Coastguard Worker     int16_t *src_ptr;
1526*77c1e3ccSAndroid Build Coastguard Worker     uint8_t *dst_ptr;
1527*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
1528*77c1e3ccSAndroid Build Coastguard Worker     uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst8);
1529*77c1e3ccSAndroid Build Coastguard Worker     uint16_t *dst16_ptr;
1530*77c1e3ccSAndroid Build Coastguard Worker #endif
1531*77c1e3ccSAndroid Build Coastguard Worker     int16x4_t d0, d4;
1532*77c1e3ccSAndroid Build Coastguard Worker     int16x8_t r0, s0;
1533*77c1e3ccSAndroid Build Coastguard Worker     uint16x8_t r4;
1534*77c1e3ccSAndroid Build Coastguard Worker     int32x4_t u0, u4, v0, v4, f00, f10;
1535*77c1e3ccSAndroid Build Coastguard Worker     uint8x8_t t0;
1536*77c1e3ccSAndroid Build Coastguard Worker     int count = 0, w = width, h = height, rc = 0;
1537*77c1e3ccSAndroid Build Coastguard Worker 
1538*77c1e3ccSAndroid Build Coastguard Worker     const int32x4_t xq0_vec = vdupq_n_s32(xq[0]);
1539*77c1e3ccSAndroid Build Coastguard Worker     const int32x4_t xq1_vec = vdupq_n_s32(xq[1]);
1540*77c1e3ccSAndroid Build Coastguard Worker     const int16x8_t zero = vdupq_n_s16(0);
1541*77c1e3ccSAndroid Build Coastguard Worker     const uint16x8_t max = vdupq_n_u16((1 << bit_depth) - 1);
1542*77c1e3ccSAndroid Build Coastguard Worker     src_ptr = (int16_t *)dgd16;
1543*77c1e3ccSAndroid Build Coastguard Worker     do {
1544*77c1e3ccSAndroid Build Coastguard Worker       w = width;
1545*77c1e3ccSAndroid Build Coastguard Worker       count = 0;
1546*77c1e3ccSAndroid Build Coastguard Worker       dst_ptr = dst8 + rc * dst_stride;
1547*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
1548*77c1e3ccSAndroid Build Coastguard Worker       dst16_ptr = dst16 + rc * dst_stride;
1549*77c1e3ccSAndroid Build Coastguard Worker #endif
1550*77c1e3ccSAndroid Build Coastguard Worker       do {
1551*77c1e3ccSAndroid Build Coastguard Worker         s0 = vld1q_s16(src_ptr + count);
1552*77c1e3ccSAndroid Build Coastguard Worker 
1553*77c1e3ccSAndroid Build Coastguard Worker         u0 = vshll_n_s16(vget_low_s16(s0), SGRPROJ_RST_BITS);
1554*77c1e3ccSAndroid Build Coastguard Worker         u4 = vshll_n_s16(vget_high_s16(s0), SGRPROJ_RST_BITS);
1555*77c1e3ccSAndroid Build Coastguard Worker 
1556*77c1e3ccSAndroid Build Coastguard Worker         v0 = vshlq_n_s32(u0, SGRPROJ_PRJ_BITS);
1557*77c1e3ccSAndroid Build Coastguard Worker         v4 = vshlq_n_s32(u4, SGRPROJ_PRJ_BITS);
1558*77c1e3ccSAndroid Build Coastguard Worker 
1559*77c1e3ccSAndroid Build Coastguard Worker         if (params->r[0] > 0) {
1560*77c1e3ccSAndroid Build Coastguard Worker           f00 = vld1q_s32(flt0 + count);
1561*77c1e3ccSAndroid Build Coastguard Worker           f10 = vld1q_s32(flt0 + count + 4);
1562*77c1e3ccSAndroid Build Coastguard Worker 
1563*77c1e3ccSAndroid Build Coastguard Worker           f00 = vsubq_s32(f00, u0);
1564*77c1e3ccSAndroid Build Coastguard Worker           f10 = vsubq_s32(f10, u4);
1565*77c1e3ccSAndroid Build Coastguard Worker 
1566*77c1e3ccSAndroid Build Coastguard Worker           v0 = vmlaq_s32(v0, xq0_vec, f00);
1567*77c1e3ccSAndroid Build Coastguard Worker           v4 = vmlaq_s32(v4, xq0_vec, f10);
1568*77c1e3ccSAndroid Build Coastguard Worker         }
1569*77c1e3ccSAndroid Build Coastguard Worker 
1570*77c1e3ccSAndroid Build Coastguard Worker         if (params->r[1] > 0) {
1571*77c1e3ccSAndroid Build Coastguard Worker           f00 = vld1q_s32(flt1 + count);
1572*77c1e3ccSAndroid Build Coastguard Worker           f10 = vld1q_s32(flt1 + count + 4);
1573*77c1e3ccSAndroid Build Coastguard Worker 
1574*77c1e3ccSAndroid Build Coastguard Worker           f00 = vsubq_s32(f00, u0);
1575*77c1e3ccSAndroid Build Coastguard Worker           f10 = vsubq_s32(f10, u4);
1576*77c1e3ccSAndroid Build Coastguard Worker 
1577*77c1e3ccSAndroid Build Coastguard Worker           v0 = vmlaq_s32(v0, xq1_vec, f00);
1578*77c1e3ccSAndroid Build Coastguard Worker           v4 = vmlaq_s32(v4, xq1_vec, f10);
1579*77c1e3ccSAndroid Build Coastguard Worker         }
1580*77c1e3ccSAndroid Build Coastguard Worker 
1581*77c1e3ccSAndroid Build Coastguard Worker         d0 = vqrshrn_n_s32(v0, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
1582*77c1e3ccSAndroid Build Coastguard Worker         d4 = vqrshrn_n_s32(v4, SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
1583*77c1e3ccSAndroid Build Coastguard Worker 
1584*77c1e3ccSAndroid Build Coastguard Worker         r0 = vcombine_s16(d0, d4);
1585*77c1e3ccSAndroid Build Coastguard Worker 
1586*77c1e3ccSAndroid Build Coastguard Worker         r4 = vreinterpretq_u16_s16(vmaxq_s16(r0, zero));
1587*77c1e3ccSAndroid Build Coastguard Worker 
1588*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
1589*77c1e3ccSAndroid Build Coastguard Worker         if (highbd) {
1590*77c1e3ccSAndroid Build Coastguard Worker           r4 = vminq_u16(r4, max);
1591*77c1e3ccSAndroid Build Coastguard Worker           vst1q_u16(dst16_ptr, r4);
1592*77c1e3ccSAndroid Build Coastguard Worker           dst16_ptr += 8;
1593*77c1e3ccSAndroid Build Coastguard Worker         } else {
1594*77c1e3ccSAndroid Build Coastguard Worker           t0 = vqmovn_u16(r4);
1595*77c1e3ccSAndroid Build Coastguard Worker           vst1_u8(dst_ptr, t0);
1596*77c1e3ccSAndroid Build Coastguard Worker           dst_ptr += 8;
1597*77c1e3ccSAndroid Build Coastguard Worker         }
1598*77c1e3ccSAndroid Build Coastguard Worker #else
1599*77c1e3ccSAndroid Build Coastguard Worker         (void)max;
1600*77c1e3ccSAndroid Build Coastguard Worker         t0 = vqmovn_u16(r4);
1601*77c1e3ccSAndroid Build Coastguard Worker         vst1_u8(dst_ptr, t0);
1602*77c1e3ccSAndroid Build Coastguard Worker         dst_ptr += 8;
1603*77c1e3ccSAndroid Build Coastguard Worker #endif
1604*77c1e3ccSAndroid Build Coastguard Worker         w -= 8;
1605*77c1e3ccSAndroid Build Coastguard Worker         count += 8;
1606*77c1e3ccSAndroid Build Coastguard Worker       } while (w > 0);
1607*77c1e3ccSAndroid Build Coastguard Worker 
1608*77c1e3ccSAndroid Build Coastguard Worker       src_ptr += dgd16_stride;
1609*77c1e3ccSAndroid Build Coastguard Worker       flt1 += width;
1610*77c1e3ccSAndroid Build Coastguard Worker       flt0 += width;
1611*77c1e3ccSAndroid Build Coastguard Worker       rc++;
1612*77c1e3ccSAndroid Build Coastguard Worker       h--;
1613*77c1e3ccSAndroid Build Coastguard Worker     } while (h > 0);
1614*77c1e3ccSAndroid Build Coastguard Worker   }
1615*77c1e3ccSAndroid Build Coastguard Worker   return 0;
1616*77c1e3ccSAndroid Build Coastguard Worker }
1617