xref: /aosp_15_r20/external/libvpx/vpx_dsp/arm/deblock_neon.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
/*
 *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*fb1b10abSAndroid Build Coastguard Worker 
11*fb1b10abSAndroid Build Coastguard Worker #include <arm_neon.h>
12*fb1b10abSAndroid Build Coastguard Worker #include <assert.h>
13*fb1b10abSAndroid Build Coastguard Worker 
14*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_dsp_rtcd.h"
15*fb1b10abSAndroid Build Coastguard Worker #include "vpx/vpx_integer.h"
16*fb1b10abSAndroid Build Coastguard Worker #include "vpx_dsp/arm/transpose_neon.h"
17*fb1b10abSAndroid Build Coastguard Worker 
18*fb1b10abSAndroid Build Coastguard Worker extern const int16_t vpx_rv[];
19*fb1b10abSAndroid Build Coastguard Worker 
// Blends v0 with its four neighbours using cascaded rounding halving adds:
//   rhadd(rhadd(rhadd(a2, a1), rhadd(b2, b1)), v0)
// a2/a1 are the two taps on one side of v0, b1/b2 the two on the other.
static uint8x8_t average_k_out(const uint8x8_t a2, const uint8x8_t a1,
                               const uint8x8_t v0, const uint8x8_t b1,
                               const uint8x8_t b2) {
  const uint8x8_t avg_a = vrhadd_u8(a2, a1);
  const uint8x8_t avg_b = vrhadd_u8(b2, b1);
  return vrhadd_u8(vrhadd_u8(avg_a, avg_b), v0);
}
28*fb1b10abSAndroid Build Coastguard Worker 
// Returns an all-ones lane wherever every neighbour of v0 differs from it by
// strictly less than the corresponding lane of |filter|, i.e.
//   max(|a2-v0|, |a1-v0|, |b1-v0|, |b2-v0|) < filter
static uint8x8_t generate_mask(const uint8x8_t a2, const uint8x8_t a1,
                               const uint8x8_t v0, const uint8x8_t b1,
                               const uint8x8_t b2, const uint8x8_t filter) {
  // Largest absolute difference on each side of v0.
  const uint8x8_t diff_a = vmax_u8(vabd_u8(a2, v0), vabd_u8(a1, v0));
  const uint8x8_t diff_b = vmax_u8(vabd_u8(b1, v0), vabd_u8(b2, v0));
  const uint8x8_t largest = vmax_u8(diff_a, diff_b);

  return vclt_u8(largest, filter);
}
42*fb1b10abSAndroid Build Coastguard Worker 
// Per lane: selects the blended value from average_k_out() where
// generate_mask() is set, and the unmodified v0 everywhere else.
static uint8x8_t generate_output(const uint8x8_t a2, const uint8x8_t a1,
                                 const uint8x8_t v0, const uint8x8_t b1,
                                 const uint8x8_t b2, const uint8x8_t filter) {
  const uint8x8_t use_blend = generate_mask(a2, a1, v0, b1, b2, filter);
  const uint8x8_t blended = average_k_out(a2, a1, v0, b1, b2);

  return vbsl_u8(use_blend, blended, v0);
}
51*fb1b10abSAndroid Build Coastguard Worker 
// 16-lane (uint8x16_t) counterparts of the helpers above.

// Blends v0 with its four neighbours using cascaded rounding halving adds:
//   rhadd(rhadd(rhadd(a2, a1), rhadd(b2, b1)), v0)
static uint8x16_t average_k_outq(const uint8x16_t a2, const uint8x16_t a1,
                                 const uint8x16_t v0, const uint8x16_t b1,
                                 const uint8x16_t b2) {
  const uint8x16_t avg_a = vrhaddq_u8(a2, a1);
  const uint8x16_t avg_b = vrhaddq_u8(b2, b1);
  return vrhaddq_u8(vrhaddq_u8(avg_a, avg_b), v0);
}
61*fb1b10abSAndroid Build Coastguard Worker 
// Returns an all-ones lane wherever
//   max(|a2-v0|, |a1-v0|, |b1-v0|, |b2-v0|) < filter
// and an all-zeros lane otherwise.
static uint8x16_t generate_maskq(const uint8x16_t a2, const uint8x16_t a1,
                                 const uint8x16_t v0, const uint8x16_t b1,
                                 const uint8x16_t b2, const uint8x16_t filter) {
  // Largest absolute difference on each side of v0.
  const uint8x16_t diff_a = vmaxq_u8(vabdq_u8(a2, v0), vabdq_u8(a1, v0));
  const uint8x16_t diff_b = vmaxq_u8(vabdq_u8(b1, v0), vabdq_u8(b2, v0));
  const uint8x16_t largest = vmaxq_u8(diff_a, diff_b);

  return vcltq_u8(largest, filter);
}
75*fb1b10abSAndroid Build Coastguard Worker 
// Per lane: selects the blended value from average_k_outq() where
// generate_maskq() is set, and the unmodified v0 everywhere else.
static uint8x16_t generate_outputq(const uint8x16_t a2, const uint8x16_t a1,
                                   const uint8x16_t v0, const uint8x16_t b1,
                                   const uint8x16_t b2,
                                   const uint8x16_t filter) {
  const uint8x16_t use_blend = generate_maskq(a2, a1, v0, b1, b2, filter);
  const uint8x16_t blended = average_k_outq(a2, a1, v0, b1, b2);

  return vbslq_u8(use_blend, blended, v0);
}
85*fb1b10abSAndroid Build Coastguard Worker 
// Down (vertical) pass for one 16-pixel-wide stripe.
// |src| / |dst| point at row 0 of the stripe and |f| at the 16 per-column
// limits for it. Rows are handled four at a time; when |size| is a multiple
// of 4, source rows -2 .. size + 1 are read and rows 0 .. size - 1 of |dst|
// are written.
static void deblock_down_stripe16(const uint8_t *src, uint8_t *dst,
                                  int src_stride, int dst_stride,
                                  const uint8_t *f, int size) {
  // The limits depend only on the column, so one load serves every row.
  const uint8x16_t filterq = vld1q_u8(f);
  uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7;
  int row;

  // Prime the sliding window with the two rows above the stripe and its first
  // two rows.
  src -= 2 * src_stride;
  a0 = vld1q_u8(src);
  src += src_stride;
  a1 = vld1q_u8(src);
  src += src_stride;
  a2 = vld1q_u8(src);
  src += src_stride;
  a3 = vld1q_u8(src);
  src += src_stride;

  for (row = 0; row < size; row += 4) {
    uint8x16_t v_out_0, v_out_1, v_out_2, v_out_3;

    // All loads happen before any store of this iteration.
    a4 = vld1q_u8(src);
    src += src_stride;
    a5 = vld1q_u8(src);
    src += src_stride;
    a6 = vld1q_u8(src);
    src += src_stride;
    a7 = vld1q_u8(src);
    src += src_stride;

    // The middle argument is the row being filtered.
    v_out_0 = generate_outputq(a0, a1, a2, a3, a4, filterq);
    v_out_1 = generate_outputq(a1, a2, a3, a4, a5, filterq);
    v_out_2 = generate_outputq(a2, a3, a4, a5, a6, filterq);
    v_out_3 = generate_outputq(a3, a4, a5, a6, a7, filterq);

    vst1q_u8(dst, v_out_0);
    dst += dst_stride;
    vst1q_u8(dst, v_out_1);
    dst += dst_stride;
    vst1q_u8(dst, v_out_2);
    dst += dst_stride;
    vst1q_u8(dst, v_out_3);
    dst += dst_stride;

    // Slide the window down by four rows.
    a0 = a4;
    a1 = a5;
    a2 = a6;
    a3 = a7;
  }
}

// Down (vertical) pass for one 8-pixel-wide stripe; same scheme as
// deblock_down_stripe16() but with 64-bit vectors.
static void deblock_down_stripe8(const uint8_t *src, uint8_t *dst,
                                 int src_stride, int dst_stride,
                                 const uint8_t *f, int size) {
  // The limits depend only on the column, so one load serves every row.
  const uint8x8_t filter = vld1_u8(f);
  uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7;
  int row;

  // Prime the sliding window with the two rows above the stripe and its first
  // two rows.
  src -= 2 * src_stride;
  a0 = vld1_u8(src);
  src += src_stride;
  a1 = vld1_u8(src);
  src += src_stride;
  a2 = vld1_u8(src);
  src += src_stride;
  a3 = vld1_u8(src);
  src += src_stride;

  for (row = 0; row < size; row += 4) {
    uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3;

    // All loads happen before any store of this iteration.
    a4 = vld1_u8(src);
    src += src_stride;
    a5 = vld1_u8(src);
    src += src_stride;
    a6 = vld1_u8(src);
    src += src_stride;
    a7 = vld1_u8(src);
    src += src_stride;

    // The middle argument is the row being filtered.
    v_out_0 = generate_output(a0, a1, a2, a3, a4, filter);
    v_out_1 = generate_output(a1, a2, a3, a4, a5, filter);
    v_out_2 = generate_output(a2, a3, a4, a5, a6, filter);
    v_out_3 = generate_output(a3, a4, a5, a6, a7, filter);

    vst1_u8(dst, v_out_0);
    dst += dst_stride;
    vst1_u8(dst, v_out_1);
    dst += dst_stride;
    vst1_u8(dst, v_out_2);
    dst += dst_stride;
    vst1_u8(dst, v_out_3);
    dst += dst_stride;

    // Slide the window down by four rows.
    a0 = a4;
    a1 = a5;
    a2 = a6;
    a3 = a7;
  }
}

// Across (horizontal) pass, in place, over one band of 8 rows starting at
// |dst_ptr|. Each 8x8 tile is transposed on load so that the same
// generate_output() helper can be applied, then transposed back on store.
static void deblock_across_8rows(uint8_t *dst_ptr, int dst_stride, int cols,
                                 const uint8_t *f) {
  uint8x8_t a0, a1, a2, a3;
  uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7;
  uint8_t *src = dst_ptr;
  uint8_t *dst = dst_ptr;
  int col;

  // Only the first two transposed vectors (taken to be columns 0 and 1) are
  // kept; the other two are overwritten here and re-read by the loop below,
  // which starts loading at column 2.
  load_and_transpose_u8_4x8(src, dst_stride, &a0, &a1, &a2, &a3);
  a3 = a1;
  a2 = a1 = a0;  // Extend left border.

  src += 2;

  for (col = 0; col < cols; col += 8) {
    uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3, v_out_4, v_out_5, v_out_6,
        v_out_7;
    // Although the filter is meant to be applied vertically and is instead
    // being applied horizontally here it's OK because it's set in blocks of 8
    // (or 16).
    const uint8x8_t filter = vld1_u8(f + col);

    // b0..b7 hold columns col + 2 .. col + 9.
    load_and_transpose_u8_8x8(src, dst_stride, &b0, &b1, &b2, &b3, &b4, &b5,
                              &b6, &b7);

    if (col + 8 == cols) {
      // Last block of columns: b5 is the final valid column, so replicate it
      // to extend the right border.
      b6 = b7 = b5;
    }

    v_out_0 = generate_output(a0, a1, a2, a3, b0, filter);
    v_out_1 = generate_output(a1, a2, a3, b0, b1, filter);
    v_out_2 = generate_output(a2, a3, b0, b1, b2, filter);
    v_out_3 = generate_output(a3, b0, b1, b2, b3, filter);
    v_out_4 = generate_output(b0, b1, b2, b3, b4, filter);
    v_out_5 = generate_output(b1, b2, b3, b4, b5, filter);
    v_out_6 = generate_output(b2, b3, b4, b5, b6, filter);
    v_out_7 = generate_output(b3, b4, b5, b6, b7, filter);

    transpose_and_store_u8_8x8(dst, dst_stride, v_out_0, v_out_1, v_out_2,
                               v_out_3, v_out_4, v_out_5, v_out_6, v_out_7);

    // Carry the unfiltered trailing columns into the next block's window.
    a0 = b4;
    a1 = b5;
    a2 = b6;
    a3 = b7;

    src += 8;
    dst += 8;
  }
}

// Deblocks one band of |size| rows in two passes:
//  1. a down (vertical) pass reading |src_ptr| and writing |dst_ptr|, in
//     16-wide stripes with one optional trailing 8-wide stripe;
//  2. an across (horizontal) pass applied in place on |dst_ptr|, 8 rows at a
//     time.
// In both passes a pixel is replaced by a blend of itself and its two
// neighbours on each side only where all four neighbours differ from it by
// less than the limit f[column].
//
// The vector code works on whole 8-column blocks, so |cols| is expected to be
// a multiple of 8 (the right-border extension only triggers when it is). The
// down pass reads two rows above and below the band, and the last across
// block loads two columns past |cols|, so the buffers need that much border.
void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr,
                                               uint8_t *dst_ptr, int src_stride,
                                               int dst_stride, int cols,
                                               uint8_t *f, int size) {
  int row;
  int col;

  // While columns of length 16 can be processed, handle them.
  for (col = 0; col < cols - 8; col += 16) {
    deblock_down_stripe16(src_ptr + col, dst_ptr + col, src_stride, dst_stride,
                          f + col, size);
  }

  // Clean up any left over column of length 8.
  if (col < cols) {
    deblock_down_stripe8(src_ptr + col, dst_ptr + col, src_stride, dst_stride,
                         f + col, size);
  }

  for (row = 0; row < size; row += 8) {
    deblock_across_8rows(dst_ptr + row * dst_stride, dst_stride, cols, f);
  }
}
257*fb1b10abSAndroid Build Coastguard Worker 
// Adds the running (prefix) sums of x and xy into the accumulators:
//   sum[i]   += x[0]  + ... + x[i]
//   sumsq[i] += xy[0] + ... + xy[i]
static void accumulate_sum_sumsq(const int16x4_t x, const int32x4_t xy,
                                 int16x4_t *const sum, int32x4_t *const sumsq) {
  const int16x4_t zero = vdup_n_s16(0);
  const int32x4_t zeroq = vdupq_n_s32(0);

  // Start with the unshifted vectors; vext cannot express a shift of 0.
  int16x4_t total = vadd_s16(*sum, x);
  int32x4_t totalsq = vaddq_s32(*sumsq, xy);

  // vext(zero, v, n) yields v moved towards the high lanes with zeros filling
  // the low lanes, leaving n elements of v. The count must be an immediate,
  // hence the unrolled calls.
  total = vadd_s16(total, vext_s16(zero, x, 1));
  total = vadd_s16(total, vext_s16(zero, x, 2));
  total = vadd_s16(total, vext_s16(zero, x, 3));

  totalsq = vaddq_s32(totalsq, vextq_s32(zeroq, xy, 1));
  totalsq = vaddq_s32(totalsq, vextq_s32(zeroq, xy, 2));
  totalsq = vaddq_s32(totalsq, vextq_s32(zeroq, xy, 3));

  *sum = total;
  *sumsq = totalsq;
}
279*fb1b10abSAndroid Build Coastguard Worker 
// Builds a 16-bit lane mask that is all ones where
//   sumsq * 15 - sum * sum < f
// |fifteen| is passed in so the caller can share one constant vector.
static uint16x4_t calculate_mask(const int16x4_t sum, const int32x4_t sumsq,
                                 const int32x4_t f, const int32x4_t fifteen) {
  // Widening multiply-subtract: (sumsq * 15) - (sum * sum) in 32 bits.
  const int32x4_t lhs = vmlsl_s16(vmulq_s32(sumsq, fifteen), sum, sum);

  // Narrow the 32-bit compare result to 16-bit lanes.
  return vmovn_u32(vcltq_s32(lhs, f));
}
288*fb1b10abSAndroid Build Coastguard Worker 
// Evaluates calculate_mask() for the low and high halves and packs the two
// results into a single 8-lane byte mask.
static uint8x8_t combine_mask(const int16x4_t sum_low, const int16x4_t sum_high,
                              const int32x4_t sumsq_low,
                              const int32x4_t sumsq_high, const int32x4_t f) {
  const int32x4_t fifteen = vdupq_n_s32(15);
  const uint16x4_t low = calculate_mask(sum_low, sumsq_low, f, fifteen);
  const uint16x4_t high = calculate_mask(sum_high, sumsq_high, f, fifteen);
  const uint16x8_t mask16 = vcombine_u16(low, high);

  return vmovn_u16(mask16);
}
298*fb1b10abSAndroid Build Coastguard Worker 
// Computes (8 + sum + s[c]) >> 4 per lane, saturated to an unsigned byte.
static uint8x8_t filter_pixels(const int16x8_t sum, const uint8x8_t s) {
  const int16x8_t widened = vreinterpretq_s16_u16(vmovl_u8(s));
  const int16x8_t total = vaddq_s16(sum, widened);

  // The rounding narrowing shift supplies the +8 before shifting right by 4.
  return vqrshrun_n_s16(total, 4);
}
306*fb1b10abSAndroid Build Coastguard Worker 
// In-place horizontal post-processing filter over |rows| rows of |cols|
// pixels, with |pitch| bytes between rows. For every pixel a running sum and
// sum of squares is maintained over a sliding window; where
//   sumsq * 15 - sum * sum < flimit
// the pixel is replaced by (8 + sum + pixel) >> 4, otherwise it is left
// untouched. |cols| must be a multiple of 8.
void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols,
                                    int flimit) {
  const int32x4_t f = vdupq_n_s32(flimit);
  int row;

  assert(cols % 8 == 0);

  for (row = 0; row < rows; ++row, src += pitch) {
    // Nine copies of src[0]: the pixel itself plus the eight border pixels
    // extended from it. sumsq is additionally primed with +16.
    const int first = src[0];
    int sum = first * 9;
    int sumsq = first * first * 9 + 16;
    int16x4_t sum_high;
    int32x4_t sumsq_high;
    uint8x8_t left_context;
    int col;

    // Add (and square) the next 6 pixels; [0] is already accounted for.
    for (col = 1; col <= 6; ++col) {
      const int pixel = src[col];
      sum += pixel;
      sumsq += pixel * pixel;
    }

    // Every block starts from lane 3 of the previous block's *_high vectors,
    // so broadcast the primed scalars into them.
    sum_high = vdup_n_s16(sum);
    sumsq_high = vdupq_n_s32(sumsq);

    // Manually extend the left border.
    left_context = vdup_n_u8(src[0]);

    for (col = 0; col < cols; col += 8) {
      // Read the unfiltered pixels before this block is overwritten below.
      const uint8x8_t s = vld1_u8(src + col);
      uint8x8_t right_context, mask, output;
      int16x8_t x, y;
      int32x4_t xy_low, xy_high;
      int16x4_t sum_low;
      int32x4_t sumsq_low;

      if (col + 8 != cols) {
        right_context = vld1_u8(src + col + 7);
      } else {
        // Last block of columns: extend the right border from the final
        // pixel of the row.
        right_context = vdup_n_u8(src[col + 7]);
      }

      // x = right - left is the per-step change of the sum, and
      // x * y = right^2 - left^2 is the per-step change of the sum of squares.
      x = vreinterpretq_s16_u16(vsubl_u8(right_context, left_context));
      y = vreinterpretq_s16_u16(vaddl_u8(right_context, left_context));
      xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y));
      xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y));

      // Resume from the last running value of the previous block.
      sum_low = vdup_lane_s16(sum_high, 3);
      sumsq_low = vdupq_lane_s32(vget_high_s32(sumsq_high), 1);
      accumulate_sum_sumsq(vget_low_s16(x), xy_low, &sum_low, &sumsq_low);

      // The high half has to follow the low half because it starts from the
      // low half's final running value.
      sum_high = vdup_lane_s16(sum_low, 3);
      sumsq_high = vdupq_lane_s32(vget_high_s32(sumsq_low), 1);
      accumulate_sum_sumsq(vget_high_s16(x), xy_high, &sum_high, &sumsq_high);

      mask = combine_mask(sum_low, sum_high, sumsq_low, sumsq_high, f);
      output = filter_pixels(vcombine_s16(sum_low, sum_high), s);

      // Keep the original pixel wherever the mask is clear.
      vst1_u8(src + col, vbsl_u8(mask, output, s));

      left_context = s;
    }
  }
}
384*fb1b10abSAndroid Build Coastguard Worker 
// Computes (rv + sum + s[c]) >> 4 per lane, saturated to an unsigned byte.
// Unlike filter_pixels() the shift does not round; |rv| supplies the bias.
static uint8x8_t filter_pixels_rv(const int16x8_t sum, const uint8x8_t s,
                                  const int16x8_t rv) {
  const int16x8_t widened = vreinterpretq_s16_u16(vmovl_u8(s));
  const int16x8_t biased = vaddq_s16(vaddq_s16(sum, widened), rv);

  return vqshrun_n_s16(biased, 4);
}
394*fb1b10abSAndroid Build Coastguard Worker 
// Vertical post-processing filter, applied in place to |dst|.
//
// The image is walked in vertical stripes 8 pixels wide. For every stripe a
// running sum and sum of squares is kept over a 15 row window centred on the
// current row (7 rows above, the row itself, 7 rows below). Rows outside the
// image are replaced by the first / last row. combine_mask() (defined
// elsewhere in this file) turns the running sums and |flimit| into a per-lane
// mask; masked lanes are replaced with filter_pixels_rv()'s output and the
// remaining lanes keep their original value.
//
//   dst:    top-left pixel of the region to filter.
//   pitch:  distance in bytes between vertically adjacent pixels.
//   rows:   number of rows, must be >= 8.
//   cols:   number of columns, must be a multiple of 8.
//   flimit: threshold forwarded to combine_mask().
void vpx_mbpost_proc_down_neon(uint8_t *dst, int pitch, int rows, int cols,
                               int flimit) {
  int col;
  const int32x4_t limit = vdupq_n_s32(flimit);
  // Last row loaded from below the window. Intentionally lives outside the
  // stripe loop; with rows >= 8 it is always reloaded on the first row of each
  // stripe, and once the window passes the last row it simply keeps that row.
  uint8x8_t below = vdup_n_u8(0);

  // Each stripe is 8 columns wide, and the bottom border extension needs at
  // least 8 rows to work.
  assert(cols % 8 == 0);
  assert(rows >= 8);

  for (col = 0; col < cols; col += 8) {
    uint8_t *const stripe = dst + col;
    // Row 0 stands in for every row above the image.
    const uint8x8_t top = vld1_u8(stripe);
    const int16x8_t top16 = vreinterpretq_s16_u16(vmovl_u8(top));
    // The initial window holds row 0 nine times: 8 border copies plus itself.
    int16x8_t sum = vmulq_n_s16(top16, 9);
    // (top * 9) * top == top * top * 9
    int32x4_t sumsq_low = vmull_s16(vget_low_s16(sum), vget_low_s16(top16));
    int32x4_t sumsq_high = vmull_s16(vget_high_s16(sum), vget_high_s16(top16));
    // The 8 most recently visited rows, oldest first, holding the original
    // (unfiltered) pixels.
    uint8x8_t above[8];
    int row, k;

    for (k = 0; k < 8; ++k) {
      above[k] = top;
    }

    // Rows 1..6 complete the priming of the running sums.
    for (k = 1; k <= 6; ++k) {
      const int16x8_t px =
          vreinterpretq_s16_u16(vmovl_u8(vld1_u8(stripe + k * pitch)));

      sum = vaddq_s16(sum, px);
      sumsq_low = vmlal_s16(sumsq_low, vget_low_s16(px), vget_low_s16(px));
      sumsq_high = vmlal_s16(sumsq_high, vget_high_s16(px), vget_high_s16(px));
    }

    for (row = 0; row < rows; ++row) {
      uint8_t *const cur = stripe + row * pitch;
      const uint8x8_t s = vld1_u8(cur);
      int16x8_t delta, both;
      uint8x8_t mask, filtered;

      // Past the bottom edge |below| keeps the last row of the image.
      if (row + 7 < rows) {
        below = vld1_u8(cur + 7 * pitch);
      }

      // Slide the window down one row: the row 7 below enters and the oldest
      // remembered row leaves. delta * both == below^2 - above^2, which is the
      // matching update for the sum of squares.
      delta = vreinterpretq_s16_u16(vsubl_u8(below, above[0]));
      both = vreinterpretq_s16_u16(vaddl_u8(below, above[0]));

      sum = vaddq_s16(sum, delta);
      sumsq_low =
          vmlal_s16(sumsq_low, vget_low_s16(delta), vget_low_s16(both));
      sumsq_high =
          vmlal_s16(sumsq_high, vget_high_s16(delta), vget_high_s16(both));

      mask = combine_mask(vget_low_s16(sum), vget_high_s16(sum), sumsq_low,
                          sumsq_high, limit);

      // Lane j uses vpx_rv[(row & 127) + j].
      filtered = filter_pixels_rv(sum, s, vld1q_s16(vpx_rv + (row & 127)));

      // Keep the source pixel wherever the mask is clear.
      vst1_u8(cur, vbsl_u8(mask, filtered, s));

      // Drop the oldest row and remember the unfiltered current one.
      for (k = 0; k < 7; ++k) {
        above[k] = above[k + 1];
      }
      above[7] = s;
    }
  }
}
481