/*
 * Copyright (c) 2022 Samsung Electronics Co., Ltd.
 * All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * - Neither the name of the copyright owner, nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "oapv_def.h"
#include <math.h>

#if ARM_NEON

/* SSD ***********************************************************************/
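/* ssd_16b_neon_8x8(): sum of squared differences (SSD) between two 8x8 blocks
 * of 16-bit samples. src1/src2 point to the blocks and s_src1/s_src2 are their
 * strides in samples. The block size is fixed at 8x8, so w, h and bit_depth
 * are unused here; the eight rows are processed by fully unrolled NEON code. */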
static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int bit_depth)
{
    s64 ssd = 0;
    s16* s1 = (s16*) src1;
    s16* s2 = (s16*) src2;
    int16x8_t s1_vector, s2_vector;
    int32x4_t diff1, diff2;
    int32x2_t diff1_low, diff2_low;
    int64x2_t sq_diff1_low, sq_diff1_high, sq_diff2_low, sq_diff2_high, sq_diff;

    {
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff1_low, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    {
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    {
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    {
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    {
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    {
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    {
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    {
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    ssd += vaddvq_s64(sq_diff);
    return ssd;
}

const oapv_fn_ssd_t oapv_tbl_fn_ssd_16b_neon[2] = {
    ssd_16b_neon_8x8,
    NULL
};
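
/* Illustrative use of the dispatch table above (a sketch, not library API
 * documentation): the first entry is the 8x8 kernel, the second slot is left
 * NULL.
 *
 *     s64 d = oapv_tbl_fn_ssd_16b_neon[0](8, 8, blk1, blk2, stride1, stride2, bit_depth);
 *
 * blk1/blk2/stride1/stride2/bit_depth are placeholder names for two blocks of
 * 16-bit samples and their strides. */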
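
/* oapv_dc_removed_had8x8_neon(): SATD of one 8x8 block of 16-bit samples via
 * the 8x8 Hadamard transform, with the DC coefficient excluded. org points to
 * the block and s_org is its stride in samples. The code transposes and
 * butterflies the rows twice (horizontal and vertical passes), zeroes the DC
 * term, sums the absolute transform coefficients and returns the total rounded
 * and scaled by 1/4. Illustrative call (blk/stride are placeholder names):
 *
 *     int satd = oapv_dc_removed_had8x8_neon(blk, stride);
 *
 * Note: the transposes view the same registers as 16-, 32- and 64-bit lanes
 * and rely on the compiler accepting same-width integer vector conversions. */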
int oapv_dc_removed_had8x8_neon(pel* org, int s_org)
{
    int satd = 0;
    /* all 128 bit registers are named with a suffix mxnb, where m is the */
    /* number of n bits packed in the register                            */

    int16x8_t src0_8x16b, src1_8x16b, src2_8x16b, src3_8x16b;
    int16x8_t src4_8x16b, src5_8x16b, src6_8x16b, src7_8x16b;
    int16x8_t pred0_8x16b, pred1_8x16b, pred2_8x16b, pred3_8x16b;
    int16x8_t pred4_8x16b, pred5_8x16b, pred6_8x16b, pred7_8x16b;
    int16x8_t out0_8x16b, out1_8x16b, out2_8x16b, out3_8x16b;
    int16x8_t out4_8x16b, out5_8x16b, out6_8x16b, out7_8x16b;

    src0_8x16b = (vld1q_s16(&org[0]));
    org = org + s_org;
    src1_8x16b = (vld1q_s16(&org[0]));
    org = org + s_org;
    src2_8x16b = (vld1q_s16(&org[0]));
    org = org + s_org;
    src3_8x16b = (vld1q_s16(&org[0]));
    org = org + s_org;
    src4_8x16b = (vld1q_s16(&org[0]));
    org = org + s_org;
    src5_8x16b = (vld1q_s16(&org[0]));
    org = org + s_org;
    src6_8x16b = (vld1q_s16(&org[0]));
    org = org + s_org;
    src7_8x16b = (vld1q_s16(&org[0]));
    org = org + s_org;

    /**************** 8x8 horizontal transform *******************************/
    /***********************    8x8 16 bit Transpose  ************************/

    out3_8x16b = vcombine_s16(vget_low_s16(src0_8x16b), vget_low_s16(src1_8x16b));
    out7_8x16b = vcombine_s16(vget_high_s16(src0_8x16b), vget_high_s16(src1_8x16b));

    pred0_8x16b = vcombine_s16(vget_low_s16(src2_8x16b), vget_low_s16(src3_8x16b));
    src2_8x16b = vcombine_s16(vget_high_s16(src2_8x16b), vget_high_s16(src3_8x16b));

    out2_8x16b = vcombine_s16(vget_low_s16(src4_8x16b), vget_low_s16(src5_8x16b));
    pred7_8x16b = vcombine_s16(vget_high_s16(src4_8x16b), vget_high_s16(src5_8x16b));

    pred3_8x16b = vcombine_s16(vget_low_s16(src6_8x16b), vget_low_s16(src7_8x16b));
    src6_8x16b = vcombine_s16(vget_high_s16(src6_8x16b), vget_high_s16(src7_8x16b));


    out1_8x16b = vzip1q_s32(out3_8x16b, pred0_8x16b);
    out3_8x16b = vzip2q_s32(out3_8x16b, pred0_8x16b);

    pred1_8x16b = vzip1q_s32(out2_8x16b, pred3_8x16b);
    pred3_8x16b = vzip2q_s32(out2_8x16b, pred3_8x16b);

    out5_8x16b = vzip1q_s32(out7_8x16b, src2_8x16b);
    out7_8x16b = vzip2q_s32(out7_8x16b, src2_8x16b);

    pred5_8x16b = vzip1q_s32(pred7_8x16b, src6_8x16b);
    pred7_8x16b = vzip2q_s32(pred7_8x16b, src6_8x16b);

    out0_8x16b = vzip1q_s64(out1_8x16b,pred1_8x16b);
    out1_8x16b = vzip2q_s64(out1_8x16b,pred1_8x16b);
    out2_8x16b = vzip1q_s64(out3_8x16b,pred3_8x16b);
    out3_8x16b = vzip2q_s64(out3_8x16b,pred3_8x16b);
    out4_8x16b = vzip1q_s64(out5_8x16b,pred5_8x16b);
    out5_8x16b = vzip2q_s64(out5_8x16b,pred5_8x16b);
    out6_8x16b = vzip1q_s64(out7_8x16b,pred7_8x16b);
    out7_8x16b = vzip2q_s64(out7_8x16b,pred7_8x16b);

    /**********************   8x8 16 bit Transpose End   *********************/

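    /* horizontal pass: 8-point Hadamard butterfly applied across the eight rows */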
    /* r0 + r1 */
    pred0_8x16b = vaddq_s16(out0_8x16b, out1_8x16b);
    /* r2 + r3 */
    pred2_8x16b = vaddq_s16(out2_8x16b, out3_8x16b);
    /* r4 + r5 */
    pred4_8x16b = vaddq_s16(out4_8x16b, out5_8x16b);
    /* r6 + r7 */
    pred6_8x16b = vaddq_s16(out6_8x16b, out7_8x16b);


    /* r0 + r1 + r2 + r3 */
    pred1_8x16b = vaddq_s16(pred0_8x16b, pred2_8x16b);
    /* r4 + r5 + r6 + r7 */
    pred5_8x16b = vaddq_s16(pred4_8x16b, pred6_8x16b);
    /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
    src0_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
    /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
    src4_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);

    /* r0 + r1 - r2 - r3 */
    pred1_8x16b = vsubq_s16(pred0_8x16b, pred2_8x16b);
    /* r4 + r5 - r6 - r7 */
    pred5_8x16b = vsubq_s16(pred4_8x16b, pred6_8x16b);
    /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
    src2_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
    /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
    src6_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);

    /* r0 - r1 */
    pred0_8x16b = vsubq_s16(out0_8x16b, out1_8x16b);
    /* r2 - r3 */
    pred2_8x16b = vsubq_s16(out2_8x16b, out3_8x16b);
    /* r4 - r5 */
    pred4_8x16b = vsubq_s16(out4_8x16b, out5_8x16b);
    /* r6 - r7 */
    pred6_8x16b = vsubq_s16(out6_8x16b, out7_8x16b);

    /* r0 - r1 + r2 - r3 */
    pred1_8x16b = vaddq_s16(pred0_8x16b, pred2_8x16b);
    /* r4 - r5 + r6 - r7 */
    pred5_8x16b = vaddq_s16(pred4_8x16b, pred6_8x16b);
    /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
    src1_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
    /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
    src5_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);

    /* r0 - r1 - r2 + r3 */
    pred1_8x16b = vsubq_s16(pred0_8x16b, pred2_8x16b);
    /* r4 - r5 - r6 + r7 */
    pred5_8x16b = vsubq_s16(pred4_8x16b, pred6_8x16b);
    /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
    src3_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
    /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
    src7_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);


    /***********************    8x8 16 bit Transpose  ************************/
    out3_8x16b = vzip1q_s16(src0_8x16b, src1_8x16b);
    pred0_8x16b = vzip1q_s16(src2_8x16b, src3_8x16b);
    out2_8x16b = vzip1q_s16(src4_8x16b, src5_8x16b);
    pred3_8x16b = vzip1q_s16(src6_8x16b, src7_8x16b);
    out7_8x16b = vzip2q_s16(src0_8x16b, src1_8x16b);
    src2_8x16b = vzip2q_s16(src2_8x16b, src3_8x16b);
    pred7_8x16b = vzip2q_s16(src4_8x16b, src5_8x16b);
    src6_8x16b = vzip2q_s16(src6_8x16b, src7_8x16b);

    out1_8x16b = vzip1q_s32(out3_8x16b, pred0_8x16b);
    out3_8x16b = vzip2q_s32(out3_8x16b, pred0_8x16b);

    pred1_8x16b = vzip1q_s32(out2_8x16b, pred3_8x16b);
    pred3_8x16b = vzip2q_s32(out2_8x16b, pred3_8x16b);

    out5_8x16b = vzip1q_s32(out7_8x16b, src2_8x16b);
    out7_8x16b = vzip2q_s32(out7_8x16b, src2_8x16b);

    pred5_8x16b = vzip1q_s32(pred7_8x16b, src6_8x16b);
    pred7_8x16b = vzip2q_s32(pred7_8x16b, src6_8x16b);

    src0_8x16b = vzip1q_s64(out1_8x16b,pred1_8x16b);
    src1_8x16b = vzip2q_s64(out1_8x16b,pred1_8x16b);
    src2_8x16b = vzip1q_s64(out3_8x16b,pred3_8x16b);
    src3_8x16b = vzip2q_s64(out3_8x16b,pred3_8x16b);
    src4_8x16b = vzip1q_s64(out5_8x16b,pred5_8x16b);
    src5_8x16b = vzip2q_s64(out5_8x16b,pred5_8x16b);
    src6_8x16b = vzip1q_s64(out7_8x16b,pred7_8x16b);
    src7_8x16b = vzip2q_s64(out7_8x16b,pred7_8x16b);

    /**********************   8x8 16 bit Transpose End   *********************/
    /**************** 8x8 horizontal transform *******************************/
    {
        int16x8_t out0a_8x16b, out1a_8x16b, out2a_8x16b, out3a_8x16b;
        int16x8_t out4a_8x16b, out5a_8x16b, out6a_8x16b, out7a_8x16b;
        int16x8_t tmp0_8x16b, tmp1_8x16b, tmp2_8x16b, tmp3_8x16b;
        int16x8_t tmp4_8x16b, tmp5_8x16b, tmp6_8x16b, tmp7_8x16b;

        /************************* 8x8 Vertical Transform*************************/
        tmp0_8x16b = vcombine_s16(vget_high_s16(src0_8x16b), vcreate_s32(0));
        tmp1_8x16b = vcombine_s16(vget_high_s16(src1_8x16b), vcreate_s32(0));
        tmp2_8x16b = vcombine_s16(vget_high_s16(src2_8x16b), vcreate_s32(0));
        tmp3_8x16b = vcombine_s16(vget_high_s16(src3_8x16b), vcreate_s32(0));
        tmp4_8x16b = vcombine_s16(vget_high_s16(src4_8x16b), vcreate_s32(0));
        tmp5_8x16b = vcombine_s16(vget_high_s16(src5_8x16b), vcreate_s32(0));
        tmp6_8x16b = vcombine_s16(vget_high_s16(src6_8x16b), vcreate_s32(0));
        tmp7_8x16b = vcombine_s16(vget_high_s16(src7_8x16b), vcreate_s32(0));

        /*************************First 4 pixels ********************************/

        src0_8x16b = vmovl_s16(vget_low_s16(src0_8x16b));
        src1_8x16b = vmovl_s16(vget_low_s16(src1_8x16b));
        src2_8x16b = vmovl_s16(vget_low_s16(src2_8x16b));
        src3_8x16b = vmovl_s16(vget_low_s16(src3_8x16b));
        src4_8x16b = vmovl_s16(vget_low_s16(src4_8x16b));
        src5_8x16b = vmovl_s16(vget_low_s16(src5_8x16b));
        src6_8x16b = vmovl_s16(vget_low_s16(src6_8x16b));
        src7_8x16b = vmovl_s16(vget_low_s16(src7_8x16b));

        /* r0 + r1 */
        pred0_8x16b = vaddq_s32(src0_8x16b, src1_8x16b);
        /* r2 + r3 */
        pred2_8x16b = vaddq_s32(src2_8x16b, src3_8x16b);
        /* r4 + r5 */
        pred4_8x16b = vaddq_s32(src4_8x16b, src5_8x16b);
        /* r6 + r7 */
        pred6_8x16b = vaddq_s32(src6_8x16b, src7_8x16b);

        /* r0 + r1 + r2 + r3 */
        pred1_8x16b = vaddq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 + r6 + r7 */
        pred5_8x16b = vaddq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
        out0_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
        out4_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 + r1 - r2 - r3 */
        pred1_8x16b = vsubq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 - r6 - r7 */
        pred5_8x16b = vsubq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
        out2_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
        out6_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 */
        pred0_8x16b = vsubq_s32(src0_8x16b, src1_8x16b);
        /* r2 - r3 */
        pred2_8x16b = vsubq_s32(src2_8x16b, src3_8x16b);
        /* r4 - r5 */
        pred4_8x16b = vsubq_s32(src4_8x16b, src5_8x16b);
        /* r6 - r7 */
        pred6_8x16b = vsubq_s32(src6_8x16b, src7_8x16b);

        /* r0 - r1 + r2 - r3 */
        pred1_8x16b = vaddq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 + r6 - r7 */
        pred5_8x16b = vaddq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
        out1_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
        out5_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 - r2 + r3 */
        pred1_8x16b = vsubq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 - r6 + r7 */
        pred5_8x16b = vsubq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
        out3_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
        out7_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /*************************First 4 pixels ********************************/

        /**************************Next 4 pixels *******************************/
        src0_8x16b = vmovl_s16(vget_low_s16(tmp0_8x16b));
        src1_8x16b = vmovl_s16(vget_low_s16(tmp1_8x16b));
        src2_8x16b = vmovl_s16(vget_low_s16(tmp2_8x16b));
        src3_8x16b = vmovl_s16(vget_low_s16(tmp3_8x16b));
        src4_8x16b = vmovl_s16(vget_low_s16(tmp4_8x16b));
        src5_8x16b = vmovl_s16(vget_low_s16(tmp5_8x16b));
        src6_8x16b = vmovl_s16(vget_low_s16(tmp6_8x16b));
        src7_8x16b = vmovl_s16(vget_low_s16(tmp7_8x16b));

        /* r0 + r1 */
        pred0_8x16b = vaddq_s32(src0_8x16b, src1_8x16b);
        /* r2 + r3 */
        pred2_8x16b = vaddq_s32(src2_8x16b, src3_8x16b);
        /* r4 + r5 */
        pred4_8x16b = vaddq_s32(src4_8x16b, src5_8x16b);
        /* r6 + r7 */
        pred6_8x16b = vaddq_s32(src6_8x16b, src7_8x16b);

        /* r0 + r1 + r2 + r3 */
        pred1_8x16b = vaddq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 + r6 + r7 */
        pred5_8x16b = vaddq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
        out0a_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
        out4a_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 + r1 - r2 - r3 */
        pred1_8x16b = vsubq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 - r6 - r7 */
        pred5_8x16b = vsubq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
        out2a_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
        out6a_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 */
        pred0_8x16b = vsubq_s32(src0_8x16b, src1_8x16b);
        /* r2 - r3 */
        pred2_8x16b = vsubq_s32(src2_8x16b, src3_8x16b);
        /* r4 - r5 */
        pred4_8x16b = vsubq_s32(src4_8x16b, src5_8x16b);
        /* r6 - r7 */
        pred6_8x16b = vsubq_s32(src6_8x16b, src7_8x16b);

        /* r0 - r1 + r2 - r3 */
        pred1_8x16b = vaddq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 + r6 - r7 */
        pred5_8x16b = vaddq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
        out1a_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
        out5a_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 - r2 + r3 */
        pred1_8x16b = vsubq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 - r6 + r7 */
        pred5_8x16b = vsubq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
        out3a_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
        out7a_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /**************************Next 4 pixels *******************************/
        /************************* 8x8 Vertical Transform*************************/

        /****************************SATD calculation ****************************/
        src0_8x16b = vabsq_s32(out0_8x16b);
        src1_8x16b = vabsq_s32(out1_8x16b);
        src2_8x16b = vabsq_s32(out2_8x16b);
        src3_8x16b = vabsq_s32(out3_8x16b);
        src4_8x16b = vabsq_s32(out4_8x16b);
        src5_8x16b = vabsq_s32(out5_8x16b);
        src6_8x16b = vabsq_s32(out6_8x16b);
        src7_8x16b = vabsq_s32(out7_8x16b);
        /* zero the DC coefficient (lane 0 of the first vector) so it is */
        /* excluded from the SATD sum                                    */
        s32* p = (s32*)&src0_8x16b;
        p[0] = 0;

        satd = vaddvq_s32(src0_8x16b);
        satd += vaddvq_s32(src1_8x16b);
        satd += vaddvq_s32(src2_8x16b);
        satd += vaddvq_s32(src3_8x16b);
        satd += vaddvq_s32(src4_8x16b);
        satd += vaddvq_s32(src5_8x16b);
        satd += vaddvq_s32(src6_8x16b);
        satd += vaddvq_s32(src7_8x16b);

        src0_8x16b = vabsq_s32(out0a_8x16b);
        src1_8x16b = vabsq_s32(out1a_8x16b);
        src2_8x16b = vabsq_s32(out2a_8x16b);
        src3_8x16b = vabsq_s32(out3a_8x16b);
        src4_8x16b = vabsq_s32(out4a_8x16b);
        src5_8x16b = vabsq_s32(out5a_8x16b);
        src6_8x16b = vabsq_s32(out6a_8x16b);
        src7_8x16b = vabsq_s32(out7a_8x16b);

        satd += vaddvq_s32(src0_8x16b);
        satd += vaddvq_s32(src1_8x16b);
        satd += vaddvq_s32(src2_8x16b);
        satd += vaddvq_s32(src3_8x16b);
        satd += vaddvq_s32(src4_8x16b);
        satd += vaddvq_s32(src5_8x16b);
        satd += vaddvq_s32(src6_8x16b);
        satd += vaddvq_s32(src7_8x16b);

        /* round and scale the accumulated sum by 1/4 */
        satd = (satd + 2) >> 2;
        return satd;
    }
}
#endif /* ARM_NEON */