/*
 * Copyright (c) 2022 Samsung Electronics Co., Ltd.
 * All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * - Neither the name of the copyright owner, nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "oapv_def.h"
#include <math.h>

#if ARM_NEON

/* SSD ***********************************************************************/
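/* Sum of squared differences (SSD) between two 8x8 blocks of 16-bit samples.
 * src1/src2 point to the top-left samples and s_src1/s_src2 are the row
 * strides in samples. The 8x8 case is fully unrolled below, so w, h and
 * bit_depth are not read here; they are kept to match the oapv_fn_ssd_t
 * signature. Conceptually this computes:
 *     for each row r in 0..7:
 *         for each column c in 0..7:
 *             d    = s1[r * s_src1 + c] - s2[r * s_src2 + c];
 *             ssd += (s64)d * d;
 */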
static s64 ssd_16b_neon_8x8(int w, int h, void *src1, void *src2, int s_src1, int s_src2, int bit_depth)
{
    s64  ssd = 0;
    s16 *s1 = (s16 *)src1;
    s16 *s2 = (s16 *)src2;
    int16x8_t s1_vector, s2_vector;
    int32x4_t diff1, diff2;
    int32x2_t diff1_low, diff2_low;
    int64x2_t sq_diff1_low, sq_diff1_high, sq_diff2_low, sq_diff2_high, sq_diff;

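    /* One row per block below: load 8 samples from each source, widen the
     * difference to 32 bits, square it with 64-bit widening multiplies and
     * accumulate into the two 64-bit lanes of sq_diff. */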
    {
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff1_low, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    {
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    {
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    {
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    {
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    {
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    {
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
    {
        s1_vector = vld1q_s16(s1);
        s1 += s_src1;
        s2_vector = vld1q_s16(s2);
        s2 += s_src2;

        diff1 = vsubl_s16(vget_low_s16(s1_vector), vget_low_s16(s2_vector));
        diff2 = vsubl_high_s16(s1_vector, s2_vector);
        diff1_low = vget_low_s32(diff1);
        diff2_low = vget_low_s32(diff2);

        sq_diff1_low = vmull_s32(diff1_low, diff1_low);
        sq_diff1_high = vmull_high_s32(diff1, diff1);
        sq_diff2_low = vmull_s32(diff2_low, diff2_low);
        sq_diff2_high = vmull_high_s32(diff2, diff2);

        sq_diff = vaddq_s64(sq_diff, sq_diff1_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff1_high);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_low);
        sq_diff = vaddq_s64(sq_diff, sq_diff2_high);
    }
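    /* horizontal add of the two 64-bit accumulator lanes gives the block SSD */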
    ssd += vaddvq_s64(sq_diff);
    return ssd;
}

const oapv_fn_ssd_t oapv_tbl_fn_ssd_16b_neon[2] =
{
    ssd_16b_neon_8x8,
    NULL
};

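/* SATD of an 8x8 block with the DC term removed.
 * The block at org (row stride s_org, in samples) is run through a horizontal
 * and a vertical 8-point Hadamard transform, the DC coefficient is zeroed,
 * the absolute values of the remaining coefficients are summed, and the sum
 * is scaled down by 4 with rounding.
 */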
int oapv_dc_removed_had8x8_neon(pel *org, int s_org)
{
    int satd = 0;
    /* all 128-bit registers are named with a suffix mxnb, where m is the */
    /* number of n-bit elements packed in the register                    */

    int16x8_t src0_8x16b, src1_8x16b, src2_8x16b, src3_8x16b;
    int16x8_t src4_8x16b, src5_8x16b, src6_8x16b, src7_8x16b;
    int16x8_t pred0_8x16b, pred1_8x16b, pred2_8x16b, pred3_8x16b;
    int16x8_t pred4_8x16b, pred5_8x16b, pred6_8x16b, pred7_8x16b;
    int16x8_t out0_8x16b, out1_8x16b, out2_8x16b, out3_8x16b;
    int16x8_t out4_8x16b, out5_8x16b, out6_8x16b, out7_8x16b;

    src0_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src1_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src2_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src3_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src4_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src5_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src6_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;
    src7_8x16b = vld1q_s16(&org[0]);
    org = org + s_org;

    /**************** 8x8 horizontal transform *******************************/
    /*********************** 8x8 16 bit Transpose ************************/

    out3_8x16b = vcombine_s16(vget_low_s16(src0_8x16b), vget_low_s16(src1_8x16b));
    out7_8x16b = vcombine_s16(vget_high_s16(src0_8x16b), vget_high_s16(src1_8x16b));

    pred0_8x16b = vcombine_s16(vget_low_s16(src2_8x16b), vget_low_s16(src3_8x16b));
    src2_8x16b = vcombine_s16(vget_high_s16(src2_8x16b), vget_high_s16(src3_8x16b));

    out2_8x16b = vcombine_s16(vget_low_s16(src4_8x16b), vget_low_s16(src5_8x16b));
    pred7_8x16b = vcombine_s16(vget_high_s16(src4_8x16b), vget_high_s16(src5_8x16b));

    pred3_8x16b = vcombine_s16(vget_low_s16(src6_8x16b), vget_low_s16(src7_8x16b));
    src6_8x16b = vcombine_s16(vget_high_s16(src6_8x16b), vget_high_s16(src7_8x16b));

    out1_8x16b = vzip1q_s32(out3_8x16b, pred0_8x16b);
    out3_8x16b = vzip2q_s32(out3_8x16b, pred0_8x16b);

    pred1_8x16b = vzip1q_s32(out2_8x16b, pred3_8x16b);
    pred3_8x16b = vzip2q_s32(out2_8x16b, pred3_8x16b);

    out5_8x16b = vzip1q_s32(out7_8x16b, src2_8x16b);
    out7_8x16b = vzip2q_s32(out7_8x16b, src2_8x16b);

    pred5_8x16b = vzip1q_s32(pred7_8x16b, src6_8x16b);
    pred7_8x16b = vzip2q_s32(pred7_8x16b, src6_8x16b);

    out0_8x16b = vzip1q_s64(out1_8x16b, pred1_8x16b);
    out1_8x16b = vzip2q_s64(out1_8x16b, pred1_8x16b);
    out2_8x16b = vzip1q_s64(out3_8x16b, pred3_8x16b);
    out3_8x16b = vzip2q_s64(out3_8x16b, pred3_8x16b);
    out4_8x16b = vzip1q_s64(out5_8x16b, pred5_8x16b);
    out5_8x16b = vzip2q_s64(out5_8x16b, pred5_8x16b);
    out6_8x16b = vzip1q_s64(out7_8x16b, pred7_8x16b);
    out7_8x16b = vzip2q_s64(out7_8x16b, pred7_8x16b);

    /********************** 8x8 16 bit Transpose End *********************/

    /* r0 + r1 */
    pred0_8x16b = vaddq_s16(out0_8x16b, out1_8x16b);
    /* r2 + r3 */
    pred2_8x16b = vaddq_s16(out2_8x16b, out3_8x16b);
    /* r4 + r5 */
    pred4_8x16b = vaddq_s16(out4_8x16b, out5_8x16b);
    /* r6 + r7 */
    pred6_8x16b = vaddq_s16(out6_8x16b, out7_8x16b);

    /* r0 + r1 + r2 + r3 */
    pred1_8x16b = vaddq_s16(pred0_8x16b, pred2_8x16b);
    /* r4 + r5 + r6 + r7 */
    pred5_8x16b = vaddq_s16(pred4_8x16b, pred6_8x16b);
    /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
    src0_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
    /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
    src4_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);

    /* r0 + r1 - r2 - r3 */
    pred1_8x16b = vsubq_s16(pred0_8x16b, pred2_8x16b);
    /* r4 + r5 - r6 - r7 */
    pred5_8x16b = vsubq_s16(pred4_8x16b, pred6_8x16b);
    /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
    src2_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
    /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
    src6_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);

    /* r0 - r1 */
    pred0_8x16b = vsubq_s16(out0_8x16b, out1_8x16b);
    /* r2 - r3 */
    pred2_8x16b = vsubq_s16(out2_8x16b, out3_8x16b);
    /* r4 - r5 */
    pred4_8x16b = vsubq_s16(out4_8x16b, out5_8x16b);
    /* r6 - r7 */
    pred6_8x16b = vsubq_s16(out6_8x16b, out7_8x16b);

    /* r0 - r1 + r2 - r3 */
    pred1_8x16b = vaddq_s16(pred0_8x16b, pred2_8x16b);
    /* r4 - r5 + r6 - r7 */
    pred5_8x16b = vaddq_s16(pred4_8x16b, pred6_8x16b);
    /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
    src1_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
    /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
    src5_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);

    /* r0 - r1 - r2 + r3 */
    pred1_8x16b = vsubq_s16(pred0_8x16b, pred2_8x16b);
    /* r4 - r5 - r6 + r7 */
    pred5_8x16b = vsubq_s16(pred4_8x16b, pred6_8x16b);
    /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
    src3_8x16b = vaddq_s16(pred1_8x16b, pred5_8x16b);
    /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
    src7_8x16b = vsubq_s16(pred1_8x16b, pred5_8x16b);

    /*********************** 8x8 16 bit Transpose ************************/
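    /* interleave at 16-, 32- and then 64-bit granularity to transpose the 8x8 result */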
    out3_8x16b = vzip1q_s16(src0_8x16b, src1_8x16b);
    pred0_8x16b = vzip1q_s16(src2_8x16b, src3_8x16b);
    out2_8x16b = vzip1q_s16(src4_8x16b, src5_8x16b);
    pred3_8x16b = vzip1q_s16(src6_8x16b, src7_8x16b);
    out7_8x16b = vzip2q_s16(src0_8x16b, src1_8x16b);
    src2_8x16b = vzip2q_s16(src2_8x16b, src3_8x16b);
    pred7_8x16b = vzip2q_s16(src4_8x16b, src5_8x16b);
    src6_8x16b = vzip2q_s16(src6_8x16b, src7_8x16b);

    out1_8x16b = vzip1q_s32(out3_8x16b, pred0_8x16b);
    out3_8x16b = vzip2q_s32(out3_8x16b, pred0_8x16b);

    pred1_8x16b = vzip1q_s32(out2_8x16b, pred3_8x16b);
    pred3_8x16b = vzip2q_s32(out2_8x16b, pred3_8x16b);

    out5_8x16b = vzip1q_s32(out7_8x16b, src2_8x16b);
    out7_8x16b = vzip2q_s32(out7_8x16b, src2_8x16b);

    pred5_8x16b = vzip1q_s32(pred7_8x16b, src6_8x16b);
    pred7_8x16b = vzip2q_s32(pred7_8x16b, src6_8x16b);

    src0_8x16b = vzip1q_s64(out1_8x16b, pred1_8x16b);
    src1_8x16b = vzip2q_s64(out1_8x16b, pred1_8x16b);
    src2_8x16b = vzip1q_s64(out3_8x16b, pred3_8x16b);
    src3_8x16b = vzip2q_s64(out3_8x16b, pred3_8x16b);
    src4_8x16b = vzip1q_s64(out5_8x16b, pred5_8x16b);
    src5_8x16b = vzip2q_s64(out5_8x16b, pred5_8x16b);
    src6_8x16b = vzip1q_s64(out7_8x16b, pred7_8x16b);
    src7_8x16b = vzip2q_s64(out7_8x16b, pred7_8x16b);

    /********************** 8x8 16 bit Transpose End *********************/
    /**************** 8x8 horizontal transform *******************************/
    {
        int16x8_t out0a_8x16b, out1a_8x16b, out2a_8x16b, out3a_8x16b;
        int16x8_t out4a_8x16b, out5a_8x16b, out6a_8x16b, out7a_8x16b;
        int16x8_t tmp0_8x16b, tmp1_8x16b, tmp2_8x16b, tmp3_8x16b;
        int16x8_t tmp4_8x16b, tmp5_8x16b, tmp6_8x16b, tmp7_8x16b;

        /************************* 8x8 Vertical Transform*************************/
        tmp0_8x16b = vcombine_s16(vget_high_s16(src0_8x16b), vcreate_s32(0));
        tmp1_8x16b = vcombine_s16(vget_high_s16(src1_8x16b), vcreate_s32(0));
        tmp2_8x16b = vcombine_s16(vget_high_s16(src2_8x16b), vcreate_s32(0));
        tmp3_8x16b = vcombine_s16(vget_high_s16(src3_8x16b), vcreate_s32(0));
        tmp4_8x16b = vcombine_s16(vget_high_s16(src4_8x16b), vcreate_s32(0));
        tmp5_8x16b = vcombine_s16(vget_high_s16(src5_8x16b), vcreate_s32(0));
        tmp6_8x16b = vcombine_s16(vget_high_s16(src6_8x16b), vcreate_s32(0));
        tmp7_8x16b = vcombine_s16(vget_high_s16(src7_8x16b), vcreate_s32(0));

        /*************************First 4 pixels ********************************/

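        /* widen the low halves to 32 bits so the vertical butterflies cannot overflow */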
        src0_8x16b = vmovl_s16(vget_low_s16(src0_8x16b));
        src1_8x16b = vmovl_s16(vget_low_s16(src1_8x16b));
        src2_8x16b = vmovl_s16(vget_low_s16(src2_8x16b));
        src3_8x16b = vmovl_s16(vget_low_s16(src3_8x16b));
        src4_8x16b = vmovl_s16(vget_low_s16(src4_8x16b));
        src5_8x16b = vmovl_s16(vget_low_s16(src5_8x16b));
        src6_8x16b = vmovl_s16(vget_low_s16(src6_8x16b));
        src7_8x16b = vmovl_s16(vget_low_s16(src7_8x16b));

        /* r0 + r1 */
        pred0_8x16b = vaddq_s32(src0_8x16b, src1_8x16b);
        /* r2 + r3 */
        pred2_8x16b = vaddq_s32(src2_8x16b, src3_8x16b);
        /* r4 + r5 */
        pred4_8x16b = vaddq_s32(src4_8x16b, src5_8x16b);
        /* r6 + r7 */
        pred6_8x16b = vaddq_s32(src6_8x16b, src7_8x16b);

        /* r0 + r1 + r2 + r3 */
        pred1_8x16b = vaddq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 + r6 + r7 */
        pred5_8x16b = vaddq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
        out0_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
        out4_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 + r1 - r2 - r3 */
        pred1_8x16b = vsubq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 - r6 - r7 */
        pred5_8x16b = vsubq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
        out2_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
        out6_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 */
        pred0_8x16b = vsubq_s32(src0_8x16b, src1_8x16b);
        /* r2 - r3 */
        pred2_8x16b = vsubq_s32(src2_8x16b, src3_8x16b);
        /* r4 - r5 */
        pred4_8x16b = vsubq_s32(src4_8x16b, src5_8x16b);
        /* r6 - r7 */
        pred6_8x16b = vsubq_s32(src6_8x16b, src7_8x16b);

        /* r0 - r1 + r2 - r3 */
        pred1_8x16b = vaddq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 + r6 - r7 */
        pred5_8x16b = vaddq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
        out1_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
        out5_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 - r2 + r3 */
        pred1_8x16b = vsubq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 - r6 + r7 */
        pred5_8x16b = vsubq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
        out3_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
        out7_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /*************************First 4 pixels ********************************/

        /**************************Next 4 pixels *******************************/
        src0_8x16b = vmovl_s16(vget_low_s16(tmp0_8x16b));
        src1_8x16b = vmovl_s16(vget_low_s16(tmp1_8x16b));
        src2_8x16b = vmovl_s16(vget_low_s16(tmp2_8x16b));
        src3_8x16b = vmovl_s16(vget_low_s16(tmp3_8x16b));
        src4_8x16b = vmovl_s16(vget_low_s16(tmp4_8x16b));
        src5_8x16b = vmovl_s16(vget_low_s16(tmp5_8x16b));
        src6_8x16b = vmovl_s16(vget_low_s16(tmp6_8x16b));
        src7_8x16b = vmovl_s16(vget_low_s16(tmp7_8x16b));

        /* r0 + r1 */
        pred0_8x16b = vaddq_s32(src0_8x16b, src1_8x16b);
        /* r2 + r3 */
        pred2_8x16b = vaddq_s32(src2_8x16b, src3_8x16b);
        /* r4 + r5 */
        pred4_8x16b = vaddq_s32(src4_8x16b, src5_8x16b);
        /* r6 + r7 */
        pred6_8x16b = vaddq_s32(src6_8x16b, src7_8x16b);

        /* r0 + r1 + r2 + r3 */
        pred1_8x16b = vaddq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 + r6 + r7 */
        pred5_8x16b = vaddq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 + r2 + r3 + r4 + r5 + r6 + r7 */
        out0a_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 + r2 + r3 - r4 - r5 - r6 - r7 */
        out4a_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 + r1 - r2 - r3 */
        pred1_8x16b = vsubq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 + r5 - r6 - r7 */
        pred5_8x16b = vsubq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 + r1 - r2 - r3 + r4 + r5 - r6 - r7 */
        out2a_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 + r1 - r2 - r3 - r4 - r5 + r6 + r7 */
        out6a_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 */
        pred0_8x16b = vsubq_s32(src0_8x16b, src1_8x16b);
        /* r2 - r3 */
        pred2_8x16b = vsubq_s32(src2_8x16b, src3_8x16b);
        /* r4 - r5 */
        pred4_8x16b = vsubq_s32(src4_8x16b, src5_8x16b);
        /* r6 - r7 */
        pred6_8x16b = vsubq_s32(src6_8x16b, src7_8x16b);

        /* r0 - r1 + r2 - r3 */
        pred1_8x16b = vaddq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 + r6 - r7 */
        pred5_8x16b = vaddq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 + r2 - r3 + r4 - r5 + r6 - r7 */
        out1a_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 + r2 - r3 - r4 + r5 - r6 + r7 */
        out5a_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /* r0 - r1 - r2 + r3 */
        pred1_8x16b = vsubq_s32(pred0_8x16b, pred2_8x16b);
        /* r4 - r5 - r6 + r7 */
        pred5_8x16b = vsubq_s32(pred4_8x16b, pred6_8x16b);
        /* r0 - r1 - r2 + r3 + r4 - r5 - r6 + r7 */
        out3a_8x16b = vaddq_s32(pred1_8x16b, pred5_8x16b);
        /* r0 - r1 - r2 + r3 - r4 + r5 + r6 - r7 */
        out7a_8x16b = vsubq_s32(pred1_8x16b, pred5_8x16b);

        /**************************Next 4 pixels *******************************/
        /************************* 8x8 Vertical Transform*************************/

        /****************************SATD calculation ****************************/
        src0_8x16b = vabsq_s32(out0_8x16b);
        src1_8x16b = vabsq_s32(out1_8x16b);
        src2_8x16b = vabsq_s32(out2_8x16b);
        src3_8x16b = vabsq_s32(out3_8x16b);
        src4_8x16b = vabsq_s32(out4_8x16b);
        src5_8x16b = vabsq_s32(out5_8x16b);
        src6_8x16b = vabsq_s32(out6_8x16b);
        src7_8x16b = vabsq_s32(out7_8x16b);
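        /* zero the DC coefficient (first 32-bit lane of the first output vector)
         * so it is excluded from the SATD */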
        s32 *p = (s32 *)&src0_8x16b;
        p[0] = 0;

        satd = vaddvq_s32(src0_8x16b);
        satd += vaddvq_s32(src1_8x16b);
        satd += vaddvq_s32(src2_8x16b);
        satd += vaddvq_s32(src3_8x16b);
        satd += vaddvq_s32(src4_8x16b);
        satd += vaddvq_s32(src5_8x16b);
        satd += vaddvq_s32(src6_8x16b);
        satd += vaddvq_s32(src7_8x16b);

        src0_8x16b = vabsq_s32(out0a_8x16b);
        src1_8x16b = vabsq_s32(out1a_8x16b);
        src2_8x16b = vabsq_s32(out2a_8x16b);
        src3_8x16b = vabsq_s32(out3a_8x16b);
        src4_8x16b = vabsq_s32(out4a_8x16b);
        src5_8x16b = vabsq_s32(out5a_8x16b);
        src6_8x16b = vabsq_s32(out6a_8x16b);
        src7_8x16b = vabsq_s32(out7a_8x16b);

        satd += vaddvq_s32(src0_8x16b);
        satd += vaddvq_s32(src1_8x16b);
        satd += vaddvq_s32(src2_8x16b);
        satd += vaddvq_s32(src3_8x16b);
        satd += vaddvq_s32(src4_8x16b);
        satd += vaddvq_s32(src5_8x16b);
        satd += vaddvq_s32(src6_8x16b);
        satd += vaddvq_s32(src7_8x16b);

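        /* scale the Hadamard SATD down by 4, with rounding */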
        satd = (satd + 2) >> 2;
        return satd;
    }
}
#endif /* ARM_NEON */