/******************************************************************************
*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ihevce_common_utils_neon.c
*
* @brief
*  Contains NEON intrinsic definitions of the weighted prediction and SAO
*  edge-offset parameter estimation functions
*
* @author
*  ittiam
*
* @par List of Functions:
*  - ihevce_wt_avg_2d_neon()
*  - ihevce_get_luma_eo_sao_params_neon()
*  - ihevce_get_chroma_eo_sao_params_neon()
*
* @remarks
*  None
*
*******************************************************************************
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <arm_neon.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "itt_video_api.h"
#include "ihevce_api.h"

#include "rc_cntrl_param.h"
#include "rc_frame_info_collector.h"
#include "rc_look_ahead_params.h"

#include "ihevc_defs.h"
#include "ihevc_debug.h"
#include "ihevc_structs.h"
#include "ihevc_platform_macros.h"
#include "ihevc_deblk.h"
#include "ihevc_itrans_recon.h"
#include "ihevc_chroma_itrans_recon.h"
#include "ihevc_chroma_intra_pred.h"
#include "ihevc_intra_pred.h"
#include "ihevc_inter_pred.h"
#include "ihevc_mem_fns.h"
#include "ihevc_padding.h"
#include "ihevc_weighted_pred.h"
#include "ihevc_sao.h"
#include "ihevc_resi_trans.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevc_cabac_tables.h"
#include "ihevc_cmn_utils_neon.h"

#include "ihevce_defs.h"
#include "ihevce_hle_interface.h"
#include "ihevce_lap_enc_structs.h"
#include "ihevce_multi_thrd_structs.h"
#include "ihevce_me_common_defs.h"
#include "ihevce_had_satd.h"
#include "ihevce_error_codes.h"
#include "ihevce_bitstream.h"
#include "ihevce_cabac.h"
#include "ihevce_rdoq_macros.h"
#include "ihevce_function_selector.h"
#include "ihevce_enc_structs.h"
#include "ihevce_entropy_structs.h"
#include "ihevce_cmn_utils_instr_set_router.h"
#include "ihevce_enc_loop_structs.h"
#include "ihevce_common_utils.h"
#include "ihevce_global_tables.h"

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

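/* Weighted average of one row of 16 pixels from the two prediction buffers:
 * each output pixel is computed as ((p0 * w0 + p1 * w1) >> shift) + rnd and
 * saturated to the unsigned 8-bit range.
 */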
static void ihevce_wt_avg_2d_16x1_neon(
    UWORD8 *pu1_pred0,
    UWORD8 *pu1_pred1,
    UWORD8 *pu1_dst,
    WORD32 w0,
    WORD32 w1,
    WORD32 rnd,
    WORD32 shift)
109 {
110 uint8x16_t a0, a1;
111 int32x4_t a6, a7, a9;
112 int32x4_t reg0[4], reg1[4];
113 int16x8_t a2, a3, a4, a5, a8;
114
115 a8 = vdupq_n_s16((WORD16)rnd);
116
117 a6 = vdupq_n_s32(w0);
118 a7 = vdupq_n_s32(w1);
119 a9 = vdupq_n_s32(-shift);
120
121 a0 = vld1q_u8(pu1_pred0);
122 a1 = vld1q_u8(pu1_pred1);
123
124 a2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a0)));
125 a3 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a0)));
126 a4 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a1)));
127 a5 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a1)));
128
129 reg0[0] = vmovl_s16(vget_low_s16(a2));
130 reg0[1] = vmovl_s16(vget_high_s16(a2));
131 reg0[2] = vmovl_s16(vget_low_s16(a3));
132 reg0[3] = vmovl_s16(vget_high_s16(a3));
133
134 reg1[0] = vmovl_s16(vget_low_s16(a4));
135 reg1[1] = vmovl_s16(vget_high_s16(a4));
136 reg1[2] = vmovl_s16(vget_low_s16(a5));
137 reg1[3] = vmovl_s16(vget_high_s16(a5));
138
139 reg0[0] = vmulq_s32(reg0[0], a6);
140 reg0[1] = vmulq_s32(reg0[1], a6);
141 reg0[2] = vmulq_s32(reg0[2], a6);
142 reg0[3] = vmulq_s32(reg0[3], a6);
143
144 reg1[0] = vmulq_s32(reg1[0], a7);
145 reg1[1] = vmulq_s32(reg1[1], a7);
146 reg1[2] = vmulq_s32(reg1[2], a7);
147 reg1[3] = vmulq_s32(reg1[3], a7);
148
149 reg0[0] = vaddq_s32(reg0[0], reg1[0]);
150 reg0[1] = vaddq_s32(reg0[1], reg1[1]);
151 reg0[2] = vaddq_s32(reg0[2], reg1[2]);
152 reg0[3] = vaddq_s32(reg0[3], reg1[3]);
153
154 reg0[0] = vshlq_s32(reg0[0], a9);
155 reg0[1] = vshlq_s32(reg0[1], a9);
156 reg0[2] = vshlq_s32(reg0[2], a9);
157 reg0[3] = vshlq_s32(reg0[3], a9); // (p0*w0 + p1*w1) >> shift
158
159 a2 = vcombine_s16(vmovn_s32(reg0[0]), vmovn_s32(reg0[1]));
160 a3 = vcombine_s16(vmovn_s32(reg0[2]), vmovn_s32(reg0[3]));
161
162 a2 = vaddq_s16(a2, a8);
163 a3 = vaddq_s16(a3, a8); // ((p0*w0 + p1*w1) >> shift) + rnd
164 a0 = vcombine_u8(vqmovun_s16(a2), vqmovun_s16(a3));
165
166 vst1q_u8(pu1_dst, a0);
167 }
168
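/* Same weighted-average computation as the 16x1 kernel above, for a single
 * row of 8 pixels.
 */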
static void ihevce_wt_avg_2d_8x1_neon(
    UWORD8 *pu1_pred0,
    UWORD8 *pu1_pred1,
    UWORD8 *pu1_dst,
    WORD32 w0,
    WORD32 w1,
    WORD32 rnd,
    WORD32 shift)
177 {
178 uint8x8_t a2, a3;
179 int16x8_t a0, a1, a6;
180 int32x4_t a4, a5, a7, a8, a9, a10, a11;
181
182 a6 = vdupq_n_s16((WORD16)rnd);
183
184 a4 = vdupq_n_s32(w0);
185 a5 = vdupq_n_s32(w1);
186 a7 = vdupq_n_s32((-shift));
187
188 a2 = vld1_u8(pu1_pred0);
189 a3 = vld1_u8(pu1_pred1);
190 a0 = vreinterpretq_s16_u16(vmovl_u8(a2));
191 a1 = vreinterpretq_s16_u16(vmovl_u8(a3));
192
193 a8 = vmovl_s16(vget_low_s16(a0));
194 a9 = vmovl_s16(vget_high_s16(a0));
195 a10 = vmovl_s16(vget_low_s16(a1));
196 a11 = vmovl_s16(vget_high_s16(a1));
197
198 a8 = vmulq_s32(a8, a4);
199 a9 = vmulq_s32(a9, a4);
200 a10 = vmulq_s32(a10, a5);
201 a11 = vmulq_s32(a11, a5);
202
203 a8 = vaddq_s32(a8, a10);
204 a10 = vaddq_s32(a9, a11);
205
206 a8 = vshlq_s32(a8, a7);
207 a10 = vshlq_s32(a10, a7);
208
209 a0 = vcombine_s16(vmovn_s32(a8), vmovn_s32(a10));
210 a0 = vaddq_s16(a0, a6);
211 a2 = vqmovun_s16(a0);
212 vst1_u8(pu1_dst, a2);
213 }
214
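/* Weighted average for the narrow widths (4 and 12): each iteration gathers a
 * 4x4 block with strided unaligned loads (load_unaligned_u8q), processes it as
 * one 16-lane vector and writes the result back 4 bytes per destination row.
 */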
static void ihevce_wt_avg_2d_4xn_neon(
    UWORD8 *pu1_pred0,
    UWORD8 *pu1_pred1,
    WORD32 pred0_strd,
    WORD32 pred1_strd,
    WORD32 wd,
    WORD32 ht,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 w0,
    WORD32 w1,
    WORD32 rnd,
    WORD32 shift)
228 {
229 WORD32 i, j;
230 uint8x16_t src0_u8, src1_u8;
231 uint16x8_t a0, a1, a2, a3;
232 int32x4_t reg0[4], reg1[4];
233 int32x4_t a4, a5, a7;
234 int16x8_t a8, a9, a6;
235 uint32x2_t p0, p1;
236
237 a6 = vdupq_n_s16((WORD16)rnd);
238
239 a4 = vdupq_n_s32(w0);
240 a5 = vdupq_n_s32(w1);
241 a7 = vdupq_n_s32((-shift));
242
243 for(i = 0; i < ht; i = i + 4)
244 {
245 for(j = 0; j < wd; j = j + 4)
246 {
247 src0_u8 = load_unaligned_u8q(pu1_pred0 + ((i * pred0_strd) + j), pred0_strd);
248 src1_u8 = load_unaligned_u8q(pu1_pred1 + ((i * pred1_strd) + j), pred1_strd);
249
250 a0 = vmovl_u8(vget_low_u8(src0_u8));
251 a1 = vmovl_u8(vget_high_u8(src0_u8));
252 a2 = vmovl_u8(vget_low_u8(src1_u8));
253 a3 = vmovl_u8(vget_high_u8(src1_u8));
254
255 reg0[0] = vmovl_s16(vreinterpret_s16_u16(vget_low_u16(a0)));
256 reg0[1] = vmovl_s16(vreinterpret_s16_u16(vget_high_u16(a0)));
257 reg0[2] = vmovl_s16(vreinterpret_s16_u16(vget_low_u16(a1)));
258 reg0[3] = vmovl_s16(vreinterpret_s16_u16(vget_high_u16(a1)));
259
260 reg1[0] = vmovl_s16(vreinterpret_s16_u16(vget_low_u16(a2)));
261 reg1[1] = vmovl_s16(vreinterpret_s16_u16(vget_high_u16(a2)));
262 reg1[2] = vmovl_s16(vreinterpret_s16_u16(vget_low_u16(a3)));
263 reg1[3] = vmovl_s16(vreinterpret_s16_u16(vget_high_u16(a3)));
264
265 reg0[0] = vmulq_s32(reg0[0], a4);
266 reg0[1] = vmulq_s32(reg0[1], a4);
267 reg0[2] = vmulq_s32(reg0[2], a4);
268 reg0[3] = vmulq_s32(reg0[3], a4);
269
270 reg1[0] = vmulq_s32(reg1[0], a5);
271 reg1[1] = vmulq_s32(reg1[1], a5);
272 reg1[2] = vmulq_s32(reg1[2], a5);
273 reg1[3] = vmulq_s32(reg1[3], a5);
274
275 reg0[0] = vaddq_s32(reg0[0], reg1[0]);
276 reg0[1] = vaddq_s32(reg0[1], reg1[1]);
277 reg0[2] = vaddq_s32(reg0[2], reg1[2]);
278 reg0[3] = vaddq_s32(reg0[3], reg1[3]);
279
280 reg0[0] = vshlq_s32(reg0[0], a7);
281 reg0[1] = vshlq_s32(reg0[1], a7);
282 reg0[2] = vshlq_s32(reg0[2], a7);
283 reg0[3] = vshlq_s32(reg0[3], a7);
284
285 a8 = vcombine_s16(vmovn_s32(reg0[0]), vmovn_s32(reg0[1]));
286 a9 = vcombine_s16(vmovn_s32(reg0[2]), vmovn_s32(reg0[3]));
287
288 a8 = vaddq_s16(a8, a6);
289 a9 = vaddq_s16(a9, a6);
290
291 p0 = vreinterpret_u32_u8(vqmovun_s16(a8));
292 p1 = vreinterpret_u32_u8(vqmovun_s16(a9));
293
294 *(UWORD32 *)pu1_dst = vget_lane_u32(p0, 0);
295 *(UWORD32 *)(pu1_dst + dst_strd) = vget_lane_u32(p0, 1);
296 *(UWORD32 *)(pu1_dst + 2 * dst_strd) = vget_lane_u32(p1, 0);
297 *(UWORD32 *)(pu1_dst + 3 * dst_strd) = vget_lane_u32(p1, 1);
298
299 pu1_dst += 4;
300 }
301 pu1_dst = pu1_dst - wd + 4 * dst_strd;
302 }
303 }
304
/**
********************************************************************************
*
* @brief Weighted pred of 2 predictor buffers as per spec
*
* @param[in] pu1_pred0 : Pred0 buffer
*
* @param[in] pu1_pred1 : Pred1 buffer
*
* @param[in] pred0_strd : Stride of pred0 buffer
*
* @param[in] pred1_strd : Stride of pred1 buffer
*
* @param[in] wd : Width of pred block
*
* @param[in] ht : Height of pred block
*
* @param[out] pu1_dst : Destination buffer that will hold result
*
* @param[in] dst_strd : Stride of dest buffer
*
* @param[in] w0 : Weighting factor of Pred0
*
* @param[in] w1 : Weighting factor of Pred1
*
* @param[in] o0 : Offset for Pred0
*
* @param[in] o1 : Offset for Pred1
*
* @param[in] log_wdc : Shift factor as per spec
*
* @return none
*
********************************************************************************
*/
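/* Illustration (parameter values assumed purely for this example): with
 * w0 = w1 = 1, o0 = o1 = 1 and log_wdc = 0, the code below uses rnd = 1 and
 * shift = 1, so every output pixel becomes ((p0 + p1) >> 1) + 1, saturated
 * to 8 bits.
 */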
void ihevce_wt_avg_2d_neon(
    UWORD8 *pu1_pred0,
    UWORD8 *pu1_pred1,
    WORD32 pred0_strd,
    WORD32 pred1_strd,
    WORD32 wd,
    WORD32 ht,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 w0,
    WORD32 w1,
    WORD32 o0,
    WORD32 o1,
    WORD32 log_wdc)
354 {
355 /* Total Rounding term to be added, including offset */
356 WORD32 rnd = (o0 + o1 + 1) >> 1; // << log_wdc;
357 /* Downshift */
358 WORD32 shift = log_wdc + 1;
359 /* loop counters */
360 WORD32 i, j;
361
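    /* Dispatch on PU width: widths 4 and 12 take the 4xn kernel, 8 and 24 the
     * 8x1 kernel per 8-pixel segment of a row, and 16/32/48/64 the 16x1 kernel
     * per 16-pixel segment of a row. */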
362 switch(wd)
363 {
364 case 4:
365 case 12:
366 ihevce_wt_avg_2d_4xn_neon(
367 pu1_pred0,
368 pu1_pred1,
369 pred0_strd,
370 pred1_strd,
371 wd,
372 ht,
373 pu1_dst,
374 dst_strd,
375 w0,
376 w1,
377 rnd,
378 shift);
379 break;
380 case 8:
381 case 24:
382 for(i = 0; i < ht; i++)
383 {
384 for(j = 0; j < wd; j = j + 8)
385 {
386 ihevce_wt_avg_2d_8x1_neon(
387 pu1_pred0 + ((i * pred0_strd) + j),
388 pu1_pred1 + ((i * pred1_strd) + j),
389 pu1_dst + ((i * dst_strd) + j),
390 w0,
391 w1,
392 rnd,
393 shift);
394 }
395 }
396 break;
397 case 16:
398 for(i = 0; i < ht; i++)
399 ihevce_wt_avg_2d_16x1_neon(
400 pu1_pred0 + (i * pred0_strd),
401 pu1_pred1 + (i * pred1_strd),
402 pu1_dst + (i * dst_strd),
403 w0,
404 w1,
405 rnd,
406 shift);
407 break;
408 case 32:
409 case 64:
410 for(i = 0; i < ht; i++)
411 {
412 for(j = 0; j < wd; j = j + 16)
413 {
414 ihevce_wt_avg_2d_16x1_neon(
415 pu1_pred0 + ((i * pred0_strd) + j),
416 pu1_pred1 + ((i * pred1_strd) + j),
417 pu1_dst + ((i * dst_strd) + j),
418 w0,
419 w1,
420 rnd,
421 shift);
422 }
423 }
424 break;
425 case 48:
426 for(i = 0; i < ht; i++)
427 {
428 for(j = 0; j < wd; j = j + 16)
429 {
430 ihevce_wt_avg_2d_16x1_neon(
431 pu1_pred0 + ((i * pred0_strd) + j),
432 pu1_pred1 + ((i * pred1_strd) + j),
433 pu1_dst + ((i * dst_strd) + j),
434 w0,
435 w1,
436 rnd,
437 shift);
438 }
439 }
440 break;
441 default:
442 assert(0);
443 break;
444 }
445 return;
446 }
447
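/* Horizontal sum of the eight signed 16-bit lanes of temp_reg; used to fold
 * the per-lane SAO error/count accumulators into a single 32-bit total.
 */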
static INLINE WORD32 sad_cal(int16x8_t temp_reg)
449 {
450 int64x2_t sad_reg = vpaddlq_s32(vpaddlq_s16(temp_reg));
451
452 return (vget_lane_s32(
453 vadd_s32(
454 vreinterpret_s32_s64(vget_low_s64(sad_reg)),
455 vreinterpret_s32_s64(vget_high_s64(sad_reg))),
456 0));
457 }
458
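/* Gathers edge-offset (EO) SAO statistics for the luma CTB: for the given EO
 * class (0, 90, 135 or 45 degrees) it accumulates, per edge category, the sum
 * of (source - recon) pixel errors and the number of contributing pixels,
 * which the caller uses when choosing the SAO edge offsets.
 */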
void ihevce_get_luma_eo_sao_params_neon(
    void *pv_sao_ctxt,
    WORD32 eo_sao_class,
    WORD32 *pi4_acc_error_category,
    WORD32 *pi4_category_count)
464 {
465 /*temp var*/
466 UWORD8 *pu1_luma_recon_buf, *pu1_luma_src_buf;
467 UWORD8 *pu1_luma_src_buf_copy, *pu1_luma_recon_buf_copy;
468 WORD32 row_end, col_end, row, col;
469 WORD32 row_start = 0, col_start = 0;
470 WORD32 wd, rem_wd;
471 WORD32 a, b, c, edge_idx, pel_err;
472
473 int16x8_t temp_reg0, temp_reg1, temp_reg2, temp_reg3, temp_reg4;
474 int16x8_t edgeidx_reg0, edgeidx_reg1, edgeidx_reg2, edgeidx_reg3, edgeidx_reg4;
475 int16x8_t edgeidx_reg5, edgeidx_reg6, edgeidx_reg7;
476 int16x8_t pel_error, pel_error1;
477 int16x8_t sign_reg0, sign_reg1, sign_reg, sign_reg2, sign_reg3;
478 int16x8_t edgeidx, edgeidx1;
479 int16x8_t temp_reg5, temp_reg6, temp_reg7;
480 uint8x16_t src_buf_8x16, recon_buf_8x16, recon_buf0_8x16, recon_buf1_8x16;
481 uint8x8_t src_buf, recon_buf, recon_buf0, recon_buf1;
482
483 sao_ctxt_t *ps_sao_ctxt = (sao_ctxt_t *)pv_sao_ctxt;
484 const WORD32 i4_luma_recon_strd = ps_sao_ctxt->i4_cur_luma_recon_stride;
485 const WORD32 i4_luma_src_strd = ps_sao_ctxt->i4_cur_luma_src_stride;
486
487 const int16x8_t const_2 = vdupq_n_s16(2);
488 const int16x8_t const_0 = vdupq_n_s16(0);
489 const int16x8_t const_1 = vdupq_n_s16(1);
490 const int16x8_t const_3 = vdupq_n_s16(3);
491 const int16x8_t const_4 = vdupq_n_s16(4);
492
493 row_end = ps_sao_ctxt->i4_sao_blk_ht;
494 col_end = ps_sao_ctxt->i4_sao_blk_wd;
495
496 if((ps_sao_ctxt->i4_ctb_x == 0) && (eo_sao_class != SAO_EDGE_90_DEG))
497 {
498 col_start = 1;
499 }
500
501 if(((ps_sao_ctxt->i4_ctb_x + 1) == ps_sao_ctxt->ps_sps->i2_pic_wd_in_ctb) &&
502 (eo_sao_class != SAO_EDGE_90_DEG))
503 {
504 col_end = col_end - 1;
505 }
506
507 if((ps_sao_ctxt->i4_ctb_y == 0) && (eo_sao_class != SAO_EDGE_0_DEG))
508 {
509 row_start = 1;
510 }
511
512 if(((ps_sao_ctxt->i4_ctb_y + 1) == ps_sao_ctxt->ps_sps->i2_pic_ht_in_ctb) &&
513 (eo_sao_class != SAO_EDGE_0_DEG))
514 {
515 row_end = row_end - 1;
516 }
517 wd = col_end - col_start;
518 rem_wd = wd;
519 pu1_luma_recon_buf =
520 ps_sao_ctxt->pu1_cur_luma_recon_buf + col_start + (row_start * i4_luma_recon_strd);
521 pu1_luma_src_buf =
522 ps_sao_ctxt->pu1_cur_luma_src_buf + col_start + (row_start * i4_luma_src_strd);
523
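    /* For every pixel c with neighbours a and b along the chosen EO direction,
     * edge_idx = 2 + SIGN(c - a) + SIGN(c - b); pixels whose source/recon error
     * is zero are forced to index 0. The vector paths below accumulate the
     * error and count of indices 0, 1, 3 and 4 (index 2 is not tracked),
     * sixteen or eight pixels at a time, and a scalar loop handles the
     * leftover columns. */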
524 switch(eo_sao_class)
525 {
526 case SAO_EDGE_0_DEG:
527 for(row = row_start; row < row_end; row++)
528 {
529 pu1_luma_src_buf_copy = pu1_luma_src_buf;
530 pu1_luma_recon_buf_copy = pu1_luma_recon_buf;
531 for(col = wd; col > 15; col -= 16)
532 {
533 /*load src and recon data*/
534 src_buf_8x16 = vld1q_u8(pu1_luma_src_buf);
535 recon_buf_8x16 = vld1q_u8(pu1_luma_recon_buf);
536 recon_buf0_8x16 = vld1q_u8(pu1_luma_recon_buf - 1);
537 recon_buf1_8x16 = vld1q_u8(pu1_luma_recon_buf + 1);
538
539 /*pel_error*/
540 pel_error = vreinterpretq_s16_u16(
541 vsubl_u8(vget_low_u8(src_buf_8x16), vget_low_u8(recon_buf_8x16)));
542 pel_error1 = vreinterpretq_s16_u16(
543 vsubl_u8(vget_high_u8(src_buf_8x16), vget_high_u8(recon_buf_8x16)));
544
545 /*sign*/
546 sign_reg0 = vreinterpretq_s16_u16(
547 vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf0_8x16)));
548 sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
549 sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
550 sign_reg0 = vsubq_s16(sign_reg0, sign_reg);
551
552 sign_reg1 = vreinterpretq_s16_u16(
553 vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf1_8x16)));
554 sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
555 sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
556 sign_reg1 = vsubq_s16(sign_reg1, sign_reg);
557
558 sign_reg2 = vreinterpretq_s16_u16(
559 vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf0_8x16)));
560 sign_reg = (int16x8_t)vcgtq_s16(sign_reg2, const_0);
561 sign_reg2 = (int16x8_t)vcltq_s16(sign_reg2, const_0);
562 sign_reg2 = vsubq_s16(sign_reg2, sign_reg);
563
564 sign_reg3 = vreinterpretq_s16_u16(
565 vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf1_8x16)));
566 sign_reg = (int16x8_t)vcgtq_s16(sign_reg3, const_0);
567 sign_reg3 = (int16x8_t)vcltq_s16(sign_reg3, const_0);
568 sign_reg3 = vsubq_s16(sign_reg3, sign_reg);
/*edgeidx*/
570 edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
571 edgeidx1 = vaddq_s16(vaddq_s16(sign_reg2, const_2), sign_reg3);
572
573 edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
574 edgeidx = vandq_s16(edgeidx_reg0, edgeidx);
575
576 edgeidx_reg5 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error1));
577 edgeidx1 = vandq_s16(edgeidx_reg5, edgeidx1);
578
579 temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
580 temp_reg4 = (int16x8_t)vceqq_s16(const_0, edgeidx1);
581 temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
582 temp_reg5 = (int16x8_t)vceqq_s16(const_1, edgeidx1);
583
584 temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
585 temp_reg6 = (int16x8_t)vceqq_s16(const_3, edgeidx1);
586 temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
587 temp_reg7 = (int16x8_t)vceqq_s16(const_4, edgeidx1);
588
589 edgeidx_reg1 = vabsq_s16(temp_reg1);
590 edgeidx_reg5 = vabsq_s16(temp_reg5);
591
592 edgeidx_reg2 = vabsq_s16(temp_reg2);
593 edgeidx_reg6 = vabsq_s16(temp_reg6);
594 edgeidx_reg3 = vabsq_s16(temp_reg3);
595 edgeidx_reg7 = vabsq_s16(temp_reg7);
596
597 temp_reg0 = vandq_s16(temp_reg0, pel_error);
598 temp_reg4 = vandq_s16(temp_reg4, pel_error1);
599 temp_reg1 = vandq_s16(temp_reg1, pel_error);
600 temp_reg5 = vandq_s16(temp_reg5, pel_error1);
601
602 temp_reg2 = vandq_s16(temp_reg2, pel_error);
603 temp_reg6 = vandq_s16(temp_reg6, pel_error1);
604 temp_reg3 = vandq_s16(temp_reg3, pel_error);
605 temp_reg7 = vandq_s16(temp_reg7, pel_error1);
606
607 edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
608 edgeidx_reg4 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg4));
609
610 temp_reg0 = vaddq_s16(temp_reg0, temp_reg4);
611 temp_reg1 = vaddq_s16(temp_reg1, temp_reg5);
612 temp_reg2 = vaddq_s16(temp_reg2, temp_reg6);
613 temp_reg3 = vaddq_s16(temp_reg3, temp_reg7);
614
615 edgeidx_reg0 = vaddq_s16(edgeidx_reg0, edgeidx_reg4);
616 edgeidx_reg1 = vaddq_s16(edgeidx_reg1, edgeidx_reg5);
617 edgeidx_reg2 = vaddq_s16(edgeidx_reg2, edgeidx_reg6);
618 edgeidx_reg3 = vaddq_s16(edgeidx_reg3, edgeidx_reg7);
619
/*store pel error*/
621 pi4_acc_error_category[0] += sad_cal(temp_reg0);
622 pi4_acc_error_category[1] += sad_cal(temp_reg1);
623 pi4_acc_error_category[3] += sad_cal(temp_reg2);
624 pi4_acc_error_category[4] += sad_cal(temp_reg3);
625
626 /*store edgeidx account*/
627 pi4_category_count[0] += sad_cal(edgeidx_reg0);
628 pi4_category_count[1] += sad_cal(edgeidx_reg1);
629 pi4_category_count[3] += sad_cal(edgeidx_reg2);
630 pi4_category_count[4] += sad_cal(edgeidx_reg3);
631 pu1_luma_recon_buf += 16;
632 pu1_luma_src_buf += 16;
633 }
634 rem_wd &= 0x0F;
635
636 if(rem_wd > 7)
637 {
638 /*load data*/
639 src_buf = vld1_u8(pu1_luma_src_buf);
640 recon_buf = vld1_u8(pu1_luma_recon_buf);
641 recon_buf0 = vld1_u8(pu1_luma_recon_buf - 1);
642 recon_buf1 = vld1_u8(pu1_luma_recon_buf + 1);
643 /*pel_error*/
644 pel_error = vreinterpretq_s16_u16(vsubl_u8(src_buf, recon_buf));
645 /*sign*/
646 sign_reg0 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf0));
647 sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
648 sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
649 sign_reg0 = vsubq_s16(sign_reg0, sign_reg);
650
651 sign_reg1 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf1));
652 sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
653 sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
654 sign_reg1 = vsubq_s16(sign_reg1, sign_reg);
655
656 edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
657
658 edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
659 edgeidx = vandq_s16(edgeidx_reg0, edgeidx);
660
661 temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
662 temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
663 temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
664 temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
665
666 edgeidx_reg1 = vabsq_s16(temp_reg1);
667 edgeidx_reg2 = vabsq_s16(temp_reg2);
668 edgeidx_reg3 = vabsq_s16(temp_reg3);
669
670 temp_reg0 = vandq_s16(temp_reg0, pel_error);
671 temp_reg1 = vandq_s16(temp_reg1, pel_error);
672 temp_reg2 = vandq_s16(temp_reg2, pel_error);
673 temp_reg3 = vandq_s16(temp_reg3, pel_error);
674
675 edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
676 /*store */
677 pi4_acc_error_category[0] += sad_cal(temp_reg0);
678 pi4_acc_error_category[1] += sad_cal(temp_reg1);
679 pi4_acc_error_category[3] += sad_cal(temp_reg2);
680 pi4_acc_error_category[4] += sad_cal(temp_reg3);
681
682 pi4_category_count[0] += sad_cal(edgeidx_reg0);
683 pi4_category_count[1] += sad_cal(edgeidx_reg1);
684 pi4_category_count[3] += sad_cal(edgeidx_reg2);
685 pi4_category_count[4] += sad_cal(edgeidx_reg3);
686 pu1_luma_recon_buf += 8;
687 pu1_luma_src_buf += 8;
688 }
689 rem_wd &= 0x7;
690 if(rem_wd)
691 {
692 for(col = 0; col < rem_wd; col++)
693 {
694 c = pu1_luma_recon_buf[col];
695 a = pu1_luma_recon_buf[col - 1];
696 b = pu1_luma_recon_buf[col + 1];
697 pel_err = pu1_luma_src_buf[col] - pu1_luma_recon_buf[col];
698 edge_idx = 2 + SIGN(c - a) + SIGN(c - b);
699
700 if(pel_err != 0)
701 {
702 pi4_acc_error_category[edge_idx] += pel_err;
703 pi4_category_count[edge_idx]++;
704 }
705 }
706 }
707 pu1_luma_recon_buf = pu1_luma_recon_buf_copy + i4_luma_recon_strd;
708 pu1_luma_src_buf = pu1_luma_src_buf_copy + i4_luma_src_strd;
709 rem_wd = wd;
710 }
711 break;
712 case SAO_EDGE_90_DEG:
713 for(row = row_start; row < row_end; row++)
714 {
715 pu1_luma_src_buf_copy = pu1_luma_src_buf;
716 pu1_luma_recon_buf_copy = pu1_luma_recon_buf;
717 for(col = wd; col > 15; col -= 16)
718 {
719 /*load src and recon data*/
720 src_buf_8x16 = vld1q_u8(pu1_luma_src_buf);
721 recon_buf_8x16 = vld1q_u8(pu1_luma_recon_buf);
722 recon_buf0_8x16 = vld1q_u8(pu1_luma_recon_buf - i4_luma_recon_strd);
723 recon_buf1_8x16 = vld1q_u8(pu1_luma_recon_buf + i4_luma_recon_strd);
724 /*pel_error*/
725 pel_error = vreinterpretq_s16_u16(
726 vsubl_u8(vget_low_u8(src_buf_8x16), vget_low_u8(recon_buf_8x16)));
727 pel_error1 = vreinterpretq_s16_u16(
728 vsubl_u8(vget_high_u8(src_buf_8x16), vget_high_u8(recon_buf_8x16)));
729 /*sign*/
730 sign_reg0 = vreinterpretq_s16_u16(
731 vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf0_8x16)));
732 sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
733 sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
734 sign_reg0 = vsubq_s16(sign_reg0, sign_reg);
735
736 sign_reg1 = vreinterpretq_s16_u16(
737 vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf1_8x16)));
738 sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
739 sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
740 sign_reg1 = vsubq_s16(sign_reg1, sign_reg);
741
742 sign_reg2 = vreinterpretq_s16_u16(
743 vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf0_8x16)));
744 sign_reg = (int16x8_t)vcgtq_s16(sign_reg2, const_0);
745 sign_reg2 = (int16x8_t)vcltq_s16(sign_reg2, const_0);
746 sign_reg2 = vsubq_s16(sign_reg2, sign_reg);
747
748 sign_reg3 = vreinterpretq_s16_u16(
749 vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf1_8x16)));
750 sign_reg = (int16x8_t)vcgtq_s16(sign_reg3, const_0);
751 sign_reg3 = (int16x8_t)vcltq_s16(sign_reg3, const_0);
752 sign_reg3 = vsubq_s16(sign_reg3, sign_reg);
753 /*edgeidx*/
754 edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
755 edgeidx1 = vaddq_s16(vaddq_s16(sign_reg2, const_2), sign_reg3);
756
757 edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
758 edgeidx = vandq_s16(edgeidx_reg0, edgeidx);
759
760 edgeidx_reg5 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error1));
761 edgeidx1 = vandq_s16(edgeidx_reg5, edgeidx1);
762
763 temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
764 temp_reg4 = (int16x8_t)vceqq_s16(const_0, edgeidx1);
765 temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
766 temp_reg5 = (int16x8_t)vceqq_s16(const_1, edgeidx1);
767
768 temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
769 temp_reg6 = (int16x8_t)vceqq_s16(const_3, edgeidx1);
770 temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
771 temp_reg7 = (int16x8_t)vceqq_s16(const_4, edgeidx1);
772
773 edgeidx_reg1 = vabsq_s16(temp_reg1);
774 edgeidx_reg5 = vabsq_s16(temp_reg5);
775
776 edgeidx_reg2 = vabsq_s16(temp_reg2);
777 edgeidx_reg6 = vabsq_s16(temp_reg6);
778 edgeidx_reg3 = vabsq_s16(temp_reg3);
779 edgeidx_reg7 = vabsq_s16(temp_reg7);
780
781 temp_reg0 = vandq_s16(temp_reg0, pel_error);
782 temp_reg4 = vandq_s16(temp_reg4, pel_error1);
783 temp_reg1 = vandq_s16(temp_reg1, pel_error);
784 temp_reg5 = vandq_s16(temp_reg5, pel_error1);
785
786 temp_reg2 = vandq_s16(temp_reg2, pel_error);
787 temp_reg6 = vandq_s16(temp_reg6, pel_error1);
788 temp_reg3 = vandq_s16(temp_reg3, pel_error);
789 temp_reg7 = vandq_s16(temp_reg7, pel_error1);
790
791 edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
792 edgeidx_reg4 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg4));
793
794 temp_reg0 = vaddq_s16(temp_reg0, temp_reg4);
795 temp_reg1 = vaddq_s16(temp_reg1, temp_reg5);
796 temp_reg2 = vaddq_s16(temp_reg2, temp_reg6);
797 temp_reg3 = vaddq_s16(temp_reg3, temp_reg7);
798
799 edgeidx_reg0 = vaddq_s16(edgeidx_reg0, edgeidx_reg4);
800 edgeidx_reg1 = vaddq_s16(edgeidx_reg1, edgeidx_reg5);
801 edgeidx_reg2 = vaddq_s16(edgeidx_reg2, edgeidx_reg6);
802 edgeidx_reg3 = vaddq_s16(edgeidx_reg3, edgeidx_reg7);
803 /* store */
804 pi4_acc_error_category[0] += sad_cal(temp_reg0);
805 pi4_acc_error_category[1] += sad_cal(temp_reg1);
806 pi4_acc_error_category[3] += sad_cal(temp_reg2);
807 pi4_acc_error_category[4] += sad_cal(temp_reg3);
808 /*store account*/
809 pi4_category_count[0] += sad_cal(edgeidx_reg0);
810 pi4_category_count[1] += sad_cal(edgeidx_reg1);
811 pi4_category_count[3] += sad_cal(edgeidx_reg2);
812 pi4_category_count[4] += sad_cal(edgeidx_reg3);
813 pu1_luma_recon_buf += 16;
814 pu1_luma_src_buf += 16;
815 }
816 rem_wd &= 0x0F;
817
818 if(rem_wd > 7)
819 {
820 /*load*/
821 src_buf = vld1_u8(pu1_luma_src_buf);
822 recon_buf = vld1_u8(pu1_luma_recon_buf);
823 recon_buf0 = vld1_u8(pu1_luma_recon_buf - i4_luma_recon_strd);
824 recon_buf1 = vld1_u8(pu1_luma_recon_buf + i4_luma_recon_strd);
825 /*pel_error*/
826 pel_error = vreinterpretq_s16_u16(vsubl_u8(src_buf, recon_buf));
827 /*sign*/
828 sign_reg0 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf0));
829 sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
830 sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
831 sign_reg0 = vsubq_s16(sign_reg0, sign_reg);
832
833 sign_reg1 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf1));
834 sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
835 sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
836 sign_reg1 = vsubq_s16(sign_reg1, sign_reg);
837
838 edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
839 edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
840 edgeidx = vandq_s16(edgeidx_reg0, edgeidx);
841
842 temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
843 temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
844 temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
845 temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
846
847 edgeidx_reg1 = vabsq_s16(temp_reg1);
848 edgeidx_reg2 = vabsq_s16(temp_reg2);
849 edgeidx_reg3 = vabsq_s16(temp_reg3);
850
851 temp_reg0 = vandq_s16(temp_reg0, pel_error);
852 temp_reg1 = vandq_s16(temp_reg1, pel_error);
853 temp_reg2 = vandq_s16(temp_reg2, pel_error);
854 temp_reg3 = vandq_s16(temp_reg3, pel_error);
855
856 edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
857 /*store*/
858 pi4_acc_error_category[0] += sad_cal(temp_reg0);
859 pi4_acc_error_category[1] += sad_cal(temp_reg1);
860 pi4_acc_error_category[3] += sad_cal(temp_reg2);
861 pi4_acc_error_category[4] += sad_cal(temp_reg3);
862
863 pi4_category_count[0] += sad_cal(edgeidx_reg0);
864 pi4_category_count[1] += sad_cal(edgeidx_reg1);
865 pi4_category_count[3] += sad_cal(edgeidx_reg2);
866 pi4_category_count[4] += sad_cal(edgeidx_reg3);
867 pu1_luma_recon_buf += 8;
868 pu1_luma_src_buf += 8;
869 }
870 rem_wd &= 0x7;
871 if(rem_wd)
872 {
873 for(col = 0; col < rem_wd; col++)
874 {
875 c = pu1_luma_recon_buf[col];
876 a = pu1_luma_recon_buf[col - i4_luma_recon_strd];
877 b = pu1_luma_recon_buf[col + i4_luma_recon_strd];
878 pel_err = pu1_luma_src_buf[col] - pu1_luma_recon_buf[col];
879 edge_idx = 2 + SIGN(c - a) + SIGN(c - b);
880
881 if(pel_err != 0)
882 {
883 pi4_acc_error_category[edge_idx] += pel_err;
884 pi4_category_count[edge_idx]++;
885 }
886 }
887 }
888 pu1_luma_recon_buf = pu1_luma_recon_buf_copy + i4_luma_recon_strd;
889 pu1_luma_src_buf = pu1_luma_src_buf_copy + i4_luma_src_strd;
890 rem_wd = wd;
891 }
892 break;
893 case SAO_EDGE_135_DEG:
894 for(row = row_start; row < row_end; row++)
895 {
896 pu1_luma_src_buf_copy = pu1_luma_src_buf;
897 pu1_luma_recon_buf_copy = pu1_luma_recon_buf;
898 for(col = wd; col > 15; col -= 16)
899 {
900 /*load src and recon data*/
901 src_buf_8x16 = vld1q_u8(pu1_luma_src_buf);
902 recon_buf_8x16 = vld1q_u8(pu1_luma_recon_buf);
903 recon_buf0_8x16 = vld1q_u8(pu1_luma_recon_buf - 1 - i4_luma_recon_strd);
904 recon_buf1_8x16 = vld1q_u8(pu1_luma_recon_buf + 1 + i4_luma_recon_strd);
905 /*pel_error*/
906 pel_error = vreinterpretq_s16_u16(
907 vsubl_u8(vget_low_u8(src_buf_8x16), vget_low_u8(recon_buf_8x16)));
908 pel_error1 = vreinterpretq_s16_u16(
909 vsubl_u8(vget_high_u8(src_buf_8x16), vget_high_u8(recon_buf_8x16)));
910 /*sign*/
911 sign_reg0 = vreinterpretq_s16_u16(
912 vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf0_8x16)));
913 sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
914 sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
915 sign_reg0 = vsubq_s16(sign_reg0, sign_reg);
916
917 sign_reg1 = vreinterpretq_s16_u16(
918 vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf1_8x16)));
919 sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
920 sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
921 sign_reg1 = vsubq_s16(sign_reg1, sign_reg);
922
923 sign_reg2 = vreinterpretq_s16_u16(
924 vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf0_8x16)));
925 sign_reg = (int16x8_t)vcgtq_s16(sign_reg2, const_0);
926 sign_reg2 = (int16x8_t)vcltq_s16(sign_reg2, const_0);
927 sign_reg2 = vsubq_s16(sign_reg2, sign_reg);
928
929 sign_reg3 = vreinterpretq_s16_u16(
930 vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf1_8x16)));
931 sign_reg = (int16x8_t)vcgtq_s16(sign_reg3, const_0);
932 sign_reg3 = (int16x8_t)vcltq_s16(sign_reg3, const_0);
933 sign_reg3 = vsubq_s16(sign_reg3, sign_reg);
934
935 edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
936 edgeidx1 = vaddq_s16(vaddq_s16(sign_reg2, const_2), sign_reg3);
937
938 edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
939 edgeidx = vandq_s16(edgeidx_reg0, edgeidx);
940
941 edgeidx_reg5 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error1));
942 edgeidx1 = vandq_s16(edgeidx_reg5, edgeidx1);
943
944 temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
945 temp_reg4 = (int16x8_t)vceqq_s16(const_0, edgeidx1);
946 temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
947 temp_reg5 = (int16x8_t)vceqq_s16(const_1, edgeidx1);
948
949 temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
950 temp_reg6 = (int16x8_t)vceqq_s16(const_3, edgeidx1);
951 temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
952 temp_reg7 = (int16x8_t)vceqq_s16(const_4, edgeidx1);
953
954 edgeidx_reg1 = vabsq_s16(temp_reg1);
955 edgeidx_reg5 = vabsq_s16(temp_reg5);
956
957 edgeidx_reg2 = vabsq_s16(temp_reg2);
958 edgeidx_reg6 = vabsq_s16(temp_reg6);
959 edgeidx_reg3 = vabsq_s16(temp_reg3);
960 edgeidx_reg7 = vabsq_s16(temp_reg7);
961
962 temp_reg0 = vandq_s16(temp_reg0, pel_error);
963 temp_reg4 = vandq_s16(temp_reg4, pel_error1);
964 temp_reg1 = vandq_s16(temp_reg1, pel_error);
965 temp_reg5 = vandq_s16(temp_reg5, pel_error1);
966
967 temp_reg2 = vandq_s16(temp_reg2, pel_error);
968 temp_reg6 = vandq_s16(temp_reg6, pel_error1);
969 temp_reg3 = vandq_s16(temp_reg3, pel_error);
970 temp_reg7 = vandq_s16(temp_reg7, pel_error1);
971
972 edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
973 edgeidx_reg4 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg4));
974
975 temp_reg0 = vaddq_s16(temp_reg0, temp_reg4);
976 temp_reg1 = vaddq_s16(temp_reg1, temp_reg5);
977 temp_reg2 = vaddq_s16(temp_reg2, temp_reg6);
978 temp_reg3 = vaddq_s16(temp_reg3, temp_reg7);
979
980 edgeidx_reg0 = vaddq_s16(edgeidx_reg0, edgeidx_reg4);
981 edgeidx_reg1 = vaddq_s16(edgeidx_reg1, edgeidx_reg5);
982 edgeidx_reg2 = vaddq_s16(edgeidx_reg2, edgeidx_reg6);
983 edgeidx_reg3 = vaddq_s16(edgeidx_reg3, edgeidx_reg7);
984 /*store*/
985 pi4_acc_error_category[0] += sad_cal(temp_reg0);
986 pi4_acc_error_category[1] += sad_cal(temp_reg1);
987 pi4_acc_error_category[3] += sad_cal(temp_reg2);
988 pi4_acc_error_category[4] += sad_cal(temp_reg3);
989
990 pi4_category_count[0] += sad_cal(edgeidx_reg0);
991 pi4_category_count[1] += sad_cal(edgeidx_reg1);
992 pi4_category_count[3] += sad_cal(edgeidx_reg2);
993 pi4_category_count[4] += sad_cal(edgeidx_reg3);
994 pu1_luma_recon_buf += 16;
995 pu1_luma_src_buf += 16;
996 }
997 rem_wd &= 0x0F;
998
999 if(rem_wd > 7)
1000 {
1001 /*load data*/
1002 src_buf = vld1_u8(pu1_luma_src_buf);
1003 recon_buf = vld1_u8(pu1_luma_recon_buf);
1004 recon_buf0 = vld1_u8(pu1_luma_recon_buf - 1 - i4_luma_recon_strd);
1005 recon_buf1 = vld1_u8(pu1_luma_recon_buf + 1 + i4_luma_recon_strd);
1006 /*pel_error*/
1007 pel_error = vreinterpretq_s16_u16(vsubl_u8(src_buf, recon_buf));
1008 /*sign*/
1009 sign_reg0 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf0));
1010 sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
1011 sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
1012 sign_reg0 = vsubq_s16(sign_reg0, sign_reg);
1013
1014 sign_reg1 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf1));
1015 sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
1016 sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
1017 sign_reg1 = vsubq_s16(sign_reg1, sign_reg);
1018
1019 edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
1020 edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
1021 edgeidx = vandq_s16(edgeidx_reg0, edgeidx);
1022
1023 temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
1024 temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
1025 temp_reg3 = (int16x8_t)vceqq_s16(const_3, edgeidx);
1026 temp_reg4 = (int16x8_t)vceqq_s16(const_4, edgeidx);
1027
1028 edgeidx_reg1 = vabsq_s16(temp_reg1);
1029 edgeidx_reg3 = vabsq_s16(temp_reg3);
1030 edgeidx_reg4 = vabsq_s16(temp_reg4);
1031
1032 temp_reg0 = vandq_s16(temp_reg0, pel_error);
1033 temp_reg1 = vandq_s16(temp_reg1, pel_error);
1034 temp_reg3 = vandq_s16(temp_reg3, pel_error);
1035 temp_reg4 = vandq_s16(temp_reg4, pel_error);
1036
1037 edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
1038 /*store*/
1039 pi4_acc_error_category[0] += sad_cal(temp_reg0);
1040 pi4_acc_error_category[1] += sad_cal(temp_reg1);
1041 pi4_acc_error_category[3] += sad_cal(temp_reg3);
1042 pi4_acc_error_category[4] += sad_cal(temp_reg4);
1043
1044 pi4_category_count[0] += sad_cal(edgeidx_reg0);
1045 pi4_category_count[1] += sad_cal(edgeidx_reg1);
1046 pi4_category_count[3] += sad_cal(edgeidx_reg3);
1047 pi4_category_count[4] += sad_cal(edgeidx_reg4);
1048 pu1_luma_recon_buf += 8;
1049 pu1_luma_src_buf += 8;
1050 }
1051 rem_wd &= 0x7;
1052 if(rem_wd)
1053 {
1054 for(col = 0; col < rem_wd; col++)
1055 {
1056 c = pu1_luma_recon_buf[col];
1057 a = pu1_luma_recon_buf[col - 1 - i4_luma_recon_strd];
1058 b = pu1_luma_recon_buf[col + 1 + i4_luma_recon_strd];
1059 pel_err = pu1_luma_src_buf[col] - pu1_luma_recon_buf[col];
1060 edge_idx = 2 + SIGN(c - a) + SIGN(c - b);
1061
1062 if(pel_err != 0)
1063 {
1064 pi4_acc_error_category[edge_idx] += pel_err;
1065 pi4_category_count[edge_idx]++;
1066 }
1067 }
1068 }
1069 pu1_luma_recon_buf = pu1_luma_recon_buf_copy + i4_luma_recon_strd;
1070 pu1_luma_src_buf = pu1_luma_src_buf_copy + i4_luma_src_strd;
1071 rem_wd = wd;
1072 }
1073 break;
1074 case SAO_EDGE_45_DEG:
1075 for(row = row_start; row < row_end; row++)
1076 {
1077 pu1_luma_src_buf_copy = pu1_luma_src_buf;
1078 pu1_luma_recon_buf_copy = pu1_luma_recon_buf;
1079 for(col = wd; col > 15; col -= 16)
1080 {
1081 /*load data*/
1082 src_buf_8x16 = vld1q_u8(pu1_luma_src_buf);
1083 recon_buf_8x16 = vld1q_u8(pu1_luma_recon_buf);
1084 recon_buf0_8x16 = vld1q_u8(pu1_luma_recon_buf + 1 - i4_luma_recon_strd);
1085 recon_buf1_8x16 = vld1q_u8(pu1_luma_recon_buf - 1 + i4_luma_recon_strd);
1086 /*pel_error*/
1087 pel_error = vreinterpretq_s16_u16(
1088 vsubl_u8(vget_low_u8(src_buf_8x16), vget_low_u8(recon_buf_8x16)));
1089 pel_error1 = vreinterpretq_s16_u16(
1090 vsubl_u8(vget_high_u8(src_buf_8x16), vget_high_u8(recon_buf_8x16)));
1091 /*sign*/
1092 sign_reg0 = vreinterpretq_s16_u16(
1093 vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf0_8x16)));
1094 sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
1095 sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
1096 sign_reg0 = vsubq_s16(sign_reg0, sign_reg);
1097
1098 sign_reg1 = vreinterpretq_s16_u16(
1099 vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf1_8x16)));
1100 sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
1101 sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
1102 sign_reg1 = vsubq_s16(sign_reg1, sign_reg);
1103
1104 sign_reg2 = vreinterpretq_s16_u16(
1105 vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf0_8x16)));
1106 sign_reg = (int16x8_t)vcgtq_s16(sign_reg2, const_0);
1107 sign_reg2 = (int16x8_t)vcltq_s16(sign_reg2, const_0);
1108 sign_reg2 = vsubq_s16(sign_reg2, sign_reg);
1109
1110 sign_reg3 = vreinterpretq_s16_u16(
1111 vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf1_8x16)));
1112 sign_reg = (int16x8_t)vcgtq_s16(sign_reg3, const_0);
1113 sign_reg3 = (int16x8_t)vcltq_s16(sign_reg3, const_0);
1114 sign_reg3 = vsubq_s16(sign_reg3, sign_reg);
1115
1116 edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
1117 edgeidx1 = vaddq_s16(vaddq_s16(sign_reg2, const_2), sign_reg3);
1118
1119 edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
1120 edgeidx = vandq_s16(edgeidx_reg0, edgeidx);
1121
1122 edgeidx_reg5 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error1));
1123 edgeidx1 = vandq_s16(edgeidx_reg5, edgeidx1);
1124
1125 temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
1126 temp_reg4 = (int16x8_t)vceqq_s16(const_0, edgeidx1);
1127 temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
1128 temp_reg5 = (int16x8_t)vceqq_s16(const_1, edgeidx1);
1129
1130 temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
1131 temp_reg6 = (int16x8_t)vceqq_s16(const_3, edgeidx1);
1132 temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
1133 temp_reg7 = (int16x8_t)vceqq_s16(const_4, edgeidx1);
1134
1135 edgeidx_reg1 = vabsq_s16(temp_reg1);
1136 edgeidx_reg5 = vabsq_s16(temp_reg5);
1137
1138 edgeidx_reg2 = vabsq_s16(temp_reg2);
1139 edgeidx_reg6 = vabsq_s16(temp_reg6);
1140 edgeidx_reg3 = vabsq_s16(temp_reg3);
1141 edgeidx_reg7 = vabsq_s16(temp_reg7);
1142
1143 temp_reg0 = vandq_s16(temp_reg0, pel_error);
1144 temp_reg4 = vandq_s16(temp_reg4, pel_error1);
1145 temp_reg1 = vandq_s16(temp_reg1, pel_error);
1146 temp_reg5 = vandq_s16(temp_reg5, pel_error1);
1147
1148 temp_reg2 = vandq_s16(temp_reg2, pel_error);
1149 temp_reg6 = vandq_s16(temp_reg6, pel_error1);
1150 temp_reg3 = vandq_s16(temp_reg3, pel_error);
1151 temp_reg7 = vandq_s16(temp_reg7, pel_error1);
1152
1153 edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
1154 edgeidx_reg4 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg4));
1155
1156 temp_reg0 = vaddq_s16(temp_reg0, temp_reg4);
1157 temp_reg1 = vaddq_s16(temp_reg1, temp_reg5);
1158 temp_reg2 = vaddq_s16(temp_reg2, temp_reg6);
1159 temp_reg3 = vaddq_s16(temp_reg3, temp_reg7);
1160
1161 edgeidx_reg0 = vaddq_s16(edgeidx_reg0, edgeidx_reg4);
1162 edgeidx_reg1 = vaddq_s16(edgeidx_reg1, edgeidx_reg5);
1163 edgeidx_reg2 = vaddq_s16(edgeidx_reg2, edgeidx_reg6);
1164 edgeidx_reg3 = vaddq_s16(edgeidx_reg3, edgeidx_reg7);
1165 /*store*/
1166 pi4_acc_error_category[0] += sad_cal(temp_reg0);
1167 pi4_acc_error_category[1] += sad_cal(temp_reg1);
1168 pi4_acc_error_category[3] += sad_cal(temp_reg2);
1169 pi4_acc_error_category[4] += sad_cal(temp_reg3);
1170
1171 pi4_category_count[0] += sad_cal(edgeidx_reg0);
1172 pi4_category_count[1] += sad_cal(edgeidx_reg1);
1173 pi4_category_count[3] += sad_cal(edgeidx_reg2);
1174 pi4_category_count[4] += sad_cal(edgeidx_reg3);
1175 pu1_luma_recon_buf += 16;
1176 pu1_luma_src_buf += 16;
1177 }
1178 rem_wd &= 0x0F;
1179
1180 if(rem_wd > 7)
1181 {
1182 /*load*/
1183 src_buf = vld1_u8(pu1_luma_src_buf);
1184 recon_buf = vld1_u8(pu1_luma_recon_buf);
1185 recon_buf0 = vld1_u8(pu1_luma_recon_buf + 1 - i4_luma_recon_strd);
1186 recon_buf1 = vld1_u8(pu1_luma_recon_buf - 1 + i4_luma_recon_strd);
1187
1188 pel_error = vreinterpretq_s16_u16(vsubl_u8(src_buf, recon_buf));
1189
1190 sign_reg0 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf0));
1191 sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
1192 sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
1193 sign_reg0 = vsubq_s16(sign_reg0, sign_reg);
1194
1195 sign_reg1 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf1));
1196 sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
1197 sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
1198 sign_reg1 = vsubq_s16(sign_reg1, sign_reg);
1199
1200 edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
1201
1202 edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
1203 edgeidx = vandq_s16(edgeidx_reg0, edgeidx);
1204
1205 temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
1206 temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
1207 temp_reg3 = (int16x8_t)vceqq_s16(const_3, edgeidx);
1208 temp_reg4 = (int16x8_t)vceqq_s16(const_4, edgeidx);
1209
1210 edgeidx_reg1 = vabsq_s16(temp_reg1);
1211 edgeidx_reg3 = vabsq_s16(temp_reg3);
1212 edgeidx_reg4 = vabsq_s16(temp_reg4);
1213
1214 temp_reg0 = vandq_s16(temp_reg0, pel_error);
1215 temp_reg1 = vandq_s16(temp_reg1, pel_error);
1216 temp_reg3 = vandq_s16(temp_reg3, pel_error);
1217 temp_reg4 = vandq_s16(temp_reg4, pel_error);
1218
1219 edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
1220 /*store*/
1221 pi4_acc_error_category[0] += sad_cal(temp_reg0);
1222 pi4_acc_error_category[1] += sad_cal(temp_reg1);
1223 pi4_acc_error_category[3] += sad_cal(temp_reg3);
1224 pi4_acc_error_category[4] += sad_cal(temp_reg4);
1225
1226 pi4_category_count[0] += sad_cal(edgeidx_reg0);
1227 pi4_category_count[1] += sad_cal(edgeidx_reg1);
1228 pi4_category_count[3] += sad_cal(edgeidx_reg3);
1229 pi4_category_count[4] += sad_cal(edgeidx_reg4);
1230 pu1_luma_recon_buf += 8;
1231 pu1_luma_src_buf += 8;
1232 }
1233 rem_wd &= 0x7;
1234 if(rem_wd)
1235 {
1236 for(col = 0; col < rem_wd; col++)
1237 {
1238 c = pu1_luma_recon_buf[col];
1239 a = pu1_luma_recon_buf[col + 1 - i4_luma_recon_strd];
1240 b = pu1_luma_recon_buf[col - 1 + i4_luma_recon_strd];
1241 pel_err = pu1_luma_src_buf[col] - pu1_luma_recon_buf[col];
1242 edge_idx = 2 + SIGN(c - a) + SIGN(c - b);
1243 if(pel_err != 0)
1244 {
1245 pi4_acc_error_category[edge_idx] += pel_err;
1246 pi4_category_count[edge_idx]++;
1247 }
1248 }
1249 }
1250 pu1_luma_recon_buf = pu1_luma_recon_buf_copy + i4_luma_recon_strd;
1251 pu1_luma_src_buf = pu1_luma_src_buf_copy + i4_luma_src_strd;
1252 rem_wd = wd;
1253 }
1254 break;
1255 default:
1256 break;
1257 }
1258 }
1259
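/* Chroma counterpart of ihevce_get_luma_eo_sao_params_neon(): it operates on
 * the interleaved Cb/Cr plane, so the horizontal neighbours are +/- 2 bytes
 * away and picture-edge columns are excluded two samples at a time; the block
 * height is halved.
 */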
void ihevce_get_chroma_eo_sao_params_neon(
    void *pv_sao_ctxt,
    WORD32 eo_sao_class,
    WORD32 *pi4_acc_error_category,
    WORD32 *pi4_category_count)
1265 {
1266 /*temp var*/
1267 UWORD8 *pu1_chroma_recon_buf, *pu1_chroma_src_buf;
1268 UWORD8 *pu1_chroma_src_buf_copy, *pu1_chroma_recon_buf_copy;
1269 WORD32 row_end, col_end, row, col;
1270 WORD32 row_start = 0, col_start = 0;
1271 WORD32 wd, rem_wd;
1272 WORD32 a, b, c, edge_idx, pel_err;
1273
1274 int16x8_t temp_reg0, temp_reg1, temp_reg2, temp_reg3, temp_reg4;
1275 int16x8_t edgeidx_reg0, edgeidx_reg1, edgeidx_reg2, edgeidx_reg3, edgeidx_reg4;
1276 int16x8_t edgeidx_reg5, edgeidx_reg6, edgeidx_reg7;
1277 int16x8_t pel_error, pel_error1;
1278 int16x8_t sign_reg0, sign_reg1, sign_reg, sign_reg2, sign_reg3;
1279 int16x8_t edgeidx, edgeidx1;
1280 int16x8_t temp_reg5, temp_reg6, temp_reg7;
1281 uint8x16_t src_buf_8x16, recon_buf_8x16, recon_buf0_8x16, recon_buf1_8x16;
1282 uint8x8_t src_buf, recon_buf, recon_buf0, recon_buf1;
1283
1284 sao_ctxt_t *ps_sao_ctxt = (sao_ctxt_t *)pv_sao_ctxt;
1285 const WORD32 i4_chroma_recon_strd = ps_sao_ctxt->i4_cur_chroma_recon_stride;
1286 const WORD32 i4_chroma_src_strd = ps_sao_ctxt->i4_cur_chroma_src_stride;
1287
1288 const int16x8_t const_2 = vdupq_n_s16(2);
1289 const int16x8_t const_0 = vdupq_n_s16(0);
1290 const int16x8_t const_1 = vdupq_n_s16(1);
1291 const int16x8_t const_3 = vdupq_n_s16(3);
1292 const int16x8_t const_4 = vdupq_n_s16(4);
1293
1294 row_end = ps_sao_ctxt->i4_sao_blk_ht >> 1;
1295 col_end = ps_sao_ctxt->i4_sao_blk_wd;
1296
1297 if((ps_sao_ctxt->i4_ctb_x == 0) && (eo_sao_class != SAO_EDGE_90_DEG))
1298 {
1299 col_start = 2;
1300 }
1301
1302 if(((ps_sao_ctxt->i4_ctb_x + 1) == ps_sao_ctxt->ps_sps->i2_pic_wd_in_ctb) &&
1303 (eo_sao_class != SAO_EDGE_90_DEG))
1304 {
1305 col_end = col_end - 2;
1306 }
1307
1308 if((ps_sao_ctxt->i4_ctb_y == 0) && (eo_sao_class != SAO_EDGE_0_DEG))
1309 {
1310 row_start = 1;
1311 }
1312
1313 if(((ps_sao_ctxt->i4_ctb_y + 1) == ps_sao_ctxt->ps_sps->i2_pic_ht_in_ctb) &&
1314 (eo_sao_class != SAO_EDGE_0_DEG))
1315 {
1316 row_end = row_end - 1;
1317 }
1318 wd = col_end - col_start;
1319 rem_wd = wd;
1320 pu1_chroma_recon_buf =
1321 ps_sao_ctxt->pu1_cur_chroma_recon_buf + col_start + (row_start * i4_chroma_recon_strd);
1322 pu1_chroma_src_buf =
1323 ps_sao_ctxt->pu1_cur_chroma_src_buf + col_start + (row_start * i4_chroma_src_strd);
1324
1325 switch(eo_sao_class)
1326 {
1327 case SAO_EDGE_0_DEG:
1328 for(row = row_start; row < row_end; row++)
1329 {
1330 pu1_chroma_src_buf_copy = pu1_chroma_src_buf;
1331 pu1_chroma_recon_buf_copy = pu1_chroma_recon_buf;
1332 for(col = wd; col > 15; col -= 16)
1333 {
1334 /*load src and recon data*/
1335 src_buf_8x16 = vld1q_u8(pu1_chroma_src_buf);
1336 recon_buf_8x16 = vld1q_u8(pu1_chroma_recon_buf);
1337 recon_buf0_8x16 = vld1q_u8(pu1_chroma_recon_buf - 2);
1338 recon_buf1_8x16 = vld1q_u8(pu1_chroma_recon_buf + 2);
1339
1340 /*pel_error*/
1341 pel_error = vreinterpretq_s16_u16(
1342 vsubl_u8(vget_low_u8(src_buf_8x16), vget_low_u8(recon_buf_8x16)));
1343 pel_error1 = vreinterpretq_s16_u16(
1344 vsubl_u8(vget_high_u8(src_buf_8x16), vget_high_u8(recon_buf_8x16)));
1345
1346 /*sign*/
1347 sign_reg0 = vreinterpretq_s16_u16(
1348 vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf0_8x16)));
1349 sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
1350 sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
1351 sign_reg0 = vsubq_s16(sign_reg0, sign_reg);
1352
1353 sign_reg1 = vreinterpretq_s16_u16(
1354 vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf1_8x16)));
1355 sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
1356 sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
1357 sign_reg1 = vsubq_s16(sign_reg1, sign_reg);
1358
1359 sign_reg2 = vreinterpretq_s16_u16(
1360 vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf0_8x16)));
1361 sign_reg = (int16x8_t)vcgtq_s16(sign_reg2, const_0);
1362 sign_reg2 = (int16x8_t)vcltq_s16(sign_reg2, const_0);
1363 sign_reg2 = vsubq_s16(sign_reg2, sign_reg);
1364
1365 sign_reg3 = vreinterpretq_s16_u16(
1366 vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf1_8x16)));
1367 sign_reg = (int16x8_t)vcgtq_s16(sign_reg3, const_0);
1368 sign_reg3 = (int16x8_t)vcltq_s16(sign_reg3, const_0);
1369 sign_reg3 = vsubq_s16(sign_reg3, sign_reg);
/*edgeidx*/
1371 edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
1372 edgeidx1 = vaddq_s16(vaddq_s16(sign_reg2, const_2), sign_reg3);
1373
1374 edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
1375 edgeidx = vandq_s16(edgeidx_reg0, edgeidx);
1376
1377 edgeidx_reg5 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error1));
1378 edgeidx1 = vandq_s16(edgeidx_reg5, edgeidx1);
1379
1380 temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
1381 temp_reg4 = (int16x8_t)vceqq_s16(const_0, edgeidx1);
1382 temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
1383 temp_reg5 = (int16x8_t)vceqq_s16(const_1, edgeidx1);
1384
1385 temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
1386 temp_reg6 = (int16x8_t)vceqq_s16(const_3, edgeidx1);
1387 temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
1388 temp_reg7 = (int16x8_t)vceqq_s16(const_4, edgeidx1);
1389
1390 edgeidx_reg1 = vabsq_s16(temp_reg1);
1391 edgeidx_reg5 = vabsq_s16(temp_reg5);
1392
1393 edgeidx_reg2 = vabsq_s16(temp_reg2);
1394 edgeidx_reg6 = vabsq_s16(temp_reg6);
1395 edgeidx_reg3 = vabsq_s16(temp_reg3);
1396 edgeidx_reg7 = vabsq_s16(temp_reg7);
1397
1398 temp_reg0 = vandq_s16(temp_reg0, pel_error);
1399 temp_reg4 = vandq_s16(temp_reg4, pel_error1);
1400 temp_reg1 = vandq_s16(temp_reg1, pel_error);
1401 temp_reg5 = vandq_s16(temp_reg5, pel_error1);
1402
1403 temp_reg2 = vandq_s16(temp_reg2, pel_error);
1404 temp_reg6 = vandq_s16(temp_reg6, pel_error1);
1405 temp_reg3 = vandq_s16(temp_reg3, pel_error);
1406 temp_reg7 = vandq_s16(temp_reg7, pel_error1);
1407
1408 edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
1409 edgeidx_reg4 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg4));
1410
1411 temp_reg0 = vaddq_s16(temp_reg0, temp_reg4);
1412 temp_reg1 = vaddq_s16(temp_reg1, temp_reg5);
1413 temp_reg2 = vaddq_s16(temp_reg2, temp_reg6);
1414 temp_reg3 = vaddq_s16(temp_reg3, temp_reg7);
1415
1416 edgeidx_reg0 = vaddq_s16(edgeidx_reg0, edgeidx_reg4);
1417 edgeidx_reg1 = vaddq_s16(edgeidx_reg1, edgeidx_reg5);
1418 edgeidx_reg2 = vaddq_s16(edgeidx_reg2, edgeidx_reg6);
1419 edgeidx_reg3 = vaddq_s16(edgeidx_reg3, edgeidx_reg7);
1420
/*store pel error*/
1422 pi4_acc_error_category[0] += sad_cal(temp_reg0);
1423 pi4_acc_error_category[1] += sad_cal(temp_reg1);
1424 pi4_acc_error_category[3] += sad_cal(temp_reg2);
1425 pi4_acc_error_category[4] += sad_cal(temp_reg3);
1426
1427 /*store edgeidx account*/
1428 pi4_category_count[0] += sad_cal(edgeidx_reg0);
1429 pi4_category_count[1] += sad_cal(edgeidx_reg1);
1430 pi4_category_count[3] += sad_cal(edgeidx_reg2);
1431 pi4_category_count[4] += sad_cal(edgeidx_reg3);
1432 pu1_chroma_recon_buf += 16;
1433 pu1_chroma_src_buf += 16;
1434 }
1435 rem_wd &= 0x0F;
1436
1437 if(rem_wd > 7)
1438 {
1439 /*load data*/
1440 src_buf = vld1_u8(pu1_chroma_src_buf);
1441 recon_buf = vld1_u8(pu1_chroma_recon_buf);
1442 recon_buf0 = vld1_u8(pu1_chroma_recon_buf - 2);
1443 recon_buf1 = vld1_u8(pu1_chroma_recon_buf + 2);
1444 /*pel_error*/
1445 pel_error = vreinterpretq_s16_u16(vsubl_u8(src_buf, recon_buf));
1446 /*sign*/
1447 sign_reg0 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf0));
1448 sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
1449 sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
1450 sign_reg0 = vsubq_s16(sign_reg0, sign_reg);
1451
1452 sign_reg1 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf1));
1453 sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
1454 sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
1455 sign_reg1 = vsubq_s16(sign_reg1, sign_reg);
1456
1457 edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
1458
1459 edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
1460 edgeidx = vandq_s16(edgeidx_reg0, edgeidx);
1461
1462 temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
1463 temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
1464 temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
1465 temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
1466
1467 edgeidx_reg1 = vabsq_s16(temp_reg1);
1468 edgeidx_reg2 = vabsq_s16(temp_reg2);
1469 edgeidx_reg3 = vabsq_s16(temp_reg3);
1470
1471 temp_reg0 = vandq_s16(temp_reg0, pel_error);
1472 temp_reg1 = vandq_s16(temp_reg1, pel_error);
1473 temp_reg2 = vandq_s16(temp_reg2, pel_error);
1474 temp_reg3 = vandq_s16(temp_reg3, pel_error);
1475
1476 edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
1477 /*store */
1478 pi4_acc_error_category[0] += sad_cal(temp_reg0);
1479 pi4_acc_error_category[1] += sad_cal(temp_reg1);
1480 pi4_acc_error_category[3] += sad_cal(temp_reg2);
1481 pi4_acc_error_category[4] += sad_cal(temp_reg3);
1482
1483 pi4_category_count[0] += sad_cal(edgeidx_reg0);
1484 pi4_category_count[1] += sad_cal(edgeidx_reg1);
1485 pi4_category_count[3] += sad_cal(edgeidx_reg2);
1486 pi4_category_count[4] += sad_cal(edgeidx_reg3);
1487 pu1_chroma_recon_buf += 8;
1488 pu1_chroma_src_buf += 8;
1489 }
1490 rem_wd &= 0x7;
1491 if(rem_wd)
1492 {
1493 for(col = 0; col < rem_wd; col++)
1494 {
1495 c = pu1_chroma_recon_buf[col];
1496 a = pu1_chroma_recon_buf[col - 2];
1497 b = pu1_chroma_recon_buf[col + 2];
1498 pel_err = pu1_chroma_src_buf[col] - pu1_chroma_recon_buf[col];
1499 edge_idx = 2 + SIGN(c - a) + SIGN(c - b);
1500
1501 if(pel_err != 0)
1502 {
1503 pi4_acc_error_category[edge_idx] += pel_err;
1504 pi4_category_count[edge_idx]++;
1505 }
1506 }
1507 }
1508 pu1_chroma_recon_buf = pu1_chroma_recon_buf_copy + i4_chroma_recon_strd;
1509 pu1_chroma_src_buf = pu1_chroma_src_buf_copy + i4_chroma_src_strd;
1510 rem_wd = wd;
1511 }
1512 break;
    case SAO_EDGE_90_DEG:
        for(row = row_start; row < row_end; row++)
        {
            pu1_chroma_src_buf_copy = pu1_chroma_src_buf;
            pu1_chroma_recon_buf_copy = pu1_chroma_recon_buf;
            for(col = wd; col > 15; col -= 16)
            {
                /*load src and recon data*/
                src_buf_8x16 = vld1q_u8(pu1_chroma_src_buf);
                recon_buf_8x16 = vld1q_u8(pu1_chroma_recon_buf);
                recon_buf0_8x16 = vld1q_u8(pu1_chroma_recon_buf - i4_chroma_recon_strd);
                recon_buf1_8x16 = vld1q_u8(pu1_chroma_recon_buf + i4_chroma_recon_strd);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(src_buf_8x16), vget_low_u8(recon_buf_8x16)));
                pel_error1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(src_buf_8x16), vget_high_u8(recon_buf_8x16)));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                sign_reg2 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg2, const_0);
                sign_reg2 = (int16x8_t)vcltq_s16(sign_reg2, const_0);
                sign_reg2 = vsubq_s16(sign_reg2, sign_reg);

                sign_reg3 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg3, const_0);
                sign_reg3 = (int16x8_t)vcltq_s16(sign_reg3, const_0);
                sign_reg3 = vsubq_s16(sign_reg3, sign_reg);
                /*edgeidx*/
                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx1 = vaddq_s16(vaddq_s16(sign_reg2, const_2), sign_reg3);

                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                edgeidx_reg5 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error1));
                edgeidx1 = vandq_s16(edgeidx_reg5, edgeidx1);
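                /* Lanes with zero pixel error are forced to edge index 0 here;
                 * the AND with pel_error below zeroes their contribution to the
                 * category-0 error sum, and the (const_0 == temp_reg0) correction
                 * keeps them out of the category-0 count, so only pixels with a
                 * non-zero error are accumulated. */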

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_0, edgeidx1);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg5 = (int16x8_t)vceqq_s16(const_1, edgeidx1);

                temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg6 = (int16x8_t)vceqq_s16(const_3, edgeidx1);
                temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
                temp_reg7 = (int16x8_t)vceqq_s16(const_4, edgeidx1);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg5 = vabsq_s16(temp_reg5);

                edgeidx_reg2 = vabsq_s16(temp_reg2);
                edgeidx_reg6 = vabsq_s16(temp_reg6);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg7 = vabsq_s16(temp_reg7);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error1);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg5 = vandq_s16(temp_reg5, pel_error1);

                temp_reg2 = vandq_s16(temp_reg2, pel_error);
                temp_reg6 = vandq_s16(temp_reg6, pel_error1);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg7 = vandq_s16(temp_reg7, pel_error1);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                edgeidx_reg4 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg4));

                temp_reg0 = vaddq_s16(temp_reg0, temp_reg4);
                temp_reg1 = vaddq_s16(temp_reg1, temp_reg5);
                temp_reg2 = vaddq_s16(temp_reg2, temp_reg6);
                temp_reg3 = vaddq_s16(temp_reg3, temp_reg7);

                edgeidx_reg0 = vaddq_s16(edgeidx_reg0, edgeidx_reg4);
                edgeidx_reg1 = vaddq_s16(edgeidx_reg1, edgeidx_reg5);
                edgeidx_reg2 = vaddq_s16(edgeidx_reg2, edgeidx_reg6);
                edgeidx_reg3 = vaddq_s16(edgeidx_reg3, edgeidx_reg7);
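                /* The low-half and high-half results are merged so a single
                 * horizontal reduction (sad_cal) per category covers all
                 * 16 pixels of this iteration. */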
                /* store */
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg2);
                pi4_acc_error_category[4] += sad_cal(temp_reg3);
                /* store category counts */
                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg2);
                pi4_category_count[4] += sad_cal(edgeidx_reg3);
                pu1_chroma_recon_buf += 16;
                pu1_chroma_src_buf += 16;
            }
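            /* Handle the width remainder: up to 8 leftover pixels with 64-bit
             * NEON loads below, and any final pixels (< 8) with the scalar loop. */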
            rem_wd &= 0x0F;

            if(rem_wd > 7)
            {
                /*load*/
                src_buf = vld1_u8(pu1_chroma_src_buf);
                recon_buf = vld1_u8(pu1_chroma_recon_buf);
                recon_buf0 = vld1_u8(pu1_chroma_recon_buf - i4_chroma_recon_strd);
                recon_buf1 = vld1_u8(pu1_chroma_recon_buf + i4_chroma_recon_strd);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(vsubl_u8(src_buf, recon_buf));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf0));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf1));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg2 = vabsq_s16(temp_reg2);
                edgeidx_reg3 = vabsq_s16(temp_reg3);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg2 = vandq_s16(temp_reg2, pel_error);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                /*store*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg2);
                pi4_acc_error_category[4] += sad_cal(temp_reg3);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg2);
                pi4_category_count[4] += sad_cal(edgeidx_reg3);
                pu1_chroma_recon_buf += 8;
                pu1_chroma_src_buf += 8;
            }
            rem_wd &= 0x7;
            if(rem_wd)
            {
                for(col = 0; col < rem_wd; col++)
                {
                    c = pu1_chroma_recon_buf[col];
                    a = pu1_chroma_recon_buf[col - i4_chroma_recon_strd];
                    b = pu1_chroma_recon_buf[col + i4_chroma_recon_strd];
                    pel_err = pu1_chroma_src_buf[col] - pu1_chroma_recon_buf[col];
                    edge_idx = 2 + SIGN(c - a) + SIGN(c - b);

                    if(pel_err != 0)
                    {
                        pi4_acc_error_category[edge_idx] += pel_err;
                        pi4_category_count[edge_idx]++;
                    }
                }
            }
            pu1_chroma_recon_buf = pu1_chroma_recon_buf_copy + i4_chroma_recon_strd;
            pu1_chroma_src_buf = pu1_chroma_src_buf_copy + i4_chroma_src_strd;
            rem_wd = wd;
        }
        break;
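    /* 135 degree edge offset class: the neighbours a and b are the top-left and
     * bottom-right samples of the same chroma plane, i.e. at offsets
     * -(i4_chroma_recon_strd + 2) and +(i4_chroma_recon_strd + 2), since the
     * chroma is interleaved (adjacent samples of one plane are 2 bytes apart). */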
    case SAO_EDGE_135_DEG:
        for(row = row_start; row < row_end; row++)
        {
            pu1_chroma_src_buf_copy = pu1_chroma_src_buf;
            pu1_chroma_recon_buf_copy = pu1_chroma_recon_buf;
            for(col = wd; col > 15; col -= 16)
            {
                /*load src and recon data*/
                src_buf_8x16 = vld1q_u8(pu1_chroma_src_buf);
                recon_buf_8x16 = vld1q_u8(pu1_chroma_recon_buf);
                recon_buf0_8x16 = vld1q_u8(pu1_chroma_recon_buf - 2 - i4_chroma_recon_strd);
                recon_buf1_8x16 = vld1q_u8(pu1_chroma_recon_buf + 2 + i4_chroma_recon_strd);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(src_buf_8x16), vget_low_u8(recon_buf_8x16)));
                pel_error1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(src_buf_8x16), vget_high_u8(recon_buf_8x16)));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                sign_reg2 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg2, const_0);
                sign_reg2 = (int16x8_t)vcltq_s16(sign_reg2, const_0);
                sign_reg2 = vsubq_s16(sign_reg2, sign_reg);

                sign_reg3 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg3, const_0);
                sign_reg3 = (int16x8_t)vcltq_s16(sign_reg3, const_0);
                sign_reg3 = vsubq_s16(sign_reg3, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx1 = vaddq_s16(vaddq_s16(sign_reg2, const_2), sign_reg3);

                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                edgeidx_reg5 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error1));
                edgeidx1 = vandq_s16(edgeidx_reg5, edgeidx1);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_0, edgeidx1);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg5 = (int16x8_t)vceqq_s16(const_1, edgeidx1);

                temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg6 = (int16x8_t)vceqq_s16(const_3, edgeidx1);
                temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
                temp_reg7 = (int16x8_t)vceqq_s16(const_4, edgeidx1);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg5 = vabsq_s16(temp_reg5);

                edgeidx_reg2 = vabsq_s16(temp_reg2);
                edgeidx_reg6 = vabsq_s16(temp_reg6);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg7 = vabsq_s16(temp_reg7);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error1);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg5 = vandq_s16(temp_reg5, pel_error1);

                temp_reg2 = vandq_s16(temp_reg2, pel_error);
                temp_reg6 = vandq_s16(temp_reg6, pel_error1);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg7 = vandq_s16(temp_reg7, pel_error1);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                edgeidx_reg4 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg4));

                temp_reg0 = vaddq_s16(temp_reg0, temp_reg4);
                temp_reg1 = vaddq_s16(temp_reg1, temp_reg5);
                temp_reg2 = vaddq_s16(temp_reg2, temp_reg6);
                temp_reg3 = vaddq_s16(temp_reg3, temp_reg7);

                edgeidx_reg0 = vaddq_s16(edgeidx_reg0, edgeidx_reg4);
                edgeidx_reg1 = vaddq_s16(edgeidx_reg1, edgeidx_reg5);
                edgeidx_reg2 = vaddq_s16(edgeidx_reg2, edgeidx_reg6);
                edgeidx_reg3 = vaddq_s16(edgeidx_reg3, edgeidx_reg7);
                /*store*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg2);
                pi4_acc_error_category[4] += sad_cal(temp_reg3);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg2);
                pi4_category_count[4] += sad_cal(edgeidx_reg3);
                pu1_chroma_recon_buf += 16;
                pu1_chroma_src_buf += 16;
            }
            rem_wd &= 0x0F;

            if(rem_wd > 7)
            {
                /*load data*/
                src_buf = vld1_u8(pu1_chroma_src_buf);
                recon_buf = vld1_u8(pu1_chroma_recon_buf);
                recon_buf0 = vld1_u8(pu1_chroma_recon_buf - 2 - i4_chroma_recon_strd);
                recon_buf1 = vld1_u8(pu1_chroma_recon_buf + 2 + i4_chroma_recon_strd);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(vsubl_u8(src_buf, recon_buf));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf0));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf1));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg3 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_4, edgeidx);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg4 = vabsq_s16(temp_reg4);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                /*store*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg3);
                pi4_acc_error_category[4] += sad_cal(temp_reg4);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg3);
                pi4_category_count[4] += sad_cal(edgeidx_reg4);
                pu1_chroma_recon_buf += 8;
                pu1_chroma_src_buf += 8;
            }
            rem_wd &= 0x7;
            if(rem_wd)
            {
                for(col = 0; col < rem_wd; col++)
                {
                    c = pu1_chroma_recon_buf[col];
                    a = pu1_chroma_recon_buf[col - 2 - i4_chroma_recon_strd];
                    b = pu1_chroma_recon_buf[col + 2 + i4_chroma_recon_strd];
                    pel_err = pu1_chroma_src_buf[col] - pu1_chroma_recon_buf[col];
                    edge_idx = 2 + SIGN(c - a) + SIGN(c - b);

                    if(pel_err != 0)
                    {
                        pi4_acc_error_category[edge_idx] += pel_err;
                        pi4_category_count[edge_idx]++;
                    }
                }
            }
            pu1_chroma_recon_buf = pu1_chroma_recon_buf_copy + i4_chroma_recon_strd;
            pu1_chroma_src_buf = pu1_chroma_src_buf_copy + i4_chroma_src_strd;
            rem_wd = wd;
        }
        break;
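    /* 45 degree edge offset class: the neighbours a and b are the top-right and
     * bottom-left samples of the same chroma plane, at offsets
     * +(2 - i4_chroma_recon_strd) and -(2 - i4_chroma_recon_strd) respectively. */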
    case SAO_EDGE_45_DEG:
        for(row = row_start; row < row_end; row++)
        {
            pu1_chroma_src_buf_copy = pu1_chroma_src_buf;
            pu1_chroma_recon_buf_copy = pu1_chroma_recon_buf;
            for(col = wd; col > 15; col -= 16)
            {
                /*load data*/
                src_buf_8x16 = vld1q_u8(pu1_chroma_src_buf);
                recon_buf_8x16 = vld1q_u8(pu1_chroma_recon_buf);
                recon_buf0_8x16 = vld1q_u8(pu1_chroma_recon_buf + 2 - i4_chroma_recon_strd);
                recon_buf1_8x16 = vld1q_u8(pu1_chroma_recon_buf - 2 + i4_chroma_recon_strd);
                /*pel_error*/
                pel_error = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(src_buf_8x16), vget_low_u8(recon_buf_8x16)));
                pel_error1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(src_buf_8x16), vget_high_u8(recon_buf_8x16)));
                /*sign*/
                sign_reg0 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_low_u8(recon_buf_8x16), vget_low_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                sign_reg2 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf0_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg2, const_0);
                sign_reg2 = (int16x8_t)vcltq_s16(sign_reg2, const_0);
                sign_reg2 = vsubq_s16(sign_reg2, sign_reg);

                sign_reg3 = vreinterpretq_s16_u16(
                    vsubl_u8(vget_high_u8(recon_buf_8x16), vget_high_u8(recon_buf1_8x16)));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg3, const_0);
                sign_reg3 = (int16x8_t)vcltq_s16(sign_reg3, const_0);
                sign_reg3 = vsubq_s16(sign_reg3, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);
                edgeidx1 = vaddq_s16(vaddq_s16(sign_reg2, const_2), sign_reg3);

                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                edgeidx_reg5 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error1));
                edgeidx1 = vandq_s16(edgeidx_reg5, edgeidx1);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_0, edgeidx1);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg5 = (int16x8_t)vceqq_s16(const_1, edgeidx1);

                temp_reg2 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg6 = (int16x8_t)vceqq_s16(const_3, edgeidx1);
                temp_reg3 = (int16x8_t)vceqq_s16(const_4, edgeidx);
                temp_reg7 = (int16x8_t)vceqq_s16(const_4, edgeidx1);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg5 = vabsq_s16(temp_reg5);

                edgeidx_reg2 = vabsq_s16(temp_reg2);
                edgeidx_reg6 = vabsq_s16(temp_reg6);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg7 = vabsq_s16(temp_reg7);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error1);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg5 = vandq_s16(temp_reg5, pel_error1);

                temp_reg2 = vandq_s16(temp_reg2, pel_error);
                temp_reg6 = vandq_s16(temp_reg6, pel_error1);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg7 = vandq_s16(temp_reg7, pel_error1);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                edgeidx_reg4 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg4));

                temp_reg0 = vaddq_s16(temp_reg0, temp_reg4);
                temp_reg1 = vaddq_s16(temp_reg1, temp_reg5);
                temp_reg2 = vaddq_s16(temp_reg2, temp_reg6);
                temp_reg3 = vaddq_s16(temp_reg3, temp_reg7);

                edgeidx_reg0 = vaddq_s16(edgeidx_reg0, edgeidx_reg4);
                edgeidx_reg1 = vaddq_s16(edgeidx_reg1, edgeidx_reg5);
                edgeidx_reg2 = vaddq_s16(edgeidx_reg2, edgeidx_reg6);
                edgeidx_reg3 = vaddq_s16(edgeidx_reg3, edgeidx_reg7);
                /*store*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg2);
                pi4_acc_error_category[4] += sad_cal(temp_reg3);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg2);
                pi4_category_count[4] += sad_cal(edgeidx_reg3);
                pu1_chroma_recon_buf += 16;
                pu1_chroma_src_buf += 16;
            }
            rem_wd &= 0x0F;

            if(rem_wd > 7)
            {
                /*load*/
                src_buf = vld1_u8(pu1_chroma_src_buf);
                recon_buf = vld1_u8(pu1_chroma_recon_buf);
                recon_buf0 = vld1_u8(pu1_chroma_recon_buf + 2 - i4_chroma_recon_strd);
                recon_buf1 = vld1_u8(pu1_chroma_recon_buf - 2 + i4_chroma_recon_strd);

                pel_error = vreinterpretq_s16_u16(vsubl_u8(src_buf, recon_buf));

                sign_reg0 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf0));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg0, const_0);
                sign_reg0 = (int16x8_t)vcltq_s16(sign_reg0, const_0);
                sign_reg0 = vsubq_s16(sign_reg0, sign_reg);

                sign_reg1 = vreinterpretq_s16_u16(vsubl_u8(recon_buf, recon_buf1));
                sign_reg = (int16x8_t)vcgtq_s16(sign_reg1, const_0);
                sign_reg1 = (int16x8_t)vcltq_s16(sign_reg1, const_0);
                sign_reg1 = vsubq_s16(sign_reg1, sign_reg);

                edgeidx = vaddq_s16(vaddq_s16(sign_reg0, const_2), sign_reg1);

                edgeidx_reg0 = vmvnq_s16((int16x8_t)vceqq_s16(const_0, pel_error));
                edgeidx = vandq_s16(edgeidx_reg0, edgeidx);

                temp_reg0 = (int16x8_t)vceqq_s16(const_0, edgeidx);
                temp_reg1 = (int16x8_t)vceqq_s16(const_1, edgeidx);
                temp_reg3 = (int16x8_t)vceqq_s16(const_3, edgeidx);
                temp_reg4 = (int16x8_t)vceqq_s16(const_4, edgeidx);

                edgeidx_reg1 = vabsq_s16(temp_reg1);
                edgeidx_reg3 = vabsq_s16(temp_reg3);
                edgeidx_reg4 = vabsq_s16(temp_reg4);

                temp_reg0 = vandq_s16(temp_reg0, pel_error);
                temp_reg1 = vandq_s16(temp_reg1, pel_error);
                temp_reg3 = vandq_s16(temp_reg3, pel_error);
                temp_reg4 = vandq_s16(temp_reg4, pel_error);

                edgeidx_reg0 = vaddq_s16(const_1, (int16x8_t)vceqq_s16(const_0, temp_reg0));
                /*store*/
                pi4_acc_error_category[0] += sad_cal(temp_reg0);
                pi4_acc_error_category[1] += sad_cal(temp_reg1);
                pi4_acc_error_category[3] += sad_cal(temp_reg3);
                pi4_acc_error_category[4] += sad_cal(temp_reg4);

                pi4_category_count[0] += sad_cal(edgeidx_reg0);
                pi4_category_count[1] += sad_cal(edgeidx_reg1);
                pi4_category_count[3] += sad_cal(edgeidx_reg3);
                pi4_category_count[4] += sad_cal(edgeidx_reg4);
                pu1_chroma_recon_buf += 8;
                pu1_chroma_src_buf += 8;
            }
            rem_wd &= 0x7;
            if(rem_wd)
            {
                for(col = 0; col < rem_wd; col++)
                {
                    c = pu1_chroma_recon_buf[col];
                    a = pu1_chroma_recon_buf[col + 2 - i4_chroma_recon_strd];
                    b = pu1_chroma_recon_buf[col - 2 + i4_chroma_recon_strd];
                    pel_err = pu1_chroma_src_buf[col] - pu1_chroma_recon_buf[col];
                    edge_idx = 2 + SIGN(c - a) + SIGN(c - b);
                    if(pel_err != 0)
                    {
                        pi4_acc_error_category[edge_idx] += pel_err;
                        pi4_category_count[edge_idx]++;
                    }
                }
            }
            pu1_chroma_recon_buf = pu1_chroma_recon_buf_copy + i4_chroma_recon_strd;
            pu1_chroma_src_buf = pu1_chroma_src_buf_copy + i4_chroma_src_strd;
            rem_wd = wd;
        }
        break;
    default:
        break;
    }
}

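/* Illustrative scalar sketch (comment only) of the per-pixel classification that
 * the NEON paths above vectorise. Here a and b stand for the two neighbours
 * selected by the SAO edge class, and SIGN() is assumed to return -1, 0 or +1:
 *
 *     pel_err  = src[x] - recon[x];
 *     edge_idx = 2 + SIGN(recon[x] - a) + SIGN(recon[x] - b);
 *     if(pel_err != 0)
 *     {
 *         pi4_acc_error_category[edge_idx] += pel_err;
 *         pi4_category_count[edge_idx]++;
 *     }
 */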