/******************************************************************************
 *
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ihevce_coarse_layer_sad_neon.c
*
* @brief
*  Contains intrinsic definitions of functions for computing SAD
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - hme_store_4x4_sads_high_speed_neon()
*  - hme_store_4x4_sads_high_quality_neon()
*  - hme_combine_4x4_sads_and_compute_cost_high_speed_neon()
*  - hme_combine_4x4_sads_and_compute_cost_high_quality_neon()
*
* @remarks
*  None
*
********************************************************************************
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <arm_neon.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "itt_video_api.h"
#include "ihevc_cmn_utils_neon.h"
#include "ihevc_chroma_itrans_recon.h"
#include "ihevc_chroma_intra_pred.h"
#include "ihevc_debug.h"
#include "ihevc_deblk.h"
#include "ihevc_defs.h"
#include "ihevc_itrans_recon.h"
#include "ihevc_intra_pred.h"
#include "ihevc_inter_pred.h"
#include "ihevc_macros.h"
#include "ihevc_mem_fns.h"
#include "ihevc_padding.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevc_resi_trans.h"
#include "ihevc_sao.h"
#include "ihevc_structs.h"
#include "ihevc_weighted_pred.h"

#include "rc_cntrl_param.h"
#include "rc_frame_info_collector.h"
#include "rc_look_ahead_params.h"

#include "ihevce_api.h"
#include "ihevce_defs.h"
#include "ihevce_lap_enc_structs.h"
#include "ihevce_multi_thrd_structs.h"
#include "ihevce_function_selector.h"
#include "ihevce_me_common_defs.h"
#include "ihevce_enc_structs.h"
#include "ihevce_had_satd.h"
#include "ihevce_ipe_instr_set_router.h"
#include "ihevce_global_tables.h"

#include "hme_datatype.h"
#include "hme_common_defs.h"
#include "hme_interface.h"
#include "hme_defs.h"
#include "hme_globals.h"

#include "ihevce_me_instr_set_router.h"

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

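/* Computes the 4x4 SAD between the weighted input block and the reference
   picture at every candidate MV inside ps_search_prms->aps_mv_range[0]
   (coarse-layer step of HME_COARSE_STEP_SIZE_HIGH_SPEED pels), writing one
   16-bit SAD per candidate into the pi2_sads_4x4 grid whose dimensions are
   derived from ps_mv_limit. */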
void hme_store_4x4_sads_high_speed_neon(
    hme_search_prms_t *ps_search_prms,
    layer_ctxt_t *ps_layer_ctxt,
    range_prms_t *ps_mv_limit,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    S16 *pi2_sads_4x4)
{
    uint8x8_t src2[4];
    uint8x16_t src;

    S32 i, j;

    /* Input and reference attributes */
    U08 *pu1_inp, *pu1_ref;
    S32 i4_inp_stride, i4_ref_stride, i4_ref_offset;

    /* The reference is actually an array of pointers, since there are       */
    /* several reference ids; the array gets passed from the calling function */
    U08 **ppu1_ref, *pu1_ref_coloc;

    S32 stepy, stepx, step_shift_x, step_shift_y;
    S32 mvx, mvy, mv_x_offset, mv_y_offset, mv_x_range, mv_y_range;

    /* Points to the range limits for mv */
    range_prms_t *ps_range_prms = ps_search_prms->aps_mv_range[0];

    /* Reference index to be searched */
    S32 i4_search_idx = ps_search_prms->i1_ref_idx;

    pu1_inp = ps_wt_inp_prms->apu1_wt_inp[i4_search_idx];
    i4_inp_stride = ps_search_prms->i4_inp_stride;

    /* Move to the location of the search blk in inp buffer */
    pu1_inp += ps_search_prms->i4_cu_x_off;
    pu1_inp += ps_search_prms->i4_cu_y_off * i4_inp_stride;

    /*************************************************************************/
    /* The input of previously encoded pictures is used as the reference     */
    /* in the coarse layer                                                   */
    /*************************************************************************/
    i4_ref_stride = ps_layer_ctxt->i4_inp_stride;
    ppu1_ref = ps_layer_ctxt->ppu1_list_inp;

    /* colocated position in reference picture */
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
    pu1_ref_coloc = ppu1_ref[i4_search_idx] + i4_ref_offset;

    stepx = stepy = HME_COARSE_STEP_SIZE_HIGH_SPEED;
    /*TODO: Calculate Step shift from the #define HME_COARSE_STEP_SIZE_HIGH_SPEED */
    step_shift_x = step_shift_y = 2;
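    /* The shift is log2 of the step size: the high-speed coarse step is 4
       pels (see the ASSERT below), hence a shift of 2 when converting MVs
       into grid indices. */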

    mv_x_offset = -(ps_mv_limit->i2_min_x >> step_shift_x);
    mv_y_offset = -(ps_mv_limit->i2_min_y >> step_shift_y);
    mv_x_range = (-ps_mv_limit->i2_min_x + ps_mv_limit->i2_max_x) >> step_shift_x;
    mv_y_range = (-ps_mv_limit->i2_min_y + ps_mv_limit->i2_max_y) >> step_shift_y;

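    /* pi2_sads_4x4 is laid out as a row-major mv_x_range x mv_y_range grid:
       entry [((mvx >> step_shift_x) + mv_x_offset) +
              ((mvy >> step_shift_y) + mv_y_offset) * mv_x_range]
       holds the 4x4 SAD for candidate (mvx, mvy). */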
    ASSERT(4 == stepx);

    /* load input */
    {
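        /* Each 4-byte input row is duplicated into both halves of an 8-byte
           vector (src2[]), so a single vabal_u8 against 8 reference bytes
           accumulates SAD terms for two candidate positions 4 pels apart.
           When the horizontal sweep leaves an odd 4-pel tail, the whole 4x4
           input block is also packed into one q-register (src) for the
           single-candidate fallback. */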
        S32 mv_x_sweep = ps_range_prms->i2_max_x - ps_range_prms->i2_min_x;
        uint32x2_t a[4];

        for(i = 0; i < 4; i++)
        {
            a[i] = vld1_dup_u32((uint32_t *)pu1_inp);
            pu1_inp += i4_inp_stride;
        }
        src2[0] = vreinterpret_u8_u32(a[0]);
        src2[1] = vreinterpret_u8_u32(a[1]);
        src2[2] = vreinterpret_u8_u32(a[2]);
        src2[3] = vreinterpret_u8_u32(a[3]);

        if((mv_x_sweep >> step_shift_x) & 1)
        {
            uint32x2x2_t l = vtrn_u32(a[0], a[1]);
            uint32x2x2_t m = vtrn_u32(a[2], a[3]);

            src = vcombine_u8(vreinterpret_u8_u32(l.val[0]), vreinterpret_u8_u32(m.val[0]));
        }
    }

    /* Run two nested loops to sweep over the reference area */
    for(mvy = ps_range_prms->i2_min_y; mvy < ps_range_prms->i2_max_y; mvy += stepy)
    {
        for(mvx = ps_range_prms->i2_min_x; mvx < ps_range_prms->i2_max_x;)
        {
            U16 *pu2_sad = (U16 *)&pi2_sads_4x4
                [((mvx >> step_shift_x) + mv_x_offset) +
                 ((mvy >> step_shift_y) + mv_y_offset) * mv_x_range];

            pu1_ref = pu1_ref_coloc + mvx + (mvy * i4_ref_stride);
            if((mvx + (stepx * 4)) <= ps_range_prms->i2_max_x)  // 16x4
            {
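                /* Four candidates at a time: the 16-byte reference row covers
                   the candidates at mvx, mvx+4, mvx+8 and mvx+12. The two
                   widened accumulators are then reduced with pairwise adds to
                   four 16-bit SADs, stored contiguously in the SAD grid. */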
                uint16x8_t abs_01 = vdupq_n_u16(0);
                uint16x8_t abs_23 = vdupq_n_u16(0);
                uint16x4_t tmp_a0, tmp_a1;

                for(j = 0; j < 4; j++)
                {
                    uint8x16_t ref = vld1q_u8(pu1_ref);

                    abs_01 = vabal_u8(abs_01, src2[j], vget_low_u8(ref));
                    abs_23 = vabal_u8(abs_23, src2[j], vget_high_u8(ref));
                    pu1_ref += i4_ref_stride;
                }
                tmp_a0 = vpadd_u16(vget_low_u16(abs_01), vget_high_u16(abs_01));
                tmp_a1 = vpadd_u16(vget_low_u16(abs_23), vget_high_u16(abs_23));
                abs_01 = vcombine_u16(tmp_a0, tmp_a1);
                tmp_a0 = vpadd_u16(vget_low_u16(abs_01), vget_high_u16(abs_01));
                vst1_u16(pu2_sad, tmp_a0);
                mvx += stepx * 4;
            }
            else if((mvx + (stepx * 2)) <= ps_range_prms->i2_max_x)  // 8x4
            {
                uint16x8_t abs_01 = vdupq_n_u16(0);
                uint16x4_t tmp_a;
                uint32x2_t tmp_b;

                for(j = 0; j < 4; j++)
                {
                    uint8x8_t ref = vld1_u8(pu1_ref);

                    abs_01 = vabal_u8(abs_01, src2[j], ref);
                    pu1_ref += i4_ref_stride;
                }
                tmp_a = vpadd_u16(vget_low_u16(abs_01), vget_high_u16(abs_01));
                tmp_b = vpaddl_u16(tmp_a);
                pu2_sad[0] = vget_lane_u32(tmp_b, 0);
                pu2_sad[1] = vget_lane_u32(tmp_b, 1);
                mvx += stepx * 2;
            }
            else if((mvx + stepx) <= ps_range_prms->i2_max_x)  // 4x4
            {
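                /* Single-candidate tail: load_unaligned_u8q gathers the four
                   4-byte reference rows into one q-register, and a full
                   absolute-difference reduction against the packed input
                   block (src) yields the remaining 4x4 SAD. */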
                const uint8x16_t ref = load_unaligned_u8q(pu1_ref, i4_ref_stride);
                uint16x8_t abs = vabdl_u8(vget_low_u8(src), vget_low_u8(ref));
                uint32x4_t b;
                uint64x2_t c;

                abs = vabal_u8(abs, vget_high_u8(src), vget_high_u8(ref));
                b = vpaddlq_u16(abs);
                c = vpaddlq_u32(b);
                *pu2_sad = vget_lane_u32(
                    vadd_u32(
                        vreinterpret_u32_u64(vget_low_u64(c)),
                        vreinterpret_u32_u64(vget_high_u64(c))),
                    0);
                mvx += stepx;
            }
        }
    }
}

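/* High-quality variant of the function above: the same flow, but with a
   coarse step of HME_COARSE_STEP_SIZE_HIGH_QUALITY (step shift of 1, i.e.
   candidates 2 pels apart), so pairs of reference loads at pu1_ref and
   pu1_ref + 2 produce the even and odd candidate positions. */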
void hme_store_4x4_sads_high_quality_neon(
    hme_search_prms_t *ps_search_prms,
    layer_ctxt_t *ps_layer_ctxt,
    range_prms_t *ps_mv_limit,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    S16 *pi2_sads_4x4)
{
    uint8x8_t src2[4];
    uint8x16_t src;

    S32 i, j;

    /* Input and reference attributes */
    U08 *pu1_inp, *pu1_ref;
    S32 i4_inp_stride, i4_ref_stride, i4_ref_offset;

    /* The reference is actually an array of pointers, since there are       */
    /* several reference ids; the array gets passed from the calling function */
    U08 **ppu1_ref, *pu1_ref_coloc;

    S32 stepy, stepx, step_shift_x, step_shift_y;
    S32 mvx, mvy, mv_x_offset, mv_y_offset, mv_x_range, mv_y_range;

    /* Points to the range limits for mv */
    range_prms_t *ps_range_prms = ps_search_prms->aps_mv_range[0];

    /* Reference index to be searched */
    S32 i4_search_idx = ps_search_prms->i1_ref_idx;

    pu1_inp = ps_wt_inp_prms->apu1_wt_inp[i4_search_idx];
    i4_inp_stride = ps_search_prms->i4_inp_stride;

    /* Move to the location of the search blk in inp buffer */
    pu1_inp += ps_search_prms->i4_cu_x_off;
    pu1_inp += ps_search_prms->i4_cu_y_off * i4_inp_stride;

    /*************************************************************************/
    /* The input of previously encoded pictures is used as the reference     */
    /* in the coarse layer                                                   */
    /*************************************************************************/
    i4_ref_stride = ps_layer_ctxt->i4_inp_stride;
    ppu1_ref = ps_layer_ctxt->ppu1_list_inp;

    /* colocated position in reference picture */
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
    pu1_ref_coloc = ppu1_ref[i4_search_idx] + i4_ref_offset;

    stepx = stepy = HME_COARSE_STEP_SIZE_HIGH_QUALITY;
    /*TODO: Calculate Step shift from the #define HME_COARSE_STEP_SIZE_HIGH_QUALITY */
    step_shift_x = step_shift_y = 1;

    mv_x_offset = -(ps_mv_limit->i2_min_x >> step_shift_x);
    mv_y_offset = -(ps_mv_limit->i2_min_y >> step_shift_y);
    mv_x_range = (-ps_mv_limit->i2_min_x + ps_mv_limit->i2_max_x) >> step_shift_x;
    mv_y_range = (-ps_mv_limit->i2_min_y + ps_mv_limit->i2_max_y) >> step_shift_y;

    /* load input */
    {
        S32 mv_x_sweep = ps_range_prms->i2_max_x - ps_range_prms->i2_min_x;
        uint32x2_t a[4];

        for(i = 0; i < 4; i++)
        {
            a[i] = vld1_dup_u32((uint32_t *)pu1_inp);
            pu1_inp += i4_inp_stride;
        }
        src2[0] = vreinterpret_u8_u32(a[0]);
        src2[1] = vreinterpret_u8_u32(a[1]);
        src2[2] = vreinterpret_u8_u32(a[2]);
        src2[3] = vreinterpret_u8_u32(a[3]);

        if((mv_x_sweep >> 2) & 1)
        {
            uint32x2x2_t l = vtrn_u32(a[0], a[1]);
            uint32x2x2_t m = vtrn_u32(a[2], a[3]);

            src = vcombine_u8(vreinterpret_u8_u32(l.val[0]), vreinterpret_u8_u32(m.val[0]));
        }
    }

    /* Run two nested loops to sweep over the reference area */
    for(mvy = ps_range_prms->i2_min_y; mvy < ps_range_prms->i2_max_y; mvy += stepy)
    {
        for(mvx = ps_range_prms->i2_min_x; mvx < ps_range_prms->i2_max_x;)
        {
            U16 *pu2_sad = (U16 *)&pi2_sads_4x4
                [((mvx >> step_shift_x) + mv_x_offset) +
                 ((mvy >> step_shift_y) + mv_y_offset) * mv_x_range];

            pu1_ref = pu1_ref_coloc + mvx + (mvy * i4_ref_stride);
            if((mvx + (stepx * 8)) <= ps_range_prms->i2_max_x)  // 16x4
            {
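                /* Eight candidates at a time (step of 2 pels): ref_a covers
                   the candidates at mvx, mvx+4, mvx+8 and mvx+12, while ref_b
                   (offset by 2) covers mvx+2, mvx+6, mvx+10 and mvx+14. After
                   the pairwise reductions, vst2_u16 interleaves the two
                   groups so the eight SADs land in consecutive grid entries. */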
                uint16x8_t abs_a_01 = vdupq_n_u16(0);
                uint16x8_t abs_a_23 = vdupq_n_u16(0);
                uint16x8_t abs_b_01 = vdupq_n_u16(0);
                uint16x8_t abs_b_23 = vdupq_n_u16(0);
                uint16x4_t tmp_b0, tmp_b1;
                uint16x4x2_t tmp_a;

                for(j = 0; j < 4; j++)
                {
                    uint8x16_t ref_a = vld1q_u8(pu1_ref);
                    uint8x16_t ref_b = vld1q_u8(pu1_ref + 2);

                    abs_a_01 = vabal_u8(abs_a_01, src2[j], vget_low_u8(ref_a));
                    abs_a_23 = vabal_u8(abs_a_23, src2[j], vget_high_u8(ref_a));
                    abs_b_01 = vabal_u8(abs_b_01, src2[j], vget_low_u8(ref_b));
                    abs_b_23 = vabal_u8(abs_b_23, src2[j], vget_high_u8(ref_b));
                    pu1_ref += i4_ref_stride;
                }
                tmp_a.val[0] = vpadd_u16(vget_low_u16(abs_a_01), vget_high_u16(abs_a_01));
                tmp_a.val[1] = vpadd_u16(vget_low_u16(abs_a_23), vget_high_u16(abs_a_23));
                abs_a_01 = vcombine_u16(tmp_a.val[0], tmp_a.val[1]);
                tmp_a.val[0] = vpadd_u16(vget_low_u16(abs_a_01), vget_high_u16(abs_a_01));
                tmp_b0 = vpadd_u16(vget_low_u16(abs_b_01), vget_high_u16(abs_b_01));
                tmp_b1 = vpadd_u16(vget_low_u16(abs_b_23), vget_high_u16(abs_b_23));
                abs_b_01 = vcombine_u16(tmp_b0, tmp_b1);
                tmp_a.val[1] = vpadd_u16(vget_low_u16(abs_b_01), vget_high_u16(abs_b_01));
                vst2_u16(pu2_sad, tmp_a);
                mvx += stepx * 8;
            }
            else if((mvx + (stepx * 4)) <= ps_range_prms->i2_max_x)  // 8x4
            {
                uint16x8_t abs_a_01 = vdupq_n_u16(0);
                uint16x8_t abs_b_01 = vdupq_n_u16(0);
                uint16x4_t tmp_a, tmp_b;

                for(j = 0; j < 4; j++)
                {
                    uint8x8_t ref_a = vld1_u8(pu1_ref);
                    uint8x8_t ref_b = vld1_u8(pu1_ref + 2);

                    abs_a_01 = vabal_u8(abs_a_01, src2[j], ref_a);
                    abs_b_01 = vabal_u8(abs_b_01, src2[j], ref_b);
                    pu1_ref += i4_ref_stride;
                }
                tmp_a = vpadd_u16(vget_low_u16(abs_a_01), vget_high_u16(abs_a_01));
                tmp_b = vpadd_u16(vget_low_u16(abs_b_01), vget_high_u16(abs_b_01));
                tmp_a = vpadd_u16(tmp_a, tmp_b);

                pu2_sad[0] = vget_lane_u16(tmp_a, 0);
                pu2_sad[1] = vget_lane_u16(tmp_a, 2);
                pu2_sad[2] = vget_lane_u16(tmp_a, 1);
                pu2_sad[3] = vget_lane_u16(tmp_a, 3);
                mvx += stepx * 4;
            }
            else if((mvx + stepx * 2) <= ps_range_prms->i2_max_x)  // 4x4
            {
                uint8x16_t ref = load_unaligned_u8q(pu1_ref, i4_ref_stride);
                uint16x8_t abs = vabdl_u8(vget_low_u8(src), vget_low_u8(ref));
                uint32x4_t b;
                uint64x2_t c;

                abs = vabal_u8(abs, vget_high_u8(src), vget_high_u8(ref));
                b = vpaddlq_u16(abs);
                c = vpaddlq_u32(b);
                *pu2_sad = vget_lane_u32(
                    vadd_u32(
                        vreinterpret_u32_u64(vget_low_u64(c)),
                        vreinterpret_u32_u64(vget_high_u64(c))),
                    0);

                ref = load_unaligned_u8q(pu1_ref + 2, i4_ref_stride);
                abs = vabdl_u8(vget_low_u8(src), vget_low_u8(ref));
                abs = vabal_u8(abs, vget_high_u8(src), vget_high_u8(ref));
                b = vpaddlq_u16(abs);
                c = vpaddlq_u32(b);
                pu2_sad[1] = vget_lane_u32(
                    vadd_u32(
                        vreinterpret_u32_u64(vget_low_u64(c)),
                        vreinterpret_u32_u64(vget_high_u64(c))),
                    0);
                mvx += stepx * 2;
            }
            else
            {
                assert(0);
            }
        }
    }
}

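/* For candidate i (i.e. MV (mvx + i * stepx, mvy)), track the minimum cost
   and the corresponding MV separately for the 4x8 estimate held in
   sad_array[0][] and the 8x4 estimate held in sad_array[1][]. */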
#define BEST_COST(i)                                                                               \
    if(sad_array[0][i] < min_cost_4x8)                                                             \
    {                                                                                              \
        best_mv_x_4x8 = mvx + i * stepx;                                                           \
        best_mv_y_4x8 = mvy;                                                                       \
        min_cost_4x8 = sad_array[0][i];                                                            \
    }                                                                                              \
    if(sad_array[1][i] < min_cost_8x4)                                                             \
    {                                                                                              \
        best_mv_x_8x4 = mvx + i * stepx;                                                           \
        best_mv_y_8x4 = mvy;                                                                       \
        min_cost_8x4 = sad_array[1][i];                                                            \
    }

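/* For every coarse-grid candidate MV, forms the 4x8 SAD (current + south
   neighbour) and the 8x4 SAD (current + east neighbour), adds the
   lambda-weighted MV-rate cost from the gi2_mvx_range / gi2_mvy_range LUTs,
   and writes the best MV for each of the two block estimates to
   ps_best_mv_4x8 / ps_best_mv_8x4. */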
void hme_combine_4x4_sads_and_compute_cost_high_speed_neon(
    S08 i1_ref_idx,
    range_prms_t *ps_mv_range,
    range_prms_t *ps_mv_limit,
    hme_mv_t *ps_best_mv_4x8,
    hme_mv_t *ps_best_mv_8x4,
    pred_ctxt_t *ps_pred_ctxt,
    PF_MV_COST_FXN pf_mv_cost_compute,
    S16 *pi2_sads_4x4_current,
    S16 *pi2_sads_4x4_east,
    S16 *pi2_sads_4x4_south)
{
    S32 best_mv_y_4x8, best_mv_x_4x8, best_mv_y_8x4, best_mv_x_8x4;

    S32 stepy = HME_COARSE_STEP_SIZE_HIGH_SPEED;
    S32 stepx = HME_COARSE_STEP_SIZE_HIGH_SPEED;
    /*TODO: Calculate Step shift from the #define HME_COARSE_STEP_SIZE_HIGH_SPEED */
    S32 step_shift_x = 2;
    S32 step_shift_y = 2;

    S32 mvx, mvy, mv_x_offset, mv_y_offset, mv_x_range, mv_y_range;

    S32 lambda = ps_pred_ctxt->lambda;
    S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
    S32 rnd = 1 << (lambda_q_shift - 1);

    S32 min_cost_4x8 = MAX_32BIT_VAL;
    S32 min_cost_8x4 = MAX_32BIT_VAL;

    S32 i;

    const uint16x8_t v_ref_idx = vdupq_n_u16(i1_ref_idx);
    const uint32x4_t v_lambda = vdupq_n_u32(lambda);
    const uint32x4_t v_rnd_factor = vdupq_n_u32(rnd);
    const int32x4_t v_lambda_q_shift = vdupq_n_s32(-lambda_q_shift);
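    /* vshlq_u32 with a negative per-lane shift count performs a right shift,
       so the scalar (mv_cost * lambda + rnd) >> lambda_q_shift is realised as
       vmulq + vaddq + vshlq by -lambda_q_shift in the vector paths below. */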

    mv_x_offset = (-ps_mv_limit->i2_min_x >> step_shift_x);
    mv_y_offset = (-ps_mv_limit->i2_min_y >> step_shift_y);
    mv_x_range = (-ps_mv_limit->i2_min_x + ps_mv_limit->i2_max_x) >> step_shift_x;
    mv_y_range = (-ps_mv_limit->i2_min_y + ps_mv_limit->i2_max_y) >> step_shift_y;

    ASSERT(MAX_MVX_SUPPORTED_IN_COARSE_LAYER >= ABS(ps_mv_range->i2_max_x));
    ASSERT(MAX_MVY_SUPPORTED_IN_COARSE_LAYER >= ABS(ps_mv_range->i2_max_y));

    /* Run two nested loops to sweep over the reference area */
    for(mvy = ps_mv_range->i2_min_y; mvy < ps_mv_range->i2_max_y; mvy += stepy)
    {
        /* LUT: (2 * hme_get_range(mv_y) - 1) + ((!mv_y) ? 0 : 1) */
        uint16x8_t mvy_wt = vld1q_u16((U16 *)&gi2_mvy_range[ABS(mvy)][0]);

        /* mvy wt + ref_idx */
        mvy_wt = vaddq_u16(mvy_wt, v_ref_idx);

        for(mvx = ps_mv_range->i2_min_x; mvx < ps_mv_range->i2_max_x;)
        {
            S32 sad_pos = ((mvx >> step_shift_x) + mv_x_offset) +
                          ((mvy >> step_shift_y) + mv_y_offset) * mv_x_range;

            if(mvx + (8 * stepx) <= ps_mv_range->i2_max_x)  // 8x4
            {
                uint16x8_t curr = vld1q_u16((U16 *)pi2_sads_4x4_current + sad_pos);
                uint16x8_t south = vld1q_u16((U16 *)pi2_sads_4x4_south + sad_pos);
                uint16x8_t east = vld1q_u16((U16 *)pi2_sads_4x4_east + sad_pos);
                uint16x8_t sad_4x8 = vaddq_u16(curr, south);
                uint16x8_t sad_8x4 = vaddq_u16(curr, east);
                /* LUT: (2 * hme_get_range(mv_x) - 1) + ((!mv_x) ? 0 : 1) */
                uint16x8_t mv_wt =
                    vld1q_u16((U16 *)&gi2_mvx_range[mvx + MAX_MVX_SUPPORTED_IN_COARSE_LAYER][0]);
                uint32x4_t total_cost_0, total_cost_1;
                uint16x8_t total_cost;
                U16 sad_array[2][8];

                /* mv weight + ref_idx */
                mv_wt = vaddq_u16(mv_wt, mvy_wt);

                total_cost_0 = vmulq_u32(v_lambda, vmovl_u16(vget_low_u16(mv_wt)));
                total_cost_1 = vmulq_u32(v_lambda, vmovl_u16(vget_high_u16(mv_wt)));

                total_cost_0 = vaddq_u32(total_cost_0, v_rnd_factor);
                total_cost_1 = vaddq_u32(total_cost_1, v_rnd_factor);

                total_cost_0 = vshlq_u32(total_cost_0, v_lambda_q_shift);
                total_cost_1 = vshlq_u32(total_cost_1, v_lambda_q_shift);

                total_cost = vcombine_u16(vmovn_u32(total_cost_0), vmovn_u32(total_cost_1));

                sad_4x8 = vaddq_u16(total_cost, sad_4x8);
                sad_8x4 = vaddq_u16(total_cost, sad_8x4);

                vst1q_u16(sad_array[0], sad_4x8);
                vst1q_u16(sad_array[1], sad_8x4);

                for(i = 0; i < 8; i++)
                {
                    BEST_COST(i);
                }
                mvx += stepx * 8;
            }
            else if(mvx + (4 * stepx) <= ps_mv_range->i2_max_x)  // 4x4
            {
                uint16x4_t curr = vld1_u16((U16 *)pi2_sads_4x4_current + sad_pos);
                uint16x4_t south = vld1_u16((U16 *)pi2_sads_4x4_south + sad_pos);
                uint16x4_t east = vld1_u16((U16 *)pi2_sads_4x4_east + sad_pos);
                uint16x4_t sad_4x8 = vadd_u16(curr, south);
                uint16x4_t sad_8x4 = vadd_u16(curr, east);
                /* LUT: (2 * hme_get_range(mv_x) - 1) + ((!mv_x) ? 0 : 1) */
                uint16x4_t mv_wt =
                    vld1_u16((U16 *)&gi2_mvx_range[mvx + MAX_MVX_SUPPORTED_IN_COARSE_LAYER][0]);
                uint32x4_t total_cost;
                U16 sad_array[2][4];

                /* mv weight + ref_idx */
                mv_wt = vadd_u16(mv_wt, vget_low_u16(mvy_wt));

                total_cost = vmulq_u32(v_lambda, vmovl_u16(mv_wt));
                total_cost = vaddq_u32(total_cost, v_rnd_factor);
                total_cost = vshlq_u32(total_cost, v_lambda_q_shift);

                sad_4x8 = vadd_u16(vmovn_u32(total_cost), sad_4x8);
                sad_8x4 = vadd_u16(vmovn_u32(total_cost), sad_8x4);

                vst1_u16(sad_array[0], sad_4x8);
                vst1_u16(sad_array[1], sad_8x4);

                for(i = 0; i < 4; i++)
                {
                    BEST_COST(i);
                }

                mvx += stepx * 4;
            }
            else
            {
                S16 sad_array[2][1];
                S32 mv_cost;

                /* Combine the current 4x4 SAD with the south neighbour (4x8) and east neighbour (8x4) */
                sad_array[0][0] = pi2_sads_4x4_current[sad_pos] + pi2_sads_4x4_south[sad_pos];
                sad_array[1][0] = pi2_sads_4x4_current[sad_pos] + pi2_sads_4x4_east[sad_pos];

                mv_cost = gi2_mvy_range[ABS(mvy)][0] +
                          gi2_mvx_range[mvx + MAX_MVX_SUPPORTED_IN_COARSE_LAYER][0] + i1_ref_idx;

                mv_cost = (mv_cost * lambda + rnd) >> lambda_q_shift;

                sad_array[0][0] += mv_cost;
                sad_array[1][0] += mv_cost;

                BEST_COST(0);
                mvx += stepx;
            }
        }
    }

    ps_best_mv_4x8->i2_mv_x = best_mv_x_4x8;
    ps_best_mv_4x8->i2_mv_y = best_mv_y_4x8;

    ps_best_mv_8x4->i2_mv_x = best_mv_x_8x4;
    ps_best_mv_8x4->i2_mv_y = best_mv_y_8x4;
}

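/* High-quality counterpart of the function above: the coarse grid uses a
   step of HME_COARSE_STEP_SIZE_HIGH_QUALITY (step shift of 1) and the MV-rate
   weights come from the gi2_mvx_range_high_quality LUT; the cost combination
   logic is otherwise identical. */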
void hme_combine_4x4_sads_and_compute_cost_high_quality_neon(
    S08 i1_ref_idx,
    range_prms_t *ps_mv_range,
    range_prms_t *ps_mv_limit,
    hme_mv_t *ps_best_mv_4x8,
    hme_mv_t *ps_best_mv_8x4,
    pred_ctxt_t *ps_pred_ctxt,
    PF_MV_COST_FXN pf_mv_cost_compute,
    S16 *pi2_sads_4x4_current,
    S16 *pi2_sads_4x4_east,
    S16 *pi2_sads_4x4_south)
{
    S32 best_mv_y_4x8, best_mv_x_4x8, best_mv_y_8x4, best_mv_x_8x4;

    S32 stepy = HME_COARSE_STEP_SIZE_HIGH_QUALITY;
    S32 stepx = HME_COARSE_STEP_SIZE_HIGH_QUALITY;
    /*TODO: Calculate Step shift from the #define HME_COARSE_STEP_SIZE_HIGH_QUALITY */
    S32 step_shift_x = 1;
    S32 step_shift_y = 1;

    S32 mvx, mvy, mv_x_offset, mv_y_offset, mv_x_range, mv_y_range;

    S32 lambda = ps_pred_ctxt->lambda;
    S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
    S32 rnd = 1 << (lambda_q_shift - 1);

    S32 min_cost_4x8 = MAX_32BIT_VAL;
    S32 min_cost_8x4 = MAX_32BIT_VAL;

    S32 i;

    const uint16x8_t v_ref_idx = vdupq_n_u16(i1_ref_idx);
    const uint32x4_t v_lambda = vdupq_n_u32(lambda);
    const uint32x4_t v_rnd_factor = vdupq_n_u32(rnd);
    const int32x4_t v_lambda_q_shift = vdupq_n_s32(-lambda_q_shift);

    mv_x_offset = (-ps_mv_limit->i2_min_x >> step_shift_x);
    mv_y_offset = (-ps_mv_limit->i2_min_y >> step_shift_y);
    mv_x_range = (-ps_mv_limit->i2_min_x + ps_mv_limit->i2_max_x) >> step_shift_x;
    mv_y_range = (-ps_mv_limit->i2_min_y + ps_mv_limit->i2_max_y) >> step_shift_y;

    ASSERT(MAX_MVX_SUPPORTED_IN_COARSE_LAYER >= ABS(ps_mv_range->i2_max_x));
    ASSERT(MAX_MVY_SUPPORTED_IN_COARSE_LAYER >= ABS(ps_mv_range->i2_max_y));

    /* Run two nested loops to sweep over the reference area */
    for(mvy = ps_mv_range->i2_min_y; mvy < ps_mv_range->i2_max_y; mvy += stepy)
    {
        /* LUT: (2 * hme_get_range(mv_y) - 1) + ((!mv_y) ? 0 : 1) */
        uint16x8_t mvy_wt = vld1q_u16((U16 *)&gi2_mvy_range[ABS(mvy)][0]);

        /* mvy wt + ref_idx */
        mvy_wt = vaddq_u16(mvy_wt, v_ref_idx);

        for(mvx = ps_mv_range->i2_min_x; mvx < ps_mv_range->i2_max_x;)
        {
            S32 sad_pos = ((mvx >> step_shift_x) + mv_x_offset) +
                          ((mvy >> step_shift_y) + mv_y_offset) * mv_x_range;

            if(mvx + (8 * stepx) <= ps_mv_range->i2_max_x)  // 8x4
            {
                uint16x8_t curr = vld1q_u16((U16 *)pi2_sads_4x4_current + sad_pos);
                uint16x8_t south = vld1q_u16((U16 *)pi2_sads_4x4_south + sad_pos);
                uint16x8_t east = vld1q_u16((U16 *)pi2_sads_4x4_east + sad_pos);
                uint16x8_t sad_4x8 = vaddq_u16(curr, south);
                uint16x8_t sad_8x4 = vaddq_u16(curr, east);
                /* LUT: (2 * hme_get_range(mv_x) - 1) + ((!mv_x) ? 0 : 1) */
                uint16x8_t mv_wt = vld1q_u16(
                    (U16 *)&gi2_mvx_range_high_quality[mvx + MAX_MVX_SUPPORTED_IN_COARSE_LAYER][0]);
                uint32x4_t total_cost_0, total_cost_1;
                uint16x8_t total_cost;
                U16 sad_array[2][8];

                /* mv weight + ref_idx */
                mv_wt = vaddq_u16(mv_wt, mvy_wt);

                total_cost_0 = vmulq_u32(v_lambda, vmovl_u16(vget_low_u16(mv_wt)));
                total_cost_1 = vmulq_u32(v_lambda, vmovl_u16(vget_high_u16(mv_wt)));

                total_cost_0 = vaddq_u32(total_cost_0, v_rnd_factor);
                total_cost_1 = vaddq_u32(total_cost_1, v_rnd_factor);

                total_cost_0 = vshlq_u32(total_cost_0, v_lambda_q_shift);
                total_cost_1 = vshlq_u32(total_cost_1, v_lambda_q_shift);

                total_cost = vcombine_u16(vmovn_u32(total_cost_0), vmovn_u32(total_cost_1));

                sad_4x8 = vaddq_u16(total_cost, sad_4x8);
                sad_8x4 = vaddq_u16(total_cost, sad_8x4);

                vst1q_u16(sad_array[0], sad_4x8);
                vst1q_u16(sad_array[1], sad_8x4);

                for(i = 0; i < 8; i++)
                {
                    BEST_COST(i);
                }
                mvx += stepx * 8;
            }
            else if(mvx + (4 * stepx) <= ps_mv_range->i2_max_x)  // 4x4
            {
                uint16x4_t curr = vld1_u16((U16 *)pi2_sads_4x4_current + sad_pos);
                uint16x4_t south = vld1_u16((U16 *)pi2_sads_4x4_south + sad_pos);
                uint16x4_t east = vld1_u16((U16 *)pi2_sads_4x4_east + sad_pos);
                uint16x4_t sad_4x8 = vadd_u16(curr, south);
                uint16x4_t sad_8x4 = vadd_u16(curr, east);
                /* LUT: (2 * hme_get_range(mv_x) - 1) + ((!mv_x) ? 0 : 1) */
                uint16x4_t mv_wt = vld1_u16(
                    (U16 *)&gi2_mvx_range_high_quality[mvx + MAX_MVX_SUPPORTED_IN_COARSE_LAYER][0]);
                uint32x4_t total_cost;
                U16 sad_array[2][4];

                /* mv weight + ref_idx */
                mv_wt = vadd_u16(mv_wt, vget_low_u16(mvy_wt));

                total_cost = vmulq_u32(v_lambda, vmovl_u16(mv_wt));
                total_cost = vaddq_u32(total_cost, v_rnd_factor);
                total_cost = vshlq_u32(total_cost, v_lambda_q_shift);

                sad_4x8 = vadd_u16(vmovn_u32(total_cost), sad_4x8);
                sad_8x4 = vadd_u16(vmovn_u32(total_cost), sad_8x4);

                vst1_u16(sad_array[0], sad_4x8);
                vst1_u16(sad_array[1], sad_8x4);

                for(i = 0; i < 4; i++)
                {
                    BEST_COST(i);
                }

                mvx += stepx * 4;
            }
            else
            {
                S16 sad_array[2][1];
                S32 mv_cost;

                /* Combine the current 4x4 SAD with the south neighbour (4x8) and east neighbour (8x4) */
                sad_array[0][0] = pi2_sads_4x4_current[sad_pos] + pi2_sads_4x4_south[sad_pos];
                sad_array[1][0] = pi2_sads_4x4_current[sad_pos] + pi2_sads_4x4_east[sad_pos];

                mv_cost = gi2_mvy_range[ABS(mvy)][0] +
                          gi2_mvx_range_high_quality[mvx + MAX_MVX_SUPPORTED_IN_COARSE_LAYER][0] +
                          i1_ref_idx;

                mv_cost = (mv_cost * lambda + rnd) >> lambda_q_shift;

                sad_array[0][0] += mv_cost;
                sad_array[1][0] += mv_cost;

                BEST_COST(0);
                mvx += stepx;
            }
        }
    }

    ps_best_mv_4x8->i2_mv_x = best_mv_x_4x8;
    ps_best_mv_4x8->i2_mv_y = best_mv_y_4x8;

    ps_best_mv_8x4->i2_mv_x = best_mv_x_8x4;
    ps_best_mv_8x4->i2_mv_y = best_mv_y_8x4;
}
767