xref: /aosp_15_r20/external/libhevc/encoder/hme_subpel.c (revision c83a76b084498d55f252f48b2e3786804cdf24b7)
1*c83a76b0SSuyog Pawar /******************************************************************************
2*c83a76b0SSuyog Pawar  *
3*c83a76b0SSuyog Pawar  * Copyright (C) 2018 The Android Open Source Project
4*c83a76b0SSuyog Pawar  *
5*c83a76b0SSuyog Pawar  * Licensed under the Apache License, Version 2.0 (the "License");
6*c83a76b0SSuyog Pawar  * you may not use this file except in compliance with the License.
7*c83a76b0SSuyog Pawar  * You may obtain a copy of the License at:
8*c83a76b0SSuyog Pawar  *
9*c83a76b0SSuyog Pawar  * http://www.apache.org/licenses/LICENSE-2.0
10*c83a76b0SSuyog Pawar  *
11*c83a76b0SSuyog Pawar  * Unless required by applicable law or agreed to in writing, software
12*c83a76b0SSuyog Pawar  * distributed under the License is distributed on an "AS IS" BASIS,
13*c83a76b0SSuyog Pawar  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*c83a76b0SSuyog Pawar  * See the License for the specific language governing permissions and
15*c83a76b0SSuyog Pawar  * limitations under the License.
16*c83a76b0SSuyog Pawar  *
17*c83a76b0SSuyog Pawar  *****************************************************************************
18*c83a76b0SSuyog Pawar  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*c83a76b0SSuyog Pawar */
20*c83a76b0SSuyog Pawar 
21*c83a76b0SSuyog Pawar /**
22*c83a76b0SSuyog Pawar ******************************************************************************
23*c83a76b0SSuyog Pawar * @file hme_subpel.c
24*c83a76b0SSuyog Pawar *
25*c83a76b0SSuyog Pawar * @brief
26*c83a76b0SSuyog Pawar *    Subpel refinement modules for ME algo
27*c83a76b0SSuyog Pawar *
28*c83a76b0SSuyog Pawar * @author
29*c83a76b0SSuyog Pawar *    Ittiam
30*c83a76b0SSuyog Pawar *
31*c83a76b0SSuyog Pawar *
32*c83a76b0SSuyog Pawar * List of Functions
33*c83a76b0SSuyog Pawar * hme_qpel_interp_avg()
34*c83a76b0SSuyog Pawar * hme_subpel_refine_ctblist_bck()
35*c83a76b0SSuyog Pawar * hme_subpel_refine_ctblist_fwd()
36*c83a76b0SSuyog Pawar * hme_refine_bidirect()
37*c83a76b0SSuyog Pawar * hme_subpel_refinement()
38*c83a76b0SSuyog Pawar * hme_subpel_refine_ctb_fwd()
39*c83a76b0SSuyog Pawar * hme_subpel_refine_ctb_bck()
40*c83a76b0SSuyog Pawar * hme_create_bck_inp()
41*c83a76b0SSuyog Pawar * hme_subpel_refine_search_node()
42*c83a76b0SSuyog Pawar ******************************************************************************
43*c83a76b0SSuyog Pawar */
44*c83a76b0SSuyog Pawar 
45*c83a76b0SSuyog Pawar /*****************************************************************************/
46*c83a76b0SSuyog Pawar /* File Includes                                                             */
47*c83a76b0SSuyog Pawar /*****************************************************************************/
48*c83a76b0SSuyog Pawar /* System include files */
49*c83a76b0SSuyog Pawar #include <stdio.h>
50*c83a76b0SSuyog Pawar #include <string.h>
51*c83a76b0SSuyog Pawar #include <stdlib.h>
52*c83a76b0SSuyog Pawar #include <assert.h>
53*c83a76b0SSuyog Pawar #include <stdarg.h>
54*c83a76b0SSuyog Pawar #include <math.h>
55*c83a76b0SSuyog Pawar #include <limits.h>
56*c83a76b0SSuyog Pawar 
57*c83a76b0SSuyog Pawar /* User include files */
58*c83a76b0SSuyog Pawar #include "ihevc_typedefs.h"
59*c83a76b0SSuyog Pawar #include "itt_video_api.h"
60*c83a76b0SSuyog Pawar #include "ihevce_api.h"
61*c83a76b0SSuyog Pawar 
62*c83a76b0SSuyog Pawar #include "rc_cntrl_param.h"
63*c83a76b0SSuyog Pawar #include "rc_frame_info_collector.h"
64*c83a76b0SSuyog Pawar #include "rc_look_ahead_params.h"
65*c83a76b0SSuyog Pawar 
66*c83a76b0SSuyog Pawar #include "ihevc_defs.h"
67*c83a76b0SSuyog Pawar #include "ihevc_structs.h"
68*c83a76b0SSuyog Pawar #include "ihevc_platform_macros.h"
69*c83a76b0SSuyog Pawar #include "ihevc_deblk.h"
70*c83a76b0SSuyog Pawar #include "ihevc_itrans_recon.h"
71*c83a76b0SSuyog Pawar #include "ihevc_chroma_itrans_recon.h"
72*c83a76b0SSuyog Pawar #include "ihevc_chroma_intra_pred.h"
73*c83a76b0SSuyog Pawar #include "ihevc_intra_pred.h"
74*c83a76b0SSuyog Pawar #include "ihevc_inter_pred.h"
75*c83a76b0SSuyog Pawar #include "ihevc_mem_fns.h"
76*c83a76b0SSuyog Pawar #include "ihevc_padding.h"
77*c83a76b0SSuyog Pawar #include "ihevc_weighted_pred.h"
78*c83a76b0SSuyog Pawar #include "ihevc_sao.h"
79*c83a76b0SSuyog Pawar #include "ihevc_resi_trans.h"
80*c83a76b0SSuyog Pawar #include "ihevc_quant_iquant_ssd.h"
81*c83a76b0SSuyog Pawar #include "ihevc_cabac_tables.h"
82*c83a76b0SSuyog Pawar 
83*c83a76b0SSuyog Pawar #include "ihevce_defs.h"
84*c83a76b0SSuyog Pawar #include "ihevce_lap_enc_structs.h"
85*c83a76b0SSuyog Pawar #include "ihevce_multi_thrd_structs.h"
86*c83a76b0SSuyog Pawar #include "ihevce_multi_thrd_funcs.h"
87*c83a76b0SSuyog Pawar #include "ihevce_me_common_defs.h"
88*c83a76b0SSuyog Pawar #include "ihevce_had_satd.h"
89*c83a76b0SSuyog Pawar #include "ihevce_error_codes.h"
90*c83a76b0SSuyog Pawar #include "ihevce_bitstream.h"
91*c83a76b0SSuyog Pawar #include "ihevce_cabac.h"
92*c83a76b0SSuyog Pawar #include "ihevce_rdoq_macros.h"
93*c83a76b0SSuyog Pawar #include "ihevce_function_selector.h"
94*c83a76b0SSuyog Pawar #include "ihevce_enc_structs.h"
95*c83a76b0SSuyog Pawar #include "ihevce_entropy_structs.h"
96*c83a76b0SSuyog Pawar #include "ihevce_cmn_utils_instr_set_router.h"
97*c83a76b0SSuyog Pawar #include "ihevce_enc_loop_structs.h"
98*c83a76b0SSuyog Pawar #include "ihevce_bs_compute_ctb.h"
99*c83a76b0SSuyog Pawar #include "ihevce_global_tables.h"
100*c83a76b0SSuyog Pawar #include "ihevce_dep_mngr_interface.h"
101*c83a76b0SSuyog Pawar #include "hme_datatype.h"
102*c83a76b0SSuyog Pawar #include "hme_interface.h"
103*c83a76b0SSuyog Pawar #include "hme_common_defs.h"
104*c83a76b0SSuyog Pawar #include "hme_defs.h"
105*c83a76b0SSuyog Pawar #include "ihevce_me_instr_set_router.h"
106*c83a76b0SSuyog Pawar #include "hme_globals.h"
107*c83a76b0SSuyog Pawar #include "hme_utils.h"
108*c83a76b0SSuyog Pawar #include "hme_coarse.h"
109*c83a76b0SSuyog Pawar #include "hme_fullpel.h"
110*c83a76b0SSuyog Pawar #include "hme_subpel.h"
111*c83a76b0SSuyog Pawar #include "hme_refine.h"
112*c83a76b0SSuyog Pawar #include "hme_err_compute.h"
113*c83a76b0SSuyog Pawar #include "hme_common_utils.h"
114*c83a76b0SSuyog Pawar #include "hme_search_algo.h"
115*c83a76b0SSuyog Pawar #include "ihevce_stasino_helpers.h"
116*c83a76b0SSuyog Pawar #include "ihevce_common_utils.h"
117*c83a76b0SSuyog Pawar 
118*c83a76b0SSuyog Pawar /*****************************************************************************/
119*c83a76b0SSuyog Pawar /* Function Definitions                                                      */
120*c83a76b0SSuyog Pawar /*****************************************************************************/
/**
********************************************************************************
*  @fn     hme_qpel_interp_avg
*
*  @brief  Produces the prediction block for a single motion vector given in
*          QPEL units. The block is either referenced in place inside one of
*          the reference planes, or built by averaging two reference-plane
*          blocks into one of the interpolation output buffers.
*
*  @param[in,out]  ps_prms : reference plane pointers (ppu1_ref), strides and
*                  block size on input; pu1_final_out and i4_final_out_stride
*                  are written on output
*
*  @param[in]  i4_mv_x : x component of motion vector in QPEL units
*
*  @param[in]  i4_mv_y : y component of motion vector in QPEL units
*
*  @param[in]  i4_buf_id : index into ps_prms->apu1_interp_out selecting the
*              buffer that receives the averaged block (unused when no
*              averaging is required)
*
*  @return None
********************************************************************************
*/
void hme_qpel_interp_avg(interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, S32 i4_buf_id)
{
    /*************************************************************************/
    /* The two fractional bits of each mv component select a descriptor in   */
    /* gas_qpel_inp_buf_cfg. The descriptor names up to two source blocks,   */
    /* each given as a reference plane id plus an (x, y) offset relative to  */
    /* the motion displaced full pel position (i4_mv_x >> 2, i4_mv_y >> 2).  */
    /*                                                                       */
    /*    A j E k B                                                          */
    /*    l m n o p                                                          */
    /*    F q G r H                                                          */
    /*    s t u v w                                                          */
    /*    C x I y D                                                          */
    /*                                                                       */
    /* A, B, C, D are full pel pts at offsets (0,0), (1,0), (0,1), (1,1) in  */
    /* the fxfy plane (id 0). E and I are hxfy pts (id 1) at offsets (0,0)   */
    /* and (0,1). F and H are fxhy pts (id 2) at offsets (0,0) and (1,0).    */
    /* G is the hxhy pt (id 3) at offset (0,0).                              */
    /*                                                                       */
    /* Example: pt v has fractional component (3, 3) and is the average of   */
    /* H and I, so its descriptor holds                                      */
    /*    source 1 (H) : plane id 2, offset (1, 0)                           */
    /*    source 2 (I) : plane id 1, offset (0, 1)                           */
    /*                                                                       */
    /* When both plane ids of a descriptor are the same, the code below      */
    /* performs no averaging and simply points at source 1.                  */
    /*************************************************************************/
    const S32 i4_ref_stride = ps_prms->i4_ref_stride;

    /* Descriptor chosen by the (y, x) fractional parts of the mv */
    qpel_input_buf_cfg_t *ps_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y & 3][i4_mv_x & 3];

    /* Displacement of the full pel part of the mv inside a reference plane */
    S32 i4_fpel_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_ref_stride;

    /* First source block; also the final answer when no averaging is needed */
    U08 *pu1_first = ps_prms->ppu1_ref[ps_cfg->i1_buf_id1] +
                     (ps_cfg->i1_buf_xoff1 + i4_fpel_offset) +
                     (ps_cfg->i1_buf_yoff1 * i4_ref_stride);

    if(ps_cfg->i1_buf_id1 != ps_cfg->i1_buf_id2)
    {
        /* True QPEL pt: average the two source blocks into the output buf */
        U08 *pu1_second = ps_prms->ppu1_ref[ps_cfg->i1_buf_id2] +
                          (ps_cfg->i1_buf_xoff2 + i4_fpel_offset) +
                          (ps_cfg->i1_buf_yoff2 * i4_ref_stride);
        U08 *pu1_avg_out = ps_prms->apu1_interp_out[i4_buf_id];

        hevc_avg_2d(
            pu1_first,
            pu1_second,
            i4_ref_stride,
            i4_ref_stride,
            ps_prms->i4_blk_wd,
            ps_prms->i4_blk_ht,
            pu1_avg_out,
            ps_prms->i4_out_stride);

        ps_prms->pu1_final_out = pu1_avg_out;
        ps_prms->i4_final_out_stride = ps_prms->i4_out_stride;
    }
    else
    {
        /* fxfy/hxfy/fxhy/hxhy pt: use the reference plane data in place */
        ps_prms->pu1_final_out = pu1_first;
        ps_prms->i4_final_out_stride = i4_ref_stride;
    }
}
193*c83a76b0SSuyog Pawar 
/**
********************************************************************************
*  @fn     hme_qpel_interp_avg_2pt_vert_no_reuse
*
*  @brief  Generates the two vertical neighbours of a QPEL centre by invoking
*          the single point interpolation routine twice, once per neighbour
*
*  @param[in,out]  ps_prms : interpolation params forwarded to the 1 pt routine
*
*  @param[in]  i4_mv_x : x component of the centre mv in QPEL units
*
*  @param[in]  i4_mv_y : y component of the centre mv in QPEL units
*
*  @param[out]  ppu1_final : forwarded to the 1 pt routine; slots 3 and 1 are
*               the ones requested here
*
*  @param[out]  pi4_final_stride : forwarded to the 1 pt routine alongside
*               ppu1_final
*
*  @param[in]  pf_qpel_interp_avg_1pt : single point interpolation routine
*
*  @return None
********************************************************************************
*/
static __inline void hme_qpel_interp_avg_2pt_vert_no_reuse(
    interp_prms_t *ps_prms,
    S32 i4_mv_x,
    S32 i4_mv_y,
    U08 **ppu1_final,
    S32 *pi4_final_stride,
    FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
{
    /* Neighbour at (mv_y + 1) goes to slot 3, neighbour at (mv_y - 1) to slot 1 */
    const S32 i4_mv_y_plus1 = i4_mv_y + 1;
    const S32 i4_mv_y_minus1 = i4_mv_y - 1;

    /* The +1 neighbour is evaluated first, matching the established call order */
    (*pf_qpel_interp_avg_1pt)(
        ps_prms, i4_mv_x, i4_mv_y_plus1, 3, ppu1_final, pi4_final_stride);

    (*pf_qpel_interp_avg_1pt)(
        ps_prms, i4_mv_x, i4_mv_y_minus1, 1, ppu1_final, pi4_final_stride);
}
206*c83a76b0SSuyog Pawar 
/**
********************************************************************************
*  @fn     hme_qpel_interp_avg_2pt_horz_no_reuse
*
*  @brief  Generates the two horizontal neighbours of a QPEL centre by invoking
*          the single point interpolation routine twice, once per neighbour
*
*  @param[in,out]  ps_prms : interpolation params forwarded to the 1 pt routine
*
*  @param[in]  i4_mv_x : x component of the centre mv in QPEL units
*
*  @param[in]  i4_mv_y : y component of the centre mv in QPEL units
*
*  @param[out]  ppu1_final : forwarded to the 1 pt routine; slots 2 and 0 are
*               the ones requested here
*
*  @param[out]  pi4_final_stride : forwarded to the 1 pt routine alongside
*               ppu1_final
*
*  @param[in]  pf_qpel_interp_avg_1pt : single point interpolation routine
*
*  @return None
********************************************************************************
*/
static __inline void hme_qpel_interp_avg_2pt_horz_no_reuse(
    interp_prms_t *ps_prms,
    S32 i4_mv_x,
    S32 i4_mv_y,
    U08 **ppu1_final,
    S32 *pi4_final_stride,
    FT_QPEL_INTERP_AVG_1PT *pf_qpel_interp_avg_1pt)
{
    /* Neighbour at (mv_x + 1) goes to slot 2, neighbour at (mv_x - 1) to slot 0 */
    const S32 i4_mv_x_plus1 = i4_mv_x + 1;
    const S32 i4_mv_x_minus1 = i4_mv_x - 1;

    /* The +1 neighbour is evaluated first, matching the established call order */
    (*pf_qpel_interp_avg_1pt)(
        ps_prms, i4_mv_x_plus1, i4_mv_y, 2, ppu1_final, pi4_final_stride);

    (*pf_qpel_interp_avg_1pt)(
        ps_prms, i4_mv_x_minus1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
}
219*c83a76b0SSuyog Pawar 
220*c83a76b0SSuyog Pawar /********************************************************************************
221*c83a76b0SSuyog Pawar *  @fn     hme_qpel_interp_comprehensive
222*c83a76b0SSuyog Pawar *
223*c83a76b0SSuyog Pawar *  @brief  Interpolates 2 qpel points by hpel averaging
224*c83a76b0SSuyog Pawar *
225*c83a76b0SSuyog Pawar *  @param[in,out]  ps_prms: Both input buffer ptrs and location of output
226*c83a76b0SSuyog Pawar *
227*c83a76b0SSuyog Pawar *  @param[in]  i4_mv_x : x component of motion vector in QPEL units
228*c83a76b0SSuyog Pawar *
229*c83a76b0SSuyog Pawar *  @param[in]  i4_mv_y : y component of motion vector in QPEL units
230*c83a76b0SSuyog Pawar *
231*c83a76b0SSuyog Pawar *  @param[in]  i4_grid_mask : mask which determines qpels to be computed
232*c83a76b0SSuyog Pawar *
233*c83a76b0SSuyog Pawar *  @param[out]  ppu1_final : storage for final buffer pointers
234*c83a76b0SSuyog Pawar *
235*c83a76b0SSuyog Pawar *  @param[out]  pi4_final_stride : storage for final buffer strides
236*c83a76b0SSuyog Pawar *
237*c83a76b0SSuyog Pawar *  @return None
238*c83a76b0SSuyog Pawar ********************************************************************************
239*c83a76b0SSuyog Pawar */
hme_qpel_interp_comprehensive(interp_prms_t * ps_prms,U08 ** ppu1_final,S32 * pi4_final_stride,S32 i4_mv_x,S32 i4_mv_y,S32 i4_grid_mask,ihevce_me_optimised_function_list_t * ps_me_optimised_function_list)240*c83a76b0SSuyog Pawar static __inline void hme_qpel_interp_comprehensive(
241*c83a76b0SSuyog Pawar     interp_prms_t *ps_prms,
242*c83a76b0SSuyog Pawar     U08 **ppu1_final,
243*c83a76b0SSuyog Pawar     S32 *pi4_final_stride,
244*c83a76b0SSuyog Pawar     S32 i4_mv_x,
245*c83a76b0SSuyog Pawar     S32 i4_mv_y,
246*c83a76b0SSuyog Pawar     S32 i4_grid_mask,
247*c83a76b0SSuyog Pawar     ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
248*c83a76b0SSuyog Pawar {
249*c83a76b0SSuyog Pawar     S32 pt_select_for_TB, pt_select_for_LR;
250*c83a76b0SSuyog Pawar     S32 dx, dy, dydx;
251*c83a76b0SSuyog Pawar     S32 vert_func_selector, horz_func_selector;
252*c83a76b0SSuyog Pawar 
253*c83a76b0SSuyog Pawar     S32 i4_ref_stride = ps_prms->i4_ref_stride;
254*c83a76b0SSuyog Pawar 
255*c83a76b0SSuyog Pawar     pt_select_for_TB =
256*c83a76b0SSuyog Pawar         ((i4_grid_mask & (1 << PT_B)) >> PT_B) + ((i4_grid_mask & (1 << PT_T)) >> (PT_T - 1));
257*c83a76b0SSuyog Pawar 
258*c83a76b0SSuyog Pawar     pt_select_for_LR =
259*c83a76b0SSuyog Pawar         ((i4_grid_mask & (1 << PT_R)) >> PT_R) + ((i4_grid_mask & (1 << PT_L)) >> (PT_L - 1));
260*c83a76b0SSuyog Pawar 
261*c83a76b0SSuyog Pawar     dx = (i4_mv_x & 3);
262*c83a76b0SSuyog Pawar     dy = (i4_mv_y & 3);
263*c83a76b0SSuyog Pawar     dydx = (dx + (dy << 2));
264*c83a76b0SSuyog Pawar 
265*c83a76b0SSuyog Pawar     vert_func_selector = gai4_select_qpel_function_vert[pt_select_for_TB][dydx];
266*c83a76b0SSuyog Pawar     horz_func_selector = gai4_select_qpel_function_horz[pt_select_for_LR][dydx];
267*c83a76b0SSuyog Pawar 
268*c83a76b0SSuyog Pawar     /* case descriptions */
269*c83a76b0SSuyog Pawar     /* Let T = (gridmask & T) & B = (gridmask & B) */
270*c83a76b0SSuyog Pawar     /* & hp = pt is an hpel or an fpel */
271*c83a76b0SSuyog Pawar     /* & r = reuse possible */
272*c83a76b0SSuyog Pawar     /* 0 => T || B = 0 */
273*c83a76b0SSuyog Pawar     /* 1 => (!T) && (B) && hp */
274*c83a76b0SSuyog Pawar     /* 2 => (T) && (!B) && hp */
275*c83a76b0SSuyog Pawar     /* 3 => (!T) && (B) && !hp */
276*c83a76b0SSuyog Pawar     /* 4 => (T) && (!B) && !hp */
277*c83a76b0SSuyog Pawar     /* 5 => (T) && (B) && !hp && r */
278*c83a76b0SSuyog Pawar     /* 6 => (T) && (B) && !hp && !r */
279*c83a76b0SSuyog Pawar     /* 7 => (T) && (B) && hp */
280*c83a76b0SSuyog Pawar 
281*c83a76b0SSuyog Pawar     switch(vert_func_selector)
282*c83a76b0SSuyog Pawar     {
283*c83a76b0SSuyog Pawar     case 0:
284*c83a76b0SSuyog Pawar     {
285*c83a76b0SSuyog Pawar         break;
286*c83a76b0SSuyog Pawar     }
287*c83a76b0SSuyog Pawar     case 1:
288*c83a76b0SSuyog Pawar     {
289*c83a76b0SSuyog Pawar         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
290*c83a76b0SSuyog Pawar         qpel_input_buf_cfg_t *ps_inp_cfg;
291*c83a76b0SSuyog Pawar         S32 i4_mvyp1 = (i4_mv_y + 1);
292*c83a76b0SSuyog Pawar 
293*c83a76b0SSuyog Pawar         i4_mv_x_frac = dx;
294*c83a76b0SSuyog Pawar         i4_mv_y_frac = i4_mvyp1 & 3;
295*c83a76b0SSuyog Pawar 
296*c83a76b0SSuyog Pawar         i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride;
297*c83a76b0SSuyog Pawar 
298*c83a76b0SSuyog Pawar         /* Derive the descriptor that has all offset and size info */
299*c83a76b0SSuyog Pawar         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
300*c83a76b0SSuyog Pawar 
301*c83a76b0SSuyog Pawar         ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
302*c83a76b0SSuyog Pawar         ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
303*c83a76b0SSuyog Pawar         ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
304*c83a76b0SSuyog Pawar         pi4_final_stride[3] = i4_ref_stride;
305*c83a76b0SSuyog Pawar 
306*c83a76b0SSuyog Pawar         break;
307*c83a76b0SSuyog Pawar     }
308*c83a76b0SSuyog Pawar     case 2:
309*c83a76b0SSuyog Pawar     {
310*c83a76b0SSuyog Pawar         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
311*c83a76b0SSuyog Pawar         qpel_input_buf_cfg_t *ps_inp_cfg;
312*c83a76b0SSuyog Pawar         S32 i4_mvym1 = (i4_mv_y - 1);
313*c83a76b0SSuyog Pawar 
314*c83a76b0SSuyog Pawar         i4_mv_x_frac = dx;
315*c83a76b0SSuyog Pawar         i4_mv_y_frac = i4_mvym1 & 3;
316*c83a76b0SSuyog Pawar 
317*c83a76b0SSuyog Pawar         i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride;
318*c83a76b0SSuyog Pawar 
319*c83a76b0SSuyog Pawar         /* Derive the descriptor that has all offset and size info */
320*c83a76b0SSuyog Pawar         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
321*c83a76b0SSuyog Pawar 
322*c83a76b0SSuyog Pawar         ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
323*c83a76b0SSuyog Pawar         ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
324*c83a76b0SSuyog Pawar         ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
325*c83a76b0SSuyog Pawar         pi4_final_stride[1] = i4_ref_stride;
326*c83a76b0SSuyog Pawar 
327*c83a76b0SSuyog Pawar         break;
328*c83a76b0SSuyog Pawar     }
329*c83a76b0SSuyog Pawar     case 3:
330*c83a76b0SSuyog Pawar     {
331*c83a76b0SSuyog Pawar         ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
332*c83a76b0SSuyog Pawar             ps_prms, i4_mv_x, i4_mv_y + 1, 3, ppu1_final, pi4_final_stride);
333*c83a76b0SSuyog Pawar 
334*c83a76b0SSuyog Pawar         break;
335*c83a76b0SSuyog Pawar     }
336*c83a76b0SSuyog Pawar     case 4:
337*c83a76b0SSuyog Pawar     {
338*c83a76b0SSuyog Pawar         ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
339*c83a76b0SSuyog Pawar             ps_prms, i4_mv_x, i4_mv_y - 1, 1, ppu1_final, pi4_final_stride);
340*c83a76b0SSuyog Pawar 
341*c83a76b0SSuyog Pawar         break;
342*c83a76b0SSuyog Pawar     }
343*c83a76b0SSuyog Pawar     case 5:
344*c83a76b0SSuyog Pawar     {
345*c83a76b0SSuyog Pawar         ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_vert_with_reuse(
346*c83a76b0SSuyog Pawar             ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride);
347*c83a76b0SSuyog Pawar         break;
348*c83a76b0SSuyog Pawar     }
349*c83a76b0SSuyog Pawar     case 6:
350*c83a76b0SSuyog Pawar     {
351*c83a76b0SSuyog Pawar         hme_qpel_interp_avg_2pt_vert_no_reuse(
352*c83a76b0SSuyog Pawar             ps_prms,
353*c83a76b0SSuyog Pawar             i4_mv_x,
354*c83a76b0SSuyog Pawar             i4_mv_y,
355*c83a76b0SSuyog Pawar             ppu1_final,
356*c83a76b0SSuyog Pawar             pi4_final_stride,
357*c83a76b0SSuyog Pawar             ps_me_optimised_function_list->pf_qpel_interp_avg_1pt);
358*c83a76b0SSuyog Pawar         break;
359*c83a76b0SSuyog Pawar     }
360*c83a76b0SSuyog Pawar     case 7:
361*c83a76b0SSuyog Pawar     {
362*c83a76b0SSuyog Pawar         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
363*c83a76b0SSuyog Pawar         qpel_input_buf_cfg_t *ps_inp_cfg;
364*c83a76b0SSuyog Pawar 
365*c83a76b0SSuyog Pawar         S32 i4_mvyp1 = (i4_mv_y + 1);
366*c83a76b0SSuyog Pawar         S32 i4_mvym1 = (i4_mv_y - 1);
367*c83a76b0SSuyog Pawar 
368*c83a76b0SSuyog Pawar         i4_mv_x_frac = dx;
369*c83a76b0SSuyog Pawar         i4_mv_y_frac = i4_mvyp1 & 3;
370*c83a76b0SSuyog Pawar 
371*c83a76b0SSuyog Pawar         i4_offset = (i4_mv_x >> 2) + (i4_mvyp1 >> 2) * i4_ref_stride;
372*c83a76b0SSuyog Pawar 
373*c83a76b0SSuyog Pawar         /* Derive the descriptor that has all offset and size info */
374*c83a76b0SSuyog Pawar         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
375*c83a76b0SSuyog Pawar 
376*c83a76b0SSuyog Pawar         ppu1_final[3] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
377*c83a76b0SSuyog Pawar         ppu1_final[3] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
378*c83a76b0SSuyog Pawar         ppu1_final[3] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
379*c83a76b0SSuyog Pawar         pi4_final_stride[3] = i4_ref_stride;
380*c83a76b0SSuyog Pawar 
381*c83a76b0SSuyog Pawar         i4_mv_y_frac = i4_mvym1 & 3;
382*c83a76b0SSuyog Pawar 
383*c83a76b0SSuyog Pawar         i4_offset = (i4_mv_x >> 2) + (i4_mvym1 >> 2) * i4_ref_stride;
384*c83a76b0SSuyog Pawar 
385*c83a76b0SSuyog Pawar         /* Derive the descriptor that has all offset and size info */
386*c83a76b0SSuyog Pawar         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
387*c83a76b0SSuyog Pawar 
388*c83a76b0SSuyog Pawar         ppu1_final[1] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
389*c83a76b0SSuyog Pawar         ppu1_final[1] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
390*c83a76b0SSuyog Pawar         ppu1_final[1] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
391*c83a76b0SSuyog Pawar         pi4_final_stride[1] = i4_ref_stride;
392*c83a76b0SSuyog Pawar 
393*c83a76b0SSuyog Pawar         break;
394*c83a76b0SSuyog Pawar     }
395*c83a76b0SSuyog Pawar     }
396*c83a76b0SSuyog Pawar 
397*c83a76b0SSuyog Pawar     /* case descriptions */
398*c83a76b0SSuyog Pawar     /* Let L = (gridmask & L) & R = (gridmask & R) */
399*c83a76b0SSuyog Pawar     /* & hp = pt is an hpel or an fpel */
400*c83a76b0SSuyog Pawar     /* & r = reuse possible */
401*c83a76b0SSuyog Pawar     /* 0 => L || R = 0 */
402*c83a76b0SSuyog Pawar     /* 1 => (!L) && (R) && hp */
403*c83a76b0SSuyog Pawar     /* 2 => (L) && (!R) && hp */
404*c83a76b0SSuyog Pawar     /* 3 => (!L) && (R) && !hp */
405*c83a76b0SSuyog Pawar     /* 4 => (L) && (!R) && !hp */
406*c83a76b0SSuyog Pawar     /* 5 => (L) && (R) && !hp && r */
407*c83a76b0SSuyog Pawar     /* 6 => (L) && (R) && !hp && !r */
408*c83a76b0SSuyog Pawar     /* 7 => (L) && (R) && hp */
409*c83a76b0SSuyog Pawar 
410*c83a76b0SSuyog Pawar     switch(horz_func_selector)
411*c83a76b0SSuyog Pawar     {
412*c83a76b0SSuyog Pawar     case 0:
413*c83a76b0SSuyog Pawar     {
414*c83a76b0SSuyog Pawar         break;
415*c83a76b0SSuyog Pawar     }
416*c83a76b0SSuyog Pawar     case 1:
417*c83a76b0SSuyog Pawar     {
418*c83a76b0SSuyog Pawar         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
419*c83a76b0SSuyog Pawar         qpel_input_buf_cfg_t *ps_inp_cfg;
420*c83a76b0SSuyog Pawar         S32 i4_mvxp1 = (i4_mv_x + 1);
421*c83a76b0SSuyog Pawar 
422*c83a76b0SSuyog Pawar         i4_mv_x_frac = i4_mvxp1 & 3;
423*c83a76b0SSuyog Pawar         i4_mv_y_frac = dy;
424*c83a76b0SSuyog Pawar 
425*c83a76b0SSuyog Pawar         i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
426*c83a76b0SSuyog Pawar 
427*c83a76b0SSuyog Pawar         /* Derive the descriptor that has all offset and size info */
428*c83a76b0SSuyog Pawar         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
429*c83a76b0SSuyog Pawar 
430*c83a76b0SSuyog Pawar         ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
431*c83a76b0SSuyog Pawar         ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
432*c83a76b0SSuyog Pawar         ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
433*c83a76b0SSuyog Pawar         pi4_final_stride[2] = i4_ref_stride;
434*c83a76b0SSuyog Pawar 
435*c83a76b0SSuyog Pawar         break;
436*c83a76b0SSuyog Pawar     }
437*c83a76b0SSuyog Pawar     case 2:
438*c83a76b0SSuyog Pawar     {
439*c83a76b0SSuyog Pawar         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
440*c83a76b0SSuyog Pawar         qpel_input_buf_cfg_t *ps_inp_cfg;
441*c83a76b0SSuyog Pawar         S32 i4_mvxm1 = (i4_mv_x - 1);
442*c83a76b0SSuyog Pawar 
443*c83a76b0SSuyog Pawar         i4_mv_x_frac = i4_mvxm1 & 3;
444*c83a76b0SSuyog Pawar         i4_mv_y_frac = dy;
445*c83a76b0SSuyog Pawar 
446*c83a76b0SSuyog Pawar         i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
447*c83a76b0SSuyog Pawar 
448*c83a76b0SSuyog Pawar         /* Derive the descriptor that has all offset and size info */
449*c83a76b0SSuyog Pawar         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
450*c83a76b0SSuyog Pawar 
451*c83a76b0SSuyog Pawar         ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
452*c83a76b0SSuyog Pawar         ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
453*c83a76b0SSuyog Pawar         ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
454*c83a76b0SSuyog Pawar         pi4_final_stride[0] = i4_ref_stride;
455*c83a76b0SSuyog Pawar 
456*c83a76b0SSuyog Pawar         break;
457*c83a76b0SSuyog Pawar     }
458*c83a76b0SSuyog Pawar     case 3:
459*c83a76b0SSuyog Pawar     {
460*c83a76b0SSuyog Pawar         ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
461*c83a76b0SSuyog Pawar             ps_prms, i4_mv_x + 1, i4_mv_y, 2, ppu1_final, pi4_final_stride);
462*c83a76b0SSuyog Pawar 
463*c83a76b0SSuyog Pawar         break;
464*c83a76b0SSuyog Pawar     }
465*c83a76b0SSuyog Pawar     case 4:
466*c83a76b0SSuyog Pawar     {
467*c83a76b0SSuyog Pawar         ps_me_optimised_function_list->pf_qpel_interp_avg_1pt(
468*c83a76b0SSuyog Pawar             ps_prms, i4_mv_x - 1, i4_mv_y, 0, ppu1_final, pi4_final_stride);
469*c83a76b0SSuyog Pawar 
470*c83a76b0SSuyog Pawar         break;
471*c83a76b0SSuyog Pawar     }
472*c83a76b0SSuyog Pawar     case 5:
473*c83a76b0SSuyog Pawar     {
474*c83a76b0SSuyog Pawar         ps_me_optimised_function_list->pf_qpel_interp_avg_2pt_horz_with_reuse(
475*c83a76b0SSuyog Pawar             ps_prms, i4_mv_x, i4_mv_y, ppu1_final, pi4_final_stride);
476*c83a76b0SSuyog Pawar         break;
477*c83a76b0SSuyog Pawar     }
478*c83a76b0SSuyog Pawar     case 6:
479*c83a76b0SSuyog Pawar     {
480*c83a76b0SSuyog Pawar         hme_qpel_interp_avg_2pt_horz_no_reuse(
481*c83a76b0SSuyog Pawar             ps_prms,
482*c83a76b0SSuyog Pawar             i4_mv_x,
483*c83a76b0SSuyog Pawar             i4_mv_y,
484*c83a76b0SSuyog Pawar             ppu1_final,
485*c83a76b0SSuyog Pawar             pi4_final_stride,
486*c83a76b0SSuyog Pawar             ps_me_optimised_function_list->pf_qpel_interp_avg_1pt);
487*c83a76b0SSuyog Pawar         break;
488*c83a76b0SSuyog Pawar     }
489*c83a76b0SSuyog Pawar     case 7:
490*c83a76b0SSuyog Pawar     {
491*c83a76b0SSuyog Pawar         S32 i4_mv_x_frac, i4_mv_y_frac, i4_offset;
492*c83a76b0SSuyog Pawar         qpel_input_buf_cfg_t *ps_inp_cfg;
493*c83a76b0SSuyog Pawar 
494*c83a76b0SSuyog Pawar         S32 i4_mvxp1 = (i4_mv_x + 1);
495*c83a76b0SSuyog Pawar         S32 i4_mvxm1 = (i4_mv_x - 1);
496*c83a76b0SSuyog Pawar 
497*c83a76b0SSuyog Pawar         i4_mv_x_frac = i4_mvxp1 & 3;
498*c83a76b0SSuyog Pawar         i4_mv_y_frac = dy;
499*c83a76b0SSuyog Pawar 
500*c83a76b0SSuyog Pawar         i4_offset = (i4_mvxp1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
501*c83a76b0SSuyog Pawar 
502*c83a76b0SSuyog Pawar         /* Derive the descriptor that has all offset and size info */
503*c83a76b0SSuyog Pawar         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
504*c83a76b0SSuyog Pawar 
505*c83a76b0SSuyog Pawar         ppu1_final[2] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
506*c83a76b0SSuyog Pawar         ppu1_final[2] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
507*c83a76b0SSuyog Pawar         ppu1_final[2] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
508*c83a76b0SSuyog Pawar         pi4_final_stride[2] = i4_ref_stride;
509*c83a76b0SSuyog Pawar 
510*c83a76b0SSuyog Pawar         i4_mv_x_frac = i4_mvxm1 & 3;
511*c83a76b0SSuyog Pawar 
512*c83a76b0SSuyog Pawar         i4_offset = (i4_mvxm1 >> 2) + (i4_mv_y >> 2) * i4_ref_stride;
513*c83a76b0SSuyog Pawar 
514*c83a76b0SSuyog Pawar         /* Derive the descriptor that has all offset and size info */
515*c83a76b0SSuyog Pawar         ps_inp_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y_frac][i4_mv_x_frac];
516*c83a76b0SSuyog Pawar 
517*c83a76b0SSuyog Pawar         ppu1_final[0] = ps_prms->ppu1_ref[ps_inp_cfg->i1_buf_id1];
518*c83a76b0SSuyog Pawar         ppu1_final[0] += ps_inp_cfg->i1_buf_xoff1 + i4_offset;
519*c83a76b0SSuyog Pawar         ppu1_final[0] += (ps_inp_cfg->i1_buf_yoff1 * i4_ref_stride);
520*c83a76b0SSuyog Pawar         pi4_final_stride[0] = i4_ref_stride;
521*c83a76b0SSuyog Pawar 
522*c83a76b0SSuyog Pawar         break;
523*c83a76b0SSuyog Pawar     }
524*c83a76b0SSuyog Pawar     }
525*c83a76b0SSuyog Pawar }
526*c83a76b0SSuyog Pawar 
527*c83a76b0SSuyog Pawar /**
528*c83a76b0SSuyog Pawar ********************************************************************************
529*c83a76b0SSuyog Pawar *  @fn     S32 hme_compute_pred_and_evaluate_bi(hme_subpel_prms_t *ps_prms,
530*c83a76b0SSuyog Pawar *                                   search_results_t *ps_search_results,
531*c83a76b0SSuyog Pawar *                                   layer_ctxt_t *ps_curr_layer,
532*c83a76b0SSuyog Pawar *                                   U08 **ppu1_pred)
533*c83a76b0SSuyog Pawar *
534*c83a76b0SSuyog Pawar *
535*c83a76b0SSuyog Pawar *  @brief  Evaluates the best bipred cost as avg(P0, P1) where P0 and P1 are
536*c83a76b0SSuyog Pawar *          best L0 and L1 bufs respectively for the entire CU
537*c83a76b0SSuyog Pawar *
538*c83a76b0SSuyog Pawar *  @param[in]  ps_prms: subpel prms input to this function
539*c83a76b0SSuyog Pawar *
540*c83a76b0SSuyog Pawar *  @param[in] ps_curr_layer: points to the current layer ctxt
541*c83a76b0SSuyog Pawar *
542*c83a76b0SSuyog Pawar *  @return The best BI cost or best uni cost, whichever is better
543*c83a76b0SSuyog Pawar ********************************************************************************
544*c83a76b0SSuyog Pawar */
hme_compute_pred_and_evaluate_bi(inter_cu_results_t * ps_cu_results,inter_pu_results_t * ps_pu_results,inter_ctb_prms_t * ps_inter_ctb_prms,part_type_results_t * ps_part_type_result,ULWORD64 * pu8_winning_pred_sigmaXSquare,ULWORD64 * pu8_winning_pred_sigmaX,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list,ihevce_me_optimised_function_list_t * ps_me_optimised_function_list)545*c83a76b0SSuyog Pawar void hme_compute_pred_and_evaluate_bi(
546*c83a76b0SSuyog Pawar     inter_cu_results_t *ps_cu_results,
547*c83a76b0SSuyog Pawar     inter_pu_results_t *ps_pu_results,
548*c83a76b0SSuyog Pawar     inter_ctb_prms_t *ps_inter_ctb_prms,
549*c83a76b0SSuyog Pawar     part_type_results_t *ps_part_type_result,
550*c83a76b0SSuyog Pawar     ULWORD64 *pu8_winning_pred_sigmaXSquare,
551*c83a76b0SSuyog Pawar     ULWORD64 *pu8_winning_pred_sigmaX,
552*c83a76b0SSuyog Pawar     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
553*c83a76b0SSuyog Pawar     ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
554*c83a76b0SSuyog Pawar {
555*c83a76b0SSuyog Pawar     /* Idx0 - Uni winner */
556*c83a76b0SSuyog Pawar     /* Idx1 - Uni runner-up */
557*c83a76b0SSuyog Pawar     /* Idx2 - Bi winner */
558*c83a76b0SSuyog Pawar     hme_pred_buf_info_t as_pred_buf_data[3][NUM_INTER_PU_PARTS];
559*c83a76b0SSuyog Pawar     err_prms_t s_err_prms;
560*c83a76b0SSuyog Pawar     interp_prms_t s_interp_prms;
561*c83a76b0SSuyog Pawar 
562*c83a76b0SSuyog Pawar     PF_SAD_FXN_T pf_err_compute;
563*c83a76b0SSuyog Pawar 
564*c83a76b0SSuyog Pawar     S32 i, j;
565*c83a76b0SSuyog Pawar     S32 x_off, y_off, x_pic, y_pic;
566*c83a76b0SSuyog Pawar     S32 i4_sad_grid;
567*c83a76b0SSuyog Pawar     U08 e_cu_size;
568*c83a76b0SSuyog Pawar     S32 i4_part_type;
569*c83a76b0SSuyog Pawar     U08 u1_cu_size;
570*c83a76b0SSuyog Pawar     S32 shift;
571*c83a76b0SSuyog Pawar     S32 x_part, y_part, num_parts;
572*c83a76b0SSuyog Pawar     S32 inp_stride, ref_stride;
573*c83a76b0SSuyog Pawar     U08 au1_pred_buf_array_indixes[3];
574*c83a76b0SSuyog Pawar     S32 cur_iter_best_cost;
575*c83a76b0SSuyog Pawar     S32 uni_cost, bi_cost, best_cost, tot_cost;
576*c83a76b0SSuyog Pawar     /* Idx0 - Uni winner */
577*c83a76b0SSuyog Pawar     /* Idx1 - Bi winner */
578*c83a76b0SSuyog Pawar     ULWORD64 au8_sigmaX[2][NUM_INTER_PU_PARTS];
579*c83a76b0SSuyog Pawar     ULWORD64 au8_sigmaXSquared[2][NUM_INTER_PU_PARTS];
580*c83a76b0SSuyog Pawar #if USE_NOISE_TERM_DURING_BICAND_SEARCH
581*c83a76b0SSuyog Pawar     S32 i4_noise_term;
582*c83a76b0SSuyog Pawar #endif
583*c83a76b0SSuyog Pawar 
584*c83a76b0SSuyog Pawar     interp_prms_t *ps_interp_prms = &s_interp_prms;
585*c83a76b0SSuyog Pawar 
586*c83a76b0SSuyog Pawar     S32 best_cand_in_opp_dir_idx = 0;
587*c83a76b0SSuyog Pawar     S32 is_best_cand_an_intra = 0;
588*c83a76b0SSuyog Pawar     U08 u1_is_cu_noisy = ps_inter_ctb_prms->u1_is_cu_noisy;
589*c83a76b0SSuyog Pawar #if USE_NOISE_TERM_DURING_BICAND_SEARCH
590*c83a76b0SSuyog Pawar     const S32 i4_default_src_wt = ((1 << 15) + (WGHT_DEFAULT >> 1)) / WGHT_DEFAULT;
591*c83a76b0SSuyog Pawar #endif
592*c83a76b0SSuyog Pawar     tot_cost = 0;
593*c83a76b0SSuyog Pawar 
594*c83a76b0SSuyog Pawar     /* Start of the CU w.r.t. CTB */
595*c83a76b0SSuyog Pawar     x_off = ps_cu_results->u1_x_off;
596*c83a76b0SSuyog Pawar     y_off = ps_cu_results->u1_y_off;
597*c83a76b0SSuyog Pawar 
598*c83a76b0SSuyog Pawar     inp_stride = ps_inter_ctb_prms->i4_inp_stride;
599*c83a76b0SSuyog Pawar     ref_stride = ps_inter_ctb_prms->i4_rec_stride;
600*c83a76b0SSuyog Pawar 
601*c83a76b0SSuyog Pawar     ps_interp_prms->i4_ref_stride = ref_stride;
602*c83a76b0SSuyog Pawar 
603*c83a76b0SSuyog Pawar     /* Start of the CU w.r.t. Pic 0,0 */
604*c83a76b0SSuyog Pawar     x_pic = x_off + ps_inter_ctb_prms->i4_ctb_x_off;
605*c83a76b0SSuyog Pawar     y_pic = y_off + ps_inter_ctb_prms->i4_ctb_y_off;
606*c83a76b0SSuyog Pawar 
607*c83a76b0SSuyog Pawar     u1_cu_size = ps_cu_results->u1_cu_size;
608*c83a76b0SSuyog Pawar     e_cu_size = u1_cu_size;
609*c83a76b0SSuyog Pawar     shift = (S32)e_cu_size;
610*c83a76b0SSuyog Pawar     i4_part_type = ps_part_type_result->u1_part_type;
611*c83a76b0SSuyog Pawar     num_parts = gau1_num_parts_in_part_type[i4_part_type];
612*c83a76b0SSuyog Pawar 
613*c83a76b0SSuyog Pawar     for(i = 0; i < 3; i++)
614*c83a76b0SSuyog Pawar     {
615*c83a76b0SSuyog Pawar         hme_init_pred_buf_info(
616*c83a76b0SSuyog Pawar             &as_pred_buf_data[i],
617*c83a76b0SSuyog Pawar             &ps_inter_ctb_prms->s_pred_buf_mngr,
618*c83a76b0SSuyog Pawar             (ps_part_type_result->as_pu_results->pu.b4_wd + 1) << 2,
619*c83a76b0SSuyog Pawar             (ps_part_type_result->as_pu_results->pu.b4_ht + 1) << 2,
620*c83a76b0SSuyog Pawar             (PART_TYPE_T)i4_part_type);
621*c83a76b0SSuyog Pawar 
622*c83a76b0SSuyog Pawar         au1_pred_buf_array_indixes[i] = as_pred_buf_data[i][0].u1_pred_buf_array_id;
623*c83a76b0SSuyog Pawar     }
624*c83a76b0SSuyog Pawar 
625*c83a76b0SSuyog Pawar     for(j = 0; j < num_parts; j++)
626*c83a76b0SSuyog Pawar     {
627*c83a76b0SSuyog Pawar         UWORD8 *apu1_hpel_ref[2][4];
628*c83a76b0SSuyog Pawar         PART_ID_T e_part_id;
629*c83a76b0SSuyog Pawar         BLK_SIZE_T e_blk_size;
630*c83a76b0SSuyog Pawar         WORD8 i1_ref_idx;
631*c83a76b0SSuyog Pawar         UWORD8 pred_dir;
632*c83a76b0SSuyog Pawar         WORD32 ref_offset, inp_offset, wd, ht;
633*c83a76b0SSuyog Pawar         pu_result_t *ps_pu_node1, *ps_pu_node2, *ps_pu_result;
634*c83a76b0SSuyog Pawar         mv_t *aps_mv[2];
635*c83a76b0SSuyog Pawar         UWORD8 num_active_ref_opp;
636*c83a76b0SSuyog Pawar         UWORD8 num_results_per_part;
637*c83a76b0SSuyog Pawar         WORD32 luma_weight_ref1, luma_offset_ref1;
638*c83a76b0SSuyog Pawar         WORD32 luma_weight_ref2, luma_offset_ref2;
639*c83a76b0SSuyog Pawar         WORD32 pu_node2_found = 0;
640*c83a76b0SSuyog Pawar 
641*c83a76b0SSuyog Pawar         e_part_id = ge_part_type_to_part_id[i4_part_type][j];
642*c83a76b0SSuyog Pawar         e_blk_size = ge_part_id_to_blk_size[e_cu_size][e_part_id];
643*c83a76b0SSuyog Pawar 
644*c83a76b0SSuyog Pawar         x_part = gas_part_attr_in_cu[e_part_id].u1_x_start << shift;
645*c83a76b0SSuyog Pawar         y_part = gas_part_attr_in_cu[e_part_id].u1_y_start << shift;
646*c83a76b0SSuyog Pawar 
647*c83a76b0SSuyog Pawar         ref_offset = (x_part + x_pic) + (y_pic + y_part) * ref_stride;
648*c83a76b0SSuyog Pawar         inp_offset = (x_part + y_part * inp_stride) + ps_cu_results->i4_inp_offset;
649*c83a76b0SSuyog Pawar 
650*c83a76b0SSuyog Pawar         pred_dir = ps_part_type_result->as_pu_results[j].pu.b2_pred_mode;
651*c83a76b0SSuyog Pawar 
652*c83a76b0SSuyog Pawar         ps_pu_node1 = &(ps_part_type_result->as_pu_results[j]);
653*c83a76b0SSuyog Pawar 
654*c83a76b0SSuyog Pawar         if(PRED_L0 == pred_dir)
655*c83a76b0SSuyog Pawar         {
656*c83a76b0SSuyog Pawar             i1_ref_idx = ps_pu_node1->pu.mv.i1_l0_ref_idx;
657*c83a76b0SSuyog Pawar             aps_mv[0] = &(ps_pu_node1->pu.mv.s_l0_mv);
658*c83a76b0SSuyog Pawar 
659*c83a76b0SSuyog Pawar             num_active_ref_opp =
660*c83a76b0SSuyog Pawar                 ps_inter_ctb_prms->u1_num_active_ref_l1 * (ps_inter_ctb_prms->i4_bidir_enabled);
661*c83a76b0SSuyog Pawar             num_results_per_part = ps_pu_results->u1_num_results_per_part_l0[e_part_id];
662*c83a76b0SSuyog Pawar 
663*c83a76b0SSuyog Pawar             ps_pu_result = ps_pu_results->aps_pu_results[PRED_L0][e_part_id];
664*c83a76b0SSuyog Pawar 
665*c83a76b0SSuyog Pawar             ASSERT(i1_ref_idx >= 0);
666*c83a76b0SSuyog Pawar 
667*c83a76b0SSuyog Pawar             apu1_hpel_ref[0][0] =
668*c83a76b0SSuyog Pawar                 (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) +
669*c83a76b0SSuyog Pawar                 ref_offset;
670*c83a76b0SSuyog Pawar             apu1_hpel_ref[0][1] =
671*c83a76b0SSuyog Pawar                 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
672*c83a76b0SSuyog Pawar                 ref_offset;
673*c83a76b0SSuyog Pawar             apu1_hpel_ref[0][2] =
674*c83a76b0SSuyog Pawar                 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
675*c83a76b0SSuyog Pawar                 ref_offset;
676*c83a76b0SSuyog Pawar             apu1_hpel_ref[0][3] =
677*c83a76b0SSuyog Pawar                 ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
678*c83a76b0SSuyog Pawar                 ref_offset;
679*c83a76b0SSuyog Pawar 
680*c83a76b0SSuyog Pawar             luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
681*c83a76b0SSuyog Pawar                                    ->s_weight_offset.i2_luma_weight;
682*c83a76b0SSuyog Pawar             luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
683*c83a76b0SSuyog Pawar                                    ->s_weight_offset.i2_luma_offset;
684*c83a76b0SSuyog Pawar         }
685*c83a76b0SSuyog Pawar         else
686*c83a76b0SSuyog Pawar         {
687*c83a76b0SSuyog Pawar             i1_ref_idx = ps_pu_node1->pu.mv.i1_l1_ref_idx;
688*c83a76b0SSuyog Pawar             aps_mv[0] = &(ps_pu_node1->pu.mv.s_l1_mv);
689*c83a76b0SSuyog Pawar 
690*c83a76b0SSuyog Pawar             ASSERT(i1_ref_idx >= 0);
691*c83a76b0SSuyog Pawar 
692*c83a76b0SSuyog Pawar             num_active_ref_opp =
693*c83a76b0SSuyog Pawar                 ps_inter_ctb_prms->u1_num_active_ref_l0 * (ps_inter_ctb_prms->i4_bidir_enabled);
694*c83a76b0SSuyog Pawar             num_results_per_part = ps_pu_results->u1_num_results_per_part_l1[e_part_id];
695*c83a76b0SSuyog Pawar 
696*c83a76b0SSuyog Pawar             ps_pu_result = ps_pu_results->aps_pu_results[PRED_L1][e_part_id];
697*c83a76b0SSuyog Pawar 
698*c83a76b0SSuyog Pawar             apu1_hpel_ref[0][0] =
699*c83a76b0SSuyog Pawar                 (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->s_yuv_buf_desc.pv_y_buf) +
700*c83a76b0SSuyog Pawar                 ref_offset;
701*c83a76b0SSuyog Pawar             apu1_hpel_ref[0][1] =
702*c83a76b0SSuyog Pawar                 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
703*c83a76b0SSuyog Pawar                 ref_offset;
704*c83a76b0SSuyog Pawar             apu1_hpel_ref[0][2] =
705*c83a76b0SSuyog Pawar                 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
706*c83a76b0SSuyog Pawar                 ref_offset;
707*c83a76b0SSuyog Pawar             apu1_hpel_ref[0][3] =
708*c83a76b0SSuyog Pawar                 ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
709*c83a76b0SSuyog Pawar                 ref_offset;
710*c83a76b0SSuyog Pawar 
711*c83a76b0SSuyog Pawar             luma_weight_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
712*c83a76b0SSuyog Pawar                                    ->s_weight_offset.i2_luma_weight;
713*c83a76b0SSuyog Pawar             luma_offset_ref1 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
714*c83a76b0SSuyog Pawar                                    ->s_weight_offset.i2_luma_offset;
715*c83a76b0SSuyog Pawar         }
716*c83a76b0SSuyog Pawar 
717*c83a76b0SSuyog Pawar         if(aps_mv[0]->i2_mvx == INTRA_MV)
718*c83a76b0SSuyog Pawar         {
719*c83a76b0SSuyog Pawar             uni_cost = ps_pu_node1->i4_tot_cost;
720*c83a76b0SSuyog Pawar             cur_iter_best_cost = ps_pu_node1->i4_tot_cost;
721*c83a76b0SSuyog Pawar             best_cost = MIN(uni_cost, cur_iter_best_cost);
722*c83a76b0SSuyog Pawar             tot_cost += best_cost;
723*c83a76b0SSuyog Pawar             continue;
724*c83a76b0SSuyog Pawar         }
725*c83a76b0SSuyog Pawar 
726*c83a76b0SSuyog Pawar         ps_interp_prms->i4_blk_wd = wd = gau1_blk_size_to_wd[e_blk_size];
727*c83a76b0SSuyog Pawar         ps_interp_prms->i4_blk_ht = ht = gau1_blk_size_to_ht[e_blk_size];
728*c83a76b0SSuyog Pawar         ps_interp_prms->i4_out_stride = MAX_CU_SIZE;
729*c83a76b0SSuyog Pawar 
730*c83a76b0SSuyog Pawar         if(num_active_ref_opp)
731*c83a76b0SSuyog Pawar         {
732*c83a76b0SSuyog Pawar             if(PRED_L0 == pred_dir)
733*c83a76b0SSuyog Pawar             {
734*c83a76b0SSuyog Pawar                 if(ps_pu_results->u1_num_results_per_part_l1[e_part_id])
735*c83a76b0SSuyog Pawar                 {
736*c83a76b0SSuyog Pawar                     ps_pu_node2 = ps_pu_results->aps_pu_results[1][e_part_id];
737*c83a76b0SSuyog Pawar                     pu_node2_found = 1;
738*c83a76b0SSuyog Pawar                 }
739*c83a76b0SSuyog Pawar             }
740*c83a76b0SSuyog Pawar             else
741*c83a76b0SSuyog Pawar             {
742*c83a76b0SSuyog Pawar                 if(ps_pu_results->u1_num_results_per_part_l0[e_part_id])
743*c83a76b0SSuyog Pawar                 {
744*c83a76b0SSuyog Pawar                     ps_pu_node2 = ps_pu_results->aps_pu_results[0][e_part_id];
745*c83a76b0SSuyog Pawar                     pu_node2_found = 1;
746*c83a76b0SSuyog Pawar                 }
747*c83a76b0SSuyog Pawar             }
748*c83a76b0SSuyog Pawar         }
749*c83a76b0SSuyog Pawar 
750*c83a76b0SSuyog Pawar         if(!pu_node2_found)
751*c83a76b0SSuyog Pawar         {
752*c83a76b0SSuyog Pawar             bi_cost = INT_MAX >> 1;
753*c83a76b0SSuyog Pawar 
754*c83a76b0SSuyog Pawar             s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred;
755*c83a76b0SSuyog Pawar             ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0];
756*c83a76b0SSuyog Pawar 
757*c83a76b0SSuyog Pawar             ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
758*c83a76b0SSuyog Pawar                 ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0);
759*c83a76b0SSuyog Pawar 
760*c83a76b0SSuyog Pawar             if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
761*c83a76b0SSuyog Pawar             {
762*c83a76b0SSuyog Pawar                 as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX;
763*c83a76b0SSuyog Pawar                 as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out;
764*c83a76b0SSuyog Pawar                 as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
765*c83a76b0SSuyog Pawar             }
766*c83a76b0SSuyog Pawar 
767*c83a76b0SSuyog Pawar             if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
768*c83a76b0SSuyog Pawar             {
769*c83a76b0SSuyog Pawar                 hme_compute_sigmaX_and_sigmaXSquared(
770*c83a76b0SSuyog Pawar                     as_pred_buf_data[0][j].pu1_pred,
771*c83a76b0SSuyog Pawar                     as_pred_buf_data[0][j].i4_pred_stride,
772*c83a76b0SSuyog Pawar                     &au8_sigmaX[0][j],
773*c83a76b0SSuyog Pawar                     &au8_sigmaXSquared[0][j],
774*c83a76b0SSuyog Pawar                     ps_interp_prms->i4_blk_wd,
775*c83a76b0SSuyog Pawar                     ps_interp_prms->i4_blk_ht,
776*c83a76b0SSuyog Pawar                     ps_interp_prms->i4_blk_wd,
777*c83a76b0SSuyog Pawar                     ps_interp_prms->i4_blk_ht,
778*c83a76b0SSuyog Pawar                     0,
779*c83a76b0SSuyog Pawar                     1);
780*c83a76b0SSuyog Pawar             }
781*c83a76b0SSuyog Pawar         }
782*c83a76b0SSuyog Pawar         else
783*c83a76b0SSuyog Pawar         {
784*c83a76b0SSuyog Pawar             i = 0;
785*c83a76b0SSuyog Pawar             bi_cost = MAX_32BIT_VAL;
786*c83a76b0SSuyog Pawar             is_best_cand_an_intra = 0;
787*c83a76b0SSuyog Pawar             best_cand_in_opp_dir_idx = 0;
788*c83a76b0SSuyog Pawar 
789*c83a76b0SSuyog Pawar             pred_dir = ps_pu_node2[i].pu.b2_pred_mode;
790*c83a76b0SSuyog Pawar 
791*c83a76b0SSuyog Pawar             if(PRED_L0 == pred_dir)
792*c83a76b0SSuyog Pawar             {
793*c83a76b0SSuyog Pawar                 i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l0_ref_idx;
794*c83a76b0SSuyog Pawar                 aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l0_mv);
795*c83a76b0SSuyog Pawar 
796*c83a76b0SSuyog Pawar                 ASSERT(i1_ref_idx >= 0);
797*c83a76b0SSuyog Pawar 
798*c83a76b0SSuyog Pawar                 apu1_hpel_ref[1][0] =
799*c83a76b0SSuyog Pawar                     (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
800*c83a76b0SSuyog Pawar                                    ->s_yuv_buf_desc.pv_y_buf) +
801*c83a76b0SSuyog Pawar                     ref_offset;  //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset;
802*c83a76b0SSuyog Pawar                 apu1_hpel_ref[1][1] =
803*c83a76b0SSuyog Pawar                     ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
804*c83a76b0SSuyog Pawar                     ref_offset;
805*c83a76b0SSuyog Pawar                 apu1_hpel_ref[1][2] =
806*c83a76b0SSuyog Pawar                     ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
807*c83a76b0SSuyog Pawar                     ref_offset;
808*c83a76b0SSuyog Pawar                 apu1_hpel_ref[1][3] =
809*c83a76b0SSuyog Pawar                     ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
810*c83a76b0SSuyog Pawar                     ref_offset;
811*c83a76b0SSuyog Pawar 
812*c83a76b0SSuyog Pawar                 luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
813*c83a76b0SSuyog Pawar                                        ->s_weight_offset.i2_luma_weight;
814*c83a76b0SSuyog Pawar                 luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l0[i1_ref_idx]
815*c83a76b0SSuyog Pawar                                        ->s_weight_offset.i2_luma_offset;
816*c83a76b0SSuyog Pawar             }
817*c83a76b0SSuyog Pawar             else
818*c83a76b0SSuyog Pawar             {
819*c83a76b0SSuyog Pawar                 i1_ref_idx = ps_pu_node2[i].pu.mv.i1_l1_ref_idx;
820*c83a76b0SSuyog Pawar                 aps_mv[1] = &(ps_pu_node2[i].pu.mv.s_l1_mv);
821*c83a76b0SSuyog Pawar 
822*c83a76b0SSuyog Pawar                 ASSERT(i1_ref_idx >= 0);
823*c83a76b0SSuyog Pawar 
824*c83a76b0SSuyog Pawar                 apu1_hpel_ref[1][0] =
825*c83a76b0SSuyog Pawar                     (UWORD8 *)(ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
826*c83a76b0SSuyog Pawar                                    ->s_yuv_buf_desc.pv_y_buf) +
827*c83a76b0SSuyog Pawar                     ref_offset;  //>ppu1_list_rec_fxfy[0][i1_ref_idx] + ref_offset;
828*c83a76b0SSuyog Pawar                 apu1_hpel_ref[1][1] =
829*c83a76b0SSuyog Pawar                     ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[0] +
830*c83a76b0SSuyog Pawar                     ref_offset;
831*c83a76b0SSuyog Pawar                 apu1_hpel_ref[1][2] =
832*c83a76b0SSuyog Pawar                     ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[1] +
833*c83a76b0SSuyog Pawar                     ref_offset;
834*c83a76b0SSuyog Pawar                 apu1_hpel_ref[1][3] =
835*c83a76b0SSuyog Pawar                     ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]->apu1_y_sub_pel_planes[2] +
836*c83a76b0SSuyog Pawar                     ref_offset;
837*c83a76b0SSuyog Pawar 
838*c83a76b0SSuyog Pawar                 luma_weight_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
839*c83a76b0SSuyog Pawar                                        ->s_weight_offset.i2_luma_weight;
840*c83a76b0SSuyog Pawar                 luma_offset_ref2 = (WORD32)ps_inter_ctb_prms->pps_rec_list_l1[i1_ref_idx]
841*c83a76b0SSuyog Pawar                                        ->s_weight_offset.i2_luma_offset;
842*c83a76b0SSuyog Pawar             }
843*c83a76b0SSuyog Pawar 
844*c83a76b0SSuyog Pawar             if(aps_mv[1]->i2_mvx == INTRA_MV)
845*c83a76b0SSuyog Pawar             {
846*c83a76b0SSuyog Pawar                 uni_cost = ps_pu_node1->i4_tot_cost;
847*c83a76b0SSuyog Pawar                 cur_iter_best_cost = ps_pu_node2[i].i4_tot_cost;
848*c83a76b0SSuyog Pawar 
849*c83a76b0SSuyog Pawar                 if(cur_iter_best_cost < bi_cost)
850*c83a76b0SSuyog Pawar                 {
851*c83a76b0SSuyog Pawar                     bi_cost = cur_iter_best_cost;
852*c83a76b0SSuyog Pawar                     best_cand_in_opp_dir_idx = i;
853*c83a76b0SSuyog Pawar                     is_best_cand_an_intra = 1;
854*c83a76b0SSuyog Pawar                 }
855*c83a76b0SSuyog Pawar 
856*c83a76b0SSuyog Pawar                 best_cost = MIN(uni_cost, bi_cost);
857*c83a76b0SSuyog Pawar                 tot_cost += best_cost;
858*c83a76b0SSuyog Pawar                 continue;
859*c83a76b0SSuyog Pawar             }
860*c83a76b0SSuyog Pawar 
861*c83a76b0SSuyog Pawar             s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[0][j].pu1_pred;
862*c83a76b0SSuyog Pawar             ps_interp_prms->ppu1_ref = &apu1_hpel_ref[0][0];
863*c83a76b0SSuyog Pawar 
864*c83a76b0SSuyog Pawar             ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
865*c83a76b0SSuyog Pawar                 ps_interp_prms, aps_mv[0]->i2_mvx, aps_mv[0]->i2_mvy, 0);
866*c83a76b0SSuyog Pawar 
867*c83a76b0SSuyog Pawar             if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
868*c83a76b0SSuyog Pawar             {
869*c83a76b0SSuyog Pawar                 as_pred_buf_data[0][j].u1_pred_buf_array_id = UCHAR_MAX;
870*c83a76b0SSuyog Pawar                 as_pred_buf_data[0][j].pu1_pred = ps_interp_prms->pu1_final_out;
871*c83a76b0SSuyog Pawar                 as_pred_buf_data[0][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
872*c83a76b0SSuyog Pawar             }
873*c83a76b0SSuyog Pawar 
874*c83a76b0SSuyog Pawar             if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
875*c83a76b0SSuyog Pawar             {
876*c83a76b0SSuyog Pawar                 hme_compute_sigmaX_and_sigmaXSquared(
877*c83a76b0SSuyog Pawar                     as_pred_buf_data[0][j].pu1_pred,
878*c83a76b0SSuyog Pawar                     as_pred_buf_data[0][j].i4_pred_stride,
879*c83a76b0SSuyog Pawar                     &au8_sigmaX[0][j],
880*c83a76b0SSuyog Pawar                     &au8_sigmaXSquared[0][j],
881*c83a76b0SSuyog Pawar                     ps_interp_prms->i4_blk_wd,
882*c83a76b0SSuyog Pawar                     ps_interp_prms->i4_blk_ht,
883*c83a76b0SSuyog Pawar                     ps_interp_prms->i4_blk_wd,
884*c83a76b0SSuyog Pawar                     ps_interp_prms->i4_blk_ht,
885*c83a76b0SSuyog Pawar                     0,
886*c83a76b0SSuyog Pawar                     1);
887*c83a76b0SSuyog Pawar             }
888*c83a76b0SSuyog Pawar 
889*c83a76b0SSuyog Pawar             s_interp_prms.apu1_interp_out[0] = as_pred_buf_data[1][j].pu1_pred;
890*c83a76b0SSuyog Pawar             ps_interp_prms->ppu1_ref = &apu1_hpel_ref[1][0];
891*c83a76b0SSuyog Pawar 
892*c83a76b0SSuyog Pawar             ps_me_optimised_function_list->pf_qpel_interp_avg_generic(
893*c83a76b0SSuyog Pawar                 ps_interp_prms, aps_mv[1]->i2_mvx, aps_mv[1]->i2_mvy, 0);
894*c83a76b0SSuyog Pawar 
895*c83a76b0SSuyog Pawar             if(ps_interp_prms->pu1_final_out != s_interp_prms.apu1_interp_out[0])
896*c83a76b0SSuyog Pawar             {
897*c83a76b0SSuyog Pawar                 as_pred_buf_data[1][j].u1_pred_buf_array_id = UCHAR_MAX;
898*c83a76b0SSuyog Pawar                 as_pred_buf_data[1][j].pu1_pred = ps_interp_prms->pu1_final_out;
899*c83a76b0SSuyog Pawar                 as_pred_buf_data[1][j].i4_pred_stride = ps_interp_prms->i4_final_out_stride;
900*c83a76b0SSuyog Pawar             }
901*c83a76b0SSuyog Pawar 
902*c83a76b0SSuyog Pawar             ps_cmn_utils_optimised_function_list->pf_wt_avg_2d(
903*c83a76b0SSuyog Pawar                 as_pred_buf_data[0][j].pu1_pred,
904*c83a76b0SSuyog Pawar                 as_pred_buf_data[1][j].pu1_pred,
905*c83a76b0SSuyog Pawar                 as_pred_buf_data[0][j].i4_pred_stride,
906*c83a76b0SSuyog Pawar                 as_pred_buf_data[1][j].i4_pred_stride,
907*c83a76b0SSuyog Pawar                 wd,
908*c83a76b0SSuyog Pawar                 ht,
909*c83a76b0SSuyog Pawar                 as_pred_buf_data[2][j].pu1_pred,
910*c83a76b0SSuyog Pawar                 as_pred_buf_data[2][j].i4_pred_stride,
911*c83a76b0SSuyog Pawar                 luma_weight_ref1,
912*c83a76b0SSuyog Pawar                 luma_weight_ref2,
913*c83a76b0SSuyog Pawar                 luma_offset_ref1,
914*c83a76b0SSuyog Pawar                 luma_offset_ref2,
915*c83a76b0SSuyog Pawar                 ps_inter_ctb_prms->wpred_log_wdc);
916*c83a76b0SSuyog Pawar 
917*c83a76b0SSuyog Pawar             if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
918*c83a76b0SSuyog Pawar             {
919*c83a76b0SSuyog Pawar                 hme_compute_sigmaX_and_sigmaXSquared(
920*c83a76b0SSuyog Pawar                     as_pred_buf_data[2][j].pu1_pred,
921*c83a76b0SSuyog Pawar                     as_pred_buf_data[2][j].i4_pred_stride,
922*c83a76b0SSuyog Pawar                     &au8_sigmaX[1][j],
923*c83a76b0SSuyog Pawar                     &au8_sigmaXSquared[1][j],
924*c83a76b0SSuyog Pawar                     ps_interp_prms->i4_blk_wd,
925*c83a76b0SSuyog Pawar                     ps_interp_prms->i4_blk_ht,
926*c83a76b0SSuyog Pawar                     ps_interp_prms->i4_blk_wd,
927*c83a76b0SSuyog Pawar                     ps_interp_prms->i4_blk_ht,
928*c83a76b0SSuyog Pawar                     0,
929*c83a76b0SSuyog Pawar                     1);
930*c83a76b0SSuyog Pawar             }
931*c83a76b0SSuyog Pawar 
932*c83a76b0SSuyog Pawar             s_err_prms.pu1_inp = (U08 *)ps_inter_ctb_prms->pu1_non_wt_inp + inp_offset;
933*c83a76b0SSuyog Pawar             s_err_prms.i4_inp_stride = inp_stride;
934*c83a76b0SSuyog Pawar             s_err_prms.i4_ref_stride = as_pred_buf_data[2][j].i4_pred_stride;
935*c83a76b0SSuyog Pawar             s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
936*c83a76b0SSuyog Pawar             s_err_prms.i4_grid_mask = 1;
937*c83a76b0SSuyog Pawar             s_err_prms.pi4_sad_grid = &i4_sad_grid;
938*c83a76b0SSuyog Pawar             s_err_prms.i4_blk_wd = wd;
939*c83a76b0SSuyog Pawar             s_err_prms.i4_blk_ht = ht;
940*c83a76b0SSuyog Pawar             s_err_prms.pu1_ref = as_pred_buf_data[2][j].pu1_pred;
941*c83a76b0SSuyog Pawar             s_err_prms.ps_cmn_utils_optimised_function_list = ps_cmn_utils_optimised_function_list;
942*c83a76b0SSuyog Pawar 
943*c83a76b0SSuyog Pawar             if(ps_inter_ctb_prms->u1_use_satd)
944*c83a76b0SSuyog Pawar             {
945*c83a76b0SSuyog Pawar                 pf_err_compute = compute_satd_8bit;
946*c83a76b0SSuyog Pawar             }
947*c83a76b0SSuyog Pawar             else
948*c83a76b0SSuyog Pawar             {
949*c83a76b0SSuyog Pawar                 pf_err_compute = ps_me_optimised_function_list->pf_evalsad_pt_npu_mxn_8bit;
950*c83a76b0SSuyog Pawar             }
951*c83a76b0SSuyog Pawar 
952*c83a76b0SSuyog Pawar             pf_err_compute(&s_err_prms);
953*c83a76b0SSuyog Pawar 
954*c83a76b0SSuyog Pawar #if USE_NOISE_TERM_DURING_BICAND_SEARCH
955*c83a76b0SSuyog Pawar             if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
956*c83a76b0SSuyog Pawar             {
957*c83a76b0SSuyog Pawar                 unsigned long u4_shift_val;
958*c83a76b0SSuyog Pawar                 ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX;
959*c83a76b0SSuyog Pawar                 ULWORD64 u8_temp_var, u8_temp_var1;
960*c83a76b0SSuyog Pawar                 S32 i4_bits_req;
961*c83a76b0SSuyog Pawar 
962*c83a76b0SSuyog Pawar                 S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
963*c83a76b0SSuyog Pawar 
964*c83a76b0SSuyog Pawar                 u8_pred_sigmaSquareX = (au8_sigmaX[1][j] * au8_sigmaX[1][j]);
965*c83a76b0SSuyog Pawar                 u8_pred_variance = au8_sigmaXSquared[1][j] - u8_pred_sigmaSquareX;
966*c83a76b0SSuyog Pawar 
967*c83a76b0SSuyog Pawar                 if(e_cu_size == CU_8x8)
968*c83a76b0SSuyog Pawar                 {
969*c83a76b0SSuyog Pawar                     PART_ID_T e_part_id =
970*c83a76b0SSuyog Pawar                         (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1));
971*c83a76b0SSuyog Pawar 
972*c83a76b0SSuyog Pawar                     u4_shift_val = ihevce_calc_stim_injected_variance(
973*c83a76b0SSuyog Pawar                         ps_inter_ctb_prms->pu8_part_src_sigmaX,
974*c83a76b0SSuyog Pawar                         ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
975*c83a76b0SSuyog Pawar                         &u8_src_variance,
976*c83a76b0SSuyog Pawar                         i4_default_src_wt,
977*c83a76b0SSuyog Pawar                         0,
978*c83a76b0SSuyog Pawar                         ps_inter_ctb_prms->wpred_log_wdc,
979*c83a76b0SSuyog Pawar                         e_part_id);
980*c83a76b0SSuyog Pawar                 }
981*c83a76b0SSuyog Pawar                 else
982*c83a76b0SSuyog Pawar                 {
983*c83a76b0SSuyog Pawar                     u4_shift_val = ihevce_calc_stim_injected_variance(
984*c83a76b0SSuyog Pawar                         ps_inter_ctb_prms->pu8_part_src_sigmaX,
985*c83a76b0SSuyog Pawar                         ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
986*c83a76b0SSuyog Pawar                         &u8_src_variance,
987*c83a76b0SSuyog Pawar                         i4_default_src_wt,
988*c83a76b0SSuyog Pawar                         0,
989*c83a76b0SSuyog Pawar                         ps_inter_ctb_prms->wpred_log_wdc,
990*c83a76b0SSuyog Pawar                         e_part_id);
991*c83a76b0SSuyog Pawar                 }
992*c83a76b0SSuyog Pawar 
993*c83a76b0SSuyog Pawar                 u8_pred_variance = u8_pred_variance >> u4_shift_val;
994*c83a76b0SSuyog Pawar 
995*c83a76b0SSuyog Pawar                 GETRANGE64(i4_bits_req, u8_pred_variance);
996*c83a76b0SSuyog Pawar 
997*c83a76b0SSuyog Pawar                 if(i4_bits_req > 27)
998*c83a76b0SSuyog Pawar                 {
999*c83a76b0SSuyog Pawar                     u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27);
1000*c83a76b0SSuyog Pawar                     u8_src_variance = u8_src_variance >> (i4_bits_req - 27);
1001*c83a76b0SSuyog Pawar                 }
1002*c83a76b0SSuyog Pawar 
1003*c83a76b0SSuyog Pawar                 if(u8_src_variance == u8_pred_variance)
1004*c83a76b0SSuyog Pawar                 {
1005*c83a76b0SSuyog Pawar                     u8_temp_var = (1 << STIM_Q_FORMAT);
1006*c83a76b0SSuyog Pawar                 }
1007*c83a76b0SSuyog Pawar                 else
1008*c83a76b0SSuyog Pawar                 {
1009*c83a76b0SSuyog Pawar                     u8_temp_var = (2 * u8_src_variance * u8_pred_variance);
1010*c83a76b0SSuyog Pawar                     u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
1011*c83a76b0SSuyog Pawar                     u8_temp_var1 =
1012*c83a76b0SSuyog Pawar                         (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance);
1013*c83a76b0SSuyog Pawar                     u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
1014*c83a76b0SSuyog Pawar                     u8_temp_var = (u8_temp_var / u8_temp_var1);
1015*c83a76b0SSuyog Pawar                 }
1016*c83a76b0SSuyog Pawar 
1017*c83a76b0SSuyog Pawar                 i4_noise_term = (UWORD32)u8_temp_var;
1018*c83a76b0SSuyog Pawar 
1019*c83a76b0SSuyog Pawar                 i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier;
1020*c83a76b0SSuyog Pawar 
1021*c83a76b0SSuyog Pawar                 ASSERT(i4_noise_term >= 0);
1022*c83a76b0SSuyog Pawar 
1023*c83a76b0SSuyog Pawar                 u8_temp_var = i4_sad_grid;
1024*c83a76b0SSuyog Pawar                 u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term));
1025*c83a76b0SSuyog Pawar                 u8_temp_var += (1 << ((i4_q_level)-1));
1026*c83a76b0SSuyog Pawar                 i4_sad_grid = (UWORD32)(u8_temp_var >> (i4_q_level));
1027*c83a76b0SSuyog Pawar             }
1028*c83a76b0SSuyog Pawar #endif
1029*c83a76b0SSuyog Pawar 
1030*c83a76b0SSuyog Pawar             cur_iter_best_cost = i4_sad_grid;
1031*c83a76b0SSuyog Pawar             cur_iter_best_cost += ps_pu_node1->i4_mv_cost;
1032*c83a76b0SSuyog Pawar             cur_iter_best_cost += ps_pu_node2[i].i4_mv_cost;
1033*c83a76b0SSuyog Pawar 
1034*c83a76b0SSuyog Pawar             if(cur_iter_best_cost < bi_cost)
1035*c83a76b0SSuyog Pawar             {
1036*c83a76b0SSuyog Pawar                 bi_cost = cur_iter_best_cost;
1037*c83a76b0SSuyog Pawar                 best_cand_in_opp_dir_idx = i;
1038*c83a76b0SSuyog Pawar                 is_best_cand_an_intra = 0;
1039*c83a76b0SSuyog Pawar             }
1040*c83a76b0SSuyog Pawar         }
1041*c83a76b0SSuyog Pawar 
1042*c83a76b0SSuyog Pawar         uni_cost = ps_pu_node1->i4_tot_cost;
1043*c83a76b0SSuyog Pawar 
1044*c83a76b0SSuyog Pawar #if USE_NOISE_TERM_DURING_BICAND_SEARCH
1045*c83a76b0SSuyog Pawar         if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
1046*c83a76b0SSuyog Pawar         {
1047*c83a76b0SSuyog Pawar             unsigned long u4_shift_val;
1048*c83a76b0SSuyog Pawar             ULWORD64 u8_src_variance, u8_pred_variance, u8_pred_sigmaSquareX;
1049*c83a76b0SSuyog Pawar             ULWORD64 u8_temp_var, u8_temp_var1;
1050*c83a76b0SSuyog Pawar             S32 i4_bits_req;
1051*c83a76b0SSuyog Pawar 
1052*c83a76b0SSuyog Pawar             S32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
1053*c83a76b0SSuyog Pawar 
1054*c83a76b0SSuyog Pawar             S08 i1_ref_idx =
1055*c83a76b0SSuyog Pawar                 (PRED_L0 == ps_pu_node1->pu.b2_pred_mode)
1056*c83a76b0SSuyog Pawar                     ? ps_inter_ctb_prms->pi1_past_list[ps_pu_node1->pu.mv.i1_l0_ref_idx]
1057*c83a76b0SSuyog Pawar                     : ps_inter_ctb_prms->pi1_future_list[ps_pu_node1->pu.mv.i1_l1_ref_idx];
1058*c83a76b0SSuyog Pawar             S32 i4_sad = ps_pu_node1->i4_tot_cost - ps_pu_node1->i4_mv_cost;
1059*c83a76b0SSuyog Pawar 
1060*c83a76b0SSuyog Pawar             u8_pred_sigmaSquareX = (au8_sigmaX[0][j] * au8_sigmaX[0][j]);
1061*c83a76b0SSuyog Pawar             u8_pred_variance = au8_sigmaXSquared[0][j] - u8_pred_sigmaSquareX;
1062*c83a76b0SSuyog Pawar 
1063*c83a76b0SSuyog Pawar             if(e_cu_size == CU_8x8)
1064*c83a76b0SSuyog Pawar             {
1065*c83a76b0SSuyog Pawar                 PART_ID_T e_part_id =
1066*c83a76b0SSuyog Pawar                     (PART_ID_T)((PART_ID_NxN_TL) + (x_off & 1) + ((y_off & 1) << 1));
1067*c83a76b0SSuyog Pawar 
1068*c83a76b0SSuyog Pawar                 u4_shift_val = ihevce_calc_stim_injected_variance(
1069*c83a76b0SSuyog Pawar                     ps_inter_ctb_prms->pu8_part_src_sigmaX,
1070*c83a76b0SSuyog Pawar                     ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
1071*c83a76b0SSuyog Pawar                     &u8_src_variance,
1072*c83a76b0SSuyog Pawar                     ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx],
1073*c83a76b0SSuyog Pawar                     ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx],
1074*c83a76b0SSuyog Pawar                     ps_inter_ctb_prms->wpred_log_wdc,
1075*c83a76b0SSuyog Pawar                     e_part_id);
1076*c83a76b0SSuyog Pawar             }
1077*c83a76b0SSuyog Pawar             else
1078*c83a76b0SSuyog Pawar             {
1079*c83a76b0SSuyog Pawar                 u4_shift_val = ihevce_calc_stim_injected_variance(
1080*c83a76b0SSuyog Pawar                     ps_inter_ctb_prms->pu8_part_src_sigmaX,
1081*c83a76b0SSuyog Pawar                     ps_inter_ctb_prms->pu8_part_src_sigmaXSquared,
1082*c83a76b0SSuyog Pawar                     &u8_src_variance,
1083*c83a76b0SSuyog Pawar                     ps_inter_ctb_prms->pi4_inv_wt[i1_ref_idx],
1084*c83a76b0SSuyog Pawar                     ps_inter_ctb_prms->pi4_inv_wt_shift_val[i1_ref_idx],
1085*c83a76b0SSuyog Pawar                     ps_inter_ctb_prms->wpred_log_wdc,
1086*c83a76b0SSuyog Pawar                     e_part_id);
1087*c83a76b0SSuyog Pawar             }
1088*c83a76b0SSuyog Pawar 
1089*c83a76b0SSuyog Pawar             u8_pred_variance = u8_pred_variance >> (u4_shift_val);
1090*c83a76b0SSuyog Pawar 
1091*c83a76b0SSuyog Pawar             GETRANGE64(i4_bits_req, u8_pred_variance);
1092*c83a76b0SSuyog Pawar 
1093*c83a76b0SSuyog Pawar             if(i4_bits_req > 27)
1094*c83a76b0SSuyog Pawar             {
1095*c83a76b0SSuyog Pawar                 u8_pred_variance = u8_pred_variance >> (i4_bits_req - 27);
1096*c83a76b0SSuyog Pawar                 u8_src_variance = u8_src_variance >> (i4_bits_req - 27);
1097*c83a76b0SSuyog Pawar             }
1098*c83a76b0SSuyog Pawar 
1099*c83a76b0SSuyog Pawar             if(u8_src_variance == u8_pred_variance)
1100*c83a76b0SSuyog Pawar             {
1101*c83a76b0SSuyog Pawar                 u8_temp_var = (1 << STIM_Q_FORMAT);
1102*c83a76b0SSuyog Pawar             }
1103*c83a76b0SSuyog Pawar             else
1104*c83a76b0SSuyog Pawar             {
1105*c83a76b0SSuyog Pawar                 u8_temp_var = (2 * u8_src_variance * u8_pred_variance);
1106*c83a76b0SSuyog Pawar                 u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
1107*c83a76b0SSuyog Pawar                 u8_temp_var1 =
1108*c83a76b0SSuyog Pawar                     (u8_src_variance * u8_src_variance) + (u8_pred_variance * u8_pred_variance);
1109*c83a76b0SSuyog Pawar                 u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
1110*c83a76b0SSuyog Pawar                 u8_temp_var = (u8_temp_var / u8_temp_var1);
1111*c83a76b0SSuyog Pawar             }
1112*c83a76b0SSuyog Pawar 
1113*c83a76b0SSuyog Pawar             i4_noise_term = (UWORD32)u8_temp_var;
1114*c83a76b0SSuyog Pawar 
1115*c83a76b0SSuyog Pawar             i4_noise_term *= ps_inter_ctb_prms->i4_alpha_stim_multiplier;
1116*c83a76b0SSuyog Pawar 
1117*c83a76b0SSuyog Pawar             ASSERT(i4_noise_term >= 0);
1118*c83a76b0SSuyog Pawar 
1119*c83a76b0SSuyog Pawar             u8_temp_var = i4_sad;
1120*c83a76b0SSuyog Pawar             u8_temp_var *= ((1 << (i4_q_level)) - (i4_noise_term));
1121*c83a76b0SSuyog Pawar             u8_temp_var += (1 << ((i4_q_level)-1));
1122*c83a76b0SSuyog Pawar             i4_sad = (UWORD32)(u8_temp_var >> (i4_q_level));
1123*c83a76b0SSuyog Pawar 
1124*c83a76b0SSuyog Pawar             uni_cost = i4_sad + ps_pu_node1->i4_mv_cost;
1125*c83a76b0SSuyog Pawar 
1126*c83a76b0SSuyog Pawar             pu8_winning_pred_sigmaX[j] = au8_sigmaX[0][j];
1127*c83a76b0SSuyog Pawar             pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[0][j];
1128*c83a76b0SSuyog Pawar         }
1129*c83a76b0SSuyog Pawar #endif
1130*c83a76b0SSuyog Pawar 
1131*c83a76b0SSuyog Pawar         if((bi_cost < uni_cost) && (!is_best_cand_an_intra))
1132*c83a76b0SSuyog Pawar         {
1133*c83a76b0SSuyog Pawar             if(u1_is_cu_noisy && ps_inter_ctb_prms->i4_alpha_stim_multiplier)
1134*c83a76b0SSuyog Pawar             {
1135*c83a76b0SSuyog Pawar                 pu8_winning_pred_sigmaX[j] = au8_sigmaX[1][j];
1136*c83a76b0SSuyog Pawar                 pu8_winning_pred_sigmaXSquare[j] = au8_sigmaXSquared[1][j];
1137*c83a76b0SSuyog Pawar             }
1138*c83a76b0SSuyog Pawar 
1139*c83a76b0SSuyog Pawar             if(PRED_L0 == ps_pu_node1->pu.b2_pred_mode)
1140*c83a76b0SSuyog Pawar             {
1141*c83a76b0SSuyog Pawar                 ps_pu_node1->pu.b2_pred_mode = PRED_BI;
1142*c83a76b0SSuyog Pawar 
1143*c83a76b0SSuyog Pawar                 if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode)
1144*c83a76b0SSuyog Pawar                 {
1145*c83a76b0SSuyog Pawar                     ps_pu_node1->pu.mv.i1_l1_ref_idx =
1146*c83a76b0SSuyog Pawar                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx;
1147*c83a76b0SSuyog Pawar                     ps_pu_node1->pu.mv.s_l1_mv.i2_mvx =
1148*c83a76b0SSuyog Pawar                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx;
1149*c83a76b0SSuyog Pawar                     ps_pu_node1->pu.mv.s_l1_mv.i2_mvy =
1150*c83a76b0SSuyog Pawar                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy;
1151*c83a76b0SSuyog Pawar                 }
1152*c83a76b0SSuyog Pawar                 else
1153*c83a76b0SSuyog Pawar                 {
1154*c83a76b0SSuyog Pawar                     ps_pu_node1->pu.mv.i1_l1_ref_idx =
1155*c83a76b0SSuyog Pawar                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx;
1156*c83a76b0SSuyog Pawar                     ps_pu_node1->pu.mv.s_l1_mv.i2_mvx =
1157*c83a76b0SSuyog Pawar                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx;
1158*c83a76b0SSuyog Pawar                     ps_pu_node1->pu.mv.s_l1_mv.i2_mvy =
1159*c83a76b0SSuyog Pawar                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy;
1160*c83a76b0SSuyog Pawar                 }
1161*c83a76b0SSuyog Pawar             }
1162*c83a76b0SSuyog Pawar             else
1163*c83a76b0SSuyog Pawar             {
1164*c83a76b0SSuyog Pawar                 ps_pu_node1->pu.b2_pred_mode = PRED_BI;
1165*c83a76b0SSuyog Pawar 
1166*c83a76b0SSuyog Pawar                 if(PRED_L0 == ps_pu_node2[best_cand_in_opp_dir_idx].pu.b2_pred_mode)
1167*c83a76b0SSuyog Pawar                 {
1168*c83a76b0SSuyog Pawar                     ps_pu_node1->pu.mv.i1_l0_ref_idx =
1169*c83a76b0SSuyog Pawar                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l0_ref_idx;
1170*c83a76b0SSuyog Pawar                     ps_pu_node1->pu.mv.s_l0_mv.i2_mvx =
1171*c83a76b0SSuyog Pawar                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvx;
1172*c83a76b0SSuyog Pawar                     ps_pu_node1->pu.mv.s_l0_mv.i2_mvy =
1173*c83a76b0SSuyog Pawar                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l0_mv.i2_mvy;
1174*c83a76b0SSuyog Pawar                 }
1175*c83a76b0SSuyog Pawar                 else
1176*c83a76b0SSuyog Pawar                 {
1177*c83a76b0SSuyog Pawar                     ps_pu_node1->pu.mv.i1_l0_ref_idx =
1178*c83a76b0SSuyog Pawar                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.i1_l1_ref_idx;
1179*c83a76b0SSuyog Pawar                     ps_pu_node1->pu.mv.s_l0_mv.i2_mvx =
1180*c83a76b0SSuyog Pawar                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvx;
1181*c83a76b0SSuyog Pawar                     ps_pu_node1->pu.mv.s_l0_mv.i2_mvy =
1182*c83a76b0SSuyog Pawar                         ps_pu_node2[best_cand_in_opp_dir_idx].pu.mv.s_l1_mv.i2_mvy;
1183*c83a76b0SSuyog Pawar                 }
1184*c83a76b0SSuyog Pawar             }
1185*c83a76b0SSuyog Pawar 
1186*c83a76b0SSuyog Pawar             ps_part_type_result->as_pu_results[j].i4_tot_cost = bi_cost;
1187*c83a76b0SSuyog Pawar         }
1188*c83a76b0SSuyog Pawar 
1189*c83a76b0SSuyog Pawar         best_cost = MIN(uni_cost, bi_cost);
1190*c83a76b0SSuyog Pawar         tot_cost += best_cost;
1191*c83a76b0SSuyog Pawar     }
1192*c83a76b0SSuyog Pawar 
1193*c83a76b0SSuyog Pawar     hme_debrief_bipred_eval(
1194*c83a76b0SSuyog Pawar         ps_part_type_result,
1195*c83a76b0SSuyog Pawar         as_pred_buf_data,
1196*c83a76b0SSuyog Pawar         &ps_inter_ctb_prms->s_pred_buf_mngr,
1197*c83a76b0SSuyog Pawar         au1_pred_buf_array_indixes,
1198*c83a76b0SSuyog Pawar         ps_cmn_utils_optimised_function_list);
1199*c83a76b0SSuyog Pawar 
1200*c83a76b0SSuyog Pawar     ps_part_type_result->i4_tot_cost = tot_cost;
1201*c83a76b0SSuyog Pawar }
1202*c83a76b0SSuyog Pawar 
/**
********************************************************************************
*  @fn     WORD32 hme_evalsatd_pt_pu_8x8_tu_rec
*
*  @brief  Evaluates the SATD of the 2Nx2N partition of an 8x8 CU through the
*          pf_had_8x8_using_4_4x4_r function of the selector and records the
*          SATD, TU split flag and early CBF flag of that partition
*
*  @param[inout]  ps_prms: error prms containing inp and ref ptrs, strides,
*                 working memory and the sad grid / tu split / early cbf
*                 arrays that get updated; i4_tu_split_cost is reset to 0 and
*                 its address is handed to the HAD function
*
*  @param[in]  lambda: not referenced in this function
*
*  @param[in]  lambda_q_shift: not referenced in this function
*
*  @param[in]  i4_frm_qstep: passed on unchanged to the HAD function
*
*  @param[in]  ps_func_selector: provides pf_had_8x8_using_4_4x4_r
*
*  @return  Value returned by the HAD function with its two least significant
*           bits (tu split flag and early cbf flag) shifted out
********************************************************************************
*/
WORD32 hme_evalsatd_pt_pu_8x8_tu_rec(
    err_prms_t *ps_prms,
    WORD32 lambda,
    WORD32 lambda_q_shift,
    WORD32 i4_frm_qstep,
    me_func_selector_t *ps_func_selector)
{
    /* Storage the HAD function is pointed at: four 4x4 SATDs, one 8x8 SATD */
    S32 ai4_satd_of_4x4s[4];
    S32 i4_satd_of_8x8;

    /* 8x8 level flags whose addresses are handed to the HAD function */
    S32 i4_split_flag_8x8 = 0;
    S32 i4_early_cbf_8x8 = 0;

    /* Cost and flags as returned (packed into one word) by the HAD function */
    S32 i4_packed_result;

    /* Pointer tables indexed by HAD size */
    S32 *api4_satd[HAD_32x32 + 1];
    S32 *api4_split[HAD_32x32 + 1];
    S32 *api4_cbf[HAD_32x32 + 1];

    /* Per partition output arrays supplied by the caller */
    S32 *pi4_grid_out = ps_prms->pi4_sad_grid;
    S32 *pi4_split_out = ps_prms->pi4_tu_split_flags;
    S32 *pi4_cbf_out = ps_prms->pi4_tu_early_cbf;

    U08 *pu1_cur = ps_prms->pu1_inp;
    U08 *pu1_pred = ps_prms->pu1_ref;
    S32 cur_stride = ps_prms->i4_inp_stride;
    S32 pred_stride = ps_prms->i4_ref_stride;

    /* Working memory of the prms is used as the Hadamard coefficient buffer */
    S16 *pi2_had_coeffs;

    /* TU split cost starts from zero before the HAD function is called */
    ps_prms->i4_tu_split_cost = 0;
    pi2_had_coeffs = (S16 *)ps_prms->pu1_wkg_mem;

    /* 4x4 level: only SATD storage is supplied */
    api4_satd[HAD_4x4] = &ai4_satd_of_4x4s[0];
    api4_split[HAD_4x4] = NULL;
    api4_cbf[HAD_4x4] = NULL;

    /* 8x8 level: SATD, split flag and early cbf storage are supplied */
    api4_satd[HAD_8x8] = &i4_satd_of_8x8;
    api4_split[HAD_8x8] = &i4_split_flag_8x8;
    api4_cbf[HAD_8x8] = &i4_early_cbf_8x8;

    /* 16x16 and 32x32 levels are not used for an 8x8 CU */
    api4_satd[HAD_16x16] = NULL;
    api4_split[HAD_16x16] = NULL;
    api4_cbf[HAD_16x16] = NULL;

    api4_satd[HAD_32x32] = NULL;
    api4_split[HAD_32x32] = NULL;
    api4_cbf[HAD_32x32] = NULL;

    /* NOTE(review): the meaning of the literal arguments (8, 0, 2, 0, 0, 0 */
    /* and the trailing NULL) is defined by pf_had_8x8_using_4_4x4_r and is */
    /* not visible here - confirm against that function's prototype         */
    i4_packed_result = ps_func_selector->pf_had_8x8_using_4_4x4_r(
        pu1_cur,
        cur_stride,
        pu1_pred,
        pred_stride,
        pi2_had_coeffs,
        8,
        api4_satd,
        api4_split,
        api4_cbf,
        0,
        2,
        0,
        0,
        i4_frm_qstep,
        0,
        ps_prms->u1_max_tr_depth,
        ps_prms->u1_max_tr_size,
        &(ps_prms->i4_tu_split_cost),
        NULL);

    /* Only the 2Nx2N partition is updated for an 8x8 CU.                   */
    /* Bit 1 of the packed result is the tu split flag, bit 0 is the early  */
    /* cbf flag; the remaining upper bits form the returned cost            */
    pi4_grid_out[PART_ID_2Nx2N] = i4_satd_of_8x8;
    pi4_split_out[PART_ID_2Nx2N] = (i4_packed_result & 0x3) >> 1;
    pi4_cbf_out[PART_ID_2Nx2N] = (i4_packed_result & 0x1);

    return i4_packed_result >> 2;
}
1297*c83a76b0SSuyog Pawar //#endif
/**
********************************************************************************
*  @fn     void hme_evalsatd_update_2_best_results_pt_pu_16x16
*
*  @brief  Evaluates the SATD of all the partitions of a 16x16 CU based on
*          Hadamard 8x8 and 4x4 satds, and updates the best and second best
*          results of every valid partition
*
*  @param[inout]  ps_prms: error prms containing current and ref ptr, strides,
*                 pointer to sad grid of each partitions
*
*  @param[inout]  ps_result_prms: result update prms giving the mv and ref idx
*                 of the candidate and the subpel refine ctxt that is updated
*
*  @return     None
********************************************************************************
*/
1311*c83a76b0SSuyog Pawar 
void hme_evalsatd_update_2_best_results_pt_pu_16x16(
    err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
{
    /* Fills the sad grid of a 16x16 CU from 8x8 / 4x4 SATDs, then inserts   */
    /* the candidate of ps_result_prms into the two entry (best, second      */
    /* best) result list of every valid partition                            */

    S32 ai4_satd_4x4[16]; /* one SATD per 4x4 of the 16x16, 4 per row */
    S32 ai4_satd_8x8[4]; /* one SATD per 8x8 of the 16x16, 2 per row */
    S32 i4_satd_16x16; /* storage for the 16x16 level; not read here */
    S16 ai2_had_coeffs[256]; /* 16x16 Hadamard output, stride 16 */

    /* Sums of the 4x4 SATDs along each column / row of 4x4 blocks */
    S32 ai4_col_sum[4] = { 0, 0, 0, 0 };
    S32 ai4_row_sum[4] = { 0, 0, 0, 0 };

    /* SATDs of the left / right / top / bottom halves of the CU */
    S32 i4_left, i4_right, i4_top, i4_bottom;

    S32 blk_row, blk_col, part_idx;

    /* Pointer table indexed by HAD size, handed to the 8x8 HAD function */
    S32 *api4_satd_pu[HAD_32x32 + 1];

    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
    U08 *pu1_inp = ps_prms->pu1_inp;
    U08 *pu1_ref = ps_prms->pu1_ref;
    S32 inp_stride = ps_prms->i4_inp_stride;
    S32 ref_stride = ps_prms->i4_ref_stride;

    mv_refine_ctxt_t *ps_ctxt = ps_result_prms->ps_subpel_refine_ctxt;

    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
    api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */

    /* One 8x8 HAD call per quadrant, in raster order (TL, TR, BL, BR) */
    for(blk_row = 0; blk_row < 2; blk_row++)
    {
        for(blk_col = 0; blk_col < 2; blk_col++)
        {
            U08 *pu1_src_blk = pu1_inp + blk_col * 8 + blk_row * inp_stride * 8;
            U08 *pu1_pred_blk = pu1_ref + blk_col * 8 + blk_row * ref_stride * 8;
            S16 *pi2_dst_blk = ai2_had_coeffs + blk_col * 8 + blk_row * 16 * 8;

            /* NOTE(review): looks like x in the low bits and y scaled by   */
            /* (1 << 16), both in units of 4x4 blocks - confirm against     */
            /* ihevce_had_8x8_using_4_4x4                                   */
            S32 pos_x_y_4x4 = blk_col * 2 + blk_row * (2 << 16);

            ihevce_had_8x8_using_4_4x4(
                pu1_src_blk,
                inp_stride,
                pu1_pred_blk,
                ref_stride,
                pi2_dst_blk,
                16,
                api4_satd_pu,
                pos_x_y_4x4,
                4);
        }
    }

    /* Accumulate the 4x4 SATDs per column and per row of 4x4 blocks */
    for(blk_row = 0; blk_row < 4; blk_row++)
    {
        for(blk_col = 0; blk_col < 4; blk_col++)
        {
            S32 i4_satd = ai4_satd_4x4[blk_row * 4 + blk_col];

            ai4_col_sum[blk_col] += i4_satd;
            ai4_row_sum[blk_row] += i4_satd;
        }
    }

    /* Halves of the CU built out of the 8x8 SATDs */
    i4_left = ai4_satd_8x8[0] + ai4_satd_8x8[2];
    i4_right = ai4_satd_8x8[1] + ai4_satd_8x8[3];
    i4_top = ai4_satd_8x8[0] + ai4_satd_8x8[1];
    i4_bottom = ai4_satd_8x8[2] + ai4_satd_8x8[3];

    /* 16x16 : sum of the four 8x8 SATDs */
    pi4_sad_grid[PART_ID_2Nx2N] = i4_top + i4_bottom;

    /* 8x8 partitions */
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];

    /* 8x16 and 16x8 partitions */
    pi4_sad_grid[PART_ID_Nx2N_L] = i4_left;
    pi4_sad_grid[PART_ID_Nx2N_R] = i4_right;
    pi4_sad_grid[PART_ID_2NxN_T] = i4_top;
    pi4_sad_grid[PART_ID_2NxN_B] = i4_bottom;

    /* AMP 4x16 / 12x16 : one column of 4x4s, plus an 8 wide half if needed */
    pi4_sad_grid[PART_ID_nLx2N_L] = ai4_col_sum[0];
    pi4_sad_grid[PART_ID_nLx2N_R] = ai4_col_sum[1] + i4_right;
    pi4_sad_grid[PART_ID_nRx2N_L] = ai4_col_sum[2] + i4_left;
    pi4_sad_grid[PART_ID_nRx2N_R] = ai4_col_sum[3];

    /* AMP 16x4 / 16x12 : one row of 4x4s, plus an 8 high half if needed */
    pi4_sad_grid[PART_ID_2NxnU_T] = ai4_row_sum[0];
    pi4_sad_grid[PART_ID_2NxnU_B] = ai4_row_sum[1] + i4_bottom;
    pi4_sad_grid[PART_ID_2NxnD_T] = ai4_row_sum[2] + i4_top;
    pi4_sad_grid[PART_ID_2NxnD_B] = ai4_row_sum[3];

    /* Insert the candidate into the (best, second best) pair of each valid */
    /* partition                                                            */
    for(part_idx = 0; part_idx < ps_ctxt->i4_num_valid_parts; part_idx++)
    {
        S32 part_id = ps_ctxt->ai4_part_id[part_idx];

        /* Result arrays are addressed by part id when more than 8 parts    */
        /* are valid, else by the position in the valid part list           */
        S32 slot = (ps_ctxt->i4_num_valid_parts > 8) ? part_id : part_idx;

        /* mv cost is the one already stored with the best result, it is    */
        /* not evaluated afresh                                             */
        S32 i4_mv_cost = ps_ctxt->i2_mv_cost[0][slot];

        S32 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
        S32 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);

        S32 i4_best_cost = CLIP_S16(ps_ctxt->i2_tot_cost[0][slot]);
        S32 i4_second_best_cost = CLIP_S16(ps_ctxt->i2_tot_cost[1][slot]);

        S32 rank;

        /* Not better than the second best : nothing to do */
        if(i4_tot_cost >= i4_second_best_cost)
        {
            continue;
        }

        if(i4_tot_cost < i4_best_cost)
        {
            /* New best : current best is pushed down to second best */
            ps_ctxt->i2_tot_cost[1][slot] = ps_ctxt->i2_tot_cost[0][slot];
            ps_ctxt->i2_mv_cost[1][slot] = ps_ctxt->i2_mv_cost[0][slot];
            ps_ctxt->i2_mv_x[1][slot] = ps_ctxt->i2_mv_x[0][slot];
            ps_ctxt->i2_mv_y[1][slot] = ps_ctxt->i2_mv_y[0][slot];
            ps_ctxt->i2_ref_idx[1][slot] = ps_ctxt->i2_ref_idx[0][slot];

            rank = 0;
        }
        else if(i4_tot_cost != ps_ctxt->i2_tot_cost[0][slot])
        {
            /* Between best and second best : replaces the second best */
            rank = 1;
        }
        else
        {
            /* Same cost as the best result : lists are left untouched */
            continue;
        }

        ps_ctxt->i2_tot_cost[rank][slot] = i4_tot_cost;
        ps_ctxt->i2_mv_cost[rank][slot] = i4_mv_cost;
        ps_ctxt->i2_mv_x[rank][slot] = ps_result_prms->i2_mv_x;
        ps_ctxt->i2_mv_y[rank][slot] = ps_result_prms->i2_mv_y;
        ps_ctxt->i2_ref_idx[rank][slot] = ps_result_prms->i1_ref_idx;
    }
}
1483*c83a76b0SSuyog Pawar 
1484*c83a76b0SSuyog Pawar //#if COMPUTE_16x16_R == C
/**
********************************************************************************
*  @fn     void hme_evalsatd_update_1_best_result_pt_pu_16x16
*
*  @brief  Computes the SATD of every partition of a 16x16 CU from four 8x8
*          evaluations (ihevce_had_8x8_using_4_4x4) and then, for each valid
*          partition, overwrites the single stored best result whenever the
*          new candidate has a strictly lower total cost
*
*  @param[inout]  ps_prms: error prms supplying input and reference pointers,
*                 their strides and the sad grid that receives one SATD per
*                 partition id
*
*  @param[inout]  ps_result_prms: supplies the candidate mv / ref idx and the
*                 subpel refine context holding the per-partition best result
*
*  @return     None
********************************************************************************
*/
void hme_evalsatd_update_1_best_result_pt_pu_16x16(
    err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
{
    S32 ai4_satd_4x4[16]; /* num 4x4s in a 16x16 */
    S32 ai4_satd_8x8[4]; /* num 8x8s in a 16x16 */
    S32 i4_satd_16x16; /* 16x16 slot offered to the HAD helper; never read here */
    S16 ai2_8x8_had[256]; /* scratch for the helper's output, addressed with stride 16 */
    S32 i;

    /* Table of ptrs through which the HAD helper reports per-level SATDs */
    S32 *api4_satd_pu[HAD_32x32 + 1];
    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;

    U08 *pu1_inp = ps_prms->pu1_inp;
    U08 *pu1_ref = ps_prms->pu1_ref;

    S32 inp_stride = ps_prms->i4_inp_stride;
    S32 ref_stride = ps_prms->i4_ref_stride;

    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
    api4_satd_pu[HAD_16x16] = &i4_satd_16x16;
    api4_satd_pu[HAD_32x32] = NULL; /* 32x32 not used for 16x16 subpel refine */

    /* Evaluate the four 8x8 quadrants in raster order (TL, TR, BL, BR) */
    for(i = 0; i < 4; i++)
    {
        S32 blk_x = i & 0x01;
        S32 blk_y = i >> 1;

        U08 *pu1_src = pu1_inp + blk_x * 8 + blk_y * inp_stride * 8;
        U08 *pu1_pred = pu1_ref + blk_x * 8 + blk_y * ref_stride * 8;
        S16 *pi2_y0 = ai2_8x8_had + blk_x * 8 + blk_y * 16 * 8;

        /* NOTE(review): looks like a packed 4x4 position, x in the low bits */
        /* and y from bit 16 upwards - confirm against the helper            */
        S32 pos_x_y_4x4 = blk_x * 2 + blk_y * (2 << 16);

        ihevce_had_8x8_using_4_4x4(
            pu1_src, inp_stride, pu1_pred, ref_stride, pi2_y0, 16, api4_satd_pu, pos_x_y_4x4, 4);
    }

    /* For SATD computation following TU size are assumed for a 16x16 CU */
    /* 8 for 2Nx2N/NxN/Nx2N/2NxN (sums of 8x8 SATDs) and 4x4 SATDs for   */
    /* the narrow side of the AMP partitions                             */

    /* Update 16x16 SATD */
    pi4_sad_grid[PART_ID_2Nx2N] =
        ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];

    /* Update 8x8 SATDs */
    pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
    pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
    pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
    pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];

    /* Update 8x16 / 16x8 SATDs */
    pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
    pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
    pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
    pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];

    /* Update AMP SATDs 16x12,16x4, 12x16,4x16                            */
    /* The 4x4 indices follow whatever order the HAD helper stores them   */
    /* in; that layout is not visible in this function                    */
    pi4_sad_grid[PART_ID_nLx2N_L] =
        ai4_satd_4x4[0] + ai4_satd_4x4[2] + ai4_satd_4x4[8] + ai4_satd_4x4[10];
    pi4_sad_grid[PART_ID_nRx2N_R] =
        ai4_satd_4x4[5] + ai4_satd_4x4[7] + ai4_satd_4x4[13] + ai4_satd_4x4[15];
    pi4_sad_grid[PART_ID_2NxnU_T] =
        ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
    pi4_sad_grid[PART_ID_2NxnD_B] =
        ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];

    /* The wide side of each AMP pair is the remainder of the full block */
    pi4_sad_grid[PART_ID_nLx2N_R] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
    pi4_sad_grid[PART_ID_nRx2N_L] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
    pi4_sad_grid[PART_ID_2NxnU_B] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
    pi4_sad_grid[PART_ID_2NxnD_T] = pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];

    /* Update the stored best result of every valid partition */
    {
        mv_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
        S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
        S32 i4_count;

        for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
        {
            S32 part_id = pi4_valid_part_ids[i4_count];

            /* Results are indexed by part id when more than 8 parts are valid, */
            /* otherwise by position in the valid-part list                     */
            S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;

            /* Use a pre-computed cost instead of freshly evaluating subpel cost */
            S32 i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];

            /* Total cost, saturated so that it fits the stored cost fields */
            S32 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
            S32 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);

            S32 best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);

            /*****************************************************************/
            /* Only one result is kept per partition: replace it when the    */
            /* candidate is strictly cheaper. A tie, or a candidate that     */
            /* saturated to SHRT_MAX, leaves the stored result untouched.    */
            /*****************************************************************/
            if((i4_tot_cost < SHRT_MAX) && (i4_tot_cost < best_node_cost))
            {
                ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
                ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
                ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
                ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
                ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
            }
        }
    }
}
1635*c83a76b0SSuyog Pawar 
/**
********************************************************************************
*  @fn     WORD32 hme_evalsatd_pt_pu_16x16_tu_rec
*
*  @brief  Runs the 16x16 recursive HAD function selected in ps_func_selector
*          on the input/reference pair described by ps_prms and publishes the
*          16x16 level TU split flag, early cbf flag and TU split cost
*
*  @param[inout]  ps_prms: error prms; input/ref pointers, strides, working
*                 memory and max transform depth/size are read, while
*                 i4_tu_split_cost, pi4_tu_split_flags[0] and
*                 pi4_tu_early_cbf[0] are written
*
*  @param[in]  lambda, lambda_q_shift, i4_frm_qstep: forwarded unchanged to
*              the HAD function
*
*  @param[in]  ps_func_selector: provides pf_had_16x16_r
*
*  @return     Value held in the 16x16 SATD slot after the HAD call
********************************************************************************
*/
WORD32 hme_evalsatd_pt_pu_16x16_tu_rec(
    err_prms_t *ps_prms,
    WORD32 lambda,
    WORD32 lambda_q_shift,
    WORD32 i4_frm_qstep,
    me_func_selector_t *ps_func_selector)
{
    /* SATD storage, one buffer per transform level */
    S32 ai4_cost_4x4[16]; /* num 4x4s in a 16x16 */
    S32 ai4_cost_8x8[4]; /* num 8x8s in a 16x16 */
    S32 i4_cost_16x16;

    /* TU split decision storage */
    S32 ai4_split_8x8[16];
    S32 i4_split_16x16 = 0;

    /* Early cbf decision storage */
    S32 ai4_cbf_8x8[16];
    S32 i4_cbf_16x16 = 0;

    /* Per-level pointer tables handed to the HAD function */
    S32 *api4_cost[HAD_32x32 + 1];
    S32 *api4_split[HAD_32x32 + 1];
    S32 *api4_cbf[HAD_32x32 + 1];

    /* Hadamard coefficients go to the caller-provided working memory */
    S16 *pi2_coeffs = (S16 *)ps_prms->pu1_wkg_mem;

    /* The HAD function receives a pointer to this cost; start it at zero */
    ps_prms->i4_tu_split_cost = 0;

    /* 4x4 level: only SATDs are collected */
    api4_cost[HAD_4x4] = &ai4_cost_4x4[0];
    api4_split[HAD_4x4] = NULL;
    api4_cbf[HAD_4x4] = NULL;

    /* 8x8 level */
    api4_cost[HAD_8x8] = &ai4_cost_8x8[0];
    api4_split[HAD_8x8] = &ai4_split_8x8[0];
    api4_cbf[HAD_8x8] = &ai4_cbf_8x8[0];

    /* 16x16 level: single-entry outputs */
    api4_cost[HAD_16x16] = &i4_cost_16x16;
    api4_split[HAD_16x16] = &i4_split_16x16;
    api4_cbf[HAD_16x16] = &i4_cbf_16x16;

    /* 32x32 level is not used for 16x16 subpel refine */
    api4_cost[HAD_32x32] = NULL;
    api4_split[HAD_32x32] = NULL;
    api4_cbf[HAD_32x32] = NULL;

    /* Recursive 16x16 HAD; fills the tables set up above */
    ps_func_selector->pf_had_16x16_r(
        ps_prms->pu1_inp,
        ps_prms->i4_inp_stride,
        ps_prms->pu1_ref,
        ps_prms->i4_ref_stride,
        pi2_coeffs,
        16,
        api4_cost,
        api4_split,
        api4_cbf,
        0,
        4,
        lambda,
        lambda_q_shift,
        i4_frm_qstep,
        0,
        ps_prms->u1_max_tr_depth,
        ps_prms->u1_max_tr_size,
        &(ps_prms->i4_tu_split_cost),
        NULL);

    /* Publish the 16x16 level decisions to the caller */
    ps_prms->pi4_tu_split_flags[0] = i4_split_16x16;
    ps_prms->pi4_tu_early_cbf[0] = i4_cbf_16x16;

    return i4_cost_16x16;
}
1717*c83a76b0SSuyog Pawar 
1718*c83a76b0SSuyog Pawar /**
1719*c83a76b0SSuyog Pawar ********************************************************************************
1720*c83a76b0SSuyog Pawar *  @fn     S32 hme_evalsatd_pt_pu_32x32
1721*c83a76b0SSuyog Pawar *
1722*c83a76b0SSuyog Pawar *  @brief  Evaluates the SATD with partial updates for all the best partitions
1723*c83a76b0SSuyog Pawar *          of a 32x32 CU based on recursive Hadamard 16x16, 8x8 and 4x4 satds
1724*c83a76b0SSuyog Pawar *
1725*c83a76b0SSuyog Pawar *  @param[inout]  ps_prms: error prms containg current and ref ptr, strides,
1726*c83a76b0SSuyog Pawar *                 pointer to sad grid of each partitions
1727*c83a76b0SSuyog Pawar *
1728*c83a76b0SSuyog Pawar *  @return     None
1729*c83a76b0SSuyog Pawar ********************************************************************************
1730*c83a76b0SSuyog Pawar */
hme_evalsatd_pt_pu_32x32(err_prms_t * ps_prms)1731*c83a76b0SSuyog Pawar void hme_evalsatd_pt_pu_32x32(err_prms_t *ps_prms)
1732*c83a76b0SSuyog Pawar {
1733*c83a76b0SSuyog Pawar     //S32 ai4_satd_4x4[64];   /* num 4x4s in a 32x32 */
1734*c83a76b0SSuyog Pawar     S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */
1735*c83a76b0SSuyog Pawar     S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
1736*c83a76b0SSuyog Pawar     S32 i4_satd_32x32;
1737*c83a76b0SSuyog Pawar     //    S16 ai2_had_out[32*32];
1738*c83a76b0SSuyog Pawar     U08 *pu1_src;
1739*c83a76b0SSuyog Pawar     U08 *pu1_pred;
1740*c83a76b0SSuyog Pawar     S32 i;
1741*c83a76b0SSuyog Pawar 
1742*c83a76b0SSuyog Pawar     /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1743*c83a76b0SSuyog Pawar     S32 *api4_satd_pu[HAD_32x32 + 1];
1744*c83a76b0SSuyog Pawar     S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1745*c83a76b0SSuyog Pawar 
1746*c83a76b0SSuyog Pawar     U08 *pu1_inp = ps_prms->pu1_inp;
1747*c83a76b0SSuyog Pawar     U08 *pu1_ref = ps_prms->pu1_ref;
1748*c83a76b0SSuyog Pawar 
1749*c83a76b0SSuyog Pawar     S32 inp_stride = ps_prms->i4_inp_stride;
1750*c83a76b0SSuyog Pawar     S32 ref_stride = ps_prms->i4_ref_stride;
1751*c83a76b0SSuyog Pawar 
1752*c83a76b0SSuyog Pawar     //api4_satd_pu[HAD_4x4]   = &ai4_satd_4x4[0];
1753*c83a76b0SSuyog Pawar     api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
1754*c83a76b0SSuyog Pawar     api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
1755*c83a76b0SSuyog Pawar     api4_satd_pu[HAD_32x32] = &i4_satd_32x32;
1756*c83a76b0SSuyog Pawar 
1757*c83a76b0SSuyog Pawar     /* 32x32 SATD is calculates as the sum of the 4 8x8's in the block */
1758*c83a76b0SSuyog Pawar     for(i = 0; i < 16; i++)
1759*c83a76b0SSuyog Pawar     {
1760*c83a76b0SSuyog Pawar         pu1_src = pu1_inp + ((i & 0x3) << 3) + ((i >> 2) * inp_stride * 8);
1761*c83a76b0SSuyog Pawar 
1762*c83a76b0SSuyog Pawar         pu1_pred = pu1_ref + ((i & 0x3) << 3) + ((i >> 2) * ref_stride * 8);
1763*c83a76b0SSuyog Pawar 
1764*c83a76b0SSuyog Pawar         ai4_satd_8x8[i] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
1765*c83a76b0SSuyog Pawar             pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1);
1766*c83a76b0SSuyog Pawar     }
1767*c83a76b0SSuyog Pawar 
1768*c83a76b0SSuyog Pawar     /* Modified to cost calculation using only 8x8 SATD for 32x32*/
1769*c83a76b0SSuyog Pawar     ai4_satd_16x16[0] = ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[4] + ai4_satd_8x8[5];
1770*c83a76b0SSuyog Pawar     ai4_satd_16x16[1] = ai4_satd_8x8[2] + ai4_satd_8x8[3] + ai4_satd_8x8[6] + ai4_satd_8x8[7];
1771*c83a76b0SSuyog Pawar     ai4_satd_16x16[2] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[12] + ai4_satd_8x8[13];
1772*c83a76b0SSuyog Pawar     ai4_satd_16x16[3] = ai4_satd_8x8[10] + ai4_satd_8x8[11] + ai4_satd_8x8[14] + ai4_satd_8x8[15];
1773*c83a76b0SSuyog Pawar 
1774*c83a76b0SSuyog Pawar     /* Update 32x32 SATD */
1775*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_2Nx2N] =
1776*c83a76b0SSuyog Pawar         ai4_satd_16x16[0] + ai4_satd_16x16[1] + ai4_satd_16x16[2] + ai4_satd_16x16[3];
1777*c83a76b0SSuyog Pawar 
1778*c83a76b0SSuyog Pawar     /* Update 16x16 SATDs */
1779*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_16x16[0];
1780*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_16x16[1];
1781*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_16x16[2];
1782*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_16x16[3];
1783*c83a76b0SSuyog Pawar 
1784*c83a76b0SSuyog Pawar     /* Update 16x32 / 32x16 SATDs */
1785*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_16x16[0] + ai4_satd_16x16[2];
1786*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_16x16[1] + ai4_satd_16x16[3];
1787*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_16x16[0] + ai4_satd_16x16[1];
1788*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_16x16[2] + ai4_satd_16x16[3];
1789*c83a76b0SSuyog Pawar 
1790*c83a76b0SSuyog Pawar     /* Update AMP SATDs 32x24,32x8, 24x32,8x32  */
1791*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_nLx2N_L] =
1792*c83a76b0SSuyog Pawar         ai4_satd_8x8[0] + ai4_satd_8x8[4] + ai4_satd_8x8[8] + ai4_satd_8x8[12];
1793*c83a76b0SSuyog Pawar 
1794*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[5] + ai4_satd_8x8[9] +
1795*c83a76b0SSuyog Pawar                                     ai4_satd_8x8[13] + pi4_sad_grid[PART_ID_Nx2N_R];
1796*c83a76b0SSuyog Pawar 
1797*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_8x8[2] + ai4_satd_8x8[6] + ai4_satd_8x8[10] +
1798*c83a76b0SSuyog Pawar                                     ai4_satd_8x8[14] + pi4_sad_grid[PART_ID_Nx2N_L];
1799*c83a76b0SSuyog Pawar 
1800*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_nRx2N_R] =
1801*c83a76b0SSuyog Pawar         ai4_satd_8x8[3] + ai4_satd_8x8[7] + ai4_satd_8x8[11] + ai4_satd_8x8[15];
1802*c83a76b0SSuyog Pawar 
1803*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_2NxnU_T] =
1804*c83a76b0SSuyog Pawar         ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
1805*c83a76b0SSuyog Pawar 
1806*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_8x8[4] + ai4_satd_8x8[5] + ai4_satd_8x8[6] +
1807*c83a76b0SSuyog Pawar                                     ai4_satd_8x8[7] + pi4_sad_grid[PART_ID_2NxN_B];
1808*c83a76b0SSuyog Pawar 
1809*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_8x8[8] + ai4_satd_8x8[9] + ai4_satd_8x8[10] +
1810*c83a76b0SSuyog Pawar                                     ai4_satd_8x8[11] + pi4_sad_grid[PART_ID_2NxN_T];
1811*c83a76b0SSuyog Pawar 
1812*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_2NxnD_B] =
1813*c83a76b0SSuyog Pawar         ai4_satd_8x8[12] + ai4_satd_8x8[13] + ai4_satd_8x8[14] + ai4_satd_8x8[15];
1814*c83a76b0SSuyog Pawar }
1815*c83a76b0SSuyog Pawar 
/**
********************************************************************************
*  @fn     WORD32 hme_evalsatd_pt_pu_32x32_tu_rec
*
*  @brief  Runs the recursive 32x32 HAD (ihevce_had_32x32_r) on the
*          input/reference pair described by ps_prms and publishes the 2Nx2N
*          SATD, TU split flag and early cbf flag
*
*  @param[inout]  ps_prms: error prms; input/ref pointers, strides, working
*                 memory and max transform depth/size are read, while
*                 i4_tu_split_cost and the PART_ID_2Nx2N entries of
*                 pi4_sad_grid, pi4_tu_split_flags and pi4_tu_early_cbf are
*                 written
*
*  @param[in]  lambda, lambda_q_shift, i4_frm_qstep: forwarded unchanged to
*              the HAD function
*
*  @param[in]  ps_func_selector: forwarded unchanged to the HAD function
*
*  @return     Value held in the 32x32 SATD slot after the HAD call
********************************************************************************
*/
WORD32 hme_evalsatd_pt_pu_32x32_tu_rec(
    err_prms_t *ps_prms,
    WORD32 lambda,
    WORD32 lambda_q_shift,
    WORD32 i4_frm_qstep,
    me_func_selector_t *ps_func_selector)
{
    S32 ai4_satd_4x4[64]; /* num 4x4s in a 32x32 */
    S32 ai4_satd_8x8[16]; /* num 8x8s in a 32x32 */
    S32 ai4_tu_split_8x8[16];
    S32 ai4_satd_16x16[4]; /* num 16x16 in a 32x32 */
    S32 ai4_tu_split_16x16[4];
    S32 i4_satd_32x32;

    S32 ai4_tu_early_cbf_8x8[16];
    S32 ai4_tu_early_cbf_16x16[4];

    /* Start from 0, like tu_split_flag below and like the 16x16 variant of */
    /* this function, so the value published after the call is never        */
    /* indeterminate even if the HAD function leaves the slot untouched     */
    S32 early_cbf_flag = 0;

    S16 *pi2_had_out;

    /* Initialize array of ptrs to hold partial SATDs at all levels of 32x32 */
    S32 *api4_satd_pu[HAD_32x32 + 1];
    S32 *api4_tu_split[HAD_32x32 + 1];
    S32 *api4_tu_early_cbf[HAD_32x32 + 1];

    S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
    S32 *pi4_tu_split_flag = ps_prms->pi4_tu_split_flags;
    S32 *pi4_tu_early_cbf = ps_prms->pi4_tu_early_cbf;

    S32 tu_split_flag = 0;
    S32 total_satd_cost = 0;

    U08 *pu1_inp = ps_prms->pu1_inp;
    U08 *pu1_ref = ps_prms->pu1_ref;

    S32 inp_stride = ps_prms->i4_inp_stride;
    S32 ref_stride = ps_prms->i4_ref_stride;

    /* Initialize tu_split_cost to "0" */
    ps_prms->i4_tu_split_cost = 0;

    /* Hadamard coefficients go to the caller-provided working memory */
    pi2_had_out = (S16 *)ps_prms->pu1_wkg_mem;

    api4_satd_pu[HAD_4x4] = &ai4_satd_4x4[0];
    api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[0];
    api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[0];
    api4_satd_pu[HAD_32x32] = &i4_satd_32x32;

    api4_tu_split[HAD_4x4] = NULL;
    api4_tu_split[HAD_8x8] = &ai4_tu_split_8x8[0];
    api4_tu_split[HAD_16x16] = &ai4_tu_split_16x16[0];
    api4_tu_split[HAD_32x32] = &tu_split_flag;

    api4_tu_early_cbf[HAD_4x4] = NULL;
    api4_tu_early_cbf[HAD_8x8] = &ai4_tu_early_cbf_8x8[0];
    api4_tu_early_cbf[HAD_16x16] = &ai4_tu_early_cbf_16x16[0];
    api4_tu_early_cbf[HAD_32x32] = &early_cbf_flag;

    /* Call recursive 32x32 HAD module; updates satds for 4x4, 8x8, 16x16 and 32x32 */
    ihevce_had_32x32_r(
        pu1_inp,
        inp_stride,
        pu1_ref,
        ref_stride,
        pi2_had_out,
        32,
        api4_satd_pu,
        api4_tu_split,
        api4_tu_early_cbf,
        0,
        8,
        lambda,
        lambda_q_shift,
        i4_frm_qstep,
        0,
        ps_prms->u1_max_tr_depth,
        ps_prms->u1_max_tr_size,
        &(ps_prms->i4_tu_split_cost),
        ps_func_selector);

    total_satd_cost = i4_satd_32x32;

    /*The structure of the TU_SPLIT flag for the current 32x32 is as follows
    TL_16x16 - 5bits (4 for child and LSBit for 16x16 split)
    TR_16x16 - 5bits (4 for child and LSBit for 16x16 split)
    BL_16x16 - 5bits (4 for child and LSBit for 16x16 split)
    BR_16x16 - 5bits (4 for child and LSBit for 16x16 split)
    32x32_split - 1bit (LSBit)

    TU_SPLIT : (TL_16x16)_(TR_16x16)_(BL_16x16)_(BR_16x16)_32x32_split (21bits)*/

    /* Only the 2Nx2N entries are produced by this function */
    pi4_sad_grid[PART_ID_2Nx2N] = total_satd_cost;
    pi4_tu_split_flag[PART_ID_2Nx2N] = tu_split_flag;
    pi4_tu_early_cbf[PART_ID_2Nx2N] = early_cbf_flag;

    return total_satd_cost;
}
1913*c83a76b0SSuyog Pawar 
/**
********************************************************************************
*  @fn     void hme_evalsatd_pt_pu_64x64
*
*  @brief  Evaluates the SATD with partial updates for all the best partitions
*          of a 64x64 CU based on accumulated Hadamard 32x32 and 16x16 SATDs
*
*           Note : The 64x64 SATD is not obtained by applying a Hadamard
*                  transform to the 32x32 Hadamard outputs; it directly uses
*                  the four 32x32 SATDs and the sixteen 16x16 SATDs, as a TU
*                  size of 64 is not supported in HEVC
*
*  @param[inout]  ps_prms: error prms containing current and ref ptr, strides,
*                 pointer to sad grid of each partition
*
*  @return     None
********************************************************************************
*/
1931*c83a76b0SSuyog Pawar 
hme_evalsatd_pt_pu_64x64(err_prms_t * ps_prms)1932*c83a76b0SSuyog Pawar void hme_evalsatd_pt_pu_64x64(err_prms_t *ps_prms)
1933*c83a76b0SSuyog Pawar {
1934*c83a76b0SSuyog Pawar     //S32 ai4_satd_4x4[4][64];   /* num 4x4s in a 32x32 * num 32x32 in 64x64 */
1935*c83a76b0SSuyog Pawar     S32 ai4_satd_8x8[4][16]; /* num 8x8s in a 32x32 * num 32x32 in 64x64 */
1936*c83a76b0SSuyog Pawar     S32 ai4_satd_16x16[4][4]; /* num 16x16 in a 32x32* num 32x32 in 64x64 */
1937*c83a76b0SSuyog Pawar     S32 ai4_satd_32x32[4]; /* num 32x32 in 64x64 */
1938*c83a76b0SSuyog Pawar     //    S16 ai2_had_out[32*32];
1939*c83a76b0SSuyog Pawar     S32 i, j;
1940*c83a76b0SSuyog Pawar 
1941*c83a76b0SSuyog Pawar     //  S32 ai4_tu_split_8x8[4][16];
1942*c83a76b0SSuyog Pawar     //  S32 ai4_tu_split_16x16[4][4];
1943*c83a76b0SSuyog Pawar     //  S32 ai4_tu_split_32x32[4];
1944*c83a76b0SSuyog Pawar 
1945*c83a76b0SSuyog Pawar     /* Initialize array of ptrs to hold partial SATDs at all levels of 16x16 */
1946*c83a76b0SSuyog Pawar     S32 *api4_satd_pu[HAD_32x32 + 1];
1947*c83a76b0SSuyog Pawar     //  S32 *api4_tu_split[HAD_32x32 + 1];
1948*c83a76b0SSuyog Pawar 
1949*c83a76b0SSuyog Pawar     S32 *pi4_sad_grid = ps_prms->pi4_sad_grid;
1950*c83a76b0SSuyog Pawar 
1951*c83a76b0SSuyog Pawar     U08 *pu1_inp = ps_prms->pu1_inp;
1952*c83a76b0SSuyog Pawar     U08 *pu1_ref = ps_prms->pu1_ref;
1953*c83a76b0SSuyog Pawar     U08 *pu1_src;
1954*c83a76b0SSuyog Pawar     U08 *pu1_pred;
1955*c83a76b0SSuyog Pawar 
1956*c83a76b0SSuyog Pawar     S32 inp_stride = ps_prms->i4_inp_stride;
1957*c83a76b0SSuyog Pawar     S32 ref_stride = ps_prms->i4_ref_stride;
1958*c83a76b0SSuyog Pawar 
1959*c83a76b0SSuyog Pawar     for(i = 0; i < 4; i++)
1960*c83a76b0SSuyog Pawar     {
1961*c83a76b0SSuyog Pawar         S32 blkx = (i & 0x1);
1962*c83a76b0SSuyog Pawar         S32 blky = (i >> 1);
1963*c83a76b0SSuyog Pawar         U08 *pu1_pi0, *pu1_pi1;
1964*c83a76b0SSuyog Pawar 
1965*c83a76b0SSuyog Pawar         //api4_satd_pu[HAD_4x4]   = &ai4_satd_4x4[i][0];
1966*c83a76b0SSuyog Pawar         api4_satd_pu[HAD_8x8] = &ai4_satd_8x8[i][0];
1967*c83a76b0SSuyog Pawar         api4_satd_pu[HAD_16x16] = &ai4_satd_16x16[i][0];
1968*c83a76b0SSuyog Pawar         api4_satd_pu[HAD_32x32] = &ai4_satd_32x32[i];
1969*c83a76b0SSuyog Pawar 
1970*c83a76b0SSuyog Pawar         pu1_pi0 = pu1_inp + (blkx * 32) + (blky * 32 * inp_stride);
1971*c83a76b0SSuyog Pawar         pu1_pi1 = pu1_ref + (blkx * 32) + (blky * 32 * ref_stride);
1972*c83a76b0SSuyog Pawar 
1973*c83a76b0SSuyog Pawar         /* 64x64 SATD is calculates as the sum of the 4 16x16's in the block */
1974*c83a76b0SSuyog Pawar         for(j = 0; j < 16; j++)
1975*c83a76b0SSuyog Pawar         {
1976*c83a76b0SSuyog Pawar             pu1_src = pu1_pi0 + ((j & 0x3) << 3) + ((j >> 2) * inp_stride * 8);
1977*c83a76b0SSuyog Pawar 
1978*c83a76b0SSuyog Pawar             pu1_pred = pu1_pi1 + ((j & 0x3) << 3) + ((j >> 2) * ref_stride * 8);
1979*c83a76b0SSuyog Pawar 
1980*c83a76b0SSuyog Pawar             ai4_satd_8x8[i][j] = ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
1981*c83a76b0SSuyog Pawar                 pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 1);
1982*c83a76b0SSuyog Pawar         }
1983*c83a76b0SSuyog Pawar 
1984*c83a76b0SSuyog Pawar         /* Modified to cost calculation using only 8x8 SATD for 32x32*/
1985*c83a76b0SSuyog Pawar         ai4_satd_16x16[i][0] =
1986*c83a76b0SSuyog Pawar             ai4_satd_8x8[i][0] + ai4_satd_8x8[i][1] + ai4_satd_8x8[i][4] + ai4_satd_8x8[i][5];
1987*c83a76b0SSuyog Pawar         ai4_satd_16x16[i][1] =
1988*c83a76b0SSuyog Pawar             ai4_satd_8x8[i][2] + ai4_satd_8x8[i][3] + ai4_satd_8x8[i][6] + ai4_satd_8x8[i][7];
1989*c83a76b0SSuyog Pawar         ai4_satd_16x16[i][2] =
1990*c83a76b0SSuyog Pawar             ai4_satd_8x8[i][8] + ai4_satd_8x8[i][9] + ai4_satd_8x8[i][12] + ai4_satd_8x8[i][13];
1991*c83a76b0SSuyog Pawar         ai4_satd_16x16[i][3] =
1992*c83a76b0SSuyog Pawar             ai4_satd_8x8[i][10] + ai4_satd_8x8[i][11] + ai4_satd_8x8[i][14] + ai4_satd_8x8[i][15];
1993*c83a76b0SSuyog Pawar     }
1994*c83a76b0SSuyog Pawar 
1995*c83a76b0SSuyog Pawar     /* Modified to cost calculation using only 8x8 SATD for 32x32*/
1996*c83a76b0SSuyog Pawar 
1997*c83a76b0SSuyog Pawar     ai4_satd_32x32[0] =
1998*c83a76b0SSuyog Pawar         ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3];
1999*c83a76b0SSuyog Pawar     ai4_satd_32x32[1] =
2000*c83a76b0SSuyog Pawar         ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1] + ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3];
2001*c83a76b0SSuyog Pawar     ai4_satd_32x32[2] =
2002*c83a76b0SSuyog Pawar         ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] + ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3];
2003*c83a76b0SSuyog Pawar     ai4_satd_32x32[3] =
2004*c83a76b0SSuyog Pawar         ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3];
2005*c83a76b0SSuyog Pawar 
2006*c83a76b0SSuyog Pawar     /* Update 64x64 SATDs */
2007*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_2Nx2N] =
2008*c83a76b0SSuyog Pawar         ai4_satd_32x32[0] + ai4_satd_32x32[1] + ai4_satd_32x32[2] + ai4_satd_32x32[3];
2009*c83a76b0SSuyog Pawar 
2010*c83a76b0SSuyog Pawar     /* Update 32x32 SATDs */
2011*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_NxN_TL] = ai4_satd_32x32[0];
2012*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_NxN_TR] = ai4_satd_32x32[1];
2013*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_NxN_BL] = ai4_satd_32x32[2];
2014*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_NxN_BR] = ai4_satd_32x32[3];
2015*c83a76b0SSuyog Pawar 
2016*c83a76b0SSuyog Pawar     /* Update 32x64 / 64x32 SATDs */
2017*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_Nx2N_L] = ai4_satd_32x32[0] + ai4_satd_32x32[2];
2018*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_Nx2N_R] = ai4_satd_32x32[1] + ai4_satd_32x32[3];
2019*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_2NxN_T] = ai4_satd_32x32[0] + ai4_satd_32x32[1];
2020*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_2NxN_B] = ai4_satd_32x32[2] + ai4_satd_32x32[3];
2021*c83a76b0SSuyog Pawar 
2022*c83a76b0SSuyog Pawar     /* Update AMP SATDs 64x48,64x16, 48x64,16x64  */
2023*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_nLx2N_L] =
2024*c83a76b0SSuyog Pawar         ai4_satd_16x16[0][0] + ai4_satd_16x16[0][2] + ai4_satd_16x16[2][0] + ai4_satd_16x16[2][2];
2025*c83a76b0SSuyog Pawar 
2026*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_nLx2N_R] = ai4_satd_16x16[0][1] + ai4_satd_16x16[0][3] +
2027*c83a76b0SSuyog Pawar                                     ai4_satd_16x16[2][1] + ai4_satd_16x16[2][3] +
2028*c83a76b0SSuyog Pawar                                     pi4_sad_grid[PART_ID_Nx2N_R];
2029*c83a76b0SSuyog Pawar 
2030*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_nRx2N_L] = ai4_satd_16x16[1][0] + ai4_satd_16x16[1][2] +
2031*c83a76b0SSuyog Pawar                                     ai4_satd_16x16[3][0] + ai4_satd_16x16[3][2] +
2032*c83a76b0SSuyog Pawar                                     pi4_sad_grid[PART_ID_Nx2N_L];
2033*c83a76b0SSuyog Pawar 
2034*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_nRx2N_R] =
2035*c83a76b0SSuyog Pawar         ai4_satd_16x16[1][1] + ai4_satd_16x16[1][3] + ai4_satd_16x16[3][1] + ai4_satd_16x16[3][3];
2036*c83a76b0SSuyog Pawar 
2037*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_2NxnU_T] =
2038*c83a76b0SSuyog Pawar         ai4_satd_16x16[0][0] + ai4_satd_16x16[0][1] + ai4_satd_16x16[1][0] + ai4_satd_16x16[1][1];
2039*c83a76b0SSuyog Pawar 
2040*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_2NxnU_B] = ai4_satd_16x16[0][2] + ai4_satd_16x16[0][3] +
2041*c83a76b0SSuyog Pawar                                     ai4_satd_16x16[1][2] + ai4_satd_16x16[1][3] +
2042*c83a76b0SSuyog Pawar                                     pi4_sad_grid[PART_ID_2NxN_B];
2043*c83a76b0SSuyog Pawar 
2044*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_2NxnD_T] = ai4_satd_16x16[2][0] + ai4_satd_16x16[2][1] +
2045*c83a76b0SSuyog Pawar                                     ai4_satd_16x16[3][0] + ai4_satd_16x16[3][1] +
2046*c83a76b0SSuyog Pawar                                     pi4_sad_grid[PART_ID_2NxN_T];
2047*c83a76b0SSuyog Pawar 
2048*c83a76b0SSuyog Pawar     pi4_sad_grid[PART_ID_2NxnD_B] =
2049*c83a76b0SSuyog Pawar         ai4_satd_16x16[2][2] + ai4_satd_16x16[2][3] + ai4_satd_16x16[3][2] + ai4_satd_16x16[3][3];
2050*c83a76b0SSuyog Pawar }
2051*c83a76b0SSuyog Pawar 
/**
********************************************************************************
*  @fn     WORD32 hme_evalsatd_pt_pu_64x64_tu_rec(err_prms_t *ps_prms,
*                                   WORD32 lambda,
*                                   WORD32 lambda_q_shift,
*                                   WORD32 i4_frm_qstep,
*                                   me_func_selector_t *ps_func_selector)
*
*  @brief  Processes a 64x64 block as four 32x32 quadrants (raster order) by
*          calling ihevce_had_32x32_r() once per quadrant, then adds up the
*          four 32x32 SATD values.
*
*  @param[in,out]  ps_prms : supplies input / reference pointers and strides,
*                  the working memory used as HAD output, the max transform
*                  depth / size and the per-quadrant destinations for the TU
*                  split flags and early cbf values. i4_tu_split_cost is reset
*                  to 0 here and its address is passed to every
*                  ihevce_had_32x32_r() call. pi4_sad_grid[PART_ID_2Nx2N] is
*                  written with the value that is returned.
*
*  @param[in]  lambda : forwarded unchanged to ihevce_had_32x32_r()
*
*  @param[in]  lambda_q_shift : forwarded unchanged to ihevce_had_32x32_r()
*
*  @param[in]  i4_frm_qstep : forwarded unchanged to ihevce_had_32x32_r()
*
*  @param[in]  ps_func_selector : forwarded unchanged to ihevce_had_32x32_r()
*
*  @return Sum of the four entries of the local 32x32 SATD array
********************************************************************************
*/
WORD32 hme_evalsatd_pt_pu_64x64_tu_rec(
    err_prms_t *ps_prms,
    WORD32 lambda,
    WORD32 lambda_q_shift,
    WORD32 i4_frm_qstep,
    me_func_selector_t *ps_func_selector)
{
    /* Scratch storage sized for ONE 32x32 quadrant (64 4x4s, 16 8x8s and  */
    /* 4 16x16s). The same buffers are handed over for every quadrant, so  */
    /* nothing below 32x32 level is kept once the loop has finished.       */
    S32 ai4_quad_satd_4x4[64];
    S32 ai4_quad_satd_8x8[16];
    S32 ai4_quad_satd_16x16[4];
    S32 ai4_quad_split_8x8[16];
    S32 ai4_quad_split_16x16[4];
    S32 ai4_quad_cbf_8x8[16];
    S32 ai4_quad_cbf_16x16[4];

    /* One 32x32 SATD per quadrant of the 64x64 block. Presumably filled   */
    /* by ihevce_had_32x32_r() through the HAD_32x32 slot - TODO confirm.  */
    S32 ai4_quad_satd_32x32[4];

    /* Per-HAD-size pointer tables passed to the recursive 32x32 module */
    S32 *api4_satd_tbl[HAD_32x32 + 1];
    S32 *api4_split_tbl[HAD_32x32 + 1];
    S32 *api4_cbf_tbl[HAD_32x32 + 1];

    /* Everything read from ps_prms is latched once, before any HAD call */
    S32 *pi4_cost_grid = ps_prms->pi4_sad_grid;
    U08 *pu1_src_base = ps_prms->pu1_inp;
    U08 *pu1_pred_base = ps_prms->pu1_ref;
    S32 src_strd = ps_prms->i4_inp_stride;
    S32 pred_strd = ps_prms->i4_ref_stride;

    S16 *pi2_had_dst;
    S32 satd_64x64;
    S32 quad;

    /* Split cost starts from zero; the HAD calls below receive its address */
    ps_prms->i4_tu_split_cost = 0;

    /* The working memory of the caller doubles up as the HAD output buffer */
    pi2_had_dst = (S16 *)ps_prms->pu1_wkg_mem;

    for(quad = 0; quad < 4; quad++)
    {
        /* Raster order: 0 = top-left, 1 = top-right, 2 = bot-left, 3 = bot-right */
        S32 quad_col = (quad & 0x1);
        S32 quad_row = (quad >> 1);
        U08 *pu1_quad_src, *pu1_quad_pred;

        /* The tables are (re)built on every pass: only the HAD_32x32     */
        /* slots change with the quadrant, but the callee is not visible  */
        /* here and may modify the table entries.                         */
        api4_satd_tbl[HAD_4x4] = &ai4_quad_satd_4x4[0];
        api4_satd_tbl[HAD_8x8] = &ai4_quad_satd_8x8[0];
        api4_satd_tbl[HAD_16x16] = &ai4_quad_satd_16x16[0];
        api4_satd_tbl[HAD_32x32] = &ai4_quad_satd_32x32[quad];

        api4_split_tbl[HAD_4x4] = NULL;
        api4_split_tbl[HAD_8x8] = &ai4_quad_split_8x8[0];
        api4_split_tbl[HAD_16x16] = &ai4_quad_split_16x16[0];
        api4_split_tbl[HAD_32x32] = &ps_prms->pi4_tu_split_flags[quad];

        api4_cbf_tbl[HAD_4x4] = NULL;
        api4_cbf_tbl[HAD_8x8] = &ai4_quad_cbf_8x8[0];
        api4_cbf_tbl[HAD_16x16] = &ai4_quad_cbf_16x16[0];
        api4_cbf_tbl[HAD_32x32] = &ps_prms->pi4_tu_early_cbf[quad];

        /* Top-left sample of this quadrant in the input and the reference */
        pu1_quad_src = pu1_src_base + (quad_col * 32) + (quad_row * 32 * src_strd);
        pu1_quad_pred = pu1_pred_base + (quad_col * 32) + (quad_row * 32 * pred_strd);

        /* Recursive 32x32 HAD module. The literal arguments (32, 0, 8, 1) */
        /* are taken over unchanged; their meaning is defined by the       */
        /* callee - presumably output stride, start position, 4x4s per     */
        /* row and current depth (TODO confirm).                           */
        ihevce_had_32x32_r(
            pu1_quad_src,
            src_strd,
            pu1_quad_pred,
            pred_strd,
            pi2_had_dst,
            32,
            api4_satd_tbl,
            api4_split_tbl,
            api4_cbf_tbl,
            0,
            8,
            lambda,
            lambda_q_shift,
            i4_frm_qstep,
            1,
            ps_prms->u1_max_tr_depth,
            ps_prms->u1_max_tr_size,
            &(ps_prms->i4_tu_split_cost),
            ps_func_selector);
    }

    /* 64x64 SATD is the sum of the four 32x32 SATDs */
    satd_64x64 = ai4_quad_satd_32x32[0] + ai4_quad_satd_32x32[1] + ai4_quad_satd_32x32[2] +
                 ai4_quad_satd_32x32[3];

    /* Only the 2Nx2N entry of the grid is updated by this function */
    pi4_cost_grid[PART_ID_2Nx2N] = satd_64x64;

    return satd_64x64;
}
2150*c83a76b0SSuyog Pawar 
2151*c83a76b0SSuyog Pawar /**
2152*c83a76b0SSuyog Pawar ********************************************************************************
2153*c83a76b0SSuyog Pawar *  @fn     void hme_subpel_refine_search_node(search_node_t *ps_search_node,
2154*c83a76b0SSuyog Pawar *                                   hme_subpel_prms_t *ps_prms,
2155*c83a76b0SSuyog Pawar *                                   layer_ctxt_t *ps_curr_layer,
2156*c83a76b0SSuyog Pawar *                                   BLK_SIZE_T e_blk_size,
2157*c83a76b0SSuyog Pawar *                                   S32 x_off,
2158*c83a76b0SSuyog Pawar *                                   S32 y_off)
2159*c83a76b0SSuyog Pawar *
2160*c83a76b0SSuyog Pawar *  @brief  Refines a given partition within a CU
2161*c83a76b0SSuyog Pawar *
2162*c83a76b0SSuyog Pawar *  @param[in,out]  ps_search_node: supplies starting mv and also ref id.
2163*c83a76b0SSuyog Pawar *                   updated with the accurate subpel mv
2164*c83a76b0SSuyog Pawar *
2165*c83a76b0SSuyog Pawar *  @param[in]  ps_prms: subpel prms input to this function
2166*c83a76b0SSuyog Pawar *
2167*c83a76b0SSuyog Pawar *  @param[in]  ps_curr_layer : layer context
2168*c83a76b0SSuyog Pawar *
2169*c83a76b0SSuyog Pawar *  @param[in]  e_blk_size : Block size enumeration
2170*c83a76b0SSuyog Pawar *
2171*c83a76b0SSuyog Pawar *  @param[in]  x_off : x offset of the partition w.r.t. pic start
2172*c83a76b0SSuyog Pawar *
2173*c83a76b0SSuyog Pawar *  @param[in]  y_off : y offset of the partition w.r.t. pic start
2174*c83a76b0SSuyog Pawar *
2175*c83a76b0SSuyog Pawar *  @return None
2176*c83a76b0SSuyog Pawar ********************************************************************************
2177*c83a76b0SSuyog Pawar */
2178*c83a76b0SSuyog Pawar 
hme_get_calc_sad_and_result_subpel_fxn(me_func_selector_t * ps_func_selector,ihevce_me_optimised_function_list_t * ps_me_optimised_function_list,S32 i4_part_mask,U08 u1_use_satd,U08 u1_num_parts,U08 u1_num_results)2179*c83a76b0SSuyog Pawar static __inline PF_SAD_RESULT_FXN_T hme_get_calc_sad_and_result_subpel_fxn(
2180*c83a76b0SSuyog Pawar     me_func_selector_t *ps_func_selector,
2181*c83a76b0SSuyog Pawar     ihevce_me_optimised_function_list_t *ps_me_optimised_function_list,
2182*c83a76b0SSuyog Pawar     S32 i4_part_mask,
2183*c83a76b0SSuyog Pawar     U08 u1_use_satd,
2184*c83a76b0SSuyog Pawar     U08 u1_num_parts,
2185*c83a76b0SSuyog Pawar     U08 u1_num_results)
2186*c83a76b0SSuyog Pawar {
2187*c83a76b0SSuyog Pawar     PF_SAD_RESULT_FXN_T pf_err_compute;
2188*c83a76b0SSuyog Pawar 
2189*c83a76b0SSuyog Pawar     ASSERT((1 == u1_num_results) || (2 == u1_num_results));
2190*c83a76b0SSuyog Pawar 
2191*c83a76b0SSuyog Pawar     if(1 == u1_num_results)
2192*c83a76b0SSuyog Pawar     {
2193*c83a76b0SSuyog Pawar         if(u1_use_satd)
2194*c83a76b0SSuyog Pawar         {
2195*c83a76b0SSuyog Pawar             if(u1_num_parts == 1)
2196*c83a76b0SSuyog Pawar             {
2197*c83a76b0SSuyog Pawar                 pf_err_compute =
2198*c83a76b0SSuyog Pawar                     ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_eq_1;
2199*c83a76b0SSuyog Pawar             }
2200*c83a76b0SSuyog Pawar             else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2201*c83a76b0SSuyog Pawar             {
2202*c83a76b0SSuyog Pawar                 pf_err_compute =
2203*c83a76b0SSuyog Pawar                     ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_9;
2204*c83a76b0SSuyog Pawar             }
2205*c83a76b0SSuyog Pawar             else
2206*c83a76b0SSuyog Pawar             {
2207*c83a76b0SSuyog Pawar                 pf_err_compute =
2208*c83a76b0SSuyog Pawar                     ps_func_selector->pf_evalsatd_update_1_best_result_pt_pu_16x16_num_part_lt_17;
2209*c83a76b0SSuyog Pawar             }
2210*c83a76b0SSuyog Pawar         }
2211*c83a76b0SSuyog Pawar         else
2212*c83a76b0SSuyog Pawar         {
2213*c83a76b0SSuyog Pawar             if(u1_num_parts == 1)
2214*c83a76b0SSuyog Pawar             {
2215*c83a76b0SSuyog Pawar                 pf_err_compute = ps_me_optimised_function_list
2216*c83a76b0SSuyog Pawar                                      ->pf_calc_sad_and_1_best_result_subpel_num_part_eq_1;
2217*c83a76b0SSuyog Pawar             }
2218*c83a76b0SSuyog Pawar             else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5))
2219*c83a76b0SSuyog Pawar             {
2220*c83a76b0SSuyog Pawar                 pf_err_compute =
2221*c83a76b0SSuyog Pawar                     ps_me_optimised_function_list->pf_calc_sad_and_1_best_result_subpel_square_parts;
2222*c83a76b0SSuyog Pawar             }
2223*c83a76b0SSuyog Pawar             else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2224*c83a76b0SSuyog Pawar             {
2225*c83a76b0SSuyog Pawar                 pf_err_compute = ps_me_optimised_function_list
2226*c83a76b0SSuyog Pawar                                      ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_9;
2227*c83a76b0SSuyog Pawar             }
2228*c83a76b0SSuyog Pawar             else
2229*c83a76b0SSuyog Pawar             {
2230*c83a76b0SSuyog Pawar                 pf_err_compute = ps_me_optimised_function_list
2231*c83a76b0SSuyog Pawar                                      ->pf_calc_sad_and_1_best_result_subpel_num_part_lt_17;
2232*c83a76b0SSuyog Pawar             }
2233*c83a76b0SSuyog Pawar         }
2234*c83a76b0SSuyog Pawar     }
2235*c83a76b0SSuyog Pawar     else
2236*c83a76b0SSuyog Pawar     {
2237*c83a76b0SSuyog Pawar         if(u1_use_satd)
2238*c83a76b0SSuyog Pawar         {
2239*c83a76b0SSuyog Pawar             if(u1_num_parts == 1)
2240*c83a76b0SSuyog Pawar             {
2241*c83a76b0SSuyog Pawar                 pf_err_compute =
2242*c83a76b0SSuyog Pawar                     ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_eq_1;
2243*c83a76b0SSuyog Pawar             }
2244*c83a76b0SSuyog Pawar             else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2245*c83a76b0SSuyog Pawar             {
2246*c83a76b0SSuyog Pawar                 pf_err_compute =
2247*c83a76b0SSuyog Pawar                     ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_9;
2248*c83a76b0SSuyog Pawar             }
2249*c83a76b0SSuyog Pawar             else
2250*c83a76b0SSuyog Pawar             {
2251*c83a76b0SSuyog Pawar                 pf_err_compute =
2252*c83a76b0SSuyog Pawar                     ps_func_selector->pf_evalsatd_update_2_best_results_pt_pu_16x16_num_part_lt_17;
2253*c83a76b0SSuyog Pawar             }
2254*c83a76b0SSuyog Pawar         }
2255*c83a76b0SSuyog Pawar         else
2256*c83a76b0SSuyog Pawar         {
2257*c83a76b0SSuyog Pawar             if(u1_num_parts == 1)
2258*c83a76b0SSuyog Pawar             {
2259*c83a76b0SSuyog Pawar                 pf_err_compute = ps_me_optimised_function_list
2260*c83a76b0SSuyog Pawar                                      ->pf_calc_sad_and_2_best_results_subpel_num_part_eq_1;
2261*c83a76b0SSuyog Pawar             }
2262*c83a76b0SSuyog Pawar             else if(((i4_part_mask & ENABLE_SQUARE_PARTS) != 0) && (u1_num_parts == 5))
2263*c83a76b0SSuyog Pawar             {
2264*c83a76b0SSuyog Pawar                 pf_err_compute = ps_me_optimised_function_list
2265*c83a76b0SSuyog Pawar                                      ->pf_calc_sad_and_2_best_results_subpel_square_parts;
2266*c83a76b0SSuyog Pawar             }
2267*c83a76b0SSuyog Pawar             else if((u1_num_parts > 1) && (u1_num_parts <= 8))
2268*c83a76b0SSuyog Pawar             {
2269*c83a76b0SSuyog Pawar                 pf_err_compute = ps_me_optimised_function_list
2270*c83a76b0SSuyog Pawar                                      ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_9;
2271*c83a76b0SSuyog Pawar             }
2272*c83a76b0SSuyog Pawar             else
2273*c83a76b0SSuyog Pawar             {
2274*c83a76b0SSuyog Pawar                 pf_err_compute = ps_me_optimised_function_list
2275*c83a76b0SSuyog Pawar                                      ->pf_calc_sad_and_2_best_results_subpel_num_part_lt_17;
2276*c83a76b0SSuyog Pawar             }
2277*c83a76b0SSuyog Pawar         }
2278*c83a76b0SSuyog Pawar     }
2279*c83a76b0SSuyog Pawar 
2280*c83a76b0SSuyog Pawar     return pf_err_compute;
2281*c83a76b0SSuyog Pawar }
2282*c83a76b0SSuyog Pawar 
2283*c83a76b0SSuyog Pawar #if DIAMOND_GRID == 1
hme_subpel_refine_search_node_high_speed(search_node_t * ps_search_node,hme_subpel_prms_t * ps_prms,layer_ctxt_t * ps_curr_layer,BLK_SIZE_T e_blk_size,S32 x_off,S32 y_off,search_results_t * ps_search_results,S32 pred_lx,S32 i4_part_mask,S32 * pi4_valid_part_ids,S32 search_idx,subpel_dedup_enabler_t * ps_dedup_enabler,me_func_selector_t * ps_func_selector,ihevce_me_optimised_function_list_t * ps_me_optimised_function_list)2284*c83a76b0SSuyog Pawar S32 hme_subpel_refine_search_node_high_speed(
2285*c83a76b0SSuyog Pawar     search_node_t *ps_search_node,
2286*c83a76b0SSuyog Pawar     hme_subpel_prms_t *ps_prms,
2287*c83a76b0SSuyog Pawar     layer_ctxt_t *ps_curr_layer,
2288*c83a76b0SSuyog Pawar     BLK_SIZE_T e_blk_size,
2289*c83a76b0SSuyog Pawar     S32 x_off,
2290*c83a76b0SSuyog Pawar     S32 y_off,
2291*c83a76b0SSuyog Pawar     search_results_t *ps_search_results,
2292*c83a76b0SSuyog Pawar     S32 pred_lx,
2293*c83a76b0SSuyog Pawar     S32 i4_part_mask,
2294*c83a76b0SSuyog Pawar     S32 *pi4_valid_part_ids,
2295*c83a76b0SSuyog Pawar     S32 search_idx,
2296*c83a76b0SSuyog Pawar     subpel_dedup_enabler_t *ps_dedup_enabler,
2297*c83a76b0SSuyog Pawar     me_func_selector_t *ps_func_selector,
2298*c83a76b0SSuyog Pawar     ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
2299*c83a76b0SSuyog Pawar {
2300*c83a76b0SSuyog Pawar     S32 i4_num_hpel_refine, i4_num_qpel_refine;
2301*c83a76b0SSuyog Pawar     S32 i4_offset, i4_grid_mask;
2302*c83a76b0SSuyog Pawar     S08 i1_ref_idx;
2303*c83a76b0SSuyog Pawar     S32 i4_blk_wd, i4_blk_ht;
2304*c83a76b0SSuyog Pawar     S32 i4_ref_stride, i4_i;
2305*c83a76b0SSuyog Pawar     pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2306*c83a76b0SSuyog Pawar     result_upd_prms_t s_result_prms;
2307*c83a76b0SSuyog Pawar     search_node_t s_temp_search_node;
2308*c83a76b0SSuyog Pawar 
2309*c83a76b0SSuyog Pawar     /*************************************************************************/
2310*c83a76b0SSuyog Pawar     /* Tracks current MV with the fractional component.                      */
2311*c83a76b0SSuyog Pawar     /*************************************************************************/
2312*c83a76b0SSuyog Pawar     S32 i4_mv_x, i4_mv_y;
2313*c83a76b0SSuyog Pawar     S32 i4_frac_x, i4_frac_y;
2314*c83a76b0SSuyog Pawar 
2315*c83a76b0SSuyog Pawar     /*************************************************************************/
2316*c83a76b0SSuyog Pawar     /* Function pointer for SAD/SATD, array and prms structure to pass to    */
2317*c83a76b0SSuyog Pawar     /* This function                                                         */
2318*c83a76b0SSuyog Pawar     /*************************************************************************/
2319*c83a76b0SSuyog Pawar     PF_SAD_RESULT_FXN_T pf_err_compute;
2320*c83a76b0SSuyog Pawar 
2321*c83a76b0SSuyog Pawar     S32 ai4_sad_grid[17], i4_tot_cost;
2322*c83a76b0SSuyog Pawar     err_prms_t s_err_prms;
2323*c83a76b0SSuyog Pawar 
2324*c83a76b0SSuyog Pawar     /*************************************************************************/
2325*c83a76b0SSuyog Pawar     /* Allowed MV RANGE                                                      */
2326*c83a76b0SSuyog Pawar     /*************************************************************************/
2327*c83a76b0SSuyog Pawar     range_prms_t *ps_range_prms;
2328*c83a76b0SSuyog Pawar 
2329*c83a76b0SSuyog Pawar     /*************************************************************************/
2330*c83a76b0SSuyog Pawar     /* stores min id in grid with associated min cost.                       */
2331*c83a76b0SSuyog Pawar     /*************************************************************************/
2332*c83a76b0SSuyog Pawar     S32 i4_min_cost, i4_min_sad;
2333*c83a76b0SSuyog Pawar     GRID_PT_T e_min_id;
2334*c83a76b0SSuyog Pawar 
2335*c83a76b0SSuyog Pawar     PF_INTERP_FXN_T pf_qpel_interp;
2336*c83a76b0SSuyog Pawar     /*************************************************************************/
2337*c83a76b0SSuyog Pawar     /* For hpel and qpel we move in diamonds and hence each point in the     */
2338*c83a76b0SSuyog Pawar     /* diamond will belong to a completely different plane. To simplify the  */
2339*c83a76b0SSuyog Pawar     /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the    */
2340*c83a76b0SSuyog Pawar     /* hpel planes which are interpolated during recon.                      */
2341*c83a76b0SSuyog Pawar     /*************************************************************************/
2342*c83a76b0SSuyog Pawar     U08 *apu1_hpel_ref[4], *pu1_ref;
2343*c83a76b0SSuyog Pawar 
2344*c83a76b0SSuyog Pawar     interp_prms_t s_interp_prms;
2345*c83a76b0SSuyog Pawar 
2346*c83a76b0SSuyog Pawar     /*************************************************************************/
2347*c83a76b0SSuyog Pawar     /* Maintains the minimum id of interpolated buffers, and the pointer that*/
2348*c83a76b0SSuyog Pawar     /* points to the corresponding predicted buf with its stride.            */
2349*c83a76b0SSuyog Pawar     /* Note that the pointer cannot be derived just from the id, since the   */
2350*c83a76b0SSuyog Pawar     /* pointer may also point to the hpel buffer (in case we request interp  */
2351*c83a76b0SSuyog Pawar     /* of a hpel pt, which already exists in the recon hpel planes)          */
2352*c83a76b0SSuyog Pawar     /*************************************************************************/
2353*c83a76b0SSuyog Pawar     U08 *pu1_final_out;
2354*c83a76b0SSuyog Pawar     S32 i4_final_out_stride;
2355*c83a76b0SSuyog Pawar     S32 part_id;
2356*c83a76b0SSuyog Pawar     S32 check_for_duplicate = 0;
2357*c83a76b0SSuyog Pawar 
2358*c83a76b0SSuyog Pawar     subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;
2359*c83a76b0SSuyog Pawar 
2360*c83a76b0SSuyog Pawar     S32 mvx_qpel;
2361*c83a76b0SSuyog Pawar     S32 mvy_qpel;
2362*c83a76b0SSuyog Pawar 
2363*c83a76b0SSuyog Pawar     pf_err_compute = hme_get_calc_sad_and_result_subpel_fxn(
2364*c83a76b0SSuyog Pawar         ps_func_selector,
2365*c83a76b0SSuyog Pawar         ps_me_optimised_function_list,
2366*c83a76b0SSuyog Pawar         i4_part_mask,
2367*c83a76b0SSuyog Pawar         ps_prms->i4_use_satd,
2368*c83a76b0SSuyog Pawar         ps_subpel_refine_ctxt->i4_num_valid_parts,
2369*c83a76b0SSuyog Pawar         ps_search_results->u1_num_results_per_part);
2370*c83a76b0SSuyog Pawar 
2371*c83a76b0SSuyog Pawar     i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
2372*c83a76b0SSuyog Pawar     i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;
2373*c83a76b0SSuyog Pawar 
2374*c83a76b0SSuyog Pawar     /* Prediction contet should now deal with qpel units */
2375*c83a76b0SSuyog Pawar     HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);
2376*c83a76b0SSuyog Pawar 
2377*c83a76b0SSuyog Pawar     /* Buffer allocation for subpel */
2378*c83a76b0SSuyog Pawar     /* Current design is that there may be many partitions and different mvs */
2379*c83a76b0SSuyog Pawar     /* that attempt subpel refinemnt. While there is possibility of overlap, the */
2380*c83a76b0SSuyog Pawar     /* hashing to detect and avoid overlap may be very complex. So, currently,   */
2381*c83a76b0SSuyog Pawar     /* the only thing done is to store the eventual predicted buffer with every  */
2382*c83a76b0SSuyog Pawar     /* ctb node that holds the result of hte best subpel search */
2383*c83a76b0SSuyog Pawar 
2384*c83a76b0SSuyog Pawar     /* Compute the base pointer for input, interpolated buffers */
2385*c83a76b0SSuyog Pawar     /* The base pointers point as follows: */
2386*c83a76b0SSuyog Pawar     /* fx fy : 0, 0 :: fx, hy : 0, 0.5, hx, fy: 0.5, 0, hx, fy: 0.5, 0.5 */
2387*c83a76b0SSuyog Pawar     /* To these, we need to add the offset of the current node */
2388*c83a76b0SSuyog Pawar     i4_ref_stride = ps_curr_layer->i4_rec_stride;
2389*c83a76b0SSuyog Pawar     i4_offset = x_off + (y_off * i4_ref_stride);
2390*c83a76b0SSuyog Pawar     i1_ref_idx = ps_search_node->i1_ref_idx;
2391*c83a76b0SSuyog Pawar 
2392*c83a76b0SSuyog Pawar     apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
2393*c83a76b0SSuyog Pawar     apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
2394*c83a76b0SSuyog Pawar     apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
2395*c83a76b0SSuyog Pawar     apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;
2396*c83a76b0SSuyog Pawar 
2397*c83a76b0SSuyog Pawar     /* Initialize result params used for partition update */
2398*c83a76b0SSuyog Pawar     s_result_prms.pf_mv_cost_compute = NULL;
2399*c83a76b0SSuyog Pawar     s_result_prms.ps_search_results = ps_search_results;
2400*c83a76b0SSuyog Pawar     s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
2401*c83a76b0SSuyog Pawar     s_result_prms.i1_ref_idx = ps_search_node->i1_ref_idx;
2402*c83a76b0SSuyog Pawar     s_result_prms.u1_pred_lx = search_idx;
2403*c83a76b0SSuyog Pawar     s_result_prms.i4_part_mask = i4_part_mask;
2404*c83a76b0SSuyog Pawar     s_result_prms.ps_search_node_base = ps_search_node;
2405*c83a76b0SSuyog Pawar     s_result_prms.pi4_sad_grid = &ai4_sad_grid[0];
2406*c83a76b0SSuyog Pawar     s_result_prms.i4_grid_mask = 1;
2407*c83a76b0SSuyog Pawar     s_result_prms.ps_search_node = &s_temp_search_node;
2408*c83a76b0SSuyog Pawar     s_temp_search_node.i1_ref_idx = ps_search_node->i1_ref_idx;
2409*c83a76b0SSuyog Pawar 
2410*c83a76b0SSuyog Pawar     /* convert to hpel units */
2411*c83a76b0SSuyog Pawar     i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
2412*c83a76b0SSuyog Pawar     i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;
2413*c83a76b0SSuyog Pawar 
2414*c83a76b0SSuyog Pawar     /* for first pt, we compute at all locations in the grid, 4 + 1 centre */
2415*c83a76b0SSuyog Pawar     ps_range_prms = ps_prms->aps_mv_range_qpel[i1_ref_idx];
2416*c83a76b0SSuyog Pawar     i4_grid_mask = (GRID_DIAMOND_ENABLE_ALL);
2417*c83a76b0SSuyog Pawar     i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
2418*c83a76b0SSuyog Pawar 
2419*c83a76b0SSuyog Pawar     i4_min_cost = MAX_32BIT_VAL;
2420*c83a76b0SSuyog Pawar     i4_min_sad = MAX_32BIT_VAL;
2421*c83a76b0SSuyog Pawar 
2422*c83a76b0SSuyog Pawar     /*************************************************************************/
2423*c83a76b0SSuyog Pawar     /* Prepare the input params to SAD/SATD function. Note that input is     */
2424*c83a76b0SSuyog Pawar     /* passed from the calling funcion since it may be I (normal subpel      */
2425*c83a76b0SSuyog Pawar     /* refinement) or 2I - P0 in case of bidirect subpel refinement.         */
2426*c83a76b0SSuyog Pawar     /* Both cases are handled here.                                          */
2427*c83a76b0SSuyog Pawar     /*************************************************************************/
2428*c83a76b0SSuyog Pawar     s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
2429*c83a76b0SSuyog Pawar     s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
2430*c83a76b0SSuyog Pawar     s_err_prms.i4_ref_stride = i4_ref_stride;
2431*c83a76b0SSuyog Pawar     s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
2432*c83a76b0SSuyog Pawar     s_err_prms.i4_grid_mask = 1;
2433*c83a76b0SSuyog Pawar     s_err_prms.pi4_sad_grid = &ai4_sad_grid[0];
2434*c83a76b0SSuyog Pawar     s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
2435*c83a76b0SSuyog Pawar     s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
2436*c83a76b0SSuyog Pawar 
2437*c83a76b0SSuyog Pawar     s_result_prms.ps_subpel_refine_ctxt = ps_subpel_refine_ctxt;
2438*c83a76b0SSuyog Pawar 
2439*c83a76b0SSuyog Pawar     part_id = ps_search_node->u1_part_id;
2440*c83a76b0SSuyog Pawar     for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++)
2441*c83a76b0SSuyog Pawar     {
2442*c83a76b0SSuyog Pawar         e_min_id = PT_C;
2443*c83a76b0SSuyog Pawar 
2444*c83a76b0SSuyog Pawar         mvx_qpel = i4_mv_x << 1;
2445*c83a76b0SSuyog Pawar         mvy_qpel = i4_mv_y << 1;
2446*c83a76b0SSuyog Pawar 
2447*c83a76b0SSuyog Pawar         /* Central pt */
2448*c83a76b0SSuyog Pawar         if(i4_grid_mask & BIT_EN(PT_C))
2449*c83a76b0SSuyog Pawar         {
2450*c83a76b0SSuyog Pawar             //ps_search_node->i2_mv_x = (S16)i4_mv_x;
2451*c83a76b0SSuyog Pawar             //ps_search_node->i2_mv_x = (S16)i4_mv_y;
2452*c83a76b0SSuyog Pawar             /* central pt is i4_mv_x, i4_mv_y */
2453*c83a76b0SSuyog Pawar             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2454*c83a76b0SSuyog Pawar                 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate);
2455*c83a76b0SSuyog Pawar 
2456*c83a76b0SSuyog Pawar             i4_frac_x = i4_mv_x & 1;
2457*c83a76b0SSuyog Pawar             i4_frac_y = i4_mv_y & 1;
2458*c83a76b0SSuyog Pawar             pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2459*c83a76b0SSuyog Pawar             s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2460*c83a76b0SSuyog Pawar 
2461*c83a76b0SSuyog Pawar             /* Update the mv's with the current candt motion vectors */
2462*c83a76b0SSuyog Pawar             s_result_prms.i2_mv_x = mvx_qpel;
2463*c83a76b0SSuyog Pawar             s_result_prms.i2_mv_y = mvy_qpel;
2464*c83a76b0SSuyog Pawar             s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2465*c83a76b0SSuyog Pawar             s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2466*c83a76b0SSuyog Pawar 
2467*c83a76b0SSuyog Pawar             pf_err_compute(&s_err_prms, &s_result_prms);
2468*c83a76b0SSuyog Pawar 
2469*c83a76b0SSuyog Pawar             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2470*c83a76b0SSuyog Pawar             if(i4_tot_cost < i4_min_cost)
2471*c83a76b0SSuyog Pawar             {
2472*c83a76b0SSuyog Pawar                 i4_min_cost = i4_tot_cost;
2473*c83a76b0SSuyog Pawar                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2474*c83a76b0SSuyog Pawar                 e_min_id = PT_C;
2475*c83a76b0SSuyog Pawar                 pu1_final_out = s_err_prms.pu1_ref;
2476*c83a76b0SSuyog Pawar             }
2477*c83a76b0SSuyog Pawar         }
2478*c83a76b0SSuyog Pawar 
2479*c83a76b0SSuyog Pawar         /* left pt */
2480*c83a76b0SSuyog Pawar         if(i4_grid_mask & BIT_EN(PT_L))
2481*c83a76b0SSuyog Pawar         {
2482*c83a76b0SSuyog Pawar             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2483*c83a76b0SSuyog Pawar                 ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate);
2484*c83a76b0SSuyog Pawar 
2485*c83a76b0SSuyog Pawar             if(!check_for_duplicate)
2486*c83a76b0SSuyog Pawar             {
2487*c83a76b0SSuyog Pawar                 /* search node mv is stored in qpel units */
2488*c83a76b0SSuyog Pawar                 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1);
2489*c83a76b0SSuyog Pawar                 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
2490*c83a76b0SSuyog Pawar                 /* left pt is i4_mv_x - 1, i4_mv_y */
2491*c83a76b0SSuyog Pawar                 i4_frac_x = (i4_mv_x - 1) & 1;  // same as (x-1)&1
2492*c83a76b0SSuyog Pawar                 i4_frac_y = i4_mv_y & 1;
2493*c83a76b0SSuyog Pawar                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2494*c83a76b0SSuyog Pawar                 s_err_prms.pu1_ref =
2495*c83a76b0SSuyog Pawar                     pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2496*c83a76b0SSuyog Pawar 
2497*c83a76b0SSuyog Pawar                 /* Update the mv's with the current candt motion vectors */
2498*c83a76b0SSuyog Pawar                 s_result_prms.i2_mv_x = mvx_qpel - 2;
2499*c83a76b0SSuyog Pawar                 s_result_prms.i2_mv_y = mvy_qpel;
2500*c83a76b0SSuyog Pawar                 s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 2;
2501*c83a76b0SSuyog Pawar                 s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2502*c83a76b0SSuyog Pawar 
2503*c83a76b0SSuyog Pawar                 pf_err_compute(&s_err_prms, &s_result_prms);
2504*c83a76b0SSuyog Pawar                 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2505*c83a76b0SSuyog Pawar                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2506*c83a76b0SSuyog Pawar                 if(i4_tot_cost < i4_min_cost)
2507*c83a76b0SSuyog Pawar                 {
2508*c83a76b0SSuyog Pawar                     i4_min_cost = i4_tot_cost;
2509*c83a76b0SSuyog Pawar                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2510*c83a76b0SSuyog Pawar                     e_min_id = PT_L;
2511*c83a76b0SSuyog Pawar                     pu1_final_out = s_err_prms.pu1_ref;
2512*c83a76b0SSuyog Pawar                 }
2513*c83a76b0SSuyog Pawar             }
2514*c83a76b0SSuyog Pawar         }
2515*c83a76b0SSuyog Pawar         /* top pt */
2516*c83a76b0SSuyog Pawar         if(i4_grid_mask & BIT_EN(PT_T))
2517*c83a76b0SSuyog Pawar         {
2518*c83a76b0SSuyog Pawar             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2519*c83a76b0SSuyog Pawar                 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate);
2520*c83a76b0SSuyog Pawar 
2521*c83a76b0SSuyog Pawar             if(!check_for_duplicate)
2522*c83a76b0SSuyog Pawar             {
2523*c83a76b0SSuyog Pawar                 /* search node mv is stored in qpel units */
2524*c83a76b0SSuyog Pawar                 ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
2525*c83a76b0SSuyog Pawar                 ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1);
2526*c83a76b0SSuyog Pawar                 /* top pt is i4_mv_x, i4_mv_y - 1 */
2527*c83a76b0SSuyog Pawar                 i4_frac_x = i4_mv_x & 1;
2528*c83a76b0SSuyog Pawar                 i4_frac_y = (i4_mv_y - 1) & 1;
2529*c83a76b0SSuyog Pawar                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2530*c83a76b0SSuyog Pawar                 s_err_prms.pu1_ref =
2531*c83a76b0SSuyog Pawar                     pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride);
2532*c83a76b0SSuyog Pawar 
2533*c83a76b0SSuyog Pawar                 /* Update the mv's with the current candt motion vectors */
2534*c83a76b0SSuyog Pawar                 s_result_prms.i2_mv_x = mvx_qpel;
2535*c83a76b0SSuyog Pawar                 s_result_prms.i2_mv_y = mvy_qpel - 2;
2536*c83a76b0SSuyog Pawar                 s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2537*c83a76b0SSuyog Pawar                 s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 2;
2538*c83a76b0SSuyog Pawar 
2539*c83a76b0SSuyog Pawar                 pf_err_compute(&s_err_prms, &s_result_prms);
2540*c83a76b0SSuyog Pawar                 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2541*c83a76b0SSuyog Pawar                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2542*c83a76b0SSuyog Pawar                 if(i4_tot_cost < i4_min_cost)
2543*c83a76b0SSuyog Pawar                 {
2544*c83a76b0SSuyog Pawar                     i4_min_cost = i4_tot_cost;
2545*c83a76b0SSuyog Pawar                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2546*c83a76b0SSuyog Pawar                     e_min_id = PT_T;
2547*c83a76b0SSuyog Pawar                     pu1_final_out = s_err_prms.pu1_ref;
2548*c83a76b0SSuyog Pawar                 }
2549*c83a76b0SSuyog Pawar             }
2550*c83a76b0SSuyog Pawar         }
2551*c83a76b0SSuyog Pawar         /* right pt */
2552*c83a76b0SSuyog Pawar         if(i4_grid_mask & BIT_EN(PT_R))
2553*c83a76b0SSuyog Pawar         {
2554*c83a76b0SSuyog Pawar             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2555*c83a76b0SSuyog Pawar                 ps_dedup_enabler, num_unique_nodes, mvx_qpel + 2, mvy_qpel, check_for_duplicate);
2556*c83a76b0SSuyog Pawar             if(!check_for_duplicate)
2557*c83a76b0SSuyog Pawar             {
2558*c83a76b0SSuyog Pawar                 /* search node mv is stored in qpel units */
2559*c83a76b0SSuyog Pawar                 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1);
2560*c83a76b0SSuyog Pawar                 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
2561*c83a76b0SSuyog Pawar                 /* right pt is i4_mv_x + 1, i4_mv_y */
2562*c83a76b0SSuyog Pawar                 i4_frac_x = (i4_mv_x + 1) & 1;
2563*c83a76b0SSuyog Pawar                 i4_frac_y = i4_mv_y & 1;
2564*c83a76b0SSuyog Pawar 
2565*c83a76b0SSuyog Pawar                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2566*c83a76b0SSuyog Pawar                 s_err_prms.pu1_ref =
2567*c83a76b0SSuyog Pawar                     pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
2568*c83a76b0SSuyog Pawar 
2569*c83a76b0SSuyog Pawar                 /* Update the mv's with the current candt motion vectors */
2570*c83a76b0SSuyog Pawar                 s_result_prms.i2_mv_x = mvx_qpel + 2;
2571*c83a76b0SSuyog Pawar                 s_result_prms.i2_mv_y = mvy_qpel;
2572*c83a76b0SSuyog Pawar                 s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 2;
2573*c83a76b0SSuyog Pawar                 s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2574*c83a76b0SSuyog Pawar 
2575*c83a76b0SSuyog Pawar                 pf_err_compute(&s_err_prms, &s_result_prms);
2576*c83a76b0SSuyog Pawar                 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2577*c83a76b0SSuyog Pawar                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2578*c83a76b0SSuyog Pawar                 if(i4_tot_cost < i4_min_cost)
2579*c83a76b0SSuyog Pawar                 {
2580*c83a76b0SSuyog Pawar                     i4_min_cost = i4_tot_cost;
2581*c83a76b0SSuyog Pawar                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2582*c83a76b0SSuyog Pawar                     e_min_id = PT_R;
2583*c83a76b0SSuyog Pawar                     pu1_final_out = s_err_prms.pu1_ref;
2584*c83a76b0SSuyog Pawar                 }
2585*c83a76b0SSuyog Pawar             }
2586*c83a76b0SSuyog Pawar         }
2587*c83a76b0SSuyog Pawar         /* bottom pt */
2588*c83a76b0SSuyog Pawar         if(i4_grid_mask & BIT_EN(PT_B))
2589*c83a76b0SSuyog Pawar         {
2590*c83a76b0SSuyog Pawar             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2591*c83a76b0SSuyog Pawar                 ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 2, check_for_duplicate);
2592*c83a76b0SSuyog Pawar             if(!check_for_duplicate)
2593*c83a76b0SSuyog Pawar             {
2594*c83a76b0SSuyog Pawar                 /* search node mv is stored in qpel units */
2595*c83a76b0SSuyog Pawar                 ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1);
2596*c83a76b0SSuyog Pawar                 ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1);
2597*c83a76b0SSuyog Pawar                 i4_frac_x = i4_mv_x & 1;
2598*c83a76b0SSuyog Pawar                 i4_frac_y = (i4_mv_y + 1) & 1;
2599*c83a76b0SSuyog Pawar                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
2600*c83a76b0SSuyog Pawar                 s_err_prms.pu1_ref =
2601*c83a76b0SSuyog Pawar                     pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride);
2602*c83a76b0SSuyog Pawar 
2603*c83a76b0SSuyog Pawar                 /* Update the mv's with the current candt motion vectors */
2604*c83a76b0SSuyog Pawar                 s_result_prms.i2_mv_x = mvx_qpel;
2605*c83a76b0SSuyog Pawar                 s_result_prms.i2_mv_y = mvy_qpel + 2;
2606*c83a76b0SSuyog Pawar                 s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2607*c83a76b0SSuyog Pawar                 s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 2;
2608*c83a76b0SSuyog Pawar 
2609*c83a76b0SSuyog Pawar                 pf_err_compute(&s_err_prms, &s_result_prms);
2610*c83a76b0SSuyog Pawar                 //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2611*c83a76b0SSuyog Pawar                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2612*c83a76b0SSuyog Pawar                 if(i4_tot_cost < i4_min_cost)
2613*c83a76b0SSuyog Pawar                 {
2614*c83a76b0SSuyog Pawar                     i4_min_cost = i4_tot_cost;
2615*c83a76b0SSuyog Pawar                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2616*c83a76b0SSuyog Pawar                     e_min_id = PT_B;
2617*c83a76b0SSuyog Pawar                     pu1_final_out = s_err_prms.pu1_ref;
2618*c83a76b0SSuyog Pawar                 }
2619*c83a76b0SSuyog Pawar             }
2620*c83a76b0SSuyog Pawar         }
2621*c83a76b0SSuyog Pawar         /* Early exit in case of central point */
2622*c83a76b0SSuyog Pawar         if(e_min_id == PT_C)
2623*c83a76b0SSuyog Pawar             break;
2624*c83a76b0SSuyog Pawar 
2625*c83a76b0SSuyog Pawar         /*********************************************************************/
2626*c83a76b0SSuyog Pawar         /* Depending on the best result location, we may be able to skip     */
2627*c83a76b0SSuyog Pawar         /* at least two pts, centre pt and one more pt. E.g. if right pt is  */
2628*c83a76b0SSuyog Pawar         /* the best result, the next iteration need not do centre, left pts  */
2629*c83a76b0SSuyog Pawar         /*********************************************************************/
2630*c83a76b0SSuyog Pawar         i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
2631*c83a76b0SSuyog Pawar         i4_mv_x += gai1_grid_id_to_x[e_min_id];
2632*c83a76b0SSuyog Pawar         i4_mv_y += gai1_grid_id_to_y[e_min_id];
2633*c83a76b0SSuyog Pawar         ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2634*c83a76b0SSuyog Pawar         ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2635*c83a76b0SSuyog Pawar         i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
2636*c83a76b0SSuyog Pawar     }
2637*c83a76b0SSuyog Pawar 
2638*c83a76b0SSuyog Pawar     /* Convert to QPEL units */
2639*c83a76b0SSuyog Pawar     i4_mv_x <<= 1;
2640*c83a76b0SSuyog Pawar     i4_mv_y <<= 1;
2641*c83a76b0SSuyog Pawar 
2642*c83a76b0SSuyog Pawar     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2643*c83a76b0SSuyog Pawar     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2644*c83a76b0SSuyog Pawar 
2645*c83a76b0SSuyog Pawar     /* Exact interpolation or averaging chosen here */
2646*c83a76b0SSuyog Pawar     pf_qpel_interp = ps_prms->pf_qpel_interp;
2647*c83a76b0SSuyog Pawar 
2648*c83a76b0SSuyog Pawar     /* Next QPEL ME */
2649*c83a76b0SSuyog Pawar     /* In this case, we have option of doing exact QPEL interpolation or avg */
2650*c83a76b0SSuyog Pawar     /*************************************************************************/
2651*c83a76b0SSuyog Pawar     /*        x                                                              */
2652*c83a76b0SSuyog Pawar     /*    A b C d                                                            */
2653*c83a76b0SSuyog Pawar     /*    e f g h                                                            */
2654*c83a76b0SSuyog Pawar     /*    I j K l                                                            */
2655*c83a76b0SSuyog Pawar     /*    m n o p                                                            */
2656*c83a76b0SSuyog Pawar     /*    Q r S t                                                            */
2657*c83a76b0SSuyog Pawar     /*                                                                       */
2658*c83a76b0SSuyog Pawar     /*    Approximate QPEL logic                                             */
2659*c83a76b0SSuyog Pawar     /*    b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K)                  */
2660*c83a76b0SSuyog Pawar     /*    for any given pt, we can get all the information required about    */
2661*c83a76b0SSuyog Pawar     /*    the surrounding 4 pts. For example, given point C (0.5, 0)         */
2662*c83a76b0SSuyog Pawar     /*     surrounding pts info:                                             */
2663*c83a76b0SSuyog Pawar     /*     b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf  */
2664*c83a76b0SSuyog Pawar     /*           buffer 2: hxfy, offsets for both are 0, 0                   */
2665*c83a76b0SSuyog Pawar     /*    similarly for other pts the info can be gotten                     */
2666*c83a76b0SSuyog Pawar     /*************************************************************************/
2667*c83a76b0SSuyog Pawar     i4_grid_mask = GRID_DIAMOND_ENABLE_ALL ^ (BIT_EN(PT_C));
2668*c83a76b0SSuyog Pawar     i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
2669*c83a76b0SSuyog Pawar 
2670*c83a76b0SSuyog Pawar     /*************************************************************************/
2671*c83a76b0SSuyog Pawar     /* One time preparation of non changing interpolation params. These      */
2672*c83a76b0SSuyog Pawar     /* include a set of ping pong result buf ptrs, input buf ptrs and some   */
2673*c83a76b0SSuyog Pawar     /* working memory (not used though in case of averaging).                */
2674*c83a76b0SSuyog Pawar     /*************************************************************************/
2675*c83a76b0SSuyog Pawar     s_interp_prms.ppu1_ref = &apu1_hpel_ref[0];
2676*c83a76b0SSuyog Pawar     s_interp_prms.i4_ref_stride = i4_ref_stride;
2677*c83a76b0SSuyog Pawar     s_interp_prms.i4_blk_wd = i4_blk_wd;
2678*c83a76b0SSuyog Pawar     s_interp_prms.i4_blk_ht = i4_blk_ht;
2679*c83a76b0SSuyog Pawar 
2680*c83a76b0SSuyog Pawar     i4_final_out_stride = i4_ref_stride;
2681*c83a76b0SSuyog Pawar 
2682*c83a76b0SSuyog Pawar     {
2683*c83a76b0SSuyog Pawar         U08 *pu1_mem;
2684*c83a76b0SSuyog Pawar         /*********************************************************************/
2685*c83a76b0SSuyog Pawar         /* Allocation of working memory for interpolated buffers. We maintain*/
2686*c83a76b0SSuyog Pawar         /* an intermediate working buffer, and 2 ping pong interpolated out  */
2687*c83a76b0SSuyog Pawar         /* buffers, purpose of ping pong explained later below               */
2688*c83a76b0SSuyog Pawar         /*********************************************************************/
2689*c83a76b0SSuyog Pawar         pu1_mem = ps_prms->pu1_wkg_mem;
2690*c83a76b0SSuyog Pawar         s_interp_prms.pu1_wkg_mem = pu1_mem;
2691*c83a76b0SSuyog Pawar 
2692*c83a76b0SSuyog Pawar         //pu1_mem += (INTERP_INTERMED_BUF_SIZE);
2693*c83a76b0SSuyog Pawar         s_interp_prms.apu1_interp_out[0] = pu1_mem;
2694*c83a76b0SSuyog Pawar 
2695*c83a76b0SSuyog Pawar         pu1_mem += (INTERP_OUT_BUF_SIZE);
2696*c83a76b0SSuyog Pawar         s_interp_prms.apu1_interp_out[1] = pu1_mem;
2697*c83a76b0SSuyog Pawar 
2698*c83a76b0SSuyog Pawar         pu1_mem += (INTERP_OUT_BUF_SIZE);
2699*c83a76b0SSuyog Pawar         s_interp_prms.apu1_interp_out[2] = pu1_mem;
2700*c83a76b0SSuyog Pawar 
2701*c83a76b0SSuyog Pawar         pu1_mem += (INTERP_OUT_BUF_SIZE);
2702*c83a76b0SSuyog Pawar         s_interp_prms.apu1_interp_out[3] = pu1_mem;
2703*c83a76b0SSuyog Pawar 
2704*c83a76b0SSuyog Pawar         pu1_mem += (INTERP_OUT_BUF_SIZE);
2705*c83a76b0SSuyog Pawar         s_interp_prms.apu1_interp_out[4] = pu1_mem;
2706*c83a76b0SSuyog Pawar 
2707*c83a76b0SSuyog Pawar         /*********************************************************************/
2708*c83a76b0SSuyog Pawar         /* Stride of interpolated output is just a function of blk width of  */
2709*c83a76b0SSuyog Pawar         /* this partition and hence remains constant for this partition      */
2710*c83a76b0SSuyog Pawar         /*********************************************************************/
2711*c83a76b0SSuyog Pawar         s_interp_prms.i4_out_stride = (i4_blk_wd);
2712*c83a76b0SSuyog Pawar     }
2713*c83a76b0SSuyog Pawar 
2714*c83a76b0SSuyog Pawar     {
2715*c83a76b0SSuyog Pawar         UWORD8 *apu1_final[4];
2716*c83a76b0SSuyog Pawar         WORD32 ai4_ref_stride[4];
2717*c83a76b0SSuyog Pawar         /*************************************************************************/
2718*c83a76b0SSuyog Pawar         /* Ping pong design for interpolated buffers. We use a min id, which     */
2719*c83a76b0SSuyog Pawar         /* tracks the id of the ppu1_interp_out that stores the best result.     */
2720*c83a76b0SSuyog Pawar         /* When a new interp is needed, it uses 1 - best result id for the interp*/
2721*c83a76b0SSuyog Pawar         /* min id is toggled when any new result becomes the best result.        */
2722*c83a76b0SSuyog Pawar         /*************************************************************************/
2723*c83a76b0SSuyog Pawar 
2724*c83a76b0SSuyog Pawar         for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++)
2725*c83a76b0SSuyog Pawar         {
2726*c83a76b0SSuyog Pawar             e_min_id = PT_C;
2727*c83a76b0SSuyog Pawar 
2728*c83a76b0SSuyog Pawar             mvx_qpel = i4_mv_x;
2729*c83a76b0SSuyog Pawar             mvy_qpel = i4_mv_y;
2730*c83a76b0SSuyog Pawar             hme_qpel_interp_comprehensive(
2731*c83a76b0SSuyog Pawar                 &s_interp_prms,
2732*c83a76b0SSuyog Pawar                 apu1_final,
2733*c83a76b0SSuyog Pawar                 ai4_ref_stride,
2734*c83a76b0SSuyog Pawar                 i4_mv_x,
2735*c83a76b0SSuyog Pawar                 i4_mv_y,
2736*c83a76b0SSuyog Pawar                 i4_grid_mask,
2737*c83a76b0SSuyog Pawar                 ps_me_optimised_function_list);
2738*c83a76b0SSuyog Pawar             if(i4_grid_mask & BIT_EN(PT_L))
2739*c83a76b0SSuyog Pawar             {
2740*c83a76b0SSuyog Pawar                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2741*c83a76b0SSuyog Pawar                     ps_dedup_enabler,
2742*c83a76b0SSuyog Pawar                     num_unique_nodes,
2743*c83a76b0SSuyog Pawar                     mvx_qpel - 1,
2744*c83a76b0SSuyog Pawar                     mvy_qpel - 0,
2745*c83a76b0SSuyog Pawar                     check_for_duplicate);
2746*c83a76b0SSuyog Pawar 
2747*c83a76b0SSuyog Pawar                 if(!check_for_duplicate)
2748*c83a76b0SSuyog Pawar                 {
2749*c83a76b0SSuyog Pawar                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
2750*c83a76b0SSuyog Pawar                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2751*c83a76b0SSuyog Pawar 
2752*c83a76b0SSuyog Pawar                     s_err_prms.pu1_ref = apu1_final[0];
2753*c83a76b0SSuyog Pawar                     s_err_prms.i4_ref_stride = ai4_ref_stride[0];
2754*c83a76b0SSuyog Pawar 
2755*c83a76b0SSuyog Pawar                     /* Update the mv's with the current candt motion vectors */
2756*c83a76b0SSuyog Pawar                     s_result_prms.i2_mv_x = mvx_qpel - 1;
2757*c83a76b0SSuyog Pawar                     s_result_prms.i2_mv_y = mvy_qpel;
2758*c83a76b0SSuyog Pawar                     s_temp_search_node.s_mv.i2_mvx = mvx_qpel - 1;
2759*c83a76b0SSuyog Pawar                     s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2760*c83a76b0SSuyog Pawar 
2761*c83a76b0SSuyog Pawar                     pf_err_compute(&s_err_prms, &s_result_prms);
2762*c83a76b0SSuyog Pawar                     //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2763*c83a76b0SSuyog Pawar 
2764*c83a76b0SSuyog Pawar                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2765*c83a76b0SSuyog Pawar                     if(i4_tot_cost < i4_min_cost)
2766*c83a76b0SSuyog Pawar                     {
2767*c83a76b0SSuyog Pawar                         e_min_id = PT_L;
2768*c83a76b0SSuyog Pawar                         i4_min_cost = i4_tot_cost;
2769*c83a76b0SSuyog Pawar                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2770*c83a76b0SSuyog Pawar                     }
2771*c83a76b0SSuyog Pawar                 }
2772*c83a76b0SSuyog Pawar             }
2773*c83a76b0SSuyog Pawar             if(i4_grid_mask & BIT_EN(PT_T))
2774*c83a76b0SSuyog Pawar             {
2775*c83a76b0SSuyog Pawar                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2776*c83a76b0SSuyog Pawar                     ps_dedup_enabler,
2777*c83a76b0SSuyog Pawar                     num_unique_nodes,
2778*c83a76b0SSuyog Pawar                     mvx_qpel - 0,
2779*c83a76b0SSuyog Pawar                     mvy_qpel - 1,
2780*c83a76b0SSuyog Pawar                     check_for_duplicate);
2781*c83a76b0SSuyog Pawar 
2782*c83a76b0SSuyog Pawar                 if(!check_for_duplicate)
2783*c83a76b0SSuyog Pawar                 {
2784*c83a76b0SSuyog Pawar                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2785*c83a76b0SSuyog Pawar                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
2786*c83a76b0SSuyog Pawar 
2787*c83a76b0SSuyog Pawar                     s_err_prms.pu1_ref = apu1_final[1];
2788*c83a76b0SSuyog Pawar                     s_err_prms.i4_ref_stride = ai4_ref_stride[1];
2789*c83a76b0SSuyog Pawar 
2790*c83a76b0SSuyog Pawar                     /* Update the mv's with the current candt motion vectors */
2791*c83a76b0SSuyog Pawar                     s_result_prms.i2_mv_x = mvx_qpel;
2792*c83a76b0SSuyog Pawar                     s_result_prms.i2_mv_y = mvy_qpel - 1;
2793*c83a76b0SSuyog Pawar 
2794*c83a76b0SSuyog Pawar                     s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2795*c83a76b0SSuyog Pawar                     s_temp_search_node.s_mv.i2_mvy = mvy_qpel - 1;
2796*c83a76b0SSuyog Pawar 
2797*c83a76b0SSuyog Pawar                     pf_err_compute(&s_err_prms, &s_result_prms);
2798*c83a76b0SSuyog Pawar 
2799*c83a76b0SSuyog Pawar                     //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2800*c83a76b0SSuyog Pawar                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2801*c83a76b0SSuyog Pawar                     if(i4_tot_cost < i4_min_cost)
2802*c83a76b0SSuyog Pawar                     {
2803*c83a76b0SSuyog Pawar                         e_min_id = PT_T;
2804*c83a76b0SSuyog Pawar                         i4_min_cost = i4_tot_cost;
2805*c83a76b0SSuyog Pawar                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2806*c83a76b0SSuyog Pawar                     }
2807*c83a76b0SSuyog Pawar                 }
2808*c83a76b0SSuyog Pawar             }
2809*c83a76b0SSuyog Pawar             if(i4_grid_mask & BIT_EN(PT_R))
2810*c83a76b0SSuyog Pawar             {
2811*c83a76b0SSuyog Pawar                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2812*c83a76b0SSuyog Pawar                     ps_dedup_enabler, num_unique_nodes, mvx_qpel + 1, mvy_qpel, check_for_duplicate);
2813*c83a76b0SSuyog Pawar 
2814*c83a76b0SSuyog Pawar                 if(!check_for_duplicate)
2815*c83a76b0SSuyog Pawar                 {
2816*c83a76b0SSuyog Pawar                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
2817*c83a76b0SSuyog Pawar                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2818*c83a76b0SSuyog Pawar 
2819*c83a76b0SSuyog Pawar                     s_err_prms.pu1_ref = apu1_final[2];
2820*c83a76b0SSuyog Pawar                     s_err_prms.i4_ref_stride = ai4_ref_stride[2];
2821*c83a76b0SSuyog Pawar 
2822*c83a76b0SSuyog Pawar                     /* Update the mv's with the current candt motion vectors */
2823*c83a76b0SSuyog Pawar                     s_result_prms.i2_mv_x = mvx_qpel + 1;
2824*c83a76b0SSuyog Pawar                     s_result_prms.i2_mv_y = mvy_qpel;
2825*c83a76b0SSuyog Pawar 
2826*c83a76b0SSuyog Pawar                     s_temp_search_node.s_mv.i2_mvx = mvx_qpel + 1;
2827*c83a76b0SSuyog Pawar                     s_temp_search_node.s_mv.i2_mvy = mvy_qpel;
2828*c83a76b0SSuyog Pawar 
2829*c83a76b0SSuyog Pawar                     pf_err_compute(&s_err_prms, &s_result_prms);
2830*c83a76b0SSuyog Pawar 
2831*c83a76b0SSuyog Pawar                     //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2832*c83a76b0SSuyog Pawar 
2833*c83a76b0SSuyog Pawar                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2834*c83a76b0SSuyog Pawar                     if(i4_tot_cost < i4_min_cost)
2835*c83a76b0SSuyog Pawar                     {
2836*c83a76b0SSuyog Pawar                         e_min_id = PT_R;
2837*c83a76b0SSuyog Pawar                         i4_min_cost = i4_tot_cost;
2838*c83a76b0SSuyog Pawar                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2839*c83a76b0SSuyog Pawar                     }
2840*c83a76b0SSuyog Pawar                 }
2841*c83a76b0SSuyog Pawar             }
2842*c83a76b0SSuyog Pawar             /* i4_mv_x and i4_mv_y will always be the centre pt */
2843*c83a76b0SSuyog Pawar             /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
2844*c83a76b0SSuyog Pawar             if(i4_grid_mask & BIT_EN(PT_B))
2845*c83a76b0SSuyog Pawar             {
2846*c83a76b0SSuyog Pawar                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
2847*c83a76b0SSuyog Pawar                     ps_dedup_enabler, num_unique_nodes, mvx_qpel, mvy_qpel + 1, check_for_duplicate);
2848*c83a76b0SSuyog Pawar 
2849*c83a76b0SSuyog Pawar                 if(!check_for_duplicate)
2850*c83a76b0SSuyog Pawar                 {
2851*c83a76b0SSuyog Pawar                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2852*c83a76b0SSuyog Pawar                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
2853*c83a76b0SSuyog Pawar 
2854*c83a76b0SSuyog Pawar                     s_err_prms.pu1_ref = apu1_final[3];
2855*c83a76b0SSuyog Pawar                     s_err_prms.i4_ref_stride = ai4_ref_stride[3];
2856*c83a76b0SSuyog Pawar 
2857*c83a76b0SSuyog Pawar                     /* Update the mv's with the current candt motion vectors */
2858*c83a76b0SSuyog Pawar                     s_result_prms.i2_mv_x = mvx_qpel;
2859*c83a76b0SSuyog Pawar                     s_result_prms.i2_mv_y = mvy_qpel + 1;
2860*c83a76b0SSuyog Pawar 
2861*c83a76b0SSuyog Pawar                     s_temp_search_node.s_mv.i2_mvx = mvx_qpel;
2862*c83a76b0SSuyog Pawar                     s_temp_search_node.s_mv.i2_mvy = mvy_qpel + 1;
2863*c83a76b0SSuyog Pawar 
2864*c83a76b0SSuyog Pawar                     pf_err_compute(&s_err_prms, &s_result_prms);
2865*c83a76b0SSuyog Pawar 
2866*c83a76b0SSuyog Pawar                     //hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
2867*c83a76b0SSuyog Pawar                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
2868*c83a76b0SSuyog Pawar                     if(i4_tot_cost < i4_min_cost)
2869*c83a76b0SSuyog Pawar                     {
2870*c83a76b0SSuyog Pawar                         e_min_id = PT_B;
2871*c83a76b0SSuyog Pawar                         i4_min_cost = i4_tot_cost;
2872*c83a76b0SSuyog Pawar                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
2873*c83a76b0SSuyog Pawar                     }
2874*c83a76b0SSuyog Pawar                 }
2875*c83a76b0SSuyog Pawar             }
2876*c83a76b0SSuyog Pawar 
2877*c83a76b0SSuyog Pawar             /* New QPEL mv x and y */
2878*c83a76b0SSuyog Pawar             if(e_min_id == PT_C)
2879*c83a76b0SSuyog Pawar                 break;
2880*c83a76b0SSuyog Pawar             i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
2881*c83a76b0SSuyog Pawar             i4_mv_x += gai1_grid_id_to_x[e_min_id];
2882*c83a76b0SSuyog Pawar             i4_mv_y += gai1_grid_id_to_y[e_min_id];
2883*c83a76b0SSuyog Pawar             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2884*c83a76b0SSuyog Pawar             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2885*c83a76b0SSuyog Pawar             i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
2886*c83a76b0SSuyog Pawar         }
2887*c83a76b0SSuyog Pawar     }
2888*c83a76b0SSuyog Pawar 
2889*c83a76b0SSuyog Pawar     /* update modified motion vectors and cost at end of subpel */
2890*c83a76b0SSuyog Pawar     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
2891*c83a76b0SSuyog Pawar     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
2892*c83a76b0SSuyog Pawar     ps_search_node->i4_tot_cost = i4_min_cost;
2893*c83a76b0SSuyog Pawar     ps_search_node->i4_sad = i4_min_sad;
2894*c83a76b0SSuyog Pawar 
2895*c83a76b0SSuyog Pawar     /********************************************************************************/
2896*c83a76b0SSuyog Pawar     /* TODO: Restoring back Sad lambda from Hadamard lambda                         */
2897*c83a76b0SSuyog Pawar     /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */
2898*c83a76b0SSuyog Pawar     /********************************************************************************/
2899*c83a76b0SSuyog Pawar     //ps_pred_ctxt->lambda >>= 1;
2900*c83a76b0SSuyog Pawar 
2901*c83a76b0SSuyog Pawar     return (i4_min_cost);
2902*c83a76b0SSuyog Pawar }
2903*c83a76b0SSuyog Pawar #elif DIAMOND_GRID == 0
hme_subpel_refine_search_node_high_speed(search_node_t * ps_search_node,hme_subpel_prms_t * ps_prms,layer_ctxt_t * ps_curr_layer,BLK_SIZE_T e_blk_size,S32 x_off,S32 y_off,search_results_t * ps_search_results,S32 pred_lx,S32 i4_part_mask,S32 * pi4_valid_part_ids,S32 search_idx,subpel_dedup_enabler_t * ps_dedup_enabler,me_func_selector_t * ps_func_selector)2904*c83a76b0SSuyog Pawar S32 hme_subpel_refine_search_node_high_speed(
2905*c83a76b0SSuyog Pawar     search_node_t *ps_search_node,
2906*c83a76b0SSuyog Pawar     hme_subpel_prms_t *ps_prms,
2907*c83a76b0SSuyog Pawar     layer_ctxt_t *ps_curr_layer,
2908*c83a76b0SSuyog Pawar     BLK_SIZE_T e_blk_size,
2909*c83a76b0SSuyog Pawar     S32 x_off,
2910*c83a76b0SSuyog Pawar     S32 y_off,
2911*c83a76b0SSuyog Pawar     search_results_t *ps_search_results,
2912*c83a76b0SSuyog Pawar     S32 pred_lx,
2913*c83a76b0SSuyog Pawar     S32 i4_part_mask,
2914*c83a76b0SSuyog Pawar     S32 *pi4_valid_part_ids,
2915*c83a76b0SSuyog Pawar     S32 search_idx,
2916*c83a76b0SSuyog Pawar     subpel_dedup_enabler_t *ps_dedup_enabler,
2917*c83a76b0SSuyog Pawar     me_func_selector_t *ps_func_selector)
2918*c83a76b0SSuyog Pawar {
2919*c83a76b0SSuyog Pawar     S32 i4_num_hpel_refine, i4_num_qpel_refine;
2920*c83a76b0SSuyog Pawar     S32 i4_offset, i4_grid_mask;
2921*c83a76b0SSuyog Pawar     S08 i1_ref_idx;
2922*c83a76b0SSuyog Pawar     S32 i4_blk_wd, i4_blk_ht;
2923*c83a76b0SSuyog Pawar     S32 i4_ref_stride, i4_i;
2924*c83a76b0SSuyog Pawar     pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2925*c83a76b0SSuyog Pawar     result_upd_prms_t s_result_prms;
2926*c83a76b0SSuyog Pawar 
2927*c83a76b0SSuyog Pawar     /*************************************************************************/
2928*c83a76b0SSuyog Pawar     /* Tracks current MV with the fractional component.                      */
2929*c83a76b0SSuyog Pawar     /*************************************************************************/
2930*c83a76b0SSuyog Pawar     S32 i4_mv_x, i4_mv_y;
2931*c83a76b0SSuyog Pawar     S32 i4_frac_x, i4_frac_y;
2932*c83a76b0SSuyog Pawar 
2933*c83a76b0SSuyog Pawar     /*************************************************************************/
2934*c83a76b0SSuyog Pawar     /* Function pointer for SAD/SATD, array and prms structure to pass to    */
2935*c83a76b0SSuyog Pawar     /* this function                                                         */
2936*c83a76b0SSuyog Pawar     /*************************************************************************/
2937*c83a76b0SSuyog Pawar     PF_SAD_FXN_T pf_err_compute;
2938*c83a76b0SSuyog Pawar     S32 ai4_sad_grid[9][17], i4_tot_cost;
2939*c83a76b0SSuyog Pawar     err_prms_t s_err_prms;
2940*c83a76b0SSuyog Pawar 
2941*c83a76b0SSuyog Pawar     /*************************************************************************/
2942*c83a76b0SSuyog Pawar     /* Allowed MV RANGE                                                      */
2943*c83a76b0SSuyog Pawar     /*************************************************************************/
2944*c83a76b0SSuyog Pawar     range_prms_t *ps_range_prms;
2945*c83a76b0SSuyog Pawar 
2946*c83a76b0SSuyog Pawar     /*************************************************************************/
2947*c83a76b0SSuyog Pawar     /* stores min id in grid with associated min cost.                       */
2948*c83a76b0SSuyog Pawar     /*************************************************************************/
2949*c83a76b0SSuyog Pawar     S32 i4_min_cost, i4_min_sad;
2950*c83a76b0SSuyog Pawar     GRID_PT_T e_min_id;
2951*c83a76b0SSuyog Pawar 
2952*c83a76b0SSuyog Pawar     PF_INTERP_FXN_T pf_qpel_interp;
2953*c83a76b0SSuyog Pawar     /*************************************************************************/
2954*c83a76b0SSuyog Pawar     /* For hpel and qpel we move in diamonds and hence each point in the     */
2955*c83a76b0SSuyog Pawar     /* diamond will belong to a completely different plane. To simplify the  */
2956*c83a76b0SSuyog Pawar     /* look up of the ref ptr, we declare a 2x2 array of ref ptrs for the    */
2957*c83a76b0SSuyog Pawar     /* hpel planes which are interpolated during recon.                      */
2958*c83a76b0SSuyog Pawar     /*************************************************************************/
2959*c83a76b0SSuyog Pawar     U08 *apu1_hpel_ref[4], *pu1_ref;
2960*c83a76b0SSuyog Pawar 
2961*c83a76b0SSuyog Pawar     interp_prms_t s_interp_prms;
2962*c83a76b0SSuyog Pawar 
2963*c83a76b0SSuyog Pawar     /*************************************************************************/
2964*c83a76b0SSuyog Pawar     /* Maintains the minimum id of interpolated buffers, and the pointer that*/
2965*c83a76b0SSuyog Pawar     /* points to the corresponding predicted buf with its stride.            */
2966*c83a76b0SSuyog Pawar     /* Note that the pointer cannot be derived just from the id, since the   */
2967*c83a76b0SSuyog Pawar     /* pointer may also point to the hpel buffer (in case we request interp  */
2968*c83a76b0SSuyog Pawar     /* of a hpel pt, which already exists in the recon hpel planes)          */
2969*c83a76b0SSuyog Pawar     /*************************************************************************/
2970*c83a76b0SSuyog Pawar     U08 *pu1_final_out;
2971*c83a76b0SSuyog Pawar     S32 i4_final_out_stride;
2972*c83a76b0SSuyog Pawar     S32 part_id;
2973*c83a76b0SSuyog Pawar     S32 check_for_duplicate = 0;
2974*c83a76b0SSuyog Pawar 
2975*c83a76b0SSuyog Pawar     S32 mvx_qpel;
2976*c83a76b0SSuyog Pawar     S32 mvy_qpel;
2977*c83a76b0SSuyog Pawar 
2978*c83a76b0SSuyog Pawar     /*************************************************************************/
2979*c83a76b0SSuyog Pawar     /* Appropriate Err compute fxn, depends on SAD/SATD, blk size and remains*/
2980*c83a76b0SSuyog Pawar     /* fixed through this subpel refinement for this partition.              */
2981*c83a76b0SSuyog Pawar     /* Note, we do not enable grid sads since each pt is different buffers.  */
2982*c83a76b0SSuyog Pawar     /* Hence, part mask is also nearly dont care and we use 2Nx2N enabled.   */
2983*c83a76b0SSuyog Pawar     /*************************************************************************/
2984*c83a76b0SSuyog Pawar     if(ps_prms->i4_use_satd)
2985*c83a76b0SSuyog Pawar     {
2986*c83a76b0SSuyog Pawar         pf_err_compute = hme_evalsatd_update_1_best_result_pt_pu_16x16;
2987*c83a76b0SSuyog Pawar     }
2988*c83a76b0SSuyog Pawar     else
2989*c83a76b0SSuyog Pawar     {
2990*c83a76b0SSuyog Pawar         pf_err_compute = hme_evalsad_grid_pu_16x16; /* hme_evalsad_pt_pu_16x16; */
2991*c83a76b0SSuyog Pawar     }
2992*c83a76b0SSuyog Pawar 
2993*c83a76b0SSuyog Pawar     i4_num_hpel_refine = ps_prms->i4_num_steps_hpel_refine;
2994*c83a76b0SSuyog Pawar     i4_num_qpel_refine = ps_prms->i4_num_steps_qpel_refine;
2995*c83a76b0SSuyog Pawar 
2996*c83a76b0SSuyog Pawar     /* Prediction context should now deal with qpel units */
2997*c83a76b0SSuyog Pawar     HME_SET_MVPRED_RES(ps_pred_ctxt, MV_RES_QPEL);
2998*c83a76b0SSuyog Pawar 
2999*c83a76b0SSuyog Pawar     /* Buffer allocation for subpel */
3000*c83a76b0SSuyog Pawar     /* Current design is that there may be many partitions and different mvs */
3001*c83a76b0SSuyog Pawar     /* that attempt subpel refinement. While there is possibility of overlap, the */
3002*c83a76b0SSuyog Pawar     /* hashing to detect and avoid overlap may be very complex. So, currently,   */
3003*c83a76b0SSuyog Pawar     /* the only thing done is to store the eventual predicted buffer with every  */
3004*c83a76b0SSuyog Pawar     /* ctb node that holds the result of the best subpel search */
3005*c83a76b0SSuyog Pawar 
3006*c83a76b0SSuyog Pawar     /* Compute the base pointer for input, interpolated buffers */
3007*c83a76b0SSuyog Pawar     /* The base pointers point as follows: */
3008*c83a76b0SSuyog Pawar     /* fx, fy: 0, 0 :: fx, hy: 0, 0.5 :: hx, fy: 0.5, 0 :: hx, hy: 0.5, 0.5 */
3009*c83a76b0SSuyog Pawar     /* To these, we need to add the offset of the current node */
3010*c83a76b0SSuyog Pawar     i4_ref_stride = ps_curr_layer->i4_rec_stride;
3011*c83a76b0SSuyog Pawar     i4_offset = x_off + (y_off * i4_ref_stride);
3012*c83a76b0SSuyog Pawar     i1_ref_idx = ps_search_node->i1_ref_idx;
3013*c83a76b0SSuyog Pawar 
3014*c83a76b0SSuyog Pawar     apu1_hpel_ref[0] = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx] + i4_offset;
3015*c83a76b0SSuyog Pawar     apu1_hpel_ref[1] = ps_curr_layer->ppu1_list_rec_hxfy[i1_ref_idx] + i4_offset;
3016*c83a76b0SSuyog Pawar     apu1_hpel_ref[2] = ps_curr_layer->ppu1_list_rec_fxhy[i1_ref_idx] + i4_offset;
3017*c83a76b0SSuyog Pawar     apu1_hpel_ref[3] = ps_curr_layer->ppu1_list_rec_hxhy[i1_ref_idx] + i4_offset;
3018*c83a76b0SSuyog Pawar 
3019*c83a76b0SSuyog Pawar     /* Initialize result params used for partition update */
3020*c83a76b0SSuyog Pawar     s_result_prms.pf_mv_cost_compute = NULL;
3021*c83a76b0SSuyog Pawar     s_result_prms.ps_search_results = ps_search_results;
3022*c83a76b0SSuyog Pawar     s_result_prms.pi4_valid_part_ids = pi4_valid_part_ids;
3023*c83a76b0SSuyog Pawar     s_result_prms.i1_ref_idx = search_idx;
3024*c83a76b0SSuyog Pawar     s_result_prms.i4_part_mask = i4_part_mask;
3025*c83a76b0SSuyog Pawar     s_result_prms.ps_search_node_base = ps_search_node;
3026*c83a76b0SSuyog Pawar     s_result_prms.pi4_sad_grid = &ai4_sad_grid[0][0];
3027*c83a76b0SSuyog Pawar     s_result_prms.i4_grid_mask = 1;
3028*c83a76b0SSuyog Pawar 
3029*c83a76b0SSuyog Pawar     /* convert to hpel units */
3030*c83a76b0SSuyog Pawar     i4_mv_x = ps_search_node->s_mv.i2_mvx >> 1;
3031*c83a76b0SSuyog Pawar     i4_mv_y = ps_search_node->s_mv.i2_mvy >> 1;
3032*c83a76b0SSuyog Pawar 
3033*c83a76b0SSuyog Pawar     /* for first pt, we compute at all locations in the grid, 4 + 1 centre */
3034*c83a76b0SSuyog Pawar     ps_range_prms = ps_prms->ps_mv_range_qpel;
3035*c83a76b0SSuyog Pawar     i4_grid_mask = (GRID_ALL_PTS_VALID);
3036*c83a76b0SSuyog Pawar     i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
3037*c83a76b0SSuyog Pawar 
3038*c83a76b0SSuyog Pawar     i4_min_cost = MAX_32BIT_VAL;
3039*c83a76b0SSuyog Pawar     i4_min_sad = MAX_32BIT_VAL;
3040*c83a76b0SSuyog Pawar 
3041*c83a76b0SSuyog Pawar     /*************************************************************************/
3042*c83a76b0SSuyog Pawar     /* Prepare the input params to SAD/SATD function. Note that input is     */
3043*c83a76b0SSuyog Pawar     /* passed from the calling function since it may be I (normal subpel     */
3044*c83a76b0SSuyog Pawar     /* refinement) or 2I - P0 in case of bidirect subpel refinement.         */
3045*c83a76b0SSuyog Pawar     /* Both cases are handled here.                                          */
3046*c83a76b0SSuyog Pawar     /*************************************************************************/
3047*c83a76b0SSuyog Pawar     s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
3048*c83a76b0SSuyog Pawar     s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
3049*c83a76b0SSuyog Pawar     s_err_prms.i4_ref_stride = i4_ref_stride;
3050*c83a76b0SSuyog Pawar     s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
3051*c83a76b0SSuyog Pawar     s_err_prms.i4_grid_mask = 1;
3052*c83a76b0SSuyog Pawar     s_err_prms.pi4_sad_grid = &ai4_sad_grid[0][0];
3053*c83a76b0SSuyog Pawar     s_err_prms.i4_blk_wd = i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
3054*c83a76b0SSuyog Pawar     s_err_prms.i4_blk_ht = i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
3055*c83a76b0SSuyog Pawar 
3056*c83a76b0SSuyog Pawar     /* TODO: Currently doubling lambda for Hadamard Sad instead of 1.9*sadlambda */
3057*c83a76b0SSuyog Pawar     //ps_pred_ctxt->lambda <<= 1;
3058*c83a76b0SSuyog Pawar     part_id = ps_search_node->u1_part_id;
3059*c83a76b0SSuyog Pawar     for(i4_i = 0; i4_i < i4_num_hpel_refine; i4_i++)
3060*c83a76b0SSuyog Pawar     {
3061*c83a76b0SSuyog Pawar         e_min_id = PT_C;
3062*c83a76b0SSuyog Pawar 
3063*c83a76b0SSuyog Pawar         mvx_qpel = i4_mv_x << 1;
3064*c83a76b0SSuyog Pawar         mvy_qpel = i4_mv_y << 1;
3065*c83a76b0SSuyog Pawar 
3066*c83a76b0SSuyog Pawar         /* Central pt */
3067*c83a76b0SSuyog Pawar         if(i4_grid_mask & BIT_EN(PT_C))
3068*c83a76b0SSuyog Pawar         {
3069*c83a76b0SSuyog Pawar             //ps_search_node->i2_mv_x = (S16)i4_mv_x;
3070*c83a76b0SSuyog Pawar             //ps_search_node->i2_mv_x = (S16)i4_mv_y;
3071*c83a76b0SSuyog Pawar             /* central pt is i4_mv_x, i4_mv_y */
3072*c83a76b0SSuyog Pawar             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3073*c83a76b0SSuyog Pawar                 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel, check_for_duplicate);
3074*c83a76b0SSuyog Pawar 
3075*c83a76b0SSuyog Pawar             i4_frac_x = i4_mv_x & 1;
3076*c83a76b0SSuyog Pawar             i4_frac_y = i4_mv_y & 1;
3077*c83a76b0SSuyog Pawar             pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3078*c83a76b0SSuyog Pawar             s_err_prms.pu1_ref = pu1_ref + (i4_mv_x >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3079*c83a76b0SSuyog Pawar             pf_err_compute(&s_err_prms);
3080*c83a76b0SSuyog Pawar             /* Update the mv's with the current candt motion vectors */
3081*c83a76b0SSuyog Pawar             s_result_prms.i2_mv_x = mvx_qpel;
3082*c83a76b0SSuyog Pawar             s_result_prms.i2_mv_y = mvy_qpel;
3083*c83a76b0SSuyog Pawar             hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3084*c83a76b0SSuyog Pawar             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3085*c83a76b0SSuyog Pawar             if(i4_tot_cost < i4_min_cost)
3086*c83a76b0SSuyog Pawar             {
3087*c83a76b0SSuyog Pawar                 i4_min_cost = i4_tot_cost;
3088*c83a76b0SSuyog Pawar                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3089*c83a76b0SSuyog Pawar                 e_min_id = PT_C;
3090*c83a76b0SSuyog Pawar                 pu1_final_out = s_err_prms.pu1_ref;
3091*c83a76b0SSuyog Pawar             }
3092*c83a76b0SSuyog Pawar         }
3093*c83a76b0SSuyog Pawar 
3094*c83a76b0SSuyog Pawar         /* left pt */
3095*c83a76b0SSuyog Pawar         if(i4_grid_mask & BIT_EN(PT_L))
3096*c83a76b0SSuyog Pawar         {
3097*c83a76b0SSuyog Pawar             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3098*c83a76b0SSuyog Pawar                 ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel, check_for_duplicate);
3099*c83a76b0SSuyog Pawar 
3100*c83a76b0SSuyog Pawar             if(!check_for_duplicate)
3101*c83a76b0SSuyog Pawar             {
3102*c83a76b0SSuyog Pawar                 /* search node mv is stored in qpel units */
3103*c83a76b0SSuyog Pawar                 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x - 1) << 1);
3104*c83a76b0SSuyog Pawar                 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3105*c83a76b0SSuyog Pawar                 /* left pt is i4_mv_x - 1, i4_mv_y */
3106*c83a76b0SSuyog Pawar                 i4_frac_x = (i4_mv_x - 1) & 1;  // same as (x-1)&1
3107*c83a76b0SSuyog Pawar                 i4_frac_y = i4_mv_y & 1;
3108*c83a76b0SSuyog Pawar                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3109*c83a76b0SSuyog Pawar                 s_err_prms.pu1_ref =
3110*c83a76b0SSuyog Pawar                     pu1_ref + ((i4_mv_x - 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3111*c83a76b0SSuyog Pawar 
3112*c83a76b0SSuyog Pawar                 pf_err_compute(&s_err_prms);
3113*c83a76b0SSuyog Pawar                 /* Update the mv's with the current candt motion vectors */
3114*c83a76b0SSuyog Pawar                 s_result_prms.i2_mv_x = mvx_qpel;
3115*c83a76b0SSuyog Pawar                 s_result_prms.i2_mv_y = mvy_qpel;
3116*c83a76b0SSuyog Pawar                 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3117*c83a76b0SSuyog Pawar 
3118*c83a76b0SSuyog Pawar                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3119*c83a76b0SSuyog Pawar 
3120*c83a76b0SSuyog Pawar                 if(i4_tot_cost < i4_min_cost)
3121*c83a76b0SSuyog Pawar                 {
3122*c83a76b0SSuyog Pawar                     i4_min_cost = i4_tot_cost;
3123*c83a76b0SSuyog Pawar                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3124*c83a76b0SSuyog Pawar                     e_min_id = PT_L;
3125*c83a76b0SSuyog Pawar                     pu1_final_out = s_err_prms.pu1_ref;
3126*c83a76b0SSuyog Pawar                 }
3127*c83a76b0SSuyog Pawar             }
3128*c83a76b0SSuyog Pawar         }
3129*c83a76b0SSuyog Pawar         /* top pt */
3130*c83a76b0SSuyog Pawar         if(i4_grid_mask & BIT_EN(PT_T))
3131*c83a76b0SSuyog Pawar         {
3132*c83a76b0SSuyog Pawar             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3133*c83a76b0SSuyog Pawar                 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel - 2, check_for_duplicate);
3134*c83a76b0SSuyog Pawar 
3135*c83a76b0SSuyog Pawar             if(!check_for_duplicate)
3136*c83a76b0SSuyog Pawar             {
3137*c83a76b0SSuyog Pawar                 /* search node mv is stored in qpel units */
3138*c83a76b0SSuyog Pawar                 ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
3139*c83a76b0SSuyog Pawar                 ps_search_node->s_mv.i2_mvy = (S16)((i4_mv_y - 1) << 1);
3140*c83a76b0SSuyog Pawar                 /* top pt is i4_mv_x, i4_mv_y - 1 */
3141*c83a76b0SSuyog Pawar                 i4_frac_x = i4_mv_x & 1;
3142*c83a76b0SSuyog Pawar                 i4_frac_y = (i4_mv_y - 1) & 1;
3143*c83a76b0SSuyog Pawar                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3144*c83a76b0SSuyog Pawar                 s_err_prms.pu1_ref =
3145*c83a76b0SSuyog Pawar                     pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y - 1) >> 1) * i4_ref_stride);
3146*c83a76b0SSuyog Pawar                 pf_err_compute(&s_err_prms);
3147*c83a76b0SSuyog Pawar                 /* Update the mv's with the current candt motion vectors */
3148*c83a76b0SSuyog Pawar                 s_result_prms.i2_mv_x = mvx_qpel;
3149*c83a76b0SSuyog Pawar                 s_result_prms.i2_mv_y = mvy_qpel - 2;
3150*c83a76b0SSuyog Pawar                 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3151*c83a76b0SSuyog Pawar 
3152*c83a76b0SSuyog Pawar                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3153*c83a76b0SSuyog Pawar 
3154*c83a76b0SSuyog Pawar                 if(i4_tot_cost < i4_min_cost)
3155*c83a76b0SSuyog Pawar                 {
3156*c83a76b0SSuyog Pawar                     i4_min_cost = i4_tot_cost;
3157*c83a76b0SSuyog Pawar                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3158*c83a76b0SSuyog Pawar                     e_min_id = PT_T;
3159*c83a76b0SSuyog Pawar                     pu1_final_out = s_err_prms.pu1_ref;
3160*c83a76b0SSuyog Pawar                 }
3161*c83a76b0SSuyog Pawar             }
3162*c83a76b0SSuyog Pawar         }
3163*c83a76b0SSuyog Pawar         /* right pt */
3164*c83a76b0SSuyog Pawar         if(i4_grid_mask & BIT_EN(PT_R))
3165*c83a76b0SSuyog Pawar         {
3166*c83a76b0SSuyog Pawar             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3167*c83a76b0SSuyog Pawar                 ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel, check_for_duplicate);
3168*c83a76b0SSuyog Pawar 
3169*c83a76b0SSuyog Pawar             if(!check_for_duplicate)
3170*c83a76b0SSuyog Pawar             {
3171*c83a76b0SSuyog Pawar                 /* search node mv is stored in qpel units */
3172*c83a76b0SSuyog Pawar                 ps_search_node->s_mv.i2_mvx = (S16)((i4_mv_x + 1) << 1);
3173*c83a76b0SSuyog Pawar                 ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3174*c83a76b0SSuyog Pawar                 /* right pt is i4_mv_x + 1, i4_mv_y */
3175*c83a76b0SSuyog Pawar                 i4_frac_x = (i4_mv_x + 1) & 1;
3176*c83a76b0SSuyog Pawar                 i4_frac_y = i4_mv_y & 1;
3177*c83a76b0SSuyog Pawar 
3178*c83a76b0SSuyog Pawar                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3179*c83a76b0SSuyog Pawar                 s_err_prms.pu1_ref =
3180*c83a76b0SSuyog Pawar                     pu1_ref + ((i4_mv_x + 1) >> 1) + ((i4_mv_y >> 1) * i4_ref_stride);
3181*c83a76b0SSuyog Pawar                 pf_err_compute(&s_err_prms);
3182*c83a76b0SSuyog Pawar                 /* Update the mv's with the current candt motion vectors */
3183*c83a76b0SSuyog Pawar                 s_result_prms.i2_mv_x = mvx_qpel + 2;
3184*c83a76b0SSuyog Pawar                 s_result_prms.i2_mv_y = mvy_qpel;
3185*c83a76b0SSuyog Pawar                 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3186*c83a76b0SSuyog Pawar 
3187*c83a76b0SSuyog Pawar                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3188*c83a76b0SSuyog Pawar 
3189*c83a76b0SSuyog Pawar                 if(i4_tot_cost < i4_min_cost)
3190*c83a76b0SSuyog Pawar                 {
3191*c83a76b0SSuyog Pawar                     i4_min_cost = i4_tot_cost;
3192*c83a76b0SSuyog Pawar                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3193*c83a76b0SSuyog Pawar                     e_min_id = PT_R;
3194*c83a76b0SSuyog Pawar                     pu1_final_out = s_err_prms.pu1_ref;
3195*c83a76b0SSuyog Pawar                 }
3196*c83a76b0SSuyog Pawar             }
3197*c83a76b0SSuyog Pawar         }
3198*c83a76b0SSuyog Pawar         /* bottom pt */
3199*c83a76b0SSuyog Pawar         if(i4_grid_mask & BIT_EN(PT_B))
3200*c83a76b0SSuyog Pawar         {
3201*c83a76b0SSuyog Pawar             CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3202*c83a76b0SSuyog Pawar                 ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 2, check_for_duplicate);
3203*c83a76b0SSuyog Pawar 
3204*c83a76b0SSuyog Pawar             if(!check_for_duplicate)
3205*c83a76b0SSuyog Pawar             {
3206*c83a76b0SSuyog Pawar                 /* search node mv is stored in qpel units */
3207*c83a76b0SSuyog Pawar                 ps_search_node->s_mv.i2_mvx = ((S16)i4_mv_x << 1);
3208*c83a76b0SSuyog Pawar                 ps_search_node->s_mv.i2_mvy = ((S16)(i4_mv_y + 1) << 1);
3209*c83a76b0SSuyog Pawar                 i4_frac_x = i4_mv_x & 1;
3210*c83a76b0SSuyog Pawar                 i4_frac_y = (i4_mv_y + 1) & 1;
3211*c83a76b0SSuyog Pawar                 pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3212*c83a76b0SSuyog Pawar                 s_err_prms.pu1_ref =
3213*c83a76b0SSuyog Pawar                     pu1_ref + (i4_mv_x >> 1) + (((i4_mv_y + 1) >> 1) * i4_ref_stride);
3214*c83a76b0SSuyog Pawar 
3215*c83a76b0SSuyog Pawar                 pf_err_compute(&s_err_prms);
3216*c83a76b0SSuyog Pawar                 /* Update the mv's with the current candt motion vectors */
3217*c83a76b0SSuyog Pawar                 s_result_prms.i2_mv_x = mvx_qpel;
3218*c83a76b0SSuyog Pawar                 s_result_prms.i2_mv_y = mvy_qpel + 2;
3219*c83a76b0SSuyog Pawar                 hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3220*c83a76b0SSuyog Pawar 
3221*c83a76b0SSuyog Pawar                 i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3222*c83a76b0SSuyog Pawar 
3223*c83a76b0SSuyog Pawar                 if(i4_tot_cost < i4_min_cost)
3224*c83a76b0SSuyog Pawar                 {
3225*c83a76b0SSuyog Pawar                     i4_min_cost = i4_tot_cost;
3226*c83a76b0SSuyog Pawar                     i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3227*c83a76b0SSuyog Pawar                     e_min_id = PT_B;
3228*c83a76b0SSuyog Pawar                     pu1_final_out = s_err_prms.pu1_ref;
3229*c83a76b0SSuyog Pawar                 }
3230*c83a76b0SSuyog Pawar             }
3231*c83a76b0SSuyog Pawar         }
3232*c83a76b0SSuyog Pawar         if(e_min_id == PT_C)
3233*c83a76b0SSuyog Pawar         {
3234*c83a76b0SSuyog Pawar             if(!i4_i)
3235*c83a76b0SSuyog Pawar             {
3236*c83a76b0SSuyog Pawar                 /* TL pt */
3237*c83a76b0SSuyog Pawar                 if(i4_grid_mask & BIT_EN(PT_TL))
3238*c83a76b0SSuyog Pawar                 {
3239*c83a76b0SSuyog Pawar                     S32 mvx_minus_1 = (i4_mv_x - 1);
3240*c83a76b0SSuyog Pawar                     S32 mvy_minus_1 = (i4_mv_y - 1);
3241*c83a76b0SSuyog Pawar 
3242*c83a76b0SSuyog Pawar                     CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3243*c83a76b0SSuyog Pawar                         ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel - 2, check_for_duplicate);
3244*c83a76b0SSuyog Pawar 
3245*c83a76b0SSuyog Pawar                     if(!check_for_duplicate)
3246*c83a76b0SSuyog Pawar                     {
3247*c83a76b0SSuyog Pawar                         /* search node mv is stored in qpel units */
3248*c83a76b0SSuyog Pawar                         ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1);
3249*c83a76b0SSuyog Pawar                         ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1);
3250*c83a76b0SSuyog Pawar                         i4_frac_x = mvx_minus_1 & 1;
3251*c83a76b0SSuyog Pawar                         i4_frac_y = mvy_minus_1 & 1;
3252*c83a76b0SSuyog Pawar                         pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3253*c83a76b0SSuyog Pawar                         s_err_prms.pu1_ref =
3254*c83a76b0SSuyog Pawar                             pu1_ref + (mvx_minus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride);
3255*c83a76b0SSuyog Pawar 
3256*c83a76b0SSuyog Pawar                         pf_err_compute(&s_err_prms);
3257*c83a76b0SSuyog Pawar                         /* Update the mv's with the current candt motion vectors */
3258*c83a76b0SSuyog Pawar                         s_result_prms.i2_mv_x = mvx_qpel - 2;
3259*c83a76b0SSuyog Pawar                         s_result_prms.i2_mv_y = mvy_qpel - 2;
3260*c83a76b0SSuyog Pawar                         hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3261*c83a76b0SSuyog Pawar 
3262*c83a76b0SSuyog Pawar                         i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3263*c83a76b0SSuyog Pawar 
3264*c83a76b0SSuyog Pawar                         if(i4_tot_cost < i4_min_cost)
3265*c83a76b0SSuyog Pawar                         {
3266*c83a76b0SSuyog Pawar                             i4_min_cost = i4_tot_cost;
3267*c83a76b0SSuyog Pawar                             i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3268*c83a76b0SSuyog Pawar                             e_min_id = PT_TL;
3269*c83a76b0SSuyog Pawar                             pu1_final_out = s_err_prms.pu1_ref;
3270*c83a76b0SSuyog Pawar                         }
3271*c83a76b0SSuyog Pawar                     }
3272*c83a76b0SSuyog Pawar                 }
3273*c83a76b0SSuyog Pawar                 /* TR pt */
3274*c83a76b0SSuyog Pawar                 if(i4_grid_mask & BIT_EN(PT_TR))
3275*c83a76b0SSuyog Pawar                 {
3276*c83a76b0SSuyog Pawar                     S32 mvx_plus_1 = (i4_mv_x + 1);
3277*c83a76b0SSuyog Pawar                     S32 mvy_minus_1 = (i4_mv_y - 1);
3278*c83a76b0SSuyog Pawar 
3279*c83a76b0SSuyog Pawar                     CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3280*c83a76b0SSuyog Pawar                         ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel - 2, check_for_duplicate);
3281*c83a76b0SSuyog Pawar 
3282*c83a76b0SSuyog Pawar                     if(!check_for_duplicate)
3283*c83a76b0SSuyog Pawar                     {
3284*c83a76b0SSuyog Pawar                         /* search node mv is stored in qpel units */
3285*c83a76b0SSuyog Pawar                         ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1);
3286*c83a76b0SSuyog Pawar                         ps_search_node->s_mv.i2_mvy = ((S16)mvy_minus_1 << 1);
3287*c83a76b0SSuyog Pawar                         i4_frac_x = mvx_plus_1 & 1;
3288*c83a76b0SSuyog Pawar                         i4_frac_y = mvy_minus_1 & 1;
3289*c83a76b0SSuyog Pawar                         pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3290*c83a76b0SSuyog Pawar                         s_err_prms.pu1_ref =
3291*c83a76b0SSuyog Pawar                             pu1_ref + (mvx_plus_1 >> 1) + ((mvy_minus_1 >> 1) * i4_ref_stride);
3292*c83a76b0SSuyog Pawar 
3293*c83a76b0SSuyog Pawar                         pf_err_compute(&s_err_prms);
3294*c83a76b0SSuyog Pawar                         /* Update the mv's with the current candt motion vectors */
3295*c83a76b0SSuyog Pawar                         s_result_prms.i2_mv_x = mvx_qpel + 2;
3296*c83a76b0SSuyog Pawar                         s_result_prms.i2_mv_y = mvy_qpel - 2;
3297*c83a76b0SSuyog Pawar                         hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3298*c83a76b0SSuyog Pawar 
3299*c83a76b0SSuyog Pawar                         i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3300*c83a76b0SSuyog Pawar 
3301*c83a76b0SSuyog Pawar                         if(i4_tot_cost < i4_min_cost)
3302*c83a76b0SSuyog Pawar                         {
3303*c83a76b0SSuyog Pawar                             i4_min_cost = i4_tot_cost;
3304*c83a76b0SSuyog Pawar                             i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3305*c83a76b0SSuyog Pawar                             e_min_id = PT_TR;
3306*c83a76b0SSuyog Pawar                             pu1_final_out = s_err_prms.pu1_ref;
3307*c83a76b0SSuyog Pawar                         }
3308*c83a76b0SSuyog Pawar                     }
3309*c83a76b0SSuyog Pawar                 }
3310*c83a76b0SSuyog Pawar                 /* BL pt */
3311*c83a76b0SSuyog Pawar                 if(i4_grid_mask & BIT_EN(PT_BL))
3312*c83a76b0SSuyog Pawar                 {
3313*c83a76b0SSuyog Pawar                     S32 mvx_minus_1 = (i4_mv_x - 1);
3314*c83a76b0SSuyog Pawar                     S32 mvy_plus_1 = (i4_mv_y + 1);
3315*c83a76b0SSuyog Pawar 
3316*c83a76b0SSuyog Pawar                     CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3317*c83a76b0SSuyog Pawar                         ps_dedup_enabler, 1, mvx_qpel - 2, mvy_qpel + 2, check_for_duplicate);
3318*c83a76b0SSuyog Pawar 
3319*c83a76b0SSuyog Pawar                     if(!check_for_duplicate)
3320*c83a76b0SSuyog Pawar                     {
3321*c83a76b0SSuyog Pawar                         /* search node mv is stored in qpel units */
3322*c83a76b0SSuyog Pawar                         ps_search_node->s_mv.i2_mvx = ((S16)mvx_minus_1 << 1);
3323*c83a76b0SSuyog Pawar                         ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1);
3324*c83a76b0SSuyog Pawar                         i4_frac_x = mvx_minus_1 & 1;
3325*c83a76b0SSuyog Pawar                         i4_frac_y = mvy_plus_1 & 1;
3326*c83a76b0SSuyog Pawar                         pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3327*c83a76b0SSuyog Pawar                         s_err_prms.pu1_ref =
3328*c83a76b0SSuyog Pawar                             pu1_ref + (mvx_minus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride);
3329*c83a76b0SSuyog Pawar 
3330*c83a76b0SSuyog Pawar                         pf_err_compute(&s_err_prms);
3331*c83a76b0SSuyog Pawar                         /* Update the mv's with the current candt motion vectors */
3332*c83a76b0SSuyog Pawar                         s_result_prms.i2_mv_x = mvx_qpel - 2;
3333*c83a76b0SSuyog Pawar                         s_result_prms.i2_mv_y = mvy_qpel + 2;
3334*c83a76b0SSuyog Pawar                         hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3335*c83a76b0SSuyog Pawar 
3336*c83a76b0SSuyog Pawar                         i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3337*c83a76b0SSuyog Pawar 
3338*c83a76b0SSuyog Pawar                         if(i4_tot_cost < i4_min_cost)
3339*c83a76b0SSuyog Pawar                         {
3340*c83a76b0SSuyog Pawar                             i4_min_cost = i4_tot_cost;
3341*c83a76b0SSuyog Pawar                             i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3342*c83a76b0SSuyog Pawar                             e_min_id = PT_BL;
3343*c83a76b0SSuyog Pawar                             pu1_final_out = s_err_prms.pu1_ref;
3344*c83a76b0SSuyog Pawar                         }
3345*c83a76b0SSuyog Pawar                     }
3346*c83a76b0SSuyog Pawar                 }
3347*c83a76b0SSuyog Pawar                 /* BR pt */
3348*c83a76b0SSuyog Pawar                 if(i4_grid_mask & BIT_EN(PT_BR))
3349*c83a76b0SSuyog Pawar                 {
3350*c83a76b0SSuyog Pawar                     S32 mvx_plus_1 = (i4_mv_x + 1);
3351*c83a76b0SSuyog Pawar                     S32 mvy_plus_1 = (i4_mv_y + 1);
3352*c83a76b0SSuyog Pawar                     CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3353*c83a76b0SSuyog Pawar                         ps_dedup_enabler, 1, mvx_qpel + 2, mvy_qpel + 2, check_for_duplicate);
3354*c83a76b0SSuyog Pawar 
3355*c83a76b0SSuyog Pawar                     if(!check_for_duplicate)
3356*c83a76b0SSuyog Pawar                     {
3357*c83a76b0SSuyog Pawar                         /* search node mv is stored in qpel units */
3358*c83a76b0SSuyog Pawar                         ps_search_node->s_mv.i2_mvx = ((S16)mvx_plus_1 << 1);
3359*c83a76b0SSuyog Pawar                         ps_search_node->s_mv.i2_mvy = ((S16)mvy_plus_1 << 1);
3360*c83a76b0SSuyog Pawar                         i4_frac_x = mvx_plus_1 & 1;
3361*c83a76b0SSuyog Pawar                         i4_frac_y = mvy_plus_1 & 1;
3362*c83a76b0SSuyog Pawar                         pu1_ref = apu1_hpel_ref[i4_frac_y * 2 + i4_frac_x];
3363*c83a76b0SSuyog Pawar                         s_err_prms.pu1_ref =
3364*c83a76b0SSuyog Pawar                             pu1_ref + (mvx_plus_1 >> 1) + ((mvy_plus_1 >> 1) * i4_ref_stride);
3365*c83a76b0SSuyog Pawar 
3366*c83a76b0SSuyog Pawar                         pf_err_compute(&s_err_prms);
3367*c83a76b0SSuyog Pawar                         /* Update the mv's with the current candt motion vectors */
3368*c83a76b0SSuyog Pawar                         s_result_prms.i2_mv_x = mvx_qpel + 2;
3369*c83a76b0SSuyog Pawar                         s_result_prms.i2_mv_y = mvy_qpel + 2;
3370*c83a76b0SSuyog Pawar                         hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3371*c83a76b0SSuyog Pawar 
3372*c83a76b0SSuyog Pawar                         i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3373*c83a76b0SSuyog Pawar 
3374*c83a76b0SSuyog Pawar                         if(i4_tot_cost < i4_min_cost)
3375*c83a76b0SSuyog Pawar                         {
3376*c83a76b0SSuyog Pawar                             i4_min_cost = i4_tot_cost;
3377*c83a76b0SSuyog Pawar                             i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3378*c83a76b0SSuyog Pawar                             e_min_id = PT_BR;
3379*c83a76b0SSuyog Pawar                             pu1_final_out = s_err_prms.pu1_ref;
3380*c83a76b0SSuyog Pawar                         }
3381*c83a76b0SSuyog Pawar                     }
3382*c83a76b0SSuyog Pawar                 }
3383*c83a76b0SSuyog Pawar                 if(e_min_id == PT_C)
3384*c83a76b0SSuyog Pawar                 {
3385*c83a76b0SSuyog Pawar                     break;
3386*c83a76b0SSuyog Pawar                 }
3387*c83a76b0SSuyog Pawar             }
3388*c83a76b0SSuyog Pawar             else
3389*c83a76b0SSuyog Pawar             {
3390*c83a76b0SSuyog Pawar                 break;
3391*c83a76b0SSuyog Pawar             }
3392*c83a76b0SSuyog Pawar         }
3393*c83a76b0SSuyog Pawar 
3394*c83a76b0SSuyog Pawar         /*********************************************************************/
3395*c83a76b0SSuyog Pawar         /* Depending on the best result location, we may be able to skip     */
3396*c83a76b0SSuyog Pawar         /* at least two pts, centre pt and one more pt. E.g. if right pt is  */
3397*c83a76b0SSuyog Pawar         /* the best result, the next iteration need not do centre, left pts  */
3398*c83a76b0SSuyog Pawar         /*********************************************************************/
3399*c83a76b0SSuyog Pawar         if(i4_i)
3400*c83a76b0SSuyog Pawar         {
3401*c83a76b0SSuyog Pawar             i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
3402*c83a76b0SSuyog Pawar         }
3403*c83a76b0SSuyog Pawar         else
3404*c83a76b0SSuyog Pawar         {
3405*c83a76b0SSuyog Pawar             i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id];
3406*c83a76b0SSuyog Pawar         }
3407*c83a76b0SSuyog Pawar         i4_mv_x += gai1_grid_id_to_x[e_min_id];
3408*c83a76b0SSuyog Pawar         i4_mv_y += gai1_grid_id_to_y[e_min_id];
3409*c83a76b0SSuyog Pawar         ps_search_node->s_mv.i2_mvx = (S16)(i4_mv_x << 1);
3410*c83a76b0SSuyog Pawar         ps_search_node->s_mv.i2_mvy = (S16)(i4_mv_y << 1);
3411*c83a76b0SSuyog Pawar         i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 2, ps_range_prms);
3412*c83a76b0SSuyog Pawar     }
3413*c83a76b0SSuyog Pawar 
3414*c83a76b0SSuyog Pawar     /* Convert to QPEL units */
3415*c83a76b0SSuyog Pawar     i4_mv_x <<= 1;
3416*c83a76b0SSuyog Pawar     i4_mv_y <<= 1;
3417*c83a76b0SSuyog Pawar 
3418*c83a76b0SSuyog Pawar     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3419*c83a76b0SSuyog Pawar     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3420*c83a76b0SSuyog Pawar 
3421*c83a76b0SSuyog Pawar     /* Early exit if this partition is visiting same hpel mv again */
3422*c83a76b0SSuyog Pawar     /* Assumption : Checking for early exit in best result of partition */
3423*c83a76b0SSuyog Pawar     if((ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x ==
3424*c83a76b0SSuyog Pawar         ps_search_node->s_mv.i2_mvx) &&
3425*c83a76b0SSuyog Pawar        (ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y ==
3426*c83a76b0SSuyog Pawar         ps_search_node->s_mv.i2_mvy))
3427*c83a76b0SSuyog Pawar     {
3428*c83a76b0SSuyog Pawar         return (ps_search_results->aps_part_results[search_idx][part_id][0].i4_tot_cost);
3429*c83a76b0SSuyog Pawar     }
3430*c83a76b0SSuyog Pawar     else
3431*c83a76b0SSuyog Pawar     {
3432*c83a76b0SSuyog Pawar         /* Store the best hpel mv for future early exit checks */
3433*c83a76b0SSuyog Pawar         ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_x =
3434*c83a76b0SSuyog Pawar             (S16)i4_mv_x;
3435*c83a76b0SSuyog Pawar         ps_search_results->aps_part_results[search_idx][part_id][0].i2_best_hpel_mv_y =
3436*c83a76b0SSuyog Pawar             (S16)i4_mv_y;
3437*c83a76b0SSuyog Pawar     }
3438*c83a76b0SSuyog Pawar 
3439*c83a76b0SSuyog Pawar     /* Early exit if this partition is visiting same hpel mv again */
3440*c83a76b0SSuyog Pawar     /* Assumption : Checking for early exit in second best result of partition */
3441*c83a76b0SSuyog Pawar     if((ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x ==
3442*c83a76b0SSuyog Pawar         ps_search_node->s_mv.i2_mvx) &&
3443*c83a76b0SSuyog Pawar        (ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y ==
3444*c83a76b0SSuyog Pawar         ps_search_node->s_mv.i2_mvy))
3445*c83a76b0SSuyog Pawar     {
3446*c83a76b0SSuyog Pawar         return (ps_search_results->aps_part_results[search_idx][part_id][1].i4_tot_cost);
3447*c83a76b0SSuyog Pawar     }
3448*c83a76b0SSuyog Pawar     else
3449*c83a76b0SSuyog Pawar     {
3450*c83a76b0SSuyog Pawar         /* Store the best hpel mv for future early exit checks */
3451*c83a76b0SSuyog Pawar         ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_x =
3452*c83a76b0SSuyog Pawar             (S16)i4_mv_x;
3453*c83a76b0SSuyog Pawar         ps_search_results->aps_part_results[search_idx][part_id][1].i2_best_hpel_mv_y =
3454*c83a76b0SSuyog Pawar             (S16)i4_mv_y;
3455*c83a76b0SSuyog Pawar     }
3456*c83a76b0SSuyog Pawar 
3457*c83a76b0SSuyog Pawar     /* Exact interpolation or averaging chosen here */
3458*c83a76b0SSuyog Pawar     pf_qpel_interp = ps_prms->pf_qpel_interp;
3459*c83a76b0SSuyog Pawar 
3460*c83a76b0SSuyog Pawar     /* Next QPEL ME */
3461*c83a76b0SSuyog Pawar     /* In this case, we have option of doing exact QPEL interpolation or avg */
3462*c83a76b0SSuyog Pawar     /*************************************************************************/
3463*c83a76b0SSuyog Pawar     /*        x                                                              */
3464*c83a76b0SSuyog Pawar     /*    A b C d                                                            */
3465*c83a76b0SSuyog Pawar     /*    e f g h                                                            */
3466*c83a76b0SSuyog Pawar     /*    I j K l                                                            */
3467*c83a76b0SSuyog Pawar     /*    m n o p                                                            */
3468*c83a76b0SSuyog Pawar     /*    Q r S t                                                            */
3469*c83a76b0SSuyog Pawar     /*                                                                       */
3470*c83a76b0SSuyog Pawar     /*    Approximate QPEL logic                                             */
3471*c83a76b0SSuyog Pawar     /*    b = avg(A,C) f = avg(I,C), g= avg(C,K) j=avg(I,K)                  */
3472*c83a76b0SSuyog Pawar     /*    for any given pt, we can get all the information required about    */
3473*c83a76b0SSuyog Pawar     /*    the surrounding 4 pts. For example, given point C (0.5, 0)         */
3474*c83a76b0SSuyog Pawar     /*     surrounding pts info:                                             */
3475*c83a76b0SSuyog Pawar     /*     b : qpel offset: 1, 0, generated by averaging. buffer1: fpel buf  */
3476*c83a76b0SSuyog Pawar     /*           buffer 2: hxfy, offsets for both are 0, 0                   */
3477*c83a76b0SSuyog Pawar     /*    similarly for other pts the info can be gotten                     */
3478*c83a76b0SSuyog Pawar     /*************************************************************************/
3479*c83a76b0SSuyog Pawar     i4_grid_mask = GRID_ALL_PTS_VALID ^ (BIT_EN(PT_C));
3480*c83a76b0SSuyog Pawar     i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
3481*c83a76b0SSuyog Pawar 
3482*c83a76b0SSuyog Pawar     /*************************************************************************/
3483*c83a76b0SSuyog Pawar     /* One time preparation of non changing interpolation params. These      */
3484*c83a76b0SSuyog Pawar     /* include a set of ping pong result buf ptrs, input buf ptrs and some   */
3485*c83a76b0SSuyog Pawar     /* working memory (not used though in case of averaging).                */
3486*c83a76b0SSuyog Pawar     /*************************************************************************/
3487*c83a76b0SSuyog Pawar     s_interp_prms.ppu1_ref = &apu1_hpel_ref[0];
3488*c83a76b0SSuyog Pawar     s_interp_prms.i4_ref_stride = i4_ref_stride;
3489*c83a76b0SSuyog Pawar     s_interp_prms.i4_blk_wd = i4_blk_wd;
3490*c83a76b0SSuyog Pawar     s_interp_prms.i4_blk_ht = i4_blk_ht;
3491*c83a76b0SSuyog Pawar 
3492*c83a76b0SSuyog Pawar     i4_final_out_stride = i4_ref_stride;
3493*c83a76b0SSuyog Pawar 
3494*c83a76b0SSuyog Pawar     {
3495*c83a76b0SSuyog Pawar         U08 *pu1_mem;
3496*c83a76b0SSuyog Pawar         /*********************************************************************/
3497*c83a76b0SSuyog Pawar         /* Allocation of working memory for interpolated buffers. We maintain*/
3498*c83a76b0SSuyog Pawar         /* an intermediate working buffer, and 2 ping pong interpolated out  */
3499*c83a76b0SSuyog Pawar         /* buffers, purpose of ping pong explained later below               */
3500*c83a76b0SSuyog Pawar         /*********************************************************************/
3501*c83a76b0SSuyog Pawar         pu1_mem = ps_prms->pu1_wkg_mem;
3502*c83a76b0SSuyog Pawar         s_interp_prms.pu1_wkg_mem = pu1_mem;
3503*c83a76b0SSuyog Pawar 
3504*c83a76b0SSuyog Pawar         //pu1_mem += (INTERP_INTERMED_BUF_SIZE);
3505*c83a76b0SSuyog Pawar         s_interp_prms.apu1_interp_out[0] = pu1_mem;
3506*c83a76b0SSuyog Pawar 
3507*c83a76b0SSuyog Pawar         pu1_mem += (INTERP_OUT_BUF_SIZE);
3508*c83a76b0SSuyog Pawar         s_interp_prms.apu1_interp_out[1] = pu1_mem;
3509*c83a76b0SSuyog Pawar 
3510*c83a76b0SSuyog Pawar         pu1_mem += (INTERP_OUT_BUF_SIZE);
3511*c83a76b0SSuyog Pawar         s_interp_prms.apu1_interp_out[2] = pu1_mem;
3512*c83a76b0SSuyog Pawar 
3513*c83a76b0SSuyog Pawar         pu1_mem += (INTERP_OUT_BUF_SIZE);
3514*c83a76b0SSuyog Pawar         s_interp_prms.apu1_interp_out[3] = pu1_mem;
3515*c83a76b0SSuyog Pawar 
3516*c83a76b0SSuyog Pawar         pu1_mem += (INTERP_OUT_BUF_SIZE);
3517*c83a76b0SSuyog Pawar         s_interp_prms.apu1_interp_out[4] = pu1_mem;
3518*c83a76b0SSuyog Pawar 
3519*c83a76b0SSuyog Pawar         /*********************************************************************/
3520*c83a76b0SSuyog Pawar         /* Stride of interpolated output is just a function of blk width of  */
3521*c83a76b0SSuyog Pawar         /* this partition and hence remains constant for this partition      */
3522*c83a76b0SSuyog Pawar         /*********************************************************************/
3523*c83a76b0SSuyog Pawar         s_interp_prms.i4_out_stride = (i4_blk_wd);
3524*c83a76b0SSuyog Pawar     }
3525*c83a76b0SSuyog Pawar 
3526*c83a76b0SSuyog Pawar     {
3527*c83a76b0SSuyog Pawar         UWORD8 *apu1_final[4];
3528*c83a76b0SSuyog Pawar         WORD32 ai4_ref_stride[4];
3529*c83a76b0SSuyog Pawar         /*************************************************************************/
3530*c83a76b0SSuyog Pawar         /* Ping pong design for interpolated buffers. We use a min id, which     */
3531*c83a76b0SSuyog Pawar         /* tracks the id of the ppu1_interp_out that stores the best result.     */
3532*c83a76b0SSuyog Pawar         /* When new interp to be done, it uses 1 - best result id for the interp */
3533*c83a76b0SSuyog Pawar         /* min id is toggled when any new result becomes the best result.        */
3534*c83a76b0SSuyog Pawar         /*************************************************************************/
3535*c83a76b0SSuyog Pawar 
3536*c83a76b0SSuyog Pawar         for(i4_i = 0; i4_i < i4_num_qpel_refine; i4_i++)
3537*c83a76b0SSuyog Pawar         {
3538*c83a76b0SSuyog Pawar             e_min_id = PT_C;
3539*c83a76b0SSuyog Pawar 
3540*c83a76b0SSuyog Pawar             hme_qpel_interp_comprehensive(
3541*c83a76b0SSuyog Pawar                 &s_interp_prms, apu1_final, ai4_ref_stride, i4_mv_x, i4_mv_y, i4_grid_mask);
3542*c83a76b0SSuyog Pawar 
3543*c83a76b0SSuyog Pawar             mvx_qpel = i4_mv_x;
3544*c83a76b0SSuyog Pawar             mvy_qpel = i4_mv_y;
3545*c83a76b0SSuyog Pawar 
3546*c83a76b0SSuyog Pawar             if(i4_grid_mask & BIT_EN(PT_L))
3547*c83a76b0SSuyog Pawar             {
3548*c83a76b0SSuyog Pawar                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3549*c83a76b0SSuyog Pawar                     ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 0, check_for_duplicate);
3550*c83a76b0SSuyog Pawar 
3551*c83a76b0SSuyog Pawar                 if(!check_for_duplicate)
3552*c83a76b0SSuyog Pawar                 {
3553*c83a76b0SSuyog Pawar                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3554*c83a76b0SSuyog Pawar                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3555*c83a76b0SSuyog Pawar 
3556*c83a76b0SSuyog Pawar                     s_err_prms.pu1_ref = apu1_final[0];
3557*c83a76b0SSuyog Pawar                     s_err_prms.i4_ref_stride = ai4_ref_stride[0];
3558*c83a76b0SSuyog Pawar 
3559*c83a76b0SSuyog Pawar                     pf_err_compute(&s_err_prms);
3560*c83a76b0SSuyog Pawar                     /* Update the mv's with the current candt motion vectors */
3561*c83a76b0SSuyog Pawar                     s_result_prms.i2_mv_x = mvx_qpel - 1;
3562*c83a76b0SSuyog Pawar                     s_result_prms.i2_mv_y = mvy_qpel;
3563*c83a76b0SSuyog Pawar                     hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3564*c83a76b0SSuyog Pawar 
3565*c83a76b0SSuyog Pawar                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3566*c83a76b0SSuyog Pawar                     if(i4_tot_cost < i4_min_cost)
3567*c83a76b0SSuyog Pawar                     {
3568*c83a76b0SSuyog Pawar                         e_min_id = PT_L;
3569*c83a76b0SSuyog Pawar                         i4_min_cost = i4_tot_cost;
3570*c83a76b0SSuyog Pawar                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3571*c83a76b0SSuyog Pawar                     }
3572*c83a76b0SSuyog Pawar                 }
3573*c83a76b0SSuyog Pawar             }
3574*c83a76b0SSuyog Pawar             if(i4_grid_mask & BIT_EN(PT_T))
3575*c83a76b0SSuyog Pawar             {
3576*c83a76b0SSuyog Pawar                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3577*c83a76b0SSuyog Pawar                     ps_dedup_enabler, 1, mvx_qpel - 0, mvy_qpel - 1, check_for_duplicate);
3578*c83a76b0SSuyog Pawar 
3579*c83a76b0SSuyog Pawar                 if(!check_for_duplicate)
3580*c83a76b0SSuyog Pawar                 {
3581*c83a76b0SSuyog Pawar                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3582*c83a76b0SSuyog Pawar                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3583*c83a76b0SSuyog Pawar 
3584*c83a76b0SSuyog Pawar                     s_err_prms.pu1_ref = apu1_final[1];
3585*c83a76b0SSuyog Pawar                     s_err_prms.i4_ref_stride = ai4_ref_stride[1];
3586*c83a76b0SSuyog Pawar 
3587*c83a76b0SSuyog Pawar                     pf_err_compute(&s_err_prms);
3588*c83a76b0SSuyog Pawar                     /* Update the mv's with the current candt motion vectors */
3589*c83a76b0SSuyog Pawar                     s_result_prms.i2_mv_x = mvx_qpel;
3590*c83a76b0SSuyog Pawar                     s_result_prms.i2_mv_y = mvy_qpel - 1;
3591*c83a76b0SSuyog Pawar                     hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3592*c83a76b0SSuyog Pawar                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3593*c83a76b0SSuyog Pawar                     if(i4_tot_cost < i4_min_cost)
3594*c83a76b0SSuyog Pawar                     {
3595*c83a76b0SSuyog Pawar                         e_min_id = PT_T;
3596*c83a76b0SSuyog Pawar                         i4_min_cost = i4_tot_cost;
3597*c83a76b0SSuyog Pawar                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3598*c83a76b0SSuyog Pawar                     }
3599*c83a76b0SSuyog Pawar                 }
3600*c83a76b0SSuyog Pawar             }
3601*c83a76b0SSuyog Pawar             if(i4_grid_mask & BIT_EN(PT_R))
3602*c83a76b0SSuyog Pawar             {
3603*c83a76b0SSuyog Pawar                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3604*c83a76b0SSuyog Pawar                     ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel, check_for_duplicate);
3605*c83a76b0SSuyog Pawar 
3606*c83a76b0SSuyog Pawar                 if(!check_for_duplicate)
3607*c83a76b0SSuyog Pawar                 {
3608*c83a76b0SSuyog Pawar                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3609*c83a76b0SSuyog Pawar                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3610*c83a76b0SSuyog Pawar 
3611*c83a76b0SSuyog Pawar                     s_err_prms.pu1_ref = apu1_final[2];
3612*c83a76b0SSuyog Pawar                     s_err_prms.i4_ref_stride = ai4_ref_stride[2];
3613*c83a76b0SSuyog Pawar 
3614*c83a76b0SSuyog Pawar                     pf_err_compute(&s_err_prms);
3615*c83a76b0SSuyog Pawar                     /* Update the mv's with the current candt motion vectors */
3616*c83a76b0SSuyog Pawar                     s_result_prms.i2_mv_x = mvx_qpel + 1;
3617*c83a76b0SSuyog Pawar                     s_result_prms.i2_mv_y = mvy_qpel;
3618*c83a76b0SSuyog Pawar                     hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3619*c83a76b0SSuyog Pawar 
3620*c83a76b0SSuyog Pawar                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3621*c83a76b0SSuyog Pawar                     if(i4_tot_cost < i4_min_cost)
3622*c83a76b0SSuyog Pawar                     {
3623*c83a76b0SSuyog Pawar                         e_min_id = PT_R;
3624*c83a76b0SSuyog Pawar                         i4_min_cost = i4_tot_cost;
3625*c83a76b0SSuyog Pawar                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3626*c83a76b0SSuyog Pawar                     }
3627*c83a76b0SSuyog Pawar                 }
3628*c83a76b0SSuyog Pawar             }
3629*c83a76b0SSuyog Pawar             /* i4_mv_x and i4_mv_y will always be the centre pt */
3630*c83a76b0SSuyog Pawar             /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
3631*c83a76b0SSuyog Pawar             if(i4_grid_mask & BIT_EN(PT_B))
3632*c83a76b0SSuyog Pawar             {
3633*c83a76b0SSuyog Pawar                 CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3634*c83a76b0SSuyog Pawar                     ps_dedup_enabler, 1, mvx_qpel, mvy_qpel + 1, check_for_duplicate);
3635*c83a76b0SSuyog Pawar 
3636*c83a76b0SSuyog Pawar                 if(!check_for_duplicate)
3637*c83a76b0SSuyog Pawar                 {
3638*c83a76b0SSuyog Pawar                     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3639*c83a76b0SSuyog Pawar                     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3640*c83a76b0SSuyog Pawar 
3641*c83a76b0SSuyog Pawar                     s_err_prms.pu1_ref = apu1_final[3];
3642*c83a76b0SSuyog Pawar                     s_err_prms.i4_ref_stride = ai4_ref_stride[3];
3643*c83a76b0SSuyog Pawar 
3644*c83a76b0SSuyog Pawar                     pf_err_compute(&s_err_prms);
3645*c83a76b0SSuyog Pawar                     /* Update the mv's with the current candt motion vectors */
3646*c83a76b0SSuyog Pawar                     s_result_prms.i2_mv_x = mvx_qpel;
3647*c83a76b0SSuyog Pawar                     s_result_prms.i2_mv_y = mvy_qpel + 1;
3648*c83a76b0SSuyog Pawar                     hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3649*c83a76b0SSuyog Pawar                     i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3650*c83a76b0SSuyog Pawar                     if(i4_tot_cost < i4_min_cost)
3651*c83a76b0SSuyog Pawar                     {
3652*c83a76b0SSuyog Pawar                         e_min_id = PT_B;
3653*c83a76b0SSuyog Pawar                         i4_min_cost = i4_tot_cost;
3654*c83a76b0SSuyog Pawar                         i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3655*c83a76b0SSuyog Pawar                     }
3656*c83a76b0SSuyog Pawar                 }
3657*c83a76b0SSuyog Pawar             }
3658*c83a76b0SSuyog Pawar 
3659*c83a76b0SSuyog Pawar             if(e_min_id == PT_C)
3660*c83a76b0SSuyog Pawar             {
3661*c83a76b0SSuyog Pawar                 if(!i4_i)
3662*c83a76b0SSuyog Pawar                 {
3663*c83a76b0SSuyog Pawar                     S32 i4_interp_buf_id = 0;
3664*c83a76b0SSuyog Pawar 
3665*c83a76b0SSuyog Pawar                     if(i4_grid_mask & BIT_EN(PT_TL))
3666*c83a76b0SSuyog Pawar                     {
3667*c83a76b0SSuyog Pawar                         CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3668*c83a76b0SSuyog Pawar                             ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel - 1, check_for_duplicate);
3669*c83a76b0SSuyog Pawar 
3670*c83a76b0SSuyog Pawar                         if(!check_for_duplicate)
3671*c83a76b0SSuyog Pawar                         {
3672*c83a76b0SSuyog Pawar                             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3673*c83a76b0SSuyog Pawar                             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3674*c83a76b0SSuyog Pawar 
3675*c83a76b0SSuyog Pawar                             /* Carry out the interpolation */
3676*c83a76b0SSuyog Pawar                             pf_qpel_interp(
3677*c83a76b0SSuyog Pawar                                 &s_interp_prms, i4_mv_x - 1, i4_mv_y - 1, i4_interp_buf_id);
3678*c83a76b0SSuyog Pawar 
3679*c83a76b0SSuyog Pawar                             s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3680*c83a76b0SSuyog Pawar                             s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3681*c83a76b0SSuyog Pawar 
3682*c83a76b0SSuyog Pawar                             pf_err_compute(&s_err_prms);
3683*c83a76b0SSuyog Pawar                             /* Update the mv's with the current candt motion vectors */
3684*c83a76b0SSuyog Pawar                             s_result_prms.i2_mv_x = mvx_qpel - 1;
3685*c83a76b0SSuyog Pawar                             s_result_prms.i2_mv_y = mvy_qpel - 1;
3686*c83a76b0SSuyog Pawar                             hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3687*c83a76b0SSuyog Pawar 
3688*c83a76b0SSuyog Pawar                             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3689*c83a76b0SSuyog Pawar 
3690*c83a76b0SSuyog Pawar                             if(i4_tot_cost < i4_min_cost)
3691*c83a76b0SSuyog Pawar                             {
3692*c83a76b0SSuyog Pawar                                 e_min_id = PT_TL;
3693*c83a76b0SSuyog Pawar                                 i4_min_cost = i4_tot_cost;
3694*c83a76b0SSuyog Pawar                                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3695*c83a76b0SSuyog Pawar                             }
3696*c83a76b0SSuyog Pawar                         }
3697*c83a76b0SSuyog Pawar                     }
3698*c83a76b0SSuyog Pawar                     if(i4_grid_mask & BIT_EN(PT_TR))
3699*c83a76b0SSuyog Pawar                     {
3700*c83a76b0SSuyog Pawar                         CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3701*c83a76b0SSuyog Pawar                             ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel - 1, check_for_duplicate);
3702*c83a76b0SSuyog Pawar 
3703*c83a76b0SSuyog Pawar                         if(!check_for_duplicate)
3704*c83a76b0SSuyog Pawar                         {
3705*c83a76b0SSuyog Pawar                             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3706*c83a76b0SSuyog Pawar                             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y - 1;
3707*c83a76b0SSuyog Pawar 
3708*c83a76b0SSuyog Pawar                             /* Carry out the interpolation */
3709*c83a76b0SSuyog Pawar                             pf_qpel_interp(
3710*c83a76b0SSuyog Pawar                                 &s_interp_prms, i4_mv_x + 1, i4_mv_y - 1, i4_interp_buf_id);
3711*c83a76b0SSuyog Pawar 
3712*c83a76b0SSuyog Pawar                             s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3713*c83a76b0SSuyog Pawar                             s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3714*c83a76b0SSuyog Pawar 
3715*c83a76b0SSuyog Pawar                             pf_err_compute(&s_err_prms);
3716*c83a76b0SSuyog Pawar                             /* Update the mv's with the current candt motion vectors */
3717*c83a76b0SSuyog Pawar                             s_result_prms.i2_mv_x = mvx_qpel + 1;
3718*c83a76b0SSuyog Pawar                             s_result_prms.i2_mv_y = mvy_qpel - 1;
3719*c83a76b0SSuyog Pawar                             hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3720*c83a76b0SSuyog Pawar 
3721*c83a76b0SSuyog Pawar                             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3722*c83a76b0SSuyog Pawar 
3723*c83a76b0SSuyog Pawar                             if(i4_tot_cost < i4_min_cost)
3724*c83a76b0SSuyog Pawar                             {
3725*c83a76b0SSuyog Pawar                                 e_min_id = PT_TR;
3726*c83a76b0SSuyog Pawar                                 i4_min_cost = i4_tot_cost;
3727*c83a76b0SSuyog Pawar                                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3728*c83a76b0SSuyog Pawar                             }
3729*c83a76b0SSuyog Pawar                         }
3730*c83a76b0SSuyog Pawar                     }
3731*c83a76b0SSuyog Pawar                     if(i4_grid_mask & BIT_EN(PT_BL))
3732*c83a76b0SSuyog Pawar                     {
3733*c83a76b0SSuyog Pawar                         CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3734*c83a76b0SSuyog Pawar                             ps_dedup_enabler, 1, mvx_qpel - 1, mvy_qpel + 1, check_for_duplicate);
3735*c83a76b0SSuyog Pawar 
3736*c83a76b0SSuyog Pawar                         if(!check_for_duplicate)
3737*c83a76b0SSuyog Pawar                         {
3738*c83a76b0SSuyog Pawar                             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x - 1;
3739*c83a76b0SSuyog Pawar                             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3740*c83a76b0SSuyog Pawar 
3741*c83a76b0SSuyog Pawar                             /* Carry out the interpolation */
3742*c83a76b0SSuyog Pawar                             pf_qpel_interp(
3743*c83a76b0SSuyog Pawar                                 &s_interp_prms, i4_mv_x - 1, i4_mv_y + 1, i4_interp_buf_id);
3744*c83a76b0SSuyog Pawar 
3745*c83a76b0SSuyog Pawar                             s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3746*c83a76b0SSuyog Pawar                             s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3747*c83a76b0SSuyog Pawar 
3748*c83a76b0SSuyog Pawar                             pf_err_compute(&s_err_prms);
3749*c83a76b0SSuyog Pawar                             /* Update the mv's with the current candt motion vectors */
3750*c83a76b0SSuyog Pawar                             s_result_prms.i2_mv_x = mvx_qpel - 1;
3751*c83a76b0SSuyog Pawar                             s_result_prms.i2_mv_y = mvy_qpel + 1;
3752*c83a76b0SSuyog Pawar                             hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3753*c83a76b0SSuyog Pawar 
3754*c83a76b0SSuyog Pawar                             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3755*c83a76b0SSuyog Pawar 
3756*c83a76b0SSuyog Pawar                             if(i4_tot_cost < i4_min_cost)
3757*c83a76b0SSuyog Pawar                             {
3758*c83a76b0SSuyog Pawar                                 e_min_id = PT_BL;
3759*c83a76b0SSuyog Pawar                                 i4_min_cost = i4_tot_cost;
3760*c83a76b0SSuyog Pawar                                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3761*c83a76b0SSuyog Pawar                             }
3762*c83a76b0SSuyog Pawar                         }
3763*c83a76b0SSuyog Pawar                     }
3764*c83a76b0SSuyog Pawar                     /* i4_mv_x and i4_mv_y will always be the centre pt */
3765*c83a76b0SSuyog Pawar                     /* for qpel we  start with least hpel, and hence compute of center pt never reqd */
3766*c83a76b0SSuyog Pawar                     if(i4_grid_mask & BIT_EN(PT_BR))
3767*c83a76b0SSuyog Pawar                     {
3768*c83a76b0SSuyog Pawar                         CHECK_FOR_DUPES_AND_INSERT_UNIQUE_NODES(
3769*c83a76b0SSuyog Pawar                             ps_dedup_enabler, 1, mvx_qpel + 1, mvy_qpel + 1, check_for_duplicate);
3770*c83a76b0SSuyog Pawar 
3771*c83a76b0SSuyog Pawar                         if(!check_for_duplicate)
3772*c83a76b0SSuyog Pawar                         {
3773*c83a76b0SSuyog Pawar                             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x + 1;
3774*c83a76b0SSuyog Pawar                             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y + 1;
3775*c83a76b0SSuyog Pawar 
3776*c83a76b0SSuyog Pawar                             /* Carry out the interpolation */
3777*c83a76b0SSuyog Pawar                             pf_qpel_interp(
3778*c83a76b0SSuyog Pawar                                 &s_interp_prms, i4_mv_x + 1, i4_mv_y + 1, i4_interp_buf_id);
3779*c83a76b0SSuyog Pawar 
3780*c83a76b0SSuyog Pawar                             s_err_prms.pu1_ref = s_interp_prms.pu1_final_out;
3781*c83a76b0SSuyog Pawar                             s_err_prms.i4_ref_stride = s_interp_prms.i4_final_out_stride;
3782*c83a76b0SSuyog Pawar 
3783*c83a76b0SSuyog Pawar                             pf_err_compute(&s_err_prms);
3784*c83a76b0SSuyog Pawar                             /* Update the mv's with the current candt motion vectors */
3785*c83a76b0SSuyog Pawar                             s_result_prms.i2_mv_x = mvx_qpel + 1;
3786*c83a76b0SSuyog Pawar                             s_result_prms.i2_mv_y = mvy_qpel + 1;
3787*c83a76b0SSuyog Pawar                             hme_update_results_pt_pu_best1_subpel_hs(&s_result_prms);
3788*c83a76b0SSuyog Pawar 
3789*c83a76b0SSuyog Pawar                             i4_tot_cost = s_err_prms.pi4_sad_grid[part_id];
3790*c83a76b0SSuyog Pawar 
3791*c83a76b0SSuyog Pawar                             if(i4_tot_cost < i4_min_cost)
3792*c83a76b0SSuyog Pawar                             {
3793*c83a76b0SSuyog Pawar                                 e_min_id = PT_BR;
3794*c83a76b0SSuyog Pawar                                 i4_min_cost = i4_tot_cost;
3795*c83a76b0SSuyog Pawar                                 i4_min_sad = s_err_prms.pi4_sad_grid[part_id];
3796*c83a76b0SSuyog Pawar                             }
3797*c83a76b0SSuyog Pawar                         }
3798*c83a76b0SSuyog Pawar                     }
3799*c83a76b0SSuyog Pawar                     if(e_min_id == PT_C)
3800*c83a76b0SSuyog Pawar                     {
3801*c83a76b0SSuyog Pawar                         break;
3802*c83a76b0SSuyog Pawar                     }
3803*c83a76b0SSuyog Pawar                 }
3804*c83a76b0SSuyog Pawar                 else
3805*c83a76b0SSuyog Pawar                 {
3806*c83a76b0SSuyog Pawar                     break;
3807*c83a76b0SSuyog Pawar                 }
3808*c83a76b0SSuyog Pawar             }
3809*c83a76b0SSuyog Pawar 
3810*c83a76b0SSuyog Pawar             if(i4_i)
3811*c83a76b0SSuyog Pawar             {
3812*c83a76b0SSuyog Pawar                 i4_grid_mask = gai4_opt_grid_mask_diamond[e_min_id];
3813*c83a76b0SSuyog Pawar             }
3814*c83a76b0SSuyog Pawar             else
3815*c83a76b0SSuyog Pawar             {
3816*c83a76b0SSuyog Pawar                 i4_grid_mask = gai4_opt_grid_mask_conventional[e_min_id];
3817*c83a76b0SSuyog Pawar             }
3818*c83a76b0SSuyog Pawar             i4_mv_x += gai1_grid_id_to_x[e_min_id];
3819*c83a76b0SSuyog Pawar             i4_mv_y += gai1_grid_id_to_y[e_min_id];
3820*c83a76b0SSuyog Pawar             ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3821*c83a76b0SSuyog Pawar             ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3822*c83a76b0SSuyog Pawar             i4_grid_mask &= hme_clamp_grid_by_mvrange(ps_search_node, 1, ps_range_prms);
3823*c83a76b0SSuyog Pawar         }
3824*c83a76b0SSuyog Pawar     }
3825*c83a76b0SSuyog Pawar 
3826*c83a76b0SSuyog Pawar     /* update modified motion vectors and cost at end of subpel */
3827*c83a76b0SSuyog Pawar     ps_search_node->s_mv.i2_mvx = (S16)i4_mv_x;
3828*c83a76b0SSuyog Pawar     ps_search_node->s_mv.i2_mvy = (S16)i4_mv_y;
3829*c83a76b0SSuyog Pawar     ps_search_node->i4_tot_cost = i4_min_cost;
3830*c83a76b0SSuyog Pawar     ps_search_node->i4_sad = i4_min_sad;
3831*c83a76b0SSuyog Pawar 
3832*c83a76b0SSuyog Pawar     /********************************************************************************/
3833*c83a76b0SSuyog Pawar     /* TODO: Restoring back Sad lambda from Hadamard lambda                         */
3834*c83a76b0SSuyog Pawar     /* Need to pass the had/satd lambda in more cleaner way for subpel cost compute */
3835*c83a76b0SSuyog Pawar     /********************************************************************************/
3836*c83a76b0SSuyog Pawar     //ps_pred_ctxt->lambda >>= 1;
3837*c83a76b0SSuyog Pawar 
3838*c83a76b0SSuyog Pawar     return (i4_min_cost);
3839*c83a76b0SSuyog Pawar }
3840*c83a76b0SSuyog Pawar #endif
3841*c83a76b0SSuyog Pawar 
hme_subpel_refine_struct_to_search_results_struct_converter(subpel_refine_ctxt_t * ps_subpel_refine_ctxt,search_results_t * ps_search_results,U08 u1_pred_dir,ME_QUALITY_PRESETS_T e_quality_preset)3842*c83a76b0SSuyog Pawar static void hme_subpel_refine_struct_to_search_results_struct_converter(
3843*c83a76b0SSuyog Pawar     subpel_refine_ctxt_t *ps_subpel_refine_ctxt,
3844*c83a76b0SSuyog Pawar     search_results_t *ps_search_results,
3845*c83a76b0SSuyog Pawar     U08 u1_pred_dir,
3846*c83a76b0SSuyog Pawar     ME_QUALITY_PRESETS_T e_quality_preset)
3847*c83a76b0SSuyog Pawar {
3848*c83a76b0SSuyog Pawar     U08 i;
3849*c83a76b0SSuyog Pawar 
3850*c83a76b0SSuyog Pawar     U08 u1_num_results_per_part = ps_search_results->u1_num_results_per_part;
3851*c83a76b0SSuyog Pawar 
3852*c83a76b0SSuyog Pawar     for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
3853*c83a76b0SSuyog Pawar     {
3854*c83a76b0SSuyog Pawar         S32 index;
3855*c83a76b0SSuyog Pawar         S32 i4_sad;
3856*c83a76b0SSuyog Pawar 
3857*c83a76b0SSuyog Pawar         S32 part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
3858*c83a76b0SSuyog Pawar 
3859*c83a76b0SSuyog Pawar         search_node_t *ps_best_node = ps_search_results->aps_part_results[u1_pred_dir][part_id];
3860*c83a76b0SSuyog Pawar 
3861*c83a76b0SSuyog Pawar         if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
3862*c83a76b0SSuyog Pawar         {
3863*c83a76b0SSuyog Pawar             index = part_id;
3864*c83a76b0SSuyog Pawar         }
3865*c83a76b0SSuyog Pawar         else
3866*c83a76b0SSuyog Pawar         {
3867*c83a76b0SSuyog Pawar             index = i;
3868*c83a76b0SSuyog Pawar         }
3869*c83a76b0SSuyog Pawar 
3870*c83a76b0SSuyog Pawar         if(!ps_best_node->u1_subpel_done)
3871*c83a76b0SSuyog Pawar         {
3872*c83a76b0SSuyog Pawar             i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3873*c83a76b0SSuyog Pawar                      ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3874*c83a76b0SSuyog Pawar             ps_best_node[0].i4_sdi = 0;
3875*c83a76b0SSuyog Pawar             ASSERT((e_quality_preset == ME_PRISTINE_QUALITY) ? (ps_best_node[0].i4_sdi >= 0) : 1);
3876*c83a76b0SSuyog Pawar             ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3877*c83a76b0SSuyog Pawar 
3878*c83a76b0SSuyog Pawar             if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3879*c83a76b0SSuyog Pawar             {
3880*c83a76b0SSuyog Pawar                 i4_sad = MAX_SIGNED_16BIT_VAL;
3881*c83a76b0SSuyog Pawar             }
3882*c83a76b0SSuyog Pawar 
3883*c83a76b0SSuyog Pawar             ps_best_node[0].i4_sad = i4_sad;
3884*c83a76b0SSuyog Pawar             ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3885*c83a76b0SSuyog Pawar             ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3886*c83a76b0SSuyog Pawar             ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3887*c83a76b0SSuyog Pawar             ps_best_node[0].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3888*c83a76b0SSuyog Pawar             ps_best_node->u1_subpel_done = 1;
3889*c83a76b0SSuyog Pawar 
3890*c83a76b0SSuyog Pawar             if(2 == u1_num_results_per_part)
3891*c83a76b0SSuyog Pawar             {
3892*c83a76b0SSuyog Pawar                 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] -
3893*c83a76b0SSuyog Pawar                          ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3894*c83a76b0SSuyog Pawar                 ps_best_node[1].i4_sdi = 0;
3895*c83a76b0SSuyog Pawar                 ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index];
3896*c83a76b0SSuyog Pawar 
3897*c83a76b0SSuyog Pawar                 if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL)
3898*c83a76b0SSuyog Pawar                 {
3899*c83a76b0SSuyog Pawar                     i4_sad = MAX_SIGNED_16BIT_VAL;
3900*c83a76b0SSuyog Pawar                 }
3901*c83a76b0SSuyog Pawar 
3902*c83a76b0SSuyog Pawar                 ps_best_node[1].i4_sad = i4_sad;
3903*c83a76b0SSuyog Pawar                 ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3904*c83a76b0SSuyog Pawar                 ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index];
3905*c83a76b0SSuyog Pawar                 ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index];
3906*c83a76b0SSuyog Pawar                 ps_best_node[1].i1_ref_idx = (WORD8)ps_subpel_refine_ctxt->i2_ref_idx[1][index];
3907*c83a76b0SSuyog Pawar                 ps_best_node[1].u1_subpel_done = 1;
3908*c83a76b0SSuyog Pawar             }
3909*c83a76b0SSuyog Pawar         }
3910*c83a76b0SSuyog Pawar         else if(
3911*c83a76b0SSuyog Pawar             (2 == u1_num_results_per_part) &&
3912*c83a76b0SSuyog Pawar             (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[1].i4_tot_cost))
3913*c83a76b0SSuyog Pawar         {
3914*c83a76b0SSuyog Pawar             if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] < ps_best_node[0].i4_tot_cost)
3915*c83a76b0SSuyog Pawar             {
3916*c83a76b0SSuyog Pawar                 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3917*c83a76b0SSuyog Pawar                          ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3918*c83a76b0SSuyog Pawar                 ps_best_node[0].i4_sdi = 0;
3919*c83a76b0SSuyog Pawar                 ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3920*c83a76b0SSuyog Pawar 
3921*c83a76b0SSuyog Pawar                 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3922*c83a76b0SSuyog Pawar                 {
3923*c83a76b0SSuyog Pawar                     i4_sad = MAX_SIGNED_16BIT_VAL;
3924*c83a76b0SSuyog Pawar                 }
3925*c83a76b0SSuyog Pawar 
3926*c83a76b0SSuyog Pawar                 ps_best_node[0].i4_sad = i4_sad;
3927*c83a76b0SSuyog Pawar                 ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3928*c83a76b0SSuyog Pawar                 ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3929*c83a76b0SSuyog Pawar                 ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3930*c83a76b0SSuyog Pawar                 ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3931*c83a76b0SSuyog Pawar 
3932*c83a76b0SSuyog Pawar                 i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[1][index] -
3933*c83a76b0SSuyog Pawar                          ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3934*c83a76b0SSuyog Pawar                 ps_best_node[1].i4_sdi = 0;
3935*c83a76b0SSuyog Pawar                 ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[1][index];
3936*c83a76b0SSuyog Pawar 
3937*c83a76b0SSuyog Pawar                 if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] == MAX_SIGNED_16BIT_VAL)
3938*c83a76b0SSuyog Pawar                 {
3939*c83a76b0SSuyog Pawar                     i4_sad = MAX_SIGNED_16BIT_VAL;
3940*c83a76b0SSuyog Pawar                 }
3941*c83a76b0SSuyog Pawar 
3942*c83a76b0SSuyog Pawar                 ps_best_node[1].i4_sad = i4_sad;
3943*c83a76b0SSuyog Pawar                 ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[1][index];
3944*c83a76b0SSuyog Pawar                 ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[1][index];
3945*c83a76b0SSuyog Pawar                 ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[1][index];
3946*c83a76b0SSuyog Pawar                 ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[1][index];
3947*c83a76b0SSuyog Pawar             }
3948*c83a76b0SSuyog Pawar             else if(ps_subpel_refine_ctxt->i2_tot_cost[1][index] > ps_best_node[0].i4_tot_cost)
3949*c83a76b0SSuyog Pawar             {
3950*c83a76b0SSuyog Pawar                 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >= ps_best_node[0].i4_tot_cost)
3951*c83a76b0SSuyog Pawar                 {
3952*c83a76b0SSuyog Pawar                     i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3953*c83a76b0SSuyog Pawar                              ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3954*c83a76b0SSuyog Pawar                     ps_best_node[1].i4_sdi = 0;
3955*c83a76b0SSuyog Pawar                     ps_best_node[1].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3956*c83a76b0SSuyog Pawar 
3957*c83a76b0SSuyog Pawar                     if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3958*c83a76b0SSuyog Pawar                     {
3959*c83a76b0SSuyog Pawar                         i4_sad = MAX_SIGNED_16BIT_VAL;
3960*c83a76b0SSuyog Pawar                     }
3961*c83a76b0SSuyog Pawar 
3962*c83a76b0SSuyog Pawar                     ps_best_node[1].i4_sad = i4_sad;
3963*c83a76b0SSuyog Pawar                     ps_best_node[1].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3964*c83a76b0SSuyog Pawar                     ps_best_node[1].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3965*c83a76b0SSuyog Pawar                     ps_best_node[1].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3966*c83a76b0SSuyog Pawar                     ps_best_node[1].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3967*c83a76b0SSuyog Pawar                 }
3968*c83a76b0SSuyog Pawar                 else if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost)
3969*c83a76b0SSuyog Pawar                 {
3970*c83a76b0SSuyog Pawar                     memmove(&ps_best_node[1], &ps_best_node[0], sizeof(search_node_t));
3971*c83a76b0SSuyog Pawar 
3972*c83a76b0SSuyog Pawar                     i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3973*c83a76b0SSuyog Pawar                              ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3974*c83a76b0SSuyog Pawar                     ps_best_node[0].i4_sdi = 0;
3975*c83a76b0SSuyog Pawar                     ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3976*c83a76b0SSuyog Pawar 
3977*c83a76b0SSuyog Pawar                     if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
3978*c83a76b0SSuyog Pawar                     {
3979*c83a76b0SSuyog Pawar                         i4_sad = MAX_SIGNED_16BIT_VAL;
3980*c83a76b0SSuyog Pawar                     }
3981*c83a76b0SSuyog Pawar 
3982*c83a76b0SSuyog Pawar                     ps_best_node[0].i4_sad = i4_sad;
3983*c83a76b0SSuyog Pawar                     ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3984*c83a76b0SSuyog Pawar                     ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
3985*c83a76b0SSuyog Pawar                     ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
3986*c83a76b0SSuyog Pawar                     ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
3987*c83a76b0SSuyog Pawar                 }
3988*c83a76b0SSuyog Pawar             }
3989*c83a76b0SSuyog Pawar         }
3990*c83a76b0SSuyog Pawar         else if(
3991*c83a76b0SSuyog Pawar             (1 == u1_num_results_per_part) &&
3992*c83a76b0SSuyog Pawar             (ps_subpel_refine_ctxt->i2_tot_cost[0][index] < ps_best_node[0].i4_tot_cost))
3993*c83a76b0SSuyog Pawar         {
3994*c83a76b0SSuyog Pawar             i4_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
3995*c83a76b0SSuyog Pawar                      ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3996*c83a76b0SSuyog Pawar             ps_best_node[0].i4_sdi = 0;
3997*c83a76b0SSuyog Pawar             ps_best_node[0].i4_tot_cost = ps_subpel_refine_ctxt->i2_tot_cost[0][index];
3998*c83a76b0SSuyog Pawar 
3999*c83a76b0SSuyog Pawar             if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] == MAX_SIGNED_16BIT_VAL)
4000*c83a76b0SSuyog Pawar             {
4001*c83a76b0SSuyog Pawar                 i4_sad = MAX_SIGNED_16BIT_VAL;
4002*c83a76b0SSuyog Pawar             }
4003*c83a76b0SSuyog Pawar 
4004*c83a76b0SSuyog Pawar             ps_best_node[0].i4_sad = i4_sad;
4005*c83a76b0SSuyog Pawar             ps_best_node[0].i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
4006*c83a76b0SSuyog Pawar             ps_best_node[0].s_mv.i2_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4007*c83a76b0SSuyog Pawar             ps_best_node[0].s_mv.i2_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4008*c83a76b0SSuyog Pawar             ps_best_node[0].i1_ref_idx = (S08)ps_subpel_refine_ctxt->i2_ref_idx[0][index];
4009*c83a76b0SSuyog Pawar         }
4010*c83a76b0SSuyog Pawar     }
4011*c83a76b0SSuyog Pawar }
4012*c83a76b0SSuyog Pawar 
4013*c83a76b0SSuyog Pawar /**
4014*c83a76b0SSuyog Pawar ********************************************************************************
4015*c83a76b0SSuyog Pawar *  @fn     S32 hme_subpel_refine_cu_hs
4016*c83a76b0SSuyog Pawar *
4017*c83a76b0SSuyog Pawar *  @brief  Evaluates the best subpel mvs for active partitions of an MB in L0
4018*c83a76b0SSuyog Pawar *          layer for the high speed preset. Recursive hadamard SATD / SAD
4019*c83a76b0SSuyog Pawar *          and mv cost is used for 2NxN and NxN partitions with active partition
4020*c83a76b0SSuyog Pawar *          update
4021*c83a76b0SSuyog Pawar *
4022*c83a76b0SSuyog Pawar *  @param[in]  ps_prms: subpel prms input to this function
4023*c83a76b0SSuyog Pawar *
4024*c83a76b0SSuyog Pawar *  @param[in]  ps_curr_layer: points to the current layer ctxt
4025*c83a76b0SSuyog Pawar *
4026*c83a76b0SSuyog Pawar *  @param[out] ps_search_results: points to the search results that get updated
4027*c83a76b0SSuyog Pawar *              with best results
4028*c83a76b0SSuyog Pawar *
4029*c83a76b0SSuyog Pawar *  @param[in]  search_idx:  ref id of the frame for which results get updated
4030*c83a76b0SSuyog Pawar *
4031*c83a76b0SSuyog Pawar *  @param[in]  ps_wt_inp_prms:  current frame input params
4032*c83a76b0SSuyog Pawar *
4033*c83a76b0SSuyog Pawar *  @return     None
4034*c83a76b0SSuyog Pawar ********************************************************************************
4035*c83a76b0SSuyog Pawar */
hme_subpel_refine_cu_hs(hme_subpel_prms_t * ps_prms,layer_ctxt_t * ps_curr_layer,search_results_t * ps_search_results,S32 search_idx,wgt_pred_ctxt_t * ps_wt_inp_prms,WORD32 blk_8x8_mask,me_func_selector_t * ps_func_selector,ihevce_cmn_opt_func_t * ps_cmn_utils_optimised_function_list,ihevce_me_optimised_function_list_t * ps_me_optimised_function_list)4036*c83a76b0SSuyog Pawar void hme_subpel_refine_cu_hs(
4037*c83a76b0SSuyog Pawar     hme_subpel_prms_t *ps_prms,
4038*c83a76b0SSuyog Pawar     layer_ctxt_t *ps_curr_layer,
4039*c83a76b0SSuyog Pawar     search_results_t *ps_search_results,
4040*c83a76b0SSuyog Pawar     S32 search_idx,
4041*c83a76b0SSuyog Pawar     wgt_pred_ctxt_t *ps_wt_inp_prms,
4042*c83a76b0SSuyog Pawar     WORD32 blk_8x8_mask,
4043*c83a76b0SSuyog Pawar     me_func_selector_t *ps_func_selector,
4044*c83a76b0SSuyog Pawar     ihevce_cmn_opt_func_t *ps_cmn_utils_optimised_function_list,
4045*c83a76b0SSuyog Pawar     ihevce_me_optimised_function_list_t *ps_me_optimised_function_list)
4046*c83a76b0SSuyog Pawar {
4047*c83a76b0SSuyog Pawar     /* Unique search node list for 2nx2n and nxn partitions */
4048*c83a76b0SSuyog Pawar     search_node_t as_nodes_2nx2n[MAX_RESULTS_PER_PART * 5];
4049*c83a76b0SSuyog Pawar     subpel_dedup_enabler_t as_subpel_dedup_enabler[MAX_NUM_REF];
4050*c83a76b0SSuyog Pawar     search_node_t *ps_search_node;
4051*c83a76b0SSuyog Pawar 
4052*c83a76b0SSuyog Pawar     S32 i, i4_part_mask, j;
4053*c83a76b0SSuyog Pawar     S32 i4_sad_grid;
4054*c83a76b0SSuyog Pawar     S32 max_subpel_cand;
4055*c83a76b0SSuyog Pawar     WORD32 index;
4056*c83a76b0SSuyog Pawar     S32 num_unique_nodes_2nx2n;
4057*c83a76b0SSuyog Pawar     S32 part_id;
4058*c83a76b0SSuyog Pawar     S32 x_off, y_off;
4059*c83a76b0SSuyog Pawar     S32 i4_inp_off;
4060*c83a76b0SSuyog Pawar 
4061*c83a76b0SSuyog Pawar     CU_SIZE_T e_cu_size;
4062*c83a76b0SSuyog Pawar     BLK_SIZE_T e_blk_size;
4063*c83a76b0SSuyog Pawar 
4064*c83a76b0SSuyog Pawar     subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_prms->ps_subpel_refine_ctxt;
4065*c83a76b0SSuyog Pawar 
4066*c83a76b0SSuyog Pawar     S32 i4_use_satd = ps_prms->i4_use_satd;
4067*c83a76b0SSuyog Pawar     S32 i4_num_act_refs = ps_prms->i4_num_act_ref_l0 + ps_prms->i4_num_act_ref_l1;
4068*c83a76b0SSuyog Pawar 
4069*c83a76b0SSuyog Pawar     ASSERT(ps_search_results->u1_num_results_per_part <= MAX_RESULTS_PER_PART);
4070*c83a76b0SSuyog Pawar 
4071*c83a76b0SSuyog Pawar     if(!DISABLE_SUBPEL_REFINEMENT_WHEN_SRC_IS_NOISY || !ps_prms->u1_is_cu_noisy)
4072*c83a76b0SSuyog Pawar     {
4073*c83a76b0SSuyog Pawar         e_cu_size = ps_search_results->e_cu_size;
4074*c83a76b0SSuyog Pawar         i4_part_mask = ps_search_results->i4_part_mask;
4075*c83a76b0SSuyog Pawar 
4076*c83a76b0SSuyog Pawar         ps_prms->i4_inp_type = sizeof(U08);
4077*c83a76b0SSuyog Pawar 
4078*c83a76b0SSuyog Pawar         num_unique_nodes_2nx2n = 0;
4079*c83a76b0SSuyog Pawar 
4080*c83a76b0SSuyog Pawar         for(i = 0; i < i4_num_act_refs; i++)
4081*c83a76b0SSuyog Pawar         {
4082*c83a76b0SSuyog Pawar             as_subpel_dedup_enabler[i].u1_ref_idx = MAX_NUM_REF;
4083*c83a76b0SSuyog Pawar         }
4084*c83a76b0SSuyog Pawar 
4085*c83a76b0SSuyog Pawar         /************************************************************************/
4086*c83a76b0SSuyog Pawar         /*                                                                      */
4087*c83a76b0SSuyog Pawar         /*  Initialize SATD cost for each valid partition id.one time before    */
4088*c83a76b0SSuyog Pawar         /*  doing full pel time. This is because of the following reasons:      */
4089*c83a76b0SSuyog Pawar         /*   1. Full pel cost was done in  SAD while subpel is in SATD mode     */
4090*c83a76b0SSuyog Pawar         /*   2. Partitions like AMP, Nx2N and 2NxN are refined on the fly while */
4091*c83a76b0SSuyog Pawar         /*      doing Diamond search for 2Nx2N and NxN. This partitions are     */
4092*c83a76b0SSuyog Pawar         /*      not explicitly refine in high speed mode                        */
4093*c83a76b0SSuyog Pawar         /*                                                                      */
4094*c83a76b0SSuyog Pawar         /************************************************************************/
4095*c83a76b0SSuyog Pawar         for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
4096*c83a76b0SSuyog Pawar         {
4097*c83a76b0SSuyog Pawar             S32 enable_subpel = 0;
4098*c83a76b0SSuyog Pawar             S32 part_type;
4099*c83a76b0SSuyog Pawar 
4100*c83a76b0SSuyog Pawar             /* Derive the x and y offsets of this part id */
4101*c83a76b0SSuyog Pawar             part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
4102*c83a76b0SSuyog Pawar             if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
4103*c83a76b0SSuyog Pawar             {
4104*c83a76b0SSuyog Pawar                 index = part_id;
4105*c83a76b0SSuyog Pawar             }
4106*c83a76b0SSuyog Pawar             else
4107*c83a76b0SSuyog Pawar             {
4108*c83a76b0SSuyog Pawar                 index = i;
4109*c83a76b0SSuyog Pawar             }
4110*c83a76b0SSuyog Pawar 
4111*c83a76b0SSuyog Pawar             part_type = ge_part_id_to_part_type[part_id];
4112*c83a76b0SSuyog Pawar             x_off = gas_part_attr_in_cu[part_id].u1_x_start << e_cu_size;
4113*c83a76b0SSuyog Pawar             y_off = gas_part_attr_in_cu[part_id].u1_y_start << e_cu_size;
4114*c83a76b0SSuyog Pawar             x_off += ps_search_results->u1_x_off;
4115*c83a76b0SSuyog Pawar             y_off += ps_search_results->u1_y_off;
4116*c83a76b0SSuyog Pawar             i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride;
4117*c83a76b0SSuyog Pawar             e_blk_size = ge_part_id_to_blk_size[e_cu_size][part_id];
4118*c83a76b0SSuyog Pawar 
4119*c83a76b0SSuyog Pawar             x_off += ps_prms->i4_ctb_x_off;
4120*c83a76b0SSuyog Pawar             y_off += ps_prms->i4_ctb_y_off;
4121*c83a76b0SSuyog Pawar 
4122*c83a76b0SSuyog Pawar             max_subpel_cand = 0;
4123*c83a76b0SSuyog Pawar 
4124*c83a76b0SSuyog Pawar             /* Choose the minimum number of candidates to be used for Sub pel refinement */
4125*c83a76b0SSuyog Pawar             if(PART_ID_2Nx2N == part_type)
4126*c83a76b0SSuyog Pawar             {
4127*c83a76b0SSuyog Pawar                 max_subpel_cand =
4128*c83a76b0SSuyog Pawar                     MIN(ps_prms->u1_max_subpel_candts_2Nx2N,
4129*c83a76b0SSuyog Pawar                         ps_search_results->u1_num_results_per_part);
4130*c83a76b0SSuyog Pawar             }
4131*c83a76b0SSuyog Pawar             else if(PRT_NxN == part_type)
4132*c83a76b0SSuyog Pawar             {
4133*c83a76b0SSuyog Pawar                 max_subpel_cand = MIN(
4134*c83a76b0SSuyog Pawar                     ps_prms->u1_max_subpel_candts_NxN, ps_search_results->u1_num_results_per_part);
4135*c83a76b0SSuyog Pawar             }
4136*c83a76b0SSuyog Pawar 
4137*c83a76b0SSuyog Pawar             /* If incomplete CTB, NxN num candidates should be forced to min 1 */
4138*c83a76b0SSuyog Pawar             if((0 == max_subpel_cand) && (blk_8x8_mask != 15))
4139*c83a76b0SSuyog Pawar             {
4140*c83a76b0SSuyog Pawar                 max_subpel_cand = 1;
4141*c83a76b0SSuyog Pawar             }
4142*c83a76b0SSuyog Pawar 
4143*c83a76b0SSuyog Pawar             if((PART_ID_2Nx2N == part_type) || (PRT_NxN == part_type))
4144*c83a76b0SSuyog Pawar             {
4145*c83a76b0SSuyog Pawar                 enable_subpel = 1;
4146*c83a76b0SSuyog Pawar             }
4147*c83a76b0SSuyog Pawar 
4148*c83a76b0SSuyog Pawar             /* Compute full pel SATD for each result per partition before subpel */
4149*c83a76b0SSuyog Pawar             /* refinement starts.                                                */
4150*c83a76b0SSuyog Pawar             /* Also prepare unique candidate list for 2Nx2N and NxN partitions   */
4151*c83a76b0SSuyog Pawar             for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
4152*c83a76b0SSuyog Pawar             {
4153*c83a76b0SSuyog Pawar                 err_prms_t s_err_prms;
4154*c83a76b0SSuyog Pawar                 S32 i4_satd = 0;
4155*c83a76b0SSuyog Pawar                 S32 i1_ref_idx;
4156*c83a76b0SSuyog Pawar                 U08 *pu1_ref_base;
4157*c83a76b0SSuyog Pawar                 S32 i4_ref_stride = ps_curr_layer->i4_rec_stride;
4158*c83a76b0SSuyog Pawar                 S32 i4_mv_x, i4_mv_y;
4159*c83a76b0SSuyog Pawar 
4160*c83a76b0SSuyog Pawar                 ps_search_node = ps_search_results->aps_part_results[search_idx][part_id] + j;
4161*c83a76b0SSuyog Pawar 
4162*c83a76b0SSuyog Pawar                 if(ps_subpel_refine_ctxt->i2_mv_x[j][index] == INTRA_MV)
4163*c83a76b0SSuyog Pawar                 {
4164*c83a76b0SSuyog Pawar                     ps_search_node->u1_subpel_done = 1;
4165*c83a76b0SSuyog Pawar                     continue;
4166*c83a76b0SSuyog Pawar                 }
4167*c83a76b0SSuyog Pawar 
4168*c83a76b0SSuyog Pawar                 i1_ref_idx = ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4169*c83a76b0SSuyog Pawar                 ps_prms->pv_inp = (void *)(ps_wt_inp_prms->apu1_wt_inp[i1_ref_idx] + i4_inp_off);
4170*c83a76b0SSuyog Pawar                 pu1_ref_base = ps_curr_layer->ppu1_list_rec_fxfy[i1_ref_idx];
4171*c83a76b0SSuyog Pawar 
4172*c83a76b0SSuyog Pawar                 i4_mv_x = ps_subpel_refine_ctxt->i2_mv_x[j][index];
4173*c83a76b0SSuyog Pawar                 i4_mv_y = ps_subpel_refine_ctxt->i2_mv_y[j][index];
4174*c83a76b0SSuyog Pawar 
4175*c83a76b0SSuyog Pawar                 if(i4_use_satd)
4176*c83a76b0SSuyog Pawar                 {
4177*c83a76b0SSuyog Pawar                     s_err_prms.pu1_inp = (U08 *)ps_prms->pv_inp;
4178*c83a76b0SSuyog Pawar                     s_err_prms.i4_inp_stride = ps_prms->i4_inp_stride;
4179*c83a76b0SSuyog Pawar                     s_err_prms.pu1_ref = pu1_ref_base + x_off + (y_off * i4_ref_stride) + i4_mv_x +
4180*c83a76b0SSuyog Pawar                                          (i4_mv_y * i4_ref_stride);
4181*c83a76b0SSuyog Pawar 
4182*c83a76b0SSuyog Pawar                     s_err_prms.i4_ref_stride = i4_ref_stride;
4183*c83a76b0SSuyog Pawar                     s_err_prms.i4_part_mask = (ENABLE_2Nx2N);
4184*c83a76b0SSuyog Pawar                     s_err_prms.i4_grid_mask = 1;
4185*c83a76b0SSuyog Pawar                     s_err_prms.pi4_sad_grid = &i4_sad_grid;
4186*c83a76b0SSuyog Pawar                     s_err_prms.i4_blk_wd = gau1_blk_size_to_wd[e_blk_size];
4187*c83a76b0SSuyog Pawar                     s_err_prms.i4_blk_ht = gau1_blk_size_to_ht[e_blk_size];
4188*c83a76b0SSuyog Pawar 
4189*c83a76b0SSuyog Pawar                     s_err_prms.ps_cmn_utils_optimised_function_list =
4190*c83a76b0SSuyog Pawar                         ps_cmn_utils_optimised_function_list;
4191*c83a76b0SSuyog Pawar 
4192*c83a76b0SSuyog Pawar                     compute_satd_8bit(&s_err_prms);
4193*c83a76b0SSuyog Pawar 
4194*c83a76b0SSuyog Pawar                     i4_satd = s_err_prms.pi4_sad_grid[0];
4195*c83a76b0SSuyog Pawar 
4196*c83a76b0SSuyog Pawar                     ps_subpel_refine_ctxt->i2_tot_cost[j][index] =
4197*c83a76b0SSuyog Pawar                         CLIP_S16(ps_subpel_refine_ctxt->i2_mv_cost[j][index] + i4_satd);
4198*c83a76b0SSuyog Pawar                     ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index] = i4_satd;
4199*c83a76b0SSuyog Pawar                 }
4200*c83a76b0SSuyog Pawar 
4201*c83a76b0SSuyog Pawar                 /* Sub-pel candidate filtration */
4202*c83a76b0SSuyog Pawar                 if(j)
4203*c83a76b0SSuyog Pawar                 {
4204*c83a76b0SSuyog Pawar                     S16 i2_best_sad;
4205*c83a76b0SSuyog Pawar                     S32 i4_best_mvx;
4206*c83a76b0SSuyog Pawar                     S32 i4_best_mvy;
4207*c83a76b0SSuyog Pawar 
4208*c83a76b0SSuyog Pawar                     search_node_t *ps_node =
4209*c83a76b0SSuyog Pawar                         ps_search_results->aps_part_results[search_idx][part_id];
4210*c83a76b0SSuyog Pawar 
4211*c83a76b0SSuyog Pawar                     U08 u1_is_subpel_done = ps_node->u1_subpel_done;
4212*c83a76b0SSuyog Pawar                     S16 i2_curr_sad = ps_subpel_refine_ctxt->ai2_fullpel_satd[j][index];
4213*c83a76b0SSuyog Pawar                     S32 i4_curr_mvx = i4_mv_x << 2;
4214*c83a76b0SSuyog Pawar                     S32 i4_curr_mvy = i4_mv_y << 2;
4215*c83a76b0SSuyog Pawar 
4216*c83a76b0SSuyog Pawar                     if(u1_is_subpel_done)
4217*c83a76b0SSuyog Pawar                     {
4218*c83a76b0SSuyog Pawar                         i2_best_sad = ps_node->i4_sad;
4219*c83a76b0SSuyog Pawar 
4220*c83a76b0SSuyog Pawar                         if(ps_node->i1_ref_idx == i1_ref_idx)
4221*c83a76b0SSuyog Pawar                         {
4222*c83a76b0SSuyog Pawar                             i4_best_mvx = ps_node->s_mv.i2_mvx;
4223*c83a76b0SSuyog Pawar                             i4_best_mvy = ps_node->s_mv.i2_mvy;
4224*c83a76b0SSuyog Pawar                         }
4225*c83a76b0SSuyog Pawar                         else if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index])
4226*c83a76b0SSuyog Pawar                         {
4227*c83a76b0SSuyog Pawar                             i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4228*c83a76b0SSuyog Pawar                             i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4229*c83a76b0SSuyog Pawar                         }
4230*c83a76b0SSuyog Pawar                         else
4231*c83a76b0SSuyog Pawar                         {
4232*c83a76b0SSuyog Pawar                             i4_best_mvx = INTRA_MV;
4233*c83a76b0SSuyog Pawar                             i4_best_mvy = INTRA_MV;
4234*c83a76b0SSuyog Pawar                         }
4235*c83a76b0SSuyog Pawar                     }
4236*c83a76b0SSuyog Pawar                     else
4237*c83a76b0SSuyog Pawar                     {
4238*c83a76b0SSuyog Pawar                         i2_best_sad = ps_subpel_refine_ctxt->i2_tot_cost[0][index] -
4239*c83a76b0SSuyog Pawar                                       ps_subpel_refine_ctxt->i2_mv_cost[0][index];
4240*c83a76b0SSuyog Pawar 
4241*c83a76b0SSuyog Pawar                         if(i1_ref_idx == ps_subpel_refine_ctxt->i2_ref_idx[0][index])
4242*c83a76b0SSuyog Pawar                         {
4243*c83a76b0SSuyog Pawar                             i4_best_mvx = ps_subpel_refine_ctxt->i2_mv_x[0][index];
4244*c83a76b0SSuyog Pawar                             i4_best_mvy = ps_subpel_refine_ctxt->i2_mv_y[0][index];
4245*c83a76b0SSuyog Pawar                         }
4246*c83a76b0SSuyog Pawar                         else
4247*c83a76b0SSuyog Pawar                         {
4248*c83a76b0SSuyog Pawar                             i4_best_mvx = INTRA_MV;
4249*c83a76b0SSuyog Pawar                             i4_best_mvy = INTRA_MV;
4250*c83a76b0SSuyog Pawar                         }
4251*c83a76b0SSuyog Pawar                     }
4252*c83a76b0SSuyog Pawar 
4253*c83a76b0SSuyog Pawar                     i2_best_sad += (i2_best_sad >> ps_prms->u1_subpel_candt_threshold);
4254*c83a76b0SSuyog Pawar 
4255*c83a76b0SSuyog Pawar                     if(((ABS(i4_curr_mvx - i4_best_mvx) < 2) &&
4256*c83a76b0SSuyog Pawar                         (ABS(i4_curr_mvy - i4_best_mvy) < 2)) ||
4257*c83a76b0SSuyog Pawar                        (i2_curr_sad > i2_best_sad))
4258*c83a76b0SSuyog Pawar                     {
4259*c83a76b0SSuyog Pawar                         enable_subpel = 0;
4260*c83a76b0SSuyog Pawar                     }
4261*c83a76b0SSuyog Pawar                 }
4262*c83a76b0SSuyog Pawar 
4263*c83a76b0SSuyog Pawar                 ps_search_node->u1_part_id = part_id;
4264*c83a76b0SSuyog Pawar 
4265*c83a76b0SSuyog Pawar                 /* Convert mvs in part results from FPEL to QPEL units */
4266*c83a76b0SSuyog Pawar                 ps_subpel_refine_ctxt->i2_mv_x[j][index] <<= 2;
4267*c83a76b0SSuyog Pawar                 ps_subpel_refine_ctxt->i2_mv_y[j][index] <<= 2;
4268*c83a76b0SSuyog Pawar 
4269*c83a76b0SSuyog Pawar                 /* If the candidate number is more than the number of candts
4270*c83a76b0SSuyog Pawar                 set initially, do not add those candts for refinement */
4271*c83a76b0SSuyog Pawar                 if(j >= max_subpel_cand)
4272*c83a76b0SSuyog Pawar                 {
4273*c83a76b0SSuyog Pawar                     enable_subpel = 0;
4274*c83a76b0SSuyog Pawar                 }
4275*c83a76b0SSuyog Pawar 
4276*c83a76b0SSuyog Pawar                 if(enable_subpel)
4277*c83a76b0SSuyog Pawar                 {
4278*c83a76b0SSuyog Pawar                     if(num_unique_nodes_2nx2n == 0)
4279*c83a76b0SSuyog Pawar                     {
4280*c83a76b0SSuyog Pawar                         S32 i4_index = ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4281*c83a76b0SSuyog Pawar 
4282*c83a76b0SSuyog Pawar                         as_subpel_dedup_enabler[i4_index].i2_mv_x =
4283*c83a76b0SSuyog Pawar                             ps_subpel_refine_ctxt->i2_mv_x[j][index];
4284*c83a76b0SSuyog Pawar                         as_subpel_dedup_enabler[i4_index].i2_mv_y =
4285*c83a76b0SSuyog Pawar                             ps_subpel_refine_ctxt->i2_mv_y[j][index];
4286*c83a76b0SSuyog Pawar                         as_subpel_dedup_enabler[i4_index].u1_ref_idx =
4287*c83a76b0SSuyog Pawar                             (U08)ps_subpel_refine_ctxt->i2_ref_idx[j][index];
4288*c83a76b0SSuyog Pawar                         memset(
4289*c83a76b0SSuyog Pawar                             as_subpel_dedup_enabler[i4_index].au4_node_map,
4290*c83a76b0SSuyog Pawar                             0,
4291*c83a76b0SSuyog Pawar                             sizeof(U32) * 2 * MAP_X_MAX);
4292*c83a76b0SSuyog Pawar                     }
4293*c83a76b0SSuyog Pawar                     INSERT_NEW_NODE_NOMAP_ALTERNATE(
4294*c83a76b0SSuyog Pawar                         as_nodes_2nx2n, num_unique_nodes_2nx2n, ps_subpel_refine_ctxt, j, i);
4295*c83a76b0SSuyog Pawar                 }
4296*c83a76b0SSuyog Pawar             }
4297*c83a76b0SSuyog Pawar 
4298*c83a76b0SSuyog Pawar             /*********************************************************************************************/
4299*c83a76b0SSuyog Pawar             /* If sad_1 < sad_2, then satd_1 need not be lesser than satd_2. Therefore, after conversion */
4300*c83a76b0SSuyog Pawar             /* to satd, tot_cost_1 may not be lesser than tot_cost_2. So we need to sort the search nodes*/
4301*c83a76b0SSuyog Pawar             /* for each partition again, based on the new costs                                          */
4302*c83a76b0SSuyog Pawar             /*********************************************************************************************/
4303*c83a76b0SSuyog Pawar             /*********************************************************************************************/
4304*c83a76b0SSuyog Pawar             /* Because right now, we store only the two best candidates for each partition, the sort will*/
4305*c83a76b0SSuyog Pawar             /* converge to a simple swap.                                                                */
4306*c83a76b0SSuyog Pawar             /* ASSUMPTION : We store only two best results per partition                                 */
4307*c83a76b0SSuyog Pawar             /*********************************************************************************************/
4308*c83a76b0SSuyog Pawar             if(ps_search_results->u1_num_results_per_part == 2)
4309*c83a76b0SSuyog Pawar             {
4310*c83a76b0SSuyog Pawar                 if(ps_subpel_refine_ctxt->i2_tot_cost[0][index] >
4311*c83a76b0SSuyog Pawar                    ps_subpel_refine_ctxt->i2_tot_cost[1][index])
4312*c83a76b0SSuyog Pawar                 {
4313*c83a76b0SSuyog Pawar                     SWAP(
4314*c83a76b0SSuyog Pawar                         ps_subpel_refine_ctxt->i2_tot_cost[0][index],
4315*c83a76b0SSuyog Pawar                         ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
4316*c83a76b0SSuyog Pawar 
4317*c83a76b0SSuyog Pawar                     SWAP(
4318*c83a76b0SSuyog Pawar                         ps_subpel_refine_ctxt->i2_mv_cost[0][index],
4319*c83a76b0SSuyog Pawar                         ps_subpel_refine_ctxt->i2_mv_cost[1][index]);
4320*c83a76b0SSuyog Pawar 
4321*c83a76b0SSuyog Pawar                     SWAP(
4322*c83a76b0SSuyog Pawar                         ps_subpel_refine_ctxt->i2_mv_x[0][index],
4323*c83a76b0SSuyog Pawar                         ps_subpel_refine_ctxt->i2_mv_x[1][index]);
4324*c83a76b0SSuyog Pawar 
4325*c83a76b0SSuyog Pawar                     SWAP(
4326*c83a76b0SSuyog Pawar                         ps_subpel_refine_ctxt->i2_mv_y[0][index],
4327*c83a76b0SSuyog Pawar                         ps_subpel_refine_ctxt->i2_mv_y[1][index]);
4328*c83a76b0SSuyog Pawar 
4329*c83a76b0SSuyog Pawar                     SWAP(
4330*c83a76b0SSuyog Pawar                         ps_subpel_refine_ctxt->i2_ref_idx[0][index],
4331*c83a76b0SSuyog Pawar                         ps_subpel_refine_ctxt->i2_ref_idx[1][index]);
4332*c83a76b0SSuyog Pawar 
4333*c83a76b0SSuyog Pawar                     SWAP(
4334*c83a76b0SSuyog Pawar                         ps_subpel_refine_ctxt->ai2_fullpel_satd[0][index],
4335*c83a76b0SSuyog Pawar                         ps_subpel_refine_ctxt->ai2_fullpel_satd[1][index]);
4336*c83a76b0SSuyog Pawar                 }
4337*c83a76b0SSuyog Pawar             }
4338*c83a76b0SSuyog Pawar         }
4339*c83a76b0SSuyog Pawar 
4340*c83a76b0SSuyog Pawar         if(blk_8x8_mask == 0xf)
4341*c83a76b0SSuyog Pawar         {
4342*c83a76b0SSuyog Pawar             num_unique_nodes_2nx2n =
4343*c83a76b0SSuyog Pawar                 MIN(num_unique_nodes_2nx2n, ps_prms->u1_max_num_subpel_refine_centers);
4344*c83a76b0SSuyog Pawar         }
4345*c83a76b0SSuyog Pawar         {
4346*c83a76b0SSuyog Pawar             x_off = gas_part_attr_in_cu[0].u1_x_start << e_cu_size;
4347*c83a76b0SSuyog Pawar             y_off = gas_part_attr_in_cu[0].u1_y_start << e_cu_size;
4348*c83a76b0SSuyog Pawar             x_off += ps_search_results->u1_x_off;
4349*c83a76b0SSuyog Pawar             y_off += ps_search_results->u1_y_off;
4350*c83a76b0SSuyog Pawar             i4_inp_off = x_off + y_off * ps_prms->i4_inp_stride;
4351*c83a76b0SSuyog Pawar             e_blk_size = ge_part_id_to_blk_size[e_cu_size][0];
4352*c83a76b0SSuyog Pawar 
4353*c83a76b0SSuyog Pawar             for(j = 0; j < num_unique_nodes_2nx2n; j++)
4354*c83a76b0SSuyog Pawar             {
4355*c83a76b0SSuyog Pawar                 S32 pred_lx;
4356*c83a76b0SSuyog Pawar                 ps_search_node = &as_nodes_2nx2n[j];
4357*c83a76b0SSuyog Pawar 
4358*c83a76b0SSuyog Pawar                 if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
4359*c83a76b0SSuyog Pawar                 {
4360*c83a76b0SSuyog Pawar                     continue;
4361*c83a76b0SSuyog Pawar                 }
4362*c83a76b0SSuyog Pawar 
4363*c83a76b0SSuyog Pawar                 {
4364*c83a76b0SSuyog Pawar                     S08 i1_ref_idx = ps_search_node->i1_ref_idx;
4365*c83a76b0SSuyog Pawar                     subpel_dedup_enabler_t *ps_dedup_enabler =
4366*c83a76b0SSuyog Pawar                         &(as_subpel_dedup_enabler[i1_ref_idx]);
4367*c83a76b0SSuyog Pawar 
4368*c83a76b0SSuyog Pawar                     if(ps_dedup_enabler->u1_ref_idx == MAX_NUM_REF)
4369*c83a76b0SSuyog Pawar                     {
4370*c83a76b0SSuyog Pawar                         as_subpel_dedup_enabler[i1_ref_idx].i2_mv_x = ps_search_node->s_mv.i2_mvx;
4371*c83a76b0SSuyog Pawar                         as_subpel_dedup_enabler[i1_ref_idx].i2_mv_y = ps_search_node->s_mv.i2_mvy;
4372*c83a76b0SSuyog Pawar                         as_subpel_dedup_enabler[i1_ref_idx].u1_ref_idx = i1_ref_idx;
4373*c83a76b0SSuyog Pawar                         memset(
4374*c83a76b0SSuyog Pawar                             as_subpel_dedup_enabler[i1_ref_idx].au4_node_map,
4375*c83a76b0SSuyog Pawar                             0,
4376*c83a76b0SSuyog Pawar                             sizeof(U32) * 2 * MAP_X_MAX);
4377*c83a76b0SSuyog Pawar                     }
4378*c83a76b0SSuyog Pawar                 }
4379*c83a76b0SSuyog Pawar 
4380*c83a76b0SSuyog Pawar                 pred_lx = search_idx;
4381*c83a76b0SSuyog Pawar                 ps_prms->pv_inp =
4382*c83a76b0SSuyog Pawar                     (void *)(ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off);
4383*c83a76b0SSuyog Pawar 
4384*c83a76b0SSuyog Pawar                 hme_subpel_refine_search_node_high_speed(
4385*c83a76b0SSuyog Pawar                     ps_search_node,
4386*c83a76b0SSuyog Pawar                     ps_prms,
4387*c83a76b0SSuyog Pawar                     ps_curr_layer,
4388*c83a76b0SSuyog Pawar                     e_blk_size,
4389*c83a76b0SSuyog Pawar                     x_off + ps_prms->i4_ctb_x_off,
4390*c83a76b0SSuyog Pawar                     y_off + ps_prms->i4_ctb_y_off,
4391*c83a76b0SSuyog Pawar                     ps_search_results,
4392*c83a76b0SSuyog Pawar                     pred_lx,
4393*c83a76b0SSuyog Pawar                     i4_part_mask,
4394*c83a76b0SSuyog Pawar                     &ps_subpel_refine_ctxt->ai4_part_id[0],
4395*c83a76b0SSuyog Pawar                     search_idx,
4396*c83a76b0SSuyog Pawar                     &(as_subpel_dedup_enabler[ps_search_node->i1_ref_idx]),
4397*c83a76b0SSuyog Pawar                     ps_func_selector,
4398*c83a76b0SSuyog Pawar                     ps_me_optimised_function_list);
4399*c83a76b0SSuyog Pawar             }
4400*c83a76b0SSuyog Pawar         }
4401*c83a76b0SSuyog Pawar     }
4402*c83a76b0SSuyog Pawar     else
4403*c83a76b0SSuyog Pawar     {
4404*c83a76b0SSuyog Pawar         for(i = 0; i < ps_subpel_refine_ctxt->i4_num_valid_parts; i++)
4405*c83a76b0SSuyog Pawar         {
4406*c83a76b0SSuyog Pawar             S32 i4_index;
4407*c83a76b0SSuyog Pawar 
4408*c83a76b0SSuyog Pawar             S32 i4_part_id = ps_subpel_refine_ctxt->ai4_part_id[i];
4409*c83a76b0SSuyog Pawar 
4410*c83a76b0SSuyog Pawar             if(ps_subpel_refine_ctxt->i4_num_valid_parts > 8)
4411*c83a76b0SSuyog Pawar             {
4412*c83a76b0SSuyog Pawar                 i4_index = i4_part_id;
4413*c83a76b0SSuyog Pawar             }
4414*c83a76b0SSuyog Pawar             else
4415*c83a76b0SSuyog Pawar             {
4416*c83a76b0SSuyog Pawar                 i4_index = i;
4417*c83a76b0SSuyog Pawar             }
4418*c83a76b0SSuyog Pawar 
4419*c83a76b0SSuyog Pawar             for(j = 0; j < ps_search_results->u1_num_results_per_part; j++)
4420*c83a76b0SSuyog Pawar             {
4421*c83a76b0SSuyog Pawar                 ps_subpel_refine_ctxt->i2_mv_x[j][i4_index] <<= 2;
4422*c83a76b0SSuyog Pawar                 ps_subpel_refine_ctxt->i2_mv_y[j][i4_index] <<= 2;
4423*c83a76b0SSuyog Pawar             }
4424*c83a76b0SSuyog Pawar         }
4425*c83a76b0SSuyog Pawar     }
4426*c83a76b0SSuyog Pawar 
4427*c83a76b0SSuyog Pawar     hme_subpel_refine_struct_to_search_results_struct_converter(
4428*c83a76b0SSuyog Pawar         ps_subpel_refine_ctxt, ps_search_results, search_idx, ps_prms->e_me_quality_presets);
4429*c83a76b0SSuyog Pawar }
4430