1 /******************************************************************************
2 *
3 * Copyright (C) 2018 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20
21 /*!
22 ***************************************************************************
23 * \file hme_err_compute.c
24 *
25 * \brief
26 * SAD / SATD routines for error computation
27 *
28 * Detailed_description : Contains various types of SAD/SATD routines for
29 * error computation between a given input and reference ptr. The SAD
30 * routines can evaluate for either a single point or a grid, and can
31 * evaluate with either partial updates or no partial updates. Partial
32 * updates means evaluating sub block SADs, e.g. 4 4x4 subblock SAD in
33 * addition to the main 8x8 block SAD.
34 *
35 * \date
36 * 22/9/2012
37 *
38 * \author Ittiam
39 ***************************************************************************
40 */
41
42 /*****************************************************************************/
43 /* File Includes */
44 /*****************************************************************************/
45 /* System include files */
46 #include <stdio.h>
47 #include <string.h>
48 #include <stdlib.h>
49 #include <assert.h>
50 #include <stdarg.h>
51 #include <math.h>
52 #include <limits.h>
53
54 /* User include files */
55 #include "ihevc_typedefs.h"
56 #include "itt_video_api.h"
57 #include "ihevce_api.h"
58
59 #include "rc_cntrl_param.h"
60 #include "rc_frame_info_collector.h"
61 #include "rc_look_ahead_params.h"
62
63 #include "ihevc_defs.h"
64 #include "ihevc_structs.h"
65 #include "ihevc_platform_macros.h"
66 #include "ihevc_deblk.h"
67 #include "ihevc_itrans_recon.h"
68 #include "ihevc_chroma_itrans_recon.h"
69 #include "ihevc_chroma_intra_pred.h"
70 #include "ihevc_intra_pred.h"
71 #include "ihevc_inter_pred.h"
72 #include "ihevc_mem_fns.h"
73 #include "ihevc_padding.h"
74 #include "ihevc_weighted_pred.h"
75 #include "ihevc_sao.h"
76 #include "ihevc_resi_trans.h"
77 #include "ihevc_quant_iquant_ssd.h"
78 #include "ihevc_cabac_tables.h"
79
80 #include "ihevce_defs.h"
81 #include "ihevce_lap_enc_structs.h"
82 #include "ihevce_multi_thrd_structs.h"
83 #include "ihevce_multi_thrd_funcs.h"
84 #include "ihevce_me_common_defs.h"
85 #include "ihevce_had_satd.h"
86 #include "ihevce_error_codes.h"
87 #include "ihevce_bitstream.h"
88 #include "ihevce_cabac.h"
89 #include "ihevce_rdoq_macros.h"
90 #include "ihevce_function_selector.h"
91 #include "ihevce_enc_structs.h"
92 #include "ihevce_entropy_structs.h"
93 #include "ihevce_cmn_utils_instr_set_router.h"
94 #include "ihevce_enc_loop_structs.h"
95 #include "ihevce_bs_compute_ctb.h"
96 #include "ihevce_global_tables.h"
97 #include "ihevce_dep_mngr_interface.h"
98 #include "hme_datatype.h"
99 #include "hme_interface.h"
100 #include "hme_common_defs.h"
101 #include "hme_defs.h"
102 #include "ihevce_me_instr_set_router.h"
103 #include "hme_globals.h"
104 #include "hme_utils.h"
105 #include "hme_coarse.h"
106 #include "hme_refine.h"
107 #include "hme_err_compute.h"
108 #include "hme_common_utils.h"
109 #include "hme_search_algo.h"
110 #include "ihevce_stasino_helpers.h"
111
112 /******************************************************************************
113 * MACRO DEFINITIONS
114 ******************************************************************************/
115
116 /*****************************************************************************/
117 /* Theoretically, these are the various types of SAD functions needed for   */
118 /* reasons of optimality. SADs that are to be evaluated at a single pt can be*/
119 /* more optimal than SADs that are to be evaluated for a grid of 3x3. The */
120 /* SADs to be evaluated at a grid are classified as separate functions, since*/
121 /* evaluating them on a single function call helps reuse inputs for a small */
122 /* grid of 3x3. Also, if no partial updates are required, there are 3 basic */
123 /* functions, width 4K (K = odd number), width 8K (K = odd number) and width */
124 /* 16K, K any number. For partial updates, it is assumed that the block size */
125 /* is square (8x8, 16x16, 32x32, 64x64) and further differentiation is done */
126 /* based on the basic evaluation unit. E.g. if 16x16 blk size requires, part */
127 /* update on AMP partitions, then basic SAD unit is 4x4, if it doesn't, then */
128 /* basic SAD unit is 8x8. */
129 /*****************************************************************************/
130
131 #define UPD_RES_PT_NPU_BEST1 hme_update_results_grid_pu_bestn
132 #define UPD_RES_PT_NPU_BESTN hme_update_results_grid_pu_bestn
133 #define UPD_RES_PT_PU_BEST1 hme_update_results_grid_pu_bestn
134 #define UPD_RES_PT_PU_BESTN hme_update_results_grid_pu_bestn
135 #define UPD_RES_GRID_NPU_BEST1 hme_update_results_grid_pu_bestn
136 #define UPD_RES_GRID_NPU_BESTN hme_update_results_grid_pu_bestn
137 #define UPD_RES_GRID_PU_BEST1 hme_update_results_grid_pu_bestn
138 #define UPD_RES_GRID_PU_BESTN hme_update_results_grid_pu_bestn
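/* NOTE: at present all eight update flavours resolve to the same generic    */
/* routine, hme_update_results_grid_pu_bestn, which covers point/grid,       */
/* partial/no-partial and best-1/best-N updates through the parameters       */
/* passed in result_upd_prms_t.                                               */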
139
140 /*******************************************************************************
141 * FUNCTION DEFINITIONS
142 *******************************************************************************/
143 S32 hme_cmp_nodes(search_node_t *ps_best_node1, search_node_t *ps_best_node2)
144 {
145 if((ps_best_node1->s_mv.i2_mvx == ps_best_node2->s_mv.i2_mvx) &&
146 (ps_best_node1->s_mv.i2_mvy == ps_best_node2->s_mv.i2_mvy) &&
147 (ps_best_node1->i1_ref_idx == ps_best_node2->i1_ref_idx))
148 {
149 return 0;
150 }
151 return -1;
152 }
153
154 void compute_4x4_sads_for_16x16_blk(
155 grid_ctxt_t *ps_grid, /* Grid ctxt */
156 UWORD8 *pu1_cur_ptr, /* Pointer to top-left of current block */
157 WORD32 cur_buf_stride, /* Buffer stride of current buffer */
158 UWORD16 **
159 u2_part_sads, /* 2D Array containing SADs for all 17 partitions. As many rows as partitions. SADs in a row correspond to each of the candidates */
160 cand_t *ps_cand, /* Return the list of candidates evaluated */
161 WORD32 *num_cands /* Number of candidates that were processed */
162 )
163 {
164 WORD32 a, b, c, d, i;
165 WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
166 WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);
167 //WORD32 offset_x[9] = {-grd_sz_x, 0, grd_sz_x, -grd_sz_x, 0, grd_sz_x, grd_sz_x, 0, -grd_sz_x};
168 //WORD32 offset_y[9] = {-grd_sz_y, -grd_sz_y, -grd_sz_y, 0, 0, 0, grd_sz_y, grd_sz_y, grd_sz_y};
169 /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
170 WORD32 offset_x[9] = { 0, -grd_sz_x, 0, grd_sz_x, 0, -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x };
171 WORD32 offset_y[9] = { 0, 0, -grd_sz_y, 0, grd_sz_y, -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y };
172 WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
173 WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
174 WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
175 cand_t *cand0 = ps_cand;
176 UWORD16 au2_4x4_sad[NUM_4X4];
177
178 *num_cands = 0;
179
180 /* Loop to fill up the cand_t array and to calculate num_cands */
181 for(i = 0; i < ps_grid->num_grids; i++)
182 {
183 WORD32 j;
184 WORD32 mask = ps_grid->pi4_grd_mask[i];
185 UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
186 WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
187 WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);
188
189 for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
190 {
191 if(mask & 1)
192 {
193 *num_cands = *num_cands + 1;
194 cand0->grid_ix = i;
195 cand0->ref_idx = ps_grid->p_ref_idx[i];
196 cand0->pu1_ref_ptr =
197 pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
198 cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
199 cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
200 cand0++;
201 }
202 }
203 }
204
205 /* Loop to compute the SAD's */
206 for(a = 0; a < *num_cands; a++)
207 {
208 cand_t *cand = ps_cand + a;
209 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
210 for(b = 0; b < NUM_4X4; b++)
211 {
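/* b walks the 16 4x4 sub-blocks in raster order: (b % 4) is the 4x4 */
/* column and (b >> 2) the 4x4 row; t1/t2 are the offsets of the     */
/* sub-block's top-left pixel in the current and reference buffers.  */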
212 WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
213 WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
214
215 for(c = 0; c < NUM_ROWS_IN_4X4; c++)
216 {
217 WORD32 z_cur = (cur_buf_stride)*c + t1;
218 WORD32 z_ref = (ref_buf_stride)*c + t2;
219 for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
220 {
221 au2_4x4_sad[b] += (UWORD16)ABS(
222 (((S32)cand->pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
223 }
224 }
225 }
226
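/* Compose the 17 partition SADs from the 16 4x4 SADs: each NxN (8x8) SAD  */
/* is the sum of a 2x2 group of 4x4 SADs, the 2NxN / Nx2N SADs are sums of */
/* two NxN SADs, and each AMP SAD is either a single edge row/column of    */
/* 4x4 SADs or the 2Nx2N SAD minus its complementary AMP SAD.              */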
227 u2_part_sads[PART_ID_NxN_TL][a] =
228 (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
229 u2_part_sads[PART_ID_NxN_TR][a] =
230 (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
231 u2_part_sads[PART_ID_NxN_BL][a] =
232 (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
233 u2_part_sads[PART_ID_NxN_BR][a] =
234 (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
235 u2_part_sads[PART_ID_Nx2N_L][a] =
236 u2_part_sads[PART_ID_NxN_TL][a] + u2_part_sads[PART_ID_NxN_BL][a];
237 u2_part_sads[PART_ID_Nx2N_R][a] =
238 u2_part_sads[PART_ID_NxN_TR][a] + u2_part_sads[PART_ID_NxN_BR][a];
239 u2_part_sads[PART_ID_2NxN_T][a] =
240 u2_part_sads[PART_ID_NxN_TR][a] + u2_part_sads[PART_ID_NxN_TL][a];
241 u2_part_sads[PART_ID_2NxN_B][a] =
242 u2_part_sads[PART_ID_NxN_BR][a] + u2_part_sads[PART_ID_NxN_BL][a];
243 u2_part_sads[PART_ID_nLx2N_L][a] =
244 (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
245 u2_part_sads[PART_ID_nRx2N_R][a] =
246 (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
247 u2_part_sads[PART_ID_2NxnU_T][a] =
248 (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
249 u2_part_sads[PART_ID_2NxnD_B][a] =
250 (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
251 u2_part_sads[PART_ID_2Nx2N][a] =
252 u2_part_sads[PART_ID_2NxN_T][a] + u2_part_sads[PART_ID_2NxN_B][a];
253 u2_part_sads[PART_ID_2NxnU_B][a] =
254 u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_2NxnU_T][a];
255 u2_part_sads[PART_ID_2NxnD_T][a] =
256 u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_2NxnD_B][a];
257 u2_part_sads[PART_ID_nRx2N_L][a] =
258 u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_nRx2N_R][a];
259 u2_part_sads[PART_ID_nLx2N_R][a] =
260 u2_part_sads[PART_ID_2Nx2N][a] - u2_part_sads[PART_ID_nLx2N_L][a];
261 }
262 }
263
264 /**
265 ********************************************************************************
266 * @fn compute_part_sads_for_MxM_blk(grid_ctxt_t *ps_grid,
267 * UWORD8 *pu1_cur_ptr,
268 * WORD32 cur_buf_stride,
269 * WORD32 **pi4_part_sads,
270 * cand_t *ps_cand,
271 * WORD32 *num_cands
272 *
273 * @brief Computes partial SADs and updates partition results for an MxM blk
274 * and does so for several grids of points. This can be used for
275 * 32x32/64x64 blks with 17 partition updates
276 *
277 *
278 * @param[in] ps_grid : Pointer to grid ctxt that has multiple grid of max
279 * 9 pts per grid
280 *
281 * @param[in] pu1_cur_ptr : Top left of input buffer
282 *
283 * @param[in] pi4_part_sads : array of pointers, each entry pointing to
284 * results to be updated for a given partition
285 *
286 * @return The ps_search_results structure has the best result updated for
287 * the 2Nx2N partition alone.
288
289 ********************************************************************************
290 */
291 void compute_part_sads_for_MxM_blk(
292 grid_ctxt_t *ps_grid,
293 UWORD8 *pu1_cur_ptr,
294 WORD32 cur_buf_stride,
295 WORD32 **pp_part_sads,
296 cand_t *ps_cand,
297 WORD32 *num_cands,
298 CU_SIZE_T e_cu_size)
299 {
300 WORD32 a, b, c, d, i;
301 WORD16 grd_sz_y = (ps_grid->grd_sz_y_x & 0xFFFF0000) >> 16;
302 WORD16 grd_sz_x = (ps_grid->grd_sz_y_x & 0xFFFF);
303
304 /* Assumes the following order: C, L, T, R, B, TL, TR, BL, BR */
305 WORD32 offset_x[9] = { 0, -grd_sz_x, 0, grd_sz_x, 0, -grd_sz_x, grd_sz_x, -grd_sz_x, grd_sz_x };
306 WORD32 offset_y[9] = { 0, 0, -grd_sz_y, 0, grd_sz_y, -grd_sz_y, -grd_sz_y, grd_sz_y, grd_sz_y };
307 WORD32 shift = (WORD32)e_cu_size;
308
309 WORD32 ref_buf_stride = ps_grid->ref_buf_stride;
310 WORD32 cur_buf_stride_lsN = (cur_buf_stride << (1 + shift));
311 WORD32 ref_buf_stride_lsN = (ref_buf_stride << (1 + shift));
312 /* Num rows and pixels per row: 8 for CU_32x32 and 16 for CU_64x64 */
313 WORD32 num_rows_in_nxn = 2 << shift;
314 WORD32 num_pixels_in_row = 2 << shift;
315 cand_t *cand0 = ps_cand;
316 /* for a 2Nx2N partition we evaluate nxn SADs, where n = N/2. This is */
317 /* needed for AMP cases. */
318 WORD32 a_nxn_sad[NUM_4X4];
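/* a_nxn_sad[] reuses the 4x4 raster layout of the 16x16 routine above,  */
/* but each entry is an nxn SAD (n = 8 for a 32x32 CU, 16 for a 64x64    */
/* CU), so the partition composition below is identical.                 */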
319 *num_cands = 0;
320
321 /* Loop to fill up the cand_t array and to calculate num_cands */
322 for(i = 0; i < ps_grid->num_grids; i++)
323 {
324 WORD32 j;
325 WORD32 mask = ps_grid->pi4_grd_mask[i];
326 UWORD8 *pu1_ref_ptr_center = ps_grid->ppu1_ref_ptr[i];
327 WORD32 mv_x = ps_grid->p_mv[i].i2_mv_x;
328 WORD32 mv_y = (ps_grid->p_mv[i].i2_mv_y);
329
330 for(j = 0; j < NUM_CANDIDATES_IN_GRID; j++, mask >>= 1)
331 {
332 if(mask & 1)
333 {
334 *num_cands = *num_cands + 1;
335 cand0->grid_ix = i;
336 cand0->ref_idx = ps_grid->p_ref_idx[i];
337 cand0->pu1_ref_ptr =
338 pu1_ref_ptr_center + offset_x[j] + ref_buf_stride * offset_y[j];
339 cand0->mv.i2_mv_x = (S16)(mv_x) + offset_x[j];
340 cand0->mv.i2_mv_y = (S16)(mv_y) + offset_y[j];
341 cand0++;
342 }
343 }
344 }
345
346 /* Loop to compute the SAD's */
347 for(a = 0; a < *num_cands; a++)
348 {
349 cand_t *cand = ps_cand + a;
350 memset(&a_nxn_sad[0], 0, NUM_4X4 * sizeof(WORD32));
351 for(b = 0; b < NUM_4X4; b++)
352 {
353 WORD32 t1 = (b % 4) * num_pixels_in_row + (b >> 2) * cur_buf_stride_lsN;
354 WORD32 t2 = (b % 4) * num_pixels_in_row + (b >> 2) * ref_buf_stride_lsN;
355
356 for(c = 0; c < num_rows_in_nxn; c++)
357 {
358 WORD32 z_cur = (cur_buf_stride)*c + t1;
359 WORD32 z_ref = (ref_buf_stride)*c + t2;
360 for(d = 0; d < num_pixels_in_row; d++)
361 {
362 a_nxn_sad[b] += (WORD32)ABS(
363 (((WORD32)cand->pu1_ref_ptr[(z_ref + d)]) -
364 ((WORD32)pu1_cur_ptr[(z_cur + d)])));
365 }
366 }
367 }
368
369 pp_part_sads[PART_ID_NxN_TL][a] =
370 (a_nxn_sad[0] + a_nxn_sad[1] + a_nxn_sad[4] + a_nxn_sad[5]);
371 pp_part_sads[PART_ID_NxN_TR][a] =
372 (a_nxn_sad[2] + a_nxn_sad[3] + a_nxn_sad[6] + a_nxn_sad[7]);
373 pp_part_sads[PART_ID_NxN_BL][a] =
374 (a_nxn_sad[8] + a_nxn_sad[9] + a_nxn_sad[12] + a_nxn_sad[13]);
375 pp_part_sads[PART_ID_NxN_BR][a] =
376 (a_nxn_sad[10] + a_nxn_sad[11] + a_nxn_sad[14] + a_nxn_sad[15]);
377 pp_part_sads[PART_ID_Nx2N_L][a] =
378 pp_part_sads[PART_ID_NxN_TL][a] + pp_part_sads[PART_ID_NxN_BL][a];
379 pp_part_sads[PART_ID_Nx2N_R][a] =
380 pp_part_sads[PART_ID_NxN_TR][a] + pp_part_sads[PART_ID_NxN_BR][a];
381 pp_part_sads[PART_ID_2NxN_T][a] =
382 pp_part_sads[PART_ID_NxN_TR][a] + pp_part_sads[PART_ID_NxN_TL][a];
383 pp_part_sads[PART_ID_2NxN_B][a] =
384 pp_part_sads[PART_ID_NxN_BR][a] + pp_part_sads[PART_ID_NxN_BL][a];
385 pp_part_sads[PART_ID_nLx2N_L][a] =
386 (a_nxn_sad[8] + a_nxn_sad[0] + a_nxn_sad[12] + a_nxn_sad[4]);
387 pp_part_sads[PART_ID_nRx2N_R][a] =
388 (a_nxn_sad[3] + a_nxn_sad[7] + a_nxn_sad[15] + a_nxn_sad[11]);
389 pp_part_sads[PART_ID_2NxnU_T][a] =
390 (a_nxn_sad[1] + a_nxn_sad[0] + a_nxn_sad[2] + a_nxn_sad[3]);
391 pp_part_sads[PART_ID_2NxnD_B][a] =
392 (a_nxn_sad[15] + a_nxn_sad[14] + a_nxn_sad[12] + a_nxn_sad[13]);
393 pp_part_sads[PART_ID_2Nx2N][a] =
394 pp_part_sads[PART_ID_2NxN_T][a] + pp_part_sads[PART_ID_2NxN_B][a];
395 pp_part_sads[PART_ID_2NxnU_B][a] =
396 pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_2NxnU_T][a];
397 pp_part_sads[PART_ID_2NxnD_T][a] =
398 pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_2NxnD_B][a];
399 pp_part_sads[PART_ID_nRx2N_L][a] =
400 pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_nRx2N_R][a];
401 pp_part_sads[PART_ID_nLx2N_R][a] =
402 pp_part_sads[PART_ID_2Nx2N][a] - pp_part_sads[PART_ID_nLx2N_L][a];
403 }
404 }
405
406 void hme_evalsad_grid_pu_16x16(err_prms_t *ps_prms)
407 {
408 grid_ctxt_t s_grid;
409 cand_t as_candt[9];
410 U16 au2_sad_grid[TOT_NUM_PARTS * 9];
411 U16 *apu2_sad_grid[TOT_NUM_PARTS];
412 hme_mv_t s_mv = { 0, 0 };
413 S32 i4_ref_idx = 0, i;
414 S32 num_candts = 0;
415 s_grid.num_grids = 1;
416 s_grid.ref_buf_stride = ps_prms->i4_ref_stride;
417 s_grid.grd_sz_y_x = ((ps_prms->i4_step << 16) | ps_prms->i4_step);
418 s_grid.ppu1_ref_ptr = &ps_prms->pu1_ref;
419 s_grid.pi4_grd_mask = &ps_prms->i4_grid_mask;
420 s_grid.p_mv = &s_mv;
421 s_grid.p_ref_idx = &i4_ref_idx;
422 for(i = 0; i < 9; i++)
423 {
424 if(s_grid.pi4_grd_mask[0] & (1 << i))
425 num_candts++;
426 }
427
428 for(i = 0; i < TOT_NUM_PARTS; i++)
429 apu2_sad_grid[i] = &au2_sad_grid[i * num_candts];
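/* au2_sad_grid[] is partition-major: num_candts consecutive SADs per      */
/* partition id. apu2_sad_grid[] exposes one row per partition to          */
/* compute_4x4_sads_for_16x16_blk, and the flat copy below hands the same  */
/* layout back through pi4_sad_grid.                                       */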
430
431 compute_4x4_sads_for_16x16_blk(
432 &s_grid, ps_prms->pu1_inp, ps_prms->i4_inp_stride, apu2_sad_grid, as_candt, &num_candts);
433 for(i = 0; i < TOT_NUM_PARTS * num_candts; i++)
434 {
435 ps_prms->pi4_sad_grid[i] = au2_sad_grid[i];
436 }
437 }
438
439 void hme_evalsad_grid_npu_MxN(err_prms_t *ps_prms)
440 {
441 U08 *pu1_inp_base, *pu1_ref_c;
442 S32 *pi4_sad = ps_prms->pi4_sad_grid;
443 S32 i, grid_count = 0;
444 S32 step = ps_prms->i4_step;
445 S32 x_off = step, y_off = step * ps_prms->i4_ref_stride;
446
447 ASSERT((ps_prms->i4_part_mask & (ps_prms->i4_part_mask - 1)) == 0);
448
449 //assert(ps_prms->i4_blk_ht <= 8);
450 //assert(ps_prms->i4_blk_wd <= 8);
451 for(i = 0; i < 9; i++)
452 {
453 if(ps_prms->i4_grid_mask & (1 << i))
454 grid_count++;
455 }
456 pi4_sad += (ps_prms->pi4_valid_part_ids[0] * grid_count);
457
458 pu1_inp_base = ps_prms->pu1_inp;
459 pu1_ref_c = ps_prms->pu1_ref;
460 for(i = 0; i < 9; i++)
461 {
462 S32 sad = 0, j, k;
463 U08 *pu1_inp, *pu1_ref;
464
465 if(!(ps_prms->i4_grid_mask & (1 << i)))
466 continue;
467 pu1_ref = pu1_ref_c + x_off * gai1_grid_id_to_x[i];
468 pu1_ref += y_off * gai1_grid_id_to_y[i];
469 pu1_inp = pu1_inp_base;
470
471 for(j = 0; j < ps_prms->i4_blk_ht; j++)
472 {
473 for(k = 0; k < ps_prms->i4_blk_wd; k++)
474 {
475 sad += (ABS((pu1_inp[k] - pu1_ref[k])));
476 }
477 pu1_inp += ps_prms->i4_inp_stride;
478 pu1_ref += ps_prms->i4_ref_stride;
479 }
480 *pi4_sad++ = sad;
481 }
482 }
483
484 WORD32 hme_evalsad_pt_npu_MxN_8bit_compute(
485 WORD32 ht,
486 WORD32 wd,
487 UWORD8 *pu1_inp,
488 UWORD8 *pu1_ref,
489 WORD32 i4_inp_stride,
490 WORD32 i4_ref_stride)
491 {
492 WORD32 i, j;
493 WORD32 sad = 0;
494 for(i = 0; i < ht; i++)
495 {
496 for(j = 0; j < wd; j++)
497 {
498 sad += (ABS(((S32)pu1_inp[j] - (S32)pu1_ref[j])));
499 }
500 pu1_inp += i4_inp_stride;
501 pu1_ref += i4_ref_stride;
502 }
503 return sad;
504 }
505
506 void hme_evalsad_pt_npu_MxN_8bit(err_prms_t *ps_prms)
507 {
508 S32 wd, ht;
509 U08 *pu1_inp, *pu1_ref;
510
511 wd = ps_prms->i4_blk_wd;
512 ht = ps_prms->i4_blk_ht;
513
514 pu1_inp = ps_prms->pu1_inp;
515 pu1_ref = ps_prms->pu1_ref;
516
517 ps_prms->pi4_sad_grid[0] = hme_evalsad_pt_npu_MxN_8bit_compute(
518 ht, wd, pu1_inp, pu1_ref, ps_prms->i4_inp_stride, ps_prms->i4_ref_stride);
519 }
520
521 void compute_satd_8bit(err_prms_t *ps_prms)
522 {
523 U08 *pu1_origin;
524 S32 src_strd;
525 U08 *pu1_pred_buf;
526 S32 dst_strd;
527 S32 wd, ht;
528 U32 u4_sad = 0;
529 WORD32 x, y;
530 U08 *u1_pi0, *u1_pi1;
531
532 pu1_origin = ps_prms->pu1_inp;
533 pu1_pred_buf = ps_prms->pu1_ref;
534 src_strd = ps_prms->i4_inp_stride;
535 dst_strd = ps_prms->i4_ref_stride;
536 wd = ps_prms->i4_blk_wd;
537 ht = ps_prms->i4_blk_ht;
538
539 u1_pi0 = pu1_origin;
540 u1_pi1 = pu1_pred_buf;
541
542 /* Follows the following logic:
543 For block sizes less than or equal to 16X16, the basic transform size is 4x4
544 For block sizes greater than or equal to 32x32, the basic transform size is 8x8 */
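/* Note: the loops below assume wd and ht are multiples of the chosen      */
/* Hadamard size (8 or 4); partial rows/columns are not handled.           */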
545 if((wd > 0x10) || (ht > 0x10))
546 {
547 for(y = 0; y < ht; y += 8)
548 {
549 for(x = 0; x < wd; x += 8)
550 {
551 u4_sad += ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_8x8_8bit(
552 &u1_pi0[x], src_strd, &u1_pi1[x], dst_strd, NULL, 1);
553 }
554 u1_pi0 += src_strd * 8;
555 u1_pi1 += dst_strd * 8;
556 }
557 }
558 else
559 {
560 for(y = 0; y < ht; y += 4)
561 {
562 for(x = 0; x < wd; x += 4)
563 {
564 u4_sad += ps_prms->ps_cmn_utils_optimised_function_list->pf_HAD_4x4_8bit(
565 &u1_pi0[x], src_strd, &u1_pi1[x], dst_strd, NULL, 1);
566 }
567 u1_pi0 += src_strd * 4;
568 u1_pi1 += dst_strd * 4;
569 }
570 }
571
572 ps_prms->pi4_sad_grid[0] = (S32)u4_sad;
573 }
574
575 void hme_init_pred_part(
576 pred_ctxt_t *ps_pred_ctxt,
577 search_node_t *ps_tl,
578 search_node_t *ps_t,
579 search_node_t *ps_tr,
580 search_node_t *ps_l,
581 search_node_t *ps_bl,
582 search_node_t *ps_coloc,
583 search_node_t *ps_zeromv,
584 search_node_t **pps_proj_coloc,
585 PART_ID_T e_part_id)
586 {
587 pred_candt_nodes_t *ps_candt_nodes;
588
589 ps_candt_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
590
591 ps_candt_nodes->ps_tl = ps_tl;
592 ps_candt_nodes->ps_tr = ps_tr;
593 ps_candt_nodes->ps_t = ps_t;
594 ps_candt_nodes->ps_l = ps_l;
595 ps_candt_nodes->ps_bl = ps_bl;
596 ps_candt_nodes->ps_coloc = ps_coloc;
597 ps_candt_nodes->ps_zeromv = ps_zeromv;
598 ps_candt_nodes->pps_proj_coloc = pps_proj_coloc;
599 }
600
601 void hme_init_pred_ctxt_no_encode(
602 pred_ctxt_t *ps_pred_ctxt,
603 search_results_t *ps_search_results,
604 search_node_t *ps_top_candts,
605 search_node_t *ps_left_candts,
606 search_node_t **pps_proj_coloc_candts,
607 search_node_t *ps_coloc_candts,
608 search_node_t *ps_zeromv_candt,
609 S32 pred_lx,
610 S32 lambda,
611 S32 lambda_q_shift,
612 U08 **ppu1_ref_bits_tlu,
613 S16 *pi2_ref_scf)
614 {
615 search_node_t *ps_invalid, *ps_l, *ps_t, *ps_tl, *ps_tr, *ps_bl;
616 search_node_t *ps_coloc;
617 PART_ID_T e_part_id;
618
619 /* Assume that resolution is fpel to begin with */
620 ps_pred_ctxt->mv_pel = 0; // FPEL
621
622 /* lambda and pred_lx (PRED_L0/PRED_L1) */
623 ps_pred_ctxt->lambda = lambda;
624 ps_pred_ctxt->lambda_q_shift = lambda_q_shift;
625 ps_pred_ctxt->pred_lx = pred_lx;
626 ps_pred_ctxt->ppu1_ref_bits_tlu = ppu1_ref_bits_tlu;
627 ps_pred_ctxt->pi2_ref_scf = pi2_ref_scf;
628 ps_pred_ctxt->proj_used = 0;
629
630 /* Bottom left should not be valid */
631 ASSERT(ps_left_candts[2].u1_is_avail == 0);
632 ps_invalid = &ps_left_candts[2];
633
634 /*************************************************************************/
635 /* for the case of no encode, the idea is to set up candts as follows   */
636 /* */
637 /* ____ ______________ */
638 /* | TL | T | T1 | TR | */
639 /* |____|____|____|____| */
640 /* | L | b0 | b1 | */
641 /* |____|____|____| */
642 /* | L1 | b2 | b3 | */
643 /* |____|____|____| */
644 /* | BL | */
645 /* |____| */
646 /* */
647 /* If use_4x4 is 0, then b0,b1,b2,b3 form a single 8x8 blk. Then T=T1   */
648 /* and L=L1. topleft, top and topright are TL,T,TR respectively */
649 /* Left and bottom left is L and BL respectively. */
650 /* If use_4x4 is 1: then the above holds true only for PARTID = 0 (8x8) */
651 /* For the 4 subblocks (partids 4-7) */
652 /* */
653 /* Block Left Top Top Left Top Right Bottom Left */
654 /* b0 L T TL T1 L1 */
655 /* b1 b0 T1 T TR BL(invalid) */
656 /* b2 L1 b0 L0 b1 BL (invalid) */
657 /* b3 b2 b1 b0 BL(inv) BL (inv) */
658 /* */
659 /* Note : For block b1, bottom left pts to b2, which is not yet ready */
660 /* hence it is kept invalid and made to pt to BL. For block b3 top rt */
661 /* is invalid and hence made to pt to BL which is invalid. */
662 /* BL is invalid since it lies in a bottom left 8x8 blk and not yet ready*/
663 /*************************************************************************/
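/* Implied layout of the incoming candidate arrays (see diagram above):    */
/* ps_top_candts[] = { TL, T, T1, TR }, ps_left_candts[] = { L, L1, BL };  */
/* the BL entry is the permanently unavailable node used wherever a        */
/* neighbour is invalid.                                                    */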
664
665 /* ps_coloc always points to a fixed candt (global) */
666 /* TODO : replace incoming ps_coloc from global to genuine coloc */
667 ps_coloc = ps_coloc_candts;
668
669 /* INITIALIZATION OF 8x8 BLK */
670 ps_tl = ps_top_candts;
671 ps_t = ps_tl + 2;
672 ps_tr = ps_t + 1;
673 ps_l = ps_left_candts + 1;
674 ps_bl = ps_invalid;
675 e_part_id = PART_ID_2Nx2N;
676 hme_init_pred_part(
677 ps_pred_ctxt,
678 ps_tl,
679 ps_t,
680 ps_tr,
681 ps_l,
682 ps_bl,
683 ps_coloc,
684 ps_zeromv_candt,
685 pps_proj_coloc_candts,
686 e_part_id);
687
688 /* INITIALIZATION OF 4x4 TL BLK */
689 e_part_id = PART_ID_NxN_TL;
690 ps_tl = ps_top_candts;
691 ps_t = ps_tl + 1;
692 ps_tr = ps_t + 1;
693 ps_l = ps_left_candts;
694 ps_bl = ps_l + 1;
695 hme_init_pred_part(
696 ps_pred_ctxt,
697 ps_tl,
698 ps_t,
699 ps_tr,
700 ps_l,
701 ps_bl,
702 ps_coloc,
703 ps_zeromv_candt,
704 pps_proj_coloc_candts,
705 e_part_id);
706
707 /* INITIALIZATION OF 4x4 TR BLK */
708 e_part_id = PART_ID_NxN_TR;
709 ps_tl = ps_top_candts + 1;
710 ps_t = ps_tl + 1;
711 ps_tr = ps_t + 1;
712 ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
713 ps_bl = ps_invalid;
714 hme_init_pred_part(
715 ps_pred_ctxt,
716 ps_tl,
717 ps_t,
718 ps_tr,
719 ps_l,
720 ps_bl,
721 ps_coloc,
722 ps_zeromv_candt,
723 pps_proj_coloc_candts,
724 e_part_id);
725
726 /* INITIALIZATION OF 4x4 BL BLK */
727 e_part_id = PART_ID_NxN_BL;
728 ps_tl = ps_left_candts;
729 ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
730 ps_tr = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
731 ps_l = ps_left_candts + 1;
732 ps_bl = ps_invalid; //invalid
733 hme_init_pred_part(
734 ps_pred_ctxt,
735 ps_tl,
736 ps_t,
737 ps_tr,
738 ps_l,
739 ps_bl,
740 ps_coloc,
741 ps_zeromv_candt,
742 pps_proj_coloc_candts,
743 e_part_id);
744
745 /* INITIALIZATION OF 4x4 BR BLK */
746 e_part_id = PART_ID_NxN_BR;
747 ps_tl = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
748 ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
749 ps_tr = ps_invalid; // invalid
750 ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_BL];
751 ps_bl = ps_invalid; // invalid
752 hme_init_pred_part(
753 ps_pred_ctxt,
754 ps_tl,
755 ps_t,
756 ps_tr,
757 ps_l,
758 ps_bl,
759 ps_coloc,
760 ps_zeromv_candt,
761 pps_proj_coloc_candts,
762 e_part_id);
763 }
764
765 void hme_init_pred_ctxt_encode(
766 pred_ctxt_t *ps_pred_ctxt,
767 search_results_t *ps_search_results,
768 search_node_t *ps_coloc_candts,
769 search_node_t *ps_zeromv_candt,
770 mv_grid_t *ps_mv_grid,
771 S32 pred_lx,
772 S32 lambda,
773 S32 lambda_q_shift,
774 U08 **ppu1_ref_bits_tlu,
775 S16 *pi2_ref_scf)
776 {
777 search_node_t *ps_invalid, *ps_l, *ps_t, *ps_tl, *ps_tr, *ps_bl;
778 search_node_t *ps_coloc;
779 search_node_t *ps_grid_cu_base;
780 CU_SIZE_T e_cu_size = ps_search_results->e_cu_size;
781
782 /* Part Start, Part sizes in 4x4 units */
783 S32 part_wd, part_ht, part_start_x, part_start_y;
784
785 /* Partition type, number of partitions in type */
786 S32 part_id;
787
788 /* Coordinates of the CU in 4x4 units */
789 S32 cu_start_x, cu_start_y;
790 S32 shift = e_cu_size;
791
792 /* top right and bot left validity at CU level */
793 S32 cu_tr_valid, cu_bl_valid;
794 /* stride of the grid */
795 S32 grid_stride = ps_mv_grid->i4_stride;
796
797 ps_pred_ctxt->lambda = lambda;
798 ps_pred_ctxt->lambda_q_shift = lambda_q_shift;
799 ps_pred_ctxt->pred_lx = pred_lx;
800 ps_pred_ctxt->mv_pel = 0;
801 ps_pred_ctxt->ppu1_ref_bits_tlu = ppu1_ref_bits_tlu;
802 ps_pred_ctxt->pi2_ref_scf = pi2_ref_scf;
803 ps_pred_ctxt->proj_used = 1;
804
805 cu_start_x = ps_search_results->u1_x_off >> 2;
806 cu_start_y = ps_search_results->u1_y_off >> 2;
807
808 /* Coloc always points to fixed global candt */
809 ps_coloc = ps_coloc_candts;
810
811 /* Go to base of the CU in the MV Grid */
812 ps_grid_cu_base = &ps_mv_grid->as_node[0];
813 ps_grid_cu_base += (ps_mv_grid->i4_start_offset + cu_start_x);
814 ps_grid_cu_base += (grid_stride * cu_start_y);
815
816 /* points to the real bottom left of the grid, will never be valid */
817 ps_invalid = &ps_mv_grid->as_node[0];
818 ps_invalid += (grid_stride * 17);
819
820 {
821 S32 shift = 1 + e_cu_size;
822 cu_tr_valid = gau1_cu_tr_valid[cu_start_y >> shift][cu_start_x >> shift];
823 cu_bl_valid = gau1_cu_bl_valid[cu_start_y >> shift][cu_start_x >> shift];
824 }
825
826 /*************************************************************************/
827 /* for the case of encode, the idea is to set up candts as follows      */
828 /* */
829 /* ____ ______________ ____ ____ */
830 /* | T0 | T1 | T2 | T3 | T4 | T5 | */
831 /* |____|____|____|____|____|____| */
832 /* | L1 | | | */
833 /* |____| | | */
834 /* | L2 | p0 | p1 | */
835 /* |____| | | */
836 /* | L3 | | | */
837 /* |____| | | */
838 /* | L4 | L' | | */
839 /* |____|____|______________| */
840 /* | BL | */
841 /* |____| */
842 /* The example is shown with 16x16 CU, though it can be generalized */
843 /* This CU has 2 partitions, cu_wd = 4. also p_wd, p_ht are partition */
844 /* width and ht in 4x4 units. */
845 /* For a given CU, derive the top left, top and bottom left and top rt */
846 /* pts. Left and top are assumed to be valid. */
847 /* If there are two partitions in the CU (like p0 and p1) and vertical, */
848 /* then for first partition, left, top, top left and top right valid */
849 /* Bottom left is valid. store these validity flags. Also store the */
850 /* grid offsets of the partitions w.r.t. CU start in units of 4x4. For p0*/
851 /* Left grid offset = -1, 3. Top Grd offset = -1, 0. */
852 /* Top left grid offset = -1, -1. Top right = 1, -1. BL = -1, 4. */
853 /* For p1, validity flags are left, top, top left, top right, valid. */
854 /* BL is invalid. Grid offsets are: Left = don't care. T = 1, -1 (T2)   */
855 /* TR = 4, -1 (T5). TL = 0, -1 (T1). BL = don't care. */
856 /* For p1, set the left pred candt to the best search result of p0. */
857 /*************************************************************************/
858
859 /* Loop over all partitions, and identify the 5 neighbours */
860 for(part_id = 0; part_id < TOT_NUM_PARTS; part_id++)
861 {
862 part_attr_t *ps_part_attr = &gas_part_attr_in_cu[part_id];
863 S32 tr_valid, bl_valid, is_vert;
864 search_node_t *ps_grid_pu_base;
865 PART_TYPE_T e_part_type;
866 PART_ID_T first_part;
867 S32 part_num;
868
869 e_part_type = ge_part_id_to_part_type[part_id];
870 first_part = ge_part_type_to_part_id[e_part_type][0];
871 is_vert = gau1_is_vert_part[e_part_type];
872 part_num = gau1_part_id_to_part_num[part_id];
873 tr_valid = gau1_partid_tr_valid[part_id] & cu_tr_valid;
874 bl_valid = gau1_partid_bl_valid[part_id] & cu_bl_valid;
875
876 part_start_x = (ps_part_attr->u1_x_start << shift) >> 2;
877 part_start_y = (ps_part_attr->u1_y_start << shift) >> 2;
878 part_wd = (ps_part_attr->u1_x_count << shift) >> 2;
879 part_ht = (ps_part_attr->u1_y_count << shift) >> 2;
880
881 /* go to top left of part */
882 ps_grid_pu_base = ps_grid_cu_base + part_start_x;
883 ps_grid_pu_base += (part_start_y * grid_stride);
884
885 ps_tl = ps_grid_pu_base - 1 - grid_stride;
886 ps_t = ps_grid_pu_base - grid_stride + part_wd - 1;
887 ps_l = ps_grid_pu_base - 1 + ((part_ht - 1) * grid_stride);
888 ps_tr = ps_t + 1;
889 ps_bl = ps_l + grid_stride;
890
891 if(!tr_valid)
892 ps_tr = ps_invalid;
893 if(!bl_valid)
894 ps_bl = ps_invalid;
895
896 if(part_num == 1)
897 {
898 /* for cases of two partitions 2nd part has 1st part as candt */
899 /* if vertical type, left candt of 2nd part is 1st part. */
900 /* if horz type, top candt of 2nd part is 1st part. */
901 if(is_vert)
902 {
903 ps_l = ps_search_results->aps_part_results[pred_lx][first_part];
904 }
905 else
906 {
907 ps_t = ps_search_results->aps_part_results[pred_lx][first_part];
908 }
909 }
910 if(part_num == 2)
911 {
912 /* only possible for NxN_BL */
913 ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
914 ps_tr = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
915 }
916 if(part_num == 3)
917 {
918 /* only possible for NxN_BR */
919 ps_t = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TR];
920 ps_tl = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_TL];
921 ps_l = ps_search_results->aps_part_results[pred_lx][PART_ID_NxN_BL];
922 }
923 hme_init_pred_part(
924 ps_pred_ctxt,
925 ps_tl,
926 ps_t,
927 ps_tr,
928 ps_l,
929 ps_bl,
930 ps_coloc,
931 ps_zeromv_candt,
932 NULL,
933 (PART_ID_T)part_id);
934 }
935 }
936
937 /**
938 ********************************************************************************
939 * @fn compute_mv_cost_explicit(search_node_t *ps_node,
940 * pred_ctxt_t *ps_pred_ctxt,
941 * PART_ID_T e_part_id)
942 *
943 * @brief MV cost for explicit search in layers not encoded
944 *
945 * @param[in] ps_node: search node having mv and ref id for which to eval cost
946 *
947 * @param[in] ps_pred_ctxt : mv pred context
948 *
949 * @param[in] e_part_id : Partition id.
950 *
951 * @return Cost value
952
953 ********************************************************************************
954 */
955 S32 compute_mv_cost_explicit(
956 search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
957 {
958 #define RETURN_FIXED_COST 0
959 search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
960 pred_candt_nodes_t *ps_pred_nodes;
961 S32 inp_shift = 2 - inp_mv_pel;
962 S32 pred_shift = 2 - ps_pred_ctxt->mv_pel;
963 S32 mv_p_x, mv_p_y;
964 S16 mvdx1, mvdx2, mvdy1, mvdy2;
965 S32 cost, ref_bits;
966
967 /*************************************************************************/
968 /* Logic for cost computation for explicit search. For such a search, */
969 /* it is guaranteed that all predictor candts have same ref id. The only */
970 /* probable issue is with the availability which needs checking. This fxn*/
971 /* does not suffer the need to scale predictor candts due to diff ref id */
972 /*************************************************************************/
973
974 /* Hack: currently we always assume 2Nx2N. */
975 /* TODO: get rid of this hack and return cost tuned to each partition */
976 ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
977 ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
978
979 /*************************************************************************/
980 /* Priority to bottom left availability. Else we go to left. If both are */
981 /* not available, then a remains null */
982 /*************************************************************************/
983 if(ps_pred_nodes->ps_tl->u1_is_avail)
984 ps_pred_node_a = ps_pred_nodes->ps_tl;
985 else if(ps_pred_nodes->ps_l->u1_is_avail)
986 ps_pred_node_a = ps_pred_nodes->ps_l;
987
988 /*************************************************************************/
989 /* For encoder, top left may not be really needed unless we use slices, */
990 /* and even then in ME it may not be relevant. So we only consider T or */
991 /* TR, as, if both T and TR are not available, TL also will not be */
992 /*************************************************************************/
993 if(ps_pred_nodes->ps_tr->u1_is_avail)
994 ps_pred_node_b = ps_pred_nodes->ps_tr;
995 else if(ps_pred_nodes->ps_t->u1_is_avail)
996 ps_pred_node_b = ps_pred_nodes->ps_t;
997
998 if(ps_pred_node_a == NULL)
999 {
1000 ps_pred_node_a = ps_pred_nodes->ps_coloc;
1001 if(ps_pred_node_b == NULL)
1002 ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1003 }
1004 else if(ps_pred_node_b == NULL)
1005 ps_pred_node_b = ps_pred_nodes->ps_coloc;
1006 else if(0 == hme_cmp_nodes(ps_pred_node_a, ps_pred_node_b))
1007 {
1008 ps_pred_node_b = ps_pred_nodes->ps_coloc;
1009 }
1010
1011 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1012 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1013 COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1014 mvdx1 = ABS(mvdx1);
1015 mvdy1 = ABS(mvdy1);
1016
1017 mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1018 mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1019 COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1020 mvdx2 = ABS(mvdx2);
1021 mvdy2 = ABS(mvdy2);
1022
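/* Pick the predictor with the smaller |mvd| and price it as an            */
/* approximate bit count: range bits per MVD component, one sign bit per   */
/* non-zero component, the ref idx bits and 2 bits of overhead; the bits   */
/* are converted to cost by the lambda multiply-and-round below.           */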
1023 if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1024 {
1025 cost =
1026 hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1027 }
1028 else
1029 {
1030 cost =
1031 hme_get_range(mvdx2) + hme_get_range(mvdy2) + (mvdx2 > 0) + (mvdy2 > 0) + ref_bits + 2;
1032 }
1033 {
1034 S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1035 return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1036 }
1037 }
1038 /**
1039 ********************************************************************************
1040 * @fn compute_mv_cost_coarse(search_node_t *ps_node,
1041 * pred_ctxt_t *ps_pred_ctxt,
1042 * PART_ID_T e_part_id)
1043 *
1044 * @brief MV cost for coarse explicit search in coarsest layer
1045 *
1046 * @param[in] ps_node: search node having mv and ref id for which to eval cost
1047 *
1048 * @param[in] ps_pred_ctxt : mv pred context
1049 *
1050 * @param[in] e_part_id : Partition id.
1051 *
1052 * @return Cost value
1053
1054 ********************************************************************************
1055 */
1056 S32 compute_mv_cost_coarse(
1057 search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1058 {
1059 ARG_NOT_USED(e_part_id);
1060
1061 return (compute_mv_cost_explicit(ps_node, ps_pred_ctxt, PART_ID_2Nx2N, inp_mv_pel));
1062 }
1063
1064 /**
1065 ********************************************************************************
1066 * @fn compute_mv_cost_coarse_high_speed(search_node_t *ps_node,
1067 * pred_ctxt_t *ps_pred_ctxt,
1068 * PART_ID_T e_part_id)
1069 *
1070 * @brief MV cost for coarse explicit search in coarsest layer
1071 *
1072 * @param[in] ps_node: search node having mv and ref id for which to eval cost
1073 *
1074 * @param[in] ps_pred_ctxt : mv pred context
1075 *
1076 * @param[in] e_part_id : Partition id.
1077 *
1078 * @return Cost value
1079
1080 ********************************************************************************
1081 */
1082 S32 compute_mv_cost_coarse_high_speed(
1083 search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1084 {
1085 S32 rnd, mvx, mvy, i4_search_idx;
1086 S32 cost;
1087
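/* High speed variant: no predictor derivation; the MV is priced as if     */
/* predicted from (0,0). (2 * range - 1) roughly models an exp-Golomb      */
/* codeword length per component, plus one bit per non-zero component and  */
/* the ref idx, all scaled by lambda below.                                 */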
1088 mvx = ps_node->s_mv.i2_mvx;
1089 mvy = ps_node->s_mv.i2_mvy;
1090 i4_search_idx = ps_node->i1_ref_idx;
1091
1092 cost = (2 * hme_get_range(ABS(mvx)) - 1) + (2 * hme_get_range(ABS(mvy)) - 1) + i4_search_idx;
1093 cost += (mvx != 0) ? 1 : 0;
1094 cost += (mvy != 0) ? 1 : 0;
1095 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1096 cost = (cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift;
1097 return cost;
1098 }
1099
1100 /**
1101 ********************************************************************************
1102 * @fn compute_mv_cost_explicit_refine(search_node_t *ps_node,
1103 * pred_ctxt_t *ps_pred_ctxt,
1104 * PART_ID_T e_part_id)
1105 *
1106 * @brief MV cost for explicit search in layers not encoded. Always returns
1107 * cost of the projected colocated candidate
1108 *
1109 * @param[in] ps_node: search node having mv and ref id for which to eval cost
1110 *
1111 * @param[in] ps_pred_ctxt : mv pred context
1112 *
1113 * @param[in] e_part_id : Partition id.
1114 *
1115 * @return Cost value
1116
1117 ********************************************************************************
1118 */
1119 S32 compute_mv_cost_explicit_refine(
1120 search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1121 {
1122 search_node_t *ps_pred_node_a = NULL;
1123 pred_candt_nodes_t *ps_pred_nodes;
1124 S32 inp_shift = 2 - inp_mv_pel;
1125 S32 pred_shift = 2 - ps_pred_ctxt->mv_pel;
1126 S32 mv_p_x, mv_p_y;
1127 S16 mvdx1, mvdy1;
1128 S32 cost, ref_bits;
1129
1130 ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1131 ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
1132
1133 ps_pred_node_a = ps_pred_nodes->pps_proj_coloc[0];
1134
1135 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1136 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1137 COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1138 mvdx1 = ABS(mvdx1);
1139 mvdy1 = ABS(mvdy1);
1140
1141 cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1142
1143 {
1144 S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1145 return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1146 }
1147 }
1148
1149 /**
1150 ********************************************************************************
1151 * @fn compute_mv_cost_refine(search_node_t *ps_node,
1152 * pred_ctxt_t *ps_pred_ctxt,
1153 * PART_ID_T e_part_id)
1154 *
1155 * @brief MV cost for coarse explicit search in coarsest layer
1156 *
1157 * @param[in] ps_node: search node having mv and ref id for which to eval cost
1158 *
1159 * @param[in] ps_pred_ctxt : mv pred context
1160 *
1161 * @param[in] e_part_id : Partition id.
1162 *
1163 * @return Cost value
1164
1165 ********************************************************************************
1166 */
1167 S32 compute_mv_cost_refine(
1168 search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1169 {
1170 return (compute_mv_cost_explicit_refine(ps_node, ps_pred_ctxt, e_part_id, inp_mv_pel));
1171 }
1172
1173 S32 compute_mv_cost_implicit(
1174 search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1175 {
1176 search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
1177 pred_candt_nodes_t *ps_pred_nodes;
1178 S08 i1_ref_idx;
1179 S08 i1_ref_tl = -1, i1_ref_tr = -1, i1_ref_t = -1;
1180 S08 i1_ref_bl = -1, i1_ref_l = -1;
1181 S32 inp_shift = 2 - inp_mv_pel;
1182 S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel;*/
1183 S32 ref_bits, cost;
1184 S32 mv_p_x, mv_p_y;
1185 S16 mvdx1, mvdx2, mvdy1, mvdy2;
1186
1187 //return 0;
1188 i1_ref_idx = ps_node->i1_ref_idx;
1189
1190 /*************************************************************************/
1191 /* Logic for cost computation for implicit search. Here the predictor   */
1192 /* candts may have different ref ids, so besides checking availability, */
1193 /* candts whose ref id differs from the current one are scaled for the  */
1194 /* POC delta before the MV difference is computed.                      */
1195 /*************************************************************************/
1196
1197 ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1198 ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][i1_ref_idx];
1199
1200 /*************************************************************************/
1201 /* Priority to bottom left availability. Else we go to left. If both are */
1202 /* not available, then a remains null */
1203 /*************************************************************************/
1204 if(ps_pred_nodes->ps_bl->u1_is_avail)
1205 i1_ref_bl = ps_pred_nodes->ps_bl->i1_ref_idx;
1206 if(ps_pred_nodes->ps_l->u1_is_avail)
1207 i1_ref_l = ps_pred_nodes->ps_l->i1_ref_idx;
1208 if(i1_ref_bl == i1_ref_idx)
1209 ps_pred_node_a = ps_pred_nodes->ps_bl;
1210 else if(i1_ref_l == i1_ref_idx)
1211 ps_pred_node_a = ps_pred_nodes->ps_l;
1212 if(ps_pred_node_a == NULL)
1213 {
1214 if(i1_ref_bl != -1)
1215 ps_pred_node_a = ps_pred_nodes->ps_bl;
1216 else if(i1_ref_l != -1)
1217 ps_pred_node_a = ps_pred_nodes->ps_l;
1218 }
1219
1220 /*************************************************************************/
1221 /* For encoder, top left may not be really needed unless we use slices, */
1222 /* and even then in ME it may not be relevant. So we only consider T or */
1223 /* TR, as, if both T and TR are not available, TL also will not be */
1224 /*************************************************************************/
1225 if(ps_pred_nodes->ps_tr->u1_is_avail)
1226 i1_ref_tr = ps_pred_nodes->ps_tr->i1_ref_idx;
1227 if(ps_pred_nodes->ps_t->u1_is_avail)
1228 i1_ref_t = ps_pred_nodes->ps_t->i1_ref_idx;
1229 if(ps_pred_nodes->ps_tl->u1_is_avail)
1230 i1_ref_tl = ps_pred_nodes->ps_tl->i1_ref_idx;
1231 if(i1_ref_tr == i1_ref_idx)
1232 ps_pred_node_b = ps_pred_nodes->ps_tr;
1233 else if(i1_ref_t == i1_ref_idx)
1234 ps_pred_node_b = ps_pred_nodes->ps_t;
1235 else if(i1_ref_tl == i1_ref_idx)
1236 ps_pred_node_b = ps_pred_nodes->ps_tl;
1237
1238 if(ps_pred_node_b == NULL)
1239 {
1240 if(i1_ref_tr != -1)
1241 ps_pred_node_b = ps_pred_nodes->ps_tr;
1242 else if(i1_ref_t != -1)
1243 ps_pred_node_b = ps_pred_nodes->ps_t;
1244 else if(i1_ref_tl != -1)
1245 ps_pred_node_b = ps_pred_nodes->ps_tl;
1246 }
1247 if(ps_pred_node_a == NULL)
1248 {
1249 ps_pred_node_a = ps_pred_nodes->ps_coloc;
1250 if(ps_pred_node_b == NULL)
1251 ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1252 }
1253 else if(ps_pred_node_b == NULL)
1254 ps_pred_node_b = ps_pred_nodes->ps_coloc;
1255 else if(0 == hme_cmp_nodes(ps_pred_node_a, ps_pred_node_b))
1256 {
1257 ps_pred_node_b = ps_pred_nodes->ps_coloc;
1258 }
1259
1260 if(ps_pred_node_a->i1_ref_idx != i1_ref_idx)
1261 {
1262 SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_a, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1263 }
1264 else
1265 {
1266 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1267 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1268 }
1269 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
1270 COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1271 mvdx1 = ABS(mvdx1);
1272 mvdy1 = ABS(mvdy1);
1273
1274 if(ps_pred_node_b->i1_ref_idx != i1_ref_idx)
1275 {
1276 SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_b, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1277 }
1278 else
1279 {
1280 mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1281 mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1282 }
1283 pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
1284 COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1285 mvdx2 = ABS(mvdx2);
1286 mvdy2 = ABS(mvdy2);
1287
1288 if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1289 {
1290 cost = 2 * hme_get_range(mvdx1) + 2 * hme_get_range(mvdy1) + 2 * (mvdx1 > 0) +
1291 2 * (mvdy1 > 0) + ref_bits + 2;
1292 }
1293 else
1294 {
1295 cost = 2 * hme_get_range(mvdx2) + 2 * hme_get_range(mvdy2) + 2 * (mvdx2 > 0) +
1296 2 * (mvdy2 > 0) + ref_bits + 2;
1297 }
1298 {
1299 /* Part bits in Q1, so evaluate cost as ((mv_cost<<1) + partbitsQ1 + rnd)>>(q+1)*/
1300 S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift);
1301 S32 tot_cost = (cost * ps_pred_ctxt->lambda) << 1;
1302
1303 tot_cost += (gau1_bits_for_part_id_q1[e_part_id] * ps_pred_ctxt->lambda);
1304 return ((tot_cost + rnd) >> (ps_pred_ctxt->lambda_q_shift + 1));
1305 }
1306 }
1307
1308 S32 compute_mv_cost_implicit_high_speed(
1309 search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1310 {
1311 search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
1312 pred_candt_nodes_t *ps_pred_nodes;
1313 S08 i1_ref_idx;
1314 S08 i1_ref_tr = -1;
1315 S08 i1_ref_l = -1;
1316 S32 inp_shift = 2 - inp_mv_pel;
1317 S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel; */
1318 S32 ref_bits, cost;
1319 S32 mv_p_x, mv_p_y;
1320 S16 mvdx1, mvdx2, mvdy1, mvdy2;
1321
1322 i1_ref_idx = ps_node->i1_ref_idx;
1323
1324 ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1325 ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][i1_ref_idx];
1326
1327 /*************************************************************************/
1328 /* Priority to bottom left availability. Else we go to left. If both are */
1329 /* not available, then a remains null */
1330 /*************************************************************************/
1331 if(ps_pred_nodes->ps_l->u1_is_avail)
1332 {
1333 i1_ref_l = ps_pred_nodes->ps_l->i1_ref_idx;
1334 ps_pred_node_a = ps_pred_nodes->ps_l;
1335 }
1336
1337 /*************************************************************************/
1338 /* For encoder, top left may not be really needed unless we use slices, */
1339 /* and even then in ME it may not be relevant. So we only consider T or */
1340 /* TR, as, if both T and TR are not available, TL also will not be */
1341 /*************************************************************************/
1342
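/* When projected colocated candidates are in use (proj_used), the spatial */
/* TR candidate is skipped and the colocated candidate is taken directly   */
/* as predictor B.                                                           */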
1343 if((!(ps_pred_ctxt->proj_used) && (ps_pred_nodes->ps_tr->u1_is_avail)))
1344 {
1345 i1_ref_tr = ps_pred_nodes->ps_tr->i1_ref_idx;
1346 ps_pred_node_b = ps_pred_nodes->ps_tr;
1347 }
1348 else
1349 {
1350 ps_pred_node_b = ps_pred_nodes->ps_coloc;
1351 }
1352
1353 if(ps_pred_node_a == NULL)
1354 {
1355 ps_pred_node_a = ps_pred_nodes->ps_coloc;
1356
1357 if(ps_pred_node_b == ps_pred_nodes->ps_coloc)
1358 ps_pred_node_b = ps_pred_nodes->ps_zeromv;
1359 }
1360
1361 if(ps_pred_node_a->i1_ref_idx != i1_ref_idx)
1362 {
1363 SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_a, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1364 }
1365 else
1366 {
1367 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1368 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1369 }
1370
1371 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
1372 COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1373 mvdx1 = ABS(mvdx1);
1374 mvdy1 = ABS(mvdy1);
1375
1376 if(ps_pred_node_b->i1_ref_idx != i1_ref_idx)
1377 {
1378 SCALE_FOR_POC_DELTA(mv_p_x, mv_p_y, ps_pred_node_b, i1_ref_idx, ps_pred_ctxt->pi2_ref_scf);
1379 }
1380 else
1381 {
1382 mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
1383 mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
1384 }
1385
1386 pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
1387 COMPUTE_DIFF_MV(mvdx2, mvdy2, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1388 mvdx2 = ABS(mvdx2);
1389 mvdy2 = ABS(mvdy2);
1390
1391 if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
1392 {
1393 cost =
1394 hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1395 }
1396 else
1397 {
1398 cost =
1399 hme_get_range(mvdx2) + hme_get_range(mvdy2) + (mvdx2 > 0) + (mvdy2 > 0) + ref_bits + 2;
1400 }
1401 {
1402 /* Part bits in Q1, so evaluate cost as ((mv_cost<<1) + partbitsQ1 + rnd)>>(q+1)*/
1403 S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1404 S32 tot_cost = (cost * ps_pred_ctxt->lambda);
1405
1406 return ((tot_cost + rnd) >> (ps_pred_ctxt->lambda_q_shift));
1407 }
1408 }
1409
1410 S32 compute_mv_cost_implicit_high_speed_modified(
1411 search_node_t *ps_node, pred_ctxt_t *ps_pred_ctxt, PART_ID_T e_part_id, S32 inp_mv_pel)
1412 {
1413 search_node_t *ps_pred_node_a = NULL;
1414 pred_candt_nodes_t *ps_pred_nodes;
1415 S32 inp_shift = 2 - inp_mv_pel;
1416 S32 pred_shift; /* = 2 - ps_pred_ctxt->mv_pel; */
1417 S32 mv_p_x, mv_p_y;
1418 S16 mvdx1, mvdy1;
1419 S32 cost, ref_bits;
1420
1421 ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[e_part_id];
1422 ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_node->i1_ref_idx];
1423
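/* This variant skips predictor derivation altogether and prices the MV    */
/* against the single pre-selected MVP stored in ps_mvp_node.              */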
1424 ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
1425
1426 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
1427 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
1428 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
1429 COMPUTE_DIFF_MV(mvdx1, mvdy1, ps_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
1430 mvdx1 = ABS(mvdx1);
1431 mvdy1 = ABS(mvdy1);
1432
1433 cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) + (mvdy1 > 0) + ref_bits + 2;
1434
1435 {
1436 S32 rnd = 1 << (ps_pred_ctxt->lambda_q_shift - 1);
1437 return ((cost * ps_pred_ctxt->lambda + rnd) >> ps_pred_ctxt->lambda_q_shift);
1438 }
1439 }
1440
1441 void hme_update_results_grid_pu_bestn_xtreme_speed(result_upd_prms_t *ps_result_prms)
1442 {
1443 /* This function has been modified with the assumption that only 2NxN_B and Nx2N_R are updated */
1444
1445 search_node_t s_search_node_grid;
1446 const search_node_t *ps_search_node_base;
1447 search_node_t *ps_search_node_grid, *ps_best_node;
1448 S32 i4_min_cost = (MAX_32BIT_VAL), i4_search_idx;
1449 S32 num_results, i4_unique_id = -1, i4_grid_pt;
1450 search_results_t *ps_search_results;
1451 S32 *pi4_valid_part_ids;
1452 S32 i4_step = ps_result_prms->i4_step;
1453 S32 i4_grid_mask, i, i4_min_id;
1454 S32 i4_tot_cost, i4_mv_cost, i4_sad, id;
1455 S32 *pi4_sad_grid = ps_result_prms->pi4_sad_grid;
1456 S32 grid_count = 0;
1457 S32 pred_lx;
1458
1459 i4_min_id = (S32)PT_C;
1460 i4_min_cost = MAX_32BIT_VAL;
1461 ps_search_node_grid = &s_search_node_grid;
1462 ps_search_node_base = ps_result_prms->ps_search_node_base;
1463 *ps_search_node_grid = *ps_search_node_base;
1464 pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1465 ps_search_results = ps_result_prms->ps_search_results;
1466 num_results = (S32)ps_search_results->u1_num_results_per_part;
1467 i4_grid_mask = ps_result_prms->i4_grid_mask;
1468
1469 for(i = 0; i < 9; i++)
1470 {
1471 if(i4_grid_mask & (1 << i))
1472 grid_count++;
1473 }
1474
1475 /* Some basic assumptions: only single pt, only part updates */
1476 /* and more than 1 best result to be computed. */
1477 //ASSERT(ps_result_prms->i4_grid_mask != 1);
1478 //ASSERT(ps_result_prms->i4_part_mask != ENABLE_2Nx2N);
1479 //ASSERT(ps_search_results->num_results > 1);
1480
1481 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
1482 pred_lx = 1 - ps_search_results->pu1_is_past[i4_search_idx];
1483
1484 /*************************************************************************/
1485 /* Supposing we do the result update for a unique partid, we can        */
1486 /* store the best pt id in the grid, and the min cost is returned as a  */
1487 /* param. This will be useful for early exit cases.                     */
1488 /* TODO : once we have separate fxn for unique part+grid, we can */
1489 /* do away with this code here */
1490 /*************************************************************************/
1491 //if (pi4_valid_part_ids[1] == -1)
1492 i4_unique_id = pi4_valid_part_ids[0];
1493
1494 /* pi4_valid_part_ids contains all the valid ids. We loop through */
1495 /* this till we encounter -1. This is easier than having to */
1496 /* figure out part by part, besides, active part decision is */
1497 /* usually fixed for a given duration of search, e.g. entire fpel */
1498 /* refinement for a blk/cu will use fixed valid part mask */
1499 id = pi4_valid_part_ids[0];
1500
1501 /*****************************************************************/
1502 /* points to the best search results corresponding to this */
1503 /* specific part type. */
1504 /*****************************************************************/
1505 ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1506
1507 /*************************************************************************/
1508 /* Outer loop runs through all active pts in the grid */
1509 /*************************************************************************/
1510 for(i4_grid_pt = 0; i4_grid_pt < (S32)NUM_GRID_PTS; i4_grid_pt++)
1511 {
1512 if(!(i4_grid_mask & (1 << i4_grid_pt)))
1513 continue;
1514
1515 /* For the pt in the grid, update mvx and y depending on */
1516 /* location of pt. Updates are in FPEL units. */
1517 ps_search_node_grid->s_mv.i2_mvx = ps_search_node_base->s_mv.i2_mvx;
1518 ps_search_node_grid->s_mv.i2_mvy = ps_search_node_base->s_mv.i2_mvy;
1519 ps_search_node_grid->s_mv.i2_mvx += (S16)(i4_step * gai1_grid_id_to_x[i4_grid_pt]);
1520 ps_search_node_grid->s_mv.i2_mvy += (S16)(i4_step * gai1_grid_id_to_y[i4_grid_pt]);
1521
1522 {
1523 /* evaluate mv cost and totalcost for this part for this given mv*/
1524 i4_mv_cost = compute_mv_cost_coarse_high_speed(
1525 ps_search_node_grid,
1526 &ps_search_results->as_pred_ctxt[pred_lx],
1527 (PART_ID_T)id,
1528 MV_RES_FPEL);
1529
1530 i4_sad = pi4_sad_grid[grid_count * id];
1531 i4_tot_cost = i4_sad + i4_mv_cost;
1532
1533 ASSERT(i4_unique_id == id);
1534 ASSERT(num_results == 1);
1535
1536 /*****************************************************************/
            /* We do not labor through the results if the total cost is     */
            /* worse than the last of the results.                          */
1539 /*****************************************************************/
1540 if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1541 {
1542 i4_min_id = i4_grid_pt;
1543 ps_result_prms->i4_min_cost = i4_tot_cost;
1544
1545 ps_best_node[0] = *ps_search_node_grid;
1546 ps_best_node[0].i4_sad = i4_sad;
1547 ps_best_node[0].i4_mv_cost = i4_mv_cost;
1548 ps_best_node[0].i4_tot_cost = i4_tot_cost;
1549 }
1550 }
1551 pi4_sad_grid++;
1552 }
1553 ps_result_prms->i4_min_id = i4_min_id;
1554 }
1555
void hme_update_results_grid_pu_bestn(result_upd_prms_t *ps_result_prms)
1557 {
1558 search_node_t s_search_node_grid;
1559 const search_node_t *ps_search_node_base;
1560 search_node_t *ps_search_node_grid, *ps_best_node;
1561 S32 i4_min_cost = (MAX_32BIT_VAL), i4_search_idx;
1562 S32 num_results, i4_unique_id = -1, i4_grid_pt;
1563 search_results_t *ps_search_results;
1564 S32 *pi4_valid_part_ids;
1565 S32 i4_step = ps_result_prms->i4_step;
1566 S32 i4_grid_mask, i4_count, i, i4_min_id;
1567 S32 i4_tot_cost, i4_mv_cost, i4_sad, id;
1568 S32 *pi4_sad_grid = ps_result_prms->pi4_sad_grid;
1569 S32 grid_count = 0;
1570 S32 pred_lx;
1571
1572 i4_min_id = (S32)PT_C;
1573 i4_min_cost = MAX_32BIT_VAL;
1574 ps_search_node_grid = &s_search_node_grid;
1575 ps_search_node_base = ps_result_prms->ps_search_node_base;
1576 *ps_search_node_grid = *ps_search_node_base;
1577 pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1578 ps_search_results = ps_result_prms->ps_search_results;
1579 num_results = (S32)ps_search_results->u1_num_results_per_part;
1580 i4_grid_mask = ps_result_prms->i4_grid_mask;
1581
1582 for(i = 0; i < 9; i++)
1583 {
1584 if(i4_grid_mask & (1 << i))
1585 {
1586 grid_count++;
1587 }
1588 }
1589
1590 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
1591 pred_lx = 1 - ps_search_results->pu1_is_past[i4_search_idx];
1592
1593 i4_unique_id = pi4_valid_part_ids[0];
1594
1595 /*************************************************************************/
1596 /* Outer loop runs through all active pts in the grid */
1597 /*************************************************************************/
1598 for(i4_grid_pt = 0; i4_grid_pt < (S32)NUM_GRID_PTS; i4_grid_pt++)
1599 {
1600 if(!(i4_grid_mask & (1 << i4_grid_pt)))
1601 {
1602 continue;
1603 }
1604
1605 /* For the pt in the grid, update mvx and y depending on */
1606 /* location of pt. Updates are in FPEL units. */
1607 ps_search_node_grid->s_mv.i2_mvx = ps_search_node_base->s_mv.i2_mvx;
1608 ps_search_node_grid->s_mv.i2_mvy = ps_search_node_base->s_mv.i2_mvy;
1609 ps_search_node_grid->s_mv.i2_mvx += (S16)(i4_step * gai1_grid_id_to_x[i4_grid_pt]);
1610 ps_search_node_grid->s_mv.i2_mvy += (S16)(i4_step * gai1_grid_id_to_y[i4_grid_pt]);
1611
1612 i4_count = 0;
1613
1614 while((id = pi4_valid_part_ids[i4_count]) >= 0)
1615 {
1616 /*****************************************************************/
1617 /* points to the best search results corresponding to this */
1618 /* specific part type. */
1619 /*****************************************************************/
1620 ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1621
1622 /* evaluate mv cost and totalcost for this part for this given mv*/
1623 i4_mv_cost = ps_result_prms->pf_mv_cost_compute(
1624 ps_search_node_grid,
1625 &ps_search_results->as_pred_ctxt[pred_lx],
1626 (PART_ID_T)id,
1627 MV_RES_FPEL);
1628
1629 i4_sad = pi4_sad_grid[grid_count * id];
1630 i4_tot_cost = i4_sad + i4_mv_cost;
1631
1632 if(i4_unique_id == id)
1633 {
1634 if(i4_tot_cost < ps_result_prms->i4_min_cost)
1635 {
1636 i4_min_id = i4_grid_pt;
1637 ps_result_prms->i4_min_cost = i4_tot_cost;
1638 }
1639 }
1640
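            /* Sorted insertion into the N-best list: if the candidate beats */
            /* the current worst entry, locate its slot, shift the worse     */
            /* entries down by one (memmove) and write the candidate in.     */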
1641 if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1642 {
1643 for(i = 0; i < num_results - 1; i++)
1644 {
1645 if(i4_tot_cost < ps_best_node[i].i4_tot_cost)
1646 {
1647 memmove(
1648 ps_best_node + i + 1,
1649 ps_best_node + i,
1650 sizeof(search_node_t) * (num_results - 1 - i));
1651 break;
1652 }
1653 else if(i4_tot_cost == ps_best_node[i].i4_tot_cost)
1654 {
1655 if(0 == hme_cmp_nodes(ps_search_node_grid, ps_best_node + i))
1656 break;
1657 }
1658 }
1659 ps_best_node[i] = *ps_search_node_grid;
1660 ps_best_node[i].i4_sad = i4_sad;
1661 ps_best_node[i].i4_mv_cost = i4_mv_cost;
1662 ps_best_node[i].i4_tot_cost = i4_tot_cost;
1663 }
1664 i4_count++;
1665 }
1666 pi4_sad_grid++;
1667 }
1668 ps_result_prms->i4_min_id = i4_min_id;
1669 }
1670
1671 /**
1672 ********************************************************************************
1673 * @fn hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t *ps_result_prms)
1674 *
1675 * @brief Updates results for the case where 1 best result is to be updated
1676 * for a given pt, for several parts
 *  Note : The function is replicated so that the cost can be clipped to
 *  16 bits in order to bit-match with the SIMD version
1679 *
1680 * @param[in] result_upd_prms_t : Contains the input parameters to this fxn
1681 *
1682 * @return The result_upd_prms_t structure is updated for all the active
1683 * parts in case the current candt has results for any given part
1684 * that is the best result for that part
1685 ********************************************************************************
1686 */
void hme_update_results_grid_pu_bestn_no_encode(result_upd_prms_t *ps_result_prms)
1688 {
1689 search_node_t s_search_node_grid;
1690 const search_node_t *ps_search_node_base;
1691 search_node_t *ps_search_node_grid, *ps_best_node;
1692 S32 i4_min_cost = (MAX_32BIT_VAL), i4_search_idx;
1693 S32 num_results, i4_unique_id = -1, i4_grid_pt;
1694 search_results_t *ps_search_results;
1695 S32 *pi4_valid_part_ids;
1696 S32 i4_step = ps_result_prms->i4_step;
1697 S32 i4_grid_mask, i4_count, i, i4_min_id;
1698 S32 i4_tot_cost, i4_mv_cost, i4_sad, id;
1699 S32 *pi4_sad_grid = ps_result_prms->pi4_sad_grid;
1700 S32 grid_count = 0;
1701 S32 pred_lx;
1702
1703 i4_min_id = (S32)PT_C;
1704 i4_min_cost = MAX_32BIT_VAL;
1705 ps_search_node_grid = &s_search_node_grid;
1706 ps_search_node_base = ps_result_prms->ps_search_node_base;
1707 *ps_search_node_grid = *ps_search_node_base;
1708 pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1709 ps_search_results = ps_result_prms->ps_search_results;
1710 num_results = (S32)ps_search_results->u1_num_results_per_part;
1711 i4_grid_mask = ps_result_prms->i4_grid_mask;
1712
1713 for(i = 0; i < 9; i++)
1714 {
1715 if(i4_grid_mask & (1 << i))
1716 grid_count++;
1717 }
1718
1719 /* Some basic assumptions: only single pt, only part updates */
1720 /* and more than 1 best result to be computed. */
1721
1722 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
1723 pred_lx = 1 - ps_search_results->pu1_is_past[i4_search_idx];
1724
1725 /*************************************************************************/
    /* Supposing we do the result update for a unique part id, we can       */
    /* store the best pt id in the grid and also return the min cost as a   */
    /* return param. This will be useful for early exit cases.              */
    /* TODO : once we have a separate fxn for unique part+grid, we can      */
    /* do away with this code here                                          */
1731 /*************************************************************************/
1732 //if (pi4_valid_part_ids[1] == -1)
1733 i4_unique_id = pi4_valid_part_ids[0];
1734
1735 /*************************************************************************/
1736 /* Outer loop runs through all active pts in the grid */
1737 /*************************************************************************/
1738 for(i4_grid_pt = 0; i4_grid_pt < (S32)NUM_GRID_PTS; i4_grid_pt++)
1739 {
1740 if(!(i4_grid_mask & (1 << i4_grid_pt)))
1741 continue;
1742
1743 /* For the pt in the grid, update mvx and y depending on */
1744 /* location of pt. Updates are in FPEL units. */
1745 ps_search_node_grid->s_mv.i2_mvx = ps_search_node_base->s_mv.i2_mvx;
1746 ps_search_node_grid->s_mv.i2_mvy = ps_search_node_base->s_mv.i2_mvy;
1747 ps_search_node_grid->s_mv.i2_mvx += (S16)(i4_step * gai1_grid_id_to_x[i4_grid_pt]);
1748 ps_search_node_grid->s_mv.i2_mvy += (S16)(i4_step * gai1_grid_id_to_y[i4_grid_pt]);
1749
1750 i4_count = 0;
1751
1752 /* pi4_valid_part_ids contains all the valid ids. We loop through */
1753 /* this till we encounter -1. This is easier than having to */
1754 /* figure out part by part, besides, active part decision is */
1755 /* usually fixed for a given duration of search, e.g. entire fpel */
1756 /* refinement for a blk/cu will use fixed valid part mask */
1757
1758 while((id = pi4_valid_part_ids[i4_count]) >= 0)
1759 {
1760 //ps_search_node_grid->e_part_type = (PART_TYPE_T)id;
1761
1762 /*****************************************************************/
1763 /* points to the best search results corresponding to this */
1764 /* specific part type. */
1765 /*****************************************************************/
1766 ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1767
1768 /* evaluate mv cost and totalcost for this part for this given mv*/
1769 i4_mv_cost = ps_result_prms->pf_mv_cost_compute(
1770 ps_search_node_grid,
1771 &ps_search_results->as_pred_ctxt[pred_lx],
1772 (PART_ID_T)id,
1773 MV_RES_FPEL);
1774
1775 i4_sad = pi4_sad_grid[grid_count * id];
1776
1777 /* Clipping to 16 bit to bit match with SIMD version */
1778 i4_mv_cost = CLIP_S16(i4_mv_cost);
1779 i4_sad = CLIP_S16(i4_sad);
1780
1781 i4_tot_cost = i4_sad + i4_mv_cost;
1782 /* Clipping to 16 bit to bit match with SIMD version */
1783 i4_tot_cost = CLIP_S16(i4_tot_cost);
1784
1785 if(i4_unique_id == id)
1786 {
1787 if(i4_tot_cost < ps_result_prms->i4_min_cost)
1788 {
1789 i4_min_id = i4_grid_pt;
1790 ps_result_prms->i4_min_cost = i4_tot_cost;
1791 }
1792 }
1793
1794 /*****************************************************************/
            /* We do not labor through the results if the total cost is     */
            /* worse than the last of the results.                          */
1797 /*****************************************************************/
1798 if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1799 {
1800 /*************************************************************/
                /* Identify where the current result is to be placed:       */
                /* find the node whose cost is just higher than the node    */
                /* under test.                                              */
1803 /*************************************************************/
1804 for(i = 0; i < num_results - 1; i++)
1805 {
1806 if(i4_tot_cost <= ps_best_node[i].i4_tot_cost)
1807 {
1808 memmove(
1809 ps_best_node + i + 1,
1810 ps_best_node + i,
1811 sizeof(search_node_t) * (num_results - 1 - i));
1812 break;
1813 }
1814 }
1815 ps_best_node[i] = *ps_search_node_grid;
1816 ps_best_node[i].i4_sad = i4_sad;
1817 ps_best_node[i].i4_mv_cost = i4_mv_cost;
1818 ps_best_node[i].i4_tot_cost = i4_tot_cost;
1819 }
1820 i4_count++;
1821 }
1822 pi4_sad_grid++;
1823 }
1824 ps_result_prms->i4_min_id = i4_min_id;
1825 }
1826
1827 /**
1828 ********************************************************************************
 * @fn hme_update_results_pt_pu_best1_subpel_hs(err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
1830 *
1831 * @brief Updates results for the case where 1 best result is to be updated
1832 * for a given pt, for several parts
1833 *
1834 * @param[in] ps_result_prms. Contains the input parameters to this fxn
1835 * ::ps_pred_info : contains cost fxn ptr and predictor info
 *          ::pi4_sad : 17x9 SAD grid; in this case, only the 1st 17 entries are valid
1837 * ::ps_search_results: Search results structure
1838 * ::i1_ref_id : Reference index
 *          ::i4_grid_mask: Don't care for this fxn
1840 * ::pi4_valid_part_ids : valid part ids
1841 * ::ps_search_node_base: Contains the centre pt candt info.
1842 *
1843 * @return The ps_search_results structure is updated for all the active
1844 * parts in case the current candt has results for any given part
1845 * that is the best result for that part
1846 ********************************************************************************
1847 */
1848
void hme_update_results_pt_pu_best1_subpel_hs(
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
1851 {
1852 search_node_t *ps_search_node_base, *ps_best_node;
1853 search_results_t *ps_search_results;
1854 S32 id, i4_search_idx = ps_result_prms->u1_pred_lx;
1855 S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1856 S32 num_results, i;
1857 S32 *pi4_valid_part_ids;
1858
1859 pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1860 /* Some basic assumptions: only single pt, only part updates */
1861 /* and more than 1 best result to be computed. */
1862 ASSERT(ps_result_prms->i4_grid_mask == 1);
1863
1864 ps_search_results = ps_result_prms->ps_search_results;
1865 num_results = (S32)ps_search_results->u1_num_results_per_part;
1866
1867 /* Compute mv cost, total cost */
1868 ps_search_node_base = (search_node_t *)ps_result_prms->ps_search_node_base;
1869
1870 while((id = pi4_valid_part_ids[i4_count]) >= 0)
1871 {
1872 S32 update_required = 1;
1873
1874 ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1875 /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1876 i4_mv_cost = ps_best_node->i4_mv_cost;
1877 i4_sad = ps_result_prms->pi4_sad_grid[id];
1878 i4_tot_cost = i4_sad + i4_mv_cost;
1879
1880 /* We do not labor through the results if the total cost is worse than */
1881 /* the last of the results. */
1882 if(i4_tot_cost < ps_best_node[num_results - 1].i4_tot_cost)
1883 {
1884 /* Identify where the current result is to be placed. Basically find */
1885 /* the node which has cost just higher than node under test */
1886 for(i = 0; i < num_results - 1; i++)
1887 {
1888 if(ps_best_node[i].i1_ref_idx != -1)
1889 {
1890 if(i4_tot_cost < ps_best_node[i].i4_tot_cost)
1891 {
1892 memmove(
1893 ps_best_node + i + 1,
1894 ps_best_node + i,
1895 sizeof(search_node_t) * (num_results - 1 - i));
1896 break;
1897 }
1898 else if(i4_tot_cost == ps_best_node[i].i4_tot_cost)
1899 {
1900 update_required = 0;
1901 break;
1902 }
1903 }
1904 else
1905 {
1906 break;
1907 }
1908 }
1909
1910 if(update_required)
1911 {
1912 /* Update when either ref_idx or mv's are different */
1913 ps_best_node[i] = *ps_search_node_base;
1914 ps_best_node[i].i4_sad = i4_sad;
1915 ps_best_node[i].i4_mv_cost = i4_mv_cost;
1916 ps_best_node[i].i4_tot_cost = i4_tot_cost;
1917 }
1918 }
1919 i4_count++;
1920 }
1921 }
1922
void hme_update_results_pt_pu_best1_subpel_hs_1(
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
1925 {
1926 search_node_t *ps_search_node_base, *ps_best_node;
1927 search_results_t *ps_search_results;
1928 S32 id, i4_search_idx = ps_result_prms->u1_pred_lx;
1929 S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
1930 S32 num_results;
1931 S32 *pi4_valid_part_ids;
1932
1933 pi4_valid_part_ids = ps_result_prms->pi4_valid_part_ids;
1934 /* Some basic assumptions: only single pt, only part updates */
1935 /* and more than 1 best result to be computed. */
1936 ASSERT(ps_result_prms->i4_grid_mask == 1);
1937
1938 ps_search_results = ps_result_prms->ps_search_results;
1939 num_results = (S32)ps_search_results->u1_num_results_per_part;
1940
1941 /* Compute mv cost, total cost */
1942 ps_search_node_base = (search_node_t *)ps_result_prms->ps_search_node_base;
1943
1944 while((id = pi4_valid_part_ids[i4_count]) >= 0)
1945 {
1946 S32 update_required = 0;
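        /* update_required convention (per the branches below): 2 means the */
        /* candidate only displaces the second-best entry, 1 means it       */
        /* becomes the new best (the old best is demoted to second best),   */
        /* 0 means it duplicates the current best and is dropped.           */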
1947
1948 ps_best_node = ps_search_results->aps_part_results[i4_search_idx][id];
1949 /* Use a pre-computed cost instead of freshly evaluating subpel cost */
1950 i4_mv_cost = ps_best_node->i4_mv_cost;
1951 i4_sad = ps_result_prms->pi4_sad_grid[id];
1952 i4_tot_cost = i4_sad + i4_mv_cost;
1953
1954 /* We do not labor through the results if the total cost is worse than */
1955 /* the last of the results. */
1956 if(i4_tot_cost < ps_best_node[1].i4_tot_cost)
1957 {
1958 S32 sdi_value = 0;
1959
1960 update_required = 2;
1961 /* Identify where the current result is to be placed. Basically find */
1962 /* the node which has cost just higher than node under test */
1963 {
1964 if(i4_tot_cost < ps_best_node[0].i4_tot_cost)
1965 {
1966 update_required = 1;
1967 sdi_value = ps_best_node[0].i4_sad - i4_sad;
1968 }
1969 else if(
1970 (ps_result_prms->i2_mv_x == ps_best_node[0].s_mv.i2_mvx) &&
1971 (ps_result_prms->i2_mv_y == ps_best_node[0].s_mv.i2_mvy) &&
1972 (ps_best_node[0].i1_ref_idx == ps_result_prms->i1_ref_idx))
1973 {
1974 update_required = 0;
1975 }
1976 }
1977 if(update_required == 2)
1978 {
1979 subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1980
1981 ps_subpel_refine_ctxt->i2_tot_cost[1][i4_count] = i4_tot_cost;
1982 ps_subpel_refine_ctxt->i2_mv_cost[1][i4_count] = i4_mv_cost;
1983 ps_subpel_refine_ctxt->i2_mv_x[1][i4_count] = ps_result_prms->i2_mv_x;
1984 ps_subpel_refine_ctxt->i2_mv_y[1][i4_count] = ps_result_prms->i2_mv_y;
1985 ps_subpel_refine_ctxt->i2_ref_idx[1][i4_count] = ps_result_prms->i1_ref_idx;
1986 }
1987 else if(update_required == 1)
1988 {
1989 subpel_refine_ctxt_t *ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
1990
1991 ps_subpel_refine_ctxt->i2_tot_cost[1][i4_count] =
1992 ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count];
1993 ps_subpel_refine_ctxt->i2_mv_cost[1][i4_count] =
1994 ps_subpel_refine_ctxt->i2_mv_cost[0][i4_count];
1995 ps_subpel_refine_ctxt->i2_mv_x[1][i4_count] =
1996 ps_subpel_refine_ctxt->i2_mv_x[0][i4_count];
1997 ps_subpel_refine_ctxt->i2_mv_y[1][i4_count] =
1998 ps_subpel_refine_ctxt->i2_mv_y[0][i4_count];
1999 ps_subpel_refine_ctxt->i2_ref_idx[1][i4_count] =
2000 ps_subpel_refine_ctxt->i2_ref_idx[0][i4_count];
2001
2002 ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count] = i4_tot_cost;
2003 ps_subpel_refine_ctxt->i2_mv_cost[0][i4_count] = i4_mv_cost;
2004 ps_subpel_refine_ctxt->i2_mv_x[0][i4_count] = ps_result_prms->i2_mv_x;
2005 ps_subpel_refine_ctxt->i2_mv_y[0][i4_count] = ps_result_prms->i2_mv_y;
2006 ps_subpel_refine_ctxt->i2_ref_idx[0][i4_count] = ps_result_prms->i1_ref_idx;
2007 }
2008 }
2009 i4_count++;
2010 }
2011 }
2012
2013 /**
2014 ******************************************************************************
 * @brief Gives a result fxn ptr for an index [x] where x is as:
 *         0 : single pt, no partial updates, 1 best result
 *         1 : single pt, no partial updates, N best results
 *         2 : single pt, partial updates, 1 best result
 *         3 : single pt, partial updates, N best results
 *         4 : grid     , no partial updates, 1 best result
 *         5 : grid     , no partial updates, N best results
 *         6 : grid     , partial updates, 1 best result
 *         7 : grid     , partial updates, N best results
2024 ******************************************************************************
2025 */
2026
2027 static PF_RESULT_FXN_T g_pf_result_fxn[8] = { UPD_RES_PT_NPU_BEST1, UPD_RES_PT_NPU_BESTN,
2028 UPD_RES_PT_PU_BEST1, UPD_RES_PT_PU_BESTN,
2029 UPD_RES_GRID_NPU_BEST1, UPD_RES_GRID_NPU_BESTN,
2030 UPD_RES_GRID_PU_BEST1, UPD_RES_GRID_PU_BESTN };
2031
2032 /**
2033 ********************************************************************************
2034 * @fn hme_get_result_fxn(i4_grid_mask, i4_part_mask, i4_num_results)
2035 *
2036 * @brief Obtains the suitable result function that evaluates COST and also
2037 * computes one or more best results for point/grid, single part or
2038 * more than one part.
2039 *
2040 * @param[in] i4_grid_mask : Mask containing which of 9 grid pts active
2041 *
2042 * @param[in] i4_part_mask : Mask containing which of the 17 parts active
2043 *
2044 * @param[in] i4_num_results: Number of active results
2045 *
2046 * @return Pointer to the appropriate result update function
2047 ********************************************************************************
2048 */
PF_RESULT_FXN_T hme_get_result_fxn(S32 i4_grid_mask, S32 i4_part_mask, S32 i4_num_results)
2050 {
2051 S32 i4_is_grid = (i4_grid_mask != 1);
2052 S32 i4_is_pu = ((i4_part_mask & (i4_part_mask - 1)) != 0);
2053 S32 i4_res_gt1 = (i4_num_results > 1);
2054 S32 id;
2055
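    /* The three flags form a 3-bit index into g_pf_result_fxn: bit 2 is    */
    /* set for a grid search (more than just the centre point active),      */
    /* bit 1 when more than one partition is enabled, bit 0 when more than  */
    /* one best result is requested. For example, a 9-pt grid with several  */
    /* partitions and a single best result gives id = 4 + 2 + 0 = 6, i.e.   */
    /* UPD_RES_GRID_PU_BEST1.                                               */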
2056 id = (i4_is_grid << 2) + (i4_is_pu << 1) + i4_res_gt1;
2057
2058 return (g_pf_result_fxn[id]);
2059 }
2060
void hme_calc_sad_and_2_best_results(
    hme_search_prms_t *ps_search_prms,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    err_prms_t *ps_err_prms,
    result_upd_prms_t *ps_result_prms,
    U08 **ppu1_ref,
    S32 i4_ref_stride)
2068 {
2069 S32 i4_candt;
2070 S32 i4_inp_off;
2071 S32 i4_ref_offset;
2072 S32 i4_num_nodes;
2073
2074 S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2075 S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2076 WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2077 WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2078 WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2079
2080 mv_refine_ctxt_t *ps_mv_refine_ctxt;
2081 search_node_t *ps_search_node;
2082
2083 ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2084 i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2085 i4_inp_off = ps_search_prms->i4_cu_x_off;
2086 i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2087 i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2088 ps_search_node = ps_search_prms->ps_search_nodes;
2089
2090 for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2091 {
2092 /**********************************************************************/
2093 /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID */
2094 /**********************************************************************/
2095 {
2096 WORD32 b, c, d;
2097 UWORD8 *pu1_cur_ptr;
2098 UWORD8 *pu1_ref_ptr;
2099 UWORD16 au2_4x4_sad[NUM_4X4];
2100
2101 if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2102 {
2103 continue;
2104 }
2105
2106 ps_err_prms->pu1_inp =
2107 ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2108 ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2109 ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2110 ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2111
2112 pu1_cur_ptr = ps_err_prms->pu1_inp;
2113 pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2114
2115 /* Loop to compute the SAD's */
2116 {
2117 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2118 for(b = 0; b < NUM_4X4; b++)
2119 {
2120 WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2121 WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2122
2123 for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2124 {
2125 WORD32 z_cur = (cur_buf_stride)*c + t1;
2126 WORD32 z_ref = (ref_buf_stride)*c + t2;
2127 for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2128 {
2129 au2_4x4_sad[b] += (UWORD16)ABS((
2130 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2131 }
2132 }
2133 }
2134
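                /* au2_4x4_sad[b] now holds the SAD of the b-th 4x4 sub-block */
                /* of the 16x16 region in raster order (col = b % 4,          */
                /* row = b / 4). The partition SADs are composed by summing   */
                /* the relevant 4x4 blocks; the remaining AMP partitions are  */
                /* derived by subtracting their complement from the 2Nx2N SAD.*/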
2135 pi4_sad_grid[PART_ID_NxN_TL] =
2136 (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2137 pi4_sad_grid[PART_ID_NxN_TR] =
2138 (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2139 pi4_sad_grid[PART_ID_NxN_BL] =
2140 (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2141 pi4_sad_grid[PART_ID_NxN_BR] =
2142 (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2143 pi4_sad_grid[PART_ID_Nx2N_L] =
2144 pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2145 pi4_sad_grid[PART_ID_Nx2N_R] =
2146 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2147 pi4_sad_grid[PART_ID_2NxN_T] =
2148 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2149 pi4_sad_grid[PART_ID_2NxN_B] =
2150 pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2151 pi4_sad_grid[PART_ID_nLx2N_L] =
2152 (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2153 pi4_sad_grid[PART_ID_nRx2N_R] =
2154 (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2155 pi4_sad_grid[PART_ID_2NxnU_T] =
2156 (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2157 pi4_sad_grid[PART_ID_2NxnD_B] =
2158 (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2159 pi4_sad_grid[PART_ID_2Nx2N] =
2160 pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2161 pi4_sad_grid[PART_ID_2NxnU_B] =
2162 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2163 pi4_sad_grid[PART_ID_2NxnD_T] =
2164 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2165 pi4_sad_grid[PART_ID_nRx2N_L] =
2166 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2167 pi4_sad_grid[PART_ID_nLx2N_R] =
2168 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2169 }
2170 }
2171
2172 {
2173 S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
2174 S32 *pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2175 S32 best_node_cost;
2176 S32 second_best_node_cost;
2177
2178 {
2179 S16 mvdx1, mvdy1;
2180 S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2181 search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2182 S32 pred_lx = i4_search_idx;
2183
2184 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2185 pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2186 search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2187
2188 S32 inp_shift = 2;
2189 S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2190 S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2191 S32 lambda = ps_pred_ctxt->lambda;
2192 S32 rnd = 1 << (lambda_q_shift - 1);
2193 S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2194 S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2195 S32 ref_bits =
2196 ps_pred_ctxt
2197 ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2198
2199 COMPUTE_DIFF_MV(
2200 mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2201
2202 mvdx1 = ABS(mvdx1);
2203 mvdy1 = ABS(mvdy1);
2204
2205 i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2206 (mvdy1 > 0) + ref_bits + 2;
2207
2208 i4_mv_cost *= lambda;
2209 i4_mv_cost += rnd;
2210 i4_mv_cost >>= lambda_q_shift;
2211
2212 i4_mv_cost = CLIP_U16(i4_mv_cost);
2213 }
2214
2215 /*For each valid partition, update the refine_prm structure to reflect the best and second
2216 best candidates for that partition*/
2217
2218 for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
2219 {
2220 S32 update_required = 0;
2221 S32 part_id = pi4_valid_part_ids[i4_count];
2222 S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
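                /* When more than 8 partitions are valid, the refinement      */
                /* arrays are indexed by part_id directly; otherwise they     */
                /* are packed and indexed by position in the valid-part list. */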
2223
2224 /*Calculate total cost*/
2225 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2226 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2227
2228 /*****************************************************************/
                /* We do not labor through the results if the total cost is  */
                /* worse than the last of the results.                        */
2231 /*****************************************************************/
2232 best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[0][index]);
2233 second_best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[1][index]);
2234
2235 if(i4_tot_cost < second_best_node_cost)
2236 {
2237 update_required = 2;
2238
2239 /*************************************************************/
                    /* Identify where the current result is to be placed:    */
                    /* find the node whose cost is just higher than the node */
                    /* under test.                                            */
2242 /*************************************************************/
2243 if(i4_tot_cost < best_node_cost)
2244 {
2245 update_required = 1;
2246 }
2247 else if(i4_tot_cost == best_node_cost)
2248 {
2249 update_required = 0;
2250 }
2251
2252 if(update_required == 2)
2253 {
2254 ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2255 ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2256 ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
2257 ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
2258 ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
2259 }
2260 else if(update_required == 1)
2261 {
2262 ps_mv_refine_ctxt->i2_tot_cost[1][index] =
2263 ps_mv_refine_ctxt->i2_tot_cost[0][index];
2264 ps_mv_refine_ctxt->i2_mv_cost[1][index] =
2265 ps_mv_refine_ctxt->i2_mv_cost[0][index];
2266 ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_mv_refine_ctxt->i2_mv_x[0][index];
2267 ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_mv_refine_ctxt->i2_mv_y[0][index];
2268 ps_mv_refine_ctxt->i2_ref_idx[1][index] =
2269 ps_mv_refine_ctxt->i2_ref_idx[0][index];
2270
2271 ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2272 ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2273 ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
2274 ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
2275 ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
2276 }
2277 }
2278 }
2279 }
2280 ps_search_node++;
2281 }
2282
2283 {
2284 WORD32 i4_i;
2285 WORD32 part_id;
2286 search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
2287 for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
2288 {
2289 part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
2290 if(ps_mv_refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
2291 {
2292 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
2293 ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
2294 ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
2295
2296 ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
2297 }
2298 if(ps_mv_refine_ctxt->i2_tot_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
2299 {
2300 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
2301 ASSERT(ps_mv_refine_ctxt->i2_mv_x[1][part_id] == 0);
2302 ASSERT(ps_mv_refine_ctxt->i2_mv_y[1][part_id] == 0);
2303
2304 ps_mv_refine_ctxt->i2_ref_idx[1][part_id] = ps_search_node->i1_ref_idx;
2305 }
2306 }
2307 }
2308 }
2309
void hme_calc_sad_and_2_best_results_subpel(
    err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
2312 {
2313 S32 i4_candt;
2314 S32 i4_num_nodes;
2315
2316 S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2317 S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2318 WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2319 WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2320 WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2321
2322 mv_refine_ctxt_t *ps_subpel_refine_ctxt;
2323 ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
2324 i4_num_nodes = 1;
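    /* Only a single candidate is evaluated here; ps_err_prms->pu1_inp and  */
    /* pu1_ref are assumed to be already set up by the caller for the       */
    /* subpel position. The loop structure mirrors the fullpel variants.    */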
2325
2326 /* Run through each of the candts in a loop */
2327 for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2328 {
2329 /**********************************************************************/
2330 /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID */
2331 /**********************************************************************/
2332 {
2333 WORD32 b, c, d;
2334 UWORD8 *pu1_cur_ptr;
2335 UWORD8 *pu1_ref_ptr;
2336 UWORD16 au2_4x4_sad[NUM_4X4];
2337
2338 pu1_cur_ptr = ps_err_prms->pu1_inp;
2339 pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2340
2341 /* Loop to compute the SAD's */
2342 {
2343 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2344 for(b = 0; b < NUM_4X4; b++)
2345 {
2346 WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2347 WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2348
2349 for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2350 {
2351 WORD32 z_cur = (cur_buf_stride)*c + t1;
2352 WORD32 z_ref = (ref_buf_stride)*c + t2;
2353 for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2354 {
2355 au2_4x4_sad[b] += (UWORD16)ABS((
2356 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2357 }
2358 }
2359 }
2360
2361 pi4_sad_grid[PART_ID_NxN_TL] =
2362 (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2363 pi4_sad_grid[PART_ID_NxN_TR] =
2364 (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2365 pi4_sad_grid[PART_ID_NxN_BL] =
2366 (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2367 pi4_sad_grid[PART_ID_NxN_BR] =
2368 (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2369 pi4_sad_grid[PART_ID_Nx2N_L] =
2370 pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2371 pi4_sad_grid[PART_ID_Nx2N_R] =
2372 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2373 pi4_sad_grid[PART_ID_2NxN_T] =
2374 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2375 pi4_sad_grid[PART_ID_2NxN_B] =
2376 pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2377 pi4_sad_grid[PART_ID_nLx2N_L] =
2378 (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2379 pi4_sad_grid[PART_ID_nRx2N_R] =
2380 (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2381 pi4_sad_grid[PART_ID_2NxnU_T] =
2382 (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2383 pi4_sad_grid[PART_ID_2NxnD_B] =
2384 (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2385 pi4_sad_grid[PART_ID_2Nx2N] =
2386 pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2387 pi4_sad_grid[PART_ID_2NxnU_B] =
2388 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2389 pi4_sad_grid[PART_ID_2NxnD_T] =
2390 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2391 pi4_sad_grid[PART_ID_nRx2N_L] =
2392 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2393 pi4_sad_grid[PART_ID_nLx2N_R] =
2394 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2395 }
2396 }
2397 /**********************************************************************/
2398 /* CALL THE FUNCTION THAT COMPUTES UPDATES THE BEST RESULTS */
2399 /**********************************************************************/
2400 {
2401 S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
2402 S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
2403 S32 best_node_cost;
2404 S32 second_best_node_cost;
2405
2406 /*For each valid partition, update the refine_prm structure to reflect the best and second
2407 best candidates for that partition*/
2408
2409 for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
2410 {
2411 S32 update_required = 0;
2412 S32 part_id = pi4_valid_part_ids[i4_count];
2413 S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2414
2415 /* Use a pre-computed cost instead of freshly evaluating subpel cost */
2416 i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
2417
2418 /*Calculate total cost*/
2419 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2420 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2421
2422 /*****************************************************************/
                /* We do not labor through the results if the total cost is  */
                /* worse than the last of the results.                        */
2425 /*****************************************************************/
2426 best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
2427 second_best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[1][index]);
2428
2429 if(i4_tot_cost < second_best_node_cost)
2430 {
2431 update_required = 2;
2432
2433 /*************************************************************/
                    /* Identify where the current result is to be placed:    */
                    /* find the node whose cost is just higher than the node */
                    /* under test.                                            */
2436 /*************************************************************/
2437 if(i4_tot_cost < best_node_cost)
2438 {
2439 update_required = 1;
2440 }
2441 else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
2442 {
2443 update_required = 0;
2444 }
2445 if(update_required == 2)
2446 {
2447 ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2448 ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2449 ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
2450 ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
2451 ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
2452 }
2453 else if(update_required == 1)
2454 {
2455 ps_subpel_refine_ctxt->i2_tot_cost[1][index] =
2456 ps_subpel_refine_ctxt->i2_tot_cost[0][index];
2457 ps_subpel_refine_ctxt->i2_mv_cost[1][index] =
2458 ps_subpel_refine_ctxt->i2_mv_cost[0][index];
2459 ps_subpel_refine_ctxt->i2_mv_x[1][index] =
2460 ps_subpel_refine_ctxt->i2_mv_x[0][index];
2461 ps_subpel_refine_ctxt->i2_mv_y[1][index] =
2462 ps_subpel_refine_ctxt->i2_mv_y[0][index];
2463 ps_subpel_refine_ctxt->i2_ref_idx[1][index] =
2464 ps_subpel_refine_ctxt->i2_ref_idx[0][index];
2465
2466 ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2467 ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2468 ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
2469 ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
2470 ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
2471 }
2472 }
2473 }
2474 }
2475 }
2476
2477 {
2478 WORD32 i4_count = 0;
2479 for(i4_count = 0; i4_count < TOT_NUM_PARTS; i4_count++)
2480 {
2481 WORD32 j;
2482 for(j = 0; j < 2; j++)
2483 {
2484 if(ps_subpel_refine_ctxt->i2_tot_cost[j][i4_count] >= MAX_SIGNED_16BIT_VAL)
2485 {
2486 ps_subpel_refine_ctxt->ai2_fullpel_satd[j][i4_count] = MAX_SIGNED_16BIT_VAL;
2487 }
2488 }
2489 }
2490 }
2491 }
2492
void hme_calc_stim_injected_sad_and_2_best_results(
    hme_search_prms_t *ps_search_prms,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    err_prms_t *ps_err_prms,
    result_upd_prms_t *ps_result_prms,
    U08 **ppu1_ref,
    S32 i4_ref_stride)
2500 {
2501 mv_refine_ctxt_t *ps_mv_refine_ctxt;
2502 search_node_t *ps_search_node;
2503
2504 S32 i4_candt;
2505 S32 i4_count;
2506 S32 i4_inp_off;
2507 S32 i4_ref_offset;
2508 S32 i4_num_nodes;
2509 ULWORD64 *au8_final_src_sigmaX, *au8_final_src_sigmaXSquared, au8_final_ref_sigmaX[17],
2510 au8_final_ref_sigmaXSquared[17];
2511 UWORD32 au4_4x4_ref_sigmaX[NUM_4X4], au4_4x4_ref_sigmaXSquared[NUM_4X4];
2512 S32 *pi4_valid_part_ids;
2513
2514 S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2515 S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2516 WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2517 WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2518 WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2519
2520 ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2521 i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2522 i4_inp_off = ps_search_prms->i4_cu_x_off;
2523 i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2524 i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2525 ps_search_node = ps_search_prms->ps_search_nodes;
2526 pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2527
2528 /* Set local pointer to point to partition level sigma values calculated in hme_refine */
2529 au8_final_src_sigmaX = ps_search_prms->pu8_part_src_sigmaX;
2530 au8_final_src_sigmaXSquared = ps_search_prms->pu8_part_src_sigmaXSquared;
2531
2532 for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2533 {
2534 {
2535 WORD32 b, c, d;
2536 UWORD8 *pu1_cur_ptr;
2537 UWORD8 *pu1_ref_ptr;
2538 UWORD16 au2_4x4_sad[NUM_4X4];
2539
2540 if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2541 {
2542 continue;
2543 }
2544
2545 ps_err_prms->pu1_inp =
2546 ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2547 ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2548 ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2549 ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2550
2551 pu1_cur_ptr = ps_err_prms->pu1_inp;
2552 pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2553
2554 /* Loop to compute the SAD's */
2555 {
2556 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2557 for(b = 0; b < NUM_4X4; b++)
2558 {
2559 WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2560 WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2561
2562 for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2563 {
2564 WORD32 z_cur = (cur_buf_stride)*c + t1;
2565 WORD32 z_ref = (ref_buf_stride)*c + t2;
2566 for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2567 {
2568 au2_4x4_sad[b] += (UWORD16)ABS((
2569 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2570 }
2571 }
2572 }
2573
2574 /* Compute sigmaX and sigmaX_Squared at 4x4 level for ref from ref_ptr */
2575 hme_compute_sigmaX_and_sigmaXSquared(
2576 pu1_ref_ptr,
2577 ref_buf_stride,
2578 au4_4x4_ref_sigmaX,
2579 au4_4x4_ref_sigmaXSquared,
2580 4,
2581 4,
2582 16,
2583 16,
2584 1,
2585 4);
2586
2587 pi4_sad_grid[PART_ID_NxN_TL] =
2588 (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2589 pi4_sad_grid[PART_ID_NxN_TR] =
2590 (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2591 pi4_sad_grid[PART_ID_NxN_BL] =
2592 (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2593 pi4_sad_grid[PART_ID_NxN_BR] =
2594 (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2595 pi4_sad_grid[PART_ID_Nx2N_L] =
2596 pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2597 pi4_sad_grid[PART_ID_Nx2N_R] =
2598 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2599 pi4_sad_grid[PART_ID_2NxN_T] =
2600 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2601 pi4_sad_grid[PART_ID_2NxN_B] =
2602 pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2603 pi4_sad_grid[PART_ID_nLx2N_L] =
2604 (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2605 pi4_sad_grid[PART_ID_nRx2N_R] =
2606 (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2607 pi4_sad_grid[PART_ID_2NxnU_T] =
2608 (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2609 pi4_sad_grid[PART_ID_2NxnD_B] =
2610 (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2611 pi4_sad_grid[PART_ID_2Nx2N] =
2612 pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2613 pi4_sad_grid[PART_ID_2NxnU_B] =
2614 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2615 pi4_sad_grid[PART_ID_2NxnD_T] =
2616 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2617 pi4_sad_grid[PART_ID_nRx2N_L] =
2618 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2619 pi4_sad_grid[PART_ID_nLx2N_R] =
2620 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2621 }
2622 }
2623
2624 {
2625 S32 i4_sad, i4_mv_cost, i4_tot_cost;
2626 S32 best_node_cost;
2627 S32 second_best_node_cost;
2628 ULWORD64 u8_temp_var, u8_temp_var1;
2629 ULWORD64 u8_ref_X_Square, u8_pure_dist, u8_src_var, u8_ref_var;
2630
2631 {
2632 S16 mvdx1, mvdy1;
2633 S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2634 search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2635 S32 pred_lx = i4_search_idx;
2636
2637 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2638 pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2639 search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2640
2641 S32 inp_shift = 2;
2642 S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2643 S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2644 S32 lambda = ps_pred_ctxt->lambda;
2645 S32 rnd = 1 << (lambda_q_shift - 1);
2646 S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2647 S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2648 S32 ref_bits =
2649 ps_pred_ctxt
2650 ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2651
2652 COMPUTE_DIFF_MV(
2653 mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2654
2655 mvdx1 = ABS(mvdx1);
2656 mvdy1 = ABS(mvdy1);
2657
2658 i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2659 (mvdy1 > 0) + ref_bits + 2;
2660
2661 i4_mv_cost *= lambda;
2662 i4_mv_cost += rnd;
2663 i4_mv_cost >>= lambda_q_shift;
2664
2665 i4_mv_cost = CLIP_U16(i4_mv_cost);
2666 }
2667
2668 for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
2669 {
2670 S32 i4_stim_injected_sad;
2671 S32 i4_stim_injected_cost;
2672 S32 i4_noise_term;
2673 unsigned long u4_shift_val;
2674 S32 i4_bits_req;
2675
2676 S32 update_required = 0;
2677 S32 part_id = pi4_valid_part_ids[i4_count];
2678 S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2679
2680 WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
2681
2682 S32 i4_inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ps_search_node->i1_ref_idx];
2683
2684 if(ps_search_prms->i4_alpha_stim_multiplier)
2685 {
2686 /* Compute ref sigmaX and sigmaX_Squared values for valid partitions from previously computed ref 4x4 level values */
2687 hme_compute_final_sigma_of_pu_from_base_blocks(
2688 au4_4x4_ref_sigmaX,
2689 au4_4x4_ref_sigmaXSquared,
2690 au8_final_ref_sigmaX,
2691 au8_final_ref_sigmaXSquared,
2692 16,
2693 4,
2694 part_id,
2695 4);
2696
2697 u8_ref_X_Square =
2698 (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
2699 u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);
2700
                    /* Multiply the un-normalized src_var by inv_wt if it differs from the default wt, */
                    /* and shift the resulting src_var if it needs more than 27 bits, to avoid overflow. */
                    /* The shift amount is stored in u4_shift_val and applied equally to ref_var. */
2704 u4_shift_val = ihevce_calc_stim_injected_variance(
2705 au8_final_src_sigmaX,
2706 au8_final_src_sigmaXSquared,
2707 &u8_src_var,
2708 i4_inv_wt,
2709 ps_wt_inp_prms->ai4_shift_val[ps_search_node->i1_ref_idx],
2710 ps_wt_inp_prms->wpred_log_wdc,
2711 part_id);
2712
2713 u8_ref_var = u8_ref_var >> u4_shift_val;
2714
2715 /* Do the same check on ref_var to avoid overflow and apply similar shift on src_var */
2716 GETRANGE64(i4_bits_req, u8_ref_var);
2717
2718 if(i4_bits_req > 27)
2719 {
2720 u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
2721 u8_src_var = u8_src_var >> (i4_bits_req - 27);
2722 }
2723
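                    /* Noise term: a Q(STIM_Q_FORMAT) approximation of        */
                    /* 2*var_src*var_ref / (var_src^2 + var_ref^2), which is  */
                    /* 1.0 when the two variances match and falls towards 0   */
                    /* as they diverge. It is later scaled by the configured  */
                    /* alpha multiplier.                                      */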
2724 if(u8_src_var == u8_ref_var)
2725 {
2726 u8_temp_var = (1 << STIM_Q_FORMAT);
2727 }
2728 else
2729 {
2730 u8_temp_var = (2 * u8_src_var * u8_ref_var);
2731 u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
2732 u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
2733 u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
2734 u8_temp_var = (u8_temp_var / u8_temp_var1);
2735 }
2736
2737 i4_noise_term = (UWORD32)u8_temp_var;
2738
2739 ASSERT(i4_noise_term >= 0);
2740
2741 i4_noise_term *= ps_search_prms->i4_alpha_stim_multiplier;
2742 }
2743 else
2744 {
2745 i4_noise_term = 0;
2746 }
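            /* The SAD is modulated as sad * (1 - alpha * noise), evaluated  */
            /* in Q(STIM_Q_FORMAT + ALPHA_Q_FORMAT) fixed point with         */
            /* rounding, so candidates whose reference variance matches the  */
            /* source variance have their distortion discounted.             */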
2747 u8_pure_dist = pi4_sad_grid[part_id];
2748 u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
2749 u8_pure_dist += (1 << ((i4_q_level)-1));
2750 i4_stim_injected_sad = (UWORD32)(u8_pure_dist >> (i4_q_level));
2751
2752 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
2753 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
2754 i4_stim_injected_sad = CLIP3(i4_stim_injected_sad, 0, 0x7fff);
2755 i4_stim_injected_cost = CLIP_S16(i4_stim_injected_sad + i4_mv_cost);
2756
2757 best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[0][index]);
2758 second_best_node_cost =
2759 CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[1][index]);
2760
2761 if(i4_stim_injected_cost < second_best_node_cost)
2762 {
2763 update_required = 2;
2764
2765 if(i4_stim_injected_cost < best_node_cost)
2766 {
2767 update_required = 1;
2768 }
2769 else if(i4_stim_injected_cost == best_node_cost)
2770 {
2771 update_required = 0;
2772 }
2773
2774 if(update_required == 2)
2775 {
2776 ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
2777 ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] = i4_stim_injected_cost;
2778 ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
2779 ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
2780 ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
2781 ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
2782 }
2783 else if(update_required == 1)
2784 {
2785 ps_mv_refine_ctxt->i2_tot_cost[1][index] =
2786 ps_mv_refine_ctxt->i2_tot_cost[0][index];
2787 ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] =
2788 ps_mv_refine_ctxt->i2_stim_injected_cost[0][index];
2789 ps_mv_refine_ctxt->i2_mv_cost[1][index] =
2790 ps_mv_refine_ctxt->i2_mv_cost[0][index];
2791 ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_mv_refine_ctxt->i2_mv_x[0][index];
2792 ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_mv_refine_ctxt->i2_mv_y[0][index];
2793 ps_mv_refine_ctxt->i2_ref_idx[1][index] =
2794 ps_mv_refine_ctxt->i2_ref_idx[0][index];
2795
2796 ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
2797 ps_mv_refine_ctxt->i2_stim_injected_cost[0][index] = i4_stim_injected_cost;
2798 ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
2799 ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
2800 ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
2801 ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
2802 }
2803 }
2804 }
2805 }
2806
2807 ps_search_node++;
2808 }
2809
2810 {
2811 WORD32 i4_i;
2812 WORD32 part_id;
2813 search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
2814 for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
2815 {
2816 part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
2817 if(ps_mv_refine_ctxt->i2_stim_injected_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
2818 {
2819 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
2820 ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
2821 ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
2822
2823 ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
2824 }
2825 if(ps_mv_refine_ctxt->i2_stim_injected_cost[1][part_id] >= MAX_SIGNED_16BIT_VAL)
2826 {
2827 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[1][part_id] == MAX_SIGNED_16BIT_VAL);
2828 ASSERT(ps_mv_refine_ctxt->i2_mv_x[1][part_id] == 0);
2829 ASSERT(ps_mv_refine_ctxt->i2_mv_y[1][part_id] == 0);
2830
2831 ps_mv_refine_ctxt->i2_ref_idx[1][part_id] = ps_search_node->i1_ref_idx;
2832 }
2833 }
2834 }
2835 }
2836
void hme_calc_sad_and_1_best_result(
    hme_search_prms_t *ps_search_prms,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    err_prms_t *ps_err_prms,
    result_upd_prms_t *ps_result_prms,
    U08 **ppu1_ref,
    S32 i4_ref_stride)
2844 {
2845 S32 i4_candt;
2846 S32 i4_inp_off;
2847 S32 i4_ref_offset;
2848 S32 i4_num_nodes;
2849
2850 S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
2851 S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
2852 WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
2853 WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
2854 WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
2855
2856 mv_refine_ctxt_t *ps_mv_refine_ctxt;
2857 search_node_t *ps_search_node;
2858
2859 ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
2860 i4_num_nodes = ps_search_prms->i4_num_search_nodes;
2861 i4_inp_off = ps_search_prms->i4_cu_x_off;
2862 i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
2863 i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
2864 ps_search_node = ps_search_prms->ps_search_nodes;
2865
2866 for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
2867 {
2868 /**********************************************************************/
2869 /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID */
2870 /**********************************************************************/
2871 {
2872 WORD32 b, c, d;
2873 UWORD8 *pu1_cur_ptr;
2874 UWORD8 *pu1_ref_ptr;
2875 UWORD16 au2_4x4_sad[NUM_4X4];
2876
2877 if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
2878 {
2879 continue;
2880 }
2881
2882 ps_err_prms->pu1_inp =
2883 ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
2884 ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
2885 ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
2886 ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
2887
2888 pu1_cur_ptr = ps_err_prms->pu1_inp;
2889 pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
2890
2891 /* Loop to compute the SAD's */
2892 {
2893 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
2894 for(b = 0; b < NUM_4X4; b++)
2895 {
2896 WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
2897 WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
2898
2899 for(c = 0; c < NUM_ROWS_IN_4X4; c++)
2900 {
2901 WORD32 z_cur = (cur_buf_stride)*c + t1;
2902 WORD32 z_ref = (ref_buf_stride)*c + t2;
2903 for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
2904 {
2905 au2_4x4_sad[b] += (UWORD16)ABS((
2906 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
2907 }
2908 }
2909 }
2910
2911 pi4_sad_grid[PART_ID_NxN_TL] =
2912 (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
2913 pi4_sad_grid[PART_ID_NxN_TR] =
2914 (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
2915 pi4_sad_grid[PART_ID_NxN_BL] =
2916 (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2917 pi4_sad_grid[PART_ID_NxN_BR] =
2918 (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
2919 pi4_sad_grid[PART_ID_Nx2N_L] =
2920 pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
2921 pi4_sad_grid[PART_ID_Nx2N_R] =
2922 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
2923 pi4_sad_grid[PART_ID_2NxN_T] =
2924 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
2925 pi4_sad_grid[PART_ID_2NxN_B] =
2926 pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
2927 pi4_sad_grid[PART_ID_nLx2N_L] =
2928 (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
2929 pi4_sad_grid[PART_ID_nRx2N_R] =
2930 (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
2931 pi4_sad_grid[PART_ID_2NxnU_T] =
2932 (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
2933 pi4_sad_grid[PART_ID_2NxnD_B] =
2934 (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
2935 pi4_sad_grid[PART_ID_2Nx2N] =
2936 pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
2937 pi4_sad_grid[PART_ID_2NxnU_B] =
2938 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
2939 pi4_sad_grid[PART_ID_2NxnD_T] =
2940 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
2941 pi4_sad_grid[PART_ID_nRx2N_L] =
2942 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
2943 pi4_sad_grid[PART_ID_nLx2N_R] =
2944 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
2945 }
2946 }
2947
2948 {
2949 S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
2950 S32 *pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
2951 S32 best_node_cost;
2952 S32 second_best_node_cost;
2953
2954 {
2955 S16 mvdx1, mvdy1;
2956 S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
2957 search_results_t *ps_search_results = ps_result_prms->ps_search_results;
2958 S32 pred_lx = i4_search_idx;
2959
2960 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
2961 pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
2962 search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
2963
2964 S32 inp_shift = 2;
2965 S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
2966 S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
2967 S32 lambda = ps_pred_ctxt->lambda;
2968 S32 rnd = 1 << (lambda_q_shift - 1);
2969 S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
2970 S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
2971 S32 ref_bits =
2972 ps_pred_ctxt
2973 ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
2974
2975 COMPUTE_DIFF_MV(
2976 mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
2977
2978 mvdx1 = ABS(mvdx1);
2979 mvdy1 = ABS(mvdy1);
2980
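/* Approximate MV cost in bits: hme_get_range() gives the magnitude bits of */
/* each MVD component, one sign bit is added per non-zero component, plus */
/* the ref idx bits and a constant; the bit count is then scaled by lambda */
/* in lambda_q_shift Q-format with rounding */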
2981 i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
2982 (mvdy1 > 0) + ref_bits + 2;
2983
2984 i4_mv_cost *= lambda;
2985 i4_mv_cost += rnd;
2986 i4_mv_cost >>= lambda_q_shift;
2987
2988 i4_mv_cost = CLIP_U16(i4_mv_cost);
2989 }
2990
2991 /*For each valid partition, update the refine_prm structure to reflect the best and second
2992 best candidates for that partition*/
2993
2994 for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
2995 {
2996 S32 update_required = 0;
2997 S32 part_id = pi4_valid_part_ids[i4_count];
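/* With more than 8 valid partitions the result arrays are addressed by */
/* part_id directly, otherwise they are packed by valid-part position */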
2998 S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
2999
3000 /*Calculate total cost*/
3001 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3002 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3003
3004 /*****************************************************************/
3005 /* We do not labor through the results if the total cost is worse */
3006 /* than the last of the results. */
3007 /*****************************************************************/
3008 best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_tot_cost[0][index]);
3009 second_best_node_cost = SHRT_MAX;
3010
3011 if(i4_tot_cost < second_best_node_cost)
3012 {
3013 update_required = 0;
3014
3015 /*************************************************************/
3016 /* Identify where the current result is to be placed. Basically */
3017 /* find the node which has cost just higher than the node under test */
3018 /*************************************************************/
3019 if(i4_tot_cost < best_node_cost)
3020 {
3021 update_required = 1;
3022 }
3023 else if(i4_tot_cost == best_node_cost)
3024 {
3025 update_required = 0;
3026 }
3027
3028 if(update_required == 2)
3029 {
3030 ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3031 ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3032 ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
3033 ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
3034 ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
3035 }
3036 else if(update_required == 1)
3037 {
3038 ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3039 ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3040 ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
3041 ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
3042 ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
3043 }
3044 }
3045 }
3046 }
3047 ps_search_node++;
3048 }
3049
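/* For partitions whose best cost was never updated (still at the MAX */
/* sentinel), the MVs are zero; patch in the ref idx of the first search */
/* node so the entry carries a valid reference index */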
3050 {
3051 WORD32 i4_i;
3052 WORD32 part_id;
3053 search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
3054 for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
3055 {
3056 part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
3057 if(ps_mv_refine_ctxt->i2_tot_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
3058 {
3059 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
3060 ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
3061 ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
3062
3063 ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
3064 }
3065 }
3066 }
3067 }
3068
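/**
********************************************************************************
* @fn hme_calc_stim_injected_sad_and_1_best_result(hme_search_prms_t *ps_search_prms,
*                                                  wgt_pred_ctxt_t *ps_wt_inp_prms,
*                                                  err_prms_t *ps_err_prms,
*                                                  result_upd_prms_t *ps_result_prms,
*                                                  U08 **ppu1_ref,
*                                                  S32 i4_ref_stride)
*
* @brief For each search candidate, computes the partition SAD grid, derives a
*        noise (stimulus) term from the source and reference block variances,
*        attenuates the SAD by that term and retains the single best
*        stim-injected cost per valid partition
*
* @param[in] ps_search_prms
* @param[in] ps_wt_inp_prms
* @param[in] ps_err_prms
* @param[out] ps_result_prms
* @param[in] ppu1_ref
* @param[in] i4_ref_stride
*
* @return None
********************************************************************************
*/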
3069 void hme_calc_stim_injected_sad_and_1_best_result(
3070 hme_search_prms_t *ps_search_prms,
3071 wgt_pred_ctxt_t *ps_wt_inp_prms,
3072 err_prms_t *ps_err_prms,
3073 result_upd_prms_t *ps_result_prms,
3074 U08 **ppu1_ref,
3075 S32 i4_ref_stride)
3076 {
3077 mv_refine_ctxt_t *ps_mv_refine_ctxt;
3078 search_node_t *ps_search_node;
3079
3080 S32 i4_candt;
3081 S32 i4_count;
3082 S32 i4_inp_off;
3083 S32 i4_ref_offset;
3084 S32 i4_num_nodes;
3085 ULWORD64 *au8_final_src_sigmaX, *au8_final_src_sigmaXSquared, au8_final_ref_sigmaX[17],
3086 au8_final_ref_sigmaXSquared[17];
3087 UWORD32 au4_4x4_ref_sigmaX[NUM_4X4], au4_4x4_ref_sigmaXSquared[NUM_4X4];
3088 S32 *pi4_valid_part_ids;
3089
3090 S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
3091 S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
3092 WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
3093 WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
3094 WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
3095
3096 ps_mv_refine_ctxt = ps_search_prms->ps_fullpel_refine_ctxt;
3097 i4_num_nodes = ps_search_prms->i4_num_search_nodes;
3098 i4_inp_off = ps_search_prms->i4_cu_x_off;
3099 i4_inp_off += ps_search_prms->i4_cu_y_off * cur_buf_stride;
3100 i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
3101 ps_search_node = ps_search_prms->ps_search_nodes;
3102 pi4_valid_part_ids = &ps_mv_refine_ctxt->ai4_part_id[0];
3103
3104 /* Set local pointer to point to partition level sigma values calculated in hme_refine */
3105 au8_final_src_sigmaX = ps_search_prms->pu8_part_src_sigmaX;
3106 au8_final_src_sigmaXSquared = ps_search_prms->pu8_part_src_sigmaXSquared;
3107
3108 for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3109 {
3110 {
3111 WORD32 b, c, d;
3112 UWORD8 *pu1_cur_ptr;
3113 UWORD8 *pu1_ref_ptr;
3114 UWORD16 au2_4x4_sad[NUM_4X4];
3115
3116 if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
3117 {
3118 continue;
3119 }
3120
3121 ps_err_prms->pu1_inp =
3122 ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
3123 ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
3124 ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
3125 ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
3126
3127 pu1_cur_ptr = ps_err_prms->pu1_inp;
3128 pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
3129
3130 /* Loop to compute the SADs */
3131 {
3132 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
3133 for(b = 0; b < NUM_4X4; b++)
3134 {
3135 WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
3136 WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
3137
3138 for(c = 0; c < NUM_ROWS_IN_4X4; c++)
3139 {
3140 WORD32 z_cur = (cur_buf_stride)*c + t1;
3141 WORD32 z_ref = (ref_buf_stride)*c + t2;
3142 for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
3143 {
3144 au2_4x4_sad[b] += (UWORD16)ABS((
3145 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
3146 }
3147 }
3148 }
3149
3150 /* Compute sigmaX and sigmaX_Squared at 4x4 level for ref from ref_ptr */
3151 hme_compute_sigmaX_and_sigmaXSquared(
3152 pu1_ref_ptr,
3153 ref_buf_stride,
3154 au4_4x4_ref_sigmaX,
3155 au4_4x4_ref_sigmaXSquared,
3156 4,
3157 4,
3158 16,
3159 16,
3160 1,
3161 4);
3162
3163 pi4_sad_grid[PART_ID_NxN_TL] =
3164 (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
3165 pi4_sad_grid[PART_ID_NxN_TR] =
3166 (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
3167 pi4_sad_grid[PART_ID_NxN_BL] =
3168 (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3169 pi4_sad_grid[PART_ID_NxN_BR] =
3170 (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
3171 pi4_sad_grid[PART_ID_Nx2N_L] =
3172 pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
3173 pi4_sad_grid[PART_ID_Nx2N_R] =
3174 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
3175 pi4_sad_grid[PART_ID_2NxN_T] =
3176 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
3177 pi4_sad_grid[PART_ID_2NxN_B] =
3178 pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
3179 pi4_sad_grid[PART_ID_nLx2N_L] =
3180 (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
3181 pi4_sad_grid[PART_ID_nRx2N_R] =
3182 (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
3183 pi4_sad_grid[PART_ID_2NxnU_T] =
3184 (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
3185 pi4_sad_grid[PART_ID_2NxnD_B] =
3186 (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3187 pi4_sad_grid[PART_ID_2Nx2N] =
3188 pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
3189 pi4_sad_grid[PART_ID_2NxnU_B] =
3190 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
3191 pi4_sad_grid[PART_ID_2NxnD_T] =
3192 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
3193 pi4_sad_grid[PART_ID_nRx2N_L] =
3194 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
3195 pi4_sad_grid[PART_ID_nLx2N_R] =
3196 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
3197 }
3198 }
3199
3200 {
3201 S32 i4_sad, i4_mv_cost, i4_tot_cost;
3202 S32 best_node_cost;
3203 S32 second_best_node_cost;
3204 ULWORD64 u8_temp_var, u8_temp_var1;
3205 ULWORD64 u8_ref_X_Square, u8_pure_dist, u8_src_var, u8_ref_var;
3206
3207 {
3208 S16 mvdx1, mvdy1;
3209 S32 i4_search_idx = (S32)ps_result_prms->i1_ref_idx;
3210 search_results_t *ps_search_results = ps_result_prms->ps_search_results;
3211 S32 pred_lx = i4_search_idx;
3212
3213 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[pred_lx];
3214 pred_candt_nodes_t *ps_pred_nodes = &ps_pred_ctxt->as_pred_nodes[PART_2Nx2N];
3215 search_node_t *ps_pred_node_a = ps_pred_nodes->ps_mvp_node;
3216
3217 S32 inp_shift = 2;
3218 S32 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
3219 S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
3220 S32 lambda = ps_pred_ctxt->lambda;
3221 S32 rnd = 1 << (lambda_q_shift - 1);
3222 S32 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
3223 S32 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
3224 S32 ref_bits =
3225 ps_pred_ctxt
3226 ->ppu1_ref_bits_tlu[ps_pred_ctxt->pred_lx][ps_search_node->i1_ref_idx];
3227
3228 COMPUTE_DIFF_MV(
3229 mvdx1, mvdy1, ps_search_node, mv_p_x, mv_p_y, inp_shift, pred_shift);
3230
3231 mvdx1 = ABS(mvdx1);
3232 mvdy1 = ABS(mvdy1);
3233
3234 i4_mv_cost = hme_get_range(mvdx1) + hme_get_range(mvdy1) + (mvdx1 > 0) +
3235 (mvdy1 > 0) + ref_bits + 2;
3236
3237 i4_mv_cost *= lambda;
3238 i4_mv_cost += rnd;
3239 i4_mv_cost >>= lambda_q_shift;
3240
3241 i4_mv_cost = CLIP_U16(i4_mv_cost);
3242 }
3243
3244 for(i4_count = 0; i4_count < ps_mv_refine_ctxt->i4_num_valid_parts; i4_count++)
3245 {
3246 S32 i4_stim_injected_sad;
3247 S32 i4_stim_injected_cost;
3248 S32 i4_noise_term;
3249 unsigned long u4_shift_val;
3250 S32 i4_bits_req;
3251
3252 S32 update_required = 0;
3253 S32 part_id = pi4_valid_part_ids[i4_count];
3254 S32 index = (ps_mv_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
3255
3256 WORD32 i4_q_level = STIM_Q_FORMAT + ALPHA_Q_FORMAT;
3257
3258 S32 i4_inv_wt = ps_wt_inp_prms->a_inv_wpred_wt[ps_search_node->i1_ref_idx];
3259
3260 if(ps_search_prms->i4_alpha_stim_multiplier)
3261 {
3262 /* Compute ref sigmaX and sigmaX_Squared values for valid partitions from previously computed ref 4x4 level values */
3263 hme_compute_final_sigma_of_pu_from_base_blocks(
3264 au4_4x4_ref_sigmaX,
3265 au4_4x4_ref_sigmaXSquared,
3266 au8_final_ref_sigmaX,
3267 au8_final_ref_sigmaXSquared,
3268 16,
3269 4,
3270 part_id,
3271 4);
3272
3273 u8_ref_X_Square =
3274 (au8_final_ref_sigmaX[part_id] * au8_final_ref_sigmaX[part_id]);
3275 u8_ref_var = (au8_final_ref_sigmaXSquared[part_id] - u8_ref_X_Square);
3276
3277 /* Multiply un-normalized src_var with inv_wt if its not same as default wt */
3278 /* and shift the resulting src_var if its more than 27 bits to avoid overflow */
3279 /* The amount by which it is shifted is passed on to u4_shift_val and applied equally on ref_var */
3280 u4_shift_val = ihevce_calc_stim_injected_variance(
3281 au8_final_src_sigmaX,
3282 au8_final_src_sigmaXSquared,
3283 &u8_src_var,
3284 i4_inv_wt,
3285 ps_wt_inp_prms->ai4_shift_val[ps_search_node->i1_ref_idx],
3286 ps_wt_inp_prms->wpred_log_wdc,
3287 part_id);
3288
3289 u8_ref_var = u8_ref_var >> u4_shift_val;
3290
3291 /* Do the same check on ref_var to avoid overflow and apply similar shift on src_var */
3292 GETRANGE64(i4_bits_req, u8_ref_var);
3293
3294 if(i4_bits_req > 27)
3295 {
3296 u8_ref_var = u8_ref_var >> (i4_bits_req - 27);
3297 u8_src_var = u8_src_var >> (i4_bits_req - 27);
3298 }
3299
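/* Noise term in STIM_Q_FORMAT: 2*src_var*ref_var / (src_var^2 + ref_var^2), */
/* a normalized similarity measure that equals 1.0 when the source and */
/* reference variances match; computed with rounding */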
3300 if(u8_src_var == u8_ref_var)
3301 {
3302 u8_temp_var = (1 << STIM_Q_FORMAT);
3303 }
3304 else
3305 {
3306 u8_temp_var = (2 * u8_src_var * u8_ref_var);
3307 u8_temp_var = (u8_temp_var * (1 << STIM_Q_FORMAT));
3308 u8_temp_var1 = (u8_src_var * u8_src_var) + (u8_ref_var * u8_ref_var);
3309 u8_temp_var = (u8_temp_var + (u8_temp_var1 / 2));
3310 u8_temp_var = (u8_temp_var / u8_temp_var1);
3311 }
3312
3313 i4_noise_term = (UWORD32)u8_temp_var;
3314
3315 ASSERT(i4_noise_term >= 0);
3316
3317 i4_noise_term *= ps_search_prms->i4_alpha_stim_multiplier;
3318 }
3319 else
3320 {
3321 i4_noise_term = 0;
3322 }
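/* Attenuate the SAD by the (alpha-scaled) noise term: */
/* stim_sad = (sad * (2^q - noise_term) + 2^(q-1)) >> q, */
/* where q = STIM_Q_FORMAT + ALPHA_Q_FORMAT */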
3323 u8_pure_dist = pi4_sad_grid[part_id];
3324 u8_pure_dist *= ((1 << (i4_q_level)) - (i4_noise_term));
3325 u8_pure_dist += (1 << ((i4_q_level)-1));
3326 i4_stim_injected_sad = (UWORD32)(u8_pure_dist >> (i4_q_level));
3327
3328 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3329 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3330 i4_stim_injected_sad = CLIP3(i4_stim_injected_sad, 0, 0x7fff);
3331 i4_stim_injected_cost = CLIP_S16(i4_stim_injected_sad + i4_mv_cost);
3332
3333 best_node_cost = CLIP_S16(ps_mv_refine_ctxt->i2_stim_injected_cost[0][index]);
3334 second_best_node_cost = SHRT_MAX;
3335
3336 if(i4_stim_injected_cost < second_best_node_cost)
3337 {
3338 update_required = 0;
3339
3340 if(i4_stim_injected_cost < best_node_cost)
3341 {
3342 update_required = 1;
3343 }
3344 else if(i4_stim_injected_cost == best_node_cost)
3345 {
3346 update_required = 0;
3347 }
3348
3349 if(update_required == 2)
3350 {
3351 ps_mv_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3352 ps_mv_refine_ctxt->i2_stim_injected_cost[1][index] = i4_stim_injected_cost;
3353 ps_mv_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3354 ps_mv_refine_ctxt->i2_mv_x[1][index] = ps_search_node->s_mv.i2_mvx;
3355 ps_mv_refine_ctxt->i2_mv_y[1][index] = ps_search_node->s_mv.i2_mvy;
3356 ps_mv_refine_ctxt->i2_ref_idx[1][index] = ps_search_node->i1_ref_idx;
3357 }
3358 else if(update_required == 1)
3359 {
3360 ps_mv_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3361 ps_mv_refine_ctxt->i2_stim_injected_cost[0][index] = i4_stim_injected_cost;
3362 ps_mv_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3363 ps_mv_refine_ctxt->i2_mv_x[0][index] = ps_search_node->s_mv.i2_mvx;
3364 ps_mv_refine_ctxt->i2_mv_y[0][index] = ps_search_node->s_mv.i2_mvy;
3365 ps_mv_refine_ctxt->i2_ref_idx[0][index] = ps_search_node->i1_ref_idx;
3366 }
3367 }
3368 }
3369 }
3370
3371 ps_search_node++;
3372 }
3373
3374 {
3375 WORD32 i4_i;
3376 WORD32 part_id;
3377 search_node_t *ps_search_node = ps_search_prms->ps_search_nodes;
3378 for(i4_i = 0; i4_i < ps_mv_refine_ctxt->i4_num_valid_parts; i4_i++)
3379 {
3380 part_id = ps_mv_refine_ctxt->ai4_part_id[i4_i];
3381 if(ps_mv_refine_ctxt->i2_stim_injected_cost[0][part_id] >= MAX_SIGNED_16BIT_VAL)
3382 {
3383 ASSERT(ps_mv_refine_ctxt->i2_mv_cost[0][part_id] == MAX_SIGNED_16BIT_VAL);
3384 ASSERT(ps_mv_refine_ctxt->i2_mv_x[0][part_id] == 0);
3385 ASSERT(ps_mv_refine_ctxt->i2_mv_y[0][part_id] == 0);
3386
3387 ps_mv_refine_ctxt->i2_ref_idx[0][part_id] = ps_search_node->i1_ref_idx;
3388 }
3389 }
3390 }
3391 }
3392
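/**
********************************************************************************
* @fn hme_calc_sad_and_1_best_result_subpel(err_prms_t *ps_err_prms,
*                                           result_upd_prms_t *ps_result_prms)
*
* @brief Computes the partition SAD grid for a single subpel candidate and
*        updates the best total cost per valid partition, reusing the
*        pre-computed subpel MV cost from the refinement context
*
* @param[in] ps_err_prms
* @param[out] ps_result_prms
*
* @return None
********************************************************************************
*/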
3393 void hme_calc_sad_and_1_best_result_subpel(
3394 err_prms_t *ps_err_prms, result_upd_prms_t *ps_result_prms)
3395 {
3396 S32 i4_candt;
3397 S32 i4_num_nodes;
3398
3399 S32 *pi4_sad_grid = ps_err_prms->pi4_sad_grid;
3400
3401 S32 cur_buf_stride = ps_err_prms->i4_inp_stride;
3402 WORD32 ref_buf_stride = ps_err_prms->i4_ref_stride;
3403 WORD32 cur_buf_stride_ls2 = (cur_buf_stride << 2);
3404 WORD32 ref_buf_stride_ls2 = (ref_buf_stride << 2);
3405
3406 mv_refine_ctxt_t *ps_subpel_refine_ctxt;
3407 ps_subpel_refine_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
3408 i4_num_nodes = 1;
3409
3410 /* Run through each of the candts in a loop */
3411 for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3412 {
3413 /**********************************************************************/
3414 /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID */
3415 /**********************************************************************/
3416 {
3417 WORD32 b, c, d;
3418 UWORD8 *pu1_cur_ptr;
3419 UWORD8 *pu1_ref_ptr;
3420 UWORD16 au2_4x4_sad[NUM_4X4];
3421
3422 pu1_cur_ptr = ps_err_prms->pu1_inp;
3423 pu1_ref_ptr = &ps_err_prms->pu1_ref[0];
3424
3425 /* Loop to compute the SADs */
3426 {
3427 memset(&au2_4x4_sad[0], 0, NUM_4X4 * sizeof(UWORD16));
3428 for(b = 0; b < NUM_4X4; b++)
3429 {
3430 WORD32 t1 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * cur_buf_stride_ls2;
3431 WORD32 t2 = (b % 4) * NUM_PIXELS_IN_ROW + (b >> 2) * ref_buf_stride_ls2;
3432
3433 for(c = 0; c < NUM_ROWS_IN_4X4; c++)
3434 {
3435 WORD32 z_cur = (cur_buf_stride)*c + t1;
3436 WORD32 z_ref = (ref_buf_stride)*c + t2;
3437 for(d = 0; d < NUM_PIXELS_IN_ROW; d++)
3438 {
3439 au2_4x4_sad[b] += (UWORD16)ABS((
3440 ((S32)pu1_ref_ptr[(z_ref + d)]) - ((S32)pu1_cur_ptr[(z_cur + d)])));
3441 }
3442 }
3443 }
3444
3445 pi4_sad_grid[PART_ID_NxN_TL] =
3446 (au2_4x4_sad[0] + au2_4x4_sad[1] + au2_4x4_sad[4] + au2_4x4_sad[5]);
3447 pi4_sad_grid[PART_ID_NxN_TR] =
3448 (au2_4x4_sad[2] + au2_4x4_sad[3] + au2_4x4_sad[6] + au2_4x4_sad[7]);
3449 pi4_sad_grid[PART_ID_NxN_BL] =
3450 (au2_4x4_sad[8] + au2_4x4_sad[9] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3451 pi4_sad_grid[PART_ID_NxN_BR] =
3452 (au2_4x4_sad[10] + au2_4x4_sad[11] + au2_4x4_sad[14] + au2_4x4_sad[15]);
3453 pi4_sad_grid[PART_ID_Nx2N_L] =
3454 pi4_sad_grid[PART_ID_NxN_TL] + pi4_sad_grid[PART_ID_NxN_BL];
3455 pi4_sad_grid[PART_ID_Nx2N_R] =
3456 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_BR];
3457 pi4_sad_grid[PART_ID_2NxN_T] =
3458 pi4_sad_grid[PART_ID_NxN_TR] + pi4_sad_grid[PART_ID_NxN_TL];
3459 pi4_sad_grid[PART_ID_2NxN_B] =
3460 pi4_sad_grid[PART_ID_NxN_BR] + pi4_sad_grid[PART_ID_NxN_BL];
3461 pi4_sad_grid[PART_ID_nLx2N_L] =
3462 (au2_4x4_sad[8] + au2_4x4_sad[0] + au2_4x4_sad[12] + au2_4x4_sad[4]);
3463 pi4_sad_grid[PART_ID_nRx2N_R] =
3464 (au2_4x4_sad[3] + au2_4x4_sad[7] + au2_4x4_sad[15] + au2_4x4_sad[11]);
3465 pi4_sad_grid[PART_ID_2NxnU_T] =
3466 (au2_4x4_sad[1] + au2_4x4_sad[0] + au2_4x4_sad[2] + au2_4x4_sad[3]);
3467 pi4_sad_grid[PART_ID_2NxnD_B] =
3468 (au2_4x4_sad[15] + au2_4x4_sad[14] + au2_4x4_sad[12] + au2_4x4_sad[13]);
3469 pi4_sad_grid[PART_ID_2Nx2N] =
3470 pi4_sad_grid[PART_ID_2NxN_T] + pi4_sad_grid[PART_ID_2NxN_B];
3471 pi4_sad_grid[PART_ID_2NxnU_B] =
3472 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnU_T];
3473 pi4_sad_grid[PART_ID_2NxnD_T] =
3474 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_2NxnD_B];
3475 pi4_sad_grid[PART_ID_nRx2N_L] =
3476 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nRx2N_R];
3477 pi4_sad_grid[PART_ID_nLx2N_R] =
3478 pi4_sad_grid[PART_ID_2Nx2N] - pi4_sad_grid[PART_ID_nLx2N_L];
3479 }
3480 }
3481 /**********************************************************************/
3482 /* COMPUTE THE COSTS AND UPDATE THE BEST RESULTS */
3483 /**********************************************************************/
3484 {
3485 S32 i4_count = 0, i4_sad, i4_mv_cost, i4_tot_cost;
3486 S32 *pi4_valid_part_ids = &ps_subpel_refine_ctxt->ai4_part_id[0];
3487 S32 best_node_cost;
3488 S32 second_best_node_cost;
3489
3490 /*For each valid partition, update the refine_prm structure to reflect the best and second
3491 best candidates for that partition*/
3492
3493 for(i4_count = 0; i4_count < ps_subpel_refine_ctxt->i4_num_valid_parts; i4_count++)
3494 {
3495 S32 update_required = 0;
3496 S32 part_id = pi4_valid_part_ids[i4_count];
3497 S32 index = (ps_subpel_refine_ctxt->i4_num_valid_parts > 8) ? part_id : i4_count;
3498
3499 /* Use a pre-computed cost instead of freshly evaluating subpel cost */
3500 i4_mv_cost = ps_subpel_refine_ctxt->i2_mv_cost[0][index];
3501
3502 /*Calculate total cost*/
3503 i4_sad = CLIP3(pi4_sad_grid[part_id], 0, 0x7fff);
3504 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
3505
3506 /*****************************************************************/
3507 /* We do not labor through the results if the total cost is worse */
3508 /* than the last of the results. */
3509 /*****************************************************************/
3510 best_node_cost = CLIP_S16(ps_subpel_refine_ctxt->i2_tot_cost[0][index]);
3511 second_best_node_cost = SHRT_MAX;
3512
3513 if(i4_tot_cost < second_best_node_cost)
3514 {
3515 update_required = 0;
3516
3517 /*************************************************************/
3518 /* Identify where the current result is to be placed. Basically */
3519 /* find the node which has cost just higher than the node under test */
3520 /*************************************************************/
3521 if(i4_tot_cost < best_node_cost)
3522 {
3523 update_required = 1;
3524 }
3525 else if(i4_tot_cost == ps_subpel_refine_ctxt->i2_tot_cost[0][index])
3526 {
3527 update_required = 0;
3528 }
3529 if(update_required == 2)
3530 {
3531 ps_subpel_refine_ctxt->i2_tot_cost[1][index] = i4_tot_cost;
3532 ps_subpel_refine_ctxt->i2_mv_cost[1][index] = i4_mv_cost;
3533 ps_subpel_refine_ctxt->i2_mv_x[1][index] = ps_result_prms->i2_mv_x;
3534 ps_subpel_refine_ctxt->i2_mv_y[1][index] = ps_result_prms->i2_mv_y;
3535 ps_subpel_refine_ctxt->i2_ref_idx[1][index] = ps_result_prms->i1_ref_idx;
3536 }
3537 else if(update_required == 1)
3538 {
3539 ps_subpel_refine_ctxt->i2_tot_cost[0][index] = i4_tot_cost;
3540 ps_subpel_refine_ctxt->i2_mv_cost[0][index] = i4_mv_cost;
3541 ps_subpel_refine_ctxt->i2_mv_x[0][index] = ps_result_prms->i2_mv_x;
3542 ps_subpel_refine_ctxt->i2_mv_y[0][index] = ps_result_prms->i2_mv_y;
3543 ps_subpel_refine_ctxt->i2_ref_idx[0][index] = ps_result_prms->i1_ref_idx;
3544 }
3545 }
3546 }
3547 }
3548 }
3549
3550 {
3551 WORD32 i4_count = 0;
3552 for(i4_count = 0; i4_count < TOT_NUM_PARTS; i4_count++)
3553 {
3554 if(ps_subpel_refine_ctxt->i2_tot_cost[0][i4_count] >= MAX_SIGNED_16BIT_VAL)
3555 {
3556 ps_subpel_refine_ctxt->ai2_fullpel_satd[0][i4_count] = MAX_SIGNED_16BIT_VAL;
3557 }
3558 }
3559 }
3560 }
3561
3562 /**
3563 ********************************************************************************
3564 * @fn hme_calc_pt_sad_and_result_explicit(hme_search_prms_t *ps_search_prms,
3565 * wgt_pred_ctxt_t *ps_wt_inp_prms,
3566 * err_prms_t *ps_err_prms,
3567 * result_upd_prms_t *ps_result_prms,
3568 * U08 **ppu1_ref,
3569 * S32 i4_ref_stride)
3570 *
3571 * @brief Run through the provided candidates, compute the point SAD and
3572 * cost for each, and update the results in order
3573 *
3574 * @param[in] ps_search_prms
3575 * @param[in] ps_wt_inp_prms
3576 * @param[in] ps_err_prms
3577 * @param[out] ps_result_prms
3578 * @param[in] ppu1_ref
3579 * @param[in] i4_ref_stride
3580 *
3581 * @return None
3582 ********************************************************************************
3583 */
3584
3585 void hme_calc_pt_sad_and_result_explicit(
3586 hme_search_prms_t *ps_search_prms,
3587 wgt_pred_ctxt_t *ps_wt_inp_prms,
3588 err_prms_t *ps_err_prms,
3589 result_upd_prms_t *ps_result_prms,
3590 U08 **ppu1_ref,
3591 S32 i4_ref_stride)
3592 {
3593 WORD32 i4_grid_mask, i4_part_mask, i4_num_results, i4_candt, i4_num_nodes;
3594 WORD32 i4_inp_stride, i4_inp_off, i4_ref_offset;
3595
3596 search_node_t *ps_search_node;
3597 BLK_SIZE_T e_blk_size;
3598 PF_SAD_FXN_T pf_sad_fxn;
3599 PF_RESULT_FXN_T pf_hme_result_fxn;
3600
3601 i4_grid_mask = 0x1; /* Point SAD */
3602
3603 /* Get the parameters required */
3604 i4_part_mask = ps_search_prms->i4_part_mask;
3605 e_blk_size = ps_search_prms->e_blk_size;
3606 i4_num_results = (S32)ps_search_prms->ps_search_results->u1_num_results_per_part;
3607 i4_num_nodes = ps_search_prms->i4_num_search_nodes;
3608 ps_search_node = ps_search_prms->ps_search_nodes;
3609
3610 i4_inp_stride = ps_search_prms->i4_inp_stride;
3611 /* Move to the location of the search blk in inp buffer */
3612 i4_inp_off = ps_search_prms->i4_cu_x_off;
3613 i4_inp_off += ps_search_prms->i4_cu_y_off * i4_inp_stride;
3614 i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
3615
3616 pf_sad_fxn = hme_get_sad_fxn(e_blk_size, i4_grid_mask, i4_part_mask);
3617 /**********************************************************************/
3618 /* we have a sparsely populated SAD grid of size 9x17. */
3619 /* the id of the results in the grid is shown */
3620 /* 5 2 6 */
3621 /* 1 0 3 */
3622 /* 7 4 8 */
3623 /* The motivation for choosing a grid like this is that */
3624 /* in case of no refinement, the central location is */
3625 /* the first entry in the grid */
3626 /* Also for diamond, the 4 entries get considered first */
3627 /* This is consistent with the diamond notation used in */
3628 /* subpel refinement. To Check */
3629 /* Update the results for the given search candt */
3630 /* returns the cost of the 2Nx2N partition */
3631 /**********************************************************************/
3632
3633 /* Get the modified result-update function, with CLIP16 of the cost, to */
3634 /* match the SIMD path */
3635 pf_hme_result_fxn = hme_update_results_grid_pu_bestn_no_encode;
3636
3637 for(i4_candt = 0; i4_candt < i4_num_nodes; i4_candt++)
3638 {
3639 if(ps_search_node->s_mv.i2_mvx == INTRA_MV)
3640 continue;
3641
3642 /* Initialize the minimum cost for this candidate. As we search around */
3643 /* this candidate, it is used for early exit when, in a given iteration, */
3644 /* the center point of the grid has the lowest value */
3645 ps_result_prms->i4_min_cost = MAX_32BIT_VAL;
3646
3647 ps_err_prms->pu1_inp = ps_wt_inp_prms->apu1_wt_inp[ps_search_node->i1_ref_idx] + i4_inp_off;
3648 ps_err_prms->i4_grid_mask = i4_grid_mask;
3649
3650 ps_err_prms->pu1_ref = ppu1_ref[ps_search_node->i1_ref_idx] + i4_ref_offset;
3651 ps_err_prms->pu1_ref += ps_search_node->s_mv.i2_mvx;
3652 ps_err_prms->pu1_ref += (ps_search_node->s_mv.i2_mvy * i4_ref_stride);
3653
3654 /**********************************************************************/
3655 /* CALL THE FUNCTION THAT COMPUTES THE SAD AND UPDATES THE SAD GRID */
3656 /**********************************************************************/
3657 pf_sad_fxn(ps_err_prms);
3658
3659 /**********************************************************************/
3660 /* CALL THE FUNCTION THAT UPDATES THE BEST RESULTS */
3661 /**********************************************************************/
3662 ps_result_prms->i4_grid_mask = i4_grid_mask;
3663 ps_result_prms->ps_search_node_base = ps_search_node;
3664 pf_hme_result_fxn(ps_result_prms);
3665
3666 ps_search_node++;
3667 }
3668 }
3669
3670 /**
3671 ********************************************************************************
3672 * @fn hme_set_mvp_node(search_results_t *ps_search_results,
3673 * search_node_t *ps_candt_prj_coloc,
3674 * S08 i1_ref_idx)
3675 *
3676 * @brief Set node used for motion vector predictor computation
3677 * Either the TR or the L candidate is compared to the projected
3678 * colocated candidate, and the closest one is chosen as the MVP
3679 *
3680 * @param[in] ps_search_results
3681 *
3682 * @param[in] ps_candt_prj_coloc
3683 *
3684 * @param[in] i1_ref_idx
3685 *
3686 * @return None
3687 ********************************************************************************
3688 */
3689 void hme_set_mvp_node(
3690 search_results_t *ps_search_results,
3691 search_node_t *ps_candt_prj_coloc,
3692 U08 u1_pred_lx,
3693 U08 u1_default_ref_id)
3694 {
3695 S32 i;
3696 pred_ctxt_t *ps_pred_ctxt = &ps_search_results->as_pred_ctxt[u1_pred_lx];
3697 pred_candt_nodes_t *ps_pred_nodes = ps_pred_ctxt->as_pred_nodes;
3698 search_node_t *ps_pred_node_a = NULL, *ps_pred_node_b = NULL;
3699
3700 S32 inp_shift = 2;
3701 S32 pred_shift;
3702 S32 ref_bits;
3703 S32 mv_p_x, mv_p_y;
3704 S16 mvdx1, mvdx2, mvdy1, mvdy2;
3705
3706 ref_bits = ps_pred_ctxt->ppu1_ref_bits_tlu[u1_pred_lx][u1_default_ref_id];
3707
3708 /*************************************************************************/
3709 /* Priority is given to bottom-left availability, else we go to the left. */
3710 /* If neither is available, then A remains NULL */
3711 /*************************************************************************/
3712 if(ps_pred_nodes->ps_l->u1_is_avail)
3713 {
3714 ps_pred_node_a = ps_pred_nodes->ps_l;
3715 }
3716
3717 if((!(ps_pred_ctxt->proj_used) && (ps_pred_nodes->ps_tr->u1_is_avail)))
3718 {
3719 ps_pred_node_b = ps_pred_nodes->ps_tr;
3720 }
3721 else
3722 {
3723 ps_pred_node_b = ps_pred_nodes->ps_coloc;
3724 ps_pred_node_b->s_mv = ps_pred_node_b->ps_mv[0];
3725 }
3726
3727 if(ps_pred_node_a == NULL)
3728 {
3729 ps_pred_node_a = ps_pred_nodes->ps_coloc;
3730 ps_pred_node_a->s_mv = ps_pred_node_a->ps_mv[0];
3731
3732 if(ps_pred_node_b == ps_pred_nodes->ps_coloc)
3733 {
3734 ps_pred_node_b = ps_pred_nodes->ps_zeromv;
3735 ps_pred_node_b->s_mv = ps_pred_node_b->ps_mv[0];
3736 }
3737 }
3738
3739 if(ps_pred_node_a->i1_ref_idx != u1_default_ref_id)
3740 {
3741 SCALE_FOR_POC_DELTA(
3742 mv_p_x, mv_p_y, ps_pred_node_a, u1_default_ref_id, ps_pred_ctxt->pi2_ref_scf);
3743 }
3744 else
3745 {
3746 mv_p_x = ps_pred_node_a->s_mv.i2_mvx;
3747 mv_p_y = ps_pred_node_a->s_mv.i2_mvy;
3748 }
3749 pred_shift = ps_pred_node_a->u1_subpel_done ? 0 : 2;
3750 COMPUTE_MV_DIFFERENCE(mvdx1, mvdy1, ps_candt_prj_coloc, mv_p_x, mv_p_y, inp_shift, pred_shift);
3751 mvdx1 = ABS(mvdx1);
3752 mvdy1 = ABS(mvdy1);
3753
3754 if(ps_pred_node_b->i1_ref_idx != u1_default_ref_id)
3755 {
3756 SCALE_FOR_POC_DELTA(
3757 mv_p_x, mv_p_y, ps_pred_node_b, u1_default_ref_id, ps_pred_ctxt->pi2_ref_scf);
3758 }
3759 else
3760 {
3761 mv_p_x = ps_pred_node_b->s_mv.i2_mvx;
3762 mv_p_y = ps_pred_node_b->s_mv.i2_mvy;
3763 }
3764 pred_shift = ps_pred_node_b->u1_subpel_done ? 0 : 2;
3765 COMPUTE_MV_DIFFERENCE(mvdx2, mvdy2, ps_candt_prj_coloc, mv_p_x, mv_p_y, inp_shift, pred_shift);
3766 mvdx2 = ABS(mvdx2);
3767 mvdy2 = ABS(mvdy2);
3768
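/* Pick as MVP, for all partitions, the candidate whose MV is closer (in */
/* sum of absolute MVD components) to the projected colocated candidate */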
3769 if((mvdx1 + mvdy1) < (mvdx2 + mvdy2))
3770 {
3771 for(i = 0; i < TOT_NUM_PARTS; i++)
3772 {
3773 ps_pred_nodes[i].ps_mvp_node = ps_pred_node_a;
3774 }
3775 }
3776 else
3777 {
3778 for(i = 0; i < TOT_NUM_PARTS; i++)
3779 {
3780 ps_pred_nodes[i].ps_mvp_node = ps_pred_node_b;
3781 }
3782 }
3783 }
3784