xref: /aosp_15_r20/external/libavc/encoder/svc/isvce_core_coding.c (revision 495ae853bb871d1e5a258cb02c2cc13cde8ddb9a)
1 /******************************************************************************
2  *
3  * Copyright (C) 2022 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19  */
20 
21 /**
22  *******************************************************************************
23  * @file
24  *  isvce_core_coding.c
25  *
26  * @brief
27  *  This file contains routines that perform luma and chroma core coding for
28  *  intra macroblocks
29  *
30  * @author
31  *  ittiam
32  *
33  * @par List of Functions:
34  *  - isvce_pack_l_mb_i16()
35  *  - isvce_pack_c_mb_i8()
36  *  - isvce_code_luma_intra_macroblock_16x16()
37  *  - isvce_code_luma_intra_macroblock_4x4()
38  *  - isvce_code_chroma_intra_macroblock_8x8()
39  *
40  * @remarks
41  *  None
42  *
43  *******************************************************************************
44  */
45 
46 /*****************************************************************************/
47 /* File Includes                                                             */
48 /*****************************************************************************/
49 
50 /* System include files */
51 #include <stdio.h>
52 #include <string.h>
53 #include <assert.h>
54 
55 /* User include files */
56 #include "ih264_typedefs.h"
57 #include "ih264_debug.h"
58 #include "ih264_platform_macros.h"
59 #include "iv2.h"
60 #include "ive2.h"
61 #include "isvc_macros.h"
62 #include "isvc_defs.h"
63 #include "ih264e_config.h"
64 #include "isvce_defs.h"
65 #include "ih264_trans_data.h"
66 #include "ih264e_error.h"
67 #include "ih264e_bitstream.h"
68 #include "ime_distortion_metrics.h"
69 #include "ime_defs.h"
70 #include "ime_structs.h"
71 #include "isvc_structs.h"
72 #include "isvc_trans_quant_itrans_iquant.h"
73 #include "isvc_inter_pred_filters.h"
74 #include "isvc_mem_fns.h"
75 #include "ih264_padding.h"
76 #include "ih264_intra_pred_filters.h"
77 #include "ih264_deblk_edge_filters.h"
78 #include "isvc_cabac_tables.h"
79 #include "irc_cntrl_param.h"
80 #include "irc_frame_info_collector.h"
81 #include "isvce_rate_control.h"
82 #include "isvce_cabac_structs.h"
83 #include "isvce_structs.h"
84 #include "isvce_globals.h"
85 #include "isvce_core_coding.h"
86 #include "isvce_mc.h"
87 #include "isvce_ibl_eval.h"
88 
89 /*****************************************************************************/
90 /* Function Definitions                                                      */
91 /*****************************************************************************/
92 
93 /**
94 *******************************************************************************
95 *
96 * @brief
97 *  This function performs does the DCT transform then Hadamard transform
98 *  and quantization for a macroblock when the mb mode is intra 16x16 mode
99 *
100 * @par Description:
101 *  First  cf4 is done on all 16 4x4 blocks of the 16x16 input block.
102 *  Then hadamard transform is done on the DC coefficients
103 *  Quantization is then performed on the 16x16 block, 4x4 wise
104 *
105 * @param[in] pu1_src
106 *  Pointer to source sub-block
107 *
108 * @param[in] pu1_pred
109 *  Pointer to prediction sub-block
110 *
111 * @param[in] pi2_out
112 *  Pointer to residual sub-block
113 *  The output will be in linear format
114 *  The first 16 continuous locations will contain the values of Dc block
115 *  After DC block and a stride 1st AC block will follow
116 *  After one more stride next AC block will follow
117 *  The blocks will be in raster scan order
118 *
119 * @param[in] i4_src_stride
120 *  Source stride
121 *
122 * @param[in] i4_pred_stride
123 *  Prediction stride
124 *
125 * @param[in] dst_strd
126 *  Destination stride
127 *
128 * @param[in] pu2_scale_matrix
129 *  The quantization matrix for 4x4 transform
130 *
131 * @param[in] pu2_threshold_matrix
132 *  Threshold matrix
133 *
134 * @param[in] u4_qbits
135 *  15+QP/6
136 *
137 * @param[in] u4_round_factor
138 *  Round factor for quant
139 *
140 * @param[out] pu1_nnz
141 *  Memory to store the non-zeros after transform
142 *  The first byte will be the nnz of DC block
143 *  From the next byte the AC nnzs will be stored in raster scan order
144 *
145 * @param u4_dc_flag
146 *  Signals if Dc transform is to be done or not
147 *   1 -> Dc transform will be done
148 *   0 -> Dc transform will not be done
149 *
150 * @remarks
151 *
152 *******************************************************************************
153 */
isvce_luma_16x16_resi_trans_dctrans_quant(buffer_container_t * ps_src,buffer_container_t * ps_pred,buffer_container_t * ps_quant_coeffs,buffer_container_t * ps_upsampled_res,isa_dependent_fxns_t * ps_isa_dependent_fxns,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD8 * pu1_nnz,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD32 u4_dc_flag,UWORD8 u1_use_upsampled_res)154 void isvce_luma_16x16_resi_trans_dctrans_quant(
155     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_quant_coeffs,
156     buffer_container_t *ps_upsampled_res, isa_dependent_fxns_t *ps_isa_dependent_fxns,
157     const UWORD16 *pu2_scale_matrix, const UWORD16 *pu2_threshold_matrix, UWORD8 *pu1_nnz,
158     UWORD32 u4_qbits, UWORD32 u4_round_factor, UWORD32 u4_dc_flag, UWORD8 u1_use_upsampled_res)
159 {
160     WORD32 blk_cntr;
161     WORD32 i4_offsetx, i4_offsety;
162 
163     enc_loop_fxns_t *ps_enc_loop_fxns = &ps_isa_dependent_fxns->s_enc_loop_fxns;
164     buffer_container_t s_src = ps_src[0];
165     buffer_container_t s_pred = ps_pred[0];
166     buffer_container_t s_quant_coeffs = ps_quant_coeffs[0];
167     buffer_container_t s_upsampled_res = {0};
168     resi_trans_quant_constants_t s_resi_trans_quant_constants = {
169         .pu2_scale_matrix = pu2_scale_matrix,
170         .pu2_threshold_matrix = pu2_threshold_matrix,
171         .u4_qbits = u4_qbits,
172         .u4_round_factor = u4_round_factor};
173 
174     UWORD8 u1_resi_trans_fxn_idx = isvc_get_resi_trans_quant_variant_idx(u1_use_upsampled_res);
175 
176     /* Move to the ac addresses */
177     pu1_nnz++;
178     s_quant_coeffs.pv_data = ((WORD16 *) s_quant_coeffs.pv_data) + s_quant_coeffs.i4_data_stride;
179 
180     if(u1_use_upsampled_res)
181     {
182         s_upsampled_res = ps_upsampled_res[0];
183     }
184 
185     for(blk_cntr = 0; blk_cntr < NUM_LUMA4x4_BLOCKS_IN_MB; blk_cntr++)
186     {
187         IND2SUB_LUMA_MB(blk_cntr, i4_offsetx, i4_offsety);
188 
189         s_src.pv_data =
190             ((UWORD8 *) ps_src[0].pv_data) + i4_offsetx + i4_offsety * ps_src[0].i4_data_stride;
191         s_pred.pv_data =
192             ((UWORD8 *) ps_pred[0].pv_data) + i4_offsetx + i4_offsety * ps_pred[0].i4_data_stride;
193         s_quant_coeffs.pv_data =
194             ((WORD16 *) ps_quant_coeffs[0].pv_data) + blk_cntr * ps_quant_coeffs[0].i4_data_stride;
195 
196         if(u1_use_upsampled_res)
197         {
198             s_upsampled_res.pv_data = ((WORD16 *) ps_upsampled_res[0].pv_data) + i4_offsetx +
199                                       i4_offsety * ps_upsampled_res[0].i4_data_stride;
200         }
201 
202         /* Move to the ac addresses */
203         s_quant_coeffs.pv_data =
204             ((WORD16 *) s_quant_coeffs.pv_data) + ps_quant_coeffs[0].i4_data_stride;
205 
206         s_quant_coeffs.i4_data_stride = 4;
207 
208         ps_enc_loop_fxns->apf_resi_trans_quant_4x4[u1_resi_trans_fxn_idx](
209             &s_src, &s_pred, &s_quant_coeffs, &s_upsampled_res, &s_resi_trans_quant_constants,
210             &pu1_nnz[blk_cntr], ((WORD16 *) ps_quant_coeffs->pv_data) + blk_cntr,
211             u1_use_upsampled_res);
212     }
213 
214     if(!u4_dc_flag)
215     {
216         return;
217     }
218 
219     /*
220      * In case of i16x16, we need to remove the contribution of dc coeffs into
221      * nnz of each block. We are doing that in the packing function
222      */
223 
224     /* Adjust pointers to point to dc values */
225     s_quant_coeffs = ps_quant_coeffs[0];
226     pu1_nnz--;
227 
228     u4_qbits++;
229     u4_round_factor <<= 1;
230 
231     ps_enc_loop_fxns->pf_hadamard_quant_4x4(((WORD16 *) s_quant_coeffs.pv_data),
232                                             ((WORD16 *) s_quant_coeffs.pv_data),
233                                             &s_resi_trans_quant_constants, &pu1_nnz[0]);
234 }
235 
236 /**
237 *******************************************************************************
238 *
239 * @brief
240 *  This function performs the intra 16x16 inverse transform process for H264
241 *  it includes inverse Dc transform, inverse quant and then inverse transform
242 *
243 * @par Description:
244 *
245 * @param[in] pi2_src
246 *  Input data, 16x16 size
247 *  First 16 mem locations will have the Dc coffs in rater scan order in linear
248 *fashion after a stride 1st AC clock will be present again in raster can order
249 *  Then each AC block of the 16x16 block will follow in raster scan order
250 *
251 * @param[in] pu1_pred
252 *  The predicted data, 16x16 size
253 *  Block by block form
254 *
255 * @param[in] pu1_out
256 *  Output 16x16
257 *  In block by block form
258 *
259 * @param[in] i4_src_stride
260 *  Source stride
261 *
262 * @param[in] i4_pred_stride
263 *  input stride for prediction buffer
264 *
265 * @param[in] i4_out_stride
266 *  input stride for output buffer
267 *
268 * @param[in] pu2_iscale_mat
269 *  Inverse quantization matrix for 4x4 transform
270 *
271 * @param[in] pu2_weigh_mat
272 *  weight matrix of 4x4 transform
273 *
274 * @param[in] u4_qp_div_6
275 *  QP/6
276 *
277 * @param[in] pi4_tmp
278 *  Input temporary buffer
279 *  needs to be at least 20 in size
280 *
281 * @param[in] pu4_cntrl
282 *  Controls the transform path
283 *  total Last 17 bits are used
284 *  the 16th th bit will correspond to DC block
285 *  and 32-17 will correspond to the ac blocks in raster scan order
286 *  bit equaling zero indicates that the entire 4x4 block is zero for DC
287 *  For AC blocks a bit equaling zero will mean that all 15 AC coffs of the block
288 *is nonzero
289 *
290 * @param[in] pi4_tmp
291 *  Input temporary buffer
292 *  needs to be at least COFF_CNT_SUB_BLK_4x4+COFF_CNT_SUB_BLK_4x4 size
293 *
294 * @returns
295 *  none
296 *
297 * @remarks
298 *  The all zero case must be taken care outside
299 *
300 *******************************************************************************
301 */
isvce_luma_16x16_idctrans_iquant_itrans_recon(buffer_container_t * ps_src,buffer_container_t * ps_pred,buffer_container_t * ps_recon,buffer_container_t * ps_res,buffer_container_t * ps_res_pred,iq_it_res_rec_constants_t * ps_iq_it_res_rec_constants,isa_dependent_fxns_t * ps_isa_dependent_fxns,WORD32 * pi4_tmp,UWORD32 u4_cntrl,UWORD32 u4_dc_trans_flag,UWORD8 u1_res_accumulate)302 void isvce_luma_16x16_idctrans_iquant_itrans_recon(
303     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_recon,
304     buffer_container_t *ps_res, buffer_container_t *ps_res_pred,
305     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
306     isa_dependent_fxns_t *ps_isa_dependent_fxns, WORD32 *pi4_tmp, UWORD32 u4_cntrl,
307     UWORD32 u4_dc_trans_flag, UWORD8 u1_res_accumulate)
308 {
309     /* Cntrl bits for 4x4 transforms
310      * u4_blk_cntrl       : controls if a 4x4 block should be processed in ac path
311      * u4_dc_cntrl        : controls is a 4x4 block is to be processed in dc path
312      *                    : dc block must contain only single dc coefficient
313      * u4_empty_blk_cntrl : control fot 4x4 block with no coeffs, ie no dc and ac
314      *                    : ie not (ac or dc)
315      */
316     UWORD32 u4_blk_cntrl, u4_dc_cntrl, u4_empty_blk_cntrl;
317     UWORD32 u4_blk_id;
318     WORD32 i4_offset_x, i4_offset_y;
319     UWORD32 u4_dc_inc;
320     WORD16 *pi2_dc_src;
321 
322     enc_loop_fxns_t *ps_enc_loop_fxns = &ps_isa_dependent_fxns->s_enc_loop_fxns;
323     buffer_container_t s_src = ps_src[0];
324     buffer_container_t s_pred = ps_pred[0];
325     buffer_container_t s_recon = ps_recon[0];
326     buffer_container_t s_res = ps_res[0];
327     buffer_container_t s_res_pred = ps_res_pred[0];
328 
329     /* Start index for inverse quant in a 4x4 block */
330     WORD32 i4_iq_start_idx = (u4_dc_trans_flag == 0) ? 0 : 1;
331     const UWORD16 *pu2_iscale_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
332     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
333     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
334     UWORD8 u1_iq_it_recon_fxn_idx =
335         isvc_get_iq_it_recon_variant_idx(!!u4_dc_trans_flag, u1_res_accumulate);
336 
337     /*
338      * For intra blocks we need to do inverse dc transform
339      * In case if intra blocks, its here that we populate the dc bits in cntrl
340      * as they cannot be populated any earlier
341      */
342     if(u4_dc_trans_flag)
343     {
344         UWORD32 cntr, u4_dc_cntrl;
345 
346         /* Do inv hadamard and place the results at the start of each AC block */
347         ps_enc_loop_fxns->pf_ihadamard_scaling_4x4(ps_src->pv_data, ps_src->pv_data, pu2_iscale_mat,
348                                                    pu2_weigh_mat, u4_qp_div_6, pi4_tmp);
349 
350         /* Update the cntrl flag */
351         u4_dc_cntrl = 0;
352 
353         for(cntr = 0; cntr < DC_COEFF_CNT_LUMA_MB; cntr++)
354         {
355             u4_dc_cntrl |= ((((WORD16 *) ps_src->pv_data)[cntr] != 0) << (15 - cntr));
356         }
357 
358         /* Mark dc bits as 1 if corresponding ac bit is 0 */
359         u4_dc_cntrl = (~(u4_cntrl >> 16) & u4_dc_cntrl);
360 
361         /* Combine both ac and dc bits */
362         u4_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_LUMA) | (u4_dc_cntrl & CNTRL_FLAG_DC_MASK_LUMA);
363     }
364 
365     /* Source for dc coeffs
366      * If the block is intra, we have to read dc values from first row of src
367      * then stride for each block is 1, other wise its src stride
368      */
369     pi2_dc_src = ((WORD16 *) ps_src->pv_data) + (i4_iq_start_idx == 0) * ps_src->i4_data_stride;
370     u4_dc_inc = (i4_iq_start_idx == 0) ? ps_src->i4_data_stride : 1;
371 
372     /* Get the block bits */
373     u4_blk_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_LUMA);
374     u4_dc_cntrl = (u4_cntrl & CNTRL_FLAG_DC_MASK_LUMA) << 16;
375     u4_empty_blk_cntrl = (~(u4_dc_cntrl | u4_blk_cntrl)) & 0xFFFF0000;
376 
377     /* Get first block to process */
378     DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
379 
380     while(u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
381     {
382         /* Compute address of src blocks */
383         WORD32 i4_src_offset = u4_dc_inc * u4_blk_id;
384 
385         /* Tx blk coeffs are stored blk by blk */
386         /* Hence, in order to access rows of each Tx blk, one needs to stride of
387          * TxxSize */
388         s_src.i4_data_stride = 4;
389         s_src.pv_data = pi2_dc_src + i4_src_offset;
390 
391         IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
392 
393         /* Compute address of out and pred blocks */
394         s_pred.pv_data =
395             ((UWORD8 *) ps_pred->pv_data) + i4_offset_x + i4_offset_y * ps_pred->i4_data_stride;
396         s_recon.pv_data =
397             ((UWORD8 *) ps_recon->pv_data) + i4_offset_x + i4_offset_y * ps_recon->i4_data_stride;
398         s_res.pv_data =
399             ((WORD16 *) ps_res->pv_data) + i4_offset_x + i4_offset_y * ps_res->i4_data_stride;
400         s_res_pred.pv_data = ((WORD16 *) ps_res_pred->pv_data) + i4_offset_x +
401                              i4_offset_y * ps_res_pred->i4_data_stride;
402 
403         ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4_dc[u1_iq_it_recon_fxn_idx](
404             &s_src, &s_pred, &s_res_pred, &s_res, &s_recon, ps_iq_it_res_rec_constants, NULL,
405             pi2_dc_src + i4_src_offset, i4_iq_start_idx, u1_res_accumulate);
406 
407         DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
408     }
409 
410     /* now process ac/mixed blocks */
411     DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
412     while(u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
413     {
414         IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
415 
416         /* Tx blk coeffs are stored blk by blk */
417         /* Hence, in order to access rows of each Tx blk, one needs to stride of
418          * TxxSize */
419         s_src.i4_data_stride = 4;
420         /* The AC blocks starts from 2nd row */
421         s_src.pv_data = ((WORD16 *) ps_src->pv_data) + (u4_blk_id + 1) * ps_src->i4_data_stride;
422 
423         s_pred.pv_data =
424             ((UWORD8 *) ps_pred->pv_data) + i4_offset_x + i4_offset_y * ps_pred->i4_data_stride;
425         s_recon.pv_data =
426             ((UWORD8 *) ps_recon->pv_data) + i4_offset_x + i4_offset_y * ps_recon->i4_data_stride;
427         s_res.pv_data =
428             ((WORD16 *) ps_res->pv_data) + i4_offset_x + i4_offset_y * ps_res->i4_data_stride;
429         s_res_pred.pv_data = ((WORD16 *) ps_res_pred->pv_data) + i4_offset_x +
430                              i4_offset_y * ps_res_pred->i4_data_stride;
431 
432         ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4[u1_iq_it_recon_fxn_idx](
433             &s_src, &s_pred, &s_res_pred, &s_res, &s_recon, ps_iq_it_res_rec_constants,
434             (WORD16 *) pi4_tmp, pi2_dc_src + u4_blk_id, i4_iq_start_idx, u1_res_accumulate);
435 
436         DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
437     }
438 
439     /* Now process empty blocks */
440     DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
441     while(u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
442     {
443         IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
444 
445         /* Tx blk coeffs are stored blk by blk */
446         /* Hence, in order to access rows of each Tx blk, one needs to stride of
447          * TxxSize */
448         s_src.i4_data_stride = 4;
449         /* The AC blocks starts from 2nd row */
450         s_src.pv_data = ((WORD16 *) ps_src->pv_data) + (u4_blk_id + 1) * ps_src->i4_data_stride;
451 
452         s_pred.pv_data =
453             ((UWORD8 *) ps_pred->pv_data) + i4_offset_x + i4_offset_y * ps_pred->i4_data_stride;
454         s_recon.pv_data =
455             ((UWORD8 *) ps_recon->pv_data) + i4_offset_x + i4_offset_y * ps_recon->i4_data_stride;
456         s_res.pv_data =
457             ((WORD16 *) ps_res->pv_data) + i4_offset_x + i4_offset_y * ps_res->i4_data_stride;
458         s_res_pred.pv_data = ((WORD16 *) ps_res_pred->pv_data) + i4_offset_x +
459                              i4_offset_y * ps_res_pred->i4_data_stride;
460 
461         ps_enc_loop_fxns->pf_zcbf_iquant_itrans_recon_4x4(
462             &s_src, &s_pred, &s_res_pred, &s_res, &s_recon, ps_iq_it_res_rec_constants, NULL,
463             pi2_dc_src + u4_blk_id, i4_iq_start_idx, u1_res_accumulate);
464 
465         DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
466     }
467 }
468 
469 /**
470 *******************************************************************************
471 *
472 * @brief
473 *  This function performs does the DCT transform then Hadamard transform
474 *  and quantization for a chroma macroblock
475 *
476 * @par Description:
477 *  First  cf4 is done on all 16 4x4 blocks of the 8x8input block
478 *  Then hadamard transform is done on the DC coefficients
479 *  Quantization is then performed on the 8x8 block, 4x4 wise
480 *
481 * @param[in] pu1_src
482 *  Pointer to source sub-block
483 *  The input is in interleaved format for two chroma planes
484 *
485 * @param[in] pu1_pred
486 *  Pointer to prediction sub-block
487 *  Prediction is in inter leaved format
488 *
489 * @param[in] pi2_out
490 *  Pointer to residual sub-block
491 *  The output will be in linear format
492 *  The first 4 continuous locations will contain the values of DC block for U
493 *  and then next 4 will contain for V.
494 *  After DC block and a stride 1st AC block of U plane will follow
495 *  After one more stride next AC block of V plane will follow
496 *  The blocks will be in raster scan order
497 *
498 *  After all the AC blocks of U plane AC blocks of V plane will follow in exact
499 *  same way
500 *
501 * @param[in] i4_src_stride
502 *  Source stride
503 *
504 * @param[in] i4_pred_stride
505 *  Prediction stride
506 *
507 * @param[in] dst_strd
508 *  Destination stride
509 *
510 * @param[in] pu2_scale_matrix
511 *  The quantization matrix for 4x4 transform
512 *
513 * @param[in] pu2_threshold_matrix
514 *  Threshold matrix
515 *
516 * @param[in] u4_qbits
517 *  15+QP/6
518 *
519 * @param[in] u4_round_factor
520 *  Round factor for quant
521 *
522 * @param[out] pu1_nnz
523 *  Memory to store the non-zeros after transform
524 *  The first byte will be the nnz od DC block for U plane
525 *  From the next byte the AC nnzs will be storerd in raster scan order
526 *  The fifth byte will be nnz of Dc block of V plane
527 *  Then Ac blocks will follow
528 *
529 * @param u4_dc_flag
530 *  Signals if Dc transform is to be done or not
531 *   1 -> Dc transform will be done
532 *   0 -> Dc transform will not be done
533 *
534 * @remarks
535 *
536 *******************************************************************************
537 */
isvce_chroma_8x8_resi_trans_dctrans_quant(buffer_container_t * ps_src,buffer_container_t * ps_pred,buffer_container_t * ps_quant_coeffs,buffer_container_t * ps_upsampled_res,isa_dependent_fxns_t * ps_isa_dependent_fxns,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD8 * pu1_nnz,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 u1_use_upsampled_res)538 void isvce_chroma_8x8_resi_trans_dctrans_quant(
539     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_quant_coeffs,
540     buffer_container_t *ps_upsampled_res, isa_dependent_fxns_t *ps_isa_dependent_fxns,
541     const UWORD16 *pu2_scale_matrix, const UWORD16 *pu2_threshold_matrix, UWORD8 *pu1_nnz,
542     UWORD32 u4_qbits, UWORD32 u4_round_factor, UWORD8 u1_use_upsampled_res)
543 {
544     WORD32 blk_cntr;
545     WORD32 i4_offsetx, i4_offsety;
546     UWORD8 au1_dcnnz[2];
547 
548     enc_loop_fxns_t *ps_enc_loop_fxns = &ps_isa_dependent_fxns->s_enc_loop_fxns;
549     buffer_container_t s_src = ps_src[0];
550     buffer_container_t s_pred = ps_pred[0];
551     buffer_container_t s_quant_coeffs = ps_quant_coeffs[0];
552     buffer_container_t s_upsampled_res = {0};
553     resi_trans_quant_constants_t s_resi_trans_quant_constants = {
554         .pu2_scale_matrix = pu2_scale_matrix,
555         .pu2_threshold_matrix = pu2_threshold_matrix,
556         .u4_qbits = u4_qbits,
557         .u4_round_factor = u4_round_factor};
558 
559     UWORD8 u1_resi_trans_fxn_idx = isvc_get_resi_trans_quant_variant_idx(u1_use_upsampled_res);
560 
561     if(u1_use_upsampled_res)
562     {
563         s_upsampled_res = ps_upsampled_res[0];
564     }
565 
566     /* Move to the ac addresses */
567     pu1_nnz++;
568 
569     for(blk_cntr = 0; blk_cntr < NUM_CHROMA4x4_BLOCKS_IN_MB; blk_cntr++)
570     {
571         IND2SUB_CHROMA_MB(blk_cntr, i4_offsetx, i4_offsety);
572 
573         s_src.pv_data =
574             ((UWORD8 *) ps_src[0].pv_data) + i4_offsetx + i4_offsety * ps_src[0].i4_data_stride;
575         s_pred.pv_data =
576             ((UWORD8 *) ps_pred[0].pv_data) + i4_offsetx + i4_offsety * ps_pred[0].i4_data_stride;
577         s_quant_coeffs.pv_data =
578             ((WORD16 *) ps_quant_coeffs[0].pv_data) + blk_cntr * ps_quant_coeffs[0].i4_data_stride;
579 
580         if(u1_use_upsampled_res)
581         {
582             s_upsampled_res.pv_data = ((WORD16 *) ps_upsampled_res[0].pv_data) + i4_offsetx +
583                                       i4_offsety * ps_upsampled_res[0].i4_data_stride;
584         }
585 
586         /* Move to the ac addresses */
587         s_quant_coeffs.pv_data =
588             ((WORD16 *) s_quant_coeffs.pv_data) + ps_quant_coeffs[0].i4_data_stride;
589 
590         s_quant_coeffs.i4_data_stride = 4;
591 
592         /* For chroma, v plane nnz is populated from position 5 */
593         ps_enc_loop_fxns->apf_resi_trans_quant_chroma_4x4[u1_resi_trans_fxn_idx](
594             &s_src, &s_pred, &s_quant_coeffs, &s_upsampled_res, &s_resi_trans_quant_constants,
595             &pu1_nnz[blk_cntr + (blk_cntr > 3)], ((WORD16 *) ps_quant_coeffs->pv_data) + blk_cntr,
596             u1_use_upsampled_res);
597     }
598 
599     /* Adjust pointers to point to dc values */
600     s_quant_coeffs = ps_quant_coeffs[0];
601     pu1_nnz--;
602 
603     s_resi_trans_quant_constants.u4_qbits++;
604     s_resi_trans_quant_constants.u4_round_factor <<= 1;
605 
606     ps_enc_loop_fxns->pf_hadamard_quant_2x2_uv(((WORD16 *) ps_quant_coeffs->pv_data),
607                                                ((WORD16 *) ps_quant_coeffs->pv_data),
608                                                &s_resi_trans_quant_constants, au1_dcnnz);
609 
610     /* Copy the dc nnzs */
611     pu1_nnz[0] = au1_dcnnz[0];
612     pu1_nnz[5] = au1_dcnnz[1];
613 }
614 
615 /**
616 *******************************************************************************
617 * @brief
618 *  This function performs the inverse transform with process for chroma MB of
619 *H264
620 *
621 * @par Description:
622 *  Does inverse DC transform ,inverse quantization inverse transform
623 *
624 * @param[in] pi2_src
625 *  Input data, 16x16 size
626 *  The input is in the form of, first 4 locations will contain DC coeffs of
627 *  U plane, next 4 will contain DC coeffs of V plane, then AC blocks of U plane
628 *  in raster scan order will follow, each block as linear array in raster scan
629 *order. After a stride next AC block will follow. After all AC blocks of U plane
630 *  V plane AC blocks will follow in exact same order.
631 *
632 * @param[in] pu1_pred
633 *  The predicted data, 8x16 size, U and V interleaved
634 *
635 * @param[in] pu1_out
636 *  Output 8x16, U and V interleaved
637 *
638 * @param[in] i4_src_stride
639 *  Source stride
640 *
641 * @param[in] i4_pred_stride
642 *  input stride for prediction buffer
643 *
644 * @param[in] i4_out_stride
645 *  input stride for output buffer
646 *
647 * @param[in] pu2_iscale_mat
648 *  Inverse quantization matrix for 4x4 transform
649 *
650 * @param[in] pu2_weigh_mat
651 *  weight matrix of 4x4 transform
652 *
653 * @param[in] u4_qp_div_6
654 *  QP/6
655 *
656 * @param[in] pi4_tmp
657 *  Input temporary buffer
658 *  needs to be at least COFF_CNT_SUB_BLK_4x4 + Number of Dc coeffs for chroma *
659 *number of planes in size
660 *
661 * @param[in] pu4_cntrl
662 *  Controls the transform path
663 *  the 15 th bit will correspond to DC block of U plane , 14th will indicate the
664 *V plane Dc block 32-28 bits will indicate AC blocks of U plane in raster scan
665 *order 27-23 bits will indicate AC blocks of V plane in rater scan order The bit
666 *1 implies that there is at least one non zero coeff in a block
667 *
668 * @returns
669 *  none
670 *
671 * @remarks
672 *******************************************************************************
673 */
isvce_chroma_8x8_idctrans_iquant_itrans_recon(buffer_container_t * ps_src,buffer_container_t * ps_pred,buffer_container_t * ps_recon,buffer_container_t * ps_res,buffer_container_t * ps_res_pred,iq_it_res_rec_constants_t * ps_iq_it_res_rec_constants,isa_dependent_fxns_t * ps_isa_dependent_fxns,WORD32 * pi4_tmp,UWORD32 u4_cntrl,UWORD8 u1_res_accumulate)674 void isvce_chroma_8x8_idctrans_iquant_itrans_recon(
675     buffer_container_t *ps_src, buffer_container_t *ps_pred, buffer_container_t *ps_recon,
676     buffer_container_t *ps_res, buffer_container_t *ps_res_pred,
677     iq_it_res_rec_constants_t *ps_iq_it_res_rec_constants,
678     isa_dependent_fxns_t *ps_isa_dependent_fxns, WORD32 *pi4_tmp, UWORD32 u4_cntrl,
679     UWORD8 u1_res_accumulate)
680 {
681     /* Cntrl bits for 4x4 transforms
682      * u4_blk_cntrl       : controls if a 4x4 block should be processed in ac path
683      * u4_dc_cntrl        : controls is a 4x4 block is to be processed in dc path
684      *                    : dc block must contain only single dc coefficient
685      * u4_empty_blk_cntrl : control fot 4x4 block with no coeffs, ie no dc and ac
686      *                    : ie not (ac or dc)
687      */
688     UWORD32 u4_blk_cntrl, u4_dc_cntrl, u4_empty_blk_cntrl;
689     WORD32 u4_blk_id;
690     WORD32 i4_offset_x, i4_offset_y;
691     WORD16 *pi2_dc_src;
692     /* Increment for dc block */
693     WORD32 i4_dc_inc;
694 
695     enc_loop_fxns_t *ps_enc_loop_fxns = &ps_isa_dependent_fxns->s_enc_loop_fxns;
696     buffer_container_t s_src = ps_src[0];
697     buffer_container_t s_pred = ps_pred[0];
698     buffer_container_t s_recon = ps_recon[0];
699     buffer_container_t s_res = ps_res[0];
700     buffer_container_t s_res_pred = ps_res_pred[0];
701 
702     WORD16 i2_zero = 0;
703     const UWORD16 *pu2_iscale_mat = ps_iq_it_res_rec_constants->pu2_iscal_mat;
704     const UWORD16 *pu2_weigh_mat = ps_iq_it_res_rec_constants->pu2_weigh_mat;
705     UWORD32 u4_qp_div_6 = ps_iq_it_res_rec_constants->u4_qp_div_6;
706     UWORD8 u1_iq_it_recon_fxn_idx = isvc_get_iq_it_recon_variant_idx(0, u1_res_accumulate);
707 
708     /*
709      * Lets do the inverse transform for dc coeffs in chroma
710      */
711     if(u4_cntrl & CNTRL_FLAG_DCBLK_MASK_CHROMA)
712     {
713         UWORD32 cntr, u4_dc_cntrl;
714         /* Do inv hadamard for u an v block */
715 
716         ps_enc_loop_fxns->pf_ihadamard_scaling_2x2_uv(s_src.pv_data, s_src.pv_data, pu2_iscale_mat,
717                                                       pu2_weigh_mat, u4_qp_div_6, NULL);
718         /*
719          * Update the cntrl flag
720          * Flag is updated as follows bits 15-11 -> u block dc bits
721          */
722         u4_dc_cntrl = 0;
723         for(cntr = 0; cntr < 8; cntr++)
724         {
725             u4_dc_cntrl |= ((((WORD16 *) ps_src->pv_data)[cntr] != 0) << (15 - cntr));
726         }
727 
728         /* Mark dc bits as 1 if corresponding ac bit is 0 */
729         u4_dc_cntrl = (~(u4_cntrl >> 16) & u4_dc_cntrl);
730         /* Combine both ac and dc bits */
731         u4_cntrl =
732             (u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA) | (u4_dc_cntrl & CNTRL_FLAG_DC_MASK_CHROMA);
733 
734         /* Since we populated the dc coffs, we have to read them from there */
735         pi2_dc_src = ((WORD16 *) ps_src->pv_data);
736         i4_dc_inc = 1;
737     }
738     else
739     {
740         u4_cntrl = u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA;
741         pi2_dc_src = &i2_zero;
742         i4_dc_inc = 0;
743     }
744 
745     /* Get the block bits */
746     u4_blk_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA);
747     u4_dc_cntrl = (u4_cntrl & CNTRL_FLAG_DC_MASK_CHROMA) << 16;
748     u4_empty_blk_cntrl = (~(u4_dc_cntrl | u4_blk_cntrl)) & 0xFF000000;
749 
750     DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
751 
752     while(u4_blk_id < 8)
753     {
754         WORD32 dc_src_offset = u4_blk_id * i4_dc_inc;
755 
756         /* Tx blk coeffs are stored blk by blk */
757         /* Hence, in order to access rows of each Tx blk, one needs to stride of
758          * TxxSize */
759         s_src.i4_data_stride = 4;
760         s_src.pv_data = pi2_dc_src + dc_src_offset;
761 
762         IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
763 
764         s_pred.pv_data =
765             ((UWORD8 *) ps_pred->pv_data) + i4_offset_x + i4_offset_y * ps_pred->i4_data_stride;
766         s_recon.pv_data =
767             ((UWORD8 *) ps_recon->pv_data) + i4_offset_x + i4_offset_y * ps_recon->i4_data_stride;
768         s_res.pv_data =
769             ((WORD16 *) ps_res->pv_data) + i4_offset_x + i4_offset_y * ps_res->i4_data_stride;
770         s_res_pred.pv_data = ((WORD16 *) ps_res_pred->pv_data) + i4_offset_x +
771                              i4_offset_y * ps_res_pred->i4_data_stride;
772 
773         ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4_dc[u1_iq_it_recon_fxn_idx](
774             &s_src, &s_pred, &s_res_pred, &s_res, &s_recon, ps_iq_it_res_rec_constants, NULL,
775             s_src.pv_data, 0, u1_res_accumulate);
776 
777         /* Get next DC block to process */
778         DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
779     }
780 
781     /* now process ac/mixed blocks */
782     DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
783     while(u4_blk_id < 8)
784     {
785         WORD32 dc_src_offset = i4_dc_inc * u4_blk_id;
786 
787         /* Tx blk coeffs are stored blk by blk */
788         /* Hence, in order to access rows of each Tx blk, one needs to stride of
789          * TxxSize */
790         s_src.i4_data_stride = 4;
791         /* The AC blocks starts from 2nd row */
792         s_src.pv_data = ((WORD16 *) ps_src->pv_data) + (u4_blk_id + 1) * ps_src->i4_data_stride;
793 
794         IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
795 
796         s_pred.pv_data =
797             ((UWORD8 *) ps_pred->pv_data) + i4_offset_x + i4_offset_y * ps_pred->i4_data_stride;
798         s_recon.pv_data =
799             ((UWORD8 *) ps_recon->pv_data) + i4_offset_x + i4_offset_y * ps_recon->i4_data_stride;
800         s_res.pv_data =
801             ((WORD16 *) ps_res->pv_data) + i4_offset_x + i4_offset_y * ps_res->i4_data_stride;
802         s_res_pred.pv_data = ((WORD16 *) ps_res_pred->pv_data) + i4_offset_x +
803                              i4_offset_y * ps_res_pred->i4_data_stride;
804 
805         ps_enc_loop_fxns->apf_iquant_itrans_recon_chroma_4x4[u1_iq_it_recon_fxn_idx](
806             &s_src, &s_pred, &s_res_pred, &s_res, &s_recon, ps_iq_it_res_rec_constants,
807             (WORD16 *) pi4_tmp, pi2_dc_src + dc_src_offset, 0, u1_res_accumulate);
808 
809         DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
810     }
811 
812     /* Now process empty blocks */
813     DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
814 
815     while(u4_blk_id < 8)
816     {
817         WORD32 dc_src_offset = i4_dc_inc * u4_blk_id;
818 
819         IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
820 
821         /* Tx blk coeffs are stored blk by blk */
822         /* Hence, in order to access rows of each Tx blk, one needs to stride of
823          * TxxSize */
824         s_src.i4_data_stride = 4;
825         /* The AC blocks starts from 2nd row */
826         s_src.pv_data = ((WORD16 *) ps_src->pv_data) + (u4_blk_id + 1) * ps_src->i4_data_stride;
827 
828         s_pred.pv_data =
829             ((UWORD8 *) ps_pred->pv_data) + i4_offset_x + i4_offset_y * ps_pred->i4_data_stride;
830         s_recon.pv_data =
831             ((UWORD8 *) ps_recon->pv_data) + i4_offset_x + i4_offset_y * ps_recon->i4_data_stride;
832         s_res.pv_data =
833             ((WORD16 *) ps_res->pv_data) + i4_offset_x + i4_offset_y * ps_res->i4_data_stride;
834         s_res_pred.pv_data = ((WORD16 *) ps_res_pred->pv_data) + i4_offset_x +
835                              i4_offset_y * ps_res_pred->i4_data_stride;
836 
837         ps_enc_loop_fxns->pf_chroma_zcbf_iquant_itrans_recon_4x4(
838             &s_src, &s_pred, &s_res_pred, &s_res, &s_recon, ps_iq_it_res_rec_constants,
839             (WORD16 *) pi4_tmp, pi2_dc_src + dc_src_offset, 0, u1_res_accumulate);
840 
841         DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
842     }
843 }
844 
845 /**
846 ******************************************************************************
847 *
848 * @brief  This function packs residue of an i16x16 luma mb for entropy coding
849 *
850 * @par   Description
851 *  An i16 macro block contains two classes of units, dc 4x4 block and
852 *  4x4 ac blocks. while packing the mb, the dc block is sent first, and
853 *  the 16 ac blocks are sent next in scan order. Each and every block is
854 *  represented by 3 parameters (nnz, significant coefficient map and the
855 *  residue coefficients itself). If a 4x4 unit does not have any coefficients
856 *  then only nnz is sent. Inside a 4x4 block the individual coefficients are
857 *  sent in scan order.
858 *
859 *  The first byte of each block will be nnz of the block, if it is non zero,
860 *  a 2 byte significance map is sent. This is followed by nonzero coefficients.
861 *  This is repeated for 1 dc + 16 ac blocks.
862 *
863 * @param[in]  pi2_res_mb
864 *  pointer to residue mb
865 *
866 * @param[in, out]  pv_mb_coeff_data
867 *  buffer pointing to packed residue coefficients
868 *
869 * @param[in]  u4_res_strd
870 *  residual block stride
871 *
872 * @param[out]  u1_cbp_l
873 *  coded block pattern luma
874 *
875 * @param[in]   pu1_nnz
876 *  number of non zero coefficients in each 4x4 unit
877 *
878 * @param[out]
879 *  Control signal for inverse transform of 16x16 blocks
880 *
881 * @return none
882 *
883 * @ remarks
884 *
885 ******************************************************************************
886 */
isvce_pack_l_mb_i16(WORD16 * pi2_res_mb,void ** pv_mb_coeff_data,WORD32 i4_res_strd,UWORD8 * u1_cbp_l,UWORD8 * pu1_nnz,UWORD32 * pu4_cntrl)887 void isvce_pack_l_mb_i16(WORD16 *pi2_res_mb, void **pv_mb_coeff_data, WORD32 i4_res_strd,
888                          UWORD8 *u1_cbp_l, UWORD8 *pu1_nnz, UWORD32 *pu4_cntrl)
889 {
890     /* pointer to packed sub block buffer space */
891     tu_sblk_coeff_data_t *ps_mb_coeff_data = (*pv_mb_coeff_data), *ps_mb_coeff_data_ac;
892 
893     /* no of non zero coefficients in the current sub block */
894     UWORD32 u4_nnz_cnt;
895 
896     /* significant coefficient map */
897     UWORD32 u4_s_map;
898 
899     /* pointer to scanning matrix */
900     const UWORD8 *pu1_scan_order;
901 
902     /* number of non zeros in sub block */
903     UWORD32 u4_nnz;
904 
905     /* coeff scan order */
906     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
907 
908     /* temp var */
909     UWORD32 coeff_cnt, mask, b4, u4_cntrl = 0;
910 
911     /*DC and AC coeff pointers*/
912     WORD16 *pi2_res_mb_ac, *pi2_res_mb_dc;
913 
914     /********************************************************/
915     /*  pack dc coeff data for entropy coding               */
916     /********************************************************/
917 
918     pi2_res_mb_dc = pi2_res_mb;
919     pu1_scan_order = gu1_luma_scan_order_dc;
920 
921     u4_nnz = *pu1_nnz;
922     u4_cntrl = 0;
923 
924     /* write number of non zero coefficients */
925     ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
926 
927     if(u4_nnz)
928     {
929         for(u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
930         {
931             if(pi2_res_mb_dc[pu1_scan_order[coeff_cnt]])
932             {
933                 /* write residue */
934                 ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] =
935                     pi2_res_mb_dc[pu1_scan_order[coeff_cnt]];
936                 u4_s_map |= mask;
937             }
938             mask <<= 1;
939         }
940         /* write significant coeff map */
941         ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
942         (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
943 
944         u4_cntrl = 0x00008000;  // Set DC bit in ctrl code
945     }
946     else
947     {
948         (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
949     }
950 
951     /********************************************************/
952     /*  pack ac coeff data for entropy coding               */
953     /********************************************************/
954 
955     pu1_nnz++;
956     pu1_scan_order = gu1_luma_scan_order;
957     pi2_res_mb += i4_res_strd; /*Move to AC block*/
958 
959     ps_mb_coeff_data_ac = (*pv_mb_coeff_data);
960 
961     for(b4 = 0; b4 < 16; b4++)
962     {
963         ps_mb_coeff_data = (*pv_mb_coeff_data);
964 
965         u4_nnz = pu1_nnz[u1_scan_order[b4]];
966 
967         /* Jump according to the scan order */
968         pi2_res_mb_ac = pi2_res_mb + (i4_res_strd * u1_scan_order[b4]);
969 
970         /*
971          * Since this is a i16x16 block, we should not count dc coeff on indi
972          * vidual 4x4 blocks to nnz. But due to the implementation of 16x16
973          * trans function, we add dc's nnz to u4_nnz too. Hence we adjust that
974          * here
975          */
976         u4_nnz -= (pi2_res_mb_ac[0] != 0);
977 
978         /* write number of non zero coefficients */
979         ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
980 
981         if(u4_nnz)
982         {
983             for(u4_nnz_cnt = 0, coeff_cnt = 1, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz;
984                 coeff_cnt++)
985             {
986                 if(pi2_res_mb_ac[pu1_scan_order[coeff_cnt]])
987                 {
988                     /* write residue */
989                     ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] =
990                         pi2_res_mb_ac[pu1_scan_order[coeff_cnt]];
991                     u4_s_map |= mask;
992                 }
993                 mask <<= 1;
994             }
995             /* write significant coeff map */
996             ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
997             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
998             *u1_cbp_l = 15;
999 
1000             u4_cntrl |= (1 << (31 - u1_scan_order[b4]));
1001         }
1002         else
1003         {
1004             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
1005         }
1006     }
1007 
1008     if(!(*u1_cbp_l))
1009     {
1010         (*pv_mb_coeff_data) = ps_mb_coeff_data_ac;
1011     }
1012 
1013     /* Store the cntrl signal */
1014     (*pu4_cntrl) = u4_cntrl;
1015     return;
1016 }
1017 
1018 /**
1019 ******************************************************************************
1020 *
1021 * @brief  This function packs residue of an p16x16 luma mb for entropy coding
1022 *
1023 * @par   Description
1024 *  A p16x16 macro block contains two classes of units 16  4x4 ac blocks.
1025 *  while packing the mb, the dc block is sent first, and
1026 *  the 16 ac blocks are sent next in scan order. Each and every block is
1027 *  represented by 3 parameters (nnz, significant coefficient map and the
1028 *  residue coefficients itself). If a 4x4 unit does not have any coefficients
1029 *  then only nnz is sent. Inside a 4x4 block the individual coefficients are
1030 *  sent in scan order.
1031 *
1032 *  The first byte of each block will be nnz of the block, if it is non zero,
1033 *  a 2 byte significance map is sent. This is followed by nonzero coefficients.
1034 *  This is repeated for 1 dc + 16 ac blocks.
1035 *
1036 * @param[in]  pi2_res_mb
1037 *  pointer to residue mb
1038 *
1039 * @param[in, out]  pv_mb_coeff_data
1040 *  buffer pointing to packed residue coefficients
1041 *
1042 * @param[in]  i4_res_strd
1043 *  residual block stride
1044 *
1045 * @param[out]  u1_cbp_l
1046 *  coded block pattern luma
1047 *
1048 * @param[in]   pu1_nnz
1049 *  number of non zero coefficients in each 4x4 unit
1050 *
1051 * @param[out] pu4_cntrl
1052 *  Control signal for inverse transform
1053 *
1054 * @return none
1055 *
1056 * @remarks Killing coffs not yet coded
1057 *
1058 ******************************************************************************
1059 */
isvce_pack_l_mb(WORD16 * pi2_res_mb,void ** pv_mb_coeff_data,WORD32 i4_res_strd,UWORD8 * u1_cbp_l,UWORD8 * pu1_nnz,UWORD32 u4_thres_resi,UWORD32 * pu4_cntrl)1060 void isvce_pack_l_mb(WORD16 *pi2_res_mb, void **pv_mb_coeff_data, WORD32 i4_res_strd,
1061                      UWORD8 *u1_cbp_l, UWORD8 *pu1_nnz, UWORD32 u4_thres_resi, UWORD32 *pu4_cntrl)
1062 {
1063     /* pointer to packed sub block buffer space */
1064     tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8, *ps_mb_coeff_data_mb;
1065 
1066     /* no of non zero coefficients in the current sub block */
1067     UWORD32 u4_nnz_cnt;
1068 
1069     /* significant coefficient map */
1070     UWORD32 u4_s_map;
1071 
1072     /* pointer to scanning matrix */
1073     const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
1074 
1075     /* number of non zeros in sub block */
1076     UWORD32 u4_nnz;
1077 
1078     /* pointer to residual sub block */
1079     WORD16 *pi2_res_sb;
1080 
1081     /* coeff scan order */
1082     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
1083 
1084     /* coeff cost */
1085     const UWORD8 *pu1_coeff_cost = gu1_coeff_cost;
1086 
1087     /* temp var */
1088     UWORD32 u4_mb_coeff_cost = 0, u4_b8_coeff_cost = 0, coeff_cnt, mask, u4_cntrl = 0, b4, b8;
1089 
1090     /* temp var */
1091     WORD32 i4_res_val, i4_run = -1, dcac_block;
1092 
1093     /* When Hadamard transform is disabled, first row values are dont care, ignore
1094      * them */
1095     pi2_res_mb += i4_res_strd;
1096 
1097     /* When Hadamard transform is disabled, first unit value is dont care, ignore
1098      * this */
1099     pu1_nnz++;
1100 
1101     ps_mb_coeff_data_mb = ps_mb_coeff_data_b8 = (*pv_mb_coeff_data);
1102 
1103     /********************************************************/
1104     /*  pack coeff data for entropy coding                  */
1105     /********************************************************/
1106 
1107     for(b4 = 0; b4 < 16; b4++)
1108     {
1109         ps_mb_coeff_data = (*pv_mb_coeff_data);
1110 
1111         b8 = b4 >> 2;
1112 
1113         u4_nnz = pu1_nnz[u1_scan_order[b4]];
1114 
1115         /* Jump according to the scan order */
1116         pi2_res_sb = pi2_res_mb + (i4_res_strd * u1_scan_order[b4]);
1117 
1118         /* write number of non zero coefficients */
1119         ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
1120 
1121         if(u4_nnz)
1122         {
1123             for(u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz;
1124                 coeff_cnt++)
1125             {
1126                 /* number of runs of zero before, this is used to compute coeff cost */
1127                 i4_run++;
1128 
1129                 i4_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
1130 
1131                 if(i4_res_val)
1132                 {
1133                     /* write residue */
1134                     ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i4_res_val;
1135                     u4_s_map |= mask;
1136 
1137                     if(u4_thres_resi)
1138                     {
1139                         /* compute coeff cost */
1140                         if(i4_res_val == 1 || i4_res_val == -1)
1141                         {
1142                             if(i4_run < 6) u4_b8_coeff_cost += pu1_coeff_cost[i4_run];
1143                         }
1144                         else
1145                             u4_b8_coeff_cost += 9;
1146 
1147                         i4_run = -1;
1148                     }
1149                 }
1150 
1151                 mask <<= 1;
1152             }
1153 
1154             /* write significant coeff map */
1155             ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
1156             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
1157 
1158             /* cbp */
1159             *u1_cbp_l |= (1 << b8);
1160 
1161             /* Cntrl map for inverse transform computation
1162              *
1163              * If coeff_cnt is zero, it means that only nonzero was a dc coeff
1164              * Hence we have to set the 16 - u1_scan_order[b4]) position instead
1165              * of 31 - u1_scan_order[b4]
1166              */
1167             dcac_block = (coeff_cnt == 0) ? 16 : 31;
1168             u4_cntrl |= (1 << (dcac_block - u1_scan_order[b4]));
1169         }
1170         else
1171         {
1172             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
1173         }
1174 
1175         /* Decide if the 8x8 unit has to be sent for entropy coding? */
1176         if((b4 + 1) % 4 == 0)
1177         {
1178             if(u4_thres_resi && (u4_b8_coeff_cost <= LUMA_SUB_BLOCK_SKIP_THRESHOLD) &&
1179                (*u1_cbp_l & (1 << b8)))
1180             {
1181                 /*
1182                  * When we want to reset the full 8x8 block, we have to reset
1183                  * both the dc and ac coeff bits hence we have the symmetric
1184                  * arrangement of bits
1185                  */
1186                 const UWORD32 cntrl_mask_map[4] = {0xcc00cc00, 0x33003300, 0x00cc00cc, 0x00330033};
1187 
1188                 /* restore cbp */
1189                 *u1_cbp_l = (*u1_cbp_l & (~(1 << b8)));
1190 
1191                 /* correct cntrl flag */
1192                 u4_cntrl = u4_cntrl & (~cntrl_mask_map[(b4 >> 2)]);
1193 
1194                 /* correct nnz */
1195                 pu1_nnz[u1_scan_order[b4 - 3]] = 0;
1196                 pu1_nnz[u1_scan_order[b4 - 2]] = 0;
1197                 pu1_nnz[u1_scan_order[b4 - 1]] = 0;
1198                 pu1_nnz[u1_scan_order[b4]] = 0;
1199 
1200                 /* reset blk cost */
1201                 u4_b8_coeff_cost = 0;
1202             }
1203 
1204             if(!(*u1_cbp_l & (1 << b8)))
1205             {
1206                 (*pv_mb_coeff_data) = ps_mb_coeff_data_b8;
1207             }
1208 
1209             u4_mb_coeff_cost += u4_b8_coeff_cost;
1210 
1211             u4_b8_coeff_cost = 0;
1212             i4_run = -1;
1213             ps_mb_coeff_data_b8 = (*pv_mb_coeff_data);
1214         }
1215     }
1216 
1217     if(u4_thres_resi && (u4_mb_coeff_cost <= LUMA_BLOCK_SKIP_THRESHOLD) && (*u1_cbp_l))
1218     {
1219         (*pv_mb_coeff_data) = ps_mb_coeff_data_mb;
1220         *u1_cbp_l = 0;
1221         u4_cntrl = 0;
1222         memset(pu1_nnz, 0, 16);
1223     }
1224 
1225     (*pu4_cntrl) = u4_cntrl;
1226 
1227     return;
1228 }
1229 
1230 /**
1231 ******************************************************************************
1232 *
1233 * @brief  This function packs residue of an i8x8 chroma mb for entropy coding
1234 *
1235 * @par   Description
1236 *  An i8 chroma macro block contains two classes of units, dc 2x2 block and
1237 *  4x4 ac blocks. while packing the mb, the dc block is sent first, and
1238 *  the 4 ac blocks are sent next in scan order. Each and every block is
1239 *  represented by 3 parameters (nnz, significant coefficient map and the
1240 *  residue coefficients itself). If a 4x4 unit does not have any coefficients
1241 *  then only nnz is sent. Inside a 4x4 block the individual coefficients are
1242 *  sent in scan order.
1243 *
1244 *  The first byte of each block will be nnz of the block, if it is non zero,
1245 *  a 2 byte significance map is sent. This is followed by nonzero coefficients.
1246 *  This is repeated for 1 dc + 4 ac blocks.
1247 *
1248 * @param[in]  pi2_res_mb
1249 *  pointer to residue mb
1250 *
1251 * @param[in, out]  pv_mb_coeff_data
1252 *  buffer pointing to packed residue coefficients
1253 *
1254 * @param[in]  u4_res_strd
1255 *  residual block stride
1256 *
1257 * @param[out]  u1_cbp_c
1258 *  coded block pattern chroma
1259 *
1260 * @param[in]   pu1_nnz
1261 *  number of non zero coefficients in each 4x4 unit
1262 *
1263 * @param[out]   pu1_nnz
1264 *  Control signal for inverse transform
1265 *
1266 * @param[in]   u4_swap_uv
1267 *  Swaps the order of U and V planes in entropy bitstream
1268 *
1269 * @return none
1270 *
1271 * @ remarks
1272 *
1273 ******************************************************************************
1274 */
isvce_pack_c_mb(WORD16 * pi2_res_mb,void ** pv_mb_coeff_data,WORD32 i4_res_strd,UWORD8 * u1_cbp_c,UWORD8 * pu1_nnz,UWORD32 u4_thres_resi,UWORD32 * pu4_cntrl,UWORD32 u4_swap_uv)1275 void isvce_pack_c_mb(WORD16 *pi2_res_mb, void **pv_mb_coeff_data, WORD32 i4_res_strd,
1276                      UWORD8 *u1_cbp_c, UWORD8 *pu1_nnz, UWORD32 u4_thres_resi, UWORD32 *pu4_cntrl,
1277                      UWORD32 u4_swap_uv)
1278 {
1279     /* pointer to packed sub block buffer space */
1280     tu_sblk_coeff_data_t *ps_mb_coeff_data = (*pv_mb_coeff_data);
1281     tu_sblk_coeff_data_t *ps_mb_coeff_data_dc, *ps_mb_coeff_data_ac;
1282 
1283     /* nnz pointer */
1284     UWORD8 *pu1_nnz_ac, *pu1_nnz_dc;
1285 
1286     /* nnz counter */
1287     UWORD32 u4_nnz_cnt;
1288 
1289     /* significant coefficient map */
1290     UWORD32 u4_s_map;
1291 
1292     /* pointer to scanning matrix */
1293     const UWORD8 *pu1_scan_order;
1294 
1295     /* no of non zero coefficients in the current sub block */
1296     UWORD32 u4_nnz;
1297 
1298     /* pointer to residual sub block, res val */
1299     WORD16 *pi2_res_sb, i2_res_val;
1300 
1301     /* temp var */
1302     UWORD32 coeff_cnt, mask, b4, plane;
1303 
1304     /* temp var */
1305     UWORD32 u4_coeff_cost;
1306     WORD32 i4_run;
1307 
1308     /* coeff cost */
1309     const UWORD8 *pu1_coeff_cost = gu1_coeff_cost;
1310 
1311     /* pointer to packed buffer space */
1312     UWORD32 *pu4_mb_coeff_data = NULL;
1313 
1314     /* ac coded block pattern */
1315     UWORD8 u1_cbp_ac;
1316 
1317     /* Variable to store the current bit pos in cntrl variable*/
1318     UWORD32 cntrl_pos = 0;
1319 
1320     /********************************************************/
1321     /*  pack dc coeff data for entropy coding               */
1322     /********************************************************/
1323     pu1_scan_order = gu1_chroma_scan_order_dc;
1324     pi2_res_sb = pi2_res_mb;
1325     pu1_nnz_dc = pu1_nnz;
1326     (*pu4_cntrl) = 0;
1327     cntrl_pos = 15;
1328     ps_mb_coeff_data_dc = (*pv_mb_coeff_data);
1329 
1330     /* Color space conversion between SP_UV and SP_VU
1331      * We always assume SP_UV for all the processing
1332      * Hence to get proper stream output we need to swap U and V channels here
1333      *
1334      * For that there are two paths we need to look for
1335      * One is the path to bitstream , these variables should have the proper input
1336      * configured UV or VU
1337      * For the other path the inverse transform variables should have what ever
1338      * ordering the input had
1339      */
1340 
1341     if(u4_swap_uv)
1342     {
1343         pu1_nnz_dc += 5; /* Move to NNZ of V planve */
1344         pi2_res_sb += 4; /* Move to DC coff of V plane */
1345 
1346         cntrl_pos = 14; /* Control bit for V plane */
1347     }
1348 
1349     for(plane = 0; plane < 2; plane++)
1350     {
1351         ps_mb_coeff_data = (*pv_mb_coeff_data);
1352 
1353         u4_nnz = *pu1_nnz_dc;
1354         /* write number of non zero coefficients U/V */
1355         ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
1356 
1357         if(u4_nnz)
1358         {
1359             for(u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz;
1360                 coeff_cnt++)
1361             {
1362                 i2_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
1363                 if(i2_res_val)
1364                 {
1365                     /* write residue U/V */
1366                     ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i2_res_val;
1367                     u4_s_map |= mask;
1368                 }
1369                 mask <<= 1;
1370             }
1371             /* write significant coeff map U/V */
1372             ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
1373             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
1374             *u1_cbp_c = 1;
1375 
1376             (*pu4_cntrl) |= (1 << cntrl_pos);
1377         }
1378         else
1379         {
1380             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
1381         }
1382 
1383         if(u4_swap_uv)
1384         {
1385             cntrl_pos++;     /* Control bit for U plane */
1386             pu1_nnz_dc -= 5; /* Move to NNZ of U plane */
1387             pi2_res_sb -= 4; /* Move to DC coff of U plane */
1388         }
1389         else
1390         {
1391             cntrl_pos--;     /* Control bit for U plane */
1392             pu1_nnz_dc += 5; /* 4 for AC NNZ and 1 for DC */
1393             pi2_res_sb += 4; /* Move to DC coff of V plane */
1394         }
1395     }
1396 
1397     /********************************************************/
1398     /*  pack ac coeff data for entropy coding               */
1399     /********************************************************/
1400 
1401     pu1_scan_order = gu1_chroma_scan_order;
1402     ps_mb_coeff_data_ac = (*pv_mb_coeff_data);
1403 
1404     if(u4_swap_uv)
1405     {
1406         pi2_res_sb = pi2_res_mb + i4_res_strd * 5; /* Move to V plane ,ie 1dc row+ 4 ac row */
1407         cntrl_pos = 27;           /* The control bits are to be added for V bloc ie 31-4 th bit */
1408         pu1_nnz_ac = pu1_nnz + 6; /*Move the nnz to V block NNZ 1 dc + 1dc + 4 ac */
1409     }
1410     else
1411     {
1412         pi2_res_sb = pi2_res_mb + i4_res_strd; /* Move to U plane ,ie 1dc row */
1413         cntrl_pos = 31;
1414         pu1_nnz_ac = pu1_nnz + 1; /* Move the nnz to V block NNZ 1 dc */
1415     }
1416 
1417     for(plane = 0; plane < 2; plane++)
1418     {
1419         pu4_mb_coeff_data = (*pv_mb_coeff_data);
1420 
1421         u4_coeff_cost = 0;
1422         i4_run = -1;
1423 
1424         /* get the current cbp, so that it automatically
1425          * gets reverted in case of zero ac values */
1426         u1_cbp_ac = *u1_cbp_c;
1427 
1428         for(b4 = 0; b4 < 4; b4++)
1429         {
1430             ps_mb_coeff_data = (*pv_mb_coeff_data);
1431 
1432             u4_nnz = *pu1_nnz_ac;
1433 
1434             /*
1435              * We are scanning only ac coeffs, but the nnz is for the
1436              * complete 4x4 block. Hence we have to discount the nnz contributed
1437              * by the dc coefficient
1438              */
1439             u4_nnz -= (pi2_res_sb[0] != 0);
1440 
1441             /* write number of non zero coefficients U/V */
1442             ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
1443 
1444             if(u4_nnz)
1445             {
1446                 for(u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz;
1447                     coeff_cnt++)
1448                 {
1449                     i2_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
1450 
1451                     i4_run++;
1452 
1453                     if(i2_res_val)
1454                     {
1455                         /* write residue U/V */
1456                         ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i2_res_val;
1457                         u4_s_map |= mask;
1458 
1459                         if(u4_thres_resi && (u4_coeff_cost < CHROMA_BLOCK_SKIP_THRESHOLD))
1460                         {
1461                             /* compute coeff cost */
1462                             if(i2_res_val == 1 || i2_res_val == -1)
1463                             {
1464                                 if(i4_run < 6) u4_coeff_cost += pu1_coeff_cost[i4_run];
1465                             }
1466                             else
1467                                 u4_coeff_cost += 9;
1468 
1469                             i4_run = -1;
1470                         }
1471                     }
1472                     mask <<= 1;
1473                 }
1474 
1475                 /* write significant coeff map U/V */
1476                 ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
1477                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
1478                 u1_cbp_ac = 2;
1479 
1480                 (*pu4_cntrl) |= 1 << cntrl_pos;
1481             }
1482             else
1483             {
1484                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
1485             }
1486 
1487             pu1_nnz_ac++;
1488             pi2_res_sb += i4_res_strd;
1489             cntrl_pos--;
1490         }
1491 
1492         /* reset block */
1493         if(u4_thres_resi && (u4_coeff_cost < CHROMA_BLOCK_SKIP_THRESHOLD))
1494         {
1495             pu4_mb_coeff_data[0] = 0;
1496             pu4_mb_coeff_data[1] = 0;
1497             pu4_mb_coeff_data[2] = 0;
1498             pu4_mb_coeff_data[3] = 0;
1499             (*pv_mb_coeff_data) = pu4_mb_coeff_data + 4;
1500 
1501             /* Generate the control signal */
1502             /* Zero out the current plane's AC coefficients */
1503             (*pu4_cntrl) &= ((plane == u4_swap_uv) ? 0x0FFFFFFF : 0xF0FFFFFF);
1504 
1505             /* Similarly do for the NNZ also */
1506             *(pu1_nnz_ac - 4) = 0;
1507             *(pu1_nnz_ac - 3) = 0;
1508             *(pu1_nnz_ac - 2) = 0;
1509             *(pu1_nnz_ac - 1) = 0;
1510         }
1511         else
1512         {
1513             *u1_cbp_c = u1_cbp_ac;
1514         }
1515 
1516         if(u4_swap_uv)
1517         {
1518             pi2_res_sb =
1519                 pi2_res_mb + i4_res_strd; /* Move to V plane ,ie 1dc row+ 4 ac row + 1 dc row */
1520             cntrl_pos = 31; /* The control bits are to be added for V bloc ie 31-4 th bit */
1521             pu1_nnz_ac = pu1_nnz + 1; /* Move the nnz to V block NNZ 1 dc + 1dc + 4 ac */
1522 
1523             pu1_nnz_ac = pu1_nnz + 1;
1524         }
1525         else
1526             pu1_nnz_ac = pu1_nnz + 6; /* Go to nnz of V plane */
1527     }
1528 
1529     /* restore the ptr basing on cbp */
1530     if(*u1_cbp_c == 0)
1531     {
1532         (*pv_mb_coeff_data) = ps_mb_coeff_data_dc;
1533     }
1534     else if(*u1_cbp_c == 1)
1535     {
1536         (*pv_mb_coeff_data) = ps_mb_coeff_data_ac;
1537     }
1538 
1539     return;
1540 }
1541 
1542 /**
1543 *******************************************************************************
1544 *
1545 * @brief performs luma core coding when intra mode is i16x16
1546 *
1547 * @par Description:
1548 *  If the current mb is to be coded as intra of mb type i16x16, the mb is first
1549 *  predicted using one of i16x16 prediction filters, basing on the intra mode
1550 *  chosen. Then, error is computed between the input blk and the estimated blk.
1551 *  This error is transformed (hierarchical transform i.e., dct followed by hada-
1552 *  -mard), quantized. The quantized coefficients are packed in scan order for
1553 *  entropy coding.
1554 *
1555 * @param[in] ps_proc_ctxt
1556 *  pointer to the current macro block context
1557 *
1558 * @returns u1_cbp_l
1559 *  coded block pattern luma
1560 *
1561 * @remarks none
1562 *
1563 *******************************************************************************
1564 */
1565 
isvce_code_luma_intra_macroblock_16x16(isvce_process_ctxt_t * ps_proc)1566 UWORD8 isvce_code_luma_intra_macroblock_16x16(isvce_process_ctxt_t *ps_proc)
1567 {
1568     buffer_container_t s_src;
1569     buffer_container_t s_pred;
1570     buffer_container_t s_recon;
1571     buffer_container_t s_res;
1572     buffer_container_t s_quant_coeffs;
1573 
1574     /*Cntrol signal for itrans*/
1575     UWORD32 u4_cntrl;
1576 
1577     isvce_codec_t *ps_codec = ps_proc->ps_codec;
1578     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
1579     isa_dependent_fxns_t *ps_isa_dependent_fxns = &ps_codec->s_isa_dependent_fxns;
1580     inter_pred_fxns_t *ps_inter_pred_fxns = &ps_isa_dependent_fxns->s_inter_pred_fxns;
1581     iq_it_res_rec_constants_t s_iq_it_res_rec_constants = {
1582         .pu2_iscal_mat = ps_qp_params->pu2_iscale_mat,
1583         .pu2_weigh_mat = ps_qp_params->pu2_weigh_mat,
1584         .u4_qp_div_6 = ps_qp_params->u1_qp_div};
1585 
1586     UWORD8 *pu1_pred_mb = NULL;
1587     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
1588     WORD32 i4_pred_stride = ps_proc->i4_pred_strd;
1589     WORD32 i4_res_strd = ps_proc->i4_res_strd;
1590     UWORD8 u1_intra_mode = ps_proc->u1_l_i16_mode;
1591     UWORD32 au4_nnz[5] = {0};
1592     UWORD8 u1_cbp_l = 0;
1593     UWORD8 *pu1_nnz = (UWORD8 *) au4_nnz;
1594     /* pointer to packed mb coeff data */
1595     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
1596 
1597     if(u1_intra_mode == PLANE_I16x16)
1598     {
1599         pu1_pred_mb = ps_proc->pu1_pred_mb_intra_16x16_plane;
1600     }
1601     else
1602     {
1603         pu1_pred_mb = ps_proc->pu1_pred_mb_intra_16x16;
1604     }
1605 
1606     s_src = ps_proc->s_src_buf_props.as_component_bufs[Y];
1607     s_recon = ps_proc->s_rec_buf_props.as_component_bufs[Y];
1608     s_pred.pv_data = pu1_pred_mb;
1609     s_pred.i4_data_stride = i4_pred_stride;
1610     s_quant_coeffs.pv_data = pi2_res_mb;
1611     s_quant_coeffs.i4_data_stride = i4_res_strd;
1612 
1613     s_res = ps_codec->s_svc_ilp_data.ps_residual_bufs[ps_proc->u1_spatial_layer_id]
1614                 .as_component_bufs[Y];
1615     s_res.pv_data = ((WORD16 *) s_res.pv_data) + ps_proc->i4_mb_x * MB_SIZE +
1616                     ps_proc->i4_mb_y * MB_SIZE * s_res.i4_data_stride;
1617 
1618     /********************************************************/
1619     /*  error estimation,                                   */
1620     /*  transform                                           */
1621     /*  quantization                                        */
1622     /********************************************************/
1623     isvce_luma_16x16_resi_trans_dctrans_quant(
1624         &s_src, &s_pred, &s_quant_coeffs, &ps_proc->ps_mb_res_buf->as_component_bufs[Y],
1625         ps_isa_dependent_fxns, ps_qp_params->pu2_scale_mat, ps_qp_params->pu2_thres_mat, pu1_nnz,
1626         ps_qp_params->u1_qbits, ps_qp_params->u4_dead_zone, ENABLE_DC_TRANSFORM,
1627         ps_proc->ps_mb_info->u1_residual_prediction_flag);
1628 
1629     /********************************************************/
1630     /*  pack coeff data for entropy coding                  */
1631     /********************************************************/
1632     isvce_pack_l_mb_i16(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_l, pu1_nnz, &u4_cntrl);
1633 
1634     /********************************************************/
1635     /*  ierror estimation,                                  */
1636     /*  itransform                                          */
1637     /*  iquantization                                       */
1638     /********************************************************/
1639     /*
1640      *if refernce frame is not to be computed
1641      *we only need the right and bottom border 4x4 blocks to predict next intra
1642      *blocks, hence only compute them
1643      */
1644     if(!ps_proc->u4_compute_recon)
1645     {
1646         u4_cntrl &= 0x111F8000;
1647     }
1648 
1649     if(u4_cntrl)
1650     {
1651         isvce_luma_16x16_idctrans_iquant_itrans_recon(
1652             &s_quant_coeffs, &s_pred, &s_recon, &s_res,
1653             &ps_proc->ps_mb_res_buf->as_component_bufs[Y], &s_iq_it_res_rec_constants,
1654             ps_isa_dependent_fxns, ps_proc->pv_scratch_buff, u4_cntrl, ENABLE_DC_TRANSFORM, 0);
1655     }
1656     else
1657     {
1658         ps_inter_pred_fxns->pf_inter_pred_luma_copy(pu1_pred_mb, (UWORD8 *) s_recon.pv_data,
1659                                                     i4_pred_stride, s_recon.i4_data_stride, MB_SIZE,
1660                                                     MB_SIZE, NULL, 0);
1661     }
1662 
1663     return (u1_cbp_l);
1664 }
1665 
1666 /**
1667 *******************************************************************************
1668 *
1669 * @brief performs luma core coding when intra mode is i4x4
1670 *
1671 * @par Description:
1672 *  If the current mb is to be coded as intra of mb type i4x4, the mb is first
1673 *  predicted using one of i4x4 prediction filters, basing on the intra mode
1674 *  chosen. Then, error is computed between the input blk and the estimated blk.
1675 *  This error is dct transformed and quantized. The quantized coefficients are
1676 *  packed in scan order for entropy coding.
1677 *
1678 * @param[in] ps_proc_ctxt
1679 *  pointer to the current macro block context
1680 *
1681 * @returns u1_cbp_l
1682 *  coded block pattern luma
1683 *
1684 * @remarks
1685 *  The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan
1686 *order mentioned in h.264 specification
1687 *
1688 *******************************************************************************
1689 */
isvce_code_luma_intra_macroblock_4x4(isvce_process_ctxt_t * ps_proc)1690 UWORD8 isvce_code_luma_intra_macroblock_4x4(isvce_process_ctxt_t *ps_proc)
1691 {
1692     buffer_container_t s_src;
1693     buffer_container_t s_pred;
1694     buffer_container_t s_recon;
1695     buffer_container_t s_res;
1696     buffer_container_t s_res_pred;
1697     buffer_container_t s_quant_coeffs;
1698 
1699     /* pointer to neighbors: left, top, top-left */
1700     UWORD8 *pu1_mb_a;
1701     UWORD8 *pu1_mb_b;
1702     UWORD8 *pu1_mb_c;
1703     UWORD8 *pu1_mb_d;
1704     WORD32 i4_ngbr_avbl;
1705     UWORD8 u1_nnz;
1706     UWORD32 u4_nnz_cnt;
1707     /* significant coefficient map */
1708     UWORD32 u4_s_map;
1709     /*Dummy variable for 4x4 trans fucntion*/
1710     WORD16 i2_dc_dummy;
1711     UWORD32 i, b8, b4, u1_blk_x, u1_blk_y, u1_pix_x, u1_pix_y, coeff_cnt, mask;
1712 
1713     isvce_codec_t *ps_codec = ps_proc->ps_codec;
1714     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
1715     /* pointer to packed mb coeff data */
1716     tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8;
1717     isa_dependent_fxns_t *ps_isa_dependent_fxns = &ps_codec->s_isa_dependent_fxns;
1718     enc_loop_fxns_t *ps_enc_loop_fxns = &ps_isa_dependent_fxns->s_enc_loop_fxns;
1719     inter_pred_fxns_t *ps_inter_pred_fxns = &ps_isa_dependent_fxns->s_inter_pred_fxns;
1720     resi_trans_quant_constants_t s_resi_trans_quant_constants = {
1721         .pu2_scale_matrix = ps_qp_params->pu2_scale_mat,
1722         .pu2_threshold_matrix = ps_qp_params->pu2_thres_mat,
1723         .u4_qbits = ps_qp_params->u1_qbits,
1724         .u4_round_factor = ps_qp_params->u4_dead_zone};
1725     iq_it_res_rec_constants_t s_iq_it_res_rec_constants = {
1726         .pu2_iscal_mat = ps_qp_params->pu2_iscale_mat,
1727         .pu2_weigh_mat = ps_qp_params->pu2_weigh_mat,
1728         .u4_qp_div_6 = ps_qp_params->u1_qp_div};
1729 
1730     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
1731     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
1732     WORD32 i4_pred_stride = ps_proc->i4_pred_strd;
1733     UWORD8 u1_intra_mode = ps_proc->u1_l_i16_mode;
1734     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
1735     UWORD8 u1_cbp_l = 0;
1736     /* pointer to packed mb coeff data */
1737     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
1738     const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
1739     UWORD8 u1_resi_trans_fxn_idx = isvc_get_resi_trans_quant_variant_idx(0);
1740     UWORD8 u1_iq_it_recon_fxn_idx = isvc_get_iq_it_recon_variant_idx(1, 0);
1741 
1742     s_src = ps_proc->s_src_buf_props.as_component_bufs[Y];
1743     s_recon = ps_proc->s_rec_buf_props.as_component_bufs[Y];
1744     s_pred.pv_data = pu1_pred_mb;
1745     s_pred.i4_data_stride = i4_pred_stride;
1746     s_quant_coeffs.pv_data = pi2_res_mb;
1747     s_quant_coeffs.i4_data_stride = 4;
1748 
1749     /* Process 16 4x4 lum sub-blocks of the MB in scan order */
1750     for(b8 = 0; b8 < 4; b8++)
1751     {
1752         u1_blk_x = GET_BLK_RASTER_POS_X(b8) << 3;
1753         u1_blk_y = GET_BLK_RASTER_POS_Y(b8) << 3;
1754 
1755         /* if in case cbp for the 8x8 block is zero, send no residue */
1756         ps_mb_coeff_data_b8 = *pv_mb_coeff_data;
1757 
1758         for(b4 = 0; b4 < 4; b4++)
1759         {
1760             /* index of pel in MB */
1761             u1_pix_x = u1_blk_x + (GET_SUB_BLK_RASTER_POS_X(b4) << 2);
1762             u1_pix_y = u1_blk_y + (GET_SUB_BLK_RASTER_POS_Y(b4) << 2);
1763 
1764             /* Initialize source and reference pointers */
1765             s_src = ps_proc->s_src_buf_props.as_component_bufs[Y];
1766             s_recon = ps_proc->s_rec_buf_props.as_component_bufs[Y];
1767             s_src.pv_data = ((UWORD8 *) s_src.pv_data) + u1_pix_x + u1_pix_y * s_src.i4_data_stride;
1768             s_recon.pv_data =
1769                 ((UWORD8 *) s_recon.pv_data) + u1_pix_x + u1_pix_y * s_recon.i4_data_stride;
1770 
1771             s_res = ps_codec->s_svc_ilp_data.ps_residual_bufs[ps_proc->u1_spatial_layer_id]
1772                         .as_component_bufs[Y];
1773             s_res.pv_data = ((WORD16 *) s_res.pv_data) + ps_proc->i4_mb_x * MB_SIZE +
1774                             ps_proc->i4_mb_y * MB_SIZE * s_res.i4_data_stride;
1775             s_res.pv_data = ((WORD16 *) s_res.pv_data) + u1_pix_x + u1_pix_y * s_res.i4_data_stride;
1776 
1777             s_res_pred = ps_proc->ps_mb_res_buf->as_component_bufs[Y];
1778             s_res_pred.pv_data =
1779                 ((WORD16 *) s_res_pred.pv_data) + u1_pix_x + u1_pix_y * s_res_pred.i4_data_stride;
1780 
1781             /* pointer to left of ref macro block */
1782             pu1_mb_a = ((UWORD8 *) s_recon.pv_data) - 1;
1783             /* pointer to top of ref macro block */
1784             pu1_mb_b = ((UWORD8 *) s_recon.pv_data) - s_recon.i4_data_stride;
1785             /* pointer to topright of ref macro block */
1786             pu1_mb_c = pu1_mb_b + 4;
1787             /* pointer to topleft macro block */
1788             pu1_mb_d = pu1_mb_b - 1;
1789 
1790             /* compute neighbor availability */
1791             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
1792 
1793             /* sub block intra mode */
1794             u1_intra_mode = ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4];
1795 
1796             /********************************************************/
1797             /* gather prediction pels from neighbors for prediction */
1798             /********************************************************/
1799             /* left pels */
1800             if(i4_ngbr_avbl & LEFT_MB_AVAILABLE_MASK)
1801             {
1802                 for(i = 0; i < 4; i++)
1803                     pu1_ngbr_pels_i4[4 - 1 - i] = pu1_mb_a[i * s_recon.i4_data_stride];
1804             }
1805             else
1806             {
1807                 memset(pu1_ngbr_pels_i4, 0, 4);
1808             }
1809 
1810             /* top pels */
1811             if(i4_ngbr_avbl & TOP_MB_AVAILABLE_MASK)
1812             {
1813                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
1814             }
1815             else
1816             {
1817                 memset(pu1_ngbr_pels_i4 + 5, 0, 4);
1818             }
1819             /* top left pels */
1820             if(i4_ngbr_avbl & TOP_LEFT_MB_AVAILABLE_MASK)
1821             {
1822                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
1823             }
1824             else
1825             {
1826                 pu1_ngbr_pels_i4[4] = 0;
1827             }
1828             /* top right pels */
1829             if(i4_ngbr_avbl & TOP_RIGHT_MB_AVAILABLE_MASK)
1830             {
1831                 memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
1832             }
1833             else if(i4_ngbr_avbl & TOP_MB_AVAILABLE_MASK)
1834             {
1835                 memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4);
1836             }
1837 
1838             /********************************************************/
1839             /*  prediction                                          */
1840             /********************************************************/
1841             (ps_codec->apf_intra_pred_4_l)[u1_intra_mode](pu1_ngbr_pels_i4, pu1_pred_mb, 0,
1842                                                           i4_pred_stride, i4_ngbr_avbl);
1843 
1844             /********************************************************/
1845             /*  error estimation,                                   */
1846             /*  transform                                           */
1847             /*  quantization                                        */
1848             /********************************************************/
1849             ps_enc_loop_fxns->apf_resi_trans_quant_4x4[u1_resi_trans_fxn_idx](
1850                 &s_src, &s_pred, &s_quant_coeffs, &s_res_pred, &s_resi_trans_quant_constants,
1851                 &u1_nnz, &i2_dc_dummy, 0);
1852 
1853             /********************************************************/
1854             /*  pack coeff data for entropy coding                  */
1855             /********************************************************/
1856             ps_mb_coeff_data = *pv_mb_coeff_data;
1857 
1858             /* write number of non zero coefficients */
1859             ps_mb_coeff_data->i4_sig_map_nnz = u1_nnz;
1860 
1861             if(u1_nnz)
1862             {
1863                 for(u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u1_nnz;
1864                     coeff_cnt++)
1865                 {
1866                     if(pi2_res_mb[pu1_scan_order[coeff_cnt]])
1867                     {
1868                         /* write residue */
1869                         ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] =
1870                             pi2_res_mb[pu1_scan_order[coeff_cnt]];
1871                         u4_s_map |= mask;
1872                     }
1873                     mask <<= 1;
1874                 }
1875                 /* write significant coeff map */
1876                 ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
1877 
1878                 /* update ptr to coeff data */
1879                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
1880 
1881                 /* cbp */
1882                 u1_cbp_l |= (1 << b8);
1883             }
1884             else
1885             {
1886                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
1887             }
1888 
1889             /********************************************************/
1890             /*  ierror estimation,                                  */
1891             /*  itransform                                          */
1892             /*  iquantization                                       */
1893             /********************************************************/
1894             if(u1_nnz)
1895             {
1896                 buffer_container_t s_src = s_quant_coeffs;
1897 
1898                 /* Tx blk coeffs are stored blk by blk */
1899                 /* Hence, in order to access rows of each Tx blk, one needs to stride of
1900                  * TxxSize */
1901                 s_src.i4_data_stride = 4;
1902 
1903                 ps_enc_loop_fxns->apf_iquant_itrans_recon_4x4[u1_iq_it_recon_fxn_idx](
1904                     &s_src, &s_pred, &s_res_pred, &s_res, &s_recon, &s_iq_it_res_rec_constants,
1905                     (WORD16 *) ps_proc->pv_scratch_buff, s_src.pv_data, 0, 0);
1906             }
1907             else
1908             {
1909                 ps_inter_pred_fxns->pf_inter_pred_luma_copy(
1910                     (UWORD8 *) s_pred.pv_data, (UWORD8 *) s_recon.pv_data, s_pred.i4_data_stride,
1911                     s_recon.i4_data_stride, BLK_SIZE, BLK_SIZE, NULL, 0);
1912             }
1913         }
1914 
1915         /* if the 8x8 block has no residue, nothing needs to be sent to entropy */
1916         if(!(u1_cbp_l & (1 << b8)))
1917         {
1918             *pv_mb_coeff_data = ps_mb_coeff_data_b8;
1919         }
1920     }
1921 
1922     return (u1_cbp_l);
1923 }
1924 
1925 /**
1926 *******************************************************************************
1927 *
1928 * @brief performs luma core coding when intra mode is i4x4
1929 *
1930 * @par Description:
1931 *  If the current mb is to be coded as intra of mb type i4x4, the mb is first
1932 *  predicted using one of i4x4 prediction filters, basing on the intra mode
1933 *  chosen. Then, error is computed between the input blk and the estimated blk.
1934 *  This error is dct transformed and quantized. The quantized coefficients are
1935 *  packed in scan order for entropy coding.
1936 *
1937 * @param[in] ps_proc_ctxt
1938 *  pointer to the current macro block context
1939 *
1940 * @returns u1_cbp_l
1941 *  coded block pattern luma
1942 *
1943 * @remarks
1944 *  The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan
1945 *order mentioned in h.264 specification
1946 *
1947 *******************************************************************************
1948 */
isvce_code_luma_intra_macroblock_4x4_rdopt_on(isvce_process_ctxt_t * ps_proc)1949 UWORD8 isvce_code_luma_intra_macroblock_4x4_rdopt_on(isvce_process_ctxt_t *ps_proc)
1950 {
1951     /* pointer to packed mb coeff data */
1952     tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8;
1953 
1954     UWORD32 u4_nnz_cnt;
1955     /* significant coefficient map */
1956     UWORD32 u4_s_map;
1957     UWORD32 b8, b4, coeff_cnt, mask;
1958 
1959     isvce_codec_t *ps_codec = ps_proc->ps_codec;
1960     isa_dependent_fxns_t *ps_isa_dependent_fxns = &ps_codec->s_isa_dependent_fxns;
1961     inter_pred_fxns_t *ps_inter_pred_fxns = &ps_isa_dependent_fxns->s_inter_pred_fxns;
1962 
1963     UWORD8 *pu1_ref_mb_intra_4x4 = ps_proc->pu1_ref_mb_intra_4x4;
1964     UWORD8 *pu1_rec_mb = ((UWORD8 *) ps_proc->s_rec_buf_props.as_component_bufs[0].pv_data);
1965     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
1966     WORD32 i4_rec_strd = ps_proc->s_rec_buf_props.as_component_bufs[0].i4_data_stride;
1967     UWORD8 *pu1_nnz = (UWORD8 *) ps_proc->au4_nnz_intra_4x4;
1968     UWORD8 u1_cbp_l = 0;
1969     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
1970     const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
1971 
1972     /* Process 16 4x4 lum sub-blocks of the MB in scan order */
1973     for(b8 = 0; b8 < 4; b8++)
1974     {
1975         /* if in case cbp for the 8x8 block is zero, send no residue */
1976         ps_mb_coeff_data_b8 = *pv_mb_coeff_data;
1977 
1978         for(b4 = 0; b4 < 4; b4++, pu1_nnz++, pi2_res_mb += MB_SIZE)
1979         {
1980             /********************************************************/
1981             /*  pack coeff data for entropy coding                  */
1982             /********************************************************/
1983             ps_mb_coeff_data = *pv_mb_coeff_data;
1984 
1985             /* write number of non zero coefficients */
1986             ps_mb_coeff_data->i4_sig_map_nnz = *pu1_nnz;
1987 
1988             if(*pu1_nnz)
1989             {
1990                 for(u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < *pu1_nnz;
1991                     coeff_cnt++)
1992                 {
1993                     if(pi2_res_mb[pu1_scan_order[coeff_cnt]])
1994                     {
1995                         /* write residue */
1996                         ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] =
1997                             pi2_res_mb[pu1_scan_order[coeff_cnt]];
1998                         u4_s_map |= mask;
1999                     }
2000                     mask <<= 1;
2001                 }
2002                 /* write significant coeff map */
2003                 ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
2004 
2005                 /* update ptr to coeff data */
2006                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
2007 
2008                 /* cbp */
2009                 u1_cbp_l |= (1 << b8);
2010             }
2011             else
2012             {
2013                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
2014             }
2015         }
2016 
2017         /* if the 8x8 block has no residue, nothing needs to be sent to entropy */
2018         if(!(u1_cbp_l & (1 << b8)))
2019         {
2020             *pv_mb_coeff_data = ps_mb_coeff_data_b8;
2021         }
2022     }
2023 
2024     ps_inter_pred_fxns->pf_inter_pred_luma_copy(pu1_ref_mb_intra_4x4, pu1_rec_mb, MB_SIZE,
2025                                                 i4_rec_strd, MB_SIZE, MB_SIZE, NULL, 0);
2026 
2027     return (u1_cbp_l);
2028 }
2029 
2030 /**
2031 *******************************************************************************
2032 *
2033 * @brief performs chroma core coding for intra macro blocks
2034 *
2035 * @par Description:
2036 *  If the current MB is to be intra coded with mb type chroma I8x8, the MB is
2037 *  first predicted using intra 8x8 prediction filters. The predicted data is
2038 *  compared with the input for error and the error is transformed. The DC
2039 *  coefficients of each transformed sub blocks are further transformed using
2040 *  Hadamard transform. The resulting coefficients are quantized, packed and sent
2041 *  for entropy coding.
2042 *
2043 * @param[in] ps_proc_ctxt
2044 *  pointer to the current macro block context
2045 *
2046 * @returns u1_cbp_c
2047 *  coded block pattern chroma
2048 *
2049 * @remarks
2050 *  The traversal of 4x4 subblocks in the 8x8 macroblock is as per the scan order
2051 *  mentioned in h.264 specification
2052 *
2053 *******************************************************************************
2054 */
isvce_code_chroma_intra_macroblock_8x8(isvce_process_ctxt_t * ps_proc)2055 UWORD8 isvce_code_chroma_intra_macroblock_8x8(isvce_process_ctxt_t *ps_proc)
2056 {
2057     buffer_container_t s_src;
2058     buffer_container_t s_pred;
2059     buffer_container_t s_recon;
2060     buffer_container_t s_res;
2061     buffer_container_t s_res_pred;
2062     buffer_container_t s_quant_coeffs;
2063 
2064     /* Control signal for inverse transform */
2065     UWORD32 u4_cntrl;
2066 
2067     isvce_codec_t *ps_codec = ps_proc->ps_codec;
2068     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[1];
2069     isa_dependent_fxns_t *ps_isa_dependent_fxns = &ps_codec->s_isa_dependent_fxns;
2070     iq_it_res_rec_constants_t s_iq_it_res_rec_constants = {
2071         .pu2_iscal_mat = ps_qp_params->pu2_iscale_mat,
2072         .pu2_weigh_mat = ps_qp_params->pu2_weigh_mat,
2073         .u4_qp_div_6 = ps_qp_params->u1_qp_div};
2074 
2075     UWORD8 *pu1_pred_mb = NULL;
2076     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
2077     WORD32 i4_pred_stride = ps_proc->i4_pred_strd;
2078     WORD32 i4_res_strd = ps_proc->i4_res_strd;
2079     UWORD8 u1_intra_mode = ps_proc->u1_c_i8_mode;
2080     UWORD8 u1_cbp_c = 0;
2081     UWORD8 au1_nnz[2 * (NUM_4x4_IN_8x8 + 1)] = {0};
2082     /* pointer to packed mb coeff data */
2083     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
2084     /* See if we need to swap U and V plances for entropy */
2085     UWORD32 u4_swap_uv = (ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420SP_VU);
2086 
2087     if(PLANE_CH_I8x8 == u1_intra_mode)
2088     {
2089         pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma_plane;
2090     }
2091     else
2092     {
2093         pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma;
2094     }
2095 
2096     s_src = ps_proc->s_src_buf_props.as_component_bufs[UV];
2097     s_recon = ps_proc->s_rec_buf_props.as_component_bufs[UV];
2098     s_pred.pv_data = pu1_pred_mb;
2099     s_pred.i4_data_stride = i4_pred_stride;
2100     s_quant_coeffs.pv_data = pi2_res_mb;
2101     s_quant_coeffs.i4_data_stride = i4_res_strd;
2102 
2103     s_res = ps_codec->s_svc_ilp_data.ps_residual_bufs[ps_proc->u1_spatial_layer_id]
2104                 .as_component_bufs[UV];
2105     s_res.pv_data = ((WORD16 *) s_res.pv_data) + ps_proc->i4_mb_x * MB_SIZE +
2106                     ps_proc->i4_mb_y * (MB_SIZE / 2) * s_res.i4_data_stride;
2107 
2108     s_res_pred = ps_proc->ps_mb_res_buf->as_component_bufs[U];
2109 
2110     /********************************************************/
2111     /*  error estimation,                                   */
2112     /*  transform                                           */
2113     /*  quantization                                        */
2114     /********************************************************/
2115     isvce_chroma_8x8_resi_trans_dctrans_quant(
2116         &s_src, &s_pred, &s_quant_coeffs, &s_res_pred, ps_isa_dependent_fxns,
2117         ps_qp_params->pu2_scale_mat, ps_qp_params->pu2_thres_mat, au1_nnz, ps_qp_params->u1_qbits,
2118         ps_qp_params->u4_dead_zone, 0);
2119 
2120     /********************************************************/
2121     /*  pack coeff data for entropy coding                  */
2122     /********************************************************/
2123     isvce_pack_c_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_c, au1_nnz,
2124                     ps_codec->u4_thres_resi, &u4_cntrl, u4_swap_uv);
2125 
2126     /********************************************************/
2127     /*  ierror estimation,                                  */
2128     /*  itransform                                          */
2129     /*  iquantization                                       */
2130     /********************************************************/
2131     isvce_chroma_8x8_idctrans_iquant_itrans_recon(
2132         &s_quant_coeffs, &s_pred, &s_recon, &s_res, &s_res_pred, &s_iq_it_res_rec_constants,
2133         ps_isa_dependent_fxns, ps_proc->pv_scratch_buff, u4_cntrl, 0);
2134 
2135     memcpy(ps_proc->au1_chroma_nnz, au1_nnz, sizeof(ps_proc->au1_chroma_nnz));
2136 
2137     return (u1_cbp_c);
2138 }
2139 
2140 /**
2141 *******************************************************************************
2142 *
2143 * @brief performs luma core coding when  mode is inter
2144 *
2145 * @par Description:
2146 *  If the current mb is to be coded as inter the mb is predicted based on the
2147 *  sub mb partitions and corresponding motion vectors generated by ME. Then,
2148 *  error is computed between the input blk and the estimated blk. This error is
2149 *  transformed, quantized. The quantized coefficients are packed in scan order
2150 *  for entropy coding
2151 *
2152 * @param[in] ps_proc_ctxt
2153 *  pointer to the current macro block context
2154 *
2155 * @returns u1_cbp_l
2156 *  coded block pattern luma
2157 *
2158 * @remarks none
2159 *
2160 *******************************************************************************
2161 */
2162 
isvce_code_luma_inter_macroblock_16x16(isvce_process_ctxt_t * ps_proc)2163 UWORD8 isvce_code_luma_inter_macroblock_16x16(isvce_process_ctxt_t *ps_proc)
2164 {
2165     buffer_container_t s_src;
2166     buffer_container_t s_pred;
2167     buffer_container_t s_recon;
2168     buffer_container_t s_res;
2169     buffer_container_t s_res_pred;
2170     buffer_container_t s_quant_coeffs;
2171 
2172     /*Control signal of itrans*/
2173     UWORD32 u4_cntrl;
2174 
2175     isvce_codec_t *ps_codec = ps_proc->ps_codec;
2176     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
2177     isa_dependent_fxns_t *ps_isa_dependent_fxns = &ps_codec->s_isa_dependent_fxns;
2178     iq_it_res_rec_constants_t s_iq_it_res_rec_constants = {
2179         .pu2_iscal_mat = ps_qp_params->pu2_iscale_mat,
2180         .pu2_weigh_mat = ps_qp_params->pu2_weigh_mat,
2181         .u4_qp_div_6 = ps_qp_params->u1_qp_div};
2182 
2183     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
2184     WORD32 i4_res_strd = ps_proc->i4_res_strd;
2185     UWORD8 u1_cbp_l = 0;
2186     UWORD8 *pu1_nnz = (UWORD8 *) ps_proc->au4_nnz;
2187     /* pointer to packed mb coeff data */
2188     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
2189 
2190     ps_proc->au4_nnz[0] = 0;
2191     ps_proc->au4_nnz[1] = 0;
2192     ps_proc->au4_nnz[2] = 0;
2193     ps_proc->au4_nnz[3] = 0;
2194     ps_proc->au4_nnz[4] = 0;
2195 
2196     /********************************************************/
2197     /*  prediction                                          */
2198     /********************************************************/
2199     isvce_motion_comp_luma(ps_proc, &s_pred);
2200 
2201     s_src = ps_proc->s_src_buf_props.as_component_bufs[0];
2202     s_recon = ps_proc->s_rec_buf_props.as_component_bufs[0];
2203     s_quant_coeffs.pv_data = pi2_res_mb;
2204     s_quant_coeffs.i4_data_stride = i4_res_strd;
2205 
2206     s_res = ps_codec->s_svc_ilp_data.ps_residual_bufs[ps_proc->u1_spatial_layer_id]
2207                 .as_component_bufs[Y];
2208     s_res.pv_data = ((WORD16 *) s_res.pv_data) + ps_proc->i4_mb_x * MB_SIZE +
2209                     ps_proc->i4_mb_y * MB_SIZE * s_res.i4_data_stride;
2210 
2211     s_res_pred = ps_proc->ps_mb_res_buf->as_component_bufs[Y];
2212 
2213     /********************************************************/
2214     /*  error estimation,                                   */
2215     /*  transform                                           */
2216     /*  quantization                                        */
2217     /********************************************************/
2218     if(ps_proc->u4_min_sad_reached == 0 || ps_proc->u4_min_sad != 0)
2219     {
2220         isvce_luma_16x16_resi_trans_dctrans_quant(
2221             &s_src, &s_pred, &s_quant_coeffs, &s_res_pred, ps_isa_dependent_fxns,
2222             ps_qp_params->pu2_scale_mat, ps_qp_params->pu2_thres_mat, pu1_nnz,
2223             ps_qp_params->u1_qbits, ps_qp_params->u4_dead_zone, DISABLE_DC_TRANSFORM,
2224             ps_proc->ps_mb_info->u1_residual_prediction_flag);
2225 
2226         /********************************************************/
2227         /*  pack coeff data for entropy coding                  */
2228         /********************************************************/
2229         isvce_pack_l_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_l, pu1_nnz,
2230                         ps_codec->u4_thres_resi, &u4_cntrl);
2231     }
2232     else
2233     {
2234         u1_cbp_l = 0;
2235         u4_cntrl = 0;
2236     }
2237 
2238     /********************************************************/
2239     /*  ierror estimation,                                  */
2240     /*  itransform                                          */
2241     /*  iquantization                                       */
2242     /********************************************************/
2243 
2244     /*If the frame is not to be used for P frame reference or dumping recon
2245      * we only will use the reocn for only predicting intra Mbs
2246      * THis will need only right and bottom edge 4x4 blocks recon
2247      * Hence we selectively enable them using control signal(including DC)
2248      */
2249     if(ps_proc->u4_compute_recon != 1)
2250     {
2251         u4_cntrl &= 0x111F0000;
2252     }
2253 
2254     isvce_luma_16x16_idctrans_iquant_itrans_recon(
2255         &s_quant_coeffs, &s_pred, &s_recon, &s_res, &s_res_pred, &s_iq_it_res_rec_constants,
2256         ps_isa_dependent_fxns, ps_proc->pv_scratch_buff, u4_cntrl, DISABLE_DC_TRANSFORM,
2257         ps_proc->ps_mb_info->u1_residual_prediction_flag);
2258 
2259     return (u1_cbp_l);
2260 }
2261 
2262 /**
2263 *******************************************************************************
2264 *
2265 * @brief performs chroma core coding for inter macro blocks
2266 *
2267 * @par Description:
2268 *  If the current mb is to be coded as inter predicted mb,based on the sub mb
2269 *partitions and corresponding motion vectors generated by ME  ,prediction is
2270 *done. Then, error is computed between the input blk and the estimated blk. This
2271 *error is transformed , quantized. The quantized coefficients are packed in scan
2272 *order for entropy coding.
2273 *
2274 * @param[in] ps_proc_ctxt
2275 *  pointer to the current macro block context
2276 *
2277 * @returns u1_cbp_l
2278 *  coded block pattern chroma
2279 *
2280 * @remarks none
2281 *
2282 *******************************************************************************
2283 */
isvce_code_chroma_inter_macroblock_8x8(isvce_process_ctxt_t * ps_proc)2284 UWORD8 isvce_code_chroma_inter_macroblock_8x8(isvce_process_ctxt_t *ps_proc)
2285 {
2286     buffer_container_t s_src;
2287     buffer_container_t s_pred;
2288     buffer_container_t s_recon;
2289     buffer_container_t s_res;
2290     buffer_container_t s_res_pred;
2291     buffer_container_t s_quant_coeffs;
2292 
2293     /*Control signal for inverse transform*/
2294     UWORD32 u4_cntrl;
2295 
2296     isvce_codec_t *ps_codec = ps_proc->ps_codec;
2297     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[1];
2298     isa_dependent_fxns_t *ps_isa_dependent_fxns = &ps_codec->s_isa_dependent_fxns;
2299     iq_it_res_rec_constants_t s_iq_it_res_rec_constants = {
2300         .pu2_iscal_mat = ps_qp_params->pu2_iscale_mat,
2301         .pu2_weigh_mat = ps_qp_params->pu2_weigh_mat,
2302         .u4_qp_div_6 = ps_qp_params->u1_qp_div};
2303 
2304     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
2305     WORD32 i4_res_strd = ps_proc->i4_res_strd;
2306     UWORD8 u1_cbp_c = 0;
2307     UWORD8 au1_nnz[2 * (NUM_4x4_IN_8x8 + 1)] = {0};
2308     /* pointer to packed mb coeff data */
2309     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
2310     /*See if we need to swap U and V plances for entropy*/
2311     UWORD32 u4_swap_uv = ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420SP_VU;
2312 
2313     isvce_motion_comp_chroma(ps_proc, &s_pred);
2314 
2315     s_src = ps_proc->s_src_buf_props.as_component_bufs[UV];
2316     s_recon = ps_proc->s_rec_buf_props.as_component_bufs[UV];
2317     s_quant_coeffs.pv_data = pi2_res_mb;
2318     s_quant_coeffs.i4_data_stride = i4_res_strd;
2319 
2320     s_res = ps_codec->s_svc_ilp_data.ps_residual_bufs[ps_proc->u1_spatial_layer_id]
2321                 .as_component_bufs[UV];
2322     s_res.pv_data = ((WORD16 *) s_res.pv_data) + ps_proc->i4_mb_x * MB_SIZE +
2323                     ps_proc->i4_mb_y * (MB_SIZE / 2) * s_res.i4_data_stride;
2324 
2325     s_res_pred = ps_proc->ps_mb_res_buf->as_component_bufs[UV];
2326 
2327     /********************************************************/
2328     /*  error estimation,                                   */
2329     /*  transform                                           */
2330     /*  quantization                                        */
2331     /********************************************************/
2332     isvce_chroma_8x8_resi_trans_dctrans_quant(
2333         &s_src, &s_pred, &s_quant_coeffs, &s_res_pred, ps_isa_dependent_fxns,
2334         ps_qp_params->pu2_scale_mat, ps_qp_params->pu2_thres_mat, au1_nnz, ps_qp_params->u1_qbits,
2335         ps_qp_params->u4_dead_zone, ps_proc->ps_mb_info->u1_residual_prediction_flag);
2336 
2337     /********************************************************/
2338     /*  pack coeff data for entropy coding                  */
2339     /********************************************************/
2340     isvce_pack_c_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_c, au1_nnz,
2341                     ps_codec->u4_thres_resi, &u4_cntrl, u4_swap_uv);
2342 
2343     /********************************************************/
2344     /*  ierror estimation,                                  */
2345     /*  itransform                                          */
2346     /*  iquantization                                       */
2347     /********************************************************/
2348 
2349     /* If the frame is not to be used for P frame reference or dumping recon
2350      * we only will use the reocn for only predicting intra Mbs
2351      * THis will need only right and bottom edge 4x4 blocks recon
2352      * Hence we selectively enable them using control signal(including DC)
2353      */
2354     if(!ps_proc->u4_compute_recon)
2355     {
2356         u4_cntrl &= 0x7700C000;
2357     }
2358 
2359     isvce_chroma_8x8_idctrans_iquant_itrans_recon(
2360         &s_quant_coeffs, &s_pred, &s_recon, &s_res, &s_res_pred, &s_iq_it_res_rec_constants,
2361         ps_isa_dependent_fxns, ps_proc->pv_scratch_buff, u4_cntrl,
2362         ps_proc->ps_mb_info->u1_residual_prediction_flag);
2363 
2364     memcpy(ps_proc->au1_chroma_nnz, au1_nnz, sizeof(ps_proc->au1_chroma_nnz));
2365 
2366     return (u1_cbp_c);
2367 }
2368