xref: /aosp_15_r20/external/libavc/encoder/ih264e_core_coding.c (revision 495ae853bb871d1e5a258cb02c2cc13cde8ddb9a)
1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /**
22 *******************************************************************************
23 * @file
24 *  ih264e_core_coding.c
25 *
26 * @brief
27 *  This file contains routines that perform luma and chroma core coding of
28 *  H264 macroblocks
29 *
30 * @author
31 *  ittiam
32 *
33 * @par List of Functions:
34 *  - ih264e_luma_16x16_resi_trans_dctrans_quant
35 *  - ih264e_luma_16x16_idctrans_iquant_itrans_recon
36 *  - ih264e_chroma_8x8_resi_trans_dctrans_quant
37 *  - ih264e_chroma_8x8_idctrans_iquant_itrans_recon
38 *  - ih264e_pack_l_mb_i16
39 *  - ih264e_pack_l_mb
40 *  - ih264e_pack_c_mb_i8
41 *  - ih264e_code_luma_intra_macroblock_16x16
42 *  - ih264e_code_luma_intra_macroblock_4x4
43 *  - ih264e_code_luma_intra_macroblock_4x4_rdopt_on
44 *  - ih264e_code_chroma_intra_macroblock_8x8
45 *  - ih264e_code_luma_inter_macroblock_16x16
46 *  - ih264e_code_chroma_inter_macroblock_8x8
47 *
48 * @remarks
49 *  none
50 *
51 *******************************************************************************
52 */
53 
54 /*****************************************************************************/
55 /* File Includes                                                             */
56 /*****************************************************************************/
57 
58 /* System Include Files */
59 #include <stdio.h>
60 #include <string.h>
61 #include <assert.h>
62 
63 /* User Include Files */
64 #include "ih264e_config.h"
65 #include "ih264_typedefs.h"
66 #include "iv2.h"
67 #include "ive2.h"
68 
69 #include "ih264_macros.h"
70 #include "ih264_defs.h"
71 #include "ih264_mem_fns.h"
72 #include "ih264_padding.h"
73 #include "ih264_structs.h"
74 #include "ih264_trans_quant_itrans_iquant.h"
75 #include "ih264_inter_pred_filters.h"
76 #include "ih264_intra_pred_filters.h"
77 #include "ih264_deblk_edge_filters.h"
78 #include "ih264_trans_data.h"
79 #include "ih264_cabac_tables.h"
80 #include "ih264_platform_macros.h"
81 
82 #include "ime_defs.h"
83 #include "ime_distortion_metrics.h"
84 #include "ime_structs.h"
85 
86 #include "irc_cntrl_param.h"
87 #include "irc_frame_info_collector.h"
88 
89 #include "ih264e_error.h"
90 #include "ih264e_defs.h"
91 #include "ih264e_globals.h"
92 #include "ih264e_rate_control.h"
93 #include "ih264e_bitstream.h"
94 #include "ih264e_cabac_structs.h"
95 #include "ih264e_structs.h"
96 #include "ih264e_mc.h"
97 #include "ih264e_core_coding.h"
98 
99 
100 /*****************************************************************************/
101 /* Function Definitions                                                      */
102 /*****************************************************************************/
103 
104 /**
105 *******************************************************************************
106 *
107 * @brief
108 *  This function performs does the DCT transform then Hadamard transform
109 *  and quantization for a macroblock when the mb mode is intra 16x16 mode
110 *
111 * @par Description:
112 *  First  cf4 is done on all 16 4x4 blocks of the 16x16 input block.
113 *  Then hadamard transform is done on the DC coefficients
114 *  Quantization is then performed on the 16x16 block, 4x4 wise
115 *
116 * @param[in] pu1_src
117 *  Pointer to source sub-block
118 *
119 * @param[in] pu1_pred
120 *  Pointer to prediction sub-block
121 *
122 * @param[in] pi2_out
123 *  Pointer to residual sub-block
124 *  The output will be in linear format
125 *  The first 16 continuous locations will contain the values of Dc block
126 *  After DC block and a stride 1st AC block will follow
127 *  After one more stride next AC block will follow
128 *  The blocks will be in raster scan order
129 *
130 * @param[in] src_strd
131 *  Source stride
132 *
133 * @param[in] pred_strd
134 *  Prediction stride
135 *
136 * @param[in] dst_strd
137 *  Destination stride
138 *
139 * @param[in] pu2_scale_matrix
140 *  The quantization matrix for 4x4 transform
141 *
142 * @param[in] pu2_threshold_matrix
143 *  Threshold matrix
144 *
145 * @param[in] u4_qbits
146 *  15+QP/6
147 *
148 * @param[in] u4_round_factor
149 *  Round factor for quant
150 *
151 * @param[out] pu1_nnz
152 *  Memory to store the non-zeros after transform
153 *  The first byte will be the nnz of DC block
154 *  From the next byte the AC nnzs will be stored in raster scan order
155 *
156 * @param u4_dc_flag
157 *  Signals if Dc transform is to be done or not
158 *   1 -> Dc transform will be done
159 *   0 -> Dc transform will not be done
160 *
161 * @remarks
162 *
163 *******************************************************************************
164 */
ih264e_luma_16x16_resi_trans_dctrans_quant(codec_t * ps_codec,UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD16 * pi2_out,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz,UWORD32 u4_dc_flag)165 void ih264e_luma_16x16_resi_trans_dctrans_quant(codec_t *ps_codec,
166                                                 UWORD8 *pu1_src,
167                                                 UWORD8 *pu1_pred,
168                                                 WORD16 *pi2_out,
169                                                 WORD32 src_strd,
170                                                 WORD32 pred_strd,
171                                                 WORD32 dst_strd,
172                                                 const UWORD16 *pu2_scale_matrix,
173                                                 const UWORD16 *pu2_threshold_matrix,
174                                                 UWORD32 u4_qbits,
175                                                 UWORD32 u4_round_factor,
176                                                 UWORD8 *pu1_nnz,
177                                                 UWORD32 u4_dc_flag)
178 
179 {
180     WORD32 blk_cntr;
181     WORD32 i4_offsetx, i4_offsety;
182     UWORD8 *pu1_curr_src, *pu1_curr_pred;
183 
184     WORD16 *pi2_dc_str = pi2_out;
185 
186     /* Move to the ac addresses */
187     pu1_nnz++;
188     pi2_out += dst_strd;
189 
190     for (blk_cntr = 0; blk_cntr < NUM_LUMA4x4_BLOCKS_IN_MB; blk_cntr++)
191     {
192         IND2SUB_LUMA_MB(blk_cntr, i4_offsetx, i4_offsety);
193 
194         pu1_curr_src = pu1_src + i4_offsetx + i4_offsety * src_strd;
195         pu1_curr_pred = pu1_pred + i4_offsetx + i4_offsety * pred_strd;
196 
197         ps_codec->pf_resi_trans_quant_4x4(pu1_curr_src, pu1_curr_pred,
198                                           pi2_out + blk_cntr * dst_strd,
199                                           src_strd, pred_strd, pu2_scale_matrix,
200                                           pu2_threshold_matrix, u4_qbits,
201                                           u4_round_factor, &pu1_nnz[blk_cntr],
202                                           &pi2_dc_str[blk_cntr]);
203 
204     }
205 
206     if (!u4_dc_flag)
207         return;
208 
209     /*
210      * In case of i16x16, we need to remove the contribution of dc coeffs into
211      * nnz of each block. We are doing that in the packing function
212      */
213 
214     /* Adjust pointers to point to dc values */
215     pi2_out -= dst_strd;
216     pu1_nnz--;
217 
218     u4_qbits++;
219     u4_round_factor <<= 1;
220 
221     ps_codec->pf_hadamard_quant_4x4(pi2_dc_str, pi2_out, pu2_scale_matrix,
222                                     pu2_threshold_matrix, u4_qbits,
223                                     u4_round_factor, &pu1_nnz[0]);
224 }
225 
226 /**
227 *******************************************************************************
228 *
229 * @brief
230 *  This function performs the intra 16x16 inverse transform process for H264
231 *  it includes inverse Dc transform, inverse quant and then inverse transform
232 *
233 * @par Description:
234 *
235 * @param[in] pi2_src
236 *  Input data, 16x16 size
237 *  First 16 mem locations will have the Dc coffs in raster scan order in linear
238 *  fashion after a stride 1st AC clock will be present again in raster can order
239 *  Then each AC block of the 16x16 block will follow in raster scan order
240 *
241 * @param[in] pu1_pred
242 *  The predicted data, 16x16 size
243 *  Block by block form
244 *
245 * @param[in] pu1_out
246 *  Output 16x16
247 *  In block by block form
248 *
249 * @param[in] src_strd
250 *  Source stride
251 *
252 * @param[in] pred_strd
253 *  input stride for prediction buffer
254 *
255 * @param[in] out_strd
256 *  input stride for output buffer
257 *
258 * @param[in] pu2_iscale_mat
259 *  Inverse quantization matrix for 4x4 transform
260 *
261 * @param[in] pu2_weigh_mat
262 *  weight matrix of 4x4 transform
263 *
264 * @param[in] qp_div
265 *  QP/6
266 *
267 * @param[in] u4_cntrl
268 *  Controls the transform path
269 *  total Last 17 bits are used
270 *  the 16th th bit will correspond to DC block
271 *  and 32-17 will correspond to the ac blocks in raster scan order
272 *  bit equaling zero indicates that the entire 4x4 block is zero for DC
273 *  For AC blocks a bit equaling zero will mean that all 15 AC coffs of the block
274 *  is nonzero
275 *
276 * @param[in] u4_dc_trans_flag
277 *  Differentiates intra vs inter
278 *
279 * @param[in] pi4_tmp
280 *  Input temporary buffer
281 *  needs to be at least COFF_CNT_SUB_BLK_4x4+COFF_CNT_SUB_BLK_4x4 size
282 *
283 * @returns
284 *  none
285 *
286 * @remarks
287 *  The all zero case must be taken care outside
288 *
289 *******************************************************************************
290 */
ih264e_luma_16x16_idctrans_iquant_itrans_recon(codec_t * ps_codec,WORD16 * pi2_src,UWORD8 * pu1_pred,UWORD8 * pu1_out,WORD32 src_strd,WORD32 pred_strd,WORD32 out_strd,const UWORD16 * pu2_iscale_mat,const UWORD16 * pu2_weigh_mat,UWORD32 qp_div,UWORD32 u4_cntrl,UWORD32 u4_dc_trans_flag,WORD32 * pi4_tmp)291 void ih264e_luma_16x16_idctrans_iquant_itrans_recon(codec_t *ps_codec,
292                                                     WORD16 *pi2_src,
293                                                     UWORD8 *pu1_pred,
294                                                     UWORD8 *pu1_out,
295                                                     WORD32 src_strd,
296                                                     WORD32 pred_strd,
297                                                     WORD32 out_strd,
298                                                     const UWORD16 *pu2_iscale_mat,
299                                                     const UWORD16 *pu2_weigh_mat,
300                                                     UWORD32 qp_div,
301                                                     UWORD32 u4_cntrl,
302                                                     UWORD32 u4_dc_trans_flag,
303                                                     WORD32 *pi4_tmp)
304 {
305     /* Start index for inverse quant in a 4x4 block */
306     WORD32 iq_start_idx = (u4_dc_trans_flag == 0) ? 0 : 1;
307 
308     /* Cntrl bits for 4x4 transforms
309      * u4_blk_cntrl       : controls if a 4x4 block should be processed in ac path
310      * u4_dc_cntrl        : controls is a 4x4 block is to be processed in dc path
311      *                    : dc block must contain only single dc coefficient
312      * u4_empty_blk_cntrl : control fot 4x4 block with no coeffs, ie no dc and ac
313      *                    : ie not (ac or dc)
314      */
315     UWORD32 u4_blk_cntrl, u4_dc_cntrl, u4_empty_blk_cntrl;
316 
317     /* tmp registers for block ids */
318     UWORD32 u4_blk_id;
319 
320     /* Subscrripts */
321     WORD32 i4_offset_x, i4_offset_y;
322 
323     UWORD8 *pu1_cur_prd_blk, *pu1_cur_out_blk;
324 
325     /* Src and stride for dc coeffs */
326     UWORD32 u4_dc_inc;
327     WORD16 *pi2_dc_src;
328 
329     /*
330      * For intra blocks we need to do inverse dc transform
331      * In case if intra blocks, its here that we populate the dc bits in cntrl
332      * as they cannot be populated any earlier
333      */
334     if (u4_dc_trans_flag)
335     {
336         UWORD32 cntr, u4_dc_cntrl;
337         /* Do inv hadamard and place the results at the start of each AC block */
338         ps_codec->pf_ihadamard_scaling_4x4(pi2_src, pi2_src, pu2_iscale_mat,
339                                            pu2_weigh_mat, qp_div, pi4_tmp);
340 
341         /* Update the cntrl flag */
342         u4_dc_cntrl = 0;
343         for (cntr = 0; cntr < DC_COEFF_CNT_LUMA_MB; cntr++)
344         {
345             u4_dc_cntrl |= ((pi2_src[cntr] != 0) << (15 - cntr));
346         }
347         /* Mark dc bits as 1 if corresponding ac bit is 0 */
348         u4_dc_cntrl = (~(u4_cntrl >> 16) & u4_dc_cntrl);
349         /* Combine both ac and dc bits */
350         u4_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_LUMA)
351                         | (u4_dc_cntrl & CNTRL_FLAG_DC_MASK_LUMA);
352     }
353 
354     /* Source for dc coeffs
355      * If the block is intra, we have to read dc values from first row of src
356      * then stride for each block is 1, other wise its src stride
357      */
358     pi2_dc_src = (iq_start_idx == 0) ? (pi2_src + src_strd) : pi2_src;
359     u4_dc_inc = (iq_start_idx == 0) ? src_strd : 1;
360 
361     /* The AC blocks starts from 2nd row */
362     pi2_src += src_strd;
363 
364     /* Get the block bits */
365     u4_blk_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_LUMA);
366     u4_dc_cntrl = (u4_cntrl & CNTRL_FLAG_DC_MASK_LUMA) << 16;
367     u4_empty_blk_cntrl = (~(u4_dc_cntrl | u4_blk_cntrl)) & 0xFFFF0000;
368 
369     /* Get first block to process */
370     DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
371     while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
372     {
373         /* Compute address of src blocks */
374         WORD32 i4_src_offset = u4_dc_inc * u4_blk_id;
375 
376         IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
377 
378         /* Compute address of out and pred blocks */
379         pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
380         pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
381 
382         /* Do inv dc transform */
383         ps_codec->pf_iquant_itrans_recon_4x4_dc(pi2_dc_src + i4_src_offset,
384                                                 pu1_cur_prd_blk,
385                                                 pu1_cur_out_blk, pred_strd,
386                                                 out_strd, pu2_iscale_mat,
387                                                 pu2_weigh_mat, qp_div, NULL,
388                                                 iq_start_idx,
389                                                 pi2_dc_src + i4_src_offset);
390         /* Get next DC block to process */
391         DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
392     }
393 
394     /* now process ac/mixed blocks */
395     DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
396     while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
397     {
398 
399         WORD32 i4_src_offset = src_strd * u4_blk_id;
400 
401         IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
402 
403         pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
404         pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
405 
406         ps_codec->pf_iquant_itrans_recon_4x4(pi2_src + i4_src_offset,
407                                              pu1_cur_prd_blk, pu1_cur_out_blk,
408                                              pred_strd, out_strd,
409                                              pu2_iscale_mat, pu2_weigh_mat,
410                                              qp_div, (WORD16*) pi4_tmp,
411                                              iq_start_idx,
412                                              pi2_dc_src + u4_blk_id);
413 
414         DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
415     }
416 
417     /* Now process empty blocks */
418     DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
419     while (u4_blk_id < NUM_LUMA4x4_BLOCKS_IN_MB)
420     {
421         IND2SUB_LUMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
422 
423         pu1_cur_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
424         pu1_cur_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
425 
426         ps_codec->pf_inter_pred_luma_copy(pu1_cur_prd_blk, pu1_cur_out_blk,
427                                           pred_strd, out_strd, SIZE_4X4_BLK_HRZ,
428                                           SIZE_4X4_BLK_VERT, 0, 0);
429 
430         DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
431     }
432 }
433 
434 /**
435 *******************************************************************************
436 *
437 * @brief
438 *  This function performs does the DCT transform then Hadamard transform
439 *  and quantization for a chroma macroblock
440 *
441 * @par Description:
442 *  First  cf4 is done on all 16 4x4 blocks of the 8x8input block
443 *  Then hadamard transform is done on the DC coefficients
444 *  Quantization is then performed on the 8x8 block, 4x4 wise
445 *
446 * @param[in] pu1_src
447 *  Pointer to source sub-block
448 *  The input is in interleaved format for two chroma planes
449 *
450 * @param[in] pu1_pred
451 *  Pointer to prediction sub-block
452 *  Prediction is in inter leaved format
453 *
454 * @param[in] pi2_out
455 *  Pointer to residual sub-block
456 *  The output will be in linear format
457 *  The first 4 continuous locations will contain the values of DC block for U
458 *  and then next 4 will contain for V.
459 *  After DC block and a stride 1st AC block of U plane will follow
460 *  After one more stride next AC block of V plane will follow
461 *  The blocks will be in raster scan order
462 *
463 *  After all the AC blocks of U plane AC blocks of V plane will follow in exact
464 *  same way
465 *
466 * @param[in] src_strd
467 *  Source stride
468 *
469 * @param[in] pred_strd
470 *  Prediction stride
471 *
472 * @param[in] out_strd
473 *  Destination stride
474 *
475 * @param[in] pu2_scale_matrix
476 *  The quantization matrix for 4x4 transform
477 *
478 * @param[in] pu2_threshold_matrix
479 *  Threshold matrix
480 *
481 * @param[in] u4_qbits
482 *  15+QP/6
483 *
484 * @param[in] u4_round_factor
485 *  Round factor for quant
486 *
487 * @param[out] pu1_nnz_c
488 *  Memory to store the non-zeros after transform
489 *  The first byte will be the nnz od DC block for U plane
490 *  From the next byte the AC nnzs will be storerd in raster scan order
491 *  The fifth byte will be nnz of Dc block of V plane
492 *  Then Ac blocks will follow
493 *
494 * @remarks
495 *
496 *******************************************************************************
497 */
ih264e_chroma_8x8_resi_trans_dctrans_quant(codec_t * ps_codec,UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD16 * pi2_out,WORD32 src_strd,WORD32 pred_strd,WORD32 out_strd,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz_c)498 void ih264e_chroma_8x8_resi_trans_dctrans_quant(codec_t *ps_codec,
499                                                 UWORD8 *pu1_src,
500                                                 UWORD8 *pu1_pred,
501                                                 WORD16 *pi2_out,
502                                                 WORD32 src_strd,
503                                                 WORD32 pred_strd,
504                                                 WORD32 out_strd,
505                                                 const UWORD16 *pu2_scale_matrix,
506                                                 const UWORD16 *pu2_threshold_matrix,
507                                                 UWORD32 u4_qbits,
508                                                 UWORD32 u4_round_factor,
509                                                 UWORD8 *pu1_nnz_c)
510 {
511     WORD32 blk_cntr;
512     WORD32 i4_offsetx, i4_offsety;
513     UWORD8 *pu1_curr_src, *pu1_curr_pred;
514 
515     WORD16 pi2_dc_str[8];
516     UWORD8 au1_dcnnz[2];
517 
518     /* Move to the ac addresses */
519     pu1_nnz_c++;
520     pi2_out += out_strd;
521 
522     for (blk_cntr = 0; blk_cntr < NUM_CHROMA4x4_BLOCKS_IN_MB; blk_cntr++)
523     {
524         IND2SUB_CHROMA_MB(blk_cntr, i4_offsetx, i4_offsety);
525 
526         pu1_curr_src = pu1_src + i4_offsetx + i4_offsety * src_strd;
527         pu1_curr_pred = pu1_pred + i4_offsetx + i4_offsety * pred_strd;
528 
529         /* For chroma, v plane nnz is populated from position 5 */
530         ps_codec->pf_resi_trans_quant_chroma_4x4(
531                         pu1_curr_src, pu1_curr_pred,
532                         pi2_out + blk_cntr * out_strd, src_strd, pred_strd,
533                         pu2_scale_matrix, pu2_threshold_matrix, u4_qbits,
534                         u4_round_factor, &pu1_nnz_c[blk_cntr + (blk_cntr > 3)],
535                         &pi2_dc_str[blk_cntr]);
536     }
537 
538     /* Adjust pointers to point to dc values */
539     pi2_out -= out_strd;
540     pu1_nnz_c--;
541 
542     u4_qbits++;
543     u4_round_factor <<= 1;
544 
545     ps_codec->pf_hadamard_quant_2x2_uv(pi2_dc_str, pi2_out, pu2_scale_matrix,
546                                        pu2_threshold_matrix, u4_qbits,
547                                        u4_round_factor, au1_dcnnz);
548 
549     /* Copy the dc nnzs */
550     pu1_nnz_c[0] = au1_dcnnz[0];
551     pu1_nnz_c[5] = au1_dcnnz[1];
552 
553 }
554 
555 /**
556 *******************************************************************************
557 * @brief Does inverse DC transform, inverse quantization inverse transform for
558 * chroma MB
559 *
560 * @par Description:
561 *  Does inverse DC transform, inverse quantization inverse transform for
562 *  chroma MB
563 *
564 * @param[in] pi2_src
565 *  Input data, 16x16 size
566 *  The input is in the form of, first 4 locations will contain DC coeffs of
567 *  U plane, next 4 will contain DC coeffs of V plane, then AC blocks of U plane
568 *  in raster scan order will follow, each block as linear array in raster scan order.
569 *  After a stride next AC block will follow. After all AC blocks of U plane
570 *  V plane AC blocks will follow in exact same order.
571 *
572 * @param[in] pu1_pred
573 *  The predicted data, 8x16 size, U and V interleaved
574 *
575 * @param[in] pu1_out
576 *  Output 8x16, U and V interleaved
577 *
578 * @param[in] src_strd
579 *  Source stride
580 *
581 * @param[in] pred_strd
582 *  input stride for prediction buffer
583 *
584 * @param[in] out_strd
585 *  input stride for output buffer
586 *
587 * @param[in] pu2_iscale_mat
588 *  Inverse quantization martix for 4x4 transform
589 *
590 * @param[in] pu2_weigh_mat
591 *  weight matrix of 4x4 transform
592 *
593 * @param[in] qp_div
594 *  QP/6
595 *
596 * @param[in] u4_cntrl
597 *  Controls the transform path
598 *  the 15 th bit will correspond to DC block of U plane, 14th will indicate the
599 *  V plane Dc block. 32-28 bits will indicate AC blocks of U plane in raster
600 *  scan order. 27-23 bits will indicate AC blocks of V plane in rater scan order.
601 *  The bit 1 implies that there is at least one non zero coeff in a block
602 *
603 * @param[in] pi4_tmp
604 *  Input temporary buffer
605 *  needs to be at least COFF_CNT_SUB_BLK_4x4 + (Number of Dc coeffs for chroma *
606 *  number of planes) in size
607 *
608 * @returns
609 *  none
610 *
611 * @remarks
612 *
613 *******************************************************************************
614 */
ih264e_chroma_8x8_idctrans_iquant_itrans_recon(codec_t * ps_codec,WORD16 * pi2_src,UWORD8 * pu1_pred,UWORD8 * pu1_out,WORD32 src_strd,WORD32 pred_strd,WORD32 out_strd,const UWORD16 * pu2_iscale_mat,const UWORD16 * pu2_weigh_mat,UWORD32 qp_div,UWORD32 u4_cntrl,WORD32 * pi4_tmp)615 void ih264e_chroma_8x8_idctrans_iquant_itrans_recon(codec_t *ps_codec,
616                                                     WORD16 *pi2_src,
617                                                     UWORD8 *pu1_pred,
618                                                     UWORD8 *pu1_out,
619                                                     WORD32 src_strd,
620                                                     WORD32 pred_strd,
621                                                     WORD32 out_strd,
622                                                     const UWORD16 *pu2_iscale_mat,
623                                                     const UWORD16 *pu2_weigh_mat,
624                                                     UWORD32 qp_div,
625                                                     UWORD32 u4_cntrl,
626                                                     WORD32 *pi4_tmp)
627 {
628     /* Cntrl bits for 4x4 transforms
629      * u4_blk_cntrl       : controls if a 4x4 block should be processed in ac path
630      * u4_dc_cntrl        : controls is a 4x4 block is to be processed in dc path
631      *                    : dc block must contain only single dc coefficient
632      * u4_empty_blk_cntrl : control fot 4x4 block with no coeffs, ie no dc and ac
633      *                    : ie not (ac or dc)
634      */
635 
636     UWORD32 u4_blk_cntrl, u4_dc_cntrl, u4_empty_blk_cntrl;
637 
638     /* tmp registers for block ids */
639     WORD32 u4_blk_id;
640 
641     /* Offsets for pointers */
642     WORD32 i4_offset_x, i4_offset_y;
643 
644     /* Pointer to 4x4 blocks */
645     UWORD8 *pu1_cur_4x4_prd_blk, *pu1_cur_4x4_out_blk;
646 
647     /* Tmp register for pointer to dc coffs */
648     WORD16 *pi2_dc_src;
649 
650     WORD16 i2_zero = 0;
651 
652     /* Increment for dc block */
653     WORD32 i4_dc_inc;
654 
655     /*
656      * Lets do the inverse transform for dc coeffs in chroma
657      */
658     if (u4_cntrl & CNTRL_FLAG_DCBLK_MASK_CHROMA)
659     {
660         UWORD32 cntr, u4_dc_cntrl;
661 
662         /* Do inv hadamard for u an v block */
663         ps_codec->pf_ihadamard_scaling_2x2_uv(pi2_src, pi2_src, pu2_iscale_mat,
664                                               pu2_weigh_mat, qp_div, NULL);
665         /*
666          * Update the cntrl flag
667          * Flag is updated as follows bits 15-11 -> u block dc bits
668          */
669         u4_dc_cntrl = 0;
670         for (cntr = 0; cntr < 8; cntr++)
671         {
672             u4_dc_cntrl |= ((pi2_src[cntr] != 0) << (15 - cntr));
673         }
674 
675         /* Mark dc bits as 1 if corresponding ac bit is 0 */
676         u4_dc_cntrl = (~(u4_cntrl >> 16) & u4_dc_cntrl);
677         /* Combine both ac and dc bits */
678         u4_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA)
679                         | (u4_dc_cntrl & CNTRL_FLAG_DC_MASK_CHROMA);
680 
681         /* Since we populated the dc coffs, we have to read them from there */
682         pi2_dc_src = pi2_src;
683         i4_dc_inc = 1;
684     }
685     else
686     {
687         u4_cntrl = u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA;
688         pi2_dc_src = &i2_zero;
689         i4_dc_inc = 0;
690     }
691 
692     /* Get the block bits */
693     u4_blk_cntrl = (u4_cntrl & CNTRL_FLAG_AC_MASK_CHROMA);
694     u4_dc_cntrl = (u4_cntrl & CNTRL_FLAG_DC_MASK_CHROMA) << 16;
695     u4_empty_blk_cntrl = (~(u4_dc_cntrl | u4_blk_cntrl)) & 0xFF000000;
696 
697     /* The AC blocks starts from 2nd row */
698     pi2_src += src_strd;
699 
700     DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
701     while (u4_blk_id < 8)
702     {
703         WORD32 dc_src_offset = u4_blk_id * i4_dc_inc;
704 
705         IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
706 
707         pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
708         pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
709 
710         ps_codec->pf_iquant_itrans_recon_chroma_4x4_dc(
711                         pi2_dc_src + dc_src_offset, pu1_cur_4x4_prd_blk,
712                         pu1_cur_4x4_out_blk, pred_strd, out_strd, NULL, NULL, 0,
713                         NULL, pi2_dc_src + dc_src_offset);
714         /* Get next DC block to process */
715         DEQUEUE_BLKID_FROM_CONTROL(u4_dc_cntrl, u4_blk_id);
716     }
717 
718     /* now process ac/mixed blocks */
719     DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
720     while (u4_blk_id < 8)
721     {
722         WORD32 i4_src_offset = src_strd * u4_blk_id;
723         WORD32 dc_src_offset = i4_dc_inc * u4_blk_id;
724 
725         IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
726 
727         pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
728         pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
729 
730         ps_codec->pf_iquant_itrans_recon_chroma_4x4(pi2_src + i4_src_offset,
731                                                     pu1_cur_4x4_prd_blk,
732                                                     pu1_cur_4x4_out_blk,
733                                                     pred_strd, out_strd,
734                                                     pu2_iscale_mat,
735                                                     pu2_weigh_mat, qp_div,
736                                                     (WORD16 *) pi4_tmp,
737                                                     pi2_dc_src + dc_src_offset);
738 
739         DEQUEUE_BLKID_FROM_CONTROL(u4_blk_cntrl, u4_blk_id);
740     }
741 
742     /* Now process empty blocks */
743     DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
744     while (u4_blk_id < 8)
745     {
746         IND2SUB_CHROMA_MB(u4_blk_id, i4_offset_x, i4_offset_y);
747 
748         pu1_cur_4x4_prd_blk = pu1_pred + i4_offset_x + i4_offset_y * pred_strd;
749         pu1_cur_4x4_out_blk = pu1_out + i4_offset_x + i4_offset_y * out_strd;
750 
751         ps_codec->pf_interleave_copy(pu1_cur_4x4_prd_blk, pu1_cur_4x4_out_blk,
752                                      pred_strd, out_strd, SIZE_4X4_BLK_VERT,
753                                      SIZE_4X4_BLK_HRZ);
754 
755         DEQUEUE_BLKID_FROM_CONTROL(u4_empty_blk_cntrl, u4_blk_id);
756     }
757 }
758 
759 /**
760 ******************************************************************************
761 *
762 * @brief  This function packs residue of an i16x16 luma mb for entropy coding
763 *
764 * @par   Description
765 *  An i16 macro block contains two classes of units, dc 4x4 block and
766 *  4x4 ac blocks. while packing the mb, the dc block is sent first, and
767 *  the 16 ac blocks are sent next in scan order. Each and every block is
768 *  represented by 3 parameters (nnz, significant coefficient map and the
769 *  residue coefficients itself). If a 4x4 unit does not have any coefficients
770 *  then only nnz is sent. Inside a 4x4 block the individual coefficients are
771 *  sent in scan order.
772 *
773 *  The first byte of each block will be nnz of the block, if it is non zero,
774 *  a 2 byte significance map is sent. This is followed by nonzero coefficients.
775 *  This is repeated for 1 dc + 16 ac blocks.
776 *
777 * @param[in]  pi2_res_mb
778 *  pointer to residue mb
779 *
780 * @param[in, out]  pv_mb_coeff_data
781 *  buffer pointing to packed residue coefficients
782 *
783 * @param[in]  u4_res_strd
784 *  residual block stride
785 *
786 * @param[out]  u1_cbp_l
787 *  coded block pattern luma
788 *
789 * @param[in]   pu1_nnz
790 *  number of non zero coefficients in each 4x4 unit
791 *
792 * @param[out]
793 *  Control signal for inverse transform of 16x16 blocks
794 *
795 * @return none
796 *
797 * @ remarks
798 *
799 ******************************************************************************
800 */
ih264e_pack_l_mb_i16(WORD16 * pi2_res_mb,void ** pv_mb_coeff_data,WORD32 i4_res_strd,UWORD8 * u1_cbp_l,UWORD8 * pu1_nnz,UWORD32 * pu4_cntrl)801 void ih264e_pack_l_mb_i16(WORD16 *pi2_res_mb,
802                           void **pv_mb_coeff_data,
803                           WORD32 i4_res_strd,
804                           UWORD8 *u1_cbp_l,
805                           UWORD8 *pu1_nnz,
806                           UWORD32 *pu4_cntrl)
807 {
808     /* pointer to packed sub block buffer space */
809     tu_sblk_coeff_data_t *ps_mb_coeff_data = (*pv_mb_coeff_data), *ps_mb_coeff_data_ac;
810 
811     /* no of non zero coefficients in the current sub block */
812     UWORD32 u4_nnz_cnt;
813 
814     /* significant coefficient map */
815     UWORD32 u4_s_map;
816 
817     /* pointer to scanning matrix */
818     const UWORD8 *pu1_scan_order;
819 
820     /* number of non zeros in sub block */
821     UWORD32 u4_nnz;
822 
823     /* coeff scan order */
824     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
825 
826     /* temp var */
827     UWORD32 coeff_cnt, mask, b4,u4_cntrl=0;
828 
829     /*DC and AC coeff pointers*/
830     WORD16 *pi2_res_mb_ac,*pi2_res_mb_dc;
831 
832     /********************************************************/
833     /*  pack dc coeff data for entropy coding               */
834     /********************************************************/
835 
836     pi2_res_mb_dc = pi2_res_mb;
837     pu1_scan_order = gu1_luma_scan_order_dc;
838 
839     u4_nnz = *pu1_nnz;
840     u4_cntrl = 0;
841 
842     /* write number of non zero coefficients */
843     ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
844 
845     if (u4_nnz)
846     {
847         for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
848         {
849             if (pi2_res_mb_dc[pu1_scan_order[coeff_cnt]])
850             {
851                 /* write residue */
852                 ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb_dc[pu1_scan_order[coeff_cnt]];
853                 u4_s_map |= mask;
854             }
855             mask <<= 1;
856         }
857         /* write significant coeff map */
858         ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
859         (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
860 
861         u4_cntrl = 0x00008000;// Set DC bit in ctrl code
862     }
863     else
864     {
865         (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
866     }
867 
868     /********************************************************/
869     /*  pack ac coeff data for entropy coding               */
870     /********************************************************/
871 
872     pu1_nnz ++;
873     pu1_scan_order = gu1_luma_scan_order;
874     pi2_res_mb += i4_res_strd; /*Move to AC block*/
875 
876     ps_mb_coeff_data_ac = (*pv_mb_coeff_data);
877 
878     for (b4 = 0; b4 < 16; b4++)
879     {
880         ps_mb_coeff_data = (*pv_mb_coeff_data);
881 
882         u4_nnz = pu1_nnz[u1_scan_order[b4]];
883 
884         /* Jump according to the scan order */
885         pi2_res_mb_ac = pi2_res_mb + (i4_res_strd * u1_scan_order[b4]);
886 
887         /*
888          * Since this is a i16x16 block, we should not count dc coeff on indi
889          * vidual 4x4 blocks to nnz. But due to the implementation of 16x16
890          * trans function, we add dc's nnz to u4_nnz too. Hence we adjust that
891          * here
892          */
893         u4_nnz -= (pi2_res_mb_ac[0] != 0);
894 
895         /* write number of non zero coefficients */
896         ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
897 
898         if (u4_nnz)
899         {
900             for (u4_nnz_cnt = 0, coeff_cnt = 1, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
901             {
902                 if (pi2_res_mb_ac[pu1_scan_order[coeff_cnt]])
903                 {
904                     /* write residue */
905                     ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb_ac[pu1_scan_order[coeff_cnt]];
906                     u4_s_map |= mask;
907                 }
908                 mask <<= 1;
909             }
910             /* write significant coeff map */
911             ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
912             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
913             *u1_cbp_l = 15;
914 
915             u4_cntrl |= (1 << (31 - u1_scan_order[b4]));
916         }
917         else
918         {
919             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
920         }
921 
922     }
923 
924     if (!(*u1_cbp_l))
925     {
926         (*pv_mb_coeff_data) = ps_mb_coeff_data_ac;
927     }
928 
929     /* Store the cntrl signal */
930     (*pu4_cntrl) = u4_cntrl;
931     return;
932 }
933 
934 /**
935 ******************************************************************************
936 *
937 * @brief  This function packs residue of an p16x16 luma mb for entropy coding
938 *
939 * @par   Description
940 *  A p16x16 macro block contains two classes of units 16  4x4 ac blocks.
941 *  while packing the mb, the dc block is sent first, and
942 *  the 16 ac blocks are sent next in scan order. Each and every block is
943 *  represented by 3 parameters (nnz, significant coefficient map and the
944 *  residue coefficients itself). If a 4x4 unit does not have any coefficients
945 *  then only nnz is sent. Inside a 4x4 block the individual coefficients are
946 *  sent in scan order.
947 *
948 *  The first byte of each block will be nnz of the block, if it is non zero,
949 *  a 2 byte significance map is sent. This is followed by nonzero coefficients.
950 *  This is repeated for 1 dc + 16 ac blocks.
951 *
952 * @param[in]  pi2_res_mb
953 *  pointer to residue mb
954 *
955 * @param[in, out]  pv_mb_coeff_data
956 *  buffer pointing to packed residue coefficients
957 *
958 * @param[in]  i4_res_strd
959 *  residual block stride
960 *
961 * @param[out]  u1_cbp_l
962 *  coded block pattern luma
963 *
964 * @param[in]   pu1_nnz
965 *  number of non zero coefficients in each 4x4 unit
966 *
967 * @param[out] pu4_cntrl
968 *  Control signal for inverse transform
969 *
970 * @return none
971 *
972 * @remarks Killing coffs not yet coded
973 *
974 ******************************************************************************
975 */
ih264e_pack_l_mb(WORD16 * pi2_res_mb,void ** pv_mb_coeff_data,WORD32 i4_res_strd,UWORD8 * u1_cbp_l,UWORD8 * pu1_nnz,UWORD32 u4_thres_resi,UWORD32 * pu4_cntrl)976 void ih264e_pack_l_mb(WORD16 *pi2_res_mb,
977                       void **pv_mb_coeff_data,
978                       WORD32 i4_res_strd,
979                       UWORD8 *u1_cbp_l,
980                       UWORD8 *pu1_nnz,
981                       UWORD32 u4_thres_resi,
982                       UWORD32 *pu4_cntrl)
983 {
984     /* pointer to packed sub block buffer space */
985     tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8, *ps_mb_coeff_data_mb;
986 
987     /* no of non zero coefficients in the current sub block */
988     UWORD32 u4_nnz_cnt;
989 
990     /* significant coefficient map */
991     UWORD32 u4_s_map;
992 
993     /* pointer to scanning matrix */
994     const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
995 
996     /* number of non zeros in sub block */
997     UWORD32 u4_nnz;
998 
999     /* pointer to residual sub block */
1000     WORD16  *pi2_res_sb;
1001 
1002     /* coeff scan order */
1003     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
1004 
1005     /* coeff cost */
1006     const UWORD8  *pu1_coeff_cost = gu1_coeff_cost;
1007 
1008     /* temp var */
1009     UWORD32 u4_mb_coeff_cost = 0, u4_b8_coeff_cost = 0, coeff_cnt, mask, u4_cntrl = 0, b4, b8;
1010 
1011     /* temp var */
1012     WORD32 i4_res_val, i4_run = -1, dcac_block;
1013 
1014     /* When Hadamard transform is disabled, first row values are dont care, ignore them */
1015     pi2_res_mb += i4_res_strd;
1016 
1017     /* When Hadamard transform is disabled, first unit value is dont care, ignore this */
1018     pu1_nnz ++;
1019 
1020     ps_mb_coeff_data_mb = ps_mb_coeff_data_b8 = (*pv_mb_coeff_data);
1021 
1022     /********************************************************/
1023     /*  pack coeff data for entropy coding                  */
1024     /********************************************************/
1025 
1026     for (b4 = 0; b4 < 16; b4++)
1027     {
1028         ps_mb_coeff_data = (*pv_mb_coeff_data);
1029 
1030         b8 = b4 >> 2;
1031 
1032         u4_nnz = pu1_nnz[u1_scan_order[b4]];
1033 
1034         /* Jump according to the scan order */
1035         pi2_res_sb = pi2_res_mb + (i4_res_strd * u1_scan_order[b4]);
1036 
1037         /* write number of non zero coefficients */
1038         ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
1039 
1040         if (u4_nnz)
1041         {
1042             for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
1043             {
1044                 /* number of runs of zero before, this is used to compute coeff cost */
1045                 i4_run++;
1046 
1047                 i4_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
1048 
1049                 if (i4_res_val)
1050                 {
1051                     /* write residue */
1052                     ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i4_res_val;
1053                     u4_s_map |= mask;
1054 
1055                     if (u4_thres_resi)
1056                     {
1057                         /* compute coeff cost */
1058                         if (i4_res_val == 1 || i4_res_val == -1)
1059                         {
1060                             if (i4_run < 6)
1061                                 u4_b8_coeff_cost += pu1_coeff_cost[i4_run];
1062                         }
1063                         else
1064                             u4_b8_coeff_cost += 9;
1065 
1066                         i4_run = -1;
1067                     }
1068                 }
1069 
1070                 mask <<= 1;
1071             }
1072 
1073             /* write significant coeff map */
1074             ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
1075             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
1076 
1077             /* cbp */
1078             *u1_cbp_l |= (1 << b8);
1079 
1080             /* Cntrl map for inverse transform computation
1081              *
1082              * If coeff_cnt is zero, it means that only nonzero was a dc coeff
1083              * Hence we have to set the 16 - u1_scan_order[b4]) position instead
1084              * of 31 - u1_scan_order[b4]
1085              */
1086             dcac_block = (coeff_cnt == 0)?16:31;
1087             u4_cntrl |= (1 << (dcac_block - u1_scan_order[b4]));
1088         }
1089         else
1090         {
1091             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
1092         }
1093 
1094         /* Decide if the 8x8 unit has to be sent for entropy coding? */
1095         if ((b4+1) % 4 == 0)
1096         {
1097             if (u4_thres_resi && (u4_b8_coeff_cost <= LUMA_SUB_BLOCK_SKIP_THRESHOLD) &&
1098                             (*u1_cbp_l & (1 << b8)))
1099             {
1100                 /*
1101                  * When we want to reset the full 8x8 block, we have to reset
1102                  * both the dc and ac coeff bits hence we have the symmetric
1103                  * arrangement of bits
1104                  */
1105                 const UWORD32 cntrl_mask_map[4] = {0xcc00cc00, 0x33003300, 0x00cc00cc, 0x00330033};
1106 
1107                 /* restore cbp */
1108                 *u1_cbp_l = (*u1_cbp_l & (~(1 << b8)));
1109 
1110                 /* correct cntrl flag */
1111                 u4_cntrl = u4_cntrl & (~cntrl_mask_map[(b4 >> 2)]);
1112 
1113                 /* correct nnz */
1114                 pu1_nnz[u1_scan_order[b4 - 3]] = 0;
1115                 pu1_nnz[u1_scan_order[b4 - 2]] = 0;
1116                 pu1_nnz[u1_scan_order[b4 - 1]] = 0;
1117                 pu1_nnz[u1_scan_order[b4]] = 0;
1118 
1119                 /* reset blk cost */
1120                 u4_b8_coeff_cost = 0;
1121             }
1122 
1123             if (!(*u1_cbp_l & (1 << b8)))
1124             {
1125                 (*pv_mb_coeff_data) = ps_mb_coeff_data_b8;
1126             }
1127 
1128             u4_mb_coeff_cost += u4_b8_coeff_cost;
1129 
1130             u4_b8_coeff_cost = 0;
1131             i4_run = -1;
1132             ps_mb_coeff_data_b8 = (*pv_mb_coeff_data);
1133         }
1134     }
1135 
1136     if (u4_thres_resi && (u4_mb_coeff_cost <= LUMA_BLOCK_SKIP_THRESHOLD)
1137                     && (*u1_cbp_l))
1138     {
1139         (*pv_mb_coeff_data) = ps_mb_coeff_data_mb;
1140         *u1_cbp_l = 0;
1141         u4_cntrl = 0;
1142         memset(pu1_nnz, 0, 16);
1143     }
1144 
1145     (*pu4_cntrl) = u4_cntrl;
1146 
1147     return;
1148 }
1149 
1150 /**
1151 ******************************************************************************
1152 *
1153 * @brief  This function packs residue of an i8x8 chroma mb for entropy coding
1154 *
1155 * @par   Description
1156 *  An i8 chroma macro block contains two classes of units, dc 2x2 block and
1157 *  4x4 ac blocks. while packing the mb, the dc block is sent first, and
1158 *  the 4 ac blocks are sent next in scan order. Each and every block is
1159 *  represented by 3 parameters (nnz, significant coefficient map and the
1160 *  residue coefficients itself). If a 4x4 unit does not have any coefficients
1161 *  then only nnz is sent. Inside a 4x4 block the individual coefficients are
1162 *  sent in scan order.
1163 *
1164 *  The first byte of each block will be nnz of the block, if it is non zero,
1165 *  a 2 byte significance map is sent. This is followed by nonzero coefficients.
1166 *  This is repeated for 1 dc + 4 ac blocks.
1167 *
1168 * @param[in]  pi2_res_mb
1169 *  pointer to residue mb
1170 *
1171 * @param[in, out]  pv_mb_coeff_data
1172 *  buffer pointing to packed residue coefficients
1173 *
1174 * @param[in]  u4_res_strd
1175 *  residual block stride
1176 *
1177 * @param[out]  u1_cbp_c
1178 *  coded block pattern chroma
1179 *
1180 * @param[in]   pu1_nnz
1181 *  number of non zero coefficients in each 4x4 unit
1182 *
1183 * @param[out]   pu1_nnz
1184 *  Control signal for inverse transform
1185 *
1186 * @param[in]   u4_swap_uv
1187 *  Swaps the order of U and V planes in entropy bitstream
1188 *
1189 * @return none
1190 *
1191 * @ remarks
1192 *
1193 ******************************************************************************
1194 */
ih264e_pack_c_mb(WORD16 * pi2_res_mb,void ** pv_mb_coeff_data,WORD32 i4_res_strd,UWORD8 * u1_cbp_c,UWORD8 * pu1_nnz,UWORD32 u4_thres_resi,UWORD32 * pu4_cntrl,UWORD32 u4_swap_uv)1195 void ih264e_pack_c_mb(WORD16 *pi2_res_mb,
1196                       void **pv_mb_coeff_data,
1197                       WORD32 i4_res_strd,
1198                       UWORD8 *u1_cbp_c,
1199                       UWORD8 *pu1_nnz,
1200                       UWORD32 u4_thres_resi,
1201                       UWORD32 *pu4_cntrl,
1202                       UWORD32 u4_swap_uv)
1203 {
1204     /* pointer to packed sub block buffer space */
1205     tu_sblk_coeff_data_t *ps_mb_coeff_data = (*pv_mb_coeff_data);
1206     tu_sblk_coeff_data_t *ps_mb_coeff_data_dc, *ps_mb_coeff_data_ac;
1207 
1208     /* nnz pointer */
1209     UWORD8 *pu1_nnz_ac, *pu1_nnz_dc;
1210 
1211     /* nnz counter */
1212     UWORD32 u4_nnz_cnt;
1213 
1214     /* significant coefficient map */
1215     UWORD32 u4_s_map;
1216 
1217     /* pointer to scanning matrix */
1218     const UWORD8 *pu1_scan_order;
1219 
1220     /* no of non zero coefficients in the current sub block */
1221     UWORD32 u4_nnz;
1222 
1223     /* pointer to residual sub block, res val */
1224     WORD16 *pi2_res_sb, i2_res_val;
1225 
1226     /* temp var */
1227     UWORD32 coeff_cnt, mask, b4,plane;
1228 
1229     /* temp var */
1230     UWORD32 u4_coeff_cost;
1231     WORD32 i4_run;
1232 
1233     /* coeff cost */
1234     const UWORD8 *pu1_coeff_cost = gu1_coeff_cost;
1235 
1236     /* pointer to packed buffer space */
1237     UWORD32 *pu4_mb_coeff_data = NULL;
1238 
1239     /* ac coded block pattern */
1240     UWORD8 u1_cbp_ac;
1241 
1242     /* Variable to store the current bit pos in cntrl variable*/
1243     UWORD32 cntrl_pos = 0;
1244 
1245     /********************************************************/
1246     /*  pack dc coeff data for entropy coding               */
1247     /********************************************************/
1248     pu1_scan_order = gu1_chroma_scan_order_dc;
1249     pi2_res_sb = pi2_res_mb;
1250     pu1_nnz_dc = pu1_nnz;
1251     (*pu4_cntrl) = 0;
1252     cntrl_pos = 15;
1253     ps_mb_coeff_data_dc = (*pv_mb_coeff_data);
1254 
1255     /* Color space conversion between SP_UV and SP_VU
1256      * We always assume SP_UV for all the processing
1257      * Hence to get proper stream output we need to swap U and V channels here
1258      *
1259      * For that there are two paths we need to look for
1260      * One is the path to bitstream , these variables should have the proper input
1261      * configured UV or VU
1262      * For the other path the inverse transform variables should have what ever ordering the
1263      * input had
1264      */
1265 
1266     if (u4_swap_uv)
1267     {
1268         pu1_nnz_dc += 5;/* Move to NNZ of V planve */
1269         pi2_res_sb += 4;/* Move to DC coff of V plane */
1270 
1271         cntrl_pos = 14; /* Control bit for V plane */
1272     }
1273 
1274     for (plane = 0; plane < 2; plane++)
1275     {
1276         ps_mb_coeff_data = (*pv_mb_coeff_data);
1277 
1278         u4_nnz = *pu1_nnz_dc;
1279         /* write number of non zero coefficients U/V */
1280         ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
1281 
1282         if (u4_nnz)
1283         {
1284             for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
1285             {
1286                 i2_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
1287                 if (i2_res_val)
1288                 {
1289                     /* write residue U/V */
1290                     ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i2_res_val;
1291                     u4_s_map |= mask;
1292                 }
1293                 mask <<= 1;
1294             }
1295             /* write significant coeff map U/V */
1296             ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
1297             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
1298             *u1_cbp_c = 1;
1299 
1300             (*pu4_cntrl) |= (1 << cntrl_pos);
1301         }
1302         else
1303         {
1304             (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
1305         }
1306 
1307         if (u4_swap_uv)
1308         {
1309             cntrl_pos++; /* Control bit for U plane */
1310             pu1_nnz_dc -= 5; /* Move to NNZ of U plane */
1311             pi2_res_sb -= 4; /* Move to DC coff of U plane */
1312 
1313         }
1314         else
1315         {
1316             cntrl_pos--; /* Control bit for U plane */
1317             pu1_nnz_dc += 5; /* 4 for AC NNZ and 1 for DC */
1318             pi2_res_sb += 4; /* Move to DC coff of V plane */
1319         }
1320     }
1321 
1322     /********************************************************/
1323     /*  pack ac coeff data for entropy coding               */
1324     /********************************************************/
1325 
1326     pu1_scan_order = gu1_chroma_scan_order;
1327     ps_mb_coeff_data_ac = (*pv_mb_coeff_data);
1328 
1329     if (u4_swap_uv)
1330     {
1331         pi2_res_sb = pi2_res_mb + i4_res_strd * 5; /* Move to V plane ,ie 1dc row+ 4 ac row */
1332         cntrl_pos = 27; /* The control bits are to be added for V bloc ie 31-4 th bit */
1333         pu1_nnz_ac = pu1_nnz + 6;/*Move the nnz to V block NNZ 1 dc + 1dc + 4 ac */
1334     }
1335     else
1336     {
1337         pi2_res_sb = pi2_res_mb + i4_res_strd; /* Move to U plane ,ie 1dc row */
1338         cntrl_pos = 31;
1339         pu1_nnz_ac = pu1_nnz + 1; /* Move the nnz to V block NNZ 1 dc */
1340     }
1341 
1342     for (plane = 0; plane < 2; plane++)
1343     {
1344         pu4_mb_coeff_data = (*pv_mb_coeff_data);
1345 
1346         u4_coeff_cost = 0;
1347         i4_run = -1;
1348 
1349         /* get the current cbp, so that it automatically
1350          * gets reverted in case of zero ac values */
1351         u1_cbp_ac = *u1_cbp_c;
1352 
1353         for (b4 = 0; b4 < 4; b4++)
1354         {
1355             ps_mb_coeff_data = (*pv_mb_coeff_data);
1356 
1357             u4_nnz = *pu1_nnz_ac;
1358 
1359             /*
1360              * We are scanning only ac coeffs, but the nnz is for the
1361              * complete 4x4 block. Hence we have to discount the nnz contributed
1362              * by the dc coefficient
1363              */
1364             u4_nnz -= (pi2_res_sb[0]!=0);
1365 
1366             /* write number of non zero coefficients U/V */
1367             ps_mb_coeff_data->i4_sig_map_nnz = u4_nnz;
1368 
1369             if (u4_nnz)
1370             {
1371                 for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u4_nnz; coeff_cnt++)
1372                 {
1373                     i2_res_val = pi2_res_sb[pu1_scan_order[coeff_cnt]];
1374 
1375                     i4_run++;
1376 
1377                     if (i2_res_val)
1378                     {
1379                         /* write residue U/V */
1380                         ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = i2_res_val;
1381                         u4_s_map |= mask;
1382 
1383                         if ( u4_thres_resi && (u4_coeff_cost < CHROMA_BLOCK_SKIP_THRESHOLD) )
1384                         {
1385                             /* compute coeff cost */
1386                             if (i2_res_val == 1 || i2_res_val == -1)
1387                             {
1388                                 if (i4_run < 6)
1389                                     u4_coeff_cost += pu1_coeff_cost[i4_run];
1390                             }
1391                             else
1392                                 u4_coeff_cost += 9;
1393 
1394                             i4_run = -1;
1395                         }
1396                     }
1397                     mask <<= 1;
1398                 }
1399 
1400                 /* write significant coeff map U/V */
1401                 ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
1402                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
1403                 u1_cbp_ac = 2;
1404 
1405                 (*pu4_cntrl) |= 1 << cntrl_pos;
1406             }
1407             else
1408             {
1409                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
1410             }
1411 
1412             pu1_nnz_ac++;
1413             pi2_res_sb += i4_res_strd;
1414             cntrl_pos--;
1415         }
1416 
1417         /* reset block */
1418         if (u4_thres_resi && (u4_coeff_cost < CHROMA_BLOCK_SKIP_THRESHOLD))
1419         {
1420             pu4_mb_coeff_data[0] = 0;
1421             pu4_mb_coeff_data[1] = 0;
1422             pu4_mb_coeff_data[2] = 0;
1423             pu4_mb_coeff_data[3] = 0;
1424             (*pv_mb_coeff_data) = pu4_mb_coeff_data + 4;
1425 
1426             /* Generate the control signal */
1427             /* Zero out the current plane's AC coefficients */
1428             (*pu4_cntrl) &= ((plane == u4_swap_uv) ? 0x0FFFFFFF : 0xF0FFFFFF);
1429 
1430             /* Similarly do for the NNZ also */
1431             *(pu1_nnz_ac - 4) = 0;
1432             *(pu1_nnz_ac - 3) = 0;
1433             *(pu1_nnz_ac - 2) = 0;
1434             *(pu1_nnz_ac - 1) = 0;
1435         }
1436         else
1437         {
1438             *u1_cbp_c = u1_cbp_ac;
1439         }
1440 
1441         if (u4_swap_uv)
1442         {
1443             pi2_res_sb = pi2_res_mb + i4_res_strd; /* Move to V plane ,ie 1dc row+ 4 ac row + 1 dc row */
1444             cntrl_pos = 31; /* The control bits are to be added for V bloc ie 31-4 th bit */
1445             pu1_nnz_ac = pu1_nnz + 1; /* Move the nnz to V block NNZ 1 dc + 1dc + 4 ac */
1446 
1447             pu1_nnz_ac = pu1_nnz + 1;
1448         }
1449         else
1450             pu1_nnz_ac = pu1_nnz + 6; /* Go to nnz of V plane */
1451     }
1452 
1453     /* restore the ptr basing on cbp */
1454     if (*u1_cbp_c == 0)
1455     {
1456         (*pv_mb_coeff_data) = ps_mb_coeff_data_dc;
1457     }
1458     else if (*u1_cbp_c == 1)
1459     {
1460         (*pv_mb_coeff_data) = ps_mb_coeff_data_ac;
1461     }
1462 
1463     return ;
1464 }
1465 
1466 /**
1467 *******************************************************************************
1468 *
1469 * @brief performs luma core coding when intra mode is i16x16
1470 *
1471 * @par Description:
1472 *  If the current mb is to be coded as intra of mb type i16x16, the mb is first
1473 *  predicted using one of i16x16 prediction filters, basing on the intra mode
1474 *  chosen. Then, error is computed between the input blk and the estimated blk.
1475 *  This error is transformed (hierarchical transform i.e., dct followed by hada-
1476 *  -mard), quantized. The quantized coefficients are packed in scan order for
1477 *  entropy coding.
1478 *
1479 * @param[in] ps_proc_ctxt
1480 *  pointer to the current macro block context
1481 *
1482 * @returns u1_cbp_l
1483 *  coded block pattern luma
1484 *
1485 * @remarks none
1486 *
1487 *******************************************************************************
1488 */
ih264e_code_luma_intra_macroblock_16x16(process_ctxt_t * ps_proc)1489 UWORD8 ih264e_code_luma_intra_macroblock_16x16(process_ctxt_t *ps_proc)
1490 {
1491     /* Codec Context */
1492     codec_t *ps_codec = ps_proc->ps_codec;
1493 
1494     /* pointer to ref macro block */
1495     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;
1496 
1497     /* pointer to src macro block */
1498     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
1499 
1500     /* pointer to prediction macro block */
1501     UWORD8 *pu1_pred_mb = NULL;
1502 
1503     /* pointer to residual macro block */
1504     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
1505 
1506     /* strides */
1507     WORD32 i4_src_strd = ps_proc->i4_src_strd;
1508     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
1509     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
1510     WORD32 i4_res_strd = ps_proc->i4_res_strd;
1511 
1512     /* intra mode */
1513     UWORD8 u1_intra_mode = ps_proc->u1_l_i16_mode;
1514 
1515     /* coded block pattern */
1516     UWORD8 u1_cbp_l = 0;
1517 
1518     /* number of non zero coeffs*/
1519     UWORD32 au4_nnz[5];
1520     UWORD8  *pu1_nnz = (UWORD8 *)au4_nnz;
1521 
1522     /*Cntrol signal for itrans*/
1523     UWORD32 u4_cntrl;
1524 
1525     /* quantization parameters */
1526     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
1527 
1528     /* pointer to packed mb coeff data */
1529     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
1530 
1531     /* init nnz */
1532     au4_nnz[0] = 0;
1533     au4_nnz[1] = 0;
1534     au4_nnz[2] = 0;
1535     au4_nnz[3] = 0;
1536     au4_nnz[4] = 0;
1537 
1538     if (u1_intra_mode == PLANE_I16x16)
1539     {
1540         pu1_pred_mb = ps_proc->pu1_pred_mb_intra_16x16_plane;
1541     }
1542     else
1543     {
1544         pu1_pred_mb = ps_proc->pu1_pred_mb_intra_16x16;
1545     }
1546 
1547     /********************************************************/
1548     /*  error estimation,                                   */
1549     /*  transform                                           */
1550     /*  quantization                                        */
1551     /********************************************************/
1552     ih264e_luma_16x16_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
1553                                                pu1_pred_mb, pi2_res_mb,
1554                                                i4_src_strd, i4_pred_strd,
1555                                                i4_res_strd,
1556                                                ps_qp_params->pu2_scale_mat,
1557                                                ps_qp_params->pu2_thres_mat,
1558                                                ps_qp_params->u1_qbits,
1559                                                ps_qp_params->u4_dead_zone,
1560                                                pu1_nnz, ENABLE_DC_TRANSFORM);
1561 
1562     /********************************************************/
1563     /*  pack coeff data for entropy coding                  */
1564     /********************************************************/
1565     ih264e_pack_l_mb_i16(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_l,
1566                          pu1_nnz, &u4_cntrl);
1567 
1568     /********************************************************/
1569     /*  ierror estimation,                                  */
1570     /*  itransform                                          */
1571     /*  iquantization                                       */
1572     /********************************************************/
1573     /*
1574      *if refernce frame is not to be computed
1575      *we only need the right and bottom border 4x4 blocks to predict next intra
1576      *blocks, hence only compute them
1577      */
1578     if (!ps_proc->u4_compute_recon)
1579     {
1580         u4_cntrl &= 0x111F8000;
1581     }
1582 
1583     if (u4_cntrl)
1584     {
1585         ih264e_luma_16x16_idctrans_iquant_itrans_recon(
1586                         ps_codec, pi2_res_mb, pu1_pred_mb, pu1_ref_mb,
1587                         i4_res_strd, i4_pred_strd, i4_rec_strd,
1588                         ps_qp_params->pu2_iscale_mat,
1589                         ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div,
1590                         u4_cntrl, ENABLE_DC_TRANSFORM,
1591                         ps_proc->pv_scratch_buff);
1592     }
1593     else
1594     {
1595         ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_ref_mb, i4_pred_strd,
1596                                           i4_rec_strd, MB_SIZE, MB_SIZE, NULL,
1597                                           0);
1598     }
1599 
1600     return (u1_cbp_l);
1601 }
1602 
1603 
1604 /**
1605 *******************************************************************************
1606 *
1607 * @brief performs luma core coding when intra mode is i4x4
1608 *
1609 * @par Description:
1610 *  If the current mb is to be coded as intra of mb type i4x4, the mb is first
1611 *  predicted using one of i4x4 prediction filters, basing on the intra mode
1612 *  chosen. Then, error is computed between the input blk and the estimated blk.
1613 *  This error is dct transformed and quantized. The quantized coefficients are
1614 *  packed in scan order for entropy coding.
1615 *
1616 * @param[in] ps_proc_ctxt
1617 *  pointer to the current macro block context
1618 *
1619 * @returns u1_cbp_l
1620 *  coded block pattern luma
1621 *
1622 * @remarks
1623 *  The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan order
1624 *  mentioned in h.264 specification
1625 *
1626 *******************************************************************************
1627 */
ih264e_code_luma_intra_macroblock_4x4(process_ctxt_t * ps_proc)1628 UWORD8 ih264e_code_luma_intra_macroblock_4x4(process_ctxt_t *ps_proc)
1629 {
1630     /* Codec Context */
1631     codec_t *ps_codec = ps_proc->ps_codec;
1632 
1633     /* pointer to ref macro block */
1634     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;
1635 
1636     /* pointer to src macro block */
1637     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
1638 
1639     /* pointer to prediction macro block */
1640     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
1641 
1642     /* pointer to residual macro block */
1643     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
1644 
1645     /* strides */
1646     WORD32 i4_src_strd = ps_proc->i4_src_strd;
1647     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
1648     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
1649 
1650     /* pointer to neighbors: left, top, top-left */
1651     UWORD8 *pu1_mb_a;
1652     UWORD8 *pu1_mb_b;
1653     UWORD8 *pu1_mb_c;
1654     UWORD8 *pu1_mb_d;
1655 
1656     /* intra mode */
1657     UWORD8 u1_intra_mode = ps_proc->u1_l_i16_mode;
1658 
1659     /* neighbor availability */
1660     WORD32 i4_ngbr_avbl;
1661 
1662     /* neighbor pels for intra prediction */
1663     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
1664 
1665     /* coded block pattern */
1666     UWORD8 u1_cbp_l = 0;
1667 
1668     /* number of non zero coeffs*/
1669     UWORD8  u1_nnz;
1670 
1671     /* quantization parameters */
1672     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
1673 
1674     /* pointer to packed mb coeff data */
1675     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
1676 
1677     /* pointer to packed mb coeff data */
1678     tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8;
1679 
1680     /* no of non zero coefficients in the current sub block */
1681     UWORD32 u4_nnz_cnt;
1682 
1683     /* significant coefficient map */
1684     UWORD32 u4_s_map;
1685 
1686     /* pointer to scanning matrix */
1687     const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
1688 
1689     /*Dummy variable for 4x4 trans fucntion*/
1690     WORD16 i2_dc_dummy;
1691 
1692     /* temp var */
1693     UWORD32 i, b8, b4, u1_blk_x, u1_blk_y, u1_pix_x, u1_pix_y, coeff_cnt, mask;
1694 
1695     /* Process 16 4x4 lum sub-blocks of the MB in scan order */
1696     for (b8 = 0; b8 < 4; b8++)
1697     {
1698         u1_blk_x = GET_BLK_RASTER_POS_X(b8) << 3;
1699         u1_blk_y = GET_BLK_RASTER_POS_Y(b8) << 3;
1700 
1701         /* if in case cbp for the 8x8 block is zero, send no residue */
1702         ps_mb_coeff_data_b8 = *pv_mb_coeff_data;
1703 
1704         for (b4 = 0; b4 < 4; b4++)
1705         {
1706             /* index of pel in MB */
1707             u1_pix_x = u1_blk_x + (GET_SUB_BLK_RASTER_POS_X(b4) << 2);
1708             u1_pix_y = u1_blk_y + (GET_SUB_BLK_RASTER_POS_Y(b4) << 2);
1709 
1710             /* Initialize source and reference pointers */
1711             pu1_curr_mb = ps_proc->pu1_src_buf_luma + u1_pix_x + (u1_pix_y * i4_src_strd);
1712             pu1_ref_mb = ps_proc->pu1_rec_buf_luma + u1_pix_x + (u1_pix_y * i4_rec_strd);
1713 
1714             /* pointer to left of ref macro block */
1715             pu1_mb_a = pu1_ref_mb - 1;
1716             /* pointer to top of ref macro block */
1717             pu1_mb_b = pu1_ref_mb - i4_rec_strd;
1718             /* pointer to topright of ref macro block */
1719             pu1_mb_c = pu1_mb_b + 4;
1720             /* pointer to topleft macro block */
1721             pu1_mb_d = pu1_mb_b - 1;
1722 
1723             /* compute neighbor availability */
1724             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
1725 
1726             /* sub block intra mode */
1727             u1_intra_mode = ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4];
1728 
1729             /********************************************************/
1730             /* gather prediction pels from neighbors for prediction */
1731             /********************************************************/
1732             /* left pels */
1733             if (i4_ngbr_avbl & LEFT_MB_AVAILABLE_MASK)
1734             {
1735                 for (i = 0; i < 4; i++)
1736                     pu1_ngbr_pels_i4[4 - 1 - i] = pu1_mb_a[i * i4_rec_strd];
1737             }
1738             else
1739             {
1740                 memset(pu1_ngbr_pels_i4, 0, 4);
1741             }
1742 
1743             /* top pels */
1744             if (i4_ngbr_avbl & TOP_MB_AVAILABLE_MASK)
1745             {
1746                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
1747             }
1748             else
1749             {
1750                 memset(pu1_ngbr_pels_i4 + 5, 0, 4);
1751             }
1752             /* top left pels */
1753             if (i4_ngbr_avbl & TOP_LEFT_MB_AVAILABLE_MASK)
1754             {
1755                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
1756             }
1757             else
1758             {
1759                 pu1_ngbr_pels_i4[4] = 0;
1760             }
1761             /* top right pels */
1762             if (i4_ngbr_avbl & TOP_RIGHT_MB_AVAILABLE_MASK)
1763             {
1764                 memcpy(pu1_ngbr_pels_i4+8+1,pu1_mb_c,4);
1765             }
1766             else if (i4_ngbr_avbl & TOP_MB_AVAILABLE_MASK)
1767             {
1768                 memset(pu1_ngbr_pels_i4+8+1,pu1_ngbr_pels_i4[8],4);
1769             }
1770 
1771             /********************************************************/
1772             /*  prediction                                          */
1773             /********************************************************/
1774             (ps_codec->apf_intra_pred_4_l)[u1_intra_mode](pu1_ngbr_pels_i4,
1775                                                           pu1_pred_mb, 0,
1776                                                           i4_pred_strd,
1777                                                           i4_ngbr_avbl);
1778 
1779             /********************************************************/
1780             /*  error estimation,                                   */
1781             /*  transform                                           */
1782             /*  quantization                                        */
1783             /********************************************************/
1784             ps_codec->pf_resi_trans_quant_4x4(pu1_curr_mb, pu1_pred_mb,
1785                                               pi2_res_mb, i4_src_strd,
1786                                               i4_pred_strd,
1787                                               ps_qp_params->pu2_scale_mat,
1788                                               ps_qp_params->pu2_thres_mat,
1789                                               ps_qp_params->u1_qbits,
1790                                               ps_qp_params->u4_dead_zone,
1791                                               &u1_nnz, &i2_dc_dummy);
1792 
1793             /********************************************************/
1794             /*  pack coeff data for entropy coding                  */
1795             /********************************************************/
1796             ps_mb_coeff_data = *pv_mb_coeff_data;
1797 
1798             /* write number of non zero coefficients */
1799             ps_mb_coeff_data->i4_sig_map_nnz = u1_nnz;
1800 
1801             if (u1_nnz)
1802             {
1803                 for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < u1_nnz; coeff_cnt++)
1804                 {
1805                     if (pi2_res_mb[pu1_scan_order[coeff_cnt]])
1806                     {
1807                         /* write residue */
1808                         ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb[pu1_scan_order[coeff_cnt]];
1809                         u4_s_map |= mask;
1810                     }
1811                     mask <<= 1;
1812                 }
1813                 /* write significant coeff map */
1814                 ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
1815 
1816                 /* update ptr to coeff data */
1817                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
1818 
1819                 /* cbp */
1820                 u1_cbp_l |= (1 << b8);
1821             }
1822             else
1823             {
1824                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
1825             }
1826 
1827             /********************************************************/
1828             /*  ierror estimation,                                  */
1829             /*  itransform                                          */
1830             /*  iquantization                                       */
1831             /********************************************************/
1832             if (u1_nnz)
1833                 ps_codec->pf_iquant_itrans_recon_4x4(
1834                                 pi2_res_mb, pu1_pred_mb, pu1_ref_mb,
1835                                 /*No input stride,*/i4_pred_strd,
1836                                 i4_rec_strd, ps_qp_params->pu2_iscale_mat,
1837                                 ps_qp_params->pu2_weigh_mat,
1838                                 ps_qp_params->u1_qp_div,
1839                                 ps_proc->pv_scratch_buff, 0, 0);
1840             else
1841                 ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_ref_mb,
1842                                                   i4_pred_strd, i4_rec_strd,
1843                                                   BLK_SIZE, BLK_SIZE, NULL,
1844                                                   0);
1845 
1846         }
1847 
1848         /* if the 8x8 block has no residue, nothing needs to be sent to entropy */
1849         if (!(u1_cbp_l & (1 << b8)))
1850         {
1851             *pv_mb_coeff_data = ps_mb_coeff_data_b8;
1852         }
1853     }
1854 
1855     return (u1_cbp_l);
1856 }
1857 
1858 /**
1859 *******************************************************************************
1860 *
1861 * @brief performs luma core coding when intra mode is i4x4
1862 *
1863 * @par Description:
1864 *  If the current mb is to be coded as intra of mb type i4x4, the mb is first
1865 *  predicted using one of i4x4 prediction filters, basing on the intra mode
1866 *  chosen. Then, error is computed between the input blk and the estimated blk.
1867 *  This error is dct transformed and quantized. The quantized coefficients are
1868 *  packed in scan order for entropy coding.
1869 *
1870 * @param[in] ps_proc_ctxt
1871 *  pointer to the current macro block context
1872 *
1873 * @returns u1_cbp_l
1874 *  coded block pattern luma
1875 *
1876 * @remarks
1877 *  The traversal of 4x4 subblocks in the 16x16 macroblock is as per the scan order
1878 *  mentioned in h.264 specification
1879 *
1880 *******************************************************************************
1881 */
ih264e_code_luma_intra_macroblock_4x4_rdopt_on(process_ctxt_t * ps_proc)1882 UWORD8 ih264e_code_luma_intra_macroblock_4x4_rdopt_on(process_ctxt_t *ps_proc)
1883 {
1884     /* Codec Context */
1885     codec_t *ps_codec = ps_proc->ps_codec;
1886 
1887     /* pointer to ref macro block */
1888     UWORD8 *pu1_ref_mb_intra_4x4 = ps_proc->pu1_ref_mb_intra_4x4;
1889 
1890     /* pointer to recon buffer */
1891     UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_luma;
1892 
1893     /* pointer to residual macro block */
1894     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
1895 
1896     /* strides */
1897     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
1898 
1899     /* number of non zero coeffs*/
1900     UWORD8  *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz_intra_4x4;
1901 
1902     /* coded block pattern */
1903     UWORD8 u1_cbp_l = 0;
1904 
1905     /* pointer to packed mb coeff data */
1906     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
1907 
1908     /* pointer to packed mb coeff data */
1909     tu_sblk_coeff_data_t *ps_mb_coeff_data, *ps_mb_coeff_data_b8;
1910 
1911     /* no of non zero coefficients in the current sub block */
1912     UWORD32 u4_nnz_cnt;
1913 
1914     /* significant coefficient map */
1915     UWORD32 u4_s_map;
1916 
1917     /* pointer to scanning matrix */
1918     const UWORD8 *pu1_scan_order = gu1_luma_scan_order;
1919 
1920     /* temp var */
1921     UWORD32 b8, b4, coeff_cnt, mask;
1922 
1923     /* Process 16 4x4 lum sub-blocks of the MB in scan order */
1924     for (b8 = 0; b8 < 4; b8++)
1925     {
1926         /* if in case cbp for the 8x8 block is zero, send no residue */
1927         ps_mb_coeff_data_b8 = *pv_mb_coeff_data;
1928 
1929         for (b4 = 0; b4 < 4; b4++, pu1_nnz++, pi2_res_mb += MB_SIZE)
1930         {
1931             /********************************************************/
1932             /*  pack coeff data for entropy coding                  */
1933             /********************************************************/
1934             ps_mb_coeff_data = *pv_mb_coeff_data;
1935 
1936             /* write number of non zero coefficients */
1937             ps_mb_coeff_data->i4_sig_map_nnz = *pu1_nnz;
1938 
1939             if (*pu1_nnz)
1940             {
1941                 for (u4_nnz_cnt = 0, coeff_cnt = 0, mask = 1, u4_s_map = 0; u4_nnz_cnt < *pu1_nnz; coeff_cnt++)
1942                 {
1943                     if (pi2_res_mb[pu1_scan_order[coeff_cnt]])
1944                     {
1945                         /* write residue */
1946                         ps_mb_coeff_data->ai2_residue[u4_nnz_cnt++] = pi2_res_mb[pu1_scan_order[coeff_cnt]];
1947                         u4_s_map |= mask;
1948                     }
1949                     mask <<= 1;
1950                 }
1951                 /* write significant coeff map */
1952                 ps_mb_coeff_data->i4_sig_map_nnz |= (u4_s_map << 16);
1953 
1954                 /* update ptr to coeff data */
1955                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue + ALIGN2(u4_nnz_cnt);
1956 
1957                 /* cbp */
1958                 u1_cbp_l |= (1 << b8);
1959             }
1960             else
1961             {
1962                 (*pv_mb_coeff_data) = ps_mb_coeff_data->ai2_residue;
1963             }
1964         }
1965 
1966         /* if the 8x8 block has no residue, nothing needs to be sent to entropy */
1967         if (!(u1_cbp_l & (1 << b8)))
1968         {
1969             *pv_mb_coeff_data = ps_mb_coeff_data_b8;
1970         }
1971     }
1972 
1973     /* memcpy recon */
1974     ps_codec->pf_inter_pred_luma_copy(pu1_ref_mb_intra_4x4, pu1_rec_mb, MB_SIZE, i4_rec_strd, MB_SIZE, MB_SIZE, NULL, 0);
1975 
1976     return (u1_cbp_l);
1977 }
1978 
1979 
1980 /**
1981 *******************************************************************************
1982 *
1983 * @brief performs chroma core coding for intra macro blocks
1984 *
1985 * @par Description:
1986 *  If the current MB is to be intra coded with mb type chroma I8x8, the MB is
1987 *  first predicted using intra 8x8 prediction filters. The predicted data is
1988 *  compared with the input for error and the error is transformed. The DC
1989 *  coefficients of each transformed sub blocks are further transformed using
1990 *  Hadamard transform. The resulting coefficients are quantized, packed and sent
1991 *  for entropy coding.
1992 *
1993 * @param[in] ps_proc_ctxt
1994 *  pointer to the current macro block context
1995 *
1996 * @returns u1_cbp_c
1997 *  coded block pattern chroma
1998 *
1999 * @remarks
2000 *  The traversal of 4x4 subblocks in the 8x8 macroblock is as per the scan order
2001 *  mentioned in h.264 specification
2002 *
2003 *******************************************************************************
2004 */
ih264e_code_chroma_intra_macroblock_8x8(process_ctxt_t * ps_proc)2005 UWORD8 ih264e_code_chroma_intra_macroblock_8x8(process_ctxt_t *ps_proc)
2006 {
2007     /* Codec Context */
2008     codec_t *ps_codec = ps_proc->ps_codec;
2009 
2010     /* pointer to ref macro block */
2011     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_chroma;
2012 
2013     /* pointer to src macro block */
2014     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
2015 
2016     /* pointer to prediction macro block */
2017     UWORD8 *pu1_pred_mb = NULL;
2018 
2019     /* pointer to residual macro block */
2020     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
2021 
2022     /* strides */
2023     WORD32 i4_src_strd = ps_proc->i4_src_chroma_strd;
2024     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
2025     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
2026     WORD32 i4_res_strd = ps_proc->i4_res_strd;
2027 
2028     /* intra mode */
2029     UWORD8 u1_intra_mode = ps_proc->u1_c_i8_mode;
2030 
2031     /* coded block pattern */
2032     UWORD8 u1_cbp_c = 0;
2033 
2034     /* number of non zero coeffs*/
2035     UWORD8 au1_nnz[18] = {0};
2036 
2037     /* quantization parameters */
2038     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[1];
2039 
2040     /* Control signal for inverse transform */
2041     UWORD32 u4_cntrl;
2042 
2043     /* pointer to packed mb coeff data */
2044     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
2045 
2046     /* See if we need to swap U and V plances for entropy */
2047     UWORD32 u4_swap_uv = ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420SP_VU;
2048 
2049     if (PLANE_CH_I8x8 == u1_intra_mode)
2050     {
2051         pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma_plane;
2052     }
2053     else
2054     {
2055         pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma;
2056     }
2057 
2058     /********************************************************/
2059     /*  error estimation,                                   */
2060     /*  transform                                           */
2061     /*  quantization                                        */
2062     /********************************************************/
2063     ih264e_chroma_8x8_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
2064                                                pu1_pred_mb, pi2_res_mb,
2065                                                i4_src_strd, i4_pred_strd,
2066                                                i4_res_strd,
2067                                                ps_qp_params->pu2_scale_mat,
2068                                                ps_qp_params->pu2_thres_mat,
2069                                                ps_qp_params->u1_qbits,
2070                                                ps_qp_params->u4_dead_zone,
2071                                                au1_nnz);
2072 
2073     /********************************************************/
2074     /*  pack coeff data for entropy coding                  */
2075     /********************************************************/
2076     ih264e_pack_c_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_c,
2077                      au1_nnz, ps_codec->u4_thres_resi, &u4_cntrl, u4_swap_uv);
2078 
2079     /********************************************************/
2080     /*  ierror estimation,                                  */
2081     /*  itransform                                          */
2082     /*  iquantization                                       */
2083     /********************************************************/
2084     ih264e_chroma_8x8_idctrans_iquant_itrans_recon(ps_codec, pi2_res_mb,
2085                                                    pu1_pred_mb, pu1_ref_mb,
2086                                                    i4_res_strd, i4_pred_strd,
2087                                                    i4_rec_strd,
2088                                                    ps_qp_params->pu2_iscale_mat,
2089                                                    ps_qp_params->pu2_weigh_mat,
2090                                                    ps_qp_params->u1_qp_div,
2091                                                    u4_cntrl,
2092                                                    ps_proc->pv_scratch_buff);
2093     return (u1_cbp_c);
2094 }
2095 
2096 
2097 /**
2098 *******************************************************************************
2099 *
2100 * @brief performs luma core coding when mode is inter
2101 *
2102 * @par Description:
2103 *  If the current mb is to be coded as inter the mb is predicted based on the
2104 *  sub mb partitions and corresponding motion vectors generated by ME. Then,
2105 *  error is computed between the input blk and the estimated blk. This error is
2106 *  transformed, quantized. The quantized coefficients are packed in scan order
2107 *  for entropy coding
2108 *
2109 * @param[in] ps_proc_ctxt
2110 *  pointer to the current macro block context
2111 *
2112 * @returns coded block pattern luma
2113 *
2114 * @remarks none
2115 *
2116 *******************************************************************************
2117 */
ih264e_code_luma_inter_macroblock_16x16(process_ctxt_t * ps_proc)2118 UWORD8 ih264e_code_luma_inter_macroblock_16x16(process_ctxt_t *ps_proc)
2119 {
2120     /* Codec Context */
2121     codec_t *ps_codec = ps_proc->ps_codec;
2122 
2123     /* pointer to ref macro block */
2124     UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_luma;
2125 
2126     /* pointer to src macro block */
2127     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
2128 
2129     /* pointer to prediction macro block */
2130     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
2131 
2132     /* pointer to residual macro block */
2133     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
2134 
2135     /* strides */
2136     WORD32 i4_src_strd = ps_proc->i4_src_strd;
2137     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
2138     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
2139     WORD32 i4_res_strd = ps_proc->i4_res_strd;
2140 
2141     /* coded block pattern */
2142     UWORD8 u1_cbp_l = 0;
2143 
2144     /*Control signal of itrans*/
2145     UWORD32 u4_cntrl;
2146 
2147     /* number of non zero coeffs*/
2148     UWORD8  *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz;
2149 
2150     /* quantization parameters */
2151     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
2152 
2153     /* pointer to packed mb coeff data */
2154     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
2155 
2156     /* pseudo pred buffer */
2157     UWORD8 *pu1_pseudo_pred = pu1_pred_mb;
2158 
2159     /* pseudo pred buffer stride */
2160     WORD32 i4_pseudo_pred_strd = i4_pred_strd;
2161 
2162     /* init nnz */
2163     ps_proc->au4_nnz[0] = 0;
2164     ps_proc->au4_nnz[1] = 0;
2165     ps_proc->au4_nnz[2] = 0;
2166     ps_proc->au4_nnz[3] = 0;
2167     ps_proc->au4_nnz[4] = 0;
2168 
2169     /********************************************************/
2170     /*  prediction                                          */
2171     /********************************************************/
2172     ih264e_motion_comp_luma(ps_proc, &pu1_pseudo_pred, &i4_pseudo_pred_strd);
2173 
2174     /********************************************************/
2175     /*  error estimation,                                   */
2176     /*  transform                                           */
2177     /*  quantization                                        */
2178     /********************************************************/
2179     if (ps_proc->u4_min_sad_reached == 0 || ps_proc->u4_min_sad != 0)
2180     {
2181         ih264e_luma_16x16_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
2182                                                    pu1_pseudo_pred, pi2_res_mb,
2183                                                    i4_src_strd,
2184                                                    i4_pseudo_pred_strd,
2185                                                    i4_res_strd,
2186                                                    ps_qp_params->pu2_scale_mat,
2187                                                    ps_qp_params->pu2_thres_mat,
2188                                                    ps_qp_params->u1_qbits,
2189                                                    ps_qp_params->u4_dead_zone,
2190                                                    pu1_nnz,
2191                                                    DISABLE_DC_TRANSFORM);
2192 
2193         /********************************************************/
2194         /*  pack coeff data for entropy coding                  */
2195         /********************************************************/
2196         ih264e_pack_l_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_l,
2197                          pu1_nnz, ps_codec->u4_thres_resi, &u4_cntrl);
2198     }
2199     else
2200     {
2201         u1_cbp_l = 0;
2202         u4_cntrl = 0;
2203     }
2204 
2205     /********************************************************/
2206     /*  ierror estimation,                                  */
2207     /*  itransform                                          */
2208     /*  iquantization                                       */
2209     /********************************************************/
2210 
2211     /*If the frame is not to be used for P frame reference or dumping recon
2212      * we only will use the reocn for only predicting intra Mbs
2213      * THis will need only right and bottom edge 4x4 blocks recon
2214      * Hence we selectively enable them using control signal(including DC)
2215      */
2216     if (ps_proc->u4_compute_recon != 1)
2217     {
2218         u4_cntrl &= 0x111F0000;
2219     }
2220 
2221     if (u4_cntrl)
2222     {
2223         ih264e_luma_16x16_idctrans_iquant_itrans_recon(
2224                         ps_codec, pi2_res_mb, pu1_pseudo_pred, pu1_rec_mb,
2225                         i4_res_strd, i4_pseudo_pred_strd, i4_rec_strd,
2226                         ps_qp_params->pu2_iscale_mat,
2227                         ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div,
2228                         u4_cntrl /*Cntrl*/, DISABLE_DC_TRANSFORM,
2229                         ps_proc->pv_scratch_buff);
2230     }
2231     else
2232     {
2233         ps_codec->pf_inter_pred_luma_copy(pu1_pseudo_pred, pu1_rec_mb,
2234                                           i4_pseudo_pred_strd, i4_rec_strd,
2235                                           MB_SIZE, MB_SIZE, NULL, 0);
2236     }
2237 
2238 
2239     return (u1_cbp_l);
2240 }
2241 
2242 /**
2243 *******************************************************************************
2244 *
2245 * @brief performs chroma core coding for inter macro blocks
2246 *
2247 * @par Description:
2248 *  If the current mb is to be coded as inter predicted mb, based on the sub mb
2249 *  partitions and corresponding motion vectors generated by ME, prediction is done.
2250 *  Then, error is computed between the input blk and the estimated blk.
2251 *  This error is transformed, quantized. The quantized coefficients
2252 *  are packed in scan order for entropy coding.
2253 *
2254 * @param[in] ps_proc_ctxt
2255 *  pointer to the current macro block context
2256 *
2257 * @returns coded block pattern chroma
2258 *
2259 * @remarks none
2260 *
2261 *******************************************************************************
2262 */
ih264e_code_chroma_inter_macroblock_8x8(process_ctxt_t * ps_proc)2263 UWORD8 ih264e_code_chroma_inter_macroblock_8x8(process_ctxt_t *ps_proc)
2264 {
2265     /* Codec Context */
2266     codec_t *ps_codec = ps_proc->ps_codec;
2267 
2268     /* pointer to ref macro block */
2269     UWORD8 *pu1_rec_mb = ps_proc->pu1_rec_buf_chroma;
2270 
2271     /* pointer to src macro block */
2272     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
2273 
2274     /* pointer to prediction macro block */
2275     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
2276 
2277     /* pointer to residual macro block */
2278     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf;
2279 
2280     /* strides */
2281     WORD32 i4_src_strd = ps_proc->i4_src_chroma_strd;
2282     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
2283     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
2284     WORD32 i4_res_strd = ps_proc->i4_res_strd;
2285 
2286     /* coded block pattern */
2287     UWORD8 u1_cbp_c = 0;
2288 
2289     /*Control signal for inverse transform*/
2290     UWORD32 u4_cntrl;
2291 
2292     /* number of non zero coeffs*/
2293     UWORD8 au1_nnz[10] = {0};
2294 
2295     /* quantization parameters */
2296     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[1];
2297 
2298     /* pointer to packed mb coeff data */
2299     void **pv_mb_coeff_data = &(ps_proc->pv_mb_coeff_data);
2300 
2301     /*See if we need to swap U and V plances for entropy*/
2302     UWORD32 u4_swap_uv = ps_codec->s_cfg.e_inp_color_fmt == IV_YUV_420SP_VU;
2303 
2304     /********************************************************/
2305     /*  prediction                                          */
2306     /********************************************************/
2307     ih264e_motion_comp_chroma(ps_proc);
2308 
2309     /********************************************************/
2310     /*  error estimation,                                   */
2311     /*  transform                                           */
2312     /*  quantization                                        */
2313     /********************************************************/
2314     ih264e_chroma_8x8_resi_trans_dctrans_quant(ps_codec, pu1_curr_mb,
2315                                                pu1_pred_mb, pi2_res_mb,
2316                                                i4_src_strd, i4_pred_strd,
2317                                                i4_res_strd,
2318                                                ps_qp_params->pu2_scale_mat,
2319                                                ps_qp_params->pu2_thres_mat,
2320                                                ps_qp_params->u1_qbits,
2321                                                ps_qp_params->u4_dead_zone,
2322                                                au1_nnz);
2323 
2324     /********************************************************/
2325     /*  pack coeff data for entropy coding                  */
2326     /********************************************************/
2327     ih264e_pack_c_mb(pi2_res_mb, pv_mb_coeff_data, i4_res_strd, &u1_cbp_c,
2328                      au1_nnz, ps_codec->u4_thres_resi, &u4_cntrl, u4_swap_uv);
2329 
2330     /********************************************************/
2331     /*  ierror estimation,                                  */
2332     /*  itransform                                          */
2333     /*  iquantization                                       */
2334     /********************************************************/
2335 
2336     /* If the frame is not to be used for P frame reference or dumping recon
2337      * we only will use the reocn for only predicting intra Mbs
2338      * THis will need only right and bottom edge 4x4 blocks recon
2339      * Hence we selectively enable them using control signal(including DC)
2340      */
2341     if (!ps_proc->u4_compute_recon)
2342     {
2343         u4_cntrl &= 0x7700C000;
2344     }
2345 
2346     if (u4_cntrl)
2347     {
2348         ih264e_chroma_8x8_idctrans_iquant_itrans_recon(
2349                         ps_codec, pi2_res_mb, pu1_pred_mb, pu1_rec_mb,
2350                         i4_res_strd, i4_pred_strd, i4_rec_strd,
2351                         ps_qp_params->pu2_iscale_mat,
2352                         ps_qp_params->pu2_weigh_mat, ps_qp_params->u1_qp_div,
2353                         u4_cntrl, ps_proc->pv_scratch_buff);
2354     }
2355     else
2356     {
2357         ps_codec->pf_inter_pred_luma_copy(pu1_pred_mb, pu1_rec_mb, i4_pred_strd,
2358                                           i4_rec_strd, MB_SIZE >> 1, MB_SIZE,
2359                                           NULL, 0);
2360     }
2361 
2362     return (u1_cbp_c);
2363 }
2364