xref: /aosp_15_r20/external/libavc/encoder/ih264e_intra_modes_eval.c (revision 495ae853bb871d1e5a258cb02c2cc13cde8ddb9a)
1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /**
22 *******************************************************************************
23 * @file
24 *  ih264e_intra_modes_eval.c
25 *
26 * @brief
27 *  This file contains definitions of routines that perform rate distortion
28 *  analysis on a macroblock if they are to be coded as intra.
29 *
30 * @author
31 *  ittiam
32 *
33 * @par List of Functions:
34 *  - ih264e_derive_nghbr_avbl_of_mbs
35 *  - ih264e_derive_ngbr_avbl_of_mb_partitions
36 *  - ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff
37 *  - ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff
38 *  - ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff
39 *  - ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton
40 *  - ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff
41 *  - ih264e_evaluate_intra16x16_modes
42 *  - ih264e_evaluate_intra4x4_modes
43 *  - ih264e_evaluate_intra_chroma_modes
44 *
45 * @remarks
46 *  none
47 *
48 *******************************************************************************
49 */
50 
51 /*****************************************************************************/
52 /* File Includes                                                             */
53 /*****************************************************************************/
54 
55 /* System Include Files */
56 #include <stdio.h>
57 #include <string.h>
58 #include <limits.h>
59 #include <assert.h>
60 
61 /* User Include Files */
62 #include "ih264e_config.h"
63 #include "ih264_typedefs.h"
64 #include "iv2.h"
65 #include "ive2.h"
66 
67 #include "ih264_debug.h"
68 #include "ih264_macros.h"
69 #include "ih264_defs.h"
70 #include "ih264_mem_fns.h"
71 #include "ih264_padding.h"
72 #include "ih264_structs.h"
73 #include "ih264_trans_quant_itrans_iquant.h"
74 #include "ih264_inter_pred_filters.h"
75 #include "ih264_intra_pred_filters.h"
76 #include "ih264_deblk_edge_filters.h"
77 #include "ih264_common_tables.h"
78 #include "ih264_cabac_tables.h"
79 
80 #include "ime_defs.h"
81 #include "ime_distortion_metrics.h"
82 #include "ime_structs.h"
83 #include "ime_platform_macros.h"
84 
85 #include "irc_cntrl_param.h"
86 #include "irc_frame_info_collector.h"
87 
88 #include "ih264e_error.h"
89 #include "ih264e_defs.h"
90 #include "ih264e_globals.h"
91 #include "ih264e_rate_control.h"
92 #include "ih264e_bitstream.h"
93 #include "ih264e_cabac_structs.h"
94 #include "ih264e_structs.h"
95 #include "ih264e_intra_modes_eval.h"
96 
97 
98 /*****************************************************************************/
99 /* Function Definitions                                                      */
100 /*****************************************************************************/
101 
102 /**
103 ******************************************************************************
104 *
105 * @brief
106 *  derivation process for macroblock availability
107 *
108 * @par   Description
109 *  Calculates the availability of the left, top, topright and topleft macroblocks.
110 *
111 * @param[in] ps_proc_ctxt
112 *  pointer to proc context (handle)
113 *
114 * @remarks Based on section 6.4.5 in H264 spec
115 *
116 * @return  none
117 *
118 ******************************************************************************
119 */
ih264e_derive_nghbr_avbl_of_mbs(process_ctxt_t * ps_proc)120 void ih264e_derive_nghbr_avbl_of_mbs(process_ctxt_t *ps_proc)
121 {
122     UWORD8 *pu1_slice_idx_curr = ps_proc->pu1_slice_idx;
123     UWORD8 *pu1_slice_idx_b;
124     UWORD8 *pu1_slice_idx_a;
125     UWORD8 *pu1_slice_idx_c;
126     UWORD8 *pu1_slice_idx_d;
127     block_neighbors_t *ps_ngbr_avbl;
128     WORD32 i4_mb_x, i4_mb_y;
129     WORD32 i4_wd_mbs;
130 
131     i4_mb_x = ps_proc->i4_mb_x;
132     i4_mb_y = ps_proc->i4_mb_y;
133 
134     i4_wd_mbs = ps_proc->i4_wd_mbs;
135 
136     pu1_slice_idx_curr += (i4_mb_y * i4_wd_mbs) + i4_mb_x;
137     pu1_slice_idx_a = pu1_slice_idx_curr - 1;
138     pu1_slice_idx_b = pu1_slice_idx_curr - i4_wd_mbs;
139     pu1_slice_idx_c = pu1_slice_idx_b + 1;
140     pu1_slice_idx_d = pu1_slice_idx_b - 1;
141     ps_ngbr_avbl = ps_proc->ps_ngbr_avbl;
142 
143     /**********************************************************************/
144     /* The macroblock is marked as available, unless one of the following */
145     /* conditions is true in which case the macroblock shall be marked as */
146     /* not available.                                                     */
147     /* 1. mbAddr < 0                                                      */
148     /* 2  mbAddr > CurrMbAddr                                             */
149     /* 3. the macroblock with address mbAddr belongs to a different slice */
150     /* than the macroblock with address CurrMbAddr                        */
151     /**********************************************************************/
152 
153     /* left macroblock availability */
154     if (i4_mb_x == 0)
155     { /* macroblocks along first column */
156         ps_ngbr_avbl->u1_mb_a = 0;
157     }
158     else
159     { /* macroblocks belong to same slice? */
160         if (*pu1_slice_idx_a != *pu1_slice_idx_curr)
161             ps_ngbr_avbl->u1_mb_a = 0;
162         else
163             ps_ngbr_avbl->u1_mb_a = 1;
164     }
165 
166     /* top macroblock availability */
167     if (i4_mb_y == 0)
168     { /* macroblocks along first row */
169         ps_ngbr_avbl->u1_mb_b = 0;
170     }
171     else
172     { /* macroblocks belong to same slice? */
173         if (*pu1_slice_idx_b != *pu1_slice_idx_curr)
174             ps_ngbr_avbl->u1_mb_b = 0;
175         else
176             ps_ngbr_avbl->u1_mb_b = 1;
177     }
178 
179     /* top right macroblock availability */
180     if (i4_mb_x == i4_wd_mbs-1 || i4_mb_y == 0)
181     { /* macroblocks along last column */
182         ps_ngbr_avbl->u1_mb_c = 0;
183     }
184     else
185     { /* macroblocks belong to same slice? */
186         if (*pu1_slice_idx_c != *pu1_slice_idx_curr)
187             ps_ngbr_avbl->u1_mb_c = 0;
188         else
189             ps_ngbr_avbl->u1_mb_c = 1;
190     }
191 
192     /* top left macroblock availability */
193     if (i4_mb_x == 0 || i4_mb_y == 0)
194     { /* macroblocks along first column */
195         ps_ngbr_avbl->u1_mb_d = 0;
196     }
197     else
198     { /* macroblocks belong to same slice? */
199         if (*pu1_slice_idx_d != *pu1_slice_idx_curr)
200             ps_ngbr_avbl->u1_mb_d = 0;
201         else
202             ps_ngbr_avbl->u1_mb_d = 1;
203     }
204 }
205 
206 /**
207 ******************************************************************************
208 *
209 * @brief
210 *  derivation process for subblock/partition availability
211 *
212 * @par   Description
213 *  Calculates the availability of the left, top, topright and topleft subblock
214 *  or partitions.
215 *
216 * @param[in]    ps_proc_ctxt
217 *  pointer to macroblock context (handle)
218 *
219 * @param[in]    i1_pel_pos_x
220 *  column position of the pel wrt the current block
221 *
222 * @param[in]    i1_pel_pos_y
223 *  row position of the pel in wrt current block
224 *
225 * @remarks     Assumptions: before calling this function it is assumed that
226 *   the neighbor availability of the current macroblock is already derived.
227 *   Based on table 6-3 of H264 specification
228 *
229 * @return      availability status (yes or no)
230 *
231 ******************************************************************************
232 */
ih264e_derive_ngbr_avbl_of_mb_partitions(block_neighbors_t * ps_ngbr_avbl,WORD8 i1_pel_pos_x,WORD8 i1_pel_pos_y)233 UWORD8 ih264e_derive_ngbr_avbl_of_mb_partitions(block_neighbors_t *ps_ngbr_avbl,
234                                                 WORD8 i1_pel_pos_x,
235                                                 WORD8 i1_pel_pos_y)
236 {
237     UWORD8 u1_neighbor_avail=0;
238 
239     /**********************************************************************/
240     /* values of i1_pel_pos_x in the range 0-15 inclusive correspond to   */
241     /* various columns of a macroblock                                    */
242     /*                                                                    */
243     /* values of i1_pel_pos_y in the range 0-15 inclusive correspond to   */
244     /* various rows of a macroblock                                       */
245     /*                                                                    */
246     /* other values of i1_pel_pos_x & i1_pel_pos_y represents elements    */
247     /* outside the bound of an mb ie., represents its neighbors.          */
248     /**********************************************************************/
249     if (i1_pel_pos_x < 0)
250     { /* column(-1) */
251         if (i1_pel_pos_y < 0)
252         { /* row(-1) */
253             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_d; /* current mb topleft availability */
254         }
255         else if (i1_pel_pos_y >= 0 && i1_pel_pos_y < 16)
256         { /* all rows of a macroblock */
257             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_a; /* current mb left availability */
258         }
259         else /* if (i1_pel_pos_y >= 16) */
260         { /* rows(+16) */
261             u1_neighbor_avail = 0;  /* current mb bottom left availability */
262         }
263     }
264     else if (i1_pel_pos_x >= 0 && i1_pel_pos_x < 16)
265     { /* all columns of a macroblock */
266         if (i1_pel_pos_y < 0)
267         { /* row(-1) */
268             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_b; /* current mb top availability */
269         }
270         else if (i1_pel_pos_y >= 0 && i1_pel_pos_y < 16)
271         { /* all rows of a macroblock */
272             u1_neighbor_avail = 1; /* current mb availability */
273             /* availability of the partition is dependent on the position of the partition inside the mb */
274             /* although the availability is declared as 1 in all cases these needs to be corrected somewhere else and this is not done in here */
275         }
276         else /* if (i1_pel_pos_y >= 16) */
277         { /* rows(+16) */
278             u1_neighbor_avail = 0;  /* current mb bottom availability */
279         }
280     }
281     else if (i1_pel_pos_x >= 16)
282     { /* column(+16) */
283         if (i1_pel_pos_y < 0)
284         { /* row(-1) */
285             u1_neighbor_avail = ps_ngbr_avbl->u1_mb_c; /* current mb top right availability */
286         }
287         else /* if (i1_pel_pos_y >= 0) */
288         { /* all other rows */
289             u1_neighbor_avail = 0;  /* current mb right & bottom right availability */
290         }
291     }
292 
293     return u1_neighbor_avail;
294 }
295 
296 /**
297 ******************************************************************************
298 *
299 * @brief
300 *  evaluate best intra 16x16 mode (rate distortion opt off)
301 *
302 * @par Description
303 *  This function evaluates all the possible intra 16x16 modes and finds the mode
304 *  that best represents the macro-block (least distortion) and occupies fewer
305 *  bits in the bit-stream.
306 *
307 * @param[in]   ps_proc_ctxt
308 *  pointer to process context (handle)
309 *
310 * @remarks
311 *  Ideally the cost of encoding a macroblock is calculated as
312 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
313 *  input block and the reconstructed block and rate is the number of bits taken
314 *  to place the macroblock in the bit-stream. In this routine the rate does not
315 *  exactly point to the total number of bits it takes, rather it points to header
316 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
317 *  and residual bits fall in to texture bits the number of bits taken to encoding
318 *  mbtype is considered as rate, we compute cost. Further we will approximate
319 *  the distortion as the deviation b/w input and the predicted block as opposed
320 *  to input and reconstructed block.
321 *
322 *  NOTE: As per the Document JVT-O079, for intra 16x16 macroblock,
323 *  the SAD and cost are one and the same.
324 *
325 * @return     none
326 *
327 ******************************************************************************
328 */
ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff(process_ctxt_t * ps_proc)329 void ih264e_evaluate_intra16x16_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
330 {
331     /* Codec Context */
332     codec_t *ps_codec = ps_proc->ps_codec;
333 
334     /* SAD(distortion metric) of an 8x8 block */
335     WORD32 i4_mb_distortion = INT_MAX, i4_mb_distortion_least = INT_MAX;
336 
337     /* lambda */
338     UWORD32 u4_lambda = ps_proc->u4_lambda;
339 
340     /* cost = distortion + lambda*rate */
341     WORD32 i4_mb_cost= INT_MAX, i4_mb_cost_least = INT_MAX;
342 
343     /* intra mode */
344     UWORD32 u4_intra_mode, u4_best_intra_16x16_mode = DC_I16x16;
345 
346     /* neighbor pels for intra prediction */
347     UWORD8 *pu1_ngbr_pels_i16 = ps_proc->au1_ngbr_pels;
348 
349     /* neighbor availability */
350     WORD32 i4_ngbr_avbl;
351 
352     /* pointer to src macro block */
353     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_luma;
354     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_luma;
355 
356     /* pointer to prediction macro block */
357     UWORD8 *pu1_pred_mb_intra_16x16 = ps_proc->pu1_pred_mb_intra_16x16;
358     UWORD8 *pu1_pred_mb_intra_16x16_plane = ps_proc->pu1_pred_mb_intra_16x16_plane;
359 
360     /* strides */
361     WORD32 i4_src_strd = ps_proc->i4_src_strd;
362     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
363     WORD32 i4_rec_strd = ps_proc->i4_rec_strd;
364 
365     /* pointer to neighbors left, top, topleft */
366     UWORD8 *pu1_mb_a = pu1_ref_mb - 1;
367     UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd;
368     UWORD8 *pu1_mb_d = pu1_mb_b - 1;
369     UWORD8 u1_mb_a, u1_mb_b, u1_mb_d;
370 
371     /* valid intra modes map */
372     UWORD32 u4_valid_intra_modes;
373 
374     /* lut for valid intra modes */
375     const UWORD8 u1_valid_intra_modes[8] = {4, 6, 4, 6, 5, 7, 5, 15};
376 
377     /* temp var */
378     UWORD32 i, u4_enable_fast_sad = 0, offset = 0;
379     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
380     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
381 
382     /* init temp var */
383     if (ps_proc->i4_slice_type != ISLICE)
384     {
385         /* Offset for MBtype */
386         offset = (ps_proc->i4_slice_type == PSLICE) ? 5 : 23;
387         u4_enable_fast_sad = ps_proc->s_me_ctxt.u4_enable_fast_sad;
388     }
389 
390     /* locating neighbors that are available for prediction */
391 
392     /* gather prediction pels from the neighbors, if particular set is not available
393      * it is set to zero*/
394     /* left pels */
395     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
396                     && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
397     if (u1_mb_a)
398     {
399         for(i = 0; i < 16; i++)
400             pu1_ngbr_pels_i16[16-1-i] = pu1_mb_a[i * i4_rec_strd];
401     }
402     else
403     {
404         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16,0,MB_SIZE);
405     }
406     /* top pels */
407     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
408                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
409     if (u1_mb_b)
410     {
411         ps_codec->pf_mem_cpy_mul8(pu1_ngbr_pels_i16+16+1,pu1_mb_b,16);
412     }
413     else
414     {
415         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_i16+16+1,0,MB_SIZE);
416     }
417     /* topleft pels */
418     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
419                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
420     if (u1_mb_d)
421     {
422         pu1_ngbr_pels_i16[16] = *pu1_mb_d;
423     }
424     else
425     {
426         pu1_ngbr_pels_i16[16] = 0;
427     }
428 
429     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_b << 2) + (u1_mb_d << 1);
430     ps_proc->i4_ngbr_avbl_16x16_mb = i4_ngbr_avbl;
431 
432     /* set valid intra modes for evaluation */
433     u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl];
434 
435     if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST ||
436                     ps_codec->s_cfg.u4_enc_speed_preset == IVE_FASTEST)
437         u4_valid_intra_modes &= ~(1 << PLANE_I16x16);
438 
439     /* evaluate b/w HORZ_I16x16, VERT_I16x16 & DC_I16x16 */
440     ps_codec->pf_ih264e_evaluate_intra16x16_modes(pu1_curr_mb, pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16,
441                                                   i4_src_strd, i4_pred_strd,
442                                                   i4_ngbr_avbl, &u4_intra_mode, &i4_mb_distortion_least,
443                                                   u4_valid_intra_modes);
444 
445     /* cost = distortion + lambda*rate */
446     i4_mb_cost_least = i4_mb_distortion_least;
447 
448     if (((u4_valid_intra_modes >> 3) & 1) != 0)
449     {
450         /* intra prediction for PLANE mode*/
451         (ps_codec->apf_intra_pred_16_l)[PLANE_I16x16](pu1_ngbr_pels_i16, pu1_pred_mb_intra_16x16_plane, 0, i4_pred_strd, i4_ngbr_avbl);
452 
453         /* evaluate distortion between the actual blk and the estimated blk for the given mode */
454         ps_codec->apf_compute_sad_16x16[u4_enable_fast_sad](pu1_curr_mb, pu1_pred_mb_intra_16x16_plane, i4_src_strd, i4_pred_strd, i4_mb_cost_least, &i4_mb_distortion);
455 
456         /* cost = distortion + lambda*rate */
457         i4_mb_cost = i4_mb_distortion;
458 
459         /* update the least cost information if necessary */
460         if(i4_mb_cost < i4_mb_distortion_least)
461         {
462             u4_intra_mode = PLANE_I16x16;
463 
464             i4_mb_cost_least = i4_mb_cost;
465             i4_mb_distortion_least = i4_mb_distortion;
466         }
467     }
468 
469     u4_best_intra_16x16_mode = u4_intra_mode;
470 
471     DEBUG("%d partition cost, %d intra mode\n", i4_mb_cost_least * 32, u4_best_intra_16x16_mode);
472 
473     ps_proc->u1_l_i16_mode = u4_best_intra_16x16_mode;
474 
475     /* cost = distortion + lambda*rate */
476     i4_mb_cost_least    = i4_mb_distortion_least + u4_lambda*u1_uev_codelength[offset + u4_best_intra_16x16_mode];
477 
478 
479     /* update the type of the mb if necessary */
480     if (i4_mb_cost_least < ps_proc->i4_mb_cost)
481     {
482         ps_proc->i4_mb_cost = i4_mb_cost_least;
483         ps_proc->i4_mb_distortion = i4_mb_distortion_least;
484         ps_proc->u4_mb_type = I16x16;
485     }
486     if (i4_mb_cost_least < ps_proc->i4_mb_intra_cost)
487     {
488         ps_proc->i4_mb_intra_cost = i4_mb_cost_least;
489     }
490 
491     return ;
492 }
493 
494 
495 /**
496 ******************************************************************************
497 *
498 * @brief
499 *  evaluate best intra 8x8 mode (rate distortion opt on)
500 *
501 * @par Description
502 *  This function evaluates all the possible intra 8x8 modes and finds the mode
503 *  that best represents the macro-block (least distortion) and occupies fewer
504 *  bits in the bit-stream.
505 *
506 * @param[in]    ps_proc_ctxt
507 *  pointer to proc ctxt
508 *
509 * @remarks Ideally the cost of encoding a macroblock is calculated as
510 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
511 *  input block and the reconstructed block and rate is the number of bits taken
512 *  to place the macroblock in the bit-stream. In this routine the rate does not
513 *  exactly point to the total number of bits it takes, rather it points to header
514 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
515 *  and residual bits fall in to texture bits the number of bits taken to encoding
516 *  mbtype is considered as rate, we compute cost. Further we will approximate
517 *  the distortion as the deviation b/w input and the predicted block as opposed
518 *  to input and reconstructed block.
519 *
520 *  NOTE: TODO: This function needs to be tested
521 *
522 *  @return      none
523 *
524 ******************************************************************************
525 */
ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t * ps_proc)526 void ih264e_evaluate_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
527 {
528     /* Codec Context */
529     codec_t *ps_codec = ps_proc->ps_codec;
530 
531     /* SAD(distortion metric) of an 4x4 block */
532     WORD32 i4_partition_distortion, i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
533 
534     /* lambda */
535     UWORD32 u4_lambda = ps_proc->u4_lambda;
536 
537     /* cost = distortion + lambda*rate */
538     WORD32 i4_partition_cost, i4_partition_cost_least, i4_total_cost = u4_lambda;
539 
540     /* cost due to mbtype */
541     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
542 
543     /* intra mode */
544     UWORD32 u4_intra_mode, u4_best_intra_8x8_mode = DC_I8x8, u4_estimated_intra_8x8_mode;
545 
546     /* neighbor pels for intra prediction */
547     UWORD8 *pu1_ngbr_pels_i8 = ps_proc->au1_ngbr_pels;
548 
549     /* pointer to curr partition */
550     UWORD8 *pu1_mb_curr;
551 
552     /* pointer to prediction macro block */
553     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
554 
555     /* strides */
556     WORD32 i4_src_strd = ps_proc->i4_src_strd;
557     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
558 
559     /* neighbors left, top, top right, top left */
560     UWORD8 *pu1_mb_a;
561     UWORD8 *pu1_mb_b;
562     UWORD8 *pu1_mb_d;
563 
564     /* neighbor availability */
565     WORD32 i4_ngbr_avbl;
566     block_neighbors_t s_ngbr_avbl;
567 
568     /* temp vars */
569     UWORD32  b8, u4_pix_x, u4_pix_y;
570     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
571     block_neighbors_t s_ngbr_avbl_MB;
572 
573     /* ngbr mb syntax information */
574     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
575     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
576     mb_info_t *ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
577 
578     /* valid intra modes map */
579     UWORD32 u4_valid_intra_modes;
580 
581     if (ps_proc->ps_ngbr_avbl->u1_mb_c)
582     {
583         ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + (ps_proc->i4_mb_x + 1);
584     }
585     /* left pels */
586     s_ngbr_avbl_MB.u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
587                                   && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
588 
589     /* top pels */
590     s_ngbr_avbl_MB.u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
591                                   && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
592 
593     /* topleft pels */
594     s_ngbr_avbl_MB.u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
595                                   && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
596 
597     /* top right */
598     s_ngbr_avbl_MB.u1_mb_c = ((ps_proc->ps_ngbr_avbl->u1_mb_c)
599                                   && (u4_constrained_intra_pred ? ps_top_right_mb_syn_ele->u2_is_intra : 1));
600 
601 
602     for (b8 = 0; b8 < 4; b8++)
603     {
604         u4_pix_x = (b8 & 0x01) << 3;
605         u4_pix_y = (b8 >> 1) << 3;
606 
607         pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
608         /* when rdopt is off, we use the input as reference for constructing prediction buffer */
609         /* as opposed to using the recon pels. (open loop intra prediction) */
610         pu1_mb_a = pu1_mb_curr - 1; /* pointer to left macro block */
611         pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top macro block */
612         pu1_mb_d = pu1_mb_b - 1; /* pointer to top left macro block */
613 
614         /* locating neighbors that are available for prediction */
615         /* TODO : update the neighbor availability information basing on constrained intra pred information */
616         /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
617         /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
618         s_ngbr_avbl.u1_mb_a = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x - 1, u4_pix_y); /* xD = -1, yD = 0 */
619         s_ngbr_avbl.u1_mb_b = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x, u4_pix_y - 1); /* xD = 0, yD = -1 */
620         s_ngbr_avbl.u1_mb_c = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x + 8, u4_pix_y - 1); /* xD = BLK_8x8_SIZE, yD = -1 */
621         s_ngbr_avbl.u1_mb_d = ih264e_derive_ngbr_avbl_of_mb_partitions(&s_ngbr_avbl_MB, u4_pix_x - 1, u4_pix_y - 1); /* xD = -1, yD = -1 */
622 
623         /* i4_ngbr_avbl = blk_a * LEFT_MB_AVAILABLE_MASK + blk_b * TOP_MB_AVAILABLE_MASK + blk_c * TOP_RIGHT_MB_AVAILABLE_MASK + blk_d * TOP_LEFT_MB_AVAILABLE_MASK */
624         i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) +  (s_ngbr_avbl.u1_mb_c << 3) +
625                         (s_ngbr_avbl.u1_mb_a << 4);
626         /* if top partition is available and top right is not available for intra prediction, then */
627         /* padd top right samples using top sample and make top right also available */
628         /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) +  ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
629         ps_proc->ai4_neighbor_avail_8x8_subblks[b8] = i4_ngbr_avbl;
630 
631 
632         ih264_intra_pred_luma_8x8_mode_ref_filtering(pu1_mb_a, pu1_mb_b, pu1_mb_d, pu1_ngbr_pels_i8,
633                                                      i4_src_strd, i4_ngbr_avbl);
634 
635         i4_partition_cost_least = INT_MAX;
636         /* set valid intra modes for evaluation */
637         u4_valid_intra_modes = 0x1ff;
638 
639         if (!s_ngbr_avbl.u1_mb_b)
640         {
641             u4_valid_intra_modes &= ~(1 << VERT_I4x4);
642             u4_valid_intra_modes &= ~(1 << DIAG_DL_I4x4);
643             u4_valid_intra_modes &= ~(1 << VERT_L_I4x4);
644         }
645         if (!s_ngbr_avbl.u1_mb_a)
646         {
647             u4_valid_intra_modes &= ~(1 << HORZ_I4x4);
648             u4_valid_intra_modes &= ~(1 << HORZ_U_I4x4);
649         }
650         if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b || !s_ngbr_avbl.u1_mb_d)
651         {
652             u4_valid_intra_modes &= ~(1 << DIAG_DR_I4x4);
653             u4_valid_intra_modes &= ~(1 << VERT_R_I4x4);
654             u4_valid_intra_modes &= ~(1 << HORZ_D_I4x4);
655         }
656 
657         /* estimate the intra 8x8 mode for the current partition (for evaluating cost) */
658         if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
659         {
660             u4_estimated_intra_8x8_mode = DC_I8x8;
661         }
662         else
663         {
664             UWORD32 u4_left_intra_8x8_mode = DC_I8x8;
665             UWORD32 u4_top_intra_8x8_mode = DC_I8x8;
666 
667             if (u4_pix_x == 0)
668             {
669                 if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
670                 {
671                     u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[b8+1];
672                 }
673                 else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
674                 {
675                     u4_left_intra_8x8_mode = ps_proc->au1_left_mb_intra_modes[(b8+1)*4+2];
676                 }
677             }
678             else
679             {
680                 u4_left_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-1];
681             }
682 
683             if (u4_pix_y == 0)
684             {
685                 if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
686                 {
687                     u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[b8+2];
688                 }
689                 else if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
690                 {
691                     u4_top_intra_8x8_mode = pu1_top_mb_intra_modes[(b8+2)*4+2];
692                 }
693             }
694             else
695             {
696                 u4_top_intra_8x8_mode = ps_proc->au1_intra_luma_mb_8x8_modes[b8-2];
697             }
698 
699             u4_estimated_intra_8x8_mode = MIN(u4_left_intra_8x8_mode, u4_top_intra_8x8_mode);
700         }
701 
702         /* perform intra mode 8x8 evaluation */
703         for (u4_intra_mode = VERT_I8x8; u4_valid_intra_modes != 0; u4_intra_mode++, u4_valid_intra_modes >>= 1)
704         {
705             if ( (u4_valid_intra_modes & 1) == 0)
706                 continue;
707 
708             /* intra prediction */
709             (ps_codec->apf_intra_pred_8_l)[u4_intra_mode](pu1_ngbr_pels_i8, pu1_pred_mb, 0, i4_pred_strd, i4_ngbr_avbl);
710 
711             /* evaluate distortion between the actual blk and the estimated blk for the given mode */
712             ime_compute_sad_8x8(pu1_mb_curr, pu1_pred_mb, i4_src_strd, i4_pred_strd, i4_partition_cost_least, &i4_partition_distortion);
713 
714             i4_partition_cost = i4_partition_distortion + ((u4_estimated_intra_8x8_mode == u4_intra_mode)?u4_cost_one_bit:u4_cost_four_bits);
715 
716             /* update the least cost information if necessary */
717             if (i4_partition_cost < i4_partition_cost_least)
718             {
719                 i4_partition_cost_least = i4_partition_cost;
720                 i4_partition_distortion_least = i4_partition_distortion;
721                 u4_best_intra_8x8_mode = u4_intra_mode;
722             }
723         }
724         /* macroblock distortion */
725         i4_total_cost += i4_partition_cost_least;
726         i4_total_distortion += i4_partition_distortion_least;
727         /* mb partition mode */
728         ps_proc->au1_intra_luma_mb_8x8_modes[b8] = u4_best_intra_8x8_mode;
729 
730     }
731 
732     /* update the type of the mb if necessary */
733     if (i4_total_cost < ps_proc->i4_mb_cost)
734     {
735         ps_proc->i4_mb_cost = i4_total_cost;
736         ps_proc->i4_mb_distortion = i4_total_distortion;
737         ps_proc->u4_mb_type = I8x8;
738     }
739     if (i4_total_cost < ps_proc->i4_mb_intra_cost)
740     {
741         ps_proc->i4_mb_intra_cost = i4_total_cost;
742     }
743 
744     return ;
745 }
746 
747 
748 /**
749 ******************************************************************************
750 *
751 * @brief
752 *  evaluate best intra 4x4 mode (rate distortion opt off)
753 *
754 * @par Description
755 *  This function evaluates all the possible intra 4x4 modes and finds the mode
756 *  that best represents the macro-block (least distortion) and occupies fewer
757 *  bits in the bit-stream.
758 *
759 * @param[in]    ps_proc_ctxt
760 *  pointer to proc ctxt
761 *
762 * @remarks
763 *  Ideally the cost of encoding a macroblock is calculated as
764 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
765 *  input block and the reconstructed block and rate is the number of bits taken
766 *  to place the macroblock in the bit-stream. In this routine the rate does not
767 *  exactly point to the total number of bits it takes, rather it points to header
768 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
769 *  and residual bits fall in to texture bits the number of bits taken to encoding
770 *  mbtype is considered as rate, we compute cost. Further we will approximate
771 *  the distortion as the deviation b/w input and the predicted block as opposed
772 *  to input and reconstructed block.
773 *
774 *  NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
775 *  24*lambda is added to the SAD before comparison with the best SAD for
776 *  inter prediction. This is an empirical value to prevent using too many intra
777 *  blocks.
778 *
779 * @return      none
780 *
781 ******************************************************************************
782 */
ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff(process_ctxt_t * ps_proc)783 void ih264e_evaluate_intra4x4_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
784 {
785     /* Codec Context */
786     codec_t *ps_codec = ps_proc->ps_codec;
787 
788     /* SAD(distortion metric) of an 4x4 block */
789     WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
790 
791     /* lambda */
792     UWORD32 u4_lambda = ps_proc->u4_lambda;
793 
794     /* cost = distortion + lambda*rate */
795     WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda;
796 
797     /* cost due to mbtype */
798     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
799 
800     /* intra mode */
801     UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode;
802 
803     /* neighbor pels for intra prediction */
804     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
805 
806     /* pointer to curr partition */
807     UWORD8 *pu1_mb_curr;
808 
809     /* pointer to prediction macro block */
810     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
811 
812     /* strides */
813     WORD32 i4_src_strd = ps_proc->i4_src_strd;
814     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
815 
816     /* neighbors left, top, top right, top left */
817     UWORD8 *pu1_mb_a;
818     UWORD8 *pu1_mb_b;
819     UWORD8 *pu1_mb_c;
820     UWORD8 *pu1_mb_d;
821 
822     /* neighbor availability */
823     WORD32 i4_ngbr_avbl;
824     block_neighbors_t s_ngbr_avbl;
825 
826     /* temp vars */
827     UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y;
828 
829     /* scan order inside 4x4 block */
830     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
831 
832     /* ngbr sub mb modes */
833     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
834     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
835     mb_info_t *ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
836 
837     /* valid intra modes map */
838     UWORD32 u4_valid_intra_modes;
839     UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511};
840 
841     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
842     UWORD8 u1_mb_a, u1_mb_b, u1_mb_c, u1_mb_d;
843 
844     if (ps_proc->ps_ngbr_avbl->u1_mb_c)
845     {
846         ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x + 1;
847     }
848     /* left pels */
849     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
850                     && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
851 
852     /* top pels */
853     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
854                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
855 
856     /* topleft pels */
857     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
858                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
859 
860     /* top right */
861     u1_mb_c = ((ps_proc->ps_ngbr_avbl->u1_mb_c)
862                     && (u4_constrained_intra_pred ? ps_top_right_mb_syn_ele->u2_is_intra : 1));
863 
864     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_d << 1) + (u1_mb_b << 2) + (u1_mb_c << 3);
865     memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16);
866 
867     for (b8 = 0; b8 < 4; b8++)
868     {
869         u4_blk_x = (b8 & 0x01) << 3;
870         u4_blk_y = (b8 >> 1) << 3;
871         for (b4 = 0; b4 < 4; b4++)
872         {
873             u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2);
874             u4_pix_y = u4_blk_y + ((b4 >> 1) << 2);
875 
876             pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
877             /* when rdopt is off, we use the input as reference for constructing prediction buffer */
878             /* as opposed to using the recon pels. (open loop intra prediction) */
879             pu1_mb_a = pu1_mb_curr - 1; /* pointer to left macro block */
880             pu1_mb_b = pu1_mb_curr - i4_src_strd; /* pointer to top macro block */
881             pu1_mb_c = pu1_mb_b + 4; /* pointer to top macro block */
882             pu1_mb_d = pu1_mb_b - 1; /* pointer to top left macro block */
883 
884             /* locating neighbors that are available for prediction */
885             /* TODO : update the neighbor availability information basing on constrained intra pred information */
886             /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
887             /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
888 
889             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
890             s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1);
891             s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1;
892             s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2;
893             s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3;
894             /* set valid intra modes for evaluation */
895             u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7];
896 
897             /* if top partition is available and top right is not available for intra prediction, then */
898             /* padd top right samples using top sample and make top right also available */
899             /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
900 
901             /* gather prediction pels from the neighbors */
902             if (s_ngbr_avbl.u1_mb_a)
903             {
904                 for(i = 0; i < 4; i++)
905                     pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_src_strd];
906             }
907             else
908             {
909                 memset(pu1_ngbr_pels_i4, 0, 4);
910             }
911 
912             if (s_ngbr_avbl.u1_mb_b)
913             {
914                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
915             }
916             else
917             {
918                 memset(pu1_ngbr_pels_i4 + 5, 0, 4);
919             }
920 
921             if (s_ngbr_avbl.u1_mb_d)
922                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
923             else
924                 pu1_ngbr_pels_i4[4] = 0;
925 
926             if (s_ngbr_avbl.u1_mb_c)
927             {
928                 memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
929             }
930             else if (s_ngbr_avbl.u1_mb_b)
931             {
932                 memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4);
933                 s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b;
934             }
935 
936             i4_partition_cost_least = INT_MAX;
937 
938             /* predict the intra 4x4 mode for the current partition (for evaluating cost) */
939             if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
940             {
941                 u4_estimated_intra_4x4_mode = DC_I4x4;
942             }
943             else
944             {
945                 UWORD32 u4_left_intra_4x4_mode = DC_I4x4;
946                 UWORD32 u4_top_intra_4x4_mode = DC_I4x4;
947 
948                 if (u4_pix_x == 0)
949                 {
950                     if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
951                     {
952                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]];
953                     }
954                     else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
955                     {
956                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1];
957                     }
958                 }
959                 else
960                 {
961                     u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]];
962                 }
963 
964                 if (u4_pix_y == 0)
965                 {
966                     if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
967                     {
968                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]];
969                     }
970                     else if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
971                     {
972                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2];
973                     }
974                 }
975                 else
976                 {
977                     u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]];
978                 }
979 
980                 u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode);
981             }
982 
983             ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode;
984 
985             /* mode evaluation and prediction */
986             ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr,
987                                                          pu1_ngbr_pels_i4,
988                                                          pu1_pred_mb, i4_src_strd,
989                                                          i4_pred_strd, i4_ngbr_avbl,
990                                                          &u4_best_intra_4x4_mode,
991                                                          &i4_partition_cost_least,
992                                                          u4_valid_intra_modes,
993                                                          u4_lambda,
994                                                          u4_estimated_intra_4x4_mode);
995 
996 
997             i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode) ? u4_cost_one_bit : u4_cost_four_bits);
998 
999             DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode);
1000             /* macroblock distortion */
1001             i4_total_distortion += i4_partition_distortion_least;
1002             i4_total_cost += i4_partition_cost_least;
1003             /* mb partition mode */
1004             ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode;
1005         }
1006     }
1007 
1008     /* update the type of the mb if necessary */
1009     if (i4_total_cost < ps_proc->i4_mb_cost)
1010     {
1011         ps_proc->i4_mb_cost = i4_total_cost;
1012         ps_proc->i4_mb_distortion = i4_total_distortion;
1013         ps_proc->u4_mb_type = I4x4;
1014     }
1015     if (i4_total_cost < ps_proc->i4_mb_intra_cost)
1016     {
1017         ps_proc->i4_mb_intra_cost = i4_total_cost;
1018     }
1019 
1020     return ;
1021 }
1022 
1023 /**
1024 ******************************************************************************
1025 *
1026 * @brief evaluate best intra 4x4 mode (rate distortion opt on)
1027 *
1028 * @par Description
1029 *  This function evaluates all the possible intra 4x4 modes and finds the mode
1030 *  that best represents the macro-block (least distortion) and occupies fewer
1031 *  bits in the bit-stream.
1032 *
1033 * @param[in]    ps_proc_ctxt
1034 *  pointer to proc ctxt
1035 *
1036 * @remarks
1037 *  Ideally the cost of encoding a macroblock is calculated as
1038 *  (distortion + lambda*rate). Where distortion is SAD/SATD,... between the
1039 *  input block and the reconstructed block and rate is the number of bits taken
1040 *  to place the macroblock in the bit-stream. In this routine the rate does not
1041 *  exactly point to the total number of bits it takes, rather it points to header
1042 *  bits necessary for encoding the macroblock. Assuming the deltaQP, cbp bits
1043 *  and residual bits fall in to texture bits the number of bits taken to encoding
1044 *  mbtype is considered as rate, we compute cost. Further we will approximate
1045 *  the distortion as the deviation b/w input and the predicted block as opposed
1046 *  to input and reconstructed block.
1047 *
1048 *  NOTE: As per the Document JVT-O079, for the whole intra 4x4 macroblock,
1049 *  24*lambda is added to the SAD before comparison with the best SAD for
1050 *  inter prediction. This is an empirical value to prevent using too many intra
1051 *  blocks.
1052 *
1053 * @return      none
1054 *
1055 ******************************************************************************
1056 */
ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton(process_ctxt_t * ps_proc)1057 void ih264e_evaluate_intra4x4_modes_for_least_cost_rdopton(process_ctxt_t *ps_proc)
1058 {
1059     /* Codec Context */
1060     codec_t *ps_codec = ps_proc->ps_codec;
1061 
1062     /* SAD(distortion metric) of an 4x4 block */
1063     WORD32 i4_partition_distortion_least = INT_MAX, i4_total_distortion = 0;
1064 
1065     /* lambda */
1066     UWORD32 u4_lambda = ps_proc->u4_lambda;
1067 
1068     /* cost = distortion + lambda*rate */
1069     WORD32 i4_partition_cost_least, i4_total_cost = (24 + 1) * u4_lambda;
1070 
1071     /* cost due to mbtype */
1072     UWORD32 u4_cost_one_bit = u4_lambda, u4_cost_four_bits = 4 * u4_lambda;
1073 
1074     /* intra mode */
1075     UWORD32 u4_best_intra_4x4_mode = DC_I4x4, u4_estimated_intra_4x4_mode;
1076 
1077     /* neighbor pels for intra prediction */
1078     UWORD8 *pu1_ngbr_pels_i4 = ps_proc->au1_ngbr_pels;
1079 
1080     /* pointer to curr partition */
1081     UWORD8 *pu1_mb_curr;
1082     UWORD8 *pu1_mb_ref_left, *pu1_mb_ref_top;
1083     UWORD8 *pu1_ref_mb_intra_4x4;
1084 
1085     /* pointer to residual macro block */
1086     WORD16 *pi2_res_mb = ps_proc->pi2_res_buf_intra_4x4;
1087 
1088     /* pointer to prediction macro block */
1089     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb;
1090 
1091     /* strides */
1092     WORD32 i4_src_strd = ps_proc->i4_src_strd;
1093     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
1094     WORD32 i4_ref_strd_left, i4_ref_strd_top;
1095 
1096     /* neighbors left, top, top right, top left */
1097     UWORD8 *pu1_mb_a;
1098     UWORD8 *pu1_mb_b;
1099     UWORD8 *pu1_mb_c;
1100     UWORD8 *pu1_mb_d;
1101 
1102     /* number of non zero coeffs*/
1103     UWORD8 *pu1_nnz = (UWORD8 *)ps_proc->au4_nnz_intra_4x4;
1104 
1105     /* quantization parameters */
1106     quant_params_t *ps_qp_params = ps_proc->ps_qp_params[0];
1107 
1108     /* neighbor availability */
1109     WORD32 i4_ngbr_avbl;
1110     block_neighbors_t s_ngbr_avbl;
1111 
1112     /* temp vars */
1113     UWORD32 i, b8, b4, u4_blk_x, u4_blk_y, u4_pix_x, u4_pix_y;
1114 
1115     /* scan order inside 4x4 block */
1116     const UWORD8 u1_scan_order[16] = {0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15};
1117 
1118     /* ngbr sub mb modes */
1119     UWORD8 *pu1_top_mb_intra_modes = ps_proc->pu1_top_mb_intra_modes + (ps_proc->i4_mb_x << 4);
1120     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
1121     mb_info_t *ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
1122 
1123     /* valid intra modes map */
1124     UWORD32 u4_valid_intra_modes;
1125     UWORD16 u2_valid_modes[8] = {4, 262, 4, 262, 141, 399, 141, 511};
1126 
1127     /* Dummy variable for 4x4 trans function */
1128     WORD16 i2_dc_dummy;
1129     UWORD8 u1_mb_a, u1_mb_b, u1_mb_c, u1_mb_d;
1130     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
1131 
1132     /* compute ngbr availability for sub blks */
1133     if (ps_proc->ps_ngbr_avbl->u1_mb_c)
1134     {
1135         ps_top_right_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + (ps_proc->i4_mb_x + 1);
1136     }
1137 
1138     /* left pels */
1139     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
1140                     && (u4_constrained_intra_pred ? ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
1141 
1142        /* top pels */
1143     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
1144                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
1145 
1146        /* topleft pels */
1147     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
1148                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
1149 
1150        /* top right pels */
1151     u1_mb_c = ((ps_proc->ps_ngbr_avbl->u1_mb_c)
1152                     && (u4_constrained_intra_pred ? ps_top_right_mb_syn_ele->u2_is_intra : 1));
1153 
1154     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_d << 1) + (u1_mb_b << 2) + (u1_mb_c << 3);
1155     memcpy(ps_proc->au1_ngbr_avbl_4x4_subblks, gau1_ih264_4x4_ngbr_avbl[i4_ngbr_avbl], 16);
1156 
1157     for (b8 = 0; b8 < 4; b8++)
1158     {
1159         u4_blk_x = (b8 & 0x01) << 3;
1160         u4_blk_y = (b8 >> 1) << 3;
1161         for (b4 = 0; b4 < 4; b4++, pu1_nnz++, pi2_res_mb += MB_SIZE)
1162         {
1163             u4_pix_x = u4_blk_x + ((b4 & 0x01) << 2);
1164             u4_pix_y = u4_blk_y + ((b4 >> 1) << 2);
1165 
1166             pu1_ref_mb_intra_4x4 = ps_proc->pu1_ref_mb_intra_4x4 + u4_pix_x + (u4_pix_y * i4_pred_strd);
1167             pu1_mb_curr = ps_proc->pu1_src_buf_luma + u4_pix_x + (u4_pix_y * i4_src_strd);
1168             if (u4_pix_x == 0)
1169             {
1170                 i4_ref_strd_left = ps_proc->i4_rec_strd;
1171                 pu1_mb_ref_left = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_left);
1172             }
1173             else
1174             {
1175                 i4_ref_strd_left = i4_pred_strd;
1176                 pu1_mb_ref_left = pu1_ref_mb_intra_4x4;
1177             }
1178             if (u4_pix_y == 0)
1179             {
1180                 i4_ref_strd_top = ps_proc->i4_rec_strd;
1181                 pu1_mb_ref_top = ps_proc->pu1_rec_buf_luma + u4_pix_x + (u4_pix_y * i4_ref_strd_top);
1182             }
1183             else
1184             {
1185                 i4_ref_strd_top = i4_pred_strd;
1186                 pu1_mb_ref_top = pu1_ref_mb_intra_4x4;
1187             }
1188 
1189             pu1_mb_a = pu1_mb_ref_left - 1; /* pointer to left macro block */
1190             pu1_mb_b = pu1_mb_ref_top - i4_ref_strd_top; /* pointer to top macro block */
1191             pu1_mb_c = pu1_mb_b + 4; /* pointer to top right macro block */
1192             if (u4_pix_y == 0)
1193                 pu1_mb_d = pu1_mb_b - 1;
1194             else
1195                 pu1_mb_d = pu1_mb_a - i4_ref_strd_left; /* pointer to top left macro block */
1196 
1197             /* locating neighbors that are available for prediction */
1198             /* TODO : update the neighbor availability information basing on constrained intra pred information */
1199             /* TODO : i4_ngbr_avbl is only being used in DC mode. Can the DC mode be split in to distinct routines */
1200             /* basing on neighbors available and hence evade the computation of neighbor availability totally. */
1201 
1202             i4_ngbr_avbl = ps_proc->au1_ngbr_avbl_4x4_subblks[(b8 << 2) + b4];
1203             s_ngbr_avbl.u1_mb_a = (i4_ngbr_avbl & 0x1);
1204             s_ngbr_avbl.u1_mb_d = (i4_ngbr_avbl & 0x2) >> 1;
1205             s_ngbr_avbl.u1_mb_b = (i4_ngbr_avbl & 0x4) >> 2;
1206             s_ngbr_avbl.u1_mb_c = (i4_ngbr_avbl & 0x8) >> 3;
1207             /* set valid intra modes for evaluation */
1208             u4_valid_intra_modes = u2_valid_modes[i4_ngbr_avbl & 0x7];
1209 
1210             /* if top partition is available and top right is not available for intra prediction, then */
1211             /* padd top right samples using top sample and make top right also available */
1212             /* i4_ngbr_avbl = (s_ngbr_avbl.u1_mb_a) + (s_ngbr_avbl.u1_mb_d << 1) + (s_ngbr_avbl.u1_mb_b << 2) + ((s_ngbr_avbl.u1_mb_b | s_ngbr_avbl.u1_mb_c) << 3); */
1213 
1214             /* gather prediction pels from the neighbors */
1215             if (s_ngbr_avbl.u1_mb_a)
1216             {
1217                 for(i = 0; i < 4; i++)
1218                     pu1_ngbr_pels_i4[4 - 1 -i] = pu1_mb_a[i * i4_ref_strd_left];
1219             }
1220             else
1221             {
1222                 memset(pu1_ngbr_pels_i4,0,4);
1223             }
1224             if(s_ngbr_avbl.u1_mb_b)
1225             {
1226                 memcpy(pu1_ngbr_pels_i4 + 4 + 1, pu1_mb_b, 4);
1227             }
1228             else
1229             {
1230                 memset(pu1_ngbr_pels_i4 + 4 + 1, 0, 4);
1231             }
1232             if (s_ngbr_avbl.u1_mb_d)
1233                 pu1_ngbr_pels_i4[4] = *pu1_mb_d;
1234             else
1235                 pu1_ngbr_pels_i4[4] = 0;
1236             if (s_ngbr_avbl.u1_mb_c)
1237             {
1238                 memcpy(pu1_ngbr_pels_i4 + 8 + 1, pu1_mb_c, 4);
1239             }
1240             else if (s_ngbr_avbl.u1_mb_b)
1241             {
1242                 memset(pu1_ngbr_pels_i4 + 8 + 1, pu1_ngbr_pels_i4[8], 4);
1243                 s_ngbr_avbl.u1_mb_c = s_ngbr_avbl.u1_mb_b;
1244             }
1245 
1246             i4_partition_cost_least = INT_MAX;
1247 
1248             /* predict the intra 4x4 mode for the current partition (for evaluating cost) */
1249             if (!s_ngbr_avbl.u1_mb_a || !s_ngbr_avbl.u1_mb_b)
1250             {
1251                 u4_estimated_intra_4x4_mode = DC_I4x4;
1252             }
1253             else
1254             {
1255                 UWORD32 u4_left_intra_4x4_mode = DC_I4x4;
1256                 UWORD32 u4_top_intra_4x4_mode = DC_I4x4;
1257 
1258                 if (u4_pix_x == 0)
1259                 {
1260                     if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I4x4)
1261                     {
1262                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[u1_scan_order[3 + u4_pix_y]];
1263                     }
1264                     else if (ps_proc->s_left_mb_syntax_ele.u2_mb_type == I8x8)
1265                     {
1266                         u4_left_intra_4x4_mode = ps_proc->au1_left_mb_intra_modes[b8 + 1];
1267                     }
1268                 }
1269                 else
1270                 {
1271                     u4_left_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 1]];
1272                 }
1273 
1274                 if (u4_pix_y == 0)
1275                 {
1276                     if (ps_top_mb_syn_ele->u2_mb_type == I4x4)
1277                     {
1278                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[u1_scan_order[12 + (u4_pix_x >> 2)]];
1279                     }
1280                     else if (ps_top_mb_syn_ele->u2_mb_type == I8x8)
1281                     {
1282                         u4_top_intra_4x4_mode = pu1_top_mb_intra_modes[b8 + 2];
1283                     }
1284                 }
1285                 else
1286                 {
1287                     u4_top_intra_4x4_mode = ps_proc->au1_intra_luma_mb_4x4_modes[u1_scan_order[(u4_pix_x >> 2) + u4_pix_y - 4]];
1288                 }
1289 
1290                 u4_estimated_intra_4x4_mode = MIN(u4_left_intra_4x4_mode, u4_top_intra_4x4_mode);
1291             }
1292 
1293             ps_proc->au1_predicted_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_estimated_intra_4x4_mode;
1294 
1295             /*mode evaluation and prediction*/
1296             ps_codec->pf_ih264e_evaluate_intra_4x4_modes(pu1_mb_curr,
1297                                                          pu1_ngbr_pels_i4,
1298                                                          pu1_pred_mb, i4_src_strd,
1299                                                          i4_pred_strd, i4_ngbr_avbl,
1300                                                          &u4_best_intra_4x4_mode,
1301                                                          &i4_partition_cost_least,
1302                                                          u4_valid_intra_modes,
1303                                                          u4_lambda,
1304                                                          u4_estimated_intra_4x4_mode);
1305 
1306 
1307             i4_partition_distortion_least = i4_partition_cost_least - ((u4_estimated_intra_4x4_mode == u4_best_intra_4x4_mode)?u4_cost_one_bit:u4_cost_four_bits);
1308 
1309             DEBUG("%d partition cost, %d intra mode\n", i4_partition_cost_least, u4_best_intra_4x4_mode);
1310 
1311             /* macroblock distortion */
1312             i4_total_distortion += i4_partition_distortion_least;
1313             i4_total_cost += i4_partition_cost_least;
1314 
1315             /* mb partition mode */
1316             ps_proc->au1_intra_luma_mb_4x4_modes[(b8 << 2) + b4] = u4_best_intra_4x4_mode;
1317 
1318 
1319             /********************************************************/
1320             /*  error estimation,                                   */
1321             /*  transform                                           */
1322             /*  quantization                                        */
1323             /********************************************************/
1324             ps_codec->pf_resi_trans_quant_4x4(pu1_mb_curr, pu1_pred_mb,
1325                                               pi2_res_mb, i4_src_strd,
1326                                               i4_pred_strd,
1327                                               /* No op stride, this implies a buff of lenght 1x16 */
1328                                               ps_qp_params->pu2_scale_mat,
1329                                               ps_qp_params->pu2_thres_mat,
1330                                               ps_qp_params->u1_qbits,
1331                                               ps_qp_params->u4_dead_zone,
1332                                               pu1_nnz, &i2_dc_dummy);
1333 
1334             /********************************************************/
1335             /*  ierror estimation,                                  */
1336             /*  itransform                                          */
1337             /*  iquantization                                       */
1338             /********************************************************/
1339             ps_codec->pf_iquant_itrans_recon_4x4(pi2_res_mb, pu1_pred_mb,
1340                                                  pu1_ref_mb_intra_4x4,
1341                                                  i4_pred_strd, i4_pred_strd,
1342                                                  ps_qp_params->pu2_iscale_mat,
1343                                                  ps_qp_params->pu2_weigh_mat,
1344                                                  ps_qp_params->u1_qp_div,
1345                                                  ps_proc->pv_scratch_buff, 0,
1346                                                  NULL);
1347         }
1348     }
1349 
1350     /* update the type of the mb if necessary */
1351     if (i4_total_cost < ps_proc->i4_mb_cost)
1352     {
1353         ps_proc->i4_mb_cost = i4_total_cost;
1354         ps_proc->i4_mb_distortion = i4_total_distortion;
1355         ps_proc->u4_mb_type = I4x4;
1356     }
1357     if (i4_total_cost < ps_proc->i4_mb_intra_cost)
1358     {
1359         ps_proc->i4_mb_intra_cost = i4_total_cost;
1360     }
1361 
1362     return ;
1363 }
1364 
1365 /**
1366 ******************************************************************************
1367 *
1368 * @brief
1369 *  evaluate best chroma intra 8x8 mode (rate distortion opt off)
1370 *
1371 * @par Description
1372 *  This function evaluates all the possible chroma intra 8x8 modes and finds
1373 *  the mode that best represents the macroblock (least distortion) and occupies
1374 *  fewer bits in the bitstream.
1375 *
1376 * @param[in] ps_proc_ctxt
1377 *  pointer to macroblock context (handle)
1378 *
1379 * @remarks
1380 *  For chroma best intra pred mode is calculated based only on SAD
1381 *
1382 * @returns none
1383 *
1384 ******************************************************************************
1385 */
ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t * ps_proc)1386 void ih264e_evaluate_chroma_intra8x8_modes_for_least_cost_rdoptoff(process_ctxt_t *ps_proc)
1387 {
1388     /* Codec Context */
1389     codec_t *ps_codec = ps_proc->ps_codec;
1390 
1391     /* SAD(distortion metric) of an 8x8 block */
1392     WORD32 i4_mb_distortion, i4_chroma_mb_distortion;
1393 
1394     /* intra mode */
1395     UWORD32  u4_best_chroma_intra_8x8_mode = DC_CH_I8x8;
1396 
1397     /* neighbor pels for intra prediction */
1398     UWORD8 *pu1_ngbr_pels_c_i8x8 = ps_proc->au1_ngbr_pels;
1399 
1400     /* pointer to curr macro block */
1401     UWORD8 *pu1_curr_mb = ps_proc->pu1_src_buf_chroma;
1402     UWORD8 *pu1_ref_mb = ps_proc->pu1_rec_buf_chroma;
1403 
1404     /* pointer to prediction macro block */
1405     UWORD8 *pu1_pred_mb = ps_proc->pu1_pred_mb_intra_chroma;
1406     UWORD8 *pu1_pred_mb_plane = ps_proc->pu1_pred_mb_intra_chroma_plane;
1407 
1408     /* strides */
1409     WORD32 i4_src_strd_c = ps_proc->i4_src_chroma_strd;
1410     WORD32 i4_pred_strd = ps_proc->i4_pred_strd;
1411     WORD32 i4_rec_strd_c = ps_proc->i4_rec_strd;
1412 
1413     /* neighbors left, top, top left */
1414     UWORD8 *pu1_mb_a = pu1_ref_mb - 2;
1415     UWORD8 *pu1_mb_b = pu1_ref_mb - i4_rec_strd_c;
1416     UWORD8 *pu1_mb_d = pu1_mb_b - 2;
1417 
1418     /* neighbor availability */
1419     const UWORD8  u1_valid_intra_modes[8] = {1, 3, 1, 3, 5, 7, 5, 15};
1420     WORD32 i4_ngbr_avbl;
1421 
1422     /* valid intra modes map */
1423     UWORD32 u4_valid_intra_modes;
1424     mb_info_t *ps_top_mb_syn_ele = ps_proc->ps_top_row_mb_syntax_ele + ps_proc->i4_mb_x;
1425 
1426     /* temp var */
1427     UWORD8 i;
1428     UWORD32 u4_constrained_intra_pred = ps_proc->ps_codec->s_cfg.u4_constrained_intra_pred;
1429     UWORD8 u1_mb_a, u1_mb_b, u1_mb_d;
1430 
1431     /* locating neighbors that are available for prediction */
1432     /* gather prediction pels from the neighbors */
1433     /* left pels */
1434     u1_mb_a = ((ps_proc->ps_ngbr_avbl->u1_mb_a)
1435                     && (u4_constrained_intra_pred ?  ps_proc->s_left_mb_syntax_ele.u2_is_intra : 1));
1436     if (u1_mb_a)
1437     {
1438         for (i = 0; i < 16; i += 2)
1439         {
1440             pu1_ngbr_pels_c_i8x8[16 - 2 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c];
1441             pu1_ngbr_pels_c_i8x8[16 - 1 - i] = pu1_mb_a[(i / 2) * i4_rec_strd_c + 1];
1442         }
1443     }
1444     else
1445     {
1446         ps_codec->pf_mem_set_mul8(pu1_ngbr_pels_c_i8x8, 0, MB_SIZE);
1447     }
1448 
1449     /* top pels */
1450     u1_mb_b = ((ps_proc->ps_ngbr_avbl->u1_mb_b)
1451                     && (u4_constrained_intra_pred ? ps_top_mb_syn_ele->u2_is_intra : 1));
1452     if (u1_mb_b)
1453     {
1454         ps_codec->pf_mem_cpy_mul8(&pu1_ngbr_pels_c_i8x8[18], pu1_mb_b, 16);
1455     }
1456     else
1457     {
1458         ps_codec->pf_mem_set_mul8((pu1_ngbr_pels_c_i8x8 + 18), 0, MB_SIZE);
1459     }
1460 
1461     /* top left pels */
1462     u1_mb_d = ((ps_proc->ps_ngbr_avbl->u1_mb_d)
1463                     && (u4_constrained_intra_pred ? ps_proc->s_top_left_mb_syntax_ele.u2_is_intra : 1));
1464     if (u1_mb_d)
1465     {
1466         pu1_ngbr_pels_c_i8x8[16] = *pu1_mb_d;
1467         pu1_ngbr_pels_c_i8x8[17] = *(pu1_mb_d + 1);
1468     }
1469     i4_ngbr_avbl = (u1_mb_a) + (u1_mb_b << 2) + (u1_mb_d << 1);
1470     ps_proc->i4_chroma_neighbor_avail_8x8_mb = i4_ngbr_avbl;
1471 
1472     u4_valid_intra_modes = u1_valid_intra_modes[i4_ngbr_avbl];
1473 
1474     if (ps_codec->s_cfg.u4_enc_speed_preset == IVE_FAST ||
1475                     ps_codec->s_cfg.u4_enc_speed_preset == IVE_FASTEST)
1476         u4_valid_intra_modes &= ~(1 << PLANE_CH_I8x8);
1477 
1478     i4_chroma_mb_distortion = INT_MAX;
1479 
1480     /* perform intra mode chroma  8x8 evaluation */
1481     /* intra prediction */
1482     ps_codec->pf_ih264e_evaluate_intra_chroma_modes(pu1_curr_mb,
1483                                                     pu1_ngbr_pels_c_i8x8,
1484                                                     pu1_pred_mb,
1485                                                     i4_src_strd_c,
1486                                                     i4_pred_strd,
1487                                                     i4_ngbr_avbl,
1488                                                     &u4_best_chroma_intra_8x8_mode,
1489                                                     &i4_chroma_mb_distortion,
1490                                                     u4_valid_intra_modes);
1491 
1492     if (u4_valid_intra_modes & 8)/* if Chroma PLANE is valid*/
1493     {
1494         (ps_codec->apf_intra_pred_c)[PLANE_CH_I8x8](pu1_ngbr_pels_c_i8x8, pu1_pred_mb_plane, 0, i4_pred_strd, i4_ngbr_avbl);
1495 
1496         /* evaluate distortion(sad) */
1497         ps_codec->pf_compute_sad_16x8(pu1_curr_mb, pu1_pred_mb_plane, i4_src_strd_c, i4_pred_strd, i4_chroma_mb_distortion, &i4_mb_distortion);
1498 
1499         /* update the least distortion information if necessary */
1500         if(i4_mb_distortion < i4_chroma_mb_distortion)
1501         {
1502             i4_chroma_mb_distortion = i4_mb_distortion;
1503             u4_best_chroma_intra_8x8_mode = PLANE_CH_I8x8;
1504         }
1505     }
1506 
1507     DEBUG("%d partition cost, %d intra mode\n", i4_chroma_mb_distortion, u4_best_chroma_intra_8x8_mode);
1508 
1509     ps_proc->u1_c_i8_mode = u4_best_chroma_intra_8x8_mode;
1510 
1511     return ;
1512 }
1513 
1514 
1515 /**
1516 ******************************************************************************
1517 *
1518 * @brief
1519 *  Evaluate best intra 16x16 mode (among VERT, HORZ and DC) and do the
1520 *  prediction.
1521 *
1522 * @par Description
1523 *  This function evaluates the first three 16x16 modes, computes the SAD of each and,
1524 *  if the least SAD is below *pu4_sadmin, writes that mode's prediction to pu1_dst.
1525 *
1526 * @param[in] pu1_src
1527 *  UWORD8 pointer to the source
1528 *
1529 * @param[in] pu1_ngbr_pels_i16
1530 *  UWORD8 pointer to neighbouring pels
1531 *
1532 * @param[out] pu1_dst
1533 *  UWORD8 pointer to the destination
1534 *
1535 * @param[in] src_strd
1536 *  integer source stride
1537 *
1538 * @param[in] dst_strd
1539 *  integer destination stride
1540 *
1541 * @param[in] u4_n_avblty
1542 *  availability of neighbouring pixels
1543 *
1544 * @param[out] u4_intra_mode
1545 *  Pointer to the variable in which the best mode is returned
1546 *
1547 * @param[in,out] pu4_sadmin
1548 *  Pointer to the least SAD found so far; updated when a smaller SAD is found
1549 *
1550 * @param[in] u4_valid_intra_modes
1551 *  Says what all modes are valid
1552 *
1553 * @returns      none
1554 *
1555 ******************************************************************************
1556 */
void ih264e_evaluate_intra16x16_modes(UWORD8 *pu1_src,
                                      UWORD8 *pu1_ngbr_pels_i16,
                                      UWORD8 *pu1_dst,
                                      UWORD32 src_strd,
                                      UWORD32 dst_strd,
                                      WORD32 u4_n_avblty,
                                      UWORD32 *u4_intra_mode,
                                      WORD32 *pu4_sadmin,
                                      UWORD32 u4_valid_intra_modes)
{
    /*
     * Neighbour buffer as indexed by this routine:
     *   left column  : pu1_ngbr_pels_i16[15 - row], row = 0..15
     *   top row      : pu1_ngbr_pels_i16[17 + col], col = 0..15
     */
    UWORD8 *pu1_top_row = pu1_ngbr_pels_i16 + 17;

    /* neighbour availability extracted from the availability mask */
    UWORD8 u1_left_avbl = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
    UWORD8 u1_top_avbl = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;

    /* running sum of neighbour pels, later turned into the DC predictor */
    WORD32 i4_dc_pred = 0;

    /* per mode SAD; a mode that cannot be evaluated stays at INT_MAX */
    WORD32 i4_cost_vert = INT_MAX;
    WORD32 i4_cost_horz = INT_MAX;
    WORD32 i4_cost_dc = 0;

    WORD32 i4_best_cost;
    UWORD32 u4_best_mode;

    UWORD8 *pu1_row;
    WORD32 row, col;

    /* HORZ : every row is predicted from its left neighbour */
    if (u1_left_avbl)
    {
        i4_cost_horz = 0;
        pu1_row = pu1_src;

        for (row = 0; row < 16; row++)
        {
            UWORD8 u1_left_pel = pu1_ngbr_pels_i16[15 - row];

            i4_dc_pred += u1_left_pel;

            for (col = 0; col < 16; col++)
            {
                i4_cost_horz += ABS(u1_left_pel - pu1_row[col]);
            }

            pu1_row += src_strd;
        }

        /* rounding term contributed by the left column */
        i4_dc_pred += 8;
    }

    /* VERT : every column is predicted from its top neighbour */
    if (u1_top_avbl)
    {
        i4_cost_vert = 0;
        pu1_row = pu1_src;

        for (row = 0; row < 16; row++)
        {
            /* one top pel per iteration is folded into the DC sum */
            i4_dc_pred += pu1_top_row[row];

            for (col = 0; col < 16; col++)
            {
                i4_cost_vert += ABS(pu1_top_row[col] - pu1_row[col]);
            }

            pu1_row += src_strd;
        }

        /* rounding term contributed by the top row */
        i4_dc_pred += 8;
    }

    /* DC : mean of the available neighbours, 128 when none is available */
    i4_dc_pred >>= (3 + u1_left_avbl + u1_top_avbl);

    if (!u1_left_avbl && !u1_top_avbl)
    {
        i4_dc_pred += 128;
    }

    pu1_row = pu1_src;

    for (row = 0; row < 16; row++)
    {
        for (col = 0; col < 16; col++)
        {
            i4_cost_dc += ABS(i4_dc_pred - pu1_row[col]);
        }

        pu1_row += src_strd;
    }

    /* drop the modes the caller has not enabled (bit0 VERT, bit1 HORZ, bit2 DC) */
    if (0 == (u4_valid_intra_modes & 01))
    {
        i4_cost_vert = INT_MAX;
    }

    if (0 == (u4_valid_intra_modes & 02))
    {
        i4_cost_horz = INT_MAX;
    }

    if (0 == (u4_valid_intra_modes & 04))
    {
        i4_cost_dc = INT_MAX;
    }

    /* pick the least SAD; on a tie VERT is preferred over HORZ, HORZ over DC */
    u4_best_mode = VERT_I16x16;
    i4_best_cost = i4_cost_vert;

    if (i4_cost_horz < i4_best_cost)
    {
        u4_best_mode = HORZ_I16x16;
        i4_best_cost = i4_cost_horz;
    }

    if (i4_cost_dc < i4_best_cost)
    {
        u4_best_mode = DC_I16x16;
        i4_best_cost = i4_cost_dc;
    }

    /* nothing is reported or predicted unless the caller's SAD is beaten */
    if (i4_best_cost >= *pu4_sadmin)
    {
        return;
    }

    *pu4_sadmin = i4_best_cost;
    *u4_intra_mode = u4_best_mode;

    /* generate the prediction of the winning mode, one row at a time */
    for (row = 0; row < 16; row++)
    {
        if (VERT_I16x16 == u4_best_mode)
        {
            memcpy(pu1_dst, pu1_top_row, MB_SIZE);
        }
        else if (HORZ_I16x16 == u4_best_mode)
        {
            memset(pu1_dst, pu1_ngbr_pels_i16[15 - row], MB_SIZE);
        }
        else
        {
            memset(pu1_dst, i4_dc_pred, MB_SIZE);
        }

        pu1_dst += dst_strd;
    }
}
1685 
1686 /**
1687 ******************************************************************************
1688 *
1689 * @brief
1690 *  Evaluate best intra 4x4 mode and perform prediction.
1691 *
1692 * @par Description
1693 *  This function evaluates the valid 4x4 modes, computes the cost of each (SAD plus a
1694 *  lambda-weighted mode penalty) and writes the prediction of the least-cost mode to pu1_dst.
1695 *
1696 * @param[in] pu1_src
1697 *  UWORD8 pointer to the source
1698 *
1699 * @param[in] pu1_ngbr_pels
1700 *  UWORD8 pointer to neighbouring pels
1701 *
1702 * @param[out] pu1_dst
1703 *  UWORD8 pointer to the destination
1704 *
1705 * @param[in] src_strd
1706 *  integer source stride
1707 *
1708 * @param[in] dst_strd
1709 *  integer destination stride
1710 *
1711 * @param[in] u4_n_avblty
1712 *  availability of neighbouring pixels
1713 *
1714 * @param[out] u4_intra_mode
1715 *  Pointer to the variable in which the best mode is returned
1716 *
1717 * @param[out] pu4_sadmin
1718 *  Pointer to the variable in which the minimum cost is returned
1719 *
1720 * @param[in] u4_valid_intra_modes
1721 *  Says what all modes are valid
1722 *
1723 * @param[in] u4_lambda
1724 *  Lambda value for computing cost from SAD
1725 *
1726 * @param[in] u4_predictd_mode
1727 *  Predicted mode for cost computation
1728 *
1729 * @returns      none
1730 *
1731 ******************************************************************************
1732 */
void ih264e_evaluate_intra_4x4_modes(UWORD8 *pu1_src,
                                     UWORD8 *pu1_ngbr_pels,
                                     UWORD8 *pu1_dst,
                                     UWORD32 src_strd,
                                     UWORD32 dst_strd,
                                     WORD32 u4_n_avblty,
                                     UWORD32 *u4_intra_mode,
                                     WORD32 *pu4_sadmin,
                                     UWORD32 u4_valid_intra_modes,
                                     UWORD32  u4_lambda,
                                     UWORD32 u4_predictd_mode)
{
    /*
     * Neighbour buffer as indexed by this routine:
     *   [0..3]   left pels, row 3 first (row r uses pu1_ngbr_pels[3 - r])
     *   [5..8]   top pels
     *   [13..14] overwritten with [12] before filtering
     * Presumably [4] is the top-left pel and [9..12] the top-right pels -
     * TODO confirm against the caller that fills the buffer.
     *
     * NOTE(review): the mode enumerators are used as table indices and as bit
     * positions of u4_valid_intra_modes; this presumes VERT_I4x4 .. HORZ_U_I4x4
     * are numbered 0 .. 8 in that order and MAX_I4x4 is 9 - confirm against
     * the enum definition.
     */

    /* neighbour availability extracted from the availability mask */
    UWORD8 u1_left_avbl = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
    UWORD8 u1_top_avbl = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;

    /* FILT121 / FILT11 filtered neighbours, entry k is built from pels k, k+1 (, k+2) */
    UWORD8 au1_filt121[15] = {0};
    UWORD8 au1_filt11[15] = {0};

    /* scratch rows for the modes whose predictor is not a plain slice of the above */
    UWORD8 au1_horz_rows[4][4] = {{0}};
    UWORD8 au1_dc_row[4] = {0};
    UWORD8 au1_vert_r[8] = {0};
    UWORD8 au1_horz_d[10] = {0};
    UWORD8 au1_horz_u[10] = {0};

    /* for every mode, the start of the 4 predicted pels of each of the 4 rows */
    UWORD8 *apu1_rows[MAX_I4x4][4];

    WORD32 ai4_cost[MAX_I4x4];
    WORD32 i4_dc_pred = 0;
    WORD32 i4_best_mode;
    WORD32 i4_mode, i4_row, k;

    /* ---- where each mode reads its predicted rows from ---- */
    for (i4_row = 0; i4_row < 4; i4_row++)
    {
        apu1_rows[VERT_I4x4][i4_row] = pu1_ngbr_pels + 5;
        apu1_rows[HORZ_I4x4][i4_row] = au1_horz_rows[i4_row];
        apu1_rows[DC_I4x4][i4_row] = au1_dc_row;
        apu1_rows[DIAG_DL_I4x4][i4_row] = au1_filt121 + 5 + i4_row;
        apu1_rows[DIAG_DR_I4x4][i4_row] = au1_filt121 + 3 - i4_row;
        apu1_rows[HORZ_D_I4x4][i4_row] = au1_horz_d + 6 - 2 * i4_row;
        apu1_rows[HORZ_U_I4x4][i4_row] = au1_horz_u + 2 * i4_row;
    }

    apu1_rows[VERT_R_I4x4][0] = au1_filt11 + 4;
    apu1_rows[VERT_R_I4x4][1] = au1_filt121 + 3;
    apu1_rows[VERT_R_I4x4][2] = au1_vert_r;
    apu1_rows[VERT_R_I4x4][3] = au1_vert_r + 4;

    apu1_rows[VERT_L_I4x4][0] = au1_filt11 + 5;
    apu1_rows[VERT_L_I4x4][1] = au1_filt121 + 5;
    apu1_rows[VERT_L_I4x4][2] = au1_filt11 + 6;
    apu1_rows[VERT_L_I4x4][3] = au1_filt121 + 6;

    /* ---- build the predictor data of the enabled modes ---- */

    /* HORZ : row r is its left neighbour replicated */
    if (u4_valid_intra_modes & 2)
    {
        for (i4_row = 0; i4_row < 4; i4_row++)
        {
            memset(au1_horz_rows[i4_row], pu1_ngbr_pels[3 - i4_row], 4);
        }
    }

    /* DC : rounded mean of the available neighbours, 128 when none is available */
    if (u4_valid_intra_modes & 4)
    {
        if (u1_left_avbl)
        {
            i4_dc_pred = pu1_ngbr_pels[0] + pu1_ngbr_pels[1] + pu1_ngbr_pels[2]
                            + pu1_ngbr_pels[3] + 2;
        }

        if (u1_top_avbl)
        {
            i4_dc_pred += pu1_ngbr_pels[5] + pu1_ngbr_pels[6] + pu1_ngbr_pels[7]
                            + pu1_ngbr_pels[8] + 2;
        }

        /* the sum is non zero whenever a neighbour is available (rounding term) */
        i4_dc_pred = (i4_dc_pred) ? (i4_dc_pred >> (1 + u1_left_avbl + u1_top_avbl)) : 128;

        memset(au1_dc_row, i4_dc_pred, 4);
    }

    /* directional modes share the filtered neighbour arrays */
    if (u4_valid_intra_modes > 7)
    {
        /* pad the neighbour buffer so that the filters can read up to pel 14 */
        pu1_ngbr_pels[13] = pu1_ngbr_pels[14] = pu1_ngbr_pels[12];

        for (k = 0; k < 13; k++)
        {
            au1_filt121[k] = FILT121(pu1_ngbr_pels[k], pu1_ngbr_pels[k + 1], pu1_ngbr_pels[k + 2]);
            au1_filt11[k] = FILT11(pu1_ngbr_pels[k], pu1_ngbr_pels[k + 1]);
        }

        /* VERT_R : rows 2 and 3 are rows 0 and 1 shifted right by one pel */
        if (u4_valid_intra_modes & 32)
        {
            au1_vert_r[0] = au1_filt121[2];
            memcpy((au1_vert_r + 1), (au1_filt11 + 4), 3);
            au1_vert_r[4] = au1_filt121[1];
            memcpy((au1_vert_r + 5), (au1_filt121 + 3), 3);
        }

        /* HORZ_D : interleaved FILT11 / FILT121 values, each row starts 2 entries earlier */
        if (u4_valid_intra_modes & 64)
        {
            for (k = 0; k < 3; k++)
            {
                au1_horz_d[2 * k] = au1_filt11[k];
                au1_horz_d[2 * k + 1] = au1_filt121[k];
            }
            au1_horz_d[6] = au1_filt11[3];
            memcpy((au1_horz_d + 7), (au1_filt121 + 3), 3);
        }

        /* HORZ_U : interleaved values in reverse order, tail replicated from pel 0 */
        if (u4_valid_intra_modes & 256)
        {
            au1_horz_u[0] = au1_filt11[2];
            au1_horz_u[1] = au1_filt121[1];
            au1_horz_u[2] = au1_filt11[1];
            au1_horz_u[3] = au1_filt121[0];
            au1_horz_u[4] = au1_filt11[0];
            au1_horz_u[5] = FILT121(pu1_ngbr_pels[0], pu1_ngbr_pels[0], pu1_ngbr_pels[1]);
            memset((au1_horz_u + 6), pu1_ngbr_pels[0], 4);
        }
    }

    /* ---- cost of every enabled mode : SAD + lambda weighted mode penalty ---- */
    for (i4_mode = 0; i4_mode < MAX_I4x4; i4_mode++)
    {
        WORD32 i4_sad = 0;

        /* a mode that is not enabled can never be selected over an enabled one */
        ai4_cost[i4_mode] = INT_MAX;

        if (0 == (u4_valid_intra_modes & ((UWORD32)1 << i4_mode)))
        {
            continue;
        }

        for (i4_row = 0; i4_row < 4; i4_row++)
        {
            UWORD8 *pu1_src_row = pu1_src + i4_row * src_strd;
            UWORD8 *pu1_pred_row = apu1_rows[i4_mode][i4_row];

            USADA8(pu1_src_row, pu1_pred_row, i4_sad);
        }

        /* signalling the predicted mode is cheaper than signalling any other mode */
        ai4_cost[i4_mode] = i4_sad + ((u4_predictd_mode == (UWORD32)i4_mode) ?
                                        u4_lambda : 4 * u4_lambda);
    }

    /* ---- least cost mode; the lowest numbered mode wins a tie ---- */
    i4_best_mode = 0;

    for (i4_mode = 1; i4_mode < MAX_I4x4; i4_mode++)
    {
        if (ai4_cost[i4_mode] < ai4_cost[i4_best_mode])
        {
            i4_best_mode = i4_mode;
        }
    }

    *pu4_sadmin = ai4_cost[i4_best_mode];
    *u4_intra_mode = i4_best_mode;

    /* ---- write the prediction of the selected mode ---- */
    for (i4_row = 0; i4_row < 4; i4_row++)
    {
        memcpy(pu1_dst + i4_row * dst_strd, apu1_rows[i4_best_mode][i4_row], 4);
    }
}
2144 
2145 /**
2146 ******************************************************************************
2147 *
2148 * @brief:
2149 *  Evaluate best intra chroma mode (among VERT, HORZ and DC) and do the prediction.
2150 *
2151 * @par Description
2152 *  This function evaluates the first three intra chroma modes, computes the SAD of each
2153 *  and returns the buffer predicted with the best mode.
2154 *
2155 * @param[in] pu1_src
2156 *  UWORD8 pointer to the source
2157 *
2158 * @param[in] pu1_ngbr_pels
2159 *  UWORD8 pointer to neighbouring pels
2160 *
2161 * @param[out] pu1_dst
2162 *  UWORD8 pointer to the destination
2163 *
2164 * @param[in] src_strd
2165 *  integer source stride
2166 *
2167 * @param[in] dst_strd
2168 *  integer destination stride
2169 *
2170 * @param[in] u4_n_avblty
2171 *  availability of neighbouring pixels
2172 *
2173 * @param[in] u4_intra_mode
2174 *  Pointer to the variable in which best mode is returned
2175 *
2176 * @param[in] pu4_sadmin
2177 *  Pointer to the variable in which minimum sad is returned
2178 *
2179 * @param[in] u4_valid_intra_modes
2180 *  Says what all modes are valid
2181 *
2182 * @return      none
2183 *
2184 ******************************************************************************
2185 */
ih264e_evaluate_intra_chroma_modes(UWORD8 * pu1_src,UWORD8 * pu1_ngbr_pels,UWORD8 * pu1_dst,UWORD32 src_strd,UWORD32 dst_strd,WORD32 u4_n_avblty,UWORD32 * u4_intra_mode,WORD32 * pu4_sadmin,UWORD32 u4_valid_intra_modes)2186 void ih264e_evaluate_intra_chroma_modes(UWORD8 *pu1_src,
2187                                         UWORD8 *pu1_ngbr_pels,
2188                                         UWORD8 *pu1_dst,
2189                                         UWORD32 src_strd,
2190                                         UWORD32 dst_strd,
2191                                         WORD32 u4_n_avblty,
2192                                         UWORD32 *u4_intra_mode,
2193                                         WORD32 *pu4_sadmin,
2194                                         UWORD32 u4_valid_intra_modes)
2195 {
2196     UWORD8 *pu1_neighbour;
2197     UWORD8 *pu1_src_temp = pu1_src;
2198     UWORD8 left = 0, top = 0;
2199     WORD32 u4_dcval_u_l[2] = { 0, 0 }, /*sum left neighbours for 'U' ,two separate sets - sum of first four from top,and sum of four values from bottom */
2200            u4_dcval_u_t[2] = { 0, 0 };  /*sum top neighbours for 'U'*/
2201 
2202     WORD32 u4_dcval_v_l[2] = { 0, 0 }, /*sum left neighbours for 'V'*/
2203            u4_dcval_v_t[2] = { 0, 0 }; /*sum top neighbours for 'V'*/
2204 
2205     WORD32 i, j, row, col, i4_sad_vert = INT_MAX, i4_sad_horz = INT_MAX,
2206                     i4_sad_dc = INT_MAX, i4_min_sad = INT_MAX;
2207     UWORD8 val_u, val_v;
2208 
2209     WORD32 u4_dc_val[2][2][2];/*  -----------
2210                                   |    |    |  Chroma can have four
2211                                   | 00 | 01 |  separate dc value...
2212                                   -----------  u4_dc_val corresponds to this dc values
2213                                   |    |    |  with u4_dc_val[2][2][U] and u4_dc_val[2][2][V]
2214                                   | 10 | 11 |
2215                                   -----------                */
2216     left = (u4_n_avblty & LEFT_MB_AVAILABLE_MASK);
2217     top = (u4_n_avblty & TOP_MB_AVAILABLE_MASK) >> 2;
2218 
2219     /*Evaluating HORZ*/
2220     if (left)/* Ifleft available*/
2221     {
2222         i4_sad_horz = 0;
2223 
2224         for (i = 0; i < 8; i++)
2225         {
2226             val_v = pu1_ngbr_pels[15 - 2 * i];
2227             val_u = pu1_ngbr_pels[15 - 2 * i - 1];
2228             row = i / 4;
2229             u4_dcval_u_l[row] += val_u;
2230             u4_dcval_v_l[row] += val_v;
2231             for (j = 0; j < 8; j++)
2232             {
2233                 i4_sad_horz += ABS(val_u - pu1_src_temp[2 * j]);/* Finding SAD for HORZ mode*/
2234                 i4_sad_horz += ABS(val_v - pu1_src_temp[2 * j + 1]);
2235             }
2236 
2237             pu1_src_temp += src_strd;
2238         }
2239         u4_dcval_u_l[0] += 2;
2240         u4_dcval_u_l[1] += 2;
2241         u4_dcval_v_l[0] += 2;
2242         u4_dcval_v_l[1] += 2;
2243     }
2244 
2245     /*Evaluating VERT**/
2246     pu1_src_temp = pu1_src;
2247     if (top) /* top available*/
2248     {
2249         i4_sad_vert = 0;
2250 
2251         for (i = 0; i < 8; i++)
2252         {
2253             col = i / 4;
2254 
2255             val_u = pu1_ngbr_pels[18 + i * 2];
2256             val_v = pu1_ngbr_pels[18 + i * 2 + 1];
2257             u4_dcval_u_t[col] += val_u;
2258             u4_dcval_v_t[col] += val_v;
2259 
2260             for (j = 0; j < 16; j++)
2261             {
2262                 i4_sad_vert += ABS(pu1_ngbr_pels[18 + j] - pu1_src_temp[j]);/* Finding SAD for VERT mode*/
2263             }
2264             pu1_src_temp += src_strd;
2265 
2266         }
2267         u4_dcval_u_t[0] += 2;
2268         u4_dcval_u_t[1] += 2;
2269         u4_dcval_v_t[0] += 2;
2270         u4_dcval_v_t[1] += 2;
2271     }
2272 
2273     /* computing DC value*/
2274     /* Equation  8-128 in spec*/
2275     u4_dc_val[0][0][0] = (u4_dcval_u_l[0] + u4_dcval_u_t[0]) >> (1 + left + top);
2276     u4_dc_val[0][0][1] = (u4_dcval_v_l[0] + u4_dcval_v_t[0]) >> (1 + left + top);
2277     u4_dc_val[1][1][0] = (u4_dcval_u_l[1] + u4_dcval_u_t[1]) >> (1 + left + top);
2278     u4_dc_val[1][1][1] = (u4_dcval_v_l[1] + u4_dcval_v_t[1]) >> (1 + left + top);
2279 
2280     if (top)
2281     {
2282         /* Equation  8-132 in spec*/
2283         u4_dc_val[0][1][0] = (u4_dcval_u_t[1]) >> (1 + top);
2284         u4_dc_val[0][1][1] = (u4_dcval_v_t[1]) >> (1 + top);
2285     }
2286     else
2287     {
2288         u4_dc_val[0][1][0] = (u4_dcval_u_l[0]) >> (1 + left);
2289         u4_dc_val[0][1][1] = (u4_dcval_v_l[0]) >> (1 + left);
2290     }
2291 
2292     if (left)
2293     {
2294         u4_dc_val[1][0][0] = (u4_dcval_u_l[1]) >> (1 + left);
2295         u4_dc_val[1][0][1] = (u4_dcval_v_l[1]) >> (1 + left);
2296     }
2297     else
2298     {
2299         u4_dc_val[1][0][0] = (u4_dcval_u_t[0]) >> (1 + top);
2300         u4_dc_val[1][0][1] = (u4_dcval_v_t[0]) >> (1 + top);
2301     }
2302 
2303     if (!(left || top))
2304     {
2305         /*none available*/
2306         u4_dc_val[0][0][0] = u4_dc_val[0][0][1] =
2307         u4_dc_val[0][1][0] = u4_dc_val[0][1][1] =
2308         u4_dc_val[1][0][0] = u4_dc_val[1][0][1] =
2309         u4_dc_val[1][1][0] = u4_dc_val[1][1][1] = 128;
2310     }
2311 
2312     /* Evaluating DC */
2313     pu1_src_temp = pu1_src;
2314     i4_sad_dc = 0;
2315     for (i = 0; i < 8; i++)
2316     {
2317         for (j = 0; j < 8; j++)
2318         {
2319             col = j / 4;
2320             row = i / 4;
2321             val_u = u4_dc_val[row][col][0];
2322             val_v = u4_dc_val[row][col][1];
2323 
2324             i4_sad_dc += ABS(val_u - pu1_src_temp[2 * j]);/* Finding SAD for DC mode*/
2325             i4_sad_dc += ABS(val_v - pu1_src_temp[2 * j + 1]);
2326         }
2327         pu1_src_temp += src_strd;
2328     }
2329 
2330     if ((u4_valid_intra_modes & 01) == 0)/* If DC is disabled*/
2331         i4_sad_dc = INT_MAX;
2332     if ((u4_valid_intra_modes & 02) == 0)/* If HORZ is disabled*/
2333         i4_sad_horz = INT_MAX;
2334     if ((u4_valid_intra_modes & 04) == 0)/* If VERT is disabled*/
2335         i4_sad_vert = INT_MAX;
2336 
2337     i4_min_sad = MIN3(i4_sad_horz, i4_sad_dc, i4_sad_vert);
2338 
2339     /* Finding Minimum sad and doing corresponding prediction*/
2340     if (i4_min_sad < *pu4_sadmin)
2341     {
2342         *pu4_sadmin = i4_min_sad;
2343 
2344         if (i4_min_sad == i4_sad_dc)
2345         {
2346             *u4_intra_mode = DC_CH_I8x8;
2347             for (i = 0; i < 8; i++)
2348             {
2349                 for (j = 0; j < 8; j++)
2350                 {
2351                     col = j / 4;
2352                     row = i / 4;
2353 
2354                     pu1_dst[2 * j] = u4_dc_val[row][col][0];
2355                     pu1_dst[2 * j + 1] = u4_dc_val[row][col][1];
2356                 }
2357                 pu1_dst += dst_strd;
2358             }
2359         }
2360         else if (i4_min_sad == i4_sad_horz)
2361         {
2362             *u4_intra_mode = HORZ_CH_I8x8;
2363             for (j = 0; j < 8; j++)
2364             {
2365                 val_v = pu1_ngbr_pels[15 - 2 * j];
2366                 val_u = pu1_ngbr_pels[15 - 2 * j - 1];
2367 
2368                 for (i = 0; i < 8; i++)
2369                 {
2370                     pu1_dst[2 * i] = val_u;
2371                     pu1_dst[2 * i + 1] = val_v;
2372 
2373                 }
2374                 pu1_dst += dst_strd;
2375             }
2376         }
2377         else
2378         {
2379             *u4_intra_mode = VERT_CH_I8x8;
2380             pu1_neighbour = pu1_ngbr_pels + 18;
2381             for (j = 0; j < 8; j++)
2382             {
2383                 memcpy(pu1_dst, pu1_neighbour, MB_SIZE);
2384                 pu1_dst += dst_strd;
2385             }
2386         }
2387     }
2388 
2389     return;
2390 }
2391