xref: /aosp_15_r20/external/libavc/common/ih264_resi_trans_quant.c (revision 495ae853bb871d1e5a258cb02c2cc13cde8ddb9a)
1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /**
22 *******************************************************************************
23 * @file
24 *  ih264_resi_trans_quant.c
25 *
26 * @brief
27 *  Contains function definitions for single stage forward transform for H.264.
28 *  The residue is calculated, the cf is applied and the result is quantized.
29 *
30 * @author
31 *  ittiam
32 *
33 * @par List of Functions:
34 *  - ih264_resi_trans_quant_4x4
35 *  - ih264_resi_trans_quant_chroma_4x4
36 *  - ih264_hadamard_quant_4x4
37 *  - ih264_hadamard_quant_2x2_uv
38 *  - ih264_resi_trans_quant_8x8
39 *
40 * @remarks
41 *  none
42 *
43 *******************************************************************************
44 */
45 
46 
47 /*****************************************************************************/
48 /* File Includes                                                             */
49 /*****************************************************************************/
50 
51 /* System Include Files */
52 #include <stddef.h>
53 
54 /* User Include Files */
55 #include "ih264_typedefs.h"
56 #include "ih264_defs.h"
57 #include "ih264_macros.h"
58 #include "ih264_size_defs.h"
59 #include "ih264_trans_macros.h"
60 #include "ih264_trans_data.h"
61 #include "ih264_structs.h"
62 #include "ih264_trans_quant_itrans_iquant.h"
63 
64 
65 /*****************************************************************************/
66 /* Function Definitions                                                      */
67 /*****************************************************************************/
68 
69 /**
70 *******************************************************************************
71 *
72 * @brief
73 *  This function performs forward transform and quantization on a 4x4 block
74 *
75 * @par Description:
76 *  The function accepts source buffer and estimation buffer. From these, it
77 *  computes the residue. This is residue is then transformed and quantized.
78 *  The transform and quantization are in placed computed. They use the residue
79 *  buffer for this.
80 *
81 * @param[in] pu1_src
82 *  Pointer to source sub-block
83 *
84 * @param[in] pu1_pred
85 *  Pointer to prediction sub-block
86 *
87 * @param[in] pi2_out
88 *  Pointer to residual sub-block
89 *
90 * @param[in] src_strd
91 *  Source stride
92 *
93 * @param[in] pred_strd
94 *  Prediction stride
95 *
96 * @param[in] pu2_scale_matrix
97 *  Pointer to Forward Quant Scale Matrix
98 *
99 * @param[in] pu2_threshold_matrix
100 *  Pointer to Forward Quant Threshold Matrix
101 *
102 * @param[in] u4_qbits
103 *  QP_BITS_h264_4x4 + floor(QP/6)
104 *
105 * @param[in] u4_round_factor
106 *  Quantization Round factor
107 *
108 * @param[out] pu1_nnz
109 *  Total non-zero coefficients in the current sub-block
110 *
111 * @param[in] pi2_alt_dc_addr
112 *  DC Coefficient of the block
113 *
114 * @remarks none
115 *
116 *******************************************************************************
117 */
ih264_resi_trans_quant_4x4(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD16 * pi2_out,WORD32 src_strd,WORD32 pred_strd,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz,WORD16 * pi2_alt_dc_addr)118 void ih264_resi_trans_quant_4x4(UWORD8 *pu1_src,
119                                 UWORD8 *pu1_pred,
120                                 WORD16 *pi2_out,
121                                 WORD32 src_strd,
122                                 WORD32 pred_strd,
123                                 const UWORD16 *pu2_scale_matrix,
124                                 const UWORD16 *pu2_threshold_matrix,
125                                 UWORD32 u4_qbits,
126                                 UWORD32 u4_round_factor,
127                                 UWORD8 *pu1_nnz,
128                                 WORD16 *pi2_alt_dc_addr)
129 {
130     UWORD32 i;
131     WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
132     WORD32 i4_value;
133     WORD16 *pi2_out_tmp = pi2_out;
134     UWORD32 u4_nonzero_coeff = 0;
135 
136     for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
137     {
138         /* computing prediction error (residue) */
139         x4 = pu1_src[0] - pu1_pred[0];
140         x5 = pu1_src[1] - pu1_pred[1];
141         x6 = pu1_src[2] - pu1_pred[2];
142         x7 = pu1_src[3] - pu1_pred[3];
143 
144         /* Horizontal transform */
145         x0 = x4 + x7;
146         x1 = x5 + x6;
147         x2 = x5 - x6;
148         x3 = x4 - x7;
149 
150         pi2_out_tmp[0] = x0 + x1;
151         pi2_out_tmp[1] = (x3 << 1) + x2;
152         pi2_out_tmp[2] = x0 - x1;
153         pi2_out_tmp[3] = x3 - (x2 << 1);
154 
155         /* pointing to next row; */
156         pu1_src += src_strd;
157         pu1_pred += pred_strd;
158         pi2_out_tmp += 4;
159     }
160 
161     pi2_out_tmp = pi2_out;
162     for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
163     {
164         /* Vertical transform and quantization */
165         x4 = pi2_out_tmp[0];
166         x5 = pi2_out_tmp[4];
167         x6 = pi2_out_tmp[8];
168         x7 = pi2_out_tmp[12];
169 
170         x0 = x4 + x7;
171         x1 = x5 + x6;
172         x2 = x5 - x6;
173         x3 = x4 - x7;
174 
175         /* quantization is done in place */
176         i4_value = x0 + x1;
177         if(i == 0)
178         {
179             (*pi2_alt_dc_addr) = i4_value;
180         }
181         FWD_QUANT(i4_value, pu2_threshold_matrix[0],
182                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
183                   u4_nonzero_coeff);
184         pi2_out_tmp[0] = i4_value;
185 
186         i4_value = (x3 << 1) + x2;
187         FWD_QUANT(i4_value, pu2_threshold_matrix[4],
188                   pu2_scale_matrix[4], u4_round_factor, u4_qbits,
189                   u4_nonzero_coeff);
190         pi2_out_tmp[4] = i4_value;
191 
192         i4_value = x0 - x1;
193         FWD_QUANT(i4_value, pu2_threshold_matrix[8],
194                   pu2_scale_matrix[8], u4_round_factor, u4_qbits,
195                   u4_nonzero_coeff);
196         pi2_out_tmp[8] = i4_value;
197 
198         i4_value = x3 - (x2 << 1);
199         FWD_QUANT(i4_value, pu2_threshold_matrix[12],
200                   pu2_scale_matrix[12], u4_round_factor, u4_qbits,
201                   u4_nonzero_coeff);
202         pi2_out_tmp[12] = i4_value;
203 
204         pi2_out_tmp++;
205         pu2_scale_matrix++;
206         pu2_threshold_matrix++;
207     }
208 
209     /* Return total nonzero coefficients in the current sub block */
210     *pu1_nnz =  u4_nonzero_coeff;
211 }
212 
213 /**
214 *******************************************************************************
215 *
216 * @brief
217 *  This function performs forward transform and quantization on a 4x4
218 *  chroma block with interleaved values
219 *
220 * @par Description:
221 *  The function accepts source buffer and estimation buffer. From these, it
222 *  computes the residue. This is residue is then transformed and quantized.
223 *  The transform and quantization are in placed computed. They use the residue
224 *  buffer for this.
225 *
226 * @param[in] pu1_src
227 *  Pointer to source sub-block
228 *
229 * @param[in] pu1_pred
230 *  Pointer to prediction sub-block
231 *
232 * @param[in] pi2_out
233 *  Pointer to residual sub-block
234 *
235 * @param[in] src_strd
236 *  Source stride
237 *
238 * @param[in] pred_strd
239 *  Prediction stride
240 *
241 * @param[in] pu2_scale_matrix
242 *  Pointer to Forward Quant Scale Matrix
243 *
244 * @param[in] pu2_threshold_matrix
245 *  Pointer to Forward Quant Threshold Matrix
246 *
247 * @param[in] u4_qbits
248 *  QP_BITS_h264_4x4 + floor(QP/6)
249 *
250 * @param[in] u4_round_factor
251 *  Quantization Round factor
252 *
253 * @param[out] pu1_nnz
254 *  Total non-zero coefficients in the current sub-block
255 *
256 * @param[in] pi2_alt_dc_addr
257 *  DC Coefficient of the block
258 *
259 * @remarks none
260 *
261 *******************************************************************************
262 */
ih264_resi_trans_quant_chroma_4x4(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD16 * pi2_out,WORD32 src_strd,WORD32 pred_strd,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz,WORD16 * pu1_dc_alt_addr)263 void ih264_resi_trans_quant_chroma_4x4(UWORD8 *pu1_src,
264                                        UWORD8 *pu1_pred,
265                                        WORD16 *pi2_out,
266                                        WORD32 src_strd,
267                                        WORD32 pred_strd,
268                                        const UWORD16 *pu2_scale_matrix,
269                                        const UWORD16 *pu2_threshold_matrix,
270                                        UWORD32 u4_qbits,
271                                        UWORD32 u4_round_factor,
272                                        UWORD8 *pu1_nnz,
273                                        WORD16 *pu1_dc_alt_addr)
274 {
275     UWORD32 i;
276     WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
277     WORD32 i4_value;
278     WORD16 *pi2_out_tmp = pi2_out;
279     UWORD32 u4_nonzero_coeff = 0;
280 
281     for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
282     {
283         /* computing prediction error (residue) */
284         x4 = pu1_src[0] - pu1_pred[0];
285         x5 = pu1_src[2] - pu1_pred[2];
286         x6 = pu1_src[4] - pu1_pred[4];
287         x7 = pu1_src[6] - pu1_pred[6];
288 
289         /* Horizontal transform */
290         x0 = x4 + x7;
291         x1 = x5 + x6;
292         x2 = x5 - x6;
293         x3 = x4 - x7;
294 
295         pi2_out_tmp[0] = x0 + x1;
296         pi2_out_tmp[1] = (x3 << 1) + x2;
297         pi2_out_tmp[2] = x0 - x1;
298         pi2_out_tmp[3] = x3 - (x2 << 1);
299 
300         /* pointing to next row; */
301         pu1_src += src_strd;
302         pu1_pred += pred_strd;
303         pi2_out_tmp += 4;
304     }
305 
306     pi2_out_tmp = pi2_out;
307     for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
308     {
309         /* Vertical transform and quantization */
310         x4 = pi2_out_tmp[0];
311         x5 = pi2_out_tmp[4];
312         x6 = pi2_out_tmp[8];
313         x7 = pi2_out_tmp[12];
314 
315         x0 = x4 + x7;
316         x1 = x5 + x6;
317         x2 = x5 - x6;
318         x3 = x4 - x7;
319 
320         /* quantization is done in place */
321         i4_value = x0 + x1;
322         if(i == 0)
323         {
324             *pu1_dc_alt_addr = i4_value;
325         }
326         FWD_QUANT(i4_value, pu2_threshold_matrix[0],
327                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
328                   u4_nonzero_coeff);
329         pi2_out_tmp[0] = i4_value;
330 
331         i4_value = (x3 << 1) + x2;
332         FWD_QUANT(i4_value, pu2_threshold_matrix[4],
333                   pu2_scale_matrix[4], u4_round_factor, u4_qbits,
334                   u4_nonzero_coeff);
335         pi2_out_tmp[4] = i4_value;
336 
337         i4_value = x0 - x1;
338         FWD_QUANT(i4_value, pu2_threshold_matrix[8],
339                   pu2_scale_matrix[8], u4_round_factor, u4_qbits,
340                   u4_nonzero_coeff);
341         pi2_out_tmp[8] = i4_value;
342 
343         i4_value = x3 - (x2 << 1);
344         FWD_QUANT(i4_value, pu2_threshold_matrix[12],
345                   pu2_scale_matrix[12], u4_round_factor, u4_qbits,
346                   u4_nonzero_coeff);
347         pi2_out_tmp[12] = i4_value;
348 
349         pi2_out_tmp++;
350         pu2_scale_matrix++;
351         pu2_threshold_matrix++;
352     }
353 
354     /* Return total nonzero coefficients in the current sub block */
355     *pu1_nnz =  u4_nonzero_coeff;
356 }
357 
358 /**
359 *******************************************************************************
360 *
361 * @brief
362 *  This function performs forward hadamard transform and quantization on a
363 *  4x4 block
364 *
365 * @par Description:
366 *  The function accepts source buffer and estimation buffer. From these, it
367 *  computes the residue. This is residue is then transformed and quantized.
368 *  The transform and quantization are in placed computed. They use the residue
369 *  buffer for this.
370 *
371 * @param[in] pu1_src
372 *  Pointer to source sub-block
373 *
374 * @param[in] pi2_dst
375 *  Pointer to destination sub-block
376 *
377 * @param[in] pu2_threshold_matrix
378 *  Pointer to Forward Quant Threshold Matrix
379 *
380 * @param[in] pu2_scale_matrix
381 *  Pointer to Forward Quant Scale Matrix
382 *
383 * @param[in] u4_qbits
384 *  QP_BITS_h264_4x4 + floor(QP/6)
385 *
386 * @param[in] u4_round_factor
387 *  Quantization Round factor
388 *
389 * @param[out] pu1_nnz
390 *  Total non-zero coefficients in the current sub-block
391 *
392 * @remarks none
393 *
394 ********************************************************************************
395 */
ih264_hadamard_quant_4x4(WORD16 * pi2_src,WORD16 * pi2_dst,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz)396 void ih264_hadamard_quant_4x4(WORD16 *pi2_src,
397                               WORD16 *pi2_dst,
398                               const UWORD16 *pu2_scale_matrix,
399                               const UWORD16 *pu2_threshold_matrix,
400                               UWORD32 u4_qbits,
401                               UWORD32 u4_round_factor,
402                               UWORD8 *pu1_nnz)
403 {
404     WORD32 i;
405     WORD32 x0, x1, x2, x3, x4, x5, x6, x7, i4_value;
406 
407     *pu1_nnz = 0;
408 
409     for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
410     {
411         x4 = pi2_src[0];
412         x5 = pi2_src[1];
413         x6 = pi2_src[2];
414         x7 = pi2_src[3];
415 
416         x0 = x4 + x7;
417         x1 = x5 + x6;
418         x2 = x5 - x6;
419         x3 = x4 - x7;
420 
421         pi2_dst[0] = x0 + x1;
422         pi2_dst[1] = x3 + x2;
423         pi2_dst[2] = x0 - x1;
424         pi2_dst[3] = x3 - x2;
425 
426         pi2_src += 4;
427         pi2_dst += 4;
428     }
429 
430     /* Vertical transform and quantization */
431     pi2_dst -= SUB_BLK_WIDTH_4x4 << 2;
432 
433     for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
434     {
435         x4 = pi2_dst[0];
436         x5 = pi2_dst[4];
437         x6 = pi2_dst[8];
438         x7 = pi2_dst[12];
439 
440         x0 = x4 + x7;
441         x1 = x5 + x6;
442         x2 = x5 - x6;
443         x3 = x4 - x7;
444 
445         i4_value = (x0 + x1) >> 1;
446         FWD_QUANT(i4_value, pu2_threshold_matrix[0],
447                   pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
448         pi2_dst[0] = i4_value;
449 
450         i4_value = (x3 + x2) >> 1;
451         FWD_QUANT(i4_value, pu2_threshold_matrix[0],
452                   pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
453         pi2_dst[4] = i4_value;
454 
455         i4_value = (x0 - x1) >> 1;
456         FWD_QUANT(i4_value, pu2_threshold_matrix[0],
457                   pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
458         pi2_dst[8] = i4_value;
459 
460         i4_value = (x3 - x2) >> 1;
461         FWD_QUANT(i4_value, pu2_threshold_matrix[0],
462                   pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
463         pi2_dst[12] = i4_value;
464 
465         pi2_dst++;
466     }
467 }
468 
469 /**
470 *******************************************************************************
471 *
472 * @brief
473 *   This function performs forward hadamard transform and quantization on a
474 *   2x2 block for both U and V planes
475 *
476 * @par Description:
477 *  The function accepts source buffer and estimation buffer. From these, it
478 *  computes the residue. This is residue is then transformed and quantized.
479 *  The transform and quantization are in placed computed. They use the residue
480 *  buffer for this.
481 *
482 * @param[in] pu1_src
483 *  Pointer to source sub-block
484 *
485 * @param[in] pi2_dst
486 *  Pointer to destination sub-block
487 *
488 * @param[in] pu2_threshold_matrix
489 *  Pointer to Forward Quant Threshold Matrix
490 *
491 * @param[in] pu2_scale_matrix
492 *  Pointer to Forward Quant Scale Matrix
493 *
494 * @param[in] u4_qbits
495 *  QP_BITS_h264_4x4 + floor(QP/6)
496 *
497 * @param[in] u4_round_factor
498 *  Quantization Round factor
499 *
500 * @param[out] pu1_nnz
501 *  Total non-zero coefficients in the current sub-block
502 *
503 * @remarks
504 *  NNZ for dc is populated at 0 and 5th position of pu1_nnz
505 *
506 *******************************************************************************
507 */
ih264_hadamard_quant_2x2_uv(WORD16 * pi2_src,WORD16 * pi2_dst,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz)508 void ih264_hadamard_quant_2x2_uv(WORD16 *pi2_src,
509                                  WORD16 *pi2_dst,
510                                  const UWORD16 *pu2_scale_matrix,
511                                  const UWORD16 *pu2_threshold_matrix,
512                                  UWORD32 u4_qbits,
513                                  UWORD32 u4_round_factor,
514                                  UWORD8 *pu1_nnz)
515 {
516     WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
517     WORD32 i4_value, plane;
518 
519     for(plane = 0; plane < 2; plane++)
520     {
521         pu1_nnz[plane] = 0;
522 
523         /* Horizontal transform */
524         x4 = pi2_src[0];
525         x5 = pi2_src[1];
526         x6 = pi2_src[2];
527         x7 = pi2_src[3];
528 
529         x0 = x4 + x5;
530         x1 = x4 - x5;
531         x2 = x6 + x7;
532         x3 = x6 - x7;
533 
534         /* Vertical transform and quantization */
535         i4_value = (x0 + x2);
536         FWD_QUANT(i4_value, pu2_threshold_matrix[0],
537                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
538                   pu1_nnz[plane]);
539         pi2_dst[0] = i4_value;
540 
541         i4_value = (x0 - x2);
542         FWD_QUANT(i4_value, pu2_threshold_matrix[0],
543                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
544                   pu1_nnz[plane]);
545         pi2_dst[2] = i4_value;
546 
547         i4_value = (x1 - x3);
548         FWD_QUANT(i4_value, pu2_threshold_matrix[0],
549                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
550                   pu1_nnz[plane]);
551         pi2_dst[3] = i4_value;
552 
553         i4_value = (x1 + x3);
554         FWD_QUANT(i4_value, pu2_threshold_matrix[0],
555                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
556                   pu1_nnz[plane]);
557         pi2_dst[1] = i4_value;
558 
559         pi2_dst += 4;
560         pi2_src += 4;
561     }
562 }
563 
564 /**
565 *******************************************************************************
566 *
567 * @brief
568 *  This function performs Single stage forward transform CF8 and quantization
569 *  on 8x8 blocks
570 *
571 * @par Description:
572 *  Performs single stage 8x8 forward transform CF8 after calculating the residue
573 *  The result is then quantized
574 *
575 * @param[in] pu1_src
576 *  Pointer to source sub-block
577 *
578 * @param[in] pu1_pred
579 *  Pointer to prediction sub-block
580 *
581 * @param[in] pi2_out
582 *  Pointer to residual sub-block
583 *
584 * @param[in] src_strd
585 *  Source stride
586 *
587 * @param[in] pred_strd
588 *  Prediction stride
589 *
590 * @param[in] pu2_scale_matrix
591 *  Pointer to Forward Quant Scale Matrix
592 *
593 * @param[in] pu2_threshold_matrix
594 *  Pointer to Forward Quant Threshold Matrix
595 *
596 * @param[in] u4_qbits
597 *  QP_BITS_h264_8x8 + floor(QP/6)
598 *
599 * @param[in] u4_round_factor
600 *  Quantization Round factor
601 *
602 * @param[out] pu1_nnz
603 *  Total non-zero coefficients in the current sub-block
604 *
605 * @param[in] pi2_alt_dc_addr
606 *  UNUSED
607 *
608 * @returns none
609 *
610 * @remarks:
611 *  TODO: This function needs to be tested before integration
612 *
613 *******************************************************************************
614 */
ih264_resi_trans_quant_8x8(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD16 * pi2_out,WORD32 src_strd,WORD32 pred_strd,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz,WORD16 * pu1_dc_alt_addr)615 void ih264_resi_trans_quant_8x8(UWORD8 *pu1_src,
616                                 UWORD8 *pu1_pred,
617                                 WORD16 *pi2_out,
618                                 WORD32 src_strd,
619                                 WORD32 pred_strd,
620                                 const UWORD16 *pu2_scale_matrix,
621                                 const UWORD16 *pu2_threshold_matrix,
622                                 UWORD32 u4_qbits,
623                                 UWORD32 u4_round_factor,
624                                 UWORD8 *pu1_nnz,
625                                 WORD16 *pu1_dc_alt_addr)
626 {
627     WORD16 *pi2_out_tmp = pi2_out;
628     WORD32 i;
629     WORD32 a0, a1, a2, a3, a4, a5, a6, a7;
630     WORD32 r0, r1, r2, r3, r4, r5, r6, r7;
631     UWORD32 u4_nonzero_coeff = 0;
632 
633     UNUSED(pu1_dc_alt_addr);
634 
635     /* Horizontal transform */
636     for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
637     {
638         r0 = pu1_src[0];
639         r0 -= pu1_pred[0];
640         r1 = pu1_src[1];
641         r1 -= pu1_pred[1];
642         r2 = pu1_src[2]; r2 -= pu1_pred[2];
643         r3 = pu1_src[3]; r3 -= pu1_pred[3];
644         r4 = pu1_src[4]; r4 -= pu1_pred[4];
645         r5 = pu1_src[5]; r5 -= pu1_pred[5];
646         r6 = pu1_src[6]; r6 -= pu1_pred[6];
647         r7 = pu1_src[7]; r7 -= pu1_pred[7];
648 
649         a0 = r0 + r7;
650         a1 = r1 + r6;
651         a2 = r2 + r5;
652         a3 = r3 + r4;
653 
654         a4 = a0 + a3;
655         a5 = a1 + a2;
656         a6 = a0 - a3;
657         a7 = a1 - a2;
658 
659         pi2_out_tmp[0] = a4 + a5;
660         pi2_out_tmp[2] = a6 + (a7 >> 1);
661         pi2_out_tmp[4] = a4 - a5;
662         pi2_out_tmp[6] = (a6 >> 1) - a7;
663 
664         a0 = r0 - r7;
665         a1 = r1 - r6;
666         a2 = r2 - r5;
667         a3 = r3 - r4;
668 
669         a4 = a1 + a2 + ((a0 >> 1) + a0);
670         a5 = a0 - a3 - ((a2 >> 1) + a2);
671         a6 = a0 + a3 - ((a1 >> 1) + a1);
672         a7 = a1 - a2 + ((a3 >> 1) + a3);
673 
674         pi2_out_tmp[1] = a4 + (a7 >> 2);
675         pi2_out_tmp[3] = a5 + (a6 >> 2);
676         pi2_out_tmp[5] = a6 - (a5 >> 2);
677         pi2_out_tmp[7] = (a4 >> 2) - a7;
678 
679         pu1_src += src_strd;
680         pu1_pred += pred_strd;
681         pi2_out_tmp += 8;
682     }
683 
684     /* vertical transform and quant */
685     pi2_out_tmp = pi2_out;
686     for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
687     {
688         r0 = pi2_out_tmp[0];
689         r1 = pi2_out_tmp[8];
690         r2 = pi2_out_tmp[16];
691         r3 = pi2_out_tmp[24];
692         r4 = pi2_out_tmp[32];
693         r5 = pi2_out_tmp[40];
694         r6 = pi2_out_tmp[48];
695         r7 = pi2_out_tmp[56];
696 
697         a0 = r0 + r7;
698         a1 = r1 + r6;
699         a2 = r2 + r5;
700         a3 = r3 + r4;
701 
702         a4 = a0 + a3;
703         a5 = a1 + a2;
704         a6 = a0 - a3;
705         a7 = a1 - a2;
706 
707         a0 = r0 - r7;
708         a1 = r1 - r6;
709         a2 = r2 - r5;
710         a3 = r3 - r4;
711 
712         r0 = a4 + a5;
713         r2 = a6 + (a7 >> 1);
714         r4 = a4 - a5;
715         r6 = (a6 >> 1) - a7;
716 
717         a4 = a1 + a2 + ((a0 >> 1) + a0);
718         a5 = a0 - a3 - ((a2 >> 1) + a2);
719         a6 = a0 + a3 - ((a1 >> 1) + a1);
720         a7 = a1 - a2 + ((a3 >> 1) + a3);
721 
722         r1 = a4 + (a7 >> 2);
723         r3 = a5 + (a6 >> 2);
724         r5 = a6 - (a5 >> 2);
725         r7 = (a4 >> 2) - a7;
726 
727         FWD_QUANT(r0, pu2_threshold_matrix[0],
728                   pu2_scale_matrix[0], u4_round_factor, u4_qbits,
729                   u4_nonzero_coeff);
730         pi2_out_tmp[0] = r0;
731 
732         FWD_QUANT(r1, pu2_threshold_matrix[8],
733                   pu2_scale_matrix[8], u4_round_factor, u4_qbits,
734                   u4_nonzero_coeff);
735         pi2_out_tmp[8] = r1;
736 
737         FWD_QUANT(r2, pu2_threshold_matrix[16],
738                   pu2_scale_matrix[16], u4_round_factor, u4_qbits,
739                   u4_nonzero_coeff);
740         pi2_out_tmp[16] = r2;
741 
742         FWD_QUANT(r3, pu2_threshold_matrix[24],
743                   pu2_scale_matrix[24], u4_round_factor, u4_qbits,
744                   u4_nonzero_coeff);
745         pi2_out_tmp[24] = r3;
746 
747         FWD_QUANT(r4, pu2_threshold_matrix[32],
748                   pu2_scale_matrix[32], u4_round_factor, u4_qbits,
749                   u4_nonzero_coeff);
750         pi2_out_tmp[32] = r4;
751 
752         FWD_QUANT(r5, pu2_threshold_matrix[40],
753                   pu2_scale_matrix[40], u4_round_factor, u4_qbits,
754                   u4_nonzero_coeff);
755         pi2_out_tmp[40] = r5;
756 
757         FWD_QUANT(r6, pu2_threshold_matrix[48],
758                   pu2_scale_matrix[48], u4_round_factor, u4_qbits,
759                   u4_nonzero_coeff);
760         pi2_out_tmp[48] = r6;
761 
762         FWD_QUANT(r7, pu2_threshold_matrix[56],
763                   pu2_scale_matrix[56], u4_round_factor, u4_qbits,
764                   u4_nonzero_coeff);
765         pi2_out_tmp[56] = r7;
766 
767         pi2_out_tmp++;
768         pu2_scale_matrix++;
769         pu2_threshold_matrix++;
770     }
771     /* Return total nonzero coefficients in the current sub block */
772     *pu1_nnz =  u4_nonzero_coeff;
773 }
774