1 /******************************************************************************
2 *
3 * Copyright (C) 2015 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20
21 /**
22 *******************************************************************************
23 * @file
24 * ih264_resi_trans_quant.c
25 *
26 * @brief
27 * Contains function definitions for the single stage forward transform for H.264.
28 * The resi_trans_quant functions calculate the residue, do the cf and then do quantization
29 *
30 * @author
31 * ittiam
32 *
33 * @par List of Functions:
34 * - ih264_resi_trans_quant_4x4
35 * - ih264_resi_trans_quant_chroma_4x4
36 * - ih264_hadamard_quant_4x4
37 * - ih264_hadamard_quant_2x2_uv
38 * - ih264_resi_trans_quant_8x8
39 *
40 * @remarks
41 * none
42 *
43 *******************************************************************************
44 */
45
46
47 /*****************************************************************************/
48 /* File Includes */
49 /*****************************************************************************/
50
51 /* System Include Files */
52 #include <stddef.h>
53
54 /* User Include Files */
55 #include "ih264_typedefs.h"
56 #include "ih264_defs.h"
57 #include "ih264_macros.h"
58 #include "ih264_size_defs.h"
59 #include "ih264_trans_macros.h"
60 #include "ih264_trans_data.h"
61 #include "ih264_structs.h"
62 #include "ih264_trans_quant_itrans_iquant.h"
63
64
65 /*****************************************************************************/
66 /* Function Definitions */
67 /*****************************************************************************/
68
69 /**
70 *******************************************************************************
71 *
72 * @brief
73 * This function performs forward transform and quantization on a 4x4 block
74 *
75 * @par Description:
76 * The function accepts source buffer and estimation buffer. From these, it
77 * computes the residue. This is residue is then transformed and quantized.
78 * The transform and quantization are in placed computed. They use the residue
79 * buffer for this.
80 *
81 * @param[in] pu1_src
82 * Pointer to source sub-block
83 *
84 * @param[in] pu1_pred
85 * Pointer to prediction sub-block
86 *
87 * @param[in] pi2_out
88 * Pointer to residual sub-block
89 *
90 * @param[in] src_strd
91 * Source stride
92 *
93 * @param[in] pred_strd
94 * Prediction stride
95 *
96 * @param[in] pu2_scale_matrix
97 * Pointer to Forward Quant Scale Matrix
98 *
99 * @param[in] pu2_threshold_matrix
100 * Pointer to Forward Quant Threshold Matrix
101 *
102 * @param[in] u4_qbits
103 * QP_BITS_h264_4x4 + floor(QP/6)
104 *
105 * @param[in] u4_round_factor
106 * Quantization Round factor
107 *
108 * @param[out] pu1_nnz
109 * Total non-zero coefficients in the current sub-block
110 *
111 * @param[in] pi2_alt_dc_addr
112 * DC Coefficient of the block
113 *
114 * @remarks none
115 *
116 *******************************************************************************
117 */
ih264_resi_trans_quant_4x4(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD16 * pi2_out,WORD32 src_strd,WORD32 pred_strd,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz,WORD16 * pi2_alt_dc_addr)118 void ih264_resi_trans_quant_4x4(UWORD8 *pu1_src,
119 UWORD8 *pu1_pred,
120 WORD16 *pi2_out,
121 WORD32 src_strd,
122 WORD32 pred_strd,
123 const UWORD16 *pu2_scale_matrix,
124 const UWORD16 *pu2_threshold_matrix,
125 UWORD32 u4_qbits,
126 UWORD32 u4_round_factor,
127 UWORD8 *pu1_nnz,
128 WORD16 *pi2_alt_dc_addr)
129 {
130 UWORD32 i;
131 WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
132 WORD32 i4_value;
133 WORD16 *pi2_out_tmp = pi2_out;
134 UWORD32 u4_nonzero_coeff = 0;
135
136 for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
137 {
138 /* computing prediction error (residue) */
139 x4 = pu1_src[0] - pu1_pred[0];
140 x5 = pu1_src[1] - pu1_pred[1];
141 x6 = pu1_src[2] - pu1_pred[2];
142 x7 = pu1_src[3] - pu1_pred[3];
143
144 /* Horizontal transform */
145 x0 = x4 + x7;
146 x1 = x5 + x6;
147 x2 = x5 - x6;
148 x3 = x4 - x7;
149
150 pi2_out_tmp[0] = x0 + x1;
151 pi2_out_tmp[1] = (x3 << 1) + x2;
152 pi2_out_tmp[2] = x0 - x1;
153 pi2_out_tmp[3] = x3 - (x2 << 1);
154
155 /* pointing to next row; */
156 pu1_src += src_strd;
157 pu1_pred += pred_strd;
158 pi2_out_tmp += 4;
159 }
160
161 pi2_out_tmp = pi2_out;
162 for (i = 0; i < SUB_BLK_WIDTH_4x4; i++)
163 {
164 /* Vertical transform and quantization */
165 x4 = pi2_out_tmp[0];
166 x5 = pi2_out_tmp[4];
167 x6 = pi2_out_tmp[8];
168 x7 = pi2_out_tmp[12];
169
170 x0 = x4 + x7;
171 x1 = x5 + x6;
172 x2 = x5 - x6;
173 x3 = x4 - x7;
174
175 /* quantization is done in place */
176 i4_value = x0 + x1;
177 if(i == 0)
178 {
179 (*pi2_alt_dc_addr) = i4_value;
180 }
181 FWD_QUANT(i4_value, pu2_threshold_matrix[0],
182 pu2_scale_matrix[0], u4_round_factor, u4_qbits,
183 u4_nonzero_coeff);
184 pi2_out_tmp[0] = i4_value;
185
186 i4_value = (x3 << 1) + x2;
187 FWD_QUANT(i4_value, pu2_threshold_matrix[4],
188 pu2_scale_matrix[4], u4_round_factor, u4_qbits,
189 u4_nonzero_coeff);
190 pi2_out_tmp[4] = i4_value;
191
192 i4_value = x0 - x1;
193 FWD_QUANT(i4_value, pu2_threshold_matrix[8],
194 pu2_scale_matrix[8], u4_round_factor, u4_qbits,
195 u4_nonzero_coeff);
196 pi2_out_tmp[8] = i4_value;
197
198 i4_value = x3 - (x2 << 1);
199 FWD_QUANT(i4_value, pu2_threshold_matrix[12],
200 pu2_scale_matrix[12], u4_round_factor, u4_qbits,
201 u4_nonzero_coeff);
202 pi2_out_tmp[12] = i4_value;
203
204 pi2_out_tmp++;
205 pu2_scale_matrix++;
206 pu2_threshold_matrix++;
207 }
208
209 /* Return total nonzero coefficients in the current sub block */
210 *pu1_nnz = u4_nonzero_coeff;
211 }
212
213 /**
214 *******************************************************************************
215 *
216 * @brief
217 * This function performs forward transform and quantization on a 4x4
218 * chroma block with interleaved values
219 *
220 * @par Description:
221 * The function accepts source buffer and estimation buffer. From these, it
222 * computes the residue. This is residue is then transformed and quantized.
223 * The transform and quantization are in placed computed. They use the residue
224 * buffer for this.
225 *
226 * @param[in] pu1_src
227 * Pointer to source sub-block
228 *
229 * @param[in] pu1_pred
230 * Pointer to prediction sub-block
231 *
232 * @param[in] pi2_out
233 * Pointer to residual sub-block
234 *
235 * @param[in] src_strd
236 * Source stride
237 *
238 * @param[in] pred_strd
239 * Prediction stride
240 *
241 * @param[in] pu2_scale_matrix
242 * Pointer to Forward Quant Scale Matrix
243 *
244 * @param[in] pu2_threshold_matrix
245 * Pointer to Forward Quant Threshold Matrix
246 *
247 * @param[in] u4_qbits
248 * QP_BITS_h264_4x4 + floor(QP/6)
249 *
250 * @param[in] u4_round_factor
251 * Quantization Round factor
252 *
253 * @param[out] pu1_nnz
254 * Total non-zero coefficients in the current sub-block
255 *
256 * @param[in] pi2_alt_dc_addr
257 * DC Coefficient of the block
258 *
259 * @remarks none
260 *
261 *******************************************************************************
262 */
ih264_resi_trans_quant_chroma_4x4(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD16 * pi2_out,WORD32 src_strd,WORD32 pred_strd,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz,WORD16 * pu1_dc_alt_addr)263 void ih264_resi_trans_quant_chroma_4x4(UWORD8 *pu1_src,
264 UWORD8 *pu1_pred,
265 WORD16 *pi2_out,
266 WORD32 src_strd,
267 WORD32 pred_strd,
268 const UWORD16 *pu2_scale_matrix,
269 const UWORD16 *pu2_threshold_matrix,
270 UWORD32 u4_qbits,
271 UWORD32 u4_round_factor,
272 UWORD8 *pu1_nnz,
273 WORD16 *pu1_dc_alt_addr)
274 {
275 UWORD32 i;
276 WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
277 WORD32 i4_value;
278 WORD16 *pi2_out_tmp = pi2_out;
279 UWORD32 u4_nonzero_coeff = 0;
280
281 for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
282 {
283 /* computing prediction error (residue) */
284 x4 = pu1_src[0] - pu1_pred[0];
285 x5 = pu1_src[2] - pu1_pred[2];
286 x6 = pu1_src[4] - pu1_pred[4];
287 x7 = pu1_src[6] - pu1_pred[6];
288
289 /* Horizontal transform */
290 x0 = x4 + x7;
291 x1 = x5 + x6;
292 x2 = x5 - x6;
293 x3 = x4 - x7;
294
295 pi2_out_tmp[0] = x0 + x1;
296 pi2_out_tmp[1] = (x3 << 1) + x2;
297 pi2_out_tmp[2] = x0 - x1;
298 pi2_out_tmp[3] = x3 - (x2 << 1);
299
300 /* pointing to next row; */
301 pu1_src += src_strd;
302 pu1_pred += pred_strd;
303 pi2_out_tmp += 4;
304 }
305
306 pi2_out_tmp = pi2_out;
307 for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
308 {
309 /* Vertical transform and quantization */
310 x4 = pi2_out_tmp[0];
311 x5 = pi2_out_tmp[4];
312 x6 = pi2_out_tmp[8];
313 x7 = pi2_out_tmp[12];
314
315 x0 = x4 + x7;
316 x1 = x5 + x6;
317 x2 = x5 - x6;
318 x3 = x4 - x7;
319
320 /* quantization is done in place */
321 i4_value = x0 + x1;
322 if(i == 0)
323 {
324 *pu1_dc_alt_addr = i4_value;
325 }
326 FWD_QUANT(i4_value, pu2_threshold_matrix[0],
327 pu2_scale_matrix[0], u4_round_factor, u4_qbits,
328 u4_nonzero_coeff);
329 pi2_out_tmp[0] = i4_value;
330
331 i4_value = (x3 << 1) + x2;
332 FWD_QUANT(i4_value, pu2_threshold_matrix[4],
333 pu2_scale_matrix[4], u4_round_factor, u4_qbits,
334 u4_nonzero_coeff);
335 pi2_out_tmp[4] = i4_value;
336
337 i4_value = x0 - x1;
338 FWD_QUANT(i4_value, pu2_threshold_matrix[8],
339 pu2_scale_matrix[8], u4_round_factor, u4_qbits,
340 u4_nonzero_coeff);
341 pi2_out_tmp[8] = i4_value;
342
343 i4_value = x3 - (x2 << 1);
344 FWD_QUANT(i4_value, pu2_threshold_matrix[12],
345 pu2_scale_matrix[12], u4_round_factor, u4_qbits,
346 u4_nonzero_coeff);
347 pi2_out_tmp[12] = i4_value;
348
349 pi2_out_tmp++;
350 pu2_scale_matrix++;
351 pu2_threshold_matrix++;
352 }
353
354 /* Return total nonzero coefficients in the current sub block */
355 *pu1_nnz = u4_nonzero_coeff;
356 }
357
358 /**
359 *******************************************************************************
360 *
361 * @brief
362 * This function performs forward hadamard transform and quantization on a
363 * 4x4 block
364 *
365 * @par Description:
366 * The function accepts source buffer and estimation buffer. From these, it
367 * computes the residue. This is residue is then transformed and quantized.
368 * The transform and quantization are in placed computed. They use the residue
369 * buffer for this.
370 *
371 * @param[in] pu1_src
372 * Pointer to source sub-block
373 *
374 * @param[in] pi2_dst
375 * Pointer to destination sub-block
376 *
377 * @param[in] pu2_threshold_matrix
378 * Pointer to Forward Quant Threshold Matrix
379 *
380 * @param[in] pu2_scale_matrix
381 * Pointer to Forward Quant Scale Matrix
382 *
383 * @param[in] u4_qbits
384 * QP_BITS_h264_4x4 + floor(QP/6)
385 *
386 * @param[in] u4_round_factor
387 * Quantization Round factor
388 *
389 * @param[out] pu1_nnz
390 * Total non-zero coefficients in the current sub-block
391 *
392 * @remarks none
393 *
394 ********************************************************************************
395 */
ih264_hadamard_quant_4x4(WORD16 * pi2_src,WORD16 * pi2_dst,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz)396 void ih264_hadamard_quant_4x4(WORD16 *pi2_src,
397 WORD16 *pi2_dst,
398 const UWORD16 *pu2_scale_matrix,
399 const UWORD16 *pu2_threshold_matrix,
400 UWORD32 u4_qbits,
401 UWORD32 u4_round_factor,
402 UWORD8 *pu1_nnz)
403 {
404 WORD32 i;
405 WORD32 x0, x1, x2, x3, x4, x5, x6, x7, i4_value;
406
407 *pu1_nnz = 0;
408
409 for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
410 {
411 x4 = pi2_src[0];
412 x5 = pi2_src[1];
413 x6 = pi2_src[2];
414 x7 = pi2_src[3];
415
416 x0 = x4 + x7;
417 x1 = x5 + x6;
418 x2 = x5 - x6;
419 x3 = x4 - x7;
420
421 pi2_dst[0] = x0 + x1;
422 pi2_dst[1] = x3 + x2;
423 pi2_dst[2] = x0 - x1;
424 pi2_dst[3] = x3 - x2;
425
426 pi2_src += 4;
427 pi2_dst += 4;
428 }
429
430 /* Vertical transform and quantization */
431 pi2_dst -= SUB_BLK_WIDTH_4x4 << 2;
432
433 for(i = 0; i < SUB_BLK_WIDTH_4x4; i++)
434 {
435 x4 = pi2_dst[0];
436 x5 = pi2_dst[4];
437 x6 = pi2_dst[8];
438 x7 = pi2_dst[12];
439
440 x0 = x4 + x7;
441 x1 = x5 + x6;
442 x2 = x5 - x6;
443 x3 = x4 - x7;
444
445 i4_value = (x0 + x1) >> 1;
446 FWD_QUANT(i4_value, pu2_threshold_matrix[0],
447 pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
448 pi2_dst[0] = i4_value;
449
450 i4_value = (x3 + x2) >> 1;
451 FWD_QUANT(i4_value, pu2_threshold_matrix[0],
452 pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
453 pi2_dst[4] = i4_value;
454
455 i4_value = (x0 - x1) >> 1;
456 FWD_QUANT(i4_value, pu2_threshold_matrix[0],
457 pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
458 pi2_dst[8] = i4_value;
459
460 i4_value = (x3 - x2) >> 1;
461 FWD_QUANT(i4_value, pu2_threshold_matrix[0],
462 pu2_scale_matrix[0], u4_round_factor, u4_qbits, pu1_nnz[0]);
463 pi2_dst[12] = i4_value;
464
465 pi2_dst++;
466 }
467 }
468
469 /**
470 *******************************************************************************
471 *
472 * @brief
473 * This function performs forward hadamard transform and quantization on a
474 * 2x2 block for both U and V planes
475 *
476 * @par Description:
477 * The function accepts source buffer and estimation buffer. From these, it
478 * computes the residue. This is residue is then transformed and quantized.
479 * The transform and quantization are in placed computed. They use the residue
480 * buffer for this.
481 *
482 * @param[in] pu1_src
483 * Pointer to source sub-block
484 *
485 * @param[in] pi2_dst
486 * Pointer to destination sub-block
487 *
488 * @param[in] pu2_threshold_matrix
489 * Pointer to Forward Quant Threshold Matrix
490 *
491 * @param[in] pu2_scale_matrix
492 * Pointer to Forward Quant Scale Matrix
493 *
494 * @param[in] u4_qbits
495 * QP_BITS_h264_4x4 + floor(QP/6)
496 *
497 * @param[in] u4_round_factor
498 * Quantization Round factor
499 *
500 * @param[out] pu1_nnz
501 * Total non-zero coefficients in the current sub-block
502 *
503 * @remarks
504 * NNZ for dc is populated at 0 and 5th position of pu1_nnz
505 *
506 *******************************************************************************
507 */
ih264_hadamard_quant_2x2_uv(WORD16 * pi2_src,WORD16 * pi2_dst,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz)508 void ih264_hadamard_quant_2x2_uv(WORD16 *pi2_src,
509 WORD16 *pi2_dst,
510 const UWORD16 *pu2_scale_matrix,
511 const UWORD16 *pu2_threshold_matrix,
512 UWORD32 u4_qbits,
513 UWORD32 u4_round_factor,
514 UWORD8 *pu1_nnz)
515 {
516 WORD32 x0, x1, x2, x3, x4, x5, x6, x7;
517 WORD32 i4_value, plane;
518
519 for(plane = 0; plane < 2; plane++)
520 {
521 pu1_nnz[plane] = 0;
522
523 /* Horizontal transform */
524 x4 = pi2_src[0];
525 x5 = pi2_src[1];
526 x6 = pi2_src[2];
527 x7 = pi2_src[3];
528
529 x0 = x4 + x5;
530 x1 = x4 - x5;
531 x2 = x6 + x7;
532 x3 = x6 - x7;
533
534 /* Vertical transform and quantization */
535 i4_value = (x0 + x2);
536 FWD_QUANT(i4_value, pu2_threshold_matrix[0],
537 pu2_scale_matrix[0], u4_round_factor, u4_qbits,
538 pu1_nnz[plane]);
539 pi2_dst[0] = i4_value;
540
541 i4_value = (x0 - x2);
542 FWD_QUANT(i4_value, pu2_threshold_matrix[0],
543 pu2_scale_matrix[0], u4_round_factor, u4_qbits,
544 pu1_nnz[plane]);
545 pi2_dst[2] = i4_value;
546
547 i4_value = (x1 - x3);
548 FWD_QUANT(i4_value, pu2_threshold_matrix[0],
549 pu2_scale_matrix[0], u4_round_factor, u4_qbits,
550 pu1_nnz[plane]);
551 pi2_dst[3] = i4_value;
552
553 i4_value = (x1 + x3);
554 FWD_QUANT(i4_value, pu2_threshold_matrix[0],
555 pu2_scale_matrix[0], u4_round_factor, u4_qbits,
556 pu1_nnz[plane]);
557 pi2_dst[1] = i4_value;
558
559 pi2_dst += 4;
560 pi2_src += 4;
561 }
562 }
563
564 /**
565 *******************************************************************************
566 *
567 * @brief
568 * This function performs Single stage forward transform CF8 and quantization
569 * on 8x8 blocks
570 *
571 * @par Description:
572 * Performs single stage 8x8 forward transform CF8 after calculating the residue
573 * The result is then quantized
574 *
575 * @param[in] pu1_src
576 * Pointer to source sub-block
577 *
578 * @param[in] pu1_pred
579 * Pointer to prediction sub-block
580 *
581 * @param[in] pi2_out
582 * Pointer to residual sub-block
583 *
584 * @param[in] src_strd
585 * Source stride
586 *
587 * @param[in] pred_strd
588 * Prediction stride
589 *
590 * @param[in] pu2_scale_matrix
591 * Pointer to Forward Quant Scale Matrix
592 *
593 * @param[in] pu2_threshold_matrix
594 * Pointer to Forward Quant Threshold Matrix
595 *
596 * @param[in] u4_qbits
597 * QP_BITS_h264_8x8 + floor(QP/6)
598 *
599 * @param[in] u4_round_factor
600 * Quantization Round factor
601 *
602 * @param[out] pu1_nnz
603 * Total non-zero coefficients in the current sub-block
604 *
605 * @param[in] pi2_alt_dc_addr
606 * UNUSED
607 *
608 * @returns none
609 *
610 * @remarks:
611 * TODO: This function needs to be tested before integration
612 *
613 *******************************************************************************
614 */
ih264_resi_trans_quant_8x8(UWORD8 * pu1_src,UWORD8 * pu1_pred,WORD16 * pi2_out,WORD32 src_strd,WORD32 pred_strd,const UWORD16 * pu2_scale_matrix,const UWORD16 * pu2_threshold_matrix,UWORD32 u4_qbits,UWORD32 u4_round_factor,UWORD8 * pu1_nnz,WORD16 * pu1_dc_alt_addr)615 void ih264_resi_trans_quant_8x8(UWORD8 *pu1_src,
616 UWORD8 *pu1_pred,
617 WORD16 *pi2_out,
618 WORD32 src_strd,
619 WORD32 pred_strd,
620 const UWORD16 *pu2_scale_matrix,
621 const UWORD16 *pu2_threshold_matrix,
622 UWORD32 u4_qbits,
623 UWORD32 u4_round_factor,
624 UWORD8 *pu1_nnz,
625 WORD16 *pu1_dc_alt_addr)
626 {
627 WORD16 *pi2_out_tmp = pi2_out;
628 WORD32 i;
629 WORD32 a0, a1, a2, a3, a4, a5, a6, a7;
630 WORD32 r0, r1, r2, r3, r4, r5, r6, r7;
631 UWORD32 u4_nonzero_coeff = 0;
632
633 UNUSED(pu1_dc_alt_addr);
634
635 /* Horizontal transform */
636 for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
637 {
638 r0 = pu1_src[0];
639 r0 -= pu1_pred[0];
640 r1 = pu1_src[1];
641 r1 -= pu1_pred[1];
642 r2 = pu1_src[2]; r2 -= pu1_pred[2];
643 r3 = pu1_src[3]; r3 -= pu1_pred[3];
644 r4 = pu1_src[4]; r4 -= pu1_pred[4];
645 r5 = pu1_src[5]; r5 -= pu1_pred[5];
646 r6 = pu1_src[6]; r6 -= pu1_pred[6];
647 r7 = pu1_src[7]; r7 -= pu1_pred[7];
648
649 a0 = r0 + r7;
650 a1 = r1 + r6;
651 a2 = r2 + r5;
652 a3 = r3 + r4;
653
654 a4 = a0 + a3;
655 a5 = a1 + a2;
656 a6 = a0 - a3;
657 a7 = a1 - a2;
658
659 pi2_out_tmp[0] = a4 + a5;
660 pi2_out_tmp[2] = a6 + (a7 >> 1);
661 pi2_out_tmp[4] = a4 - a5;
662 pi2_out_tmp[6] = (a6 >> 1) - a7;
663
664 a0 = r0 - r7;
665 a1 = r1 - r6;
666 a2 = r2 - r5;
667 a3 = r3 - r4;
668
669 a4 = a1 + a2 + ((a0 >> 1) + a0);
670 a5 = a0 - a3 - ((a2 >> 1) + a2);
671 a6 = a0 + a3 - ((a1 >> 1) + a1);
672 a7 = a1 - a2 + ((a3 >> 1) + a3);
673
674 pi2_out_tmp[1] = a4 + (a7 >> 2);
675 pi2_out_tmp[3] = a5 + (a6 >> 2);
676 pi2_out_tmp[5] = a6 - (a5 >> 2);
677 pi2_out_tmp[7] = (a4 >> 2) - a7;
678
679 pu1_src += src_strd;
680 pu1_pred += pred_strd;
681 pi2_out_tmp += 8;
682 }
683
684 /* vertical transform and quant */
685 pi2_out_tmp = pi2_out;
686 for(i = 0; i < SUB_BLK_WIDTH_8x8; ++i)
687 {
688 r0 = pi2_out_tmp[0];
689 r1 = pi2_out_tmp[8];
690 r2 = pi2_out_tmp[16];
691 r3 = pi2_out_tmp[24];
692 r4 = pi2_out_tmp[32];
693 r5 = pi2_out_tmp[40];
694 r6 = pi2_out_tmp[48];
695 r7 = pi2_out_tmp[56];
696
697 a0 = r0 + r7;
698 a1 = r1 + r6;
699 a2 = r2 + r5;
700 a3 = r3 + r4;
701
702 a4 = a0 + a3;
703 a5 = a1 + a2;
704 a6 = a0 - a3;
705 a7 = a1 - a2;
706
707 a0 = r0 - r7;
708 a1 = r1 - r6;
709 a2 = r2 - r5;
710 a3 = r3 - r4;
711
712 r0 = a4 + a5;
713 r2 = a6 + (a7 >> 1);
714 r4 = a4 - a5;
715 r6 = (a6 >> 1) - a7;
716
717 a4 = a1 + a2 + ((a0 >> 1) + a0);
718 a5 = a0 - a3 - ((a2 >> 1) + a2);
719 a6 = a0 + a3 - ((a1 >> 1) + a1);
720 a7 = a1 - a2 + ((a3 >> 1) + a3);
721
722 r1 = a4 + (a7 >> 2);
723 r3 = a5 + (a6 >> 2);
724 r5 = a6 - (a5 >> 2);
725 r7 = (a4 >> 2) - a7;
726
727 FWD_QUANT(r0, pu2_threshold_matrix[0],
728 pu2_scale_matrix[0], u4_round_factor, u4_qbits,
729 u4_nonzero_coeff);
730 pi2_out_tmp[0] = r0;
731
732 FWD_QUANT(r1, pu2_threshold_matrix[8],
733 pu2_scale_matrix[8], u4_round_factor, u4_qbits,
734 u4_nonzero_coeff);
735 pi2_out_tmp[8] = r1;
736
737 FWD_QUANT(r2, pu2_threshold_matrix[16],
738 pu2_scale_matrix[16], u4_round_factor, u4_qbits,
739 u4_nonzero_coeff);
740 pi2_out_tmp[16] = r2;
741
742 FWD_QUANT(r3, pu2_threshold_matrix[24],
743 pu2_scale_matrix[24], u4_round_factor, u4_qbits,
744 u4_nonzero_coeff);
745 pi2_out_tmp[24] = r3;
746
747 FWD_QUANT(r4, pu2_threshold_matrix[32],
748 pu2_scale_matrix[32], u4_round_factor, u4_qbits,
749 u4_nonzero_coeff);
750 pi2_out_tmp[32] = r4;
751
752 FWD_QUANT(r5, pu2_threshold_matrix[40],
753 pu2_scale_matrix[40], u4_round_factor, u4_qbits,
754 u4_nonzero_coeff);
755 pi2_out_tmp[40] = r5;
756
757 FWD_QUANT(r6, pu2_threshold_matrix[48],
758 pu2_scale_matrix[48], u4_round_factor, u4_qbits,
759 u4_nonzero_coeff);
760 pi2_out_tmp[48] = r6;
761
762 FWD_QUANT(r7, pu2_threshold_matrix[56],
763 pu2_scale_matrix[56], u4_round_factor, u4_qbits,
764 u4_nonzero_coeff);
765 pi2_out_tmp[56] = r7;
766
767 pi2_out_tmp++;
768 pu2_scale_matrix++;
769 pu2_threshold_matrix++;
770 }
771 /* Return total nonzero coefficients in the current sub block */
772 *pu1_nnz = u4_nonzero_coeff;
773 }
774