1*c83a76b0SSuyog Pawar /******************************************************************************
2*c83a76b0SSuyog Pawar *
3*c83a76b0SSuyog Pawar * Copyright (C) 2018 The Android Open Source Project
4*c83a76b0SSuyog Pawar *
5*c83a76b0SSuyog Pawar * Licensed under the Apache License, Version 2.0 (the "License");
6*c83a76b0SSuyog Pawar * you may not use this file except in compliance with the License.
7*c83a76b0SSuyog Pawar * You may obtain a copy of the License at:
8*c83a76b0SSuyog Pawar *
9*c83a76b0SSuyog Pawar * http://www.apache.org/licenses/LICENSE-2.0
10*c83a76b0SSuyog Pawar *
11*c83a76b0SSuyog Pawar * Unless required by applicable law or agreed to in writing, software
12*c83a76b0SSuyog Pawar * distributed under the License is distributed on an "AS IS" BASIS,
13*c83a76b0SSuyog Pawar * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*c83a76b0SSuyog Pawar * See the License for the specific language governing permissions and
15*c83a76b0SSuyog Pawar * limitations under the License.
16*c83a76b0SSuyog Pawar *
17*c83a76b0SSuyog Pawar *****************************************************************************
18*c83a76b0SSuyog Pawar * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*c83a76b0SSuyog Pawar */
20*c83a76b0SSuyog Pawar
21*c83a76b0SSuyog Pawar /**
22*c83a76b0SSuyog Pawar ******************************************************************************
23*c83a76b0SSuyog Pawar * @file ihevce_had_satd.c
24*c83a76b0SSuyog Pawar *
25*c83a76b0SSuyog Pawar * @brief
26*c83a76b0SSuyog Pawar * This file contains functions of Hadamard SAD and SATD
27*c83a76b0SSuyog Pawar *
28*c83a76b0SSuyog Pawar * @author
29*c83a76b0SSuyog Pawar * Ittiam
30*c83a76b0SSuyog Pawar *
31*c83a76b0SSuyog Pawar * List of Functions
32*c83a76b0SSuyog Pawar * <TODO: TO BE ADDED>
33*c83a76b0SSuyog Pawar *
34*c83a76b0SSuyog Pawar ******************************************************************************
35*c83a76b0SSuyog Pawar */
36*c83a76b0SSuyog Pawar
37*c83a76b0SSuyog Pawar /*****************************************************************************/
38*c83a76b0SSuyog Pawar /* File Includes */
39*c83a76b0SSuyog Pawar /*****************************************************************************/
40*c83a76b0SSuyog Pawar /* System include files */
41*c83a76b0SSuyog Pawar #include <stdio.h>
42*c83a76b0SSuyog Pawar #include <string.h>
43*c83a76b0SSuyog Pawar #include <stdlib.h>
44*c83a76b0SSuyog Pawar #include <assert.h>
45*c83a76b0SSuyog Pawar #include <stdarg.h>
46*c83a76b0SSuyog Pawar #include <math.h>
47*c83a76b0SSuyog Pawar
48*c83a76b0SSuyog Pawar /* User include files */
49*c83a76b0SSuyog Pawar #include "ihevc_typedefs.h"
50*c83a76b0SSuyog Pawar #include "itt_video_api.h"
51*c83a76b0SSuyog Pawar #include "ihevce_api.h"
52*c83a76b0SSuyog Pawar
53*c83a76b0SSuyog Pawar #include "rc_cntrl_param.h"
54*c83a76b0SSuyog Pawar #include "rc_frame_info_collector.h"
55*c83a76b0SSuyog Pawar #include "rc_look_ahead_params.h"
56*c83a76b0SSuyog Pawar
57*c83a76b0SSuyog Pawar #include "ihevc_defs.h"
58*c83a76b0SSuyog Pawar #include "ihevc_structs.h"
59*c83a76b0SSuyog Pawar #include "ihevc_platform_macros.h"
60*c83a76b0SSuyog Pawar #include "ihevc_deblk.h"
61*c83a76b0SSuyog Pawar #include "ihevc_itrans_recon.h"
62*c83a76b0SSuyog Pawar #include "ihevc_chroma_itrans_recon.h"
63*c83a76b0SSuyog Pawar #include "ihevc_chroma_intra_pred.h"
64*c83a76b0SSuyog Pawar #include "ihevc_intra_pred.h"
65*c83a76b0SSuyog Pawar #include "ihevc_inter_pred.h"
66*c83a76b0SSuyog Pawar #include "ihevc_mem_fns.h"
67*c83a76b0SSuyog Pawar #include "ihevc_padding.h"
68*c83a76b0SSuyog Pawar #include "ihevc_weighted_pred.h"
69*c83a76b0SSuyog Pawar #include "ihevc_sao.h"
70*c83a76b0SSuyog Pawar #include "ihevc_resi_trans.h"
71*c83a76b0SSuyog Pawar #include "ihevc_quant_iquant_ssd.h"
72*c83a76b0SSuyog Pawar #include "ihevc_cabac_tables.h"
73*c83a76b0SSuyog Pawar
74*c83a76b0SSuyog Pawar #include "ihevce_defs.h"
75*c83a76b0SSuyog Pawar #include "ihevce_lap_enc_structs.h"
76*c83a76b0SSuyog Pawar #include "ihevce_multi_thrd_structs.h"
77*c83a76b0SSuyog Pawar #include "ihevce_multi_thrd_funcs.h"
78*c83a76b0SSuyog Pawar #include "ihevce_me_common_defs.h"
79*c83a76b0SSuyog Pawar #include "ihevce_had_satd.h"
80*c83a76b0SSuyog Pawar #include "ihevce_error_codes.h"
81*c83a76b0SSuyog Pawar #include "ihevce_bitstream.h"
82*c83a76b0SSuyog Pawar #include "ihevce_cabac.h"
83*c83a76b0SSuyog Pawar #include "ihevce_rdoq_macros.h"
84*c83a76b0SSuyog Pawar #include "ihevce_function_selector.h"
85*c83a76b0SSuyog Pawar #include "ihevce_enc_structs.h"
86*c83a76b0SSuyog Pawar #include "ihevce_cmn_utils_instr_set_router.h"
87*c83a76b0SSuyog Pawar #include "hme_datatype.h"
88*c83a76b0SSuyog Pawar #include "hme_interface.h"
89*c83a76b0SSuyog Pawar #include "hme_common_defs.h"
90*c83a76b0SSuyog Pawar #include "hme_defs.h"
91*c83a76b0SSuyog Pawar
92*c83a76b0SSuyog Pawar /*****************************************************************************/
93*c83a76b0SSuyog Pawar /* Function Definitions */
94*c83a76b0SSuyog Pawar /*****************************************************************************/
95*c83a76b0SSuyog Pawar
/**
*******************************************************************************
*
* @brief
*  4x4 Hadamard transform of the residue (source - prediction), 8-bit input
*
* @par Description:
*  Each of the 4 residue rows goes through a horizontal 4-point butterfly
*  into a local buffer. A vertical 4-point butterfly on every column of that
*  buffer then produces the 16 output coefficients. No scaling or rounding
*  is applied.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source block
*
* @param[in] src_strd
*  WORD32 Source stride
*
* @param[in] pu1_pred
*  UWORD8 pointer to the prediction block
*
* @param[in] pred_strd
*  WORD32 Pred stride
*
* @param[out] pi2_dst
*  WORD16 pointer to the buffer receiving the 4x4 coefficients
*
* @param[in] dst_strd
*  WORD32 Destination stride (in WORD16 elements)
*
* @returns none
*
*******************************************************************************
*/
static void ihevce_hadamard_4x4_8bit(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    WORD16 ai2_horz[4][4];
    WORD32 row, col;

    /*===== horizontal pass : one residue row per iteration =====*/
    for(row = 0; row < 4; row++)
    {
        const UWORD8 *pu1_src_row = pu1_src + row * src_strd;
        const UWORD8 *pu1_pred_row = pu1_pred + row * pred_strd;

        /* Residue samples of this row */
        WORD32 d0 = pu1_src_row[0] - pu1_pred_row[0];
        WORD32 d1 = pu1_src_row[1] - pu1_pred_row[1];
        WORD32 d2 = pu1_src_row[2] - pu1_pred_row[2];
        WORD32 d3 = pu1_src_row[3] - pu1_pred_row[3];

        /* First butterfly stage */
        WORD32 sum01 = d0 + d1;
        WORD32 dif01 = d0 - d1;
        WORD32 sum23 = d2 + d3;
        WORD32 dif23 = d2 - d3;

        /* Second butterfly stage */
        ai2_horz[row][0] = sum01 + sum23;
        ai2_horz[row][1] = dif01 + dif23;
        ai2_horz[row][2] = sum01 - sum23;
        ai2_horz[row][3] = dif01 - dif23;
    }

    /*===== vertical pass : one column per iteration =====*/
    for(col = 0; col < 4; col++)
    {
        WORD32 sum_top = ai2_horz[0][col] + ai2_horz[1][col];
        WORD32 dif_top = ai2_horz[0][col] - ai2_horz[1][col];
        WORD32 sum_bot = ai2_horz[2][col] + ai2_horz[3][col];
        WORD32 dif_bot = ai2_horz[2][col] - ai2_horz[3][col];

        pi2_dst[col] = sum_top + sum_bot;
        pi2_dst[dst_strd + col] = dif_top + dif_bot;
        pi2_dst[2 * dst_strd + col] = sum_top - sum_bot;
        pi2_dst[3 * dst_strd + col] = dif_top - dif_bot;
    }
}
149*c83a76b0SSuyog Pawar
/**
*******************************************************************************
*
* @brief
*  8x8 Hadamard transform of the residue, built from four 4x4 transforms
*
* @par Description:
*  The four 4x4 quadrants (y0 top-left, y1 top-right, y2 bottom-left,
*  y3 bottom-right) are transformed directly into pi2_dst. Co-located
*  coefficients of the four children are then combined in place:
*      | (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3) |
*      | (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3) |
*
* @param[in] pu1_src
*  UWORD8 pointer to the source block
*
* @param[in] src_strd
*  WORD32 Source stride
*
* @param[in] pu1_pred
*  UWORD8 pointer to the prediction block
*
* @param[in] pred_strd
*  WORD32 Pred stride
*
* @param[out] pi2_dst
*  WORD16 pointer to the buffer receiving the 8x8 coefficients
*
* @param[in] dst_strd
*  WORD32 Destination stride (in WORD16 elements)
*
* @returns none
*
*******************************************************************************
*/
static void ihevce_hadamard_8x8_8bit(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    WORD32 quad, row, col;

    /* Child 4x4 transforms, visited in the order y0, y1, y2, y3 */
    for(quad = 0; quad < 4; quad++)
    {
        WORD32 x_off = (quad & 1) * 4;
        WORD32 y_off = (quad >> 1) * 4;

        ihevce_hadamard_4x4_8bit(
            pu1_src + y_off * src_strd + x_off,
            src_strd,
            pu1_pred + y_off * pred_strd + x_off,
            pred_strd,
            pi2_dst + y_off * dst_strd + x_off,
            dst_strd);
    }

    /* Combine co-located child coefficients into the parent result */
    for(row = 0; row < 4; row++)
    {
        WORD16 *pi2_top = pi2_dst + row * dst_strd;
        WORD16 *pi2_bot = pi2_top + 4 * dst_strd;

        for(col = 0; col < 4; col++)
        {
            WORD16 y0 = pi2_top[col];
            WORD16 y1 = pi2_top[col + 4];
            WORD16 y2 = pi2_bot[col];
            WORD16 y3 = pi2_bot[col + 4];

            WORD16 top_sum = (y0 + y1);
            WORD16 top_dif = (y0 - y1);
            WORD16 bot_sum = (y2 + y3);
            WORD16 bot_dif = (y2 - y3);

            pi2_top[col] = top_sum + bot_sum;
            pi2_top[col + 4] = top_dif + bot_dif;
            pi2_bot[col] = top_sum - bot_sum;
            pi2_bot[col + 4] = top_dif - bot_dif;
        }
    }
}
205*c83a76b0SSuyog Pawar
/**
*******************************************************************************
*
* @brief
*  16x16 Hadamard transform of the residue, built from four 8x8 transforms
*
* @par Description:
*  The four 8x8 quadrants (y0 top-left, y1 top-right, y2 bottom-left,
*  y3 bottom-right) are transformed directly into pi2_dst. Co-located
*  coefficients of the four children are then combined in place:
*      | (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3) |
*      | (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3) |
*  Each first-stage sum / difference is right shifted by 1 before the second
*  stage is applied.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source block
*
* @param[in] src_strd
*  WORD32 Source stride
*
* @param[in] pu1_pred
*  UWORD8 pointer to the prediction block
*
* @param[in] pred_strd
*  WORD32 Pred stride
*
* @param[out] pi2_dst
*  WORD16 pointer to the buffer receiving the 16x16 coefficients
*
* @param[in] dst_strd
*  WORD32 Destination stride (in WORD16 elements)
*
* @returns none
*
*******************************************************************************
*/
static void ihevce_hadamard_16x16_8bit(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    WORD32 quad, row, col;

    /* Child 8x8 transforms, visited in the order y0, y1, y2, y3 */
    for(quad = 0; quad < 4; quad++)
    {
        WORD32 x_off = (quad & 1) * 8;
        WORD32 y_off = (quad >> 1) * 8;

        ihevce_hadamard_8x8_8bit(
            pu1_src + y_off * src_strd + x_off,
            src_strd,
            pu1_pred + y_off * pred_strd + x_off,
            pred_strd,
            pi2_dst + y_off * dst_strd + x_off,
            dst_strd);
    }

    /* Combine co-located child coefficients into the parent result */
    for(row = 0; row < 8; row++)
    {
        WORD16 *pi2_top = pi2_dst + row * dst_strd;
        WORD16 *pi2_bot = pi2_top + 8 * dst_strd;

        for(col = 0; col < 8; col++)
        {
            WORD16 y0 = pi2_top[col];
            WORD16 y1 = pi2_top[col + 8];
            WORD16 y2 = pi2_bot[col];
            WORD16 y3 = pi2_bot[col + 8];

            /* First stage results are halved before the second stage */
            WORD16 top_sum = (y0 + y1) >> 1;
            WORD16 top_dif = (y0 - y1) >> 1;
            WORD16 bot_sum = (y2 + y3) >> 1;
            WORD16 bot_dif = (y2 - y3) >> 1;

            pi2_top[col] = top_sum + bot_sum;
            pi2_top[col + 8] = top_dif + bot_dif;
            pi2_bot[col] = top_sum - bot_sum;
            pi2_bot[col + 8] = top_dif - bot_dif;
        }
    }
}
261*c83a76b0SSuyog Pawar
/**
*******************************************************************************
*
* @brief
*  32x32 Hadamard transform of the residue, built from four 16x16 transforms
*
* @par Description:
*  The four 16x16 quadrants (y0 top-left, y1 top-right, y2 bottom-left,
*  y3 bottom-right) are transformed directly into pi2_dst. Co-located
*  coefficients of the four children are then combined in place:
*      | (y0 + y1) + (y2 + y3)    (y0 - y1) + (y2 - y3) |
*      | (y0 + y1) - (y2 + y3)    (y0 - y1) - (y2 - y3) |
*  Every child coefficient is right shifted by 2 before it enters the
*  combination.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source block
*
* @param[in] src_strd
*  WORD32 Source stride
*
* @param[in] pu1_pred
*  UWORD8 pointer to the prediction block
*
* @param[in] pred_strd
*  WORD32 Pred stride
*
* @param[out] pi2_dst
*  WORD16 pointer to the buffer receiving the 32x32 coefficients
*
* @param[in] dst_strd
*  WORD32 Destination stride (in WORD16 elements)
*
* @returns none
*
*******************************************************************************
*/
static void ihevce_hadamard_32x32_8bit(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    WORD16 *pi2_dst,
    WORD32 dst_strd)
{
    WORD32 quad, row, col;

    /* Child 16x16 transforms, visited in the order y0, y1, y2, y3 */
    for(quad = 0; quad < 4; quad++)
    {
        WORD32 x_off = (quad & 1) * 16;
        WORD32 y_off = (quad >> 1) * 16;

        ihevce_hadamard_16x16_8bit(
            pu1_src + y_off * src_strd + x_off,
            src_strd,
            pu1_pred + y_off * pred_strd + x_off,
            pred_strd,
            pi2_dst + y_off * dst_strd + x_off,
            dst_strd);
    }

    /* Combine co-located child coefficients into the parent result */
    for(row = 0; row < 16; row++)
    {
        WORD16 *pi2_top = pi2_dst + row * dst_strd;
        WORD16 *pi2_bot = pi2_top + 16 * dst_strd;

        for(col = 0; col < 16; col++)
        {
            /* Child coefficients are scaled down by 4 on load */
            WORD16 y0 = pi2_top[col] >> 2;
            WORD16 y1 = pi2_top[col + 16] >> 2;
            WORD16 y2 = pi2_bot[col] >> 2;
            WORD16 y3 = pi2_bot[col + 16] >> 2;

            WORD16 top_sum = (y0 + y1);
            WORD16 top_dif = (y0 - y1);
            WORD16 bot_sum = (y2 + y3);
            WORD16 bot_dif = (y2 - y3);

            pi2_top[col] = top_sum + bot_sum;
            pi2_top[col + 16] = top_dif + bot_dif;
            pi2_bot[col] = top_sum - bot_sum;
            pi2_bot[col + 16] = top_dif - bot_dif;
        }
    }
}
318*c83a76b0SSuyog Pawar
319*c83a76b0SSuyog Pawar /**
320*c83a76b0SSuyog Pawar *******************************************************************************
321*c83a76b0SSuyog Pawar *
322*c83a76b0SSuyog Pawar * @brief
323*c83a76b0SSuyog Pawar * Compute Hadamard sad for 4x4 block with 8-bit input
324*c83a76b0SSuyog Pawar *
325*c83a76b0SSuyog Pawar * @par Description:
326*c83a76b0SSuyog Pawar *
327*c83a76b0SSuyog Pawar * @param[in] pu1_origin
328*c83a76b0SSuyog Pawar * UWORD8 pointer to the current block
329*c83a76b0SSuyog Pawar *
330*c83a76b0SSuyog Pawar * @param[in] src_strd
331*c83a76b0SSuyog Pawar * WORD32 Source stride
332*c83a76b0SSuyog Pawar *
333*c83a76b0SSuyog Pawar * @param[in] pu1_pred_buf
334*c83a76b0SSuyog Pawar * UWORD8 pointer to the prediction block
335*c83a76b0SSuyog Pawar *
336*c83a76b0SSuyog Pawar * @param[in] pred_strd
337*c83a76b0SSuyog Pawar * WORD32 Pred stride
338*c83a76b0SSuyog Pawar *
339*c83a76b0SSuyog Pawar * @param[in] pi2_dst
340*c83a76b0SSuyog Pawar * WORD16 pointer to the transform block
341*c83a76b0SSuyog Pawar *
342*c83a76b0SSuyog Pawar * @param[in] dst_strd
343*c83a76b0SSuyog Pawar * WORD32 Destination stride
344*c83a76b0SSuyog Pawar *
345*c83a76b0SSuyog Pawar * @param[in] size
346*c83a76b0SSuyog Pawar * WORD32 transform Block size
347*c83a76b0SSuyog Pawar *
348*c83a76b0SSuyog Pawar * @returns hadamard SAD
349*c83a76b0SSuyog Pawar *
350*c83a76b0SSuyog Pawar * @remarks
351*c83a76b0SSuyog Pawar * Not updating the transform destination now. Only returning the SATD
352*c83a76b0SSuyog Pawar *
353*c83a76b0SSuyog Pawar *******************************************************************************
354*c83a76b0SSuyog Pawar */
ihevce_HAD_4x4_8bit(UWORD8 * pu1_origin,WORD32 src_strd,UWORD8 * pu1_pred_buf,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd)355*c83a76b0SSuyog Pawar UWORD32 ihevce_HAD_4x4_8bit(
356*c83a76b0SSuyog Pawar UWORD8 *pu1_origin,
357*c83a76b0SSuyog Pawar WORD32 src_strd,
358*c83a76b0SSuyog Pawar UWORD8 *pu1_pred_buf,
359*c83a76b0SSuyog Pawar WORD32 pred_strd,
360*c83a76b0SSuyog Pawar WORD16 *pi2_dst,
361*c83a76b0SSuyog Pawar WORD32 dst_strd)
362*c83a76b0SSuyog Pawar {
363*c83a76b0SSuyog Pawar WORD32 k;
364*c83a76b0SSuyog Pawar WORD16 v[16];
365*c83a76b0SSuyog Pawar UWORD32 u4_sad = 0;
366*c83a76b0SSuyog Pawar
367*c83a76b0SSuyog Pawar (void)pi2_dst;
368*c83a76b0SSuyog Pawar (void)dst_strd;
369*c83a76b0SSuyog Pawar ihevce_hadamard_4x4_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 4);
370*c83a76b0SSuyog Pawar
371*c83a76b0SSuyog Pawar for(k = 0; k < 16; ++k)
372*c83a76b0SSuyog Pawar u4_sad += abs(v[k]);
373*c83a76b0SSuyog Pawar u4_sad = ((u4_sad + 2) >> 2);
374*c83a76b0SSuyog Pawar
375*c83a76b0SSuyog Pawar return u4_sad;
376*c83a76b0SSuyog Pawar }
377*c83a76b0SSuyog Pawar
378*c83a76b0SSuyog Pawar /**
379*c83a76b0SSuyog Pawar *******************************************************************************
380*c83a76b0SSuyog Pawar *
381*c83a76b0SSuyog Pawar * @brief
382*c83a76b0SSuyog Pawar * Computes Hadamard Sad for 8x8 block with 8-bit input
383*c83a76b0SSuyog Pawar *
384*c83a76b0SSuyog Pawar * @par Description:
385*c83a76b0SSuyog Pawar *
386*c83a76b0SSuyog Pawar * @param[in] pu1_origin
387*c83a76b0SSuyog Pawar * UWORD8 pointer to the current block
388*c83a76b0SSuyog Pawar *
389*c83a76b0SSuyog Pawar * @param[in] src_strd
390*c83a76b0SSuyog Pawar * WORD32 Source stride
391*c83a76b0SSuyog Pawar *
392*c83a76b0SSuyog Pawar * @param[in] pu1_pred_buf
393*c83a76b0SSuyog Pawar * UWORD8 pointer to the prediction block
394*c83a76b0SSuyog Pawar *
395*c83a76b0SSuyog Pawar * @param[in] pred_strd
396*c83a76b0SSuyog Pawar * WORD32 Pred stride
397*c83a76b0SSuyog Pawar *
398*c83a76b0SSuyog Pawar * @param[in] pi2_dst
399*c83a76b0SSuyog Pawar * WORD16 pointer to the transform block
400*c83a76b0SSuyog Pawar *
401*c83a76b0SSuyog Pawar * @param[in] dst_strd
402*c83a76b0SSuyog Pawar * WORD32 Destination stride
403*c83a76b0SSuyog Pawar *
404*c83a76b0SSuyog Pawar * @param[in] size
405*c83a76b0SSuyog Pawar * WORD32 transform Block size
406*c83a76b0SSuyog Pawar *
407*c83a76b0SSuyog Pawar * @returns Hadamard SAD
408*c83a76b0SSuyog Pawar *
409*c83a76b0SSuyog Pawar * @remarks
410*c83a76b0SSuyog Pawar * Not updating the transform destination now. Only returning the SATD
411*c83a76b0SSuyog Pawar *
412*c83a76b0SSuyog Pawar *******************************************************************************
413*c83a76b0SSuyog Pawar */
ihevce_HAD_8x8_8bit(UWORD8 * pu1_origin,WORD32 src_strd,UWORD8 * pu1_pred_buf,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd)414*c83a76b0SSuyog Pawar UWORD32 ihevce_HAD_8x8_8bit(
415*c83a76b0SSuyog Pawar UWORD8 *pu1_origin,
416*c83a76b0SSuyog Pawar WORD32 src_strd,
417*c83a76b0SSuyog Pawar UWORD8 *pu1_pred_buf,
418*c83a76b0SSuyog Pawar WORD32 pred_strd,
419*c83a76b0SSuyog Pawar WORD16 *pi2_dst,
420*c83a76b0SSuyog Pawar WORD32 dst_strd)
421*c83a76b0SSuyog Pawar {
422*c83a76b0SSuyog Pawar WORD32 k;
423*c83a76b0SSuyog Pawar UWORD32 u4_sad = 0;
424*c83a76b0SSuyog Pawar WORD16 v[64];
425*c83a76b0SSuyog Pawar
426*c83a76b0SSuyog Pawar (void)pi2_dst;
427*c83a76b0SSuyog Pawar (void)dst_strd;
428*c83a76b0SSuyog Pawar ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8);
429*c83a76b0SSuyog Pawar
430*c83a76b0SSuyog Pawar for(k = 0; k < 64; ++k)
431*c83a76b0SSuyog Pawar u4_sad += abs(v[k]);
432*c83a76b0SSuyog Pawar u4_sad = ((u4_sad + 4) >> 3);
433*c83a76b0SSuyog Pawar
434*c83a76b0SSuyog Pawar return u4_sad;
435*c83a76b0SSuyog Pawar }
436*c83a76b0SSuyog Pawar
437*c83a76b0SSuyog Pawar /**
438*c83a76b0SSuyog Pawar *******************************************************************************
439*c83a76b0SSuyog Pawar *
440*c83a76b0SSuyog Pawar * @brief
441*c83a76b0SSuyog Pawar * Compute dc suppressed hadamard sad for 8x8 block with 8-bit input
442*c83a76b0SSuyog Pawar *
443*c83a76b0SSuyog Pawar * @par Description:
444*c83a76b0SSuyog Pawar *
445*c83a76b0SSuyog Pawar * @param[in] pu1_origin
446*c83a76b0SSuyog Pawar * UWORD8 pointer to the current block
447*c83a76b0SSuyog Pawar *
448*c83a76b0SSuyog Pawar * @param[in] src_strd
449*c83a76b0SSuyog Pawar * WORD32 Source stride
450*c83a76b0SSuyog Pawar *
451*c83a76b0SSuyog Pawar * @param[in] pu1_pred_buf
452*c83a76b0SSuyog Pawar * UWORD8 pointer to the prediction block
453*c83a76b0SSuyog Pawar *
454*c83a76b0SSuyog Pawar * @param[in] pred_strd
455*c83a76b0SSuyog Pawar * WORD32 Pred stride
456*c83a76b0SSuyog Pawar *
457*c83a76b0SSuyog Pawar * @param[in] pi2_dst
458*c83a76b0SSuyog Pawar * WORD16 pointer to the transform block
459*c83a76b0SSuyog Pawar *
460*c83a76b0SSuyog Pawar * @param[in] dst_strd
461*c83a76b0SSuyog Pawar * WORD32 Destination stride
462*c83a76b0SSuyog Pawar *
463*c83a76b0SSuyog Pawar * @param[in] size
464*c83a76b0SSuyog Pawar * WORD32 transform Block size
465*c83a76b0SSuyog Pawar *
466*c83a76b0SSuyog Pawar * @returns Hadamard SAD with DC Suppressed
467*c83a76b0SSuyog Pawar *
468*c83a76b0SSuyog Pawar * @remarks
469*c83a76b0SSuyog Pawar * Not updating the transform destination now. Only returning the SATD
470*c83a76b0SSuyog Pawar *
471*c83a76b0SSuyog Pawar *******************************************************************************
472*c83a76b0SSuyog Pawar */
ihevce_compute_ac_had_8x8_8bit(UWORD8 * pu1_origin,WORD32 src_strd,UWORD8 * pu1_pred_buf,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd)473*c83a76b0SSuyog Pawar UWORD32 ihevce_compute_ac_had_8x8_8bit(
474*c83a76b0SSuyog Pawar UWORD8 *pu1_origin,
475*c83a76b0SSuyog Pawar WORD32 src_strd,
476*c83a76b0SSuyog Pawar UWORD8 *pu1_pred_buf,
477*c83a76b0SSuyog Pawar WORD32 pred_strd,
478*c83a76b0SSuyog Pawar WORD16 *pi2_dst,
479*c83a76b0SSuyog Pawar WORD32 dst_strd)
480*c83a76b0SSuyog Pawar {
481*c83a76b0SSuyog Pawar WORD32 k;
482*c83a76b0SSuyog Pawar UWORD32 u4_sad = 0;
483*c83a76b0SSuyog Pawar WORD16 v[64];
484*c83a76b0SSuyog Pawar
485*c83a76b0SSuyog Pawar (void)pi2_dst;
486*c83a76b0SSuyog Pawar (void)dst_strd;
487*c83a76b0SSuyog Pawar ihevce_hadamard_8x8_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 8);
488*c83a76b0SSuyog Pawar
489*c83a76b0SSuyog Pawar v[0] = 0;
490*c83a76b0SSuyog Pawar for(k = 0; k < 64; ++k)
491*c83a76b0SSuyog Pawar u4_sad += abs(v[k]);
492*c83a76b0SSuyog Pawar u4_sad = ((u4_sad + 4) >> 3);
493*c83a76b0SSuyog Pawar
494*c83a76b0SSuyog Pawar return u4_sad;
495*c83a76b0SSuyog Pawar }
496*c83a76b0SSuyog Pawar
497*c83a76b0SSuyog Pawar /**
498*c83a76b0SSuyog Pawar *******************************************************************************
499*c83a76b0SSuyog Pawar *
500*c83a76b0SSuyog Pawar * @brief
501*c83a76b0SSuyog Pawar * Computes Hadamard Sad for 16x16 block with 8-bit input
502*c83a76b0SSuyog Pawar *
503*c83a76b0SSuyog Pawar * @par Description:
504*c83a76b0SSuyog Pawar *
505*c83a76b0SSuyog Pawar * @param[in] pu1_origin
506*c83a76b0SSuyog Pawar * UWORD8 pointer to the current block
507*c83a76b0SSuyog Pawar *
508*c83a76b0SSuyog Pawar * @param[in] src_strd
509*c83a76b0SSuyog Pawar * WORD32 Source stride
510*c83a76b0SSuyog Pawar *
511*c83a76b0SSuyog Pawar * @param[in] pu1_pred_buf
512*c83a76b0SSuyog Pawar * UWORD8 pointer to the prediction block
513*c83a76b0SSuyog Pawar *
514*c83a76b0SSuyog Pawar * @param[in] pred_strd
515*c83a76b0SSuyog Pawar * WORD32 Pred stride
516*c83a76b0SSuyog Pawar *
517*c83a76b0SSuyog Pawar * @param[in] pi2_dst
518*c83a76b0SSuyog Pawar * WORD16 pointer to the transform block
519*c83a76b0SSuyog Pawar *
520*c83a76b0SSuyog Pawar * @param[in] dst_strd
521*c83a76b0SSuyog Pawar * WORD32 Destination stride
522*c83a76b0SSuyog Pawar *
523*c83a76b0SSuyog Pawar * @param[in] size
524*c83a76b0SSuyog Pawar * WORD32 transform Block size
525*c83a76b0SSuyog Pawar *
526*c83a76b0SSuyog Pawar * @returns Hadamard SAD
527*c83a76b0SSuyog Pawar *
528*c83a76b0SSuyog Pawar * @remarks
529*c83a76b0SSuyog Pawar * Not updating the transform destination now. Only returning the SATD
530*c83a76b0SSuyog Pawar *
531*c83a76b0SSuyog Pawar *******************************************************************************
532*c83a76b0SSuyog Pawar */
ihevce_HAD_16x16_8bit(UWORD8 * pu1_origin,WORD32 src_strd,UWORD8 * pu1_pred_buf,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd)533*c83a76b0SSuyog Pawar UWORD32 ihevce_HAD_16x16_8bit(
534*c83a76b0SSuyog Pawar UWORD8 *pu1_origin,
535*c83a76b0SSuyog Pawar WORD32 src_strd,
536*c83a76b0SSuyog Pawar UWORD8 *pu1_pred_buf,
537*c83a76b0SSuyog Pawar WORD32 pred_strd,
538*c83a76b0SSuyog Pawar WORD16 *pi2_dst,
539*c83a76b0SSuyog Pawar WORD32 dst_strd)
540*c83a76b0SSuyog Pawar {
541*c83a76b0SSuyog Pawar WORD32 k;
542*c83a76b0SSuyog Pawar UWORD32 u4_sad = 0;
543*c83a76b0SSuyog Pawar WORD16 v[256];
544*c83a76b0SSuyog Pawar
545*c83a76b0SSuyog Pawar (void)pi2_dst;
546*c83a76b0SSuyog Pawar (void)dst_strd;
547*c83a76b0SSuyog Pawar ihevce_hadamard_16x16_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 16);
548*c83a76b0SSuyog Pawar
549*c83a76b0SSuyog Pawar for(k = 0; k < 256; ++k)
550*c83a76b0SSuyog Pawar u4_sad += abs(v[k]);
551*c83a76b0SSuyog Pawar u4_sad = ((u4_sad + 4) >> 3);
552*c83a76b0SSuyog Pawar
553*c83a76b0SSuyog Pawar return u4_sad;
554*c83a76b0SSuyog Pawar }
555*c83a76b0SSuyog Pawar
556*c83a76b0SSuyog Pawar /**
557*c83a76b0SSuyog Pawar *******************************************************************************
558*c83a76b0SSuyog Pawar *
559*c83a76b0SSuyog Pawar * @brief
560*c83a76b0SSuyog Pawar * Computes Hadamard Sad for 32x32 block with 8-bit input
561*c83a76b0SSuyog Pawar *
562*c83a76b0SSuyog Pawar * @par Description:
563*c83a76b0SSuyog Pawar *
564*c83a76b0SSuyog Pawar * @param[in] pu1_origin
565*c83a76b0SSuyog Pawar * UWORD8 pointer to the current block
566*c83a76b0SSuyog Pawar *
567*c83a76b0SSuyog Pawar * @param[in] src_strd
568*c83a76b0SSuyog Pawar * WORD32 Source stride
569*c83a76b0SSuyog Pawar *
570*c83a76b0SSuyog Pawar * @param[in] pu1_pred_buf
571*c83a76b0SSuyog Pawar * UWORD8 pointer to the prediction block
572*c83a76b0SSuyog Pawar *
573*c83a76b0SSuyog Pawar * @param[in] pred_strd
574*c83a76b0SSuyog Pawar * WORD32 Pred stride
575*c83a76b0SSuyog Pawar *
576*c83a76b0SSuyog Pawar * @param[in] pi2_dst
577*c83a76b0SSuyog Pawar * WORD16 pointer to the transform block
578*c83a76b0SSuyog Pawar *
579*c83a76b0SSuyog Pawar * @param[in] dst_strd
580*c83a76b0SSuyog Pawar * WORD32 Destination stride
581*c83a76b0SSuyog Pawar *
582*c83a76b0SSuyog Pawar * @param[in] size
583*c83a76b0SSuyog Pawar * WORD32 transform Block size
584*c83a76b0SSuyog Pawar *
585*c83a76b0SSuyog Pawar * @returns Hadamard SAD
586*c83a76b0SSuyog Pawar *
587*c83a76b0SSuyog Pawar * @remarks
588*c83a76b0SSuyog Pawar * Not updating the transform destination now. Only returning the SATD
589*c83a76b0SSuyog Pawar *
590*c83a76b0SSuyog Pawar *******************************************************************************
591*c83a76b0SSuyog Pawar */
ihevce_HAD_32x32_8bit(UWORD8 * pu1_origin,WORD32 src_strd,UWORD8 * pu1_pred_buf,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd)592*c83a76b0SSuyog Pawar UWORD32 ihevce_HAD_32x32_8bit(
593*c83a76b0SSuyog Pawar UWORD8 *pu1_origin,
594*c83a76b0SSuyog Pawar WORD32 src_strd,
595*c83a76b0SSuyog Pawar UWORD8 *pu1_pred_buf,
596*c83a76b0SSuyog Pawar WORD32 pred_strd,
597*c83a76b0SSuyog Pawar WORD16 *pi2_dst,
598*c83a76b0SSuyog Pawar WORD32 dst_strd)
599*c83a76b0SSuyog Pawar {
600*c83a76b0SSuyog Pawar WORD32 k;
601*c83a76b0SSuyog Pawar UWORD32 u4_sad = 0;
602*c83a76b0SSuyog Pawar WORD16 v[32 * 32];
603*c83a76b0SSuyog Pawar
604*c83a76b0SSuyog Pawar (void)pi2_dst;
605*c83a76b0SSuyog Pawar (void)dst_strd;
606*c83a76b0SSuyog Pawar ihevce_hadamard_32x32_8bit(pu1_origin, src_strd, pu1_pred_buf, pred_strd, v, 32);
607*c83a76b0SSuyog Pawar
608*c83a76b0SSuyog Pawar for(k = 0; k < 32 * 32; ++k)
609*c83a76b0SSuyog Pawar u4_sad += abs(v[k]);
610*c83a76b0SSuyog Pawar u4_sad = ((u4_sad + 2) >> 2);
611*c83a76b0SSuyog Pawar
612*c83a76b0SSuyog Pawar return u4_sad;
613*c83a76b0SSuyog Pawar }
614*c83a76b0SSuyog Pawar
615*c83a76b0SSuyog Pawar //#if COMPUTE_16x16_R == C
616*c83a76b0SSuyog Pawar /**
617*c83a76b0SSuyog Pawar *******************************************************************************
618*c83a76b0SSuyog Pawar *
619*c83a76b0SSuyog Pawar * @brief
620*c83a76b0SSuyog Pawar * Computes 8x8 transform using children 4x4 hadamard results
621*c83a76b0SSuyog Pawar *
622*c83a76b0SSuyog Pawar * @par Description:
623*c83a76b0SSuyog Pawar *
624*c83a76b0SSuyog Pawar * @param[in] pi2_4x4_had
625*c83a76b0SSuyog Pawar * WORD16 pointer to 4x4 hadamard buffer (y0, y1, y2, y3 hadamard in Zscan order)
626*c83a76b0SSuyog Pawar *
627*c83a76b0SSuyog Pawar * @param[in] had4_strd
628*c83a76b0SSuyog Pawar * stride of 4x4 hadamard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
629*c83a76b0SSuyog Pawar *
630*c83a76b0SSuyog Pawar * @param[out] pi2_dst
631*c83a76b0SSuyog Pawar * destination buffer where 8x8 hadamard result is stored
632*c83a76b0SSuyog Pawar *
633*c83a76b0SSuyog Pawar * @param[in] dst_strd
634*c83a76b0SSuyog Pawar * stride of destination block
635*c83a76b0SSuyog Pawar *
636*c83a76b0SSuyog Pawar * @param[in] i4_frm_qstep
637*c83a76b0SSuyog Pawar * frm_qstep value based on the which the threshold value is calculated
638*c83a76b0SSuyog Pawar *
639*c83a76b0SSuyog Pawar * @returns
640*c83a76b0SSuyog Pawar * 8x8 Hadamard SATD
641*c83a76b0SSuyog Pawar * @remarks
642*c83a76b0SSuyog Pawar *
643*c83a76b0SSuyog Pawar *******************************************************************************
644*c83a76b0SSuyog Pawar */
ihevce_compute_8x8HAD_using_4x4(WORD16 * pi2_4x4_had,WORD32 had4_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 i4_frm_qstep,WORD32 * pi4_cbf)645*c83a76b0SSuyog Pawar static UWORD32 ihevce_compute_8x8HAD_using_4x4(
646*c83a76b0SSuyog Pawar WORD16 *pi2_4x4_had,
647*c83a76b0SSuyog Pawar WORD32 had4_strd,
648*c83a76b0SSuyog Pawar WORD16 *pi2_dst,
649*c83a76b0SSuyog Pawar WORD32 dst_strd,
650*c83a76b0SSuyog Pawar WORD32 i4_frm_qstep,
651*c83a76b0SSuyog Pawar WORD32 *pi4_cbf)
652*c83a76b0SSuyog Pawar {
653*c83a76b0SSuyog Pawar /* Qstep value is right shifted by 8 */
654*c83a76b0SSuyog Pawar WORD32 threshold = (i4_frm_qstep >> 8);
655*c83a76b0SSuyog Pawar
656*c83a76b0SSuyog Pawar /* Initialize pointers to 4 subblocks of 4x4 HAD buffer */
657*c83a76b0SSuyog Pawar WORD16 *pi2_y0 = pi2_4x4_had;
658*c83a76b0SSuyog Pawar WORD16 *pi2_y1 = pi2_4x4_had + 4;
659*c83a76b0SSuyog Pawar WORD16 *pi2_y2 = pi2_4x4_had + had4_strd * 4;
660*c83a76b0SSuyog Pawar WORD16 *pi2_y3 = pi2_4x4_had + had4_strd * 4 + 4;
661*c83a76b0SSuyog Pawar
662*c83a76b0SSuyog Pawar /* Initialize pointers to store 8x8 HAD output */
663*c83a76b0SSuyog Pawar WORD16 *pi2_dst0 = pi2_dst;
664*c83a76b0SSuyog Pawar WORD16 *pi2_dst1 = pi2_dst + 4;
665*c83a76b0SSuyog Pawar WORD16 *pi2_dst2 = pi2_dst + dst_strd * 4;
666*c83a76b0SSuyog Pawar WORD16 *pi2_dst3 = pi2_dst + dst_strd * 4 + 4;
667*c83a76b0SSuyog Pawar
668*c83a76b0SSuyog Pawar UWORD32 u4_satd = 0;
669*c83a76b0SSuyog Pawar WORD32 i;
670*c83a76b0SSuyog Pawar
671*c83a76b0SSuyog Pawar /* Child HAD results combined as follows to get Parent result */
672*c83a76b0SSuyog Pawar /* _ _ */
673*c83a76b0SSuyog Pawar /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */
674*c83a76b0SSuyog Pawar /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */
675*c83a76b0SSuyog Pawar /* \- -/ */
676*c83a76b0SSuyog Pawar for(i = 0; i < 16; i++)
677*c83a76b0SSuyog Pawar {
678*c83a76b0SSuyog Pawar WORD32 src_idx = (i >> 2) * had4_strd + (i % 4);
679*c83a76b0SSuyog Pawar WORD32 dst_idx = (i >> 2) * dst_strd + (i % 4);
680*c83a76b0SSuyog Pawar
681*c83a76b0SSuyog Pawar WORD16 a0 = pi2_y0[src_idx];
682*c83a76b0SSuyog Pawar WORD16 a1 = pi2_y1[src_idx];
683*c83a76b0SSuyog Pawar WORD16 a2 = pi2_y2[src_idx];
684*c83a76b0SSuyog Pawar WORD16 a3 = pi2_y3[src_idx];
685*c83a76b0SSuyog Pawar
686*c83a76b0SSuyog Pawar WORD16 b0 = (a0 + a1);
687*c83a76b0SSuyog Pawar WORD16 b1 = (a0 - a1);
688*c83a76b0SSuyog Pawar WORD16 b2 = (a2 + a3);
689*c83a76b0SSuyog Pawar WORD16 b3 = (a2 - a3);
690*c83a76b0SSuyog Pawar
691*c83a76b0SSuyog Pawar pi2_dst0[dst_idx] = b0 + b2;
692*c83a76b0SSuyog Pawar pi2_dst1[dst_idx] = b1 + b3;
693*c83a76b0SSuyog Pawar pi2_dst2[dst_idx] = b0 - b2;
694*c83a76b0SSuyog Pawar pi2_dst3[dst_idx] = b1 - b3;
695*c83a76b0SSuyog Pawar
696*c83a76b0SSuyog Pawar if(ABS(pi2_dst0[dst_idx]) > threshold)
697*c83a76b0SSuyog Pawar *pi4_cbf = 1;
698*c83a76b0SSuyog Pawar if(ABS(pi2_dst1[dst_idx]) > threshold)
699*c83a76b0SSuyog Pawar *pi4_cbf = 1;
700*c83a76b0SSuyog Pawar if(ABS(pi2_dst2[dst_idx]) > threshold)
701*c83a76b0SSuyog Pawar *pi4_cbf = 1;
702*c83a76b0SSuyog Pawar if(ABS(pi2_dst3[dst_idx]) > threshold)
703*c83a76b0SSuyog Pawar *pi4_cbf = 1;
704*c83a76b0SSuyog Pawar
705*c83a76b0SSuyog Pawar u4_satd += ABS(pi2_dst0[dst_idx]);
706*c83a76b0SSuyog Pawar u4_satd += ABS(pi2_dst1[dst_idx]);
707*c83a76b0SSuyog Pawar u4_satd += ABS(pi2_dst2[dst_idx]);
708*c83a76b0SSuyog Pawar u4_satd += ABS(pi2_dst3[dst_idx]);
709*c83a76b0SSuyog Pawar }
710*c83a76b0SSuyog Pawar
711*c83a76b0SSuyog Pawar /* return the 8x8 satd */
712*c83a76b0SSuyog Pawar return (u4_satd);
713*c83a76b0SSuyog Pawar }
714*c83a76b0SSuyog Pawar
715*c83a76b0SSuyog Pawar /**
716*c83a76b0SSuyog Pawar *******************************************************************************
717*c83a76b0SSuyog Pawar *
718*c83a76b0SSuyog Pawar * @brief
719*c83a76b0SSuyog Pawar * Computes Residue and Hadamard Transform for four 4x4 blocks (Z scan) of
720*c83a76b0SSuyog Pawar * a 8x8 block (Residue is computed for 8-bit src and prediction buffers)
721*c83a76b0SSuyog Pawar * Modified to incorporate the dead-zone implementation - Lokesh
722*c83a76b0SSuyog Pawar *
723*c83a76b0SSuyog Pawar * @par Description:
724*c83a76b0SSuyog Pawar *
725*c83a76b0SSuyog Pawar * @param[in] pu1_origin
726*c83a76b0SSuyog Pawar * UWORD8 pointer to the current block
727*c83a76b0SSuyog Pawar *
728*c83a76b0SSuyog Pawar * @param[in] src_strd
729*c83a76b0SSuyog Pawar * WORD32 Source stride
730*c83a76b0SSuyog Pawar *
731*c83a76b0SSuyog Pawar * @param[in] pu1_pred
732*c83a76b0SSuyog Pawar * UWORD8 pointer to the prediction block
733*c83a76b0SSuyog Pawar *
734*c83a76b0SSuyog Pawar * @param[in] pred_strd
735*c83a76b0SSuyog Pawar * WORD32 Pred stride
736*c83a76b0SSuyog Pawar *
737*c83a76b0SSuyog Pawar * @param[out] pi2_dst
738*c83a76b0SSuyog Pawar * WORD16 pointer to the transform block
739*c83a76b0SSuyog Pawar *
740*c83a76b0SSuyog Pawar * @param[in] dst_strd
741*c83a76b0SSuyog Pawar * WORD32 Destination stride
742*c83a76b0SSuyog Pawar *
743*c83a76b0SSuyog Pawar * @param[out] pi4_hsad
744*c83a76b0SSuyog Pawar * array for storing hadmard sad of each 4x4 block
745*c83a76b0SSuyog Pawar *
746*c83a76b0SSuyog Pawar * @param[in] hsad_stride
747*c83a76b0SSuyog Pawar * stride of hadmard sad destination buffer (for Zscan order of storing sads)
748*c83a76b0SSuyog Pawar *
749*c83a76b0SSuyog Pawar * @param[in] i4_frm_qstep
750*c83a76b0SSuyog Pawar * frm_qstep value based on the which the threshold value is calculated
751*c83a76b0SSuyog Pawar *
752*c83a76b0SSuyog Pawar * @returns
753*c83a76b0SSuyog Pawar *
754*c83a76b0SSuyog Pawar * @remarks
755*c83a76b0SSuyog Pawar *
756*c83a76b0SSuyog Pawar *******************************************************************************
757*c83a76b0SSuyog Pawar */
ihevce_had4_4x4(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_pred,WORD32 pred_strd,WORD16 * pi2_dst4x4,WORD32 dst_strd,WORD32 * pi4_hsad,WORD32 hsad_stride,WORD32 i4_frm_qstep)758*c83a76b0SSuyog Pawar static WORD32 ihevce_had4_4x4(
759*c83a76b0SSuyog Pawar UWORD8 *pu1_src,
760*c83a76b0SSuyog Pawar WORD32 src_strd,
761*c83a76b0SSuyog Pawar UWORD8 *pu1_pred,
762*c83a76b0SSuyog Pawar WORD32 pred_strd,
763*c83a76b0SSuyog Pawar WORD16 *pi2_dst4x4,
764*c83a76b0SSuyog Pawar WORD32 dst_strd,
765*c83a76b0SSuyog Pawar WORD32 *pi4_hsad,
766*c83a76b0SSuyog Pawar WORD32 hsad_stride,
767*c83a76b0SSuyog Pawar WORD32 i4_frm_qstep)
768*c83a76b0SSuyog Pawar {
769*c83a76b0SSuyog Pawar WORD32 i, k;
770*c83a76b0SSuyog Pawar WORD32 i4_child_total_sad = 0;
771*c83a76b0SSuyog Pawar
772*c83a76b0SSuyog Pawar (void)i4_frm_qstep;
773*c83a76b0SSuyog Pawar /* -------- Compute four 4x4 HAD Transforms ---------*/
774*c83a76b0SSuyog Pawar for(i = 0; i < 4; i++)
775*c83a76b0SSuyog Pawar {
776*c83a76b0SSuyog Pawar UWORD8 *pu1_pi0, *pu1_pi1;
777*c83a76b0SSuyog Pawar WORD16 *pi2_dst;
778*c83a76b0SSuyog Pawar WORD32 blkx, blky;
779*c83a76b0SSuyog Pawar UWORD32 u4_hsad = 0;
780*c83a76b0SSuyog Pawar // TODO: choose deadzone as f(qstep)
781*c83a76b0SSuyog Pawar WORD32 threshold = 0;
782*c83a76b0SSuyog Pawar
783*c83a76b0SSuyog Pawar /*****************************************************/
784*c83a76b0SSuyog Pawar /* Assuming the looping structure of the four */
785*c83a76b0SSuyog Pawar /* blocks is in Z scan order of 4x4s in a 8x8 */
786*c83a76b0SSuyog Pawar /* block instead of raster scan */
787*c83a76b0SSuyog Pawar /*****************************************************/
788*c83a76b0SSuyog Pawar blkx = (i & 0x1);
789*c83a76b0SSuyog Pawar blky = (i >> 1);
790*c83a76b0SSuyog Pawar
791*c83a76b0SSuyog Pawar pu1_pi0 = pu1_src + (blkx * 4) + (blky * 4 * src_strd);
792*c83a76b0SSuyog Pawar pu1_pi1 = pu1_pred + (blkx * 4) + (blky * 4 * pred_strd);
793*c83a76b0SSuyog Pawar pi2_dst = pi2_dst4x4 + (blkx * 4) + (blky * 4 * dst_strd);
794*c83a76b0SSuyog Pawar
795*c83a76b0SSuyog Pawar ihevce_hadamard_4x4_8bit(pu1_pi0, src_strd, pu1_pi1, pred_strd, pi2_dst, dst_strd);
796*c83a76b0SSuyog Pawar
797*c83a76b0SSuyog Pawar for(k = 0; k < 4; k++)
798*c83a76b0SSuyog Pawar {
799*c83a76b0SSuyog Pawar if(ABS(pi2_dst[0 * dst_strd + k]) < threshold)
800*c83a76b0SSuyog Pawar pi2_dst[0 * dst_strd + k] = 0;
801*c83a76b0SSuyog Pawar
802*c83a76b0SSuyog Pawar if(ABS(pi2_dst[1 * dst_strd + k]) < threshold)
803*c83a76b0SSuyog Pawar pi2_dst[1 * dst_strd + k] = 0;
804*c83a76b0SSuyog Pawar
805*c83a76b0SSuyog Pawar if(ABS(pi2_dst[2 * dst_strd + k]) < threshold)
806*c83a76b0SSuyog Pawar pi2_dst[2 * dst_strd + k] = 0;
807*c83a76b0SSuyog Pawar
808*c83a76b0SSuyog Pawar if(ABS(pi2_dst[3 * dst_strd + k]) < threshold)
809*c83a76b0SSuyog Pawar pi2_dst[3 * dst_strd + k] = 0;
810*c83a76b0SSuyog Pawar
811*c83a76b0SSuyog Pawar /* Accumulate the SATD */
812*c83a76b0SSuyog Pawar u4_hsad += ABS(pi2_dst[0 * dst_strd + k]);
813*c83a76b0SSuyog Pawar u4_hsad += ABS(pi2_dst[1 * dst_strd + k]);
814*c83a76b0SSuyog Pawar u4_hsad += ABS(pi2_dst[2 * dst_strd + k]);
815*c83a76b0SSuyog Pawar u4_hsad += ABS(pi2_dst[3 * dst_strd + k]);
816*c83a76b0SSuyog Pawar }
817*c83a76b0SSuyog Pawar
818*c83a76b0SSuyog Pawar /*===== Normalize the HSAD =====*/
819*c83a76b0SSuyog Pawar pi4_hsad[blkx + (blky * hsad_stride)] = ((u4_hsad + 2) >> 2);
820*c83a76b0SSuyog Pawar i4_child_total_sad += ((u4_hsad + 2) >> 2);
821*c83a76b0SSuyog Pawar }
822*c83a76b0SSuyog Pawar return i4_child_total_sad;
823*c83a76b0SSuyog Pawar }
824*c83a76b0SSuyog Pawar
825*c83a76b0SSuyog Pawar /**
826*c83a76b0SSuyog Pawar *******************************************************************************
827*c83a76b0SSuyog Pawar *
828*c83a76b0SSuyog Pawar * @brief
829*c83a76b0SSuyog Pawar * HSAD is returned for the 4, 4x4 in 8x8
830*c83a76b0SSuyog Pawar *
831*c83a76b0SSuyog Pawar * @par Description:
832*c83a76b0SSuyog Pawar *
833*c83a76b0SSuyog Pawar * @param[in] pu1_origin
834*c83a76b0SSuyog Pawar * UWORD8 pointer to the current block
835*c83a76b0SSuyog Pawar *
836*c83a76b0SSuyog Pawar * @param[in] src_strd
837*c83a76b0SSuyog Pawar * WORD32 Source stride
838*c83a76b0SSuyog Pawar *
839*c83a76b0SSuyog Pawar * @param[in] pu1_pred
840*c83a76b0SSuyog Pawar * UWORD8 pointer to the prediction block
841*c83a76b0SSuyog Pawar *
842*c83a76b0SSuyog Pawar * @param[in] pred_strd
843*c83a76b0SSuyog Pawar * WORD32 Pred stride
844*c83a76b0SSuyog Pawar *
845*c83a76b0SSuyog Pawar * @param[out] pi2_dst
846*c83a76b0SSuyog Pawar * WORD16 pointer to the transform output block
847*c83a76b0SSuyog Pawar *
848*c83a76b0SSuyog Pawar * @param[out] dst_strd
849*c83a76b0SSuyog Pawar * WORD32 Destination stride
850*c83a76b0SSuyog Pawar *
851*c83a76b0SSuyog Pawar * @param[out] ppi4_hsad
852*c83a76b0SSuyog Pawar * pointer to base pointers for storing hadmard sads of various
853*c83a76b0SSuyog Pawar * block sizes (4x4 to 32x32)
854*c83a76b0SSuyog Pawar *
855*c83a76b0SSuyog Pawar * @param[in] pos_x_y_4x4
856*c83a76b0SSuyog Pawar * Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
857*c83a76b0SSuyog Pawar * Lower 16bits denote xpos and upper 16ypos of the 4x4block
858*c83a76b0SSuyog Pawar *
859*c83a76b0SSuyog Pawar * @param[in] num_4x4_in_row
860*c83a76b0SSuyog Pawar * Denotes the number of current 4x4 blocks in a ctb/CU/MB
861*c83a76b0SSuyog Pawar *
862*c83a76b0SSuyog Pawar * @returns
863*c83a76b0SSuyog Pawar *
864*c83a76b0SSuyog Pawar * @remarks
865*c83a76b0SSuyog Pawar *
866*c83a76b0SSuyog Pawar *******************************************************************************
867*c83a76b0SSuyog Pawar */
ihevce_had_8x8_using_4_4x4(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_pred,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 ** ppi4_hsad,WORD32 pos_x_y_4x4,WORD32 num_4x4_in_row)868*c83a76b0SSuyog Pawar void ihevce_had_8x8_using_4_4x4(
869*c83a76b0SSuyog Pawar UWORD8 *pu1_src,
870*c83a76b0SSuyog Pawar WORD32 src_strd,
871*c83a76b0SSuyog Pawar UWORD8 *pu1_pred,
872*c83a76b0SSuyog Pawar WORD32 pred_strd,
873*c83a76b0SSuyog Pawar WORD16 *pi2_dst,
874*c83a76b0SSuyog Pawar WORD32 dst_strd,
875*c83a76b0SSuyog Pawar WORD32 **ppi4_hsad,
876*c83a76b0SSuyog Pawar WORD32 pos_x_y_4x4,
877*c83a76b0SSuyog Pawar WORD32 num_4x4_in_row)
878*c83a76b0SSuyog Pawar {
879*c83a76b0SSuyog Pawar WORD16 ai2_4x4_had[64];
880*c83a76b0SSuyog Pawar WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
881*c83a76b0SSuyog Pawar WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
882*c83a76b0SSuyog Pawar WORD32 *pi4_4x4_hsad;
883*c83a76b0SSuyog Pawar WORD32 *pi4_8x8_hsad;
884*c83a76b0SSuyog Pawar
885*c83a76b0SSuyog Pawar (void)pi2_dst;
886*c83a76b0SSuyog Pawar (void)dst_strd;
887*c83a76b0SSuyog Pawar ASSERT(pos_x >= 0);
888*c83a76b0SSuyog Pawar ASSERT(pos_y >= 0);
889*c83a76b0SSuyog Pawar
890*c83a76b0SSuyog Pawar /* Initialize pointers to store 4x4 and 8x8 HAD SATDs */
891*c83a76b0SSuyog Pawar pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
892*c83a76b0SSuyog Pawar pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
893*c83a76b0SSuyog Pawar
894*c83a76b0SSuyog Pawar /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */
895*c83a76b0SSuyog Pawar pi4_8x8_hsad[0] = ihevce_had4_4x4(
896*c83a76b0SSuyog Pawar pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0);
897*c83a76b0SSuyog Pawar }
898*c83a76b0SSuyog Pawar
899*c83a76b0SSuyog Pawar /**
900*c83a76b0SSuyog Pawar *******************************************************************************
901*c83a76b0SSuyog Pawar *
902*c83a76b0SSuyog Pawar * @brief
903*c83a76b0SSuyog Pawar * Reursive Hadamard Transform for 8x8 block. HSAD is returned for the 8x8
904*c83a76b0SSuyog Pawar * block and its four subblocks(4x4).
905*c83a76b0SSuyog Pawar *
906*c83a76b0SSuyog Pawar * @par Description:
907*c83a76b0SSuyog Pawar *
908*c83a76b0SSuyog Pawar * @param[in] pu1_origin
909*c83a76b0SSuyog Pawar * UWORD8 pointer to the current block
910*c83a76b0SSuyog Pawar *
911*c83a76b0SSuyog Pawar * @param[in] src_strd
912*c83a76b0SSuyog Pawar * WORD32 Source stride
913*c83a76b0SSuyog Pawar *
914*c83a76b0SSuyog Pawar * @param[in] pu1_pred
915*c83a76b0SSuyog Pawar * UWORD8 pointer to the prediction block
916*c83a76b0SSuyog Pawar *
917*c83a76b0SSuyog Pawar * @param[in] pred_strd
918*c83a76b0SSuyog Pawar * WORD32 Pred stride
919*c83a76b0SSuyog Pawar *
920*c83a76b0SSuyog Pawar * @param[out] pi2_dst
921*c83a76b0SSuyog Pawar * WORD16 pointer to the transform output block
922*c83a76b0SSuyog Pawar *
923*c83a76b0SSuyog Pawar * @param[out] dst_strd
924*c83a76b0SSuyog Pawar * WORD32 Destination stride
925*c83a76b0SSuyog Pawar *
926*c83a76b0SSuyog Pawar * @param[out] ppi4_hsad
927*c83a76b0SSuyog Pawar * pointer to base pointers for storing hadmard sads of various
928*c83a76b0SSuyog Pawar * block sizes (4x4 to 32x32)
929*c83a76b0SSuyog Pawar *
930*c83a76b0SSuyog Pawar * @param[in] pos_x_y_4x4
931*c83a76b0SSuyog Pawar * Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
932*c83a76b0SSuyog Pawar * Lower 16bits denote xpos and upper 16ypos of the 4x4block
933*c83a76b0SSuyog Pawar *
934*c83a76b0SSuyog Pawar * @param[in] num_4x4_in_row
935*c83a76b0SSuyog Pawar * Denotes the number of current 4x4 blocks in a ctb/CU/MB
936*c83a76b0SSuyog Pawar *
937*c83a76b0SSuyog Pawar * @param[in] i4_frm_qstep
938*c83a76b0SSuyog Pawar * frm_qstep value based on the which the threshold value is calculated
939*c83a76b0SSuyog Pawar *
940*c83a76b0SSuyog Pawar * @returns
941*c83a76b0SSuyog Pawar *
942*c83a76b0SSuyog Pawar * @remarks
943*c83a76b0SSuyog Pawar *
944*c83a76b0SSuyog Pawar *******************************************************************************
945*c83a76b0SSuyog Pawar */
ihevce_had_8x8_using_4_4x4_r(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_pred,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 ** ppi4_hsad,WORD32 ** ppi4_tu_split,WORD32 ** ppi4_tu_early_cbf,WORD32 pos_x_y_4x4,WORD32 num_4x4_in_row,WORD32 lambda,WORD32 lambda_q_shift,WORD32 i4_frm_qstep,WORD32 i4_cur_depth,WORD32 i4_max_depth,WORD32 i4_max_tr_size,WORD32 * pi4_tu_split_cost,void * pv_func_sel)946*c83a76b0SSuyog Pawar WORD32 ihevce_had_8x8_using_4_4x4_r(
947*c83a76b0SSuyog Pawar UWORD8 *pu1_src,
948*c83a76b0SSuyog Pawar WORD32 src_strd,
949*c83a76b0SSuyog Pawar UWORD8 *pu1_pred,
950*c83a76b0SSuyog Pawar WORD32 pred_strd,
951*c83a76b0SSuyog Pawar WORD16 *pi2_dst,
952*c83a76b0SSuyog Pawar WORD32 dst_strd,
953*c83a76b0SSuyog Pawar WORD32 **ppi4_hsad,
954*c83a76b0SSuyog Pawar WORD32 **ppi4_tu_split,
955*c83a76b0SSuyog Pawar WORD32 **ppi4_tu_early_cbf,
956*c83a76b0SSuyog Pawar WORD32 pos_x_y_4x4,
957*c83a76b0SSuyog Pawar WORD32 num_4x4_in_row,
958*c83a76b0SSuyog Pawar WORD32 lambda,
959*c83a76b0SSuyog Pawar WORD32 lambda_q_shift,
960*c83a76b0SSuyog Pawar WORD32 i4_frm_qstep,
961*c83a76b0SSuyog Pawar WORD32 i4_cur_depth,
962*c83a76b0SSuyog Pawar WORD32 i4_max_depth,
963*c83a76b0SSuyog Pawar WORD32 i4_max_tr_size,
964*c83a76b0SSuyog Pawar WORD32 *pi4_tu_split_cost,
965*c83a76b0SSuyog Pawar void *pv_func_sel)
966*c83a76b0SSuyog Pawar {
967*c83a76b0SSuyog Pawar WORD16 ai2_4x4_had[64];
968*c83a76b0SSuyog Pawar WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
969*c83a76b0SSuyog Pawar WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
970*c83a76b0SSuyog Pawar WORD32 *pi4_4x4_hsad;
971*c83a76b0SSuyog Pawar WORD32 *pi4_8x8_hsad;
972*c83a76b0SSuyog Pawar WORD32 *pi4_8x8_tu_split;
973*c83a76b0SSuyog Pawar
974*c83a76b0SSuyog Pawar WORD32 *pi4_8x8_tu_early_cbf;
975*c83a76b0SSuyog Pawar
976*c83a76b0SSuyog Pawar UWORD32 u4_satd;
977*c83a76b0SSuyog Pawar WORD32 cost_child = 0, cost_parent = 0;
978*c83a76b0SSuyog Pawar WORD32 early_cbf = 0;
979*c83a76b0SSuyog Pawar
980*c83a76b0SSuyog Pawar const UWORD8 u1_cur_tr_size = 8;
981*c83a76b0SSuyog Pawar /* Stores the best cost for the Current 8x8: Lokesh */
982*c83a76b0SSuyog Pawar WORD32 best_cost = 0;
983*c83a76b0SSuyog Pawar
984*c83a76b0SSuyog Pawar (void)pv_func_sel;
985*c83a76b0SSuyog Pawar ASSERT(pos_x >= 0);
986*c83a76b0SSuyog Pawar ASSERT(pos_y >= 0);
987*c83a76b0SSuyog Pawar
988*c83a76b0SSuyog Pawar /* Initialize pointers to store 4x4 and 8x8 HAD SATDs */
989*c83a76b0SSuyog Pawar pi4_4x4_hsad = ppi4_hsad[HAD_4x4] + pos_x + pos_y * num_4x4_in_row;
990*c83a76b0SSuyog Pawar pi4_8x8_hsad = ppi4_hsad[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
991*c83a76b0SSuyog Pawar pi4_8x8_tu_split = ppi4_tu_split[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
992*c83a76b0SSuyog Pawar pi4_8x8_tu_early_cbf =
993*c83a76b0SSuyog Pawar ppi4_tu_early_cbf[HAD_8x8] + (pos_x >> 1) + (pos_y >> 1) * (num_4x4_in_row >> 1);
994*c83a76b0SSuyog Pawar
995*c83a76b0SSuyog Pawar /* -------- Compute four 4x4 HAD Transforms of 8x8 in one call--------- */
996*c83a76b0SSuyog Pawar cost_child = ihevce_had4_4x4(
997*c83a76b0SSuyog Pawar pu1_src, src_strd, pu1_pred, pred_strd, ai2_4x4_had, 8, pi4_4x4_hsad, num_4x4_in_row, 0);
998*c83a76b0SSuyog Pawar
999*c83a76b0SSuyog Pawar /* -------- Compute 8x8 HAD Transform using 4x4 results ------------- */
1000*c83a76b0SSuyog Pawar u4_satd = ihevce_compute_8x8HAD_using_4x4(
1001*c83a76b0SSuyog Pawar ai2_4x4_had, 8, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1002*c83a76b0SSuyog Pawar
1003*c83a76b0SSuyog Pawar /* store the normalized 8x8 satd */
1004*c83a76b0SSuyog Pawar cost_parent = ((u4_satd + 4) >> 3);
1005*c83a76b0SSuyog Pawar
1006*c83a76b0SSuyog Pawar /* 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */
1007*c83a76b0SSuyog Pawar cost_child += ((4) * lambda) >> (lambda_q_shift + 1);
1008*c83a76b0SSuyog Pawar
1009*c83a76b0SSuyog Pawar if(i4_cur_depth < i4_max_depth)
1010*c83a76b0SSuyog Pawar {
1011*c83a76b0SSuyog Pawar if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
1012*c83a76b0SSuyog Pawar {
1013*c83a76b0SSuyog Pawar //cost_child -= ((4) * lambda) >> (lambda_q_shift + 1);
1014*c83a76b0SSuyog Pawar *pi4_tu_split_cost += (4 * lambda) >> (lambda_q_shift + 1);
1015*c83a76b0SSuyog Pawar best_cost = cost_child;
1016*c83a76b0SSuyog Pawar best_cost <<= 1;
1017*c83a76b0SSuyog Pawar best_cost++;
1018*c83a76b0SSuyog Pawar pi4_8x8_tu_split[0] = 1;
1019*c83a76b0SSuyog Pawar pi4_8x8_hsad[0] = cost_child;
1020*c83a76b0SSuyog Pawar }
1021*c83a76b0SSuyog Pawar else
1022*c83a76b0SSuyog Pawar {
1023*c83a76b0SSuyog Pawar //cost_parent -= ((1) * lambda) >> (lambda_q_shift + 1);
1024*c83a76b0SSuyog Pawar best_cost = cost_parent;
1025*c83a76b0SSuyog Pawar best_cost <<= 1;
1026*c83a76b0SSuyog Pawar pi4_8x8_tu_split[0] = 0;
1027*c83a76b0SSuyog Pawar pi4_8x8_hsad[0] = cost_parent;
1028*c83a76b0SSuyog Pawar }
1029*c83a76b0SSuyog Pawar }
1030*c83a76b0SSuyog Pawar else
1031*c83a76b0SSuyog Pawar {
1032*c83a76b0SSuyog Pawar //cost_parent -= ((1) * lambda) >> (lambda_q_shift + 1);
1033*c83a76b0SSuyog Pawar best_cost = cost_parent;
1034*c83a76b0SSuyog Pawar best_cost <<= 1;
1035*c83a76b0SSuyog Pawar pi4_8x8_tu_split[0] = 0;
1036*c83a76b0SSuyog Pawar pi4_8x8_hsad[0] = cost_parent;
1037*c83a76b0SSuyog Pawar }
1038*c83a76b0SSuyog Pawar
1039*c83a76b0SSuyog Pawar pi4_8x8_tu_early_cbf[0] = early_cbf;
1040*c83a76b0SSuyog Pawar
1041*c83a76b0SSuyog Pawar /* best cost has tu_split_flag at LSB(Least significant bit) */
1042*c83a76b0SSuyog Pawar return ((best_cost << 1) + early_cbf);
1043*c83a76b0SSuyog Pawar }
1044*c83a76b0SSuyog Pawar
1045*c83a76b0SSuyog Pawar /**
1046*c83a76b0SSuyog Pawar *******************************************************************************
1047*c83a76b0SSuyog Pawar *
1048*c83a76b0SSuyog Pawar * @brief
1049*c83a76b0SSuyog Pawar * Computes 16x16 transform using children 8x8 hadamard results
1050*c83a76b0SSuyog Pawar * Modified to incorporate the dead-zone implementation - Lokesh
1051*c83a76b0SSuyog Pawar *
1052*c83a76b0SSuyog Pawar * @par Description:
1053*c83a76b0SSuyog Pawar *
1054*c83a76b0SSuyog Pawar * @param[in] pi2_8x8_had
1055*c83a76b0SSuyog Pawar * WORD16 pointer to 8x8 hadamard buffer(y0, y1, y2, y3 hadmard in Zscan order)
1056*c83a76b0SSuyog Pawar *
1057*c83a76b0SSuyog Pawar * @param[in] had8_strd
1058*c83a76b0SSuyog Pawar * stride of 8x8 hadmard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
1059*c83a76b0SSuyog Pawar *
1060*c83a76b0SSuyog Pawar * @param[out] pi2_dst
1061*c83a76b0SSuyog Pawar * destination buffer where 8x8 hadamard result is stored
1062*c83a76b0SSuyog Pawar *
1063*c83a76b0SSuyog Pawar * @param[in] dst_stride
1064*c83a76b0SSuyog Pawar * stride of destination block
1065*c83a76b0SSuyog Pawar *
1066*c83a76b0SSuyog Pawar * @param[in] i4_frm_qstep
1067*c83a76b0SSuyog Pawar * frm_qstep value based on the which the threshold value is calculated
1068*c83a76b0SSuyog Pawar *
1069*c83a76b0SSuyog Pawar * @returns
1070*c83a76b0SSuyog Pawar * 16x16 Hadamard SATD
1071*c83a76b0SSuyog Pawar * @remarks
1072*c83a76b0SSuyog Pawar *
1073*c83a76b0SSuyog Pawar *******************************************************************************
1074*c83a76b0SSuyog Pawar */
ihevce_compute_16x16HAD_using_8x8(WORD16 * pi2_8x8_had,WORD32 had8_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 i4_frm_qstep,WORD32 * pi4_cbf)1075*c83a76b0SSuyog Pawar static UWORD32 ihevce_compute_16x16HAD_using_8x8(
1076*c83a76b0SSuyog Pawar WORD16 *pi2_8x8_had,
1077*c83a76b0SSuyog Pawar WORD32 had8_strd,
1078*c83a76b0SSuyog Pawar WORD16 *pi2_dst,
1079*c83a76b0SSuyog Pawar WORD32 dst_strd,
1080*c83a76b0SSuyog Pawar WORD32 i4_frm_qstep,
1081*c83a76b0SSuyog Pawar WORD32 *pi4_cbf)
1082*c83a76b0SSuyog Pawar {
1083*c83a76b0SSuyog Pawar /* Qstep value is right shifted by 8 */
1084*c83a76b0SSuyog Pawar WORD32 threshold = (i4_frm_qstep >> 8);
1085*c83a76b0SSuyog Pawar
1086*c83a76b0SSuyog Pawar /* Initialize pointers to 4 subblocks of 8x8 HAD buffer */
1087*c83a76b0SSuyog Pawar WORD16 *pi2_y0 = pi2_8x8_had;
1088*c83a76b0SSuyog Pawar WORD16 *pi2_y1 = pi2_8x8_had + 8;
1089*c83a76b0SSuyog Pawar WORD16 *pi2_y2 = pi2_8x8_had + had8_strd * 8;
1090*c83a76b0SSuyog Pawar WORD16 *pi2_y3 = pi2_8x8_had + had8_strd * 8 + 8;
1091*c83a76b0SSuyog Pawar
1092*c83a76b0SSuyog Pawar /* Initialize pointers to store 8x8 HAD output */
1093*c83a76b0SSuyog Pawar WORD16 *pi2_dst0 = pi2_dst;
1094*c83a76b0SSuyog Pawar WORD16 *pi2_dst1 = pi2_dst + 8;
1095*c83a76b0SSuyog Pawar WORD16 *pi2_dst2 = pi2_dst + dst_strd * 8;
1096*c83a76b0SSuyog Pawar WORD16 *pi2_dst3 = pi2_dst + dst_strd * 8 + 8;
1097*c83a76b0SSuyog Pawar
1098*c83a76b0SSuyog Pawar UWORD32 u4_satd = 0;
1099*c83a76b0SSuyog Pawar WORD32 i;
1100*c83a76b0SSuyog Pawar
1101*c83a76b0SSuyog Pawar /* Child HAD results combined as follows to get Parent result */
1102*c83a76b0SSuyog Pawar /* _ _ */
1103*c83a76b0SSuyog Pawar /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */
1104*c83a76b0SSuyog Pawar /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */
1105*c83a76b0SSuyog Pawar /* \- -/ */
1106*c83a76b0SSuyog Pawar for(i = 0; i < 64; i++)
1107*c83a76b0SSuyog Pawar {
1108*c83a76b0SSuyog Pawar WORD32 src_idx = (i >> 3) * had8_strd + (i % 8);
1109*c83a76b0SSuyog Pawar WORD32 dst_idx = (i >> 3) * dst_strd + (i % 8);
1110*c83a76b0SSuyog Pawar
1111*c83a76b0SSuyog Pawar WORD16 a0 = pi2_y0[src_idx];
1112*c83a76b0SSuyog Pawar WORD16 a1 = pi2_y1[src_idx];
1113*c83a76b0SSuyog Pawar WORD16 a2 = pi2_y2[src_idx];
1114*c83a76b0SSuyog Pawar WORD16 a3 = pi2_y3[src_idx];
1115*c83a76b0SSuyog Pawar
1116*c83a76b0SSuyog Pawar WORD16 b0 = (a0 + a1) >> 1;
1117*c83a76b0SSuyog Pawar WORD16 b1 = (a0 - a1) >> 1;
1118*c83a76b0SSuyog Pawar WORD16 b2 = (a2 + a3) >> 1;
1119*c83a76b0SSuyog Pawar WORD16 b3 = (a2 - a3) >> 1;
1120*c83a76b0SSuyog Pawar
1121*c83a76b0SSuyog Pawar pi2_dst0[dst_idx] = b0 + b2;
1122*c83a76b0SSuyog Pawar pi2_dst1[dst_idx] = b1 + b3;
1123*c83a76b0SSuyog Pawar pi2_dst2[dst_idx] = b0 - b2;
1124*c83a76b0SSuyog Pawar pi2_dst3[dst_idx] = b1 - b3;
1125*c83a76b0SSuyog Pawar
1126*c83a76b0SSuyog Pawar /* Make the value of dst to zerp, if it falls below the dead-zone */
1127*c83a76b0SSuyog Pawar if(ABS(pi2_dst0[dst_idx]) > threshold)
1128*c83a76b0SSuyog Pawar *pi4_cbf = 1;
1129*c83a76b0SSuyog Pawar if(ABS(pi2_dst1[dst_idx]) > threshold)
1130*c83a76b0SSuyog Pawar *pi4_cbf = 1;
1131*c83a76b0SSuyog Pawar if(ABS(pi2_dst2[dst_idx]) > threshold)
1132*c83a76b0SSuyog Pawar *pi4_cbf = 1;
1133*c83a76b0SSuyog Pawar if(ABS(pi2_dst3[dst_idx]) > threshold)
1134*c83a76b0SSuyog Pawar *pi4_cbf = 1;
1135*c83a76b0SSuyog Pawar
1136*c83a76b0SSuyog Pawar u4_satd += ABS(pi2_dst0[dst_idx]);
1137*c83a76b0SSuyog Pawar u4_satd += ABS(pi2_dst1[dst_idx]);
1138*c83a76b0SSuyog Pawar u4_satd += ABS(pi2_dst2[dst_idx]);
1139*c83a76b0SSuyog Pawar u4_satd += ABS(pi2_dst3[dst_idx]);
1140*c83a76b0SSuyog Pawar }
1141*c83a76b0SSuyog Pawar
1142*c83a76b0SSuyog Pawar /* return 16x16 satd */
1143*c83a76b0SSuyog Pawar return (u4_satd);
1144*c83a76b0SSuyog Pawar }
1145*c83a76b0SSuyog Pawar
1146*c83a76b0SSuyog Pawar /**
1147*c83a76b0SSuyog Pawar *******************************************************************************
1148*c83a76b0SSuyog Pawar *
1149*c83a76b0SSuyog Pawar * @brief
1150*c83a76b0SSuyog Pawar * Hadamard Transform for 16x16 block with 8x8 and 4x4 SATD updates.
1151*c83a76b0SSuyog Pawar * Uses recursive 8x8 had output to compute satd for 16x16 and its children
1152*c83a76b0SSuyog Pawar *
1153*c83a76b0SSuyog Pawar * @par Description:
1154*c83a76b0SSuyog Pawar *
1155*c83a76b0SSuyog Pawar * @param[in] pu1_origin
1156*c83a76b0SSuyog Pawar * UWORD8 pointer to the current block
1157*c83a76b0SSuyog Pawar *
1158*c83a76b0SSuyog Pawar * @param[in] src_strd
1159*c83a76b0SSuyog Pawar * WORD32 Source stride
1160*c83a76b0SSuyog Pawar *
1161*c83a76b0SSuyog Pawar * @param[in] pu1_pred
1162*c83a76b0SSuyog Pawar * UWORD8 pointer to the prediction block
1163*c83a76b0SSuyog Pawar *
1164*c83a76b0SSuyog Pawar * @param[in] pred_strd
1165*c83a76b0SSuyog Pawar * WORD32 Pred stride
1166*c83a76b0SSuyog Pawar *
1167*c83a76b0SSuyog Pawar * @param[out] pi2_dst
1168*c83a76b0SSuyog Pawar * WORD16 pointer to the transform output block
1169*c83a76b0SSuyog Pawar *
1170*c83a76b0SSuyog Pawar * @param[out] dst_strd
1171*c83a76b0SSuyog Pawar * WORD32 Destination stride
1172*c83a76b0SSuyog Pawar *
1173*c83a76b0SSuyog Pawar * @param[out] ppi4_hsad
1174*c83a76b0SSuyog Pawar * pointer to base pointers for storing hadmard sads of various
1175*c83a76b0SSuyog Pawar * block sizes (4x4 to 32x32)
1176*c83a76b0SSuyog Pawar *
1177*c83a76b0SSuyog Pawar * @param[in] pos_x_y_4x4
1178*c83a76b0SSuyog Pawar * Denotes packed x,y postion of current 4x4 block w.r.t to start of ctb/CU/MB
1179*c83a76b0SSuyog Pawar * Lower 16bits denote xpos and upper 16ypos of the 4x4block
1180*c83a76b0SSuyog Pawar *
1181*c83a76b0SSuyog Pawar * @param[in] num_4x4_in_row
1182*c83a76b0SSuyog Pawar * Denotes the number of current 4x4 blocks in a ctb/CU/MB
1183*c83a76b0SSuyog Pawar *
1184*c83a76b0SSuyog Pawar * @param[in] lambda
1185*c83a76b0SSuyog Pawar * lambda values is the cost factor calculated based on QP
1186*c83a76b0SSuyog Pawar *
1187*c83a76b0SSuyog Pawar * @param[in] lambda_q_shift
1188*c83a76b0SSuyog Pawar * lambda_q_shift used to reverse the lambda value back from q8 format
1189*c83a76b0SSuyog Pawar *
1190*c83a76b0SSuyog Pawar * @param[in] depth
1191*c83a76b0SSuyog Pawar * depth gives the current TU depth with respect to the CU
1192*c83a76b0SSuyog Pawar *
1193*c83a76b0SSuyog Pawar * @param[in] i4_frm_qstep
1194*c83a76b0SSuyog Pawar * frm_qstep value based on the which the threshold value is calculated
1195*c83a76b0SSuyog Pawar *
1196*c83a76b0SSuyog Pawar * @returns
1197*c83a76b0SSuyog Pawar *
1198*c83a76b0SSuyog Pawar * @remarks
1199*c83a76b0SSuyog Pawar *
1200*c83a76b0SSuyog Pawar *******************************************************************************
1201*c83a76b0SSuyog Pawar */
1202*c83a76b0SSuyog Pawar
ihevce_had_16x16_r(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_pred,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 ** ppi4_hsad,WORD32 ** ppi4_tu_split,WORD32 ** ppi4_tu_early_cbf,WORD32 pos_x_y_4x4,WORD32 num_4x4_in_row,WORD32 lambda,WORD32 lambda_q_shift,WORD32 i4_frm_qstep,WORD32 i4_cur_depth,WORD32 i4_max_depth,WORD32 i4_max_tr_size,WORD32 * pi4_tu_split_cost,void * pv_func_sel)1203*c83a76b0SSuyog Pawar WORD32 ihevce_had_16x16_r(
1204*c83a76b0SSuyog Pawar UWORD8 *pu1_src,
1205*c83a76b0SSuyog Pawar WORD32 src_strd,
1206*c83a76b0SSuyog Pawar UWORD8 *pu1_pred,
1207*c83a76b0SSuyog Pawar WORD32 pred_strd,
1208*c83a76b0SSuyog Pawar WORD16 *pi2_dst,
1209*c83a76b0SSuyog Pawar WORD32 dst_strd,
1210*c83a76b0SSuyog Pawar WORD32 **ppi4_hsad,
1211*c83a76b0SSuyog Pawar WORD32 **ppi4_tu_split,
1212*c83a76b0SSuyog Pawar WORD32 **ppi4_tu_early_cbf,
1213*c83a76b0SSuyog Pawar WORD32 pos_x_y_4x4,
1214*c83a76b0SSuyog Pawar WORD32 num_4x4_in_row,
1215*c83a76b0SSuyog Pawar WORD32 lambda,
1216*c83a76b0SSuyog Pawar WORD32 lambda_q_shift,
1217*c83a76b0SSuyog Pawar WORD32 i4_frm_qstep,
1218*c83a76b0SSuyog Pawar WORD32 i4_cur_depth,
1219*c83a76b0SSuyog Pawar WORD32 i4_max_depth,
1220*c83a76b0SSuyog Pawar WORD32 i4_max_tr_size,
1221*c83a76b0SSuyog Pawar WORD32 *pi4_tu_split_cost,
1222*c83a76b0SSuyog Pawar void *pv_func_sel)
1223*c83a76b0SSuyog Pawar {
1224*c83a76b0SSuyog Pawar WORD16 ai2_8x8_had[256];
1225*c83a76b0SSuyog Pawar WORD32 *pi4_16x16_hsad;
1226*c83a76b0SSuyog Pawar WORD32 *pi4_16x16_tu_split;
1227*c83a76b0SSuyog Pawar
1228*c83a76b0SSuyog Pawar WORD32 *pi4_16x16_tu_early_cbf;
1229*c83a76b0SSuyog Pawar
1230*c83a76b0SSuyog Pawar UWORD32 u4_satd = 0;
1231*c83a76b0SSuyog Pawar WORD32 tu_split_flag = 0;
1232*c83a76b0SSuyog Pawar WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
1233*c83a76b0SSuyog Pawar const UWORD8 u1_cur_tr_size = 16;
1234*c83a76b0SSuyog Pawar
1235*c83a76b0SSuyog Pawar /* cost_parent : Stores the cost of the parent HAD transform (16x16) */
1236*c83a76b0SSuyog Pawar /* cost_child : Stores the cost of the child HAD transform (16x16) */
1237*c83a76b0SSuyog Pawar WORD32 cost_parent = 0, cost_child = 0;
1238*c83a76b0SSuyog Pawar
1239*c83a76b0SSuyog Pawar /*best_cost returns the best cost at the end of the function */
1240*c83a76b0SSuyog Pawar /*tu_split denoes whether the TU (16x16)is split or not */
1241*c83a76b0SSuyog Pawar WORD32 best_cost = 0, best_cost_tu_split;
1242*c83a76b0SSuyog Pawar WORD32 i;
1243*c83a76b0SSuyog Pawar
1244*c83a76b0SSuyog Pawar WORD16 *pi2_y0;
1245*c83a76b0SSuyog Pawar UWORD8 *pu1_src0;
1246*c83a76b0SSuyog Pawar UWORD8 *pu1_pred0;
1247*c83a76b0SSuyog Pawar WORD32 pos_x_y_4x4_0;
1248*c83a76b0SSuyog Pawar
1249*c83a76b0SSuyog Pawar WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
1250*c83a76b0SSuyog Pawar WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
1251*c83a76b0SSuyog Pawar
1252*c83a76b0SSuyog Pawar ASSERT(pos_x >= 0);
1253*c83a76b0SSuyog Pawar ASSERT(pos_y >= 0);
1254*c83a76b0SSuyog Pawar
1255*c83a76b0SSuyog Pawar /* Initialize pointers to store 16x16 SATDs */
1256*c83a76b0SSuyog Pawar pi4_16x16_hsad = ppi4_hsad[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1257*c83a76b0SSuyog Pawar
1258*c83a76b0SSuyog Pawar pi4_16x16_tu_split =
1259*c83a76b0SSuyog Pawar ppi4_tu_split[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1260*c83a76b0SSuyog Pawar
1261*c83a76b0SSuyog Pawar pi4_16x16_tu_early_cbf =
1262*c83a76b0SSuyog Pawar ppi4_tu_early_cbf[HAD_16x16] + (pos_x >> 2) + (pos_y >> 2) * (num_4x4_in_row >> 2);
1263*c83a76b0SSuyog Pawar
1264*c83a76b0SSuyog Pawar /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */
1265*c83a76b0SSuyog Pawar for(i = 0; i < 4; i++)
1266*c83a76b0SSuyog Pawar {
1267*c83a76b0SSuyog Pawar pu1_src0 = pu1_src + (i & 0x01) * 8 + (i >> 1) * src_strd * 8;
1268*c83a76b0SSuyog Pawar pu1_pred0 = pu1_pred + (i & 0x01) * 8 + (i >> 1) * pred_strd * 8;
1269*c83a76b0SSuyog Pawar pi2_y0 = ai2_8x8_had + (i & 0x01) * 8 + (i >> 1) * 16 * 8;
1270*c83a76b0SSuyog Pawar pos_x_y_4x4_0 = pos_x_y_4x4 + (i & 0x01) * 2 + (i >> 1) * (2 << 16);
1271*c83a76b0SSuyog Pawar
1272*c83a76b0SSuyog Pawar best_cost_tu_split = ihevce_had_8x8_using_4_4x4_r(
1273*c83a76b0SSuyog Pawar pu1_src0,
1274*c83a76b0SSuyog Pawar src_strd,
1275*c83a76b0SSuyog Pawar pu1_pred0,
1276*c83a76b0SSuyog Pawar pred_strd,
1277*c83a76b0SSuyog Pawar pi2_y0,
1278*c83a76b0SSuyog Pawar 16,
1279*c83a76b0SSuyog Pawar ppi4_hsad,
1280*c83a76b0SSuyog Pawar ppi4_tu_split,
1281*c83a76b0SSuyog Pawar ppi4_tu_early_cbf,
1282*c83a76b0SSuyog Pawar pos_x_y_4x4_0,
1283*c83a76b0SSuyog Pawar num_4x4_in_row,
1284*c83a76b0SSuyog Pawar lambda,
1285*c83a76b0SSuyog Pawar lambda_q_shift,
1286*c83a76b0SSuyog Pawar i4_frm_qstep,
1287*c83a76b0SSuyog Pawar i4_cur_depth + 1,
1288*c83a76b0SSuyog Pawar i4_max_depth,
1289*c83a76b0SSuyog Pawar i4_max_tr_size,
1290*c83a76b0SSuyog Pawar pi4_tu_split_cost,
1291*c83a76b0SSuyog Pawar pv_func_sel);
1292*c83a76b0SSuyog Pawar
1293*c83a76b0SSuyog Pawar /* Cost is shifted by two bits for Tu_split_flag and early cbf flag */
1294*c83a76b0SSuyog Pawar best_cost = (best_cost_tu_split >> 2);
1295*c83a76b0SSuyog Pawar
1296*c83a76b0SSuyog Pawar /* Last but one bit stores the information regarding the TU_Split */
1297*c83a76b0SSuyog Pawar tu_split_flag += (best_cost_tu_split & 0x3) >> 1;
1298*c83a76b0SSuyog Pawar
1299*c83a76b0SSuyog Pawar /* Last bit stores the information regarding the early_cbf */
1300*c83a76b0SSuyog Pawar i4_early_cbf_flag += (best_cost_tu_split & 0x1);
1301*c83a76b0SSuyog Pawar
1302*c83a76b0SSuyog Pawar cost_child += best_cost;
1303*c83a76b0SSuyog Pawar
1304*c83a76b0SSuyog Pawar tu_split_flag <<= 1;
1305*c83a76b0SSuyog Pawar i4_early_cbf_flag <<= 1;
1306*c83a76b0SSuyog Pawar }
1307*c83a76b0SSuyog Pawar
1308*c83a76b0SSuyog Pawar /* -------- Compute 16x16 HAD Transform using 8x8 results ------------- */
1309*c83a76b0SSuyog Pawar pi2_y0 = ai2_8x8_had;
1310*c83a76b0SSuyog Pawar
1311*c83a76b0SSuyog Pawar /* Threshold currently passed as "0" */
1312*c83a76b0SSuyog Pawar u4_satd =
1313*c83a76b0SSuyog Pawar ihevce_compute_16x16HAD_using_8x8(pi2_y0, 16, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1314*c83a76b0SSuyog Pawar
1315*c83a76b0SSuyog Pawar /* store the normalized satd */
1316*c83a76b0SSuyog Pawar cost_parent = ((u4_satd + 4) >> 3);
1317*c83a76b0SSuyog Pawar
1318*c83a76b0SSuyog Pawar /* 4 TU_Split flags , 4 CBF Flags, extra 1 becoz of the 0.5 bits per bin is assumed */
1319*c83a76b0SSuyog Pawar cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1320*c83a76b0SSuyog Pawar
1321*c83a76b0SSuyog Pawar i4_early_cbf_flag += early_cbf;
1322*c83a76b0SSuyog Pawar
1323*c83a76b0SSuyog Pawar /* Right now the depth is hard-coded to 4: The depth can be modified from the config file
1324*c83a76b0SSuyog Pawar which decides the extent to which TU_REC needs to be done */
1325*c83a76b0SSuyog Pawar if(i4_cur_depth < i4_max_depth)
1326*c83a76b0SSuyog Pawar {
1327*c83a76b0SSuyog Pawar if((cost_child < cost_parent) || (i4_max_tr_size < u1_cur_tr_size))
1328*c83a76b0SSuyog Pawar {
1329*c83a76b0SSuyog Pawar //cost_child -= ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1330*c83a76b0SSuyog Pawar *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1331*c83a76b0SSuyog Pawar tu_split_flag += 1;
1332*c83a76b0SSuyog Pawar best_cost = cost_child;
1333*c83a76b0SSuyog Pawar }
1334*c83a76b0SSuyog Pawar else
1335*c83a76b0SSuyog Pawar {
1336*c83a76b0SSuyog Pawar //cost_parent -= ((1 + 1) * lambda) >> (lambda_q_shift + 1);
1337*c83a76b0SSuyog Pawar tu_split_flag += 0;
1338*c83a76b0SSuyog Pawar best_cost = cost_parent;
1339*c83a76b0SSuyog Pawar }
1340*c83a76b0SSuyog Pawar }
1341*c83a76b0SSuyog Pawar else
1342*c83a76b0SSuyog Pawar {
1343*c83a76b0SSuyog Pawar //cost_parent -= ((1 + 1) * lambda) >> (lambda_q_shift + 1);
1344*c83a76b0SSuyog Pawar tu_split_flag += 0;
1345*c83a76b0SSuyog Pawar best_cost = cost_parent;
1346*c83a76b0SSuyog Pawar }
1347*c83a76b0SSuyog Pawar
1348*c83a76b0SSuyog Pawar pi4_16x16_hsad[0] = best_cost;
1349*c83a76b0SSuyog Pawar pi4_16x16_tu_split[0] = tu_split_flag;
1350*c83a76b0SSuyog Pawar pi4_16x16_tu_early_cbf[0] = i4_early_cbf_flag;
1351*c83a76b0SSuyog Pawar
1352*c83a76b0SSuyog Pawar /*returning two values(best cost & tu_split_flag) as a single value*/
1353*c83a76b0SSuyog Pawar return ((best_cost << 10) + (tu_split_flag << 5) + i4_early_cbf_flag);
1354*c83a76b0SSuyog Pawar }
1355*c83a76b0SSuyog Pawar
1356*c83a76b0SSuyog Pawar //#endif
/**
*******************************************************************************
*
* @brief
*  Computes 32x32 transform using children 16x16 hadamard results
*
* @par Description:
*
* @param[in] pi2_16x16_had
*  WORD16 pointer to 16x16 hadamard buffer (y0, y1, y2, y3 hadamard in Zscan order)
*
* @param[in] had16_strd
*  stride of 16x16 hadamard buffer pi2_y0, pi2_y1, pi2_y2, pi2_y3
*
* @param[out] pi2_dst
*  destination buffer where 32x32 hadamard result is stored
*
* @param[in] dst_strd
*  stride of destination block
*
* @param[in] i4_frm_qstep
*  frm_qstep value based on which the threshold value is calculated
*
* @param[out] pi4_cbf
*  set to 1 if the magnitude of any 32x32 hadamard coefficient exceeds the
*  threshold (i4_frm_qstep >> 8); not modified otherwise
*
* @returns
*  32x32 Hadamard SATD
* @remarks
*
*******************************************************************************
*/
1386*c83a76b0SSuyog Pawar //#if COMPUTE_32x32_USING_16X16 == C
ihevce_compute_32x32HAD_using_16x16(WORD16 * pi2_16x16_had,WORD32 had16_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 i4_frm_qstep,WORD32 * pi4_cbf)1387*c83a76b0SSuyog Pawar UWORD32 ihevce_compute_32x32HAD_using_16x16(
1388*c83a76b0SSuyog Pawar WORD16 *pi2_16x16_had,
1389*c83a76b0SSuyog Pawar WORD32 had16_strd,
1390*c83a76b0SSuyog Pawar WORD16 *pi2_dst,
1391*c83a76b0SSuyog Pawar WORD32 dst_strd,
1392*c83a76b0SSuyog Pawar WORD32 i4_frm_qstep,
1393*c83a76b0SSuyog Pawar WORD32 *pi4_cbf)
1394*c83a76b0SSuyog Pawar {
1395*c83a76b0SSuyog Pawar /* Qstep value is right shifted by 8 */
1396*c83a76b0SSuyog Pawar WORD32 threshold = (i4_frm_qstep >> 8);
1397*c83a76b0SSuyog Pawar
1398*c83a76b0SSuyog Pawar /* Initialize pointers to 4 subblocks of 8x8 HAD buffer */
1399*c83a76b0SSuyog Pawar WORD16 *pi2_y0 = pi2_16x16_had;
1400*c83a76b0SSuyog Pawar WORD16 *pi2_y1 = pi2_16x16_had + 16;
1401*c83a76b0SSuyog Pawar WORD16 *pi2_y2 = pi2_16x16_had + had16_strd * 16;
1402*c83a76b0SSuyog Pawar WORD16 *pi2_y3 = pi2_16x16_had + had16_strd * 16 + 16;
1403*c83a76b0SSuyog Pawar
1404*c83a76b0SSuyog Pawar /* Initialize pointers to store 8x8 HAD output */
1405*c83a76b0SSuyog Pawar WORD16 *pi2_dst0 = pi2_dst;
1406*c83a76b0SSuyog Pawar WORD16 *pi2_dst1 = pi2_dst + 16;
1407*c83a76b0SSuyog Pawar WORD16 *pi2_dst2 = pi2_dst + dst_strd * 16;
1408*c83a76b0SSuyog Pawar WORD16 *pi2_dst3 = pi2_dst + dst_strd * 16 + 16;
1409*c83a76b0SSuyog Pawar
1410*c83a76b0SSuyog Pawar UWORD32 u4_satd = 0;
1411*c83a76b0SSuyog Pawar WORD32 i;
1412*c83a76b0SSuyog Pawar
1413*c83a76b0SSuyog Pawar /* Child HAD results combined as follows to get Parent result */
1414*c83a76b0SSuyog Pawar /* _ _ */
1415*c83a76b0SSuyog Pawar /* | (y0 + y1) + (y2 + y3) (y0 - y1) + (y2 - y3) | */
1416*c83a76b0SSuyog Pawar /* | (y0 + y1) - (y2 + y3) (y0 - y1) - (y2 - y3) | */
1417*c83a76b0SSuyog Pawar /* \- -/ */
1418*c83a76b0SSuyog Pawar for(i = 0; i < 256; i++)
1419*c83a76b0SSuyog Pawar {
1420*c83a76b0SSuyog Pawar WORD32 src_idx = (i >> 4) * had16_strd + (i % 16);
1421*c83a76b0SSuyog Pawar WORD32 dst_idx = (i >> 4) * dst_strd + (i % 16);
1422*c83a76b0SSuyog Pawar
1423*c83a76b0SSuyog Pawar WORD16 a0 = pi2_y0[src_idx] >> 2;
1424*c83a76b0SSuyog Pawar WORD16 a1 = pi2_y1[src_idx] >> 2;
1425*c83a76b0SSuyog Pawar WORD16 a2 = pi2_y2[src_idx] >> 2;
1426*c83a76b0SSuyog Pawar WORD16 a3 = pi2_y3[src_idx] >> 2;
1427*c83a76b0SSuyog Pawar
1428*c83a76b0SSuyog Pawar WORD16 b0 = (a0 + a1);
1429*c83a76b0SSuyog Pawar WORD16 b1 = (a0 - a1);
1430*c83a76b0SSuyog Pawar WORD16 b2 = (a2 + a3);
1431*c83a76b0SSuyog Pawar WORD16 b3 = (a2 - a3);
1432*c83a76b0SSuyog Pawar
1433*c83a76b0SSuyog Pawar pi2_dst0[dst_idx] = b0 + b2;
1434*c83a76b0SSuyog Pawar pi2_dst1[dst_idx] = b1 + b3;
1435*c83a76b0SSuyog Pawar pi2_dst2[dst_idx] = b0 - b2;
1436*c83a76b0SSuyog Pawar pi2_dst3[dst_idx] = b1 - b3;
1437*c83a76b0SSuyog Pawar
1438*c83a76b0SSuyog Pawar /* Make the value of dst to zerp, if it falls below the dead-zone */
1439*c83a76b0SSuyog Pawar if(ABS(pi2_dst0[dst_idx]) > threshold)
1440*c83a76b0SSuyog Pawar *pi4_cbf = 1;
1441*c83a76b0SSuyog Pawar if(ABS(pi2_dst1[dst_idx]) > threshold)
1442*c83a76b0SSuyog Pawar *pi4_cbf = 1;
1443*c83a76b0SSuyog Pawar if(ABS(pi2_dst2[dst_idx]) > threshold)
1444*c83a76b0SSuyog Pawar *pi4_cbf = 1;
1445*c83a76b0SSuyog Pawar if(ABS(pi2_dst3[dst_idx]) > threshold)
1446*c83a76b0SSuyog Pawar *pi4_cbf = 1;
1447*c83a76b0SSuyog Pawar
1448*c83a76b0SSuyog Pawar u4_satd += ABS(pi2_dst0[dst_idx]);
1449*c83a76b0SSuyog Pawar u4_satd += ABS(pi2_dst1[dst_idx]);
1450*c83a76b0SSuyog Pawar u4_satd += ABS(pi2_dst2[dst_idx]);
1451*c83a76b0SSuyog Pawar u4_satd += ABS(pi2_dst3[dst_idx]);
1452*c83a76b0SSuyog Pawar }
1453*c83a76b0SSuyog Pawar
1454*c83a76b0SSuyog Pawar /* return 32x32 satd */
1455*c83a76b0SSuyog Pawar return (u4_satd);
1456*c83a76b0SSuyog Pawar }
1457*c83a76b0SSuyog Pawar //#endif
1458*c83a76b0SSuyog Pawar
/**
*******************************************************************************
*
* @brief
*  Hadamard Transform for 32x32 block with 16x16, 8x8 and 4x4 SATD updates.
*  Uses recursive 16x16 had output to compute satd for 32x32 and its children
*
* @par Description:
*
* @param[in] pu1_src
*  UWORD8 pointer to the current block
*
* @param[in] src_strd
*  WORD32 Source stride
*
* @param[in] pu1_pred
*  UWORD8 pointer to the prediction block
*
* @param[in] pred_strd
*  WORD32 Pred stride
*
* @param[out] pi2_dst
*  WORD16 pointer to the transform output block
*
* @param[in] dst_strd
*  WORD32 Destination stride
*
* @param[out] ppi4_hsad
*  pointer to base pointers for storing hadamard sads of various
*  block sizes (4x4 to 32x32)
*
* @param[out] ppi4_tu_split
*  pointer to base pointers for storing the TU split flags of various
*  block sizes
*
* @param[out] ppi4_tu_early_cbf
*  pointer to base pointers for storing the early cbf flags of various
*  block sizes
*
* @param[in] pos_x_y_4x4
*  Denotes packed x,y position of current 4x4 block w.r.t. start of ctb/CU/MB
*  Lower 16 bits denote xpos and upper 16 bits ypos of the 4x4 block
*
* @param[in] num_4x4_in_row
*  Denotes the number of current 4x4 blocks in a ctb/CU/MB
*
* @param[in] lambda
*  lambda value is the cost factor calculated based on QP
*
* @param[in] lambda_q_shift
*  lambda_q_shift used to reverse the lambda value back from q8 format
*
* @param[in] i4_cur_depth
*  i4_cur_depth gives the current TU depth with respect to the CU
*
* @param[in] i4_frm_qstep
*  frm_qstep value based on which the threshold value is calculated
*
* @param[in] i4_max_depth
*  split into four 16x16 TUs is considered only when i4_cur_depth is less
*  than i4_max_depth
*
* @param[in] i4_max_tr_size
*  when smaller than 32 (and the depth check passes) the 32x32 TU is split
*  irrespective of the costs
*
* @param[in,out] pi4_tu_split_cost
*  accumulator to which the split signalling cost is added when the TU is split
*
* @param[in] ps_func_selector
*  function selector providing pf_had_16x16_r and
*  pf_compute_32x32HAD_using_16x16
*
* @returns
*  None. Results are written through ppi4_hsad, ppi4_tu_split,
*  ppi4_tu_early_cbf and pi2_dst
*
* @remarks
*
*******************************************************************************
*/
ihevce_had_32x32_r(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_pred,WORD32 pred_strd,WORD16 * pi2_dst,WORD32 dst_strd,WORD32 ** ppi4_hsad,WORD32 ** ppi4_tu_split,WORD32 ** ppi4_tu_early_cbf,WORD32 pos_x_y_4x4,WORD32 num_4x4_in_row,WORD32 lambda,WORD32 lambda_q_shift,WORD32 i4_frm_qstep,WORD32 i4_cur_depth,WORD32 i4_max_depth,WORD32 i4_max_tr_size,WORD32 * pi4_tu_split_cost,me_func_selector_t * ps_func_selector)1516*c83a76b0SSuyog Pawar void ihevce_had_32x32_r(
1517*c83a76b0SSuyog Pawar UWORD8 *pu1_src,
1518*c83a76b0SSuyog Pawar WORD32 src_strd,
1519*c83a76b0SSuyog Pawar UWORD8 *pu1_pred,
1520*c83a76b0SSuyog Pawar WORD32 pred_strd,
1521*c83a76b0SSuyog Pawar WORD16 *pi2_dst,
1522*c83a76b0SSuyog Pawar WORD32 dst_strd,
1523*c83a76b0SSuyog Pawar WORD32 **ppi4_hsad,
1524*c83a76b0SSuyog Pawar WORD32 **ppi4_tu_split,
1525*c83a76b0SSuyog Pawar WORD32 **ppi4_tu_early_cbf,
1526*c83a76b0SSuyog Pawar WORD32 pos_x_y_4x4,
1527*c83a76b0SSuyog Pawar WORD32 num_4x4_in_row,
1528*c83a76b0SSuyog Pawar WORD32 lambda,
1529*c83a76b0SSuyog Pawar WORD32 lambda_q_shift,
1530*c83a76b0SSuyog Pawar WORD32 i4_frm_qstep,
1531*c83a76b0SSuyog Pawar WORD32 i4_cur_depth,
1532*c83a76b0SSuyog Pawar WORD32 i4_max_depth,
1533*c83a76b0SSuyog Pawar WORD32 i4_max_tr_size,
1534*c83a76b0SSuyog Pawar WORD32 *pi4_tu_split_cost,
1535*c83a76b0SSuyog Pawar me_func_selector_t *ps_func_selector)
1536*c83a76b0SSuyog Pawar
1537*c83a76b0SSuyog Pawar {
1538*c83a76b0SSuyog Pawar WORD16 ai2_16x16_had[1024];
1539*c83a76b0SSuyog Pawar WORD32 *pi4_32x32_hsad;
1540*c83a76b0SSuyog Pawar WORD32 *pi4_32x32_tu_split;
1541*c83a76b0SSuyog Pawar WORD32 *pi4_32x32_tu_early_cbf;
1542*c83a76b0SSuyog Pawar
1543*c83a76b0SSuyog Pawar WORD32 pos_x = pos_x_y_4x4 & 0xFFFF;
1544*c83a76b0SSuyog Pawar WORD32 pos_y = (pos_x_y_4x4 >> 16) & 0xFFFF;
1545*c83a76b0SSuyog Pawar WORD32 tu_split_flag = 0;
1546*c83a76b0SSuyog Pawar const UWORD8 u1_cur_tr_size = 32;
1547*c83a76b0SSuyog Pawar WORD32 i4_early_cbf_flag = 0, early_cbf = 0;
1548*c83a76b0SSuyog Pawar
1549*c83a76b0SSuyog Pawar /* cost_parent : Stores the cost of the parent HAD transform (16x16) */
1550*c83a76b0SSuyog Pawar /* cost_child : Stores the cost of the child HAD transform (16x16) */
1551*c83a76b0SSuyog Pawar WORD32 cost_child = 0, cost_parent = 0;
1552*c83a76b0SSuyog Pawar
1553*c83a76b0SSuyog Pawar /*retuned as the best cost for the entire TU (32x32) */
1554*c83a76b0SSuyog Pawar WORD32 best_cost = 0;
1555*c83a76b0SSuyog Pawar /*captures the best cost and tu_split at child level */
1556*c83a76b0SSuyog Pawar WORD32 best_cost_tu_split;
1557*c83a76b0SSuyog Pawar
1558*c83a76b0SSuyog Pawar /* Initialize pointers to 4 8x8 blocks in 16x16 */
1559*c83a76b0SSuyog Pawar WORD16 *pi2_y0 = ai2_16x16_had;
1560*c83a76b0SSuyog Pawar WORD16 *pi2_y1 = ai2_16x16_had + 16;
1561*c83a76b0SSuyog Pawar WORD16 *pi2_y2 = ai2_16x16_had + 32 * 16;
1562*c83a76b0SSuyog Pawar WORD16 *pi2_y3 = ai2_16x16_had + 32 * 16 + 16;
1563*c83a76b0SSuyog Pawar
1564*c83a76b0SSuyog Pawar UWORD8 *pu1_src0 = pu1_src;
1565*c83a76b0SSuyog Pawar UWORD8 *pu1_src1 = pu1_src + 16;
1566*c83a76b0SSuyog Pawar UWORD8 *pu1_src2 = pu1_src + src_strd * 16;
1567*c83a76b0SSuyog Pawar UWORD8 *pu1_src3 = pu1_src + src_strd * 16 + 16;
1568*c83a76b0SSuyog Pawar
1569*c83a76b0SSuyog Pawar UWORD8 *pu1_pred0 = pu1_pred;
1570*c83a76b0SSuyog Pawar UWORD8 *pu1_pred1 = pu1_pred + 16;
1571*c83a76b0SSuyog Pawar UWORD8 *pu1_pred2 = pu1_pred + pred_strd * 16;
1572*c83a76b0SSuyog Pawar UWORD8 *pu1_pred3 = pu1_pred + pred_strd * 16 + 16;
1573*c83a76b0SSuyog Pawar
1574*c83a76b0SSuyog Pawar ASSERT(pos_x >= 0);
1575*c83a76b0SSuyog Pawar ASSERT(pos_y >= 0);
1576*c83a76b0SSuyog Pawar
1577*c83a76b0SSuyog Pawar /* Initialize pointers to store 32x32 SATDs */
1578*c83a76b0SSuyog Pawar pi4_32x32_hsad = ppi4_hsad[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1579*c83a76b0SSuyog Pawar
1580*c83a76b0SSuyog Pawar pi4_32x32_tu_split =
1581*c83a76b0SSuyog Pawar ppi4_tu_split[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1582*c83a76b0SSuyog Pawar
1583*c83a76b0SSuyog Pawar pi4_32x32_tu_early_cbf =
1584*c83a76b0SSuyog Pawar ppi4_tu_early_cbf[HAD_32x32] + (pos_x >> 3) + (pos_y >> 3) * (num_4x4_in_row >> 3);
1585*c83a76b0SSuyog Pawar
1586*c83a76b0SSuyog Pawar /* -------- Compute four 8x8 HAD Transforms of 16x16 call--------- */
1587*c83a76b0SSuyog Pawar best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1588*c83a76b0SSuyog Pawar pu1_src0,
1589*c83a76b0SSuyog Pawar src_strd,
1590*c83a76b0SSuyog Pawar pu1_pred0,
1591*c83a76b0SSuyog Pawar pred_strd,
1592*c83a76b0SSuyog Pawar pi2_y0,
1593*c83a76b0SSuyog Pawar 32,
1594*c83a76b0SSuyog Pawar ppi4_hsad,
1595*c83a76b0SSuyog Pawar ppi4_tu_split,
1596*c83a76b0SSuyog Pawar ppi4_tu_early_cbf,
1597*c83a76b0SSuyog Pawar pos_x_y_4x4,
1598*c83a76b0SSuyog Pawar num_4x4_in_row,
1599*c83a76b0SSuyog Pawar lambda,
1600*c83a76b0SSuyog Pawar lambda_q_shift,
1601*c83a76b0SSuyog Pawar i4_frm_qstep,
1602*c83a76b0SSuyog Pawar i4_cur_depth + 1,
1603*c83a76b0SSuyog Pawar i4_max_depth,
1604*c83a76b0SSuyog Pawar i4_max_tr_size,
1605*c83a76b0SSuyog Pawar pi4_tu_split_cost,
1606*c83a76b0SSuyog Pawar NULL);
1607*c83a76b0SSuyog Pawar
1608*c83a76b0SSuyog Pawar /* cost is shifted by 10bits */
1609*c83a76b0SSuyog Pawar best_cost = best_cost_tu_split >> 10;
1610*c83a76b0SSuyog Pawar
1611*c83a76b0SSuyog Pawar /* Tu split is present in the 6-10 bits */
1612*c83a76b0SSuyog Pawar tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1613*c83a76b0SSuyog Pawar
1614*c83a76b0SSuyog Pawar /*Early CBF info is present in the last 5 bits */
1615*c83a76b0SSuyog Pawar i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1616*c83a76b0SSuyog Pawar
1617*c83a76b0SSuyog Pawar tu_split_flag <<= 5;
1618*c83a76b0SSuyog Pawar i4_early_cbf_flag <<= 5;
1619*c83a76b0SSuyog Pawar
1620*c83a76b0SSuyog Pawar cost_child += best_cost;
1621*c83a76b0SSuyog Pawar
1622*c83a76b0SSuyog Pawar best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1623*c83a76b0SSuyog Pawar pu1_src1,
1624*c83a76b0SSuyog Pawar src_strd,
1625*c83a76b0SSuyog Pawar pu1_pred1,
1626*c83a76b0SSuyog Pawar pred_strd,
1627*c83a76b0SSuyog Pawar pi2_y1,
1628*c83a76b0SSuyog Pawar 32,
1629*c83a76b0SSuyog Pawar ppi4_hsad,
1630*c83a76b0SSuyog Pawar ppi4_tu_split,
1631*c83a76b0SSuyog Pawar ppi4_tu_early_cbf,
1632*c83a76b0SSuyog Pawar pos_x_y_4x4 + 4,
1633*c83a76b0SSuyog Pawar num_4x4_in_row,
1634*c83a76b0SSuyog Pawar lambda,
1635*c83a76b0SSuyog Pawar lambda_q_shift,
1636*c83a76b0SSuyog Pawar i4_frm_qstep,
1637*c83a76b0SSuyog Pawar i4_cur_depth + 1,
1638*c83a76b0SSuyog Pawar i4_max_depth,
1639*c83a76b0SSuyog Pawar i4_max_tr_size,
1640*c83a76b0SSuyog Pawar pi4_tu_split_cost,
1641*c83a76b0SSuyog Pawar NULL);
1642*c83a76b0SSuyog Pawar
1643*c83a76b0SSuyog Pawar /* cost is shifted by 10bits */
1644*c83a76b0SSuyog Pawar best_cost = best_cost_tu_split >> 10;
1645*c83a76b0SSuyog Pawar
1646*c83a76b0SSuyog Pawar /* Tu split is present in the 6-10 bits */
1647*c83a76b0SSuyog Pawar tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1648*c83a76b0SSuyog Pawar
1649*c83a76b0SSuyog Pawar /*Early CBF info is present in the last 5 bits */
1650*c83a76b0SSuyog Pawar i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1651*c83a76b0SSuyog Pawar
1652*c83a76b0SSuyog Pawar tu_split_flag <<= 5;
1653*c83a76b0SSuyog Pawar i4_early_cbf_flag <<= 5;
1654*c83a76b0SSuyog Pawar
1655*c83a76b0SSuyog Pawar cost_child += best_cost;
1656*c83a76b0SSuyog Pawar
1657*c83a76b0SSuyog Pawar best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1658*c83a76b0SSuyog Pawar pu1_src2,
1659*c83a76b0SSuyog Pawar src_strd,
1660*c83a76b0SSuyog Pawar pu1_pred2,
1661*c83a76b0SSuyog Pawar pred_strd,
1662*c83a76b0SSuyog Pawar pi2_y2,
1663*c83a76b0SSuyog Pawar 32,
1664*c83a76b0SSuyog Pawar ppi4_hsad,
1665*c83a76b0SSuyog Pawar ppi4_tu_split,
1666*c83a76b0SSuyog Pawar ppi4_tu_early_cbf,
1667*c83a76b0SSuyog Pawar pos_x_y_4x4 + (4 << 16),
1668*c83a76b0SSuyog Pawar num_4x4_in_row,
1669*c83a76b0SSuyog Pawar lambda,
1670*c83a76b0SSuyog Pawar lambda_q_shift,
1671*c83a76b0SSuyog Pawar i4_frm_qstep,
1672*c83a76b0SSuyog Pawar i4_cur_depth + 1,
1673*c83a76b0SSuyog Pawar i4_max_depth,
1674*c83a76b0SSuyog Pawar i4_max_tr_size,
1675*c83a76b0SSuyog Pawar pi4_tu_split_cost,
1676*c83a76b0SSuyog Pawar NULL);
1677*c83a76b0SSuyog Pawar
1678*c83a76b0SSuyog Pawar /* cost is shifted by 10bits */
1679*c83a76b0SSuyog Pawar best_cost = best_cost_tu_split >> 10;
1680*c83a76b0SSuyog Pawar
1681*c83a76b0SSuyog Pawar /* Tu split is present in the 6-10 bits */
1682*c83a76b0SSuyog Pawar tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1683*c83a76b0SSuyog Pawar
1684*c83a76b0SSuyog Pawar /*Early CBF info is present in the last 5 bits */
1685*c83a76b0SSuyog Pawar i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1686*c83a76b0SSuyog Pawar
1687*c83a76b0SSuyog Pawar tu_split_flag <<= 5;
1688*c83a76b0SSuyog Pawar i4_early_cbf_flag <<= 5;
1689*c83a76b0SSuyog Pawar
1690*c83a76b0SSuyog Pawar cost_child += best_cost;
1691*c83a76b0SSuyog Pawar
1692*c83a76b0SSuyog Pawar best_cost_tu_split = ps_func_selector->pf_had_16x16_r(
1693*c83a76b0SSuyog Pawar pu1_src3,
1694*c83a76b0SSuyog Pawar src_strd,
1695*c83a76b0SSuyog Pawar pu1_pred3,
1696*c83a76b0SSuyog Pawar pred_strd,
1697*c83a76b0SSuyog Pawar pi2_y3,
1698*c83a76b0SSuyog Pawar 32,
1699*c83a76b0SSuyog Pawar ppi4_hsad,
1700*c83a76b0SSuyog Pawar ppi4_tu_split,
1701*c83a76b0SSuyog Pawar ppi4_tu_early_cbf,
1702*c83a76b0SSuyog Pawar pos_x_y_4x4 + (4 << 16) + 4,
1703*c83a76b0SSuyog Pawar num_4x4_in_row,
1704*c83a76b0SSuyog Pawar lambda,
1705*c83a76b0SSuyog Pawar lambda_q_shift,
1706*c83a76b0SSuyog Pawar i4_frm_qstep,
1707*c83a76b0SSuyog Pawar i4_cur_depth + 1,
1708*c83a76b0SSuyog Pawar i4_max_depth,
1709*c83a76b0SSuyog Pawar i4_max_tr_size,
1710*c83a76b0SSuyog Pawar pi4_tu_split_cost,
1711*c83a76b0SSuyog Pawar NULL);
1712*c83a76b0SSuyog Pawar
1713*c83a76b0SSuyog Pawar /* cost is shifted by 10bits */
1714*c83a76b0SSuyog Pawar best_cost = best_cost_tu_split >> 10;
1715*c83a76b0SSuyog Pawar
1716*c83a76b0SSuyog Pawar /* Tu split is present in the 6-10 bits */
1717*c83a76b0SSuyog Pawar tu_split_flag += (best_cost_tu_split & 0x3E0) >> 5;
1718*c83a76b0SSuyog Pawar
1719*c83a76b0SSuyog Pawar /*Early CBF info is present in the last 5 bits */
1720*c83a76b0SSuyog Pawar i4_early_cbf_flag += best_cost_tu_split & 0x1F;
1721*c83a76b0SSuyog Pawar
1722*c83a76b0SSuyog Pawar tu_split_flag <<= 1;
1723*c83a76b0SSuyog Pawar i4_early_cbf_flag <<= 1;
1724*c83a76b0SSuyog Pawar
1725*c83a76b0SSuyog Pawar cost_child += best_cost;
1726*c83a76b0SSuyog Pawar
1727*c83a76b0SSuyog Pawar {
1728*c83a76b0SSuyog Pawar UWORD32 u4_satd = 0;
1729*c83a76b0SSuyog Pawar
1730*c83a76b0SSuyog Pawar u4_satd = ps_func_selector->pf_compute_32x32HAD_using_16x16(
1731*c83a76b0SSuyog Pawar pi2_y0, 32, pi2_dst, dst_strd, i4_frm_qstep, &early_cbf);
1732*c83a76b0SSuyog Pawar
1733*c83a76b0SSuyog Pawar cost_parent = ((u4_satd + 2) >> 2);
1734*c83a76b0SSuyog Pawar }
1735*c83a76b0SSuyog Pawar
1736*c83a76b0SSuyog Pawar /* 4 TU_Split flags , 4 CBF Flags*/
1737*c83a76b0SSuyog Pawar cost_child += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1738*c83a76b0SSuyog Pawar
1739*c83a76b0SSuyog Pawar i4_early_cbf_flag += early_cbf;
1740*c83a76b0SSuyog Pawar
1741*c83a76b0SSuyog Pawar /* 1 TU_SPlit flag, 1 CBF flag */
1742*c83a76b0SSuyog Pawar //cost_parent += ((1 + 1)* lambda) >> (lambda_q_shift + 1);
1743*c83a76b0SSuyog Pawar
1744*c83a76b0SSuyog Pawar if(i4_cur_depth < i4_max_depth)
1745*c83a76b0SSuyog Pawar {
1746*c83a76b0SSuyog Pawar if((cost_child < cost_parent) || (u1_cur_tr_size > i4_max_tr_size))
1747*c83a76b0SSuyog Pawar {
1748*c83a76b0SSuyog Pawar *pi4_tu_split_cost += ((4 + 4) * lambda) >> (lambda_q_shift + 1);
1749*c83a76b0SSuyog Pawar best_cost = cost_child;
1750*c83a76b0SSuyog Pawar tu_split_flag++;
1751*c83a76b0SSuyog Pawar }
1752*c83a76b0SSuyog Pawar else
1753*c83a76b0SSuyog Pawar {
1754*c83a76b0SSuyog Pawar tu_split_flag = 0;
1755*c83a76b0SSuyog Pawar best_cost = cost_parent;
1756*c83a76b0SSuyog Pawar }
1757*c83a76b0SSuyog Pawar }
1758*c83a76b0SSuyog Pawar else
1759*c83a76b0SSuyog Pawar {
1760*c83a76b0SSuyog Pawar tu_split_flag = 0;
1761*c83a76b0SSuyog Pawar best_cost = cost_parent;
1762*c83a76b0SSuyog Pawar }
1763*c83a76b0SSuyog Pawar
1764*c83a76b0SSuyog Pawar pi4_32x32_tu_split[0] = tu_split_flag;
1765*c83a76b0SSuyog Pawar
1766*c83a76b0SSuyog Pawar pi4_32x32_hsad[0] = best_cost;
1767*c83a76b0SSuyog Pawar
1768*c83a76b0SSuyog Pawar pi4_32x32_tu_early_cbf[0] = i4_early_cbf_flag;
1769*c83a76b0SSuyog Pawar }
1770