1*c83a76b0SSuyog Pawar /******************************************************************************
2*c83a76b0SSuyog Pawar *
3*c83a76b0SSuyog Pawar * Copyright (C) 2018 The Android Open Source Project
4*c83a76b0SSuyog Pawar *
5*c83a76b0SSuyog Pawar * Licensed under the Apache License, Version 2.0 (the "License");
6*c83a76b0SSuyog Pawar * you may not use this file except in compliance with the License.
7*c83a76b0SSuyog Pawar * You may obtain a copy of the License at:
8*c83a76b0SSuyog Pawar *
9*c83a76b0SSuyog Pawar * http://www.apache.org/licenses/LICENSE-2.0
10*c83a76b0SSuyog Pawar *
11*c83a76b0SSuyog Pawar * Unless required by applicable law or agreed to in writing, software
12*c83a76b0SSuyog Pawar * distributed under the License is distributed on an "AS IS" BASIS,
13*c83a76b0SSuyog Pawar * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*c83a76b0SSuyog Pawar * See the License for the specific language governing permissions and
15*c83a76b0SSuyog Pawar * limitations under the License.
16*c83a76b0SSuyog Pawar *
17*c83a76b0SSuyog Pawar *****************************************************************************
18*c83a76b0SSuyog Pawar * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*c83a76b0SSuyog Pawar */
20*c83a76b0SSuyog Pawar /**
21*c83a76b0SSuyog Pawar ******************************************************************************
22*c83a76b0SSuyog Pawar * @file
23*c83a76b0SSuyog Pawar * ihevce_subpel_neon.c
24*c83a76b0SSuyog Pawar *
25*c83a76b0SSuyog Pawar * @brief
26*c83a76b0SSuyog Pawar * Subpel refinement modules for ME algo
27*c83a76b0SSuyog Pawar *
28*c83a76b0SSuyog Pawar * @author
29*c83a76b0SSuyog Pawar * Ittiam
30*c83a76b0SSuyog Pawar *
31*c83a76b0SSuyog Pawar * @par List of Functions:
32*c83a76b0SSuyog Pawar *
33*c83a76b0SSuyog Pawar * @remarks
34*c83a76b0SSuyog Pawar * None
35*c83a76b0SSuyog Pawar *
36*c83a76b0SSuyog Pawar ********************************************************************************
37*c83a76b0SSuyog Pawar */
38*c83a76b0SSuyog Pawar
39*c83a76b0SSuyog Pawar /*****************************************************************************/
40*c83a76b0SSuyog Pawar /* File Includes */
41*c83a76b0SSuyog Pawar /*****************************************************************************/
42*c83a76b0SSuyog Pawar /* System include files */
43*c83a76b0SSuyog Pawar #include <stdio.h>
44*c83a76b0SSuyog Pawar #include <string.h>
45*c83a76b0SSuyog Pawar #include <assert.h>
46*c83a76b0SSuyog Pawar #include <arm_neon.h>
47*c83a76b0SSuyog Pawar
48*c83a76b0SSuyog Pawar /* User include files */
49*c83a76b0SSuyog Pawar #include "ihevc_typedefs.h"
50*c83a76b0SSuyog Pawar #include "itt_video_api.h"
51*c83a76b0SSuyog Pawar #include "ihevc_cmn_utils_neon.h"
52*c83a76b0SSuyog Pawar #include "ihevc_chroma_itrans_recon.h"
53*c83a76b0SSuyog Pawar #include "ihevc_chroma_intra_pred.h"
54*c83a76b0SSuyog Pawar #include "ihevc_debug.h"
55*c83a76b0SSuyog Pawar #include "ihevc_deblk.h"
56*c83a76b0SSuyog Pawar #include "ihevc_defs.h"
57*c83a76b0SSuyog Pawar #include "ihevc_itrans_recon.h"
58*c83a76b0SSuyog Pawar #include "ihevc_intra_pred.h"
59*c83a76b0SSuyog Pawar #include "ihevc_inter_pred.h"
60*c83a76b0SSuyog Pawar #include "ihevc_macros.h"
61*c83a76b0SSuyog Pawar #include "ihevc_mem_fns.h"
62*c83a76b0SSuyog Pawar #include "ihevc_padding.h"
63*c83a76b0SSuyog Pawar #include "ihevc_quant_iquant_ssd.h"
64*c83a76b0SSuyog Pawar #include "ihevc_resi_trans.h"
65*c83a76b0SSuyog Pawar #include "ihevc_sao.h"
66*c83a76b0SSuyog Pawar #include "ihevc_structs.h"
67*c83a76b0SSuyog Pawar #include "ihevc_weighted_pred.h"
68*c83a76b0SSuyog Pawar
69*c83a76b0SSuyog Pawar #include "rc_cntrl_param.h"
70*c83a76b0SSuyog Pawar #include "rc_frame_info_collector.h"
71*c83a76b0SSuyog Pawar #include "rc_look_ahead_params.h"
72*c83a76b0SSuyog Pawar
73*c83a76b0SSuyog Pawar #include "ihevce_api.h"
74*c83a76b0SSuyog Pawar #include "ihevce_defs.h"
75*c83a76b0SSuyog Pawar #include "ihevce_lap_enc_structs.h"
76*c83a76b0SSuyog Pawar #include "ihevce_multi_thrd_structs.h"
77*c83a76b0SSuyog Pawar #include "ihevce_function_selector.h"
78*c83a76b0SSuyog Pawar #include "ihevce_me_common_defs.h"
79*c83a76b0SSuyog Pawar #include "ihevce_enc_structs.h"
80*c83a76b0SSuyog Pawar #include "ihevce_had_satd.h"
81*c83a76b0SSuyog Pawar #include "ihevce_ipe_instr_set_router.h"
82*c83a76b0SSuyog Pawar #include "ihevce_global_tables.h"
83*c83a76b0SSuyog Pawar
84*c83a76b0SSuyog Pawar #include "hme_datatype.h"
85*c83a76b0SSuyog Pawar #include "hme_common_defs.h"
86*c83a76b0SSuyog Pawar #include "hme_interface.h"
87*c83a76b0SSuyog Pawar #include "hme_defs.h"
88*c83a76b0SSuyog Pawar
89*c83a76b0SSuyog Pawar #include "ihevce_me_instr_set_router.h"
90*c83a76b0SSuyog Pawar
91*c83a76b0SSuyog Pawar /*****************************************************************************/
92*c83a76b0SSuyog Pawar /* Function Declarations */
93*c83a76b0SSuyog Pawar /*****************************************************************************/
94*c83a76b0SSuyog Pawar FT_CALC_SATD_AND_RESULT hme_evalsatd_update_1_best_result_pt_pu_16x16_neon;
95*c83a76b0SSuyog Pawar
96*c83a76b0SSuyog Pawar WORD32 ihevce_had4_4x4_neon(
97*c83a76b0SSuyog Pawar UWORD8 *pu1_src,
98*c83a76b0SSuyog Pawar WORD32 src_strd,
99*c83a76b0SSuyog Pawar UWORD8 *pu1_pred,
100*c83a76b0SSuyog Pawar WORD32 pred_strd,
101*c83a76b0SSuyog Pawar WORD16 *pi2_dst4x4,
102*c83a76b0SSuyog Pawar WORD32 dst_strd,
103*c83a76b0SSuyog Pawar WORD32 *pi4_hsad,
104*c83a76b0SSuyog Pawar WORD32 hsad_stride,
105*c83a76b0SSuyog Pawar WORD32 i4_frm_qstep);
106*c83a76b0SSuyog Pawar
107*c83a76b0SSuyog Pawar /*****************************************************************************/
108*c83a76b0SSuyog Pawar /* Function Definitions */
109*c83a76b0SSuyog Pawar /*****************************************************************************/
110*c83a76b0SSuyog Pawar
/**
 * Averages one 4x4 tile taken from two sources and writes it to pu1_dst.
 *
 * Each source is fetched with load_unaligned_u8q(), the two vectors are
 * combined per byte with vrhaddq_u8() (rounding halving add, i.e.
 * (a + b + 1) >> 1) and the result is written with store_unaligned_u8q().
 *
 * NOTE(review): the load/store helpers are defined outside this file; they
 * presumably gather/scatter four 4-byte rows spaced by the given stride into
 * a single 16-byte vector - confirm against ihevc_cmn_utils_neon.h.
 *
 * @param pu1_src_a   first source tile
 * @param pu1_src_b   second source tile
 * @param src_a_strd  stride passed to the loader for pu1_src_a
 * @param src_b_strd  stride passed to the loader for pu1_src_b
 * @param pu1_dst     destination tile
 * @param dst_strd    stride passed to the store helper for pu1_dst
 */
static void hme_4x4_qpel_interp_avg_neon(
    UWORD8 *pu1_src_a,
    UWORD8 *pu1_src_b,
    WORD32 src_a_strd,
    WORD32 src_b_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd)
{
    const uint8x16_t tile_a = load_unaligned_u8q(pu1_src_a, src_a_strd);
    const uint8x16_t tile_b = load_unaligned_u8q(pu1_src_b, src_b_strd);

    store_unaligned_u8q(pu1_dst, dst_strd, vrhaddq_u8(tile_a, tile_b));
}
125*c83a76b0SSuyog Pawar
/**
 * Averages an 8-pixel-wide column of ht rows from two sources.
 *
 * Every row is processed as one 8-byte vector: load from both sources,
 * rounding halving add (vrhadd_u8, i.e. (a + b + 1) >> 1), store to the
 * destination. Nothing is done when ht is zero or negative.
 *
 * @param pu1_src_a   first source, advanced by src_a_strd per row
 * @param pu1_src_b   second source, advanced by src_b_strd per row
 * @param src_a_strd  row stride of pu1_src_a in bytes
 * @param src_b_strd  row stride of pu1_src_b in bytes
 * @param pu1_dst     destination, advanced by dst_strd per row
 * @param dst_strd    row stride of pu1_dst in bytes
 * @param ht          number of rows to process
 */
static void hme_8xn_qpel_interp_avg_neon(
    UWORD8 *pu1_src_a,
    UWORD8 *pu1_src_b,
    WORD32 src_a_strd,
    WORD32 src_b_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht)
{
    WORD32 rows_left;

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        const uint8x8_t row_a = vld1_u8(pu1_src_a);
        const uint8x8_t row_b = vld1_u8(pu1_src_b);

        vst1_u8(pu1_dst, vrhadd_u8(row_a, row_b));

        /* Step all three pointers down to the next row */
        pu1_dst += dst_strd;
        pu1_src_b += src_b_strd;
        pu1_src_a += src_a_strd;
    }
}
149*c83a76b0SSuyog Pawar
/**
 * Averages a 16-pixel-wide column of ht rows from two sources.
 *
 * Every row is processed as one 16-byte vector: load from both sources,
 * rounding halving add (vrhaddq_u8, i.e. (a + b + 1) >> 1), store to the
 * destination. Nothing is done when ht is zero or negative.
 *
 * @param pu1_src_a   first source, advanced by src_a_strd per row
 * @param pu1_src_b   second source, advanced by src_b_strd per row
 * @param src_a_strd  row stride of pu1_src_a in bytes
 * @param src_b_strd  row stride of pu1_src_b in bytes
 * @param pu1_dst     destination, advanced by dst_strd per row
 * @param dst_strd    row stride of pu1_dst in bytes
 * @param ht          number of rows to process
 */
static void hme_16xn_qpel_interp_avg_neon(
    UWORD8 *pu1_src_a,
    UWORD8 *pu1_src_b,
    WORD32 src_a_strd,
    WORD32 src_b_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht)
{
    WORD32 rows_left;

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        const uint8x16_t row_a = vld1q_u8(pu1_src_a);
        const uint8x16_t row_b = vld1q_u8(pu1_src_b);

        vst1q_u8(pu1_dst, vrhaddq_u8(row_a, row_b));

        /* Step all three pointers down to the next row */
        pu1_dst += dst_strd;
        pu1_src_b += src_b_strd;
        pu1_src_a += src_a_strd;
    }
}
173*c83a76b0SSuyog Pawar
/**
 * Averages a 32-pixel-wide column of ht rows from two sources.
 *
 * Every row is handled as two 16-byte halves (byte offsets 0 and 16). Both
 * halves of both sources are loaded before anything is stored; each half is
 * combined with vrhaddq_u8 (rounding halving add, i.e. (a + b + 1) >> 1).
 * Nothing is done when ht is zero or negative.
 *
 * @param pu1_src_a   first source, advanced by src_a_strd per row
 * @param pu1_src_b   second source, advanced by src_b_strd per row
 * @param src_a_strd  row stride of pu1_src_a in bytes
 * @param src_b_strd  row stride of pu1_src_b in bytes
 * @param pu1_dst     destination, advanced by dst_strd per row
 * @param dst_strd    row stride of pu1_dst in bytes
 * @param ht          number of rows to process
 */
static void hme_32xn_qpel_interp_avg_neon(
    UWORD8 *pu1_src_a,
    UWORD8 *pu1_src_b,
    WORD32 src_a_strd,
    WORD32 src_b_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht)
{
    WORD32 rows_left;

    for(rows_left = ht; rows_left > 0; rows_left--)
    {
        /* Low half: bytes 0..15 of the row */
        const uint8x16_t lo_a = vld1q_u8(pu1_src_a);
        const uint8x16_t lo_b = vld1q_u8(pu1_src_b);
        const uint8x16_t lo_avg = vrhaddq_u8(lo_a, lo_b);

        /* High half: bytes 16..31 of the row */
        const uint8x16_t hi_a = vld1q_u8(pu1_src_a + 16);
        const uint8x16_t hi_b = vld1q_u8(pu1_src_b + 16);
        const uint8x16_t hi_avg = vrhaddq_u8(hi_a, hi_b);

        vst1q_u8(pu1_dst, lo_avg);
        vst1q_u8(pu1_dst + 16, hi_avg);

        /* Step all three pointers down to the next row */
        pu1_dst += dst_strd;
        pu1_src_b += src_b_strd;
        pu1_src_a += src_a_strd;
    }
}
202*c83a76b0SSuyog Pawar
/**
 * Averages a blk_wd x blk_ht block from two sources into pu1_dst.
 *
 * The block is walked in bands of 4 rows. Inside a band the columns are
 * covered greedily, left to right, with the widest kernel that still fits in
 * the remaining width: 32, then 16, then 8, and finally the 4x4 kernel.
 *
 * Both dimensions are asserted to be multiples of 4; the asserts are the only
 * check, so they disappear when NDEBUG is defined.
 *
 * @param pu1_src_a   first source block
 * @param pu1_src_b   second source block
 * @param src_a_strd  row stride of pu1_src_a in bytes
 * @param src_b_strd  row stride of pu1_src_b in bytes
 * @param pu1_dst     destination block
 * @param dst_strd    row stride of pu1_dst in bytes
 * @param blk_wd      block width in pixels (multiple of 4)
 * @param blk_ht      block height in pixels (multiple of 4)
 */
static void hme_4mx4n_qpel_interp_avg_neon(
    UWORD8 *pu1_src_a,
    UWORD8 *pu1_src_b,
    WORD32 src_a_strd,
    WORD32 src_b_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 blk_wd,
    WORD32 blk_ht)
{
    WORD32 row;

    assert(blk_wd % 4 == 0);
    assert(blk_ht % 4 == 0);

    for(row = 0; row < blk_ht; row += 4)
    {
        WORD32 col = 0;

        while(col < blk_wd)
        {
            const WORD32 remaining = blk_wd - col;
            UWORD8 *pu1_tile_a = pu1_src_a + col;
            UWORD8 *pu1_tile_b = pu1_src_b + col;
            UWORD8 *pu1_tile_dst = pu1_dst + col;
            WORD32 step;

            if(remaining >= 32)
            {
                step = 32;
                hme_32xn_qpel_interp_avg_neon(
                    pu1_tile_a, pu1_tile_b, src_a_strd, src_b_strd, pu1_tile_dst, dst_strd, 4);
            }
            else if(remaining >= 16)
            {
                step = 16;
                hme_16xn_qpel_interp_avg_neon(
                    pu1_tile_a, pu1_tile_b, src_a_strd, src_b_strd, pu1_tile_dst, dst_strd, 4);
            }
            else if(remaining >= 8)
            {
                step = 8;
                hme_8xn_qpel_interp_avg_neon(
                    pu1_tile_a, pu1_tile_b, src_a_strd, src_b_strd, pu1_tile_dst, dst_strd, 4);
            }
            else
            {
                step = 4;
                hme_4x4_qpel_interp_avg_neon(
                    pu1_tile_a, pu1_tile_b, src_a_strd, src_b_strd, pu1_tile_dst, dst_strd);
            }

            col += step;
        }

        /* Move on to the next band of four rows */
        pu1_dst += (4 * dst_strd);
        pu1_src_b += (4 * src_b_strd);
        pu1_src_a += (4 * src_a_strd);
    }
}
254*c83a76b0SSuyog Pawar
/**
 * Produces the prediction block for a quarter-pel motion vector and publishes
 * it through ps_prms->pu1_final_out / ps_prms->i4_final_out_stride.
 *
 * The two low bits of each MV component select an entry of
 * gas_qpel_inp_buf_cfg[y_frac][x_frac]; that entry names two reference planes
 * (indices into ps_prms->ppu1_ref) together with an x/y offset for each. The
 * remaining MV bits (mv >> 2) give the displacement inside the plane.
 *
 *  - If both plane ids are identical, no averaging is performed: the output
 *    pointer is set directly into that reference plane and the output stride
 *    is the reference stride.
 *  - Otherwise the two addressed blocks are averaged into
 *    ps_prms->apu1_interp_out[i4_buf_id] and the output pointer/stride refer
 *    to that buffer (stride ps_prms->i4_out_stride).
 *
 * @param ps_prms    interpolation parameters; its final-output fields are
 *                   written by this function
 * @param i4_mv_x    horizontal motion vector in quarter-pel units
 * @param i4_mv_y    vertical motion vector in quarter-pel units
 * @param i4_buf_id  index of the interpolation output buffer to use when
 *                   averaging is needed
 */
void hme_qpel_interp_avg_neon(interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, S32 i4_buf_id)
{
    const S32 i4_stride = ps_prms->i4_ref_stride;
    /* Displacement contributed by the MV bits above the 2-bit fraction */
    const S32 i4_plane_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_stride;
    /* Descriptor chosen by the fractional part: row = y, column = x */
    qpel_input_buf_cfg_t *ps_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y & 3][i4_mv_x & 3];
    U08 *pu1_first;
    U08 *pu1_second;
    U08 *pu1_out;

    pu1_first = ps_prms->ppu1_ref[ps_cfg->i1_buf_id1];
    pu1_first += ps_cfg->i1_buf_xoff1 + i4_plane_offset;
    pu1_first += (ps_cfg->i1_buf_yoff1 * i4_stride);

    if(ps_cfg->i1_buf_id1 == ps_cfg->i1_buf_id2)
    {
        /* This is case for fxfy/hxfy/fxhy/hxhy: point straight at the plane */
        ps_prms->pu1_final_out = pu1_first;
        ps_prms->i4_final_out_stride = i4_stride;

        return;
    }

    pu1_second = ps_prms->ppu1_ref[ps_cfg->i1_buf_id2];
    pu1_second += ps_cfg->i1_buf_xoff2 + i4_plane_offset;
    pu1_second += (ps_cfg->i1_buf_yoff2 * i4_stride);

    pu1_out = ps_prms->apu1_interp_out[i4_buf_id];

    hme_4mx4n_qpel_interp_avg_neon(
        pu1_first,
        pu1_second,
        i4_stride,
        i4_stride,
        pu1_out,
        ps_prms->i4_out_stride,
        ps_prms->i4_blk_wd,
        ps_prms->i4_blk_ht);

    ps_prms->pu1_final_out = pu1_out;
    ps_prms->i4_final_out_stride = ps_prms->i4_out_stride;
}
303*c83a76b0SSuyog Pawar
304*c83a76b0SSuyog Pawar // TODO: Can this function and above function be unified
/**
 * Builds the averaged prediction block for one quarter-pel point and records
 * it in caller-supplied per-buffer arrays.
 *
 * The two low bits of each MV component select an entry of
 * gas_qpel_inp_buf_cfg[y_frac][x_frac], which names two reference planes
 * (indices into ps_prms->ppu1_ref) and an x/y offset for each; the remaining
 * MV bits (mv >> 2) give the displacement inside the plane. The two addressed
 * blocks are always averaged into ps_prms->apu1_interp_out[i4_buf_id]; unlike
 * hme_qpel_interp_avg_neon() there is no shortcut when both plane ids match.
 *
 * @param ps_prms           interpolation parameters (not modified here)
 * @param i4_mv_x           horizontal motion vector in quarter-pel units
 * @param i4_mv_y           vertical motion vector in quarter-pel units
 * @param i4_buf_id         interpolation output buffer to fill; also the
 *                          index written in the two output arrays
 * @param ppu1_final        [out] entry i4_buf_id receives the output pointer
 * @param pi4_final_stride  [out] entry i4_buf_id receives ps_prms->i4_out_stride
 */
void hme_qpel_interp_avg_1pt_neon(
    interp_prms_t *ps_prms,
    S32 i4_mv_x,
    S32 i4_mv_y,
    S32 i4_buf_id,
    U08 **ppu1_final,
    S32 *pi4_final_stride)
{
    const S32 i4_stride = ps_prms->i4_ref_stride;
    /* Displacement contributed by the MV bits above the 2-bit fraction */
    const S32 i4_plane_offset = (i4_mv_x >> 2) + (i4_mv_y >> 2) * i4_stride;
    /* Descriptor chosen by the fractional part: row = y, column = x */
    qpel_input_buf_cfg_t *ps_cfg = &gas_qpel_inp_buf_cfg[i4_mv_y & 3][i4_mv_x & 3];
    U08 *pu1_out = ps_prms->apu1_interp_out[i4_buf_id];
    U08 *pu1_first;
    U08 *pu1_second;

    pu1_first = ps_prms->ppu1_ref[ps_cfg->i1_buf_id1];
    pu1_first += ps_cfg->i1_buf_xoff1 + i4_plane_offset;
    pu1_first += (ps_cfg->i1_buf_yoff1 * i4_stride);

    pu1_second = ps_prms->ppu1_ref[ps_cfg->i1_buf_id2];
    pu1_second += ps_cfg->i1_buf_xoff2 + i4_plane_offset;
    pu1_second += (ps_cfg->i1_buf_yoff2 * i4_stride);

    hme_4mx4n_qpel_interp_avg_neon(
        pu1_first,
        pu1_second,
        i4_stride,
        i4_stride,
        pu1_out,
        ps_prms->i4_out_stride,
        ps_prms->i4_blk_wd,
        ps_prms->i4_blk_ht);

    /* Publish the result in the slot that matches the buffer id */
    ppu1_final[i4_buf_id] = pu1_out;
    pi4_final_stride[i4_buf_id] = ps_prms->i4_out_stride;
}
348*c83a76b0SSuyog Pawar
/**
 * Interpolates the two quarter-pel points vertically adjacent to
 * (i4_mv_x, i4_mv_y).
 *
 * The point at i4_mv_y + 1 is generated first into buffer 3, then the point
 * at i4_mv_y - 1 into buffer 1; both go through
 * hme_qpel_interp_avg_1pt_neon(), which fills the matching entries of
 * ppu1_final / pi4_final_stride.
 *
 * @param ps_prms           interpolation parameters
 * @param i4_mv_x           horizontal motion vector in quarter-pel units
 * @param i4_mv_y           vertical motion vector of the centre point
 * @param ppu1_final        [out] entries 3 and 1 receive the output pointers
 * @param pi4_final_stride  [out] entries 3 and 1 receive the output strides
 */
void hme_qpel_interp_avg_2pt_vert_with_reuse_neon(
    interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, U08 **ppu1_final, S32 *pi4_final_stride)
{
    enum
    {
        BUF_ID_MV_Y_MINUS_1 = 1,
        BUF_ID_MV_Y_PLUS_1 = 3
    };
    const S32 i4_mv_y_plus_1 = i4_mv_y + 1;
    const S32 i4_mv_y_minus_1 = i4_mv_y - 1;

    hme_qpel_interp_avg_1pt_neon(
        ps_prms, i4_mv_x, i4_mv_y_plus_1, BUF_ID_MV_Y_PLUS_1, ppu1_final, pi4_final_stride);

    hme_qpel_interp_avg_1pt_neon(
        ps_prms, i4_mv_x, i4_mv_y_minus_1, BUF_ID_MV_Y_MINUS_1, ppu1_final, pi4_final_stride);
}
356*c83a76b0SSuyog Pawar
/**
 * Interpolates the two quarter-pel points horizontally adjacent to
 * (i4_mv_x, i4_mv_y).
 *
 * The point at i4_mv_x + 1 is generated first into buffer 2, then the point
 * at i4_mv_x - 1 into buffer 0; both go through
 * hme_qpel_interp_avg_1pt_neon(), which fills the matching entries of
 * ppu1_final / pi4_final_stride.
 *
 * @param ps_prms           interpolation parameters
 * @param i4_mv_x           horizontal motion vector of the centre point
 * @param i4_mv_y           vertical motion vector in quarter-pel units
 * @param ppu1_final        [out] entries 2 and 0 receive the output pointers
 * @param pi4_final_stride  [out] entries 2 and 0 receive the output strides
 */
void hme_qpel_interp_avg_2pt_horz_with_reuse_neon(
    interp_prms_t *ps_prms, S32 i4_mv_x, S32 i4_mv_y, U08 **ppu1_final, S32 *pi4_final_stride)
{
    enum
    {
        BUF_ID_MV_X_MINUS_1 = 0,
        BUF_ID_MV_X_PLUS_1 = 2
    };
    const S32 i4_mv_x_plus_1 = i4_mv_x + 1;
    const S32 i4_mv_x_minus_1 = i4_mv_x - 1;

    hme_qpel_interp_avg_1pt_neon(
        ps_prms, i4_mv_x_plus_1, i4_mv_y, BUF_ID_MV_X_PLUS_1, ppu1_final, pi4_final_stride);

    hme_qpel_interp_avg_1pt_neon(
        ps_prms, i4_mv_x_minus_1, i4_mv_y, BUF_ID_MV_X_MINUS_1, ppu1_final, pi4_final_stride);
}
364*c83a76b0SSuyog Pawar
/**
 * Evaluates SATD for every partition of a 16x16 block at one candidate
 * position and updates the single best result kept per valid partition.
 *
 * Steps:
 *  1. ihevce_had4_4x4_neon() is called once per 8x8 quadrant (TL, TR, BL, BR).
 *     Its return value is used as the SATD of that quadrant; it also writes
 *     per-4x4 values into ai4_satd_4x4 through the pointer/stride (4) passed.
 *  2. pi4_sad_grid is filled for 2Nx2N, the four NxN parts, the Nx2N / 2NxN
 *     halves and the AMP parts. For every AMP pair, one side is a sum of four
 *     4x4 values and the other is the 2Nx2N total minus that sum.
 *  3. For each valid partition the total cost (clipped SATD + stored MV cost,
 *     clipped with CLIP_S16) is compared with the stored cost in result slot
 *     0; when strictly smaller, slot 0 is overwritten with this candidate's
 *     cost, MV and reference index.
 *
 * NOTE(review): which 4x4 sub-block each ai4_satd_4x4 index refers to depends
 * on how ihevce_had4_4x4_neon() writes through pi4_hsad / hsad_stride, which
 * is not visible in this file. The AMP index sets below are kept exactly as
 * in the original code - confirm them against that function's output layout.
 *
 * @param ps_prms         error-computation parameters: input/reference
 *                        pointers and strides, and the SAD grid to fill
 * @param ps_result_prms  result-update parameters: refinement context plus
 *                        the candidate MV and reference index
 */
void hme_evalsatd_update_1_best_result_pt_pu_16x16_neon(
    err_prms_t *ps_prms, result_upd_prms_t *ps_result_prms)
{
    mv_refine_ctxt_t *ps_ctxt = ps_result_prms->ps_subpel_refine_ctxt;
    S32 *pi4_grid = ps_prms->pi4_sad_grid;
    S32 *pi4_part_ids = &ps_ctxt->ai4_part_id[0];

    U08 *pu1_inp = ps_prms->pu1_inp;
    U08 *pu1_ref = ps_prms->pu1_ref;
    S32 inp_stride = ps_prms->i4_inp_stride;
    S32 ref_stride = ps_prms->i4_ref_stride;

    S32 ai4_satd_4x4[16];
    S32 ai4_satd_8x8[4];

    S32 satd_2Nx2N;
    S32 satd_nLx2N_L, satd_nRx2N_R, satd_2NxnU_T, satd_2NxnD_B;
    S32 num_parts, index_by_part_id;
    S32 quad, n;

    /* One HAD call per 8x8 quadrant, visited in TL, TR, BL, BR order */
    for(quad = 0; quad < 4; quad++)
    {
        const S32 quad_col = quad & 0x1;
        const S32 quad_row = quad >> 1;
        U08 *pu1_src = pu1_inp + quad_col * 8 + quad_row * inp_stride * 8;
        U08 *pu1_pred = pu1_ref + quad_col * 8 + quad_row * ref_stride * 8;
        S32 *pi4_quad_satd_4x4 = &ai4_satd_4x4[quad_col * 2 + quad_row * 8];

        ai4_satd_8x8[quad] = ihevce_had4_4x4_neon(
            pu1_src, inp_stride, pu1_pred, ref_stride, NULL, 0, pi4_quad_satd_4x4, 4, 0);
    }

    /* Whole block and its four 8x8 quadrants */
    satd_2Nx2N = ai4_satd_8x8[0] + ai4_satd_8x8[1] + ai4_satd_8x8[2] + ai4_satd_8x8[3];
    pi4_grid[PART_ID_2Nx2N] = satd_2Nx2N;

    pi4_grid[PART_ID_NxN_TL] = ai4_satd_8x8[0];
    pi4_grid[PART_ID_NxN_TR] = ai4_satd_8x8[1];
    pi4_grid[PART_ID_NxN_BL] = ai4_satd_8x8[2];
    pi4_grid[PART_ID_NxN_BR] = ai4_satd_8x8[3];

    /* 8x16 and 16x8 halves */
    pi4_grid[PART_ID_Nx2N_L] = ai4_satd_8x8[0] + ai4_satd_8x8[2];
    pi4_grid[PART_ID_Nx2N_R] = ai4_satd_8x8[1] + ai4_satd_8x8[3];
    pi4_grid[PART_ID_2NxN_T] = ai4_satd_8x8[0] + ai4_satd_8x8[1];
    pi4_grid[PART_ID_2NxN_B] = ai4_satd_8x8[2] + ai4_satd_8x8[3];

    /* AMP parts built directly from four 4x4 values each */
    satd_nLx2N_L = ai4_satd_4x4[0] + ai4_satd_4x4[2] + ai4_satd_4x4[8] + ai4_satd_4x4[10];
    satd_nRx2N_R = ai4_satd_4x4[5] + ai4_satd_4x4[7] + ai4_satd_4x4[13] + ai4_satd_4x4[15];
    satd_2NxnU_T = ai4_satd_4x4[0] + ai4_satd_4x4[1] + ai4_satd_4x4[4] + ai4_satd_4x4[5];
    satd_2NxnD_B = ai4_satd_4x4[10] + ai4_satd_4x4[11] + ai4_satd_4x4[14] + ai4_satd_4x4[15];

    pi4_grid[PART_ID_nLx2N_L] = satd_nLx2N_L;
    pi4_grid[PART_ID_nRx2N_R] = satd_nRx2N_R;
    pi4_grid[PART_ID_2NxnU_T] = satd_2NxnU_T;
    pi4_grid[PART_ID_2NxnD_B] = satd_2NxnD_B;

    /* Complementary AMP parts: whole block minus the part computed above */
    pi4_grid[PART_ID_nLx2N_R] = satd_2Nx2N - satd_nLx2N_L;
    pi4_grid[PART_ID_nRx2N_L] = satd_2Nx2N - satd_nRx2N_R;
    pi4_grid[PART_ID_2NxnU_B] = satd_2Nx2N - satd_2NxnU_T;
    pi4_grid[PART_ID_2NxnD_T] = satd_2Nx2N - satd_2NxnD_B;

    /* Result arrays are indexed by partition id when more than 8 partitions
     * are valid, and by position in the valid list otherwise */
    num_parts = ps_ctxt->i4_num_valid_parts;
    index_by_part_id = (num_parts > 8);

    for(n = 0; n < num_parts; n++)
    {
        const S32 part_id = pi4_part_ids[n];
        const S32 slot = index_by_part_id ? part_id : n;
        const S32 i4_mv_cost = ps_ctxt->i2_mv_cost[0][slot];
        const S32 i4_sad = CLIP3(pi4_grid[part_id], 0, 0x7fff);
        const S32 i4_tot_cost = CLIP_S16(i4_sad + i4_mv_cost);
        const S32 i4_best_cost = CLIP_S16(ps_ctxt->i2_tot_cost[0][slot]);

        /* Only a strictly cheaper candidate replaces the stored best */
        if(i4_tot_cost >= i4_best_cost)
        {
            continue;
        }

        ps_ctxt->i2_tot_cost[0][slot] = i4_tot_cost;
        ps_ctxt->i2_mv_cost[0][slot] = i4_mv_cost;
        ps_ctxt->i2_mv_x[0][slot] = ps_result_prms->i2_mv_x;
        ps_ctxt->i2_mv_y[0][slot] = ps_result_prms->i2_mv_y;
        ps_ctxt->i2_ref_idx[0][slot] = ps_result_prms->i1_ref_idx;
    }
}
445