/******************************************************************************
*
* Copyright (C) 2018 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*****************************************************************************
* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
*/
/**
*******************************************************************************
* @file
*  ihevce_coarse_layer_sad_neon.c
*
* @brief
*  Contains NEON intrinsic definitions of functions for computing SAD
*
* @author
*  Ittiam
*
* @par List of Functions:
*  - hme_store_4x4_sads_high_speed_neon()
*  - hme_store_4x4_sads_high_quality_neon()
*  - hme_combine_4x4_sads_and_compute_cost_high_speed_neon()
*  - hme_combine_4x4_sads_and_compute_cost_high_quality_neon()
*
* @remarks
*  None
*
********************************************************************************
*/

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
/* System include files */
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <arm_neon.h>

/* User include files */
#include "ihevc_typedefs.h"
#include "itt_video_api.h"
#include "ihevc_cmn_utils_neon.h"
#include "ihevc_chroma_itrans_recon.h"
#include "ihevc_chroma_intra_pred.h"
#include "ihevc_debug.h"
#include "ihevc_deblk.h"
#include "ihevc_defs.h"
#include "ihevc_itrans_recon.h"
#include "ihevc_intra_pred.h"
#include "ihevc_inter_pred.h"
#include "ihevc_macros.h"
#include "ihevc_mem_fns.h"
#include "ihevc_padding.h"
#include "ihevc_quant_iquant_ssd.h"
#include "ihevc_resi_trans.h"
#include "ihevc_sao.h"
#include "ihevc_structs.h"
#include "ihevc_weighted_pred.h"

#include "rc_cntrl_param.h"
#include "rc_frame_info_collector.h"
#include "rc_look_ahead_params.h"

#include "ihevce_api.h"
#include "ihevce_defs.h"
#include "ihevce_lap_enc_structs.h"
#include "ihevce_multi_thrd_structs.h"
#include "ihevce_function_selector.h"
#include "ihevce_me_common_defs.h"
#include "ihevce_enc_structs.h"
#include "ihevce_had_satd.h"
#include "ihevce_ipe_instr_set_router.h"
#include "ihevce_global_tables.h"

#include "hme_datatype.h"
#include "hme_common_defs.h"
#include "hme_interface.h"
#include "hme_defs.h"
#include "hme_globals.h"

#include "ihevce_me_instr_set_router.h"

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

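/**
********************************************************************************
*  @fn     hme_store_4x4_sads_high_speed_neon()
*
*  @brief  Computes and stores the 4x4 SADs of the coarse-layer search for the
*          high-speed preset, where the coarse step size is
*          HME_COARSE_STEP_SIZE_HIGH_SPEED (4). The weighted 4x4 input block is
*          compared against every candidate position in the MV range; up to 4
*          candidate SADs are evaluated per iteration with NEON and written to
*          pi2_sads_4x4, indexed by (mv_x, mv_y) in units of the step size.
*
*  @return None
********************************************************************************
*/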
void hme_store_4x4_sads_high_speed_neon(
    hme_search_prms_t *ps_search_prms,
    layer_ctxt_t *ps_layer_ctxt,
    range_prms_t *ps_mv_limit,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    S16 *pi2_sads_4x4)
{
    uint8x8_t src2[4];
    uint8x16_t src;

    S32 i, j;

    /* Input and reference attributes */
    U08 *pu1_inp, *pu1_ref;
    S32 i4_inp_stride, i4_ref_stride, i4_ref_offset;

    /* The reference is actually an array of ptrs since there are several */
    /* reference ids. So an array gets passed from the calling function   */
    U08 **ppu1_ref, *pu1_ref_coloc;

    S32 stepy, stepx, step_shift_x, step_shift_y;
    S32 mvx, mvy, mv_x_offset, mv_y_offset, mv_x_range, mv_y_range;

    /* Points to the range limits for mv */
    range_prms_t *ps_range_prms = ps_search_prms->aps_mv_range[0];

    /* Reference index to be searched */
    S32 i4_search_idx = ps_search_prms->i1_ref_idx;

    pu1_inp = ps_wt_inp_prms->apu1_wt_inp[i4_search_idx];
    i4_inp_stride = ps_search_prms->i4_inp_stride;

    /* Move to the location of the search blk in inp buffer */
    pu1_inp += ps_search_prms->i4_cu_x_off;
    pu1_inp += ps_search_prms->i4_cu_y_off * i4_inp_stride;

    /*************************************************************************/
    /* we use the input of previously encoded pictures as reference          */
    /* in the coarse layer                                                   */
    /*************************************************************************/
    i4_ref_stride = ps_layer_ctxt->i4_inp_stride;
    ppu1_ref = ps_layer_ctxt->ppu1_list_inp;

    /* colocated position in reference picture */
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
    pu1_ref_coloc = ppu1_ref[i4_search_idx] + i4_ref_offset;

    stepx = stepy = HME_COARSE_STEP_SIZE_HIGH_SPEED;
    /*TODO: Calculate Step shift from the #define HME_COARSE_STEP_SIZE_HIGH_SPEED */
    step_shift_x = step_shift_y = 2;

    mv_x_offset = -(ps_mv_limit->i2_min_x >> step_shift_x);
    mv_y_offset = -(ps_mv_limit->i2_min_y >> step_shift_y);
    mv_x_range = (-ps_mv_limit->i2_min_x + ps_mv_limit->i2_max_x) >> step_shift_x;
    mv_y_range = (-ps_mv_limit->i2_min_y + ps_mv_limit->i2_max_y) >> step_shift_y;

    ASSERT(4 == stepx);

    /* load input: each 4-pixel input row is duplicated into both halves of a */
    /* D register so two candidate positions can be accumulated per vabal     */
    {
        S32 mv_x_sweep = ps_range_prms->i2_max_x - ps_range_prms->i2_min_x;
        uint32x2_t a[4];

        for(i = 0; i < 4; i++)
        {
            a[i] = vld1_dup_u32((uint32_t *)pu1_inp);
            pu1_inp += i4_inp_stride;
        }
        src2[0] = vreinterpret_u8_u32(a[0]);
        src2[1] = vreinterpret_u8_u32(a[1]);
        src2[2] = vreinterpret_u8_u32(a[2]);
        src2[3] = vreinterpret_u8_u32(a[3]);

        /* odd number of step positions: pack the whole 4x4 block into one Q */
        /* register for the single-SAD tail case                             */
        if((mv_x_sweep >> step_shift_x) & 1)
        {
            uint32x2x2_t l = vtrn_u32(a[0], a[1]);
            uint32x2x2_t m = vtrn_u32(a[2], a[3]);

            src = vcombine_u8(vreinterpret_u8_u32(l.val[0]), vreinterpret_u8_u32(m.val[0]));
        }
    }

    /* Run 2 loops to sweep over the reference area */
    for(mvy = ps_range_prms->i2_min_y; mvy < ps_range_prms->i2_max_y; mvy += stepy)
    {
        for(mvx = ps_range_prms->i2_min_x; mvx < ps_range_prms->i2_max_x;)
        {
            U16 *pu2_sad = (U16 *)&pi2_sads_4x4
                [((mvx >> step_shift_x) + mv_x_offset) +
                ((mvy >> step_shift_y) + mv_y_offset) * mv_x_range];

            pu1_ref = pu1_ref_coloc + mvx + (mvy * i4_ref_stride);
            /* process 4, 2 or 1 candidates per iteration, based on the remaining x range */
            if((mvx + (stepx * 4)) <= ps_range_prms->i2_max_x) // 16x4: 4 SADs
            {
                uint16x8_t abs_01 = vdupq_n_u16(0);
                uint16x8_t abs_23 = vdupq_n_u16(0);
                uint16x4_t tmp_a0, tmp_a1;

                for(j = 0; j < 4; j++)
                {
                    uint8x16_t ref = vld1q_u8(pu1_ref);

                    abs_01 = vabal_u8(abs_01, src2[j], vget_low_u8(ref));
                    abs_23 = vabal_u8(abs_23, src2[j], vget_high_u8(ref));
                    pu1_ref += i4_ref_stride;
                }
                tmp_a0 = vpadd_u16(vget_low_u16(abs_01), vget_high_u16(abs_01));
                tmp_a1 = vpadd_u16(vget_low_u16(abs_23), vget_high_u16(abs_23));
                abs_01 = vcombine_u16(tmp_a0, tmp_a1);
                tmp_a0 = vpadd_u16(vget_low_u16(abs_01), vget_high_u16(abs_01));
                vst1_u16(pu2_sad, tmp_a0);
                mvx += stepx * 4;
            }
            else if((mvx + (stepx * 2)) <= ps_range_prms->i2_max_x) // 8x4: 2 SADs
            {
                uint16x8_t abs_01 = vdupq_n_u16(0);
                uint16x4_t tmp_a;
                uint32x2_t tmp_b;

                for(j = 0; j < 4; j++)
                {
                    uint8x8_t ref = vld1_u8(pu1_ref);

                    abs_01 = vabal_u8(abs_01, src2[j], ref);
                    pu1_ref += i4_ref_stride;
                }
                tmp_a = vpadd_u16(vget_low_u16(abs_01), vget_high_u16(abs_01));
                tmp_b = vpaddl_u16(tmp_a);
                pu2_sad[0] = vget_lane_u32(tmp_b, 0);
                pu2_sad[1] = vget_lane_u32(tmp_b, 1);
                mvx += stepx * 2;
            }
            else if((mvx + stepx) <= ps_range_prms->i2_max_x) // 4x4: 1 SAD
            {
                const uint8x16_t ref = load_unaligned_u8q(pu1_ref, i4_ref_stride);
                uint16x8_t abs = vabdl_u8(vget_low_u8(src), vget_low_u8(ref));
                uint32x4_t b;
                uint64x2_t c;

                abs = vabal_u8(abs, vget_high_u8(src), vget_high_u8(ref));
                b = vpaddlq_u16(abs);
                c = vpaddlq_u32(b);
                *pu2_sad = vget_lane_u32(
                    vadd_u32(
                        vreinterpret_u32_u64(vget_low_u64(c)),
                        vreinterpret_u32_u64(vget_high_u64(c))),
                    0);
                mvx += stepx;
            }
        }
    }
}

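/**
********************************************************************************
*  @fn     hme_store_4x4_sads_high_quality_neon()
*
*  @brief  Same as the high-speed variant, but for the high-quality preset
*          where the coarse step size is HME_COARSE_STEP_SIZE_HIGH_QUALITY (2);
*          up to 8 candidate SADs are evaluated per iteration.
*
*  @return None
********************************************************************************
*/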
void hme_store_4x4_sads_high_quality_neon(
    hme_search_prms_t *ps_search_prms,
    layer_ctxt_t *ps_layer_ctxt,
    range_prms_t *ps_mv_limit,
    wgt_pred_ctxt_t *ps_wt_inp_prms,
    S16 *pi2_sads_4x4)
{
    uint8x8_t src2[4];
    uint8x16_t src;

    S32 i, j;

    /* Input and reference attributes */
    U08 *pu1_inp, *pu1_ref;
    S32 i4_inp_stride, i4_ref_stride, i4_ref_offset;

    /* The reference is actually an array of ptrs since there are several */
    /* reference ids. So an array gets passed from the calling function   */
    U08 **ppu1_ref, *pu1_ref_coloc;

    S32 stepy, stepx, step_shift_x, step_shift_y;
    S32 mvx, mvy, mv_x_offset, mv_y_offset, mv_x_range, mv_y_range;

    /* Points to the range limits for mv */
    range_prms_t *ps_range_prms = ps_search_prms->aps_mv_range[0];

    /* Reference index to be searched */
    S32 i4_search_idx = ps_search_prms->i1_ref_idx;

    pu1_inp = ps_wt_inp_prms->apu1_wt_inp[i4_search_idx];
    i4_inp_stride = ps_search_prms->i4_inp_stride;

    /* Move to the location of the search blk in inp buffer */
    pu1_inp += ps_search_prms->i4_cu_x_off;
    pu1_inp += ps_search_prms->i4_cu_y_off * i4_inp_stride;

    /*************************************************************************/
    /* we use the input of previously encoded pictures as reference          */
    /* in the coarse layer                                                   */
    /*************************************************************************/
    i4_ref_stride = ps_layer_ctxt->i4_inp_stride;
    ppu1_ref = ps_layer_ctxt->ppu1_list_inp;

    /* colocated position in reference picture */
    i4_ref_offset = (i4_ref_stride * ps_search_prms->i4_y_off) + ps_search_prms->i4_x_off;
    pu1_ref_coloc = ppu1_ref[i4_search_idx] + i4_ref_offset;

    stepx = stepy = HME_COARSE_STEP_SIZE_HIGH_QUALITY;
    /*TODO: Calculate Step shift from the #define HME_COARSE_STEP_SIZE_HIGH_QUALITY */
    step_shift_x = step_shift_y = 1;

    mv_x_offset = -(ps_mv_limit->i2_min_x >> step_shift_x);
    mv_y_offset = -(ps_mv_limit->i2_min_y >> step_shift_y);
    mv_x_range = (-ps_mv_limit->i2_min_x + ps_mv_limit->i2_max_x) >> step_shift_x;
    mv_y_range = (-ps_mv_limit->i2_min_y + ps_mv_limit->i2_max_y) >> step_shift_y;

    /* load input */
    {
        S32 mv_x_sweep = ps_range_prms->i2_max_x - ps_range_prms->i2_min_x;
        uint32x2_t a[4];

        for(i = 0; i < 4; i++)
        {
            a[i] = vld1_dup_u32((uint32_t *)pu1_inp);
            pu1_inp += i4_inp_stride;
        }
        src2[0] = vreinterpret_u8_u32(a[0]);
        src2[1] = vreinterpret_u8_u32(a[1]);
        src2[2] = vreinterpret_u8_u32(a[2]);
        src2[3] = vreinterpret_u8_u32(a[3]);

        if((mv_x_sweep >> 2) & 1)
        {
            uint32x2x2_t l = vtrn_u32(a[0], a[1]);
            uint32x2x2_t m = vtrn_u32(a[2], a[3]);

            src = vcombine_u8(vreinterpret_u8_u32(l.val[0]), vreinterpret_u8_u32(m.val[0]));
        }
    }

    /* Run 2 loops to sweep over the reference area */
    for(mvy = ps_range_prms->i2_min_y; mvy < ps_range_prms->i2_max_y; mvy += stepy)
    {
        for(mvx = ps_range_prms->i2_min_x; mvx < ps_range_prms->i2_max_x;)
        {
            U16 *pu2_sad = (U16 *)&pi2_sads_4x4
                [((mvx >> step_shift_x) + mv_x_offset) +
                ((mvy >> step_shift_y) + mv_y_offset) * mv_x_range];

            pu1_ref = pu1_ref_coloc + mvx + (mvy * i4_ref_stride);
            if((mvx + (stepx * 8)) <= ps_range_prms->i2_max_x) // 16x4: 8 SADs
            {
                uint16x8_t abs_a_01 = vdupq_n_u16(0);
                uint16x8_t abs_a_23 = vdupq_n_u16(0);
                uint16x8_t abs_b_01 = vdupq_n_u16(0);
                uint16x8_t abs_b_23 = vdupq_n_u16(0);
                uint16x4_t tmp_b0, tmp_b1;
                uint16x4x2_t tmp_a;

                for(j = 0; j < 4; j++)
                {
                    uint8x16_t ref_a = vld1q_u8(pu1_ref);
                    uint8x16_t ref_b = vld1q_u8(pu1_ref + 2);

                    abs_a_01 = vabal_u8(abs_a_01, src2[j], vget_low_u8(ref_a));
                    abs_a_23 = vabal_u8(abs_a_23, src2[j], vget_high_u8(ref_a));
                    abs_b_01 = vabal_u8(abs_b_01, src2[j], vget_low_u8(ref_b));
                    abs_b_23 = vabal_u8(abs_b_23, src2[j], vget_high_u8(ref_b));
                    pu1_ref += i4_ref_stride;
                }
                tmp_a.val[0] = vpadd_u16(vget_low_u16(abs_a_01), vget_high_u16(abs_a_01));
                tmp_a.val[1] = vpadd_u16(vget_low_u16(abs_a_23), vget_high_u16(abs_a_23));
                abs_a_01 = vcombine_u16(tmp_a.val[0], tmp_a.val[1]);
                tmp_a.val[0] = vpadd_u16(vget_low_u16(abs_a_01), vget_high_u16(abs_a_01));
                tmp_b0 = vpadd_u16(vget_low_u16(abs_b_01), vget_high_u16(abs_b_01));
                tmp_b1 = vpadd_u16(vget_low_u16(abs_b_23), vget_high_u16(abs_b_23));
                abs_b_01 = vcombine_u16(tmp_b0, tmp_b1);
                tmp_a.val[1] = vpadd_u16(vget_low_u16(abs_b_01), vget_high_u16(abs_b_01));
                /* even candidates come from ref_a, odd candidates from ref_b; */
                /* vst2 interleaves them back into mv order                    */
                vst2_u16(pu2_sad, tmp_a);
                mvx += stepx * 8;
            }
            else if((mvx + (stepx * 4)) <= ps_range_prms->i2_max_x) // 8x4: 4 SADs
            {
                uint16x8_t abs_a_01 = vdupq_n_u16(0);
                uint16x8_t abs_b_01 = vdupq_n_u16(0);
                uint16x4_t tmp_a, tmp_b;

                for(j = 0; j < 4; j++)
                {
                    uint8x8_t ref_a = vld1_u8(pu1_ref);
                    uint8x8_t ref_b = vld1_u8(pu1_ref + 2);

                    abs_a_01 = vabal_u8(abs_a_01, src2[j], ref_a);
                    abs_b_01 = vabal_u8(abs_b_01, src2[j], ref_b);
                    pu1_ref += i4_ref_stride;
                }
                tmp_a = vpadd_u16(vget_low_u16(abs_a_01), vget_high_u16(abs_a_01));
                tmp_b = vpadd_u16(vget_low_u16(abs_b_01), vget_high_u16(abs_b_01));
                tmp_a = vpadd_u16(tmp_a, tmp_b);

                /* lanes hold SADs at (mvx, mvx + 4, mvx + 2, mvx + 6); store in mv order */
                pu2_sad[0] = vget_lane_u16(tmp_a, 0);
                pu2_sad[1] = vget_lane_u16(tmp_a, 2);
                pu2_sad[2] = vget_lane_u16(tmp_a, 1);
                pu2_sad[3] = vget_lane_u16(tmp_a, 3);
                mvx += stepx * 4;
            }
            else if((mvx + stepx * 2) <= ps_range_prms->i2_max_x) // 4x4: 2 SADs
            {
                uint8x16_t ref = load_unaligned_u8q(pu1_ref, i4_ref_stride);
                uint16x8_t abs = vabdl_u8(vget_low_u8(src), vget_low_u8(ref));
                uint32x4_t b;
                uint64x2_t c;

                abs = vabal_u8(abs, vget_high_u8(src), vget_high_u8(ref));
                b = vpaddlq_u16(abs);
                c = vpaddlq_u32(b);
                *pu2_sad = vget_lane_u32(
                    vadd_u32(
                        vreinterpret_u32_u64(vget_low_u64(c)),
                        vreinterpret_u32_u64(vget_high_u64(c))),
                    0);

                ref = load_unaligned_u8q(pu1_ref + 2, i4_ref_stride);
                abs = vabdl_u8(vget_low_u8(src), vget_low_u8(ref));
                abs = vabal_u8(abs, vget_high_u8(src), vget_high_u8(ref));
                b = vpaddlq_u16(abs);
                c = vpaddlq_u32(b);
                pu2_sad[1] = vget_lane_u32(
                    vadd_u32(
                        vreinterpret_u32_u64(vget_low_u64(c)),
                        vreinterpret_u32_u64(vget_high_u64(c))),
                    0);
                mvx += stepx * 2;
            }
            else
            {
                assert(0);
            }
        }
    }
}

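/* Updates the running minimum cost and best MV for the 4x8 (current + south) */
/* and 8x4 (current + east) partitions with candidate i of sad_array          */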
#define BEST_COST(i)                                                                               \
    if(sad_array[0][i] < min_cost_4x8)                                                             \
    {                                                                                              \
        best_mv_x_4x8 = mvx + i * stepx;                                                           \
        best_mv_y_4x8 = mvy;                                                                       \
        min_cost_4x8 = sad_array[0][i];                                                            \
    }                                                                                              \
    if(sad_array[1][i] < min_cost_8x4)                                                             \
    {                                                                                              \
        best_mv_x_8x4 = mvx + i * stepx;                                                           \
        best_mv_y_8x4 = mvy;                                                                       \
        min_cost_8x4 = sad_array[1][i];                                                            \
    }

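/**
********************************************************************************
*  @fn     hme_combine_4x4_sads_and_compute_cost_high_speed_neon()
*
*  @brief  Combines the stored 4x4 SADs of the current block with those of its
*          south and east neighbours to obtain 4x8 and 8x4 SADs, adds the
*          lambda-weighted MV cost (from the gi2_mvy_range / gi2_mvx_range LUTs
*          plus the reference index), and returns the least-cost MV for the
*          4x8 and 8x4 partitions. The coarse step size is
*          HME_COARSE_STEP_SIZE_HIGH_SPEED (4).
*
*  @return None
********************************************************************************
*/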
void hme_combine_4x4_sads_and_compute_cost_high_speed_neon(
    S08 i1_ref_idx,
    range_prms_t *ps_mv_range,
    range_prms_t *ps_mv_limit,
    hme_mv_t *ps_best_mv_4x8,
    hme_mv_t *ps_best_mv_8x4,
    pred_ctxt_t *ps_pred_ctxt,
    PF_MV_COST_FXN pf_mv_cost_compute,
    S16 *pi2_sads_4x4_current,
    S16 *pi2_sads_4x4_east,
    S16 *pi2_sads_4x4_south)
{
    S32 best_mv_y_4x8, best_mv_x_4x8, best_mv_y_8x4, best_mv_x_8x4;

    S32 stepy = HME_COARSE_STEP_SIZE_HIGH_SPEED;
    S32 stepx = HME_COARSE_STEP_SIZE_HIGH_SPEED;
    /*TODO: Calculate Step shift from the #define HME_COARSE_STEP_SIZE_HIGH_SPEED */
    S32 step_shift_x = 2;
    S32 step_shift_y = 2;

    S32 mvx, mvy, mv_x_offset, mv_y_offset, mv_x_range, mv_y_range;

    S32 lambda = ps_pred_ctxt->lambda;
    S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
    S32 rnd = 1 << (lambda_q_shift - 1);

    S32 min_cost_4x8 = MAX_32BIT_VAL;
    S32 min_cost_8x4 = MAX_32BIT_VAL;

    S32 i;

    const uint16x8_t v_ref_idx = vdupq_n_u16(i1_ref_idx);
    const uint32x4_t v_lambda = vdupq_n_u32(lambda);
    const uint32x4_t v_rnd_factor = vdupq_n_u32(rnd);
    const int32x4_t v_lambda_q_shift = vdupq_n_s32(-lambda_q_shift);

    mv_x_offset = (-ps_mv_limit->i2_min_x >> step_shift_x);
    mv_y_offset = (-ps_mv_limit->i2_min_y >> step_shift_y);
    mv_x_range = (-ps_mv_limit->i2_min_x + ps_mv_limit->i2_max_x) >> step_shift_x;
    mv_y_range = (-ps_mv_limit->i2_min_y + ps_mv_limit->i2_max_y) >> step_shift_y;

    ASSERT(MAX_MVX_SUPPORTED_IN_COARSE_LAYER >= ABS(ps_mv_range->i2_max_x));
    ASSERT(MAX_MVY_SUPPORTED_IN_COARSE_LAYER >= ABS(ps_mv_range->i2_max_y));

    /* Run 2 loops to sweep over the reference area */
    for(mvy = ps_mv_range->i2_min_y; mvy < ps_mv_range->i2_max_y; mvy += stepy)
    {
        /* LUT: (2 * hme_get_range(mv_y) - 1) + ((!mv_y) ? 0 : 1) */
        uint16x8_t mvy_wt = vld1q_u16((U16 *)&gi2_mvy_range[ABS(mvy)][0]);

        /* mvy wt + ref_idx */
        mvy_wt = vaddq_u16(mvy_wt, v_ref_idx);

        for(mvx = ps_mv_range->i2_min_x; mvx < ps_mv_range->i2_max_x;)
        {
            S32 sad_pos = ((mvx >> step_shift_x) + mv_x_offset) +
                          ((mvy >> step_shift_y) + mv_y_offset) * mv_x_range;

            if(mvx + (8 * stepx) <= ps_mv_range->i2_max_x) // 8 candidates at a time
            {
                uint16x8_t curr = vld1q_u16((U16 *)pi2_sads_4x4_current + sad_pos);
                uint16x8_t south = vld1q_u16((U16 *)pi2_sads_4x4_south + sad_pos);
                uint16x8_t east = vld1q_u16((U16 *)pi2_sads_4x4_east + sad_pos);
                uint16x8_t sad_4x8 = vaddq_u16(curr, south);
                uint16x8_t sad_8x4 = vaddq_u16(curr, east);
                /* LUT: (2 * hme_get_range(mv_x) - 1) + ((!mv_x) ? 0 : 1) */
                uint16x8_t mv_wt =
                    vld1q_u16((U16 *)&gi2_mvx_range[mvx + MAX_MVX_SUPPORTED_IN_COARSE_LAYER][0]);
                uint32x4_t total_cost_0, total_cost_1;
                uint16x8_t total_cost;
                U16 sad_array[2][8];

                /* mv weight + ref_idx */
                mv_wt = vaddq_u16(mv_wt, mvy_wt);

                total_cost_0 = vmulq_u32(v_lambda, vmovl_u16(vget_low_u16(mv_wt)));
                total_cost_1 = vmulq_u32(v_lambda, vmovl_u16(vget_high_u16(mv_wt)));

                total_cost_0 = vaddq_u32(total_cost_0, v_rnd_factor);
                total_cost_1 = vaddq_u32(total_cost_1, v_rnd_factor);

                total_cost_0 = vshlq_u32(total_cost_0, v_lambda_q_shift);
                total_cost_1 = vshlq_u32(total_cost_1, v_lambda_q_shift);

                total_cost = vcombine_u16(vmovn_u32(total_cost_0), vmovn_u32(total_cost_1));

                sad_4x8 = vaddq_u16(total_cost, sad_4x8);
                sad_8x4 = vaddq_u16(total_cost, sad_8x4);

                vst1q_u16(sad_array[0], sad_4x8);
                vst1q_u16(sad_array[1], sad_8x4);

                for(i = 0; i < 8; i++)
                {
                    BEST_COST(i);
                }
                mvx += stepx * 8;
            }
            else if(mvx + (4 * stepx) <= ps_mv_range->i2_max_x) // 4 candidates at a time
            {
                uint16x4_t curr = vld1_u16((U16 *)pi2_sads_4x4_current + sad_pos);
                uint16x4_t south = vld1_u16((U16 *)pi2_sads_4x4_south + sad_pos);
                uint16x4_t east = vld1_u16((U16 *)pi2_sads_4x4_east + sad_pos);
                uint16x4_t sad_4x8 = vadd_u16(curr, south);
                uint16x4_t sad_8x4 = vadd_u16(curr, east);
                /* LUT: (2 * hme_get_range(mv_x) - 1) + ((!mv_x) ? 0 : 1) */
                uint16x4_t mv_wt =
                    vld1_u16((U16 *)&gi2_mvx_range[mvx + MAX_MVX_SUPPORTED_IN_COARSE_LAYER][0]);
                uint32x4_t total_cost;
                U16 sad_array[2][4];

                /* mv weight + ref_idx */
                mv_wt = vadd_u16(mv_wt, vget_low_u16(mvy_wt));

                total_cost = vmulq_u32(v_lambda, vmovl_u16(mv_wt));
                total_cost = vaddq_u32(total_cost, v_rnd_factor);
                total_cost = vshlq_u32(total_cost, v_lambda_q_shift);

                sad_4x8 = vadd_u16(vmovn_u32(total_cost), sad_4x8);
                sad_8x4 = vadd_u16(vmovn_u32(total_cost), sad_8x4);

                vst1_u16(sad_array[0], sad_4x8);
                vst1_u16(sad_array[1], sad_8x4);

                for(i = 0; i < 4; i++)
                {
                    BEST_COST(i);
                }

                mvx += stepx * 4;
            }
            else
            {
                S16 sad_array[2][1];
                S32 mv_cost;

                /* Get 4x8 / 8x4 SADs by adding the current SAD to the south / east SADs */
                sad_array[0][0] = pi2_sads_4x4_current[sad_pos] + pi2_sads_4x4_south[sad_pos];
                sad_array[1][0] = pi2_sads_4x4_current[sad_pos] + pi2_sads_4x4_east[sad_pos];

                mv_cost = gi2_mvy_range[ABS(mvy)][0] +
                          gi2_mvx_range[mvx + MAX_MVX_SUPPORTED_IN_COARSE_LAYER][0] + i1_ref_idx;

                mv_cost = (mv_cost * lambda + rnd) >> lambda_q_shift;

                sad_array[0][0] += mv_cost;
                sad_array[1][0] += mv_cost;

                BEST_COST(0);
                mvx += stepx;
            }
        }
    }

    ps_best_mv_4x8->i2_mv_x = best_mv_x_4x8;
    ps_best_mv_4x8->i2_mv_y = best_mv_y_4x8;

    ps_best_mv_8x4->i2_mv_x = best_mv_x_8x4;
    ps_best_mv_8x4->i2_mv_y = best_mv_y_8x4;
}

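/**
********************************************************************************
*  @fn     hme_combine_4x4_sads_and_compute_cost_high_quality_neon()
*
*  @brief  Same as the high-speed variant, but for the high-quality preset:
*          the coarse step size is HME_COARSE_STEP_SIZE_HIGH_QUALITY (2) and
*          the horizontal MV weight is taken from gi2_mvx_range_high_quality.
*
*  @return None
********************************************************************************
*/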
void hme_combine_4x4_sads_and_compute_cost_high_quality_neon(
    S08 i1_ref_idx,
    range_prms_t *ps_mv_range,
    range_prms_t *ps_mv_limit,
    hme_mv_t *ps_best_mv_4x8,
    hme_mv_t *ps_best_mv_8x4,
    pred_ctxt_t *ps_pred_ctxt,
    PF_MV_COST_FXN pf_mv_cost_compute,
    S16 *pi2_sads_4x4_current,
    S16 *pi2_sads_4x4_east,
    S16 *pi2_sads_4x4_south)
{
    S32 best_mv_y_4x8, best_mv_x_4x8, best_mv_y_8x4, best_mv_x_8x4;

    S32 stepy = HME_COARSE_STEP_SIZE_HIGH_QUALITY;
    S32 stepx = HME_COARSE_STEP_SIZE_HIGH_QUALITY;
    /*TODO: Calculate Step shift from the #define HME_COARSE_STEP_SIZE_HIGH_QUALITY */
    S32 step_shift_x = 1;
    S32 step_shift_y = 1;

    S32 mvx, mvy, mv_x_offset, mv_y_offset, mv_x_range, mv_y_range;

    S32 lambda = ps_pred_ctxt->lambda;
    S32 lambda_q_shift = ps_pred_ctxt->lambda_q_shift;
    S32 rnd = 1 << (lambda_q_shift - 1);

    S32 min_cost_4x8 = MAX_32BIT_VAL;
    S32 min_cost_8x4 = MAX_32BIT_VAL;

    S32 i;

    const uint16x8_t v_ref_idx = vdupq_n_u16(i1_ref_idx);
    const uint32x4_t v_lambda = vdupq_n_u32(lambda);
    const uint32x4_t v_rnd_factor = vdupq_n_u32(rnd);
    const int32x4_t v_lambda_q_shift = vdupq_n_s32(-lambda_q_shift);

    mv_x_offset = (-ps_mv_limit->i2_min_x >> step_shift_x);
    mv_y_offset = (-ps_mv_limit->i2_min_y >> step_shift_y);
    mv_x_range = (-ps_mv_limit->i2_min_x + ps_mv_limit->i2_max_x) >> step_shift_x;
    mv_y_range = (-ps_mv_limit->i2_min_y + ps_mv_limit->i2_max_y) >> step_shift_y;

    ASSERT(MAX_MVX_SUPPORTED_IN_COARSE_LAYER >= ABS(ps_mv_range->i2_max_x));
    ASSERT(MAX_MVY_SUPPORTED_IN_COARSE_LAYER >= ABS(ps_mv_range->i2_max_y));

    /* Run 2 loops to sweep over the reference area */
    for(mvy = ps_mv_range->i2_min_y; mvy < ps_mv_range->i2_max_y; mvy += stepy)
    {
        /* LUT: (2 * hme_get_range(mv_y) - 1) + ((!mv_y) ? 0 : 1) */
        uint16x8_t mvy_wt = vld1q_u16((U16 *)&gi2_mvy_range[ABS(mvy)][0]);

        /* mvy wt + ref_idx */
        mvy_wt = vaddq_u16(mvy_wt, v_ref_idx);

        for(mvx = ps_mv_range->i2_min_x; mvx < ps_mv_range->i2_max_x;)
        {
            S32 sad_pos = ((mvx >> step_shift_x) + mv_x_offset) +
                          ((mvy >> step_shift_y) + mv_y_offset) * mv_x_range;

            if(mvx + (8 * stepx) <= ps_mv_range->i2_max_x) // 8 candidates at a time
            {
                uint16x8_t curr = vld1q_u16((U16 *)pi2_sads_4x4_current + sad_pos);
                uint16x8_t south = vld1q_u16((U16 *)pi2_sads_4x4_south + sad_pos);
                uint16x8_t east = vld1q_u16((U16 *)pi2_sads_4x4_east + sad_pos);
                uint16x8_t sad_4x8 = vaddq_u16(curr, south);
                uint16x8_t sad_8x4 = vaddq_u16(curr, east);
                /* LUT: (2 * hme_get_range(mv_x) - 1) + ((!mv_x) ? 0 : 1) */
                uint16x8_t mv_wt = vld1q_u16(
                    (U16 *)&gi2_mvx_range_high_quality[mvx + MAX_MVX_SUPPORTED_IN_COARSE_LAYER][0]);
                uint32x4_t total_cost_0, total_cost_1;
                uint16x8_t total_cost;
                U16 sad_array[2][8];

                /* mv weight + ref_idx */
                mv_wt = vaddq_u16(mv_wt, mvy_wt);

                total_cost_0 = vmulq_u32(v_lambda, vmovl_u16(vget_low_u16(mv_wt)));
                total_cost_1 = vmulq_u32(v_lambda, vmovl_u16(vget_high_u16(mv_wt)));

                total_cost_0 = vaddq_u32(total_cost_0, v_rnd_factor);
                total_cost_1 = vaddq_u32(total_cost_1, v_rnd_factor);

                total_cost_0 = vshlq_u32(total_cost_0, v_lambda_q_shift);
                total_cost_1 = vshlq_u32(total_cost_1, v_lambda_q_shift);

                total_cost = vcombine_u16(vmovn_u32(total_cost_0), vmovn_u32(total_cost_1));

                sad_4x8 = vaddq_u16(total_cost, sad_4x8);
                sad_8x4 = vaddq_u16(total_cost, sad_8x4);

                vst1q_u16(sad_array[0], sad_4x8);
                vst1q_u16(sad_array[1], sad_8x4);

                for(i = 0; i < 8; i++)
                {
                    BEST_COST(i);
                }
                mvx += stepx * 8;
            }
            else if(mvx + (4 * stepx) <= ps_mv_range->i2_max_x) // 4 candidates at a time
            {
                uint16x4_t curr = vld1_u16((U16 *)pi2_sads_4x4_current + sad_pos);
                uint16x4_t south = vld1_u16((U16 *)pi2_sads_4x4_south + sad_pos);
                uint16x4_t east = vld1_u16((U16 *)pi2_sads_4x4_east + sad_pos);
                uint16x4_t sad_4x8 = vadd_u16(curr, south);
                uint16x4_t sad_8x4 = vadd_u16(curr, east);
                /* LUT: (2 * hme_get_range(mv_x) - 1) + ((!mv_x) ? 0 : 1) */
                uint16x4_t mv_wt = vld1_u16(
                    (U16 *)&gi2_mvx_range_high_quality[mvx + MAX_MVX_SUPPORTED_IN_COARSE_LAYER][0]);
                uint32x4_t total_cost;
                U16 sad_array[2][4];

                /* mv weight + ref_idx */
                mv_wt = vadd_u16(mv_wt, vget_low_u16(mvy_wt));

                total_cost = vmulq_u32(v_lambda, vmovl_u16(mv_wt));
                total_cost = vaddq_u32(total_cost, v_rnd_factor);
                total_cost = vshlq_u32(total_cost, v_lambda_q_shift);

                sad_4x8 = vadd_u16(vmovn_u32(total_cost), sad_4x8);
                sad_8x4 = vadd_u16(vmovn_u32(total_cost), sad_8x4);

                vst1_u16(sad_array[0], sad_4x8);
                vst1_u16(sad_array[1], sad_8x4);

                for(i = 0; i < 4; i++)
                {
                    BEST_COST(i);
                }

                mvx += stepx * 4;
            }
            else
            {
                S16 sad_array[2][1];
                S32 mv_cost;

                /* Get 4x8 / 8x4 SADs by adding the current SAD to the south / east SADs */
                sad_array[0][0] = pi2_sads_4x4_current[sad_pos] + pi2_sads_4x4_south[sad_pos];
                sad_array[1][0] = pi2_sads_4x4_current[sad_pos] + pi2_sads_4x4_east[sad_pos];

                mv_cost = gi2_mvy_range[ABS(mvy)][0] +
                          gi2_mvx_range_high_quality[mvx + MAX_MVX_SUPPORTED_IN_COARSE_LAYER][0] +
                          i1_ref_idx;

                mv_cost = (mv_cost * lambda + rnd) >> lambda_q_shift;

                sad_array[0][0] += mv_cost;
                sad_array[1][0] += mv_cost;

                BEST_COST(0);
                mvx += stepx;
            }
        }
    }

    ps_best_mv_4x8->i2_mv_x = best_mv_x_4x8;
    ps_best_mv_4x8->i2_mv_y = best_mv_y_4x8;

    ps_best_mv_8x4->i2_mv_x = best_mv_x_8x4;
    ps_best_mv_8x4->i2_mv_y = best_mv_y_8x4;
}