1 /******************************************************************************
2 *
3 * Copyright (C) 2018 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * ihevce_scan_coeffs_neon.c
24 *
25 * @brief
26 * Contains definitions for scanning quantized tu
27 *
28 * @author
29 * Ittiam
30 *
31 * @par List of Functions:
32 *
33 * @remarks
34 * None
35 *
36 ********************************************************************************
37 */
38 /*****************************************************************************/
39 /* File Includes */
40 /*****************************************************************************/
41 /* System include files */
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <assert.h>
45 #include <string.h>
46 #include <arm_neon.h>
47
48 /* User include files */
49 #include "ihevc_typedefs.h"
50 #include "itt_video_api.h"
51 #include "ihevc_defs.h"
52 #include "ihevc_debug.h"
53 #include "ihevce_api.h"
54 #include "ihevce_defs.h"
55 #include "rc_cntrl_param.h"
56 #include "rc_frame_info_collector.h"
57 #include "rc_look_ahead_params.h"
58 #include "ihevce_lap_enc_structs.h"
59 #include "ihevc_platform_macros.h"
60 #include "ihevc_structs.h"
61 #include "ihevce_multi_thrd_structs.h"
62
63 #include "ihevc_deblk.h"
64 #include "ihevc_itrans_recon.h"
65 #include "ihevc_chroma_itrans_recon.h"
66 #include "ihevc_chroma_intra_pred.h"
67 #include "ihevc_intra_pred.h"
68 #include "ihevc_inter_pred.h"
69 #include "ihevc_mem_fns.h"
70 #include "ihevc_padding.h"
71 #include "ihevc_weighted_pred.h"
72 #include "ihevc_sao.h"
73 #include "ihevc_resi_trans.h"
74 #include "ihevc_quant_iquant_ssd.h"
75 #include "ihevce_function_selector.h"
76 #include "ihevce_me_common_defs.h"
77 #include "ihevce_enc_structs.h"
78 #include "ihevce_global_tables.h"
79 #include "ihevce_ipe_instr_set_router.h"
80 #include "ihevce_common_utils.h"
81
82 /*****************************************************************************/
83 /* Function Declarations */
84 /*****************************************************************************/
85 FT_SCAN_COEFFS ihevce_scan_coeffs_neon;
86
87 /*****************************************************************************/
88 /* Function Definitions */
89 /*****************************************************************************/
/**
*******************************************************************************
*
* @brief
*  Packs the most significant bit of each of the 16 byte lanes of a vector
*  into a 16 bit mask
*
* @par Description:
*  Bit i (0..15) of the returned value equals bit 7 of byte lane i of input.
*  The comparison intrinsics used by the caller produce 0xFF / 0x00 per lane,
*  so the result holds one flag bit per lane.
*
* @param[in] input
*  Vector of 16 bytes; only bit 7 of every lane is examined
*
* @returns
*  Mask in bits 0..15, all higher bits are zero
*
*******************************************************************************
*/
static WORD32 movemask_neon(uint8x16_t input)
{
    /* vshl with a negative count shifts right: bit 7 of lane i lands on bit i */
    static const int8_t __attribute__((aligned(16))) lane_shift[8] = { -7, -6, -5, -4,
                                                                       -3, -2, -1, 0 };
    const uint8x8_t msb_mask = vdup_n_u8(0x80);
    const int8x8_t shift = vld1_s8(lane_shift);
    uint8x8_t lo, hi, sum;

    /* Isolate the MSB of every lane and move it to its own bit position */
    lo = vshl_u8(vand_u8(vget_low_u8(input), msb_mask), shift);
    hi = vshl_u8(vand_u8(vget_high_u8(input), msb_mask), shift);

    /* Every lane now holds a distinct single bit, so adding lanes is the    */
    /* same as OR-ing them. Pairing lo with hi reduces both halves at once:  */
    /* after three pairwise adds lane 0 = all of lo, lane 1 = all of hi      */
    sum = vpadd_u8(lo, hi);
    sum = vpadd_u8(sum, sum);
    sum = vpadd_u8(sum, sum);

    /* vget_lane_u8 is the portable way to read a lane (no vector subscript) */
    return (((WORD32)vget_lane_u8(sum, 1) << 8) | (WORD32)vget_lane_u8(sum, 0));
}
115
/**
*******************************************************************************
*
* @brief
*  Forces the gt1 flag for all significant coefficients starting from the
*  (MAX_GT_ONE + 1)-th one, counted from the MSB of the gt0 mask
*
* @par Description:
*  When more than MAX_GT_ONE coefficients are significant, the position of the
*  (MAX_GT_ONE + 1)-th set bit of the gt0 mask (searching from bit 15 down) is
*  located and every gt0 bit at or below that position is OR-ed into the gt1
*  mask. The search is done in the low byte only.
*  NOTE(review): this relies on MAX_GT_ONE being 8 so that the searched bit
*  always lies in the low byte - confirm against the definition of MAX_GT_ONE.
*
* @param[in] sig_coeff_abs_gt0_flags
*  16 bit mask of non-zero coefficients (bit i = scan position i)
*
* @param[in] sig_coeff_abs_gt1_flags
*  16 bit mask of coefficients with abs value other than 0 and 1
*
* @returns
*  Updated gt1 mask
*
*******************************************************************************
*/
static WORD32 ihevce_scan_limit_gt1_flags_neon(
    WORD32 sig_coeff_abs_gt0_flags, WORD32 sig_coeff_abs_gt1_flags)
{
    if(ihevce_num_ones_popcnt(sig_coeff_abs_gt0_flags) > MAX_GT_ONE)
    {
        WORD32 gt0_first_byte = sig_coeff_abs_gt0_flags & 0xFF;
        WORD32 num_gt0_second_byte = ihevce_num_ones_popcnt(sig_coeff_abs_gt0_flags & 0xFF00);
        /* number of ones still to be skipped in the low byte to reach the 9th one */
        WORD32 num_gt0_first_byte_to_nine = (MAX_GT_ONE + 1) - num_gt0_second_byte;
        WORD32 pos_nineth_one = 0; /* pos. of 9th one from MSB of sig_coeff_abs_gt0_flags */
        WORD32 gt0_after_nineth_one;

        /* Strip ones from the top of the low byte; the last one stripped is the 9th */
        while(num_gt0_first_byte_to_nine)
        {
            GET_POS_MSB_32(pos_nineth_one, gt0_first_byte);
            gt0_first_byte = CLEAR_BIT(gt0_first_byte, pos_nineth_one);
            num_gt0_first_byte_to_nine--;
        }

        /* Put the 9th one back: mask now covers the 9th one and everything below it */
        gt0_after_nineth_one = SET_BIT(gt0_first_byte, pos_nineth_one);
        sig_coeff_abs_gt1_flags = sig_coeff_abs_gt1_flags | gt0_after_nineth_one;
    }

    return sig_coeff_abs_gt1_flags;
}

/**
*******************************************************************************
*
* @brief
*  Derives the sign, gt0 and gt1 bit masks of one 4x4 coded sub-block
*
* @par Description:
*  Loads four rows of four 16 bit coefficients, narrows them to 8 bit with
*  saturation (sufficient to classify a value as negative, zero or +/-1),
*  reorders them into scan order with a table lookup and converts the lane-wise
*  comparisons into bit masks. Bit i of every mask refers to scan position i.
*
* @param[in] pi2_csb_coeffs
*  Pointer to the top-left coefficient of the sub-block
*
* @param[in] trans_size
*  Transform size, used as the row stride of the coefficient buffer
*
* @param[in] shuffle
*  16 scan table entries (raster index within the 4x4 block per scan position)
*
* @param[out] pi4_sign_flags
*  Mask of negative coefficients
*
* @param[out] pi4_gt0_flags
*  Mask of non-zero coefficients
*
* @param[out] pi4_gt1_flags
*  Mask of coefficients for which an abs_coeff_remaining entry is stored
*
* @returns
*  None
*
*******************************************************************************
*/
static void ihevce_scan_csb_flags_neon(
    WORD16 *pi2_csb_coeffs,
    WORD32 trans_size,
    int8x16_t shuffle,
    WORD32 *pi4_sign_flags,
    WORD32 *pi4_gt0_flags,
    WORD32 *pi4_gt1_flags)
{
    const int8x16_t zero = vdupq_n_s8(0);
    const int8x16_t one = vdupq_n_s8(1);
    int16x4_t quant0, quant1, quant2, quant3;
    int8x8_t shuffle_0, shuffle_1;
    int8x8x2_t quant;
    int8x16_t shuffle_out, shuffle_out_abs;
    uint8x16_t sign, eq0, eq1;
    WORD32 sig_coeff_abs_gt0_flags, sig_coeff_abs_gt1_flags;

    /* Load Quant Values */
    quant0 = vld1_s16(pi2_csb_coeffs + 0 * trans_size);
    quant1 = vld1_s16(pi2_csb_coeffs + 1 * trans_size);
    quant2 = vld1_s16(pi2_csb_coeffs + 2 * trans_size);
    quant3 = vld1_s16(pi2_csb_coeffs + 3 * trans_size);

    /* All 4 rows: For sign, gt0, gt1 flags, even 8 bit version is enough! */
    quant.val[0] = vqmovn_s16(vcombine_s16(quant0, quant1));
    quant.val[1] = vqmovn_s16(vcombine_s16(quant2, quant3));

    /* Reorder the 16 raster-ordered coefficients into scan order */
    shuffle_0 = vtbl2_s8(quant, vget_low_s8(shuffle));
    shuffle_1 = vtbl2_s8(quant, vget_high_s8(shuffle));
    shuffle_out = vcombine_s8(shuffle_0, shuffle_1);

    /* ABS values */
    shuffle_out_abs = vabsq_s8(shuffle_out);

    /* sign bits : Will get 0xFF if (0 > shuffle_out) */
    sign = vcgtq_s8(zero, shuffle_out);
    /* eq0 : Will get 0xFF if ( shuffle_out == 0 ) */
    eq0 = vceqq_s8(shuffle_out, zero);
    /* eq1 : Will get 0xFF if ( abs(shuffle_out) == 1 ) */
    eq1 = vceqq_s8(shuffle_out_abs, one);

    /* gt0 = !(== 0), gt1 = !(abs == 1) restricted to the non-zero positions */
    sig_coeff_abs_gt0_flags = (~movemask_neon(eq0)) & 0x0000FFFF;
    sig_coeff_abs_gt1_flags = (~movemask_neon(eq1)) & sig_coeff_abs_gt0_flags;

    *pi4_sign_flags = movemask_neon(sign);
    *pi4_gt0_flags = sig_coeff_abs_gt0_flags;
    *pi4_gt1_flags =
        ihevce_scan_limit_gt1_flags_neon(sig_coeff_abs_gt0_flags, sig_coeff_abs_gt1_flags);
}

/**
*******************************************************************************
*
* @brief
*  Stores (abs(coeff) - 1) for every coefficient flagged in the gt1 mask
*
* @par Description:
*  Flagged positions are visited from the highest scan position to the lowest.
*  Each entry is written as a 16 bit word.
*
* @param[in] pu2_out_data_coeff
*  Location of the first entry to be written
*
* @param[in] pi2_csb_coeffs
*  Pointer to the top-left coefficient of the sub-block
*
* @param[in] trans_size
*  Transform size, used as the row stride of the coefficient buffer
*
* @param[in] pu1_csb_table
*  4x4 scan table mapping scan position to raster index (y * 4 + x)
*
* @param[in] sig_coeff_abs_gt1_flags
*  Mask of the scan positions to be stored
*
* @returns
*  Pointer just past the last entry written
*
*******************************************************************************
*/
static UWORD16 *ihevce_scan_store_abs_coeff_remaining_neon(
    UWORD16 *pu2_out_data_coeff,
    WORD16 *pi2_csb_coeffs,
    WORD32 trans_size,
    UWORD8 *pu1_csb_table,
    WORD32 sig_coeff_abs_gt1_flags)
{
    WORD32 num_gt1_flag = ihevce_num_ones_popcnt(sig_coeff_abs_gt1_flags);
    WORD32 i;

    for(i = 0; i < num_gt1_flag; i++)
    {
        /* NOTE(review): volatile is inherited from the original code; its */
        /* purpose is not visible here, so it is kept as is                */
        volatile WORD32 bit_pos;
        WORD32 x_pos, y_pos, quant_coeff;

        ASSERT(sig_coeff_abs_gt1_flags != 0);

        /* Take the highest remaining flagged scan position and clear it */
        GET_POS_MSB_32(bit_pos, sig_coeff_abs_gt1_flags);
        sig_coeff_abs_gt1_flags = CLEAR_BIT(sig_coeff_abs_gt1_flags, bit_pos);

        x_pos = (pu1_csb_table[bit_pos] & 0x3);
        y_pos = (pu1_csb_table[bit_pos] >> 2);

        quant_coeff = pi2_csb_coeffs[x_pos + (y_pos * trans_size)];

        /* storing u2_abs_coeff_remaining[i] 2 bytes */
        *pu2_out_data_coeff = (UWORD16)(abs(quant_coeff) - 1);
        pu2_out_data_coeff++;
    }

    return pu2_out_data_coeff;
}

/**
*******************************************************************************
*
* @brief
*  Scans the quantized coefficients of a transform unit and writes them to
*  the output buffer in a compact per sub-block format (NEON version)
*
* @par Description:
*  Sub-blocks (4x4) are visited in reverse scan order. The output consists of
*   - 4 header bytes: last_x, last_y, scan_idx, scan index of the last coded
*     sub-block
*   - for the last coded sub-block and every sub-block before it in scan order
*     a 16 bit word 0xBAD0 | flags (bit 0: sub-block coded, bit 1: right
*     neighbour coded, bit 2: bottom neighbour coded)
*   - for every coded sub-block additionally three 16 bit words (gt0 mask,
*     gt1 mask, sign mask) followed by one 16 bit (abs - 1) entry per bit set
*     in the gt1 mask
*
* @param[in] pi2_quant_coeffs
*  Quantized coefficients, trans_size x trans_size, row stride trans_size
*
* @param[in] pi4_subBlock2csbfId_map
*  Maps a raster sub-block index to an index into pu1_csbf_buf
*
* @param[in] scan_idx
*  Selects the scan tables; also stored in the output header
*
* @param[in] trans_size
*  Transform size: 4, 8, 16 or 32
*
* @param[out] pu1_out_data
*  Output buffer. NOTE(review): 16 bit stores are made at pu1_out_data + 4,
*  so the buffer is presumably at least 2 byte aligned - confirm with caller
*
* @param[in] pu1_csbf_buf
*  Coded sub-block flags (assumed to hold only 0 or 1)
*
* @param[in] i4_csbf_stride
*  Unused
*
* @returns
*  Number of bytes written to pu1_out_data; 0 if trans_size is not supported
*  or if no sub-block is flagged as coded
*
*******************************************************************************
*/
WORD32 ihevce_scan_coeffs_neon(
    WORD16 *pi2_quant_coeffs,
    WORD32 *pi4_subBlock2csbfId_map,
    WORD32 scan_idx,
    WORD32 trans_size,
    UWORD8 *pu1_out_data,
    UWORD8 *pu1_csbf_buf,
    WORD32 i4_csbf_stride)
{
    WORD32 trans_unit_idx;
    WORD32 shift_value, mask_value;
    WORD32 blk_row, blk_col;
    WORD32 sign_flag, sig_coeff_abs_gt0_flags, sig_coeff_abs_gt1_flags;
    WORD32 pos_last_coded;
    UWORD16 u2_csbf0flags;
    UWORD8 *pu1_trans_table;
    UWORD8 *pu1_csb_table;
    UWORD8 *pu1_out_data_header = pu1_out_data;
    UWORD16 *pu2_out_data_coeff;
    WORD16 *pi2_temp_quant_coeff;
    int8x16_t shuffle;

    (void)i4_csbf_stride;

    /* Scan order of the coefficients inside a 4x4 sub-block */
    pu1_csb_table = (UWORD8 *)&(g_u1_scan_table_4x4[scan_idx][0]);

    /* A raster sub-block index splits into row = idx >> shift, col = idx & mask */
    GETRANGE(shift_value, trans_size);
    shift_value = shift_value - 3;
    mask_value = (trans_size / 4) - 1;

    /* Scan order of the sub-blocks inside the transform unit */
    switch(trans_size)
    {
    case 32:
        pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_8x8[scan_idx][0]);
        break;
    case 16:
        pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_4x4[scan_idx][0]);
        break;
    case 8:
        pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_2x2[scan_idx][0]);
        break;
    case 4:
        pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_1x1[0]);
        break;
    default:
        /* Unsupported size: no scan table exists, so nothing is written */
        return 0;
    }

    shuffle = vld1q_s8((WORD8 *)pu1_csb_table);

    /* Find the last coded sub-block in scan order */
    trans_unit_idx = (trans_size * trans_size / 16) - 1;
    while((trans_unit_idx >= 0) &&
          (0 == pu1_csbf_buf[pi4_subBlock2csbfId_map[pu1_trans_table[trans_unit_idx]]]))
    {
        trans_unit_idx--;
    }

    if(trans_unit_idx < 0)
    {
        /* No coded sub-block at all: nothing is written */
        return 0;
    }

    /*************************************************************************/
    /* Last coded sub-block: header + sub-block data                         */
    /*************************************************************************/
    blk_row = pu1_trans_table[trans_unit_idx] >> shift_value;
    blk_col = pu1_trans_table[trans_unit_idx] & mask_value;

    /* x_pos=blk_col*4, y_pos=blk_row*4 */
    pi2_temp_quant_coeff = pi2_quant_coeffs + blk_col * 4 + (blk_row * 4) * trans_size;

    ihevce_scan_csb_flags_neon(
        pi2_temp_quant_coeff,
        trans_size,
        shuffle,
        &sign_flag,
        &sig_coeff_abs_gt0_flags,
        &sig_coeff_abs_gt1_flags);

    /* Highest scan position holding a non-zero coefficient */
    ASSERT(sig_coeff_abs_gt0_flags != 0);
    GET_POS_MSB_32(pos_last_coded, sig_coeff_abs_gt0_flags);

    /* storing last_x and last_y: position of last coded coeff wrt the TU */
    *pu1_out_data_header = (UWORD8)((pu1_csb_table[pos_last_coded] & 0x3) + blk_col * 4);
    pu1_out_data_header++;

    *pu1_out_data_header = (UWORD8)((pu1_csb_table[pos_last_coded] >> 2) + blk_row * 4);
    pu1_out_data_header++;

    /* storing the scan order */
    *pu1_out_data_header = (UWORD8)scan_idx;
    pu1_out_data_header++;

    /* storing last_sub_block pos. in scan order count */
    *pu1_out_data_header = (UWORD8)trans_unit_idx;
    pu1_out_data_header++;

    /* stored the first 4 bytes, now all are word16. So word16 pointer */
    pu2_out_data_coeff = (UWORD16 *)pu1_out_data_header;

    /* storing u2_csbf0flags word */
    u2_csbf0flags = 0xBAD0 | 1; /* since right & bottom csbf is 0 */
    *pu2_out_data_coeff = u2_csbf0flags;
    pu2_out_data_coeff++;

    /* storing u2_sig_coeff_abs_gt0_flags 2 bytes */
    *pu2_out_data_coeff = (UWORD16)sig_coeff_abs_gt0_flags;
    pu2_out_data_coeff++;

    /* storing u2_sig_coeff_abs_gt1_flags 2 bytes */
    *pu2_out_data_coeff = (UWORD16)sig_coeff_abs_gt1_flags;
    pu2_out_data_coeff++;

    /* storing u2_sign_flags 2 bytes */
    *pu2_out_data_coeff = (UWORD16)sign_flag;
    pu2_out_data_coeff++;

    /* Store the u2_abs_coeff_remaining[] */
    pu2_out_data_coeff = ihevce_scan_store_abs_coeff_remaining_neon(
        pu2_out_data_coeff,
        pi2_temp_quant_coeff,
        trans_size,
        pu1_csb_table,
        sig_coeff_abs_gt1_flags);

    /*************************************************************************/
    /* go through remaining csb in the scan order                            */
    /*************************************************************************/
    for(trans_unit_idx = trans_unit_idx - 1; trans_unit_idx >= 0; trans_unit_idx--)
    {
        blk_row = pu1_trans_table[trans_unit_idx] >> shift_value; /* row of csb */
        blk_col = pu1_trans_table[trans_unit_idx] & mask_value; /* col of csb */

        /* u2_csbf0flags word */
        u2_csbf0flags = 0xBAD0 | /* assuming csbf_buf has only 0 or 1 values */
                        (pu1_csbf_buf[pi4_subBlock2csbfId_map[pu1_trans_table[trans_unit_idx]]]);

        /********************************************************************/
        /* Minor hack: As per HEVC spec csbf is not signalled in stream for */
        /* block0, instead sig coeff map is directly signalled. This is     */
        /* taken care by forcing csbf for block0 to be 1 even if it is 0    */
        /********************************************************************/
        if(0 == trans_unit_idx)
        {
            u2_csbf0flags |= 1;
        }

        if((blk_col + 1 < trans_size / 4)) /* checking right boundary */
        {
            if(pu1_csbf_buf[pi4_subBlock2csbfId_map[blk_row * trans_size / 4 + blk_col + 1]])
            {
                /* set the 2nd bit of u2_csbf0flags for right csbf */
                u2_csbf0flags = u2_csbf0flags | (1 << 1);
            }
        }
        if((blk_row + 1 < trans_size / 4)) /* checking bottom boundary */
        {
            if(pu1_csbf_buf[pi4_subBlock2csbfId_map[(blk_row + 1) * trans_size / 4 + blk_col]])
            {
                /* set the 3rd bit of u2_csbf0flags for bottom csbf */
                u2_csbf0flags = u2_csbf0flags | (1 << 2);
            }
        }

        /* storing u2_csbf0flags word */
        *pu2_out_data_coeff = u2_csbf0flags;
        pu2_out_data_coeff++;

        /* check for the csb flag in our scan order */
        if(u2_csbf0flags & 0x1)
        {
            /* x_pos=blk_col*4, y_pos=blk_row*4 */
            pi2_temp_quant_coeff = pi2_quant_coeffs + blk_col * 4 + (blk_row * 4) * trans_size;

            ihevce_scan_csb_flags_neon(
                pi2_temp_quant_coeff,
                trans_size,
                shuffle,
                &sign_flag,
                &sig_coeff_abs_gt0_flags,
                &sig_coeff_abs_gt1_flags);

            /* storing u2_sig_coeff_abs_gt0_flags 2 bytes */
            *pu2_out_data_coeff = (UWORD16)sig_coeff_abs_gt0_flags;
            pu2_out_data_coeff++;

            /* storing u2_sig_coeff_abs_gt1_flags 2 bytes */
            *pu2_out_data_coeff = (UWORD16)sig_coeff_abs_gt1_flags;
            pu2_out_data_coeff++;

            /* storing u2_sign_flags 2 bytes */
            *pu2_out_data_coeff = (UWORD16)sign_flag;
            pu2_out_data_coeff++;

            /* Store the u2_abs_coeff_remaining[] */
            pu2_out_data_coeff = ihevce_scan_store_abs_coeff_remaining_neon(
                pu2_out_data_coeff,
                pi2_temp_quant_coeff,
                trans_size,
                pu1_csb_table,
                sig_coeff_abs_gt1_flags);
        }
    }

    /* Return the number of bytes written to out_data */
    return (WORD32)((UWORD8 *)pu2_out_data_coeff - pu1_out_data);
}
497