xref: /aosp_15_r20/external/libhevc/encoder/arm/ihevce_scan_coeffs_neon.c (revision c83a76b084498d55f252f48b2e3786804cdf24b7)
1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 *  ihevce_scan_coeffs_neon.c
24 *
25 * @brief
26 *  Contains definitions for scanning quantized tu
27 *
28 * @author
29 *  Ittiam
30 *
31 * @par List of Functions:
32 *
33 * @remarks
34 *  None
35 *
36 ********************************************************************************
37 */
38 /*****************************************************************************/
39 /* File Includes                                                             */
40 /*****************************************************************************/
41 /* System include files */
42 #include <stdio.h>
43 #include <stdlib.h>
44 #include <assert.h>
45 #include <string.h>
46 #include <arm_neon.h>
47 
48 /* User include files */
49 #include "ihevc_typedefs.h"
50 #include "itt_video_api.h"
51 #include "ihevc_defs.h"
52 #include "ihevc_debug.h"
53 #include "ihevce_api.h"
54 #include "ihevce_defs.h"
55 #include "rc_cntrl_param.h"
56 #include "rc_frame_info_collector.h"
57 #include "rc_look_ahead_params.h"
58 #include "ihevce_lap_enc_structs.h"
59 #include "ihevc_platform_macros.h"
60 #include "ihevc_structs.h"
61 #include "ihevce_multi_thrd_structs.h"
62 
63 #include "ihevc_deblk.h"
64 #include "ihevc_itrans_recon.h"
65 #include "ihevc_chroma_itrans_recon.h"
66 #include "ihevc_chroma_intra_pred.h"
67 #include "ihevc_intra_pred.h"
68 #include "ihevc_inter_pred.h"
69 #include "ihevc_mem_fns.h"
70 #include "ihevc_padding.h"
71 #include "ihevc_weighted_pred.h"
72 #include "ihevc_sao.h"
73 #include "ihevc_resi_trans.h"
74 #include "ihevc_quant_iquant_ssd.h"
75 #include "ihevce_function_selector.h"
76 #include "ihevce_me_common_defs.h"
77 #include "ihevce_enc_structs.h"
78 #include "ihevce_global_tables.h"
79 #include "ihevce_ipe_instr_set_router.h"
80 #include "ihevce_common_utils.h"
81 
82 /*****************************************************************************/
83 /* Function Declarations                                                     */
84 /*****************************************************************************/
/* Declares ihevce_scan_coeffs_neon through the FT_SCAN_COEFFS function type */
/* (provided by one of the headers included above), so the definition later  */
/* in this file must be compatible with that prototype.                      */
85 FT_SCAN_COEFFS ihevce_scan_coeffs_neon;
86 
87 /*****************************************************************************/
88 /* Function Definitions                                                      */
89 /*****************************************************************************/
/**
*******************************************************************************
*
* @brief
*  Packs the most significant bit of each of the 16 byte lanes of a vector
*  into a 16-bit mask
*
* @par Description:
*  Bit i of the result is the MSB of byte lane i of input (lane 0 -> bit 0,
*  lane 15 -> bit 15). For the 0x00 / 0xFF lanes produced by the NEON compare
*  intrinsics this yields one bit per lane telling whether the compare was true
*
* @param[in] input
*  16 byte lanes; only bit 7 of every lane is looked at
*
* @returns
*  Mask in the range [0, 0xFFFF]; the upper 16 bits are always zero
*
*******************************************************************************
*/
static WORD32 movemask_neon(uint8x16_t input)
{
    /* Per-lane shift counts. A negative count makes vshl_u8 shift right, so  */
    /* the isolated 0x80 bit of lane i ends up at bit i of that lane          */
    /* (lane 0 -> 0x01, lane 1 -> 0x02, ... lane 7 -> 0x80)                   */
    static const int8_t ai1_lane_shift[8] = { -7, -6, -5, -4, -3, -2, -1, 0 };

    const uint8x8_t msb_mask = vdup_n_u8(0x80);
    const int8x8_t lane_shift = vld1_s8(ai1_lane_shift);

    /* Keep only the MSB of every lane and move it to its lane-index bit */
    uint8x8_t lo = vshl_u8(vand_u8(vget_low_u8(input), msb_mask), lane_shift);
    uint8x8_t hi = vshl_u8(vand_u8(vget_high_u8(input), msb_mask), lane_shift);

    /* Three pairwise additions fold all 8 lanes into lane 0. Every lane      */
    /* holds a different single bit, so the sum is the OR of the lanes and    */
    /* cannot overflow 8 bits                                                 */
    lo = vpadd_u8(lo, lo);
    lo = vpadd_u8(lo, lo);
    lo = vpadd_u8(lo, lo);

    hi = vpadd_u8(hi, hi);
    hi = vpadd_u8(hi, hi);
    hi = vpadd_u8(hi, hi);

    /* Lanes 8..15 form the upper byte of the mask, lanes 0..7 the lower one */
    return ((vget_lane_u8(hi, 0) << 8) | (vget_lane_u8(lo, 0) & 0xFF));
}
115 
/**
*******************************************************************************
*
* @brief
*  NEON version of the quantized coefficient scan of one transform block
*
* @par Description:
*  Walks the 4x4 coded sub-blocks (csb) of a trans_size x trans_size block in
*  reverse scan order. The first csb (from the end of the scan) whose csbf
*  entry is non-zero is the "last coded" csb. Output written to pu1_out_data:
*
*   - 4 header bytes : last_x, last_y (position of the last non-zero
*                      coefficient inside the block), scan_idx, index of the
*                      last coded csb in scan order
*   - for the last coded csb and every csb before it in scan order, as
*     16-bit words:
*       csbf0flags  : 0xBAD0 | csbf (bit 0) | right csbf (bit 1) |
*                     bottom csbf (bit 2)
*     and, only when bit 0 of csbf0flags is set:
*       gt0 flags   : bit i set if the coefficient at scan position i != 0
*       gt1 flags   : bit i set if an explicit abs value is stored for it
*       sign flags  : bit i set if the coefficient at scan position i < 0
*       abs - 1     : one word per bit set in the gt1 flags, starting from
*                     the most significant set bit
*
*  Flags are derived from coefficients saturated to 8 bits; the stored
*  abs - 1 values are read back from the 16-bit input.
*
* @param[in] pi2_quant_coeffs
*  Quantized coefficients, addressed as x + y * trans_size
*
* @param[in] pi4_subBlock2csbfId_map
*  Maps a csb index (blk_row * (trans_size / 4) + blk_col) to an index into
*  pu1_csbf_buf
*
* @param[in] scan_idx
*  Selects the row of the global scan tables
*
* @param[in] trans_size
*  Block size; 4, 8, 16 and 32 are handled
*
* @param[out] pu1_out_data
*  Destination buffer. NOTE(review): written through a UWORD16 pointer at
*  offset 4, so it is presumably expected to be 2-byte aligned and large
*  enough for the worst case - confirm against the caller
*
* @param[in] pu1_csbf_buf
*  Coded sub-block flags; the code assumes every entry is 0 or 1
*
* @param[in] i4_csbf_stride
*  Unused
*
* @returns
*  Number of bytes written to pu1_out_data. 0 if no csb has its csbf set or
*  if trans_size is not one of the handled sizes
*
*******************************************************************************
*/
WORD32 ihevce_scan_coeffs_neon(
    WORD16 *pi2_quant_coeffs,
    WORD32 *pi4_subBlock2csbfId_map,
    WORD32 scan_idx,
    WORD32 trans_size,
    UWORD8 *pu1_out_data,
    UWORD8 *pu1_csbf_buf,
    WORD32 i4_csbf_stride)
{
    WORD32 i, trans_unit_idx, num_gt1_flag, num_gt0_flag;
    UWORD16 u2_csbf0flags;
    WORD32 num_bytes;
    UWORD8 *pu1_trans_table;
    UWORD8 *pu1_csb_table;
    WORD32 shift_value, mask_value;
    WORD32 blk_row, blk_col;
    WORD32 x_pos, y_pos;
    WORD32 quant_coeff;

    UWORD8 *pu1_out_data_header = pu1_out_data;
    /* Starts at the beginning of the output so that the byte count computed  */
    /* at the end is 0 (instead of reading an uninitialised pointer) when no  */
    /* coded sub-block is found                                               */
    UWORD16 *pu2_out_data_coeff = (UWORD16 *)pu1_out_data;

    int8x16_t one, shuffle, zero;
    int8x8_t shuffle_lo, shuffle_hi;
    int8x8x2_t quant;

    (void)i4_csbf_stride;

    /* Scan order of the 4x4 sub-blocks inside the transform block */
    switch(trans_size)
    {
    case 32:
        pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_8x8[scan_idx][0]);
        break;
    case 16:
        pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_4x4[scan_idx][0]);
        break;
    case 8:
        pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_2x2[scan_idx][0]);
        break;
    case 4:
        pu1_trans_table = (UWORD8 *)&(g_u1_scan_table_1x1[0]);
        break;
    default:
        /* Unsupported size: there is no scan table to walk, write nothing */
        ASSERT(0);
        return 0;
    }

    /* Scan order of the 16 coefficients inside one 4x4 sub-block; every     */
    /* entry is a raster position (x = entry & 3, y = entry >> 2)            */
    pu1_csb_table = (UWORD8 *)&(g_u1_scan_table_4x4[scan_idx][0]);

    /* Used to split a sub-block index into row (>> shift_value) and column  */
    /* (& mask_value). NOTE(review): relies on GETRANGE yielding             */
    /* log2(trans_size) + 1 - confirm against the macro definition           */
    GETRANGE(shift_value, trans_size);
    shift_value = shift_value - 3;
    mask_value = (trans_size / 4) - 1;

    /* The scan table doubles as the byte-shuffle index for vtbl2: output    */
    /* lane i receives the coefficient at scan position i                    */
    shuffle = vld1q_s8((WORD8 *)pu1_csb_table);
    shuffle_lo = vget_low_s8(shuffle);
    shuffle_hi = vget_high_s8(shuffle);
    zero = vdupq_n_s8(0);
    one = vdupq_n_s8(1);

    /* Find the last coded sub-block in scan order and emit header + its data */
    for(trans_unit_idx = (trans_size * trans_size / 16) - 1; trans_unit_idx >= 0; trans_unit_idx--)
    {
        if(pu1_csbf_buf[pi4_subBlock2csbfId_map[pu1_trans_table[trans_unit_idx]]])
        {
            WORD32 sig_coeff_abs_gt0_flags, sig_coeff_abs_gt1_flags;
            WORD32 sign_flag, pos_last_coded;
            UWORD8 u1_last_x, u1_last_y;
            WORD16 *pi2_temp_quant_coeff = pi2_quant_coeffs;

            int16x4_t quant0, quant1, quant2, quant3;
            int16x8_t quant01, quant23;
            int8x8_t shuffle_0, shuffle_1;
            int8x16_t shuffle_out, shuffle_out_abs;
            uint8x16_t sign, eq0, eq1;

            blk_row = pu1_trans_table[trans_unit_idx] >> shift_value; /*row of csb*/
            blk_col = pu1_trans_table[trans_unit_idx] & mask_value; /*col of csb*/

            /* x_pos=blk_col*4, y_pos=blk_row*4 */
            pi2_temp_quant_coeff += (blk_col * 4 + (blk_row * 4) * trans_size);

            /* Load the four rows of the 4x4 sub-block */
            quant0 = vld1_s16(pi2_temp_quant_coeff + 0 * trans_size);
            quant1 = vld1_s16(pi2_temp_quant_coeff + 1 * trans_size);
            quant2 = vld1_s16(pi2_temp_quant_coeff + 2 * trans_size);
            quant3 = vld1_s16(pi2_temp_quant_coeff + 3 * trans_size);

            /* Two quant rows together */
            quant01 = vcombine_s16(quant0, quant1);
            quant23 = vcombine_s16(quant2, quant3);

            /* Saturate to 8 bit: enough to decide sign, ==0 and abs==1 */
            quant.val[0] = vqmovn_s16(quant01);
            quant.val[1] = vqmovn_s16(quant23);

            /* Reorder from raster order to scan order */
            shuffle_0 = vtbl2_s8(quant, shuffle_lo);
            shuffle_1 = vtbl2_s8(quant, shuffle_hi);
            shuffle_out = vcombine_s8(shuffle_0, shuffle_1);

            /* ABS values */
            shuffle_out_abs = vabsq_s8(shuffle_out);

            /* sign bits : Will get 0xFF if (0 > shuffle_out) */
            sign = vcgtq_s8(zero, shuffle_out);
            /* gt0 : Will get 0xFF if ( shuffle_out == 0 ) */
            eq0 = vceqq_s8(shuffle_out, zero);
            /* gt1 : Will get 0xFF if ( abs(shuffle_out) == 1 ) */
            eq1 = vceqq_s8(shuffle_out_abs, one);

            /* One bit per scan position; upper 16 bits are zero */
            sign_flag = movemask_neon(sign);
            sig_coeff_abs_gt0_flags = movemask_neon(eq0);
            sig_coeff_abs_gt1_flags = movemask_neon(eq1);

            /* Update gt0 and gt1 based on ==0 and ==1 flag */
            sig_coeff_abs_gt0_flags = ~sig_coeff_abs_gt0_flags; /* != 0 */
            sig_coeff_abs_gt1_flags = ~sig_coeff_abs_gt1_flags; /* (abs) != 1 */
            sig_coeff_abs_gt0_flags = sig_coeff_abs_gt0_flags & 0x0000FFFF; /* Clear high Word */
            sig_coeff_abs_gt1_flags = sig_coeff_abs_gt1_flags & sig_coeff_abs_gt0_flags;

            /* Highest set bit = last non-zero coefficient in scan order */
            ASSERT(sig_coeff_abs_gt0_flags != 0);
            GET_POS_MSB_32(pos_last_coded, sig_coeff_abs_gt0_flags);

            /* Update gt1 flag based on num_gt0_flag */
            num_gt0_flag = ihevce_num_ones_popcnt(sig_coeff_abs_gt0_flags);

            /* Find the position of 9th(MAX_GT_ONE+1) 1 in sig_coeff_abs_gt0_flags from MSB and update gt1 flag */
            if(num_gt0_flag > MAX_GT_ONE)
            {
                WORD32 gt0_first_byte = sig_coeff_abs_gt0_flags & 0xFF;
                WORD32 num_gt0_second_byte =
                    ihevce_num_ones_popcnt(sig_coeff_abs_gt0_flags & 0xFF00);
                WORD32 pos_nineth_one; /* pos. of 9th one from MSB of sig_coeff_abs_gt0_flags */
                WORD32 gt0_after_nineth_one, num_gt0_first_byte_to_nine;

                /* Ones still to be skipped in the low byte to reach the 9th. */
                /* The upper byte holds at most 8 ones, so this is >= 1 and   */
                /* the loop below always sets pos_nineth_one                  */
                /* NOTE(review): assumes MAX_GT_ONE is 8 - confirm            */
                num_gt0_first_byte_to_nine = (MAX_GT_ONE + 1) - num_gt0_second_byte;

                while(num_gt0_first_byte_to_nine)
                {
                    GET_POS_MSB_32(pos_nineth_one, gt0_first_byte);
                    gt0_first_byte = CLEAR_BIT(gt0_first_byte, pos_nineth_one);
                    num_gt0_first_byte_to_nine--;
                }

                /* Every non-zero coefficient from the 9th one onwards gets   */
                /* its gt1 bit forced, so its abs value is stored explicitly  */
                gt0_after_nineth_one = SET_BIT(gt0_first_byte, pos_nineth_one);
                sig_coeff_abs_gt1_flags = sig_coeff_abs_gt1_flags | gt0_after_nineth_one;
            }

            /* Get x_pos & y_pos of last coded in csb wrt to TU */
            u1_last_x = (pu1_csb_table[pos_last_coded] & 0x3) + blk_col * 4;
            u1_last_y = (pu1_csb_table[pos_last_coded] >> 2) + blk_row * 4;

            num_gt1_flag = ihevce_num_ones_popcnt(sig_coeff_abs_gt1_flags);

            /* storing last_x and last_y */
            *pu1_out_data_header = u1_last_x;
            pu1_out_data_header++;

            *pu1_out_data_header = u1_last_y;
            pu1_out_data_header++;

            /* storing the scan order */
            *pu1_out_data_header = (UWORD8)scan_idx;
            pu1_out_data_header++;

            /* storing last_sub_block pos. in scan order count */
            *pu1_out_data_header = (UWORD8)trans_unit_idx;
            pu1_out_data_header++;

            /*stored the first 4 bytes, now all are word16. So word16 pointer*/
            pu2_out_data_coeff = (UWORD16 *)pu1_out_data_header;

            /* u2_csbf0flags word */
            u2_csbf0flags = 0xBAD0 | 1; /*since right&bottom csbf is 0*/
            /* storing u2_csbf0flags word */
            *pu2_out_data_coeff = u2_csbf0flags;
            pu2_out_data_coeff++;

            /* storing u2_sig_coeff_abs_gt0_flags 2 bytes */
            *pu2_out_data_coeff = (UWORD16)sig_coeff_abs_gt0_flags;
            pu2_out_data_coeff++;

            /* storing u2_sig_coeff_abs_gt1_flags 2 bytes */
            *pu2_out_data_coeff = (UWORD16)sig_coeff_abs_gt1_flags;
            pu2_out_data_coeff++;

            /* storing u2_sign_flags 2 bytes */
            *pu2_out_data_coeff = (UWORD16)sign_flag;
            pu2_out_data_coeff++;

            /* Store the u2_abs_coeff_remaining[] */
            for(i = 0; i < num_gt1_flag; i++)
            {
                /* NOTE(review): volatile kept from the original code; it is  */
                /* presumably a compiler workaround - confirm before removing */
                volatile WORD32 bit_pos;
                ASSERT(sig_coeff_abs_gt1_flags != 0);

                /* Consume the gt1 bits from the most significant one down */
                GET_POS_MSB_32(bit_pos, sig_coeff_abs_gt1_flags);
                sig_coeff_abs_gt1_flags = CLEAR_BIT(sig_coeff_abs_gt1_flags, bit_pos);

                x_pos = (pu1_csb_table[bit_pos] & 0x3);
                y_pos = (pu1_csb_table[bit_pos] >> 2);

                /* Read the full 16-bit value, not the saturated 8-bit copy */
                quant_coeff = pi2_temp_quant_coeff[x_pos + (y_pos * trans_size)];

                /* storing u2_abs_coeff_remaining[i] 2 bytes */
                *pu2_out_data_coeff = (UWORD16)(abs(quant_coeff) - 1);
                pu2_out_data_coeff++;
            }

            break; /*We just need this loop for finding 1st non-zero csb only*/
        }
    }

    /* go through remaining csb in the scan order */
    /* (not entered when no coded csb was found: trans_unit_idx is -1 then) */
    for(trans_unit_idx = trans_unit_idx - 1; trans_unit_idx >= 0; trans_unit_idx--)
    {
        blk_row = pu1_trans_table[trans_unit_idx] >> shift_value; /*row of csb*/
        blk_col = pu1_trans_table[trans_unit_idx] & mask_value; /*col of csb*/

        /* u2_csbf0flags word */
        u2_csbf0flags = 0xBAD0 | /* assuming csbf_buf has only 0 or 1 values */
                        (pu1_csbf_buf[pi4_subBlock2csbfId_map[pu1_trans_table[trans_unit_idx]]]);

        /********************************************************************/
        /* Minor hack: As per HEVC spec csbf is not signalled in stream for */
        /* block0, instead sig coeff map is directly signalled. This is     */
        /* taken care by forcing csbf for block0 to be 1 even if it is 0    */
        /********************************************************************/
        if(0 == trans_unit_idx)
        {
            u2_csbf0flags |= 1;
        }

        if((blk_col + 1 < trans_size / 4)) /* checking right boundary */
        {
            if(pu1_csbf_buf[pi4_subBlock2csbfId_map[blk_row * trans_size / 4 + blk_col + 1]])
            {
                /* set the 2nd bit of u2_csbf0flags for right csbf */
                u2_csbf0flags = u2_csbf0flags | (1 << 1);
            }
        }
        if((blk_row + 1 < trans_size / 4)) /* checking bottom boundary */
        {
            if(pu1_csbf_buf[pi4_subBlock2csbfId_map[(blk_row + 1) * trans_size / 4 + blk_col]])
            {
                /* set the 3rd bit of u2_csbf0flags  for bottom csbf */
                u2_csbf0flags = u2_csbf0flags | (1 << 2);
            }
        }

        /* storing u2_csbf0flags word */
        *pu2_out_data_coeff = u2_csbf0flags;
        pu2_out_data_coeff++;

        /* check for the csb flag in our scan order */
        if(u2_csbf0flags & 0x1)
        {
            WORD32 sig_coeff_abs_gt0_flags, sig_coeff_abs_gt1_flags;
            WORD32 sign_flag;

            int16x4_t quant0, quant1, quant2, quant3;
            int16x8_t quant01, quant23;
            int8x8_t shuffle_0, shuffle_1;
            int8x16_t shuffle_out, shuffle_out_abs;
            uint8x16_t sign, eq0, eq1;

            /* x_pos=blk_col*4, y_pos=blk_row*4 */
            WORD16 *pi2_temp_quant_coeff =
                pi2_quant_coeffs + blk_col * 4 + (blk_row * 4) * trans_size;

            /* Load Quant Values */
            quant0 = vld1_s16(pi2_temp_quant_coeff + 0 * trans_size);
            quant1 = vld1_s16(pi2_temp_quant_coeff + 1 * trans_size);
            quant2 = vld1_s16(pi2_temp_quant_coeff + 2 * trans_size);
            quant3 = vld1_s16(pi2_temp_quant_coeff + 3 * trans_size);

            /* Two quant rows together */
            quant01 = vcombine_s16(quant0, quant1);
            quant23 = vcombine_s16(quant2, quant3);

            /* All 4 rows: For sign, gt0, gt1 flags, even 8 bit version is enough! */
            quant.val[0] = vqmovn_s16(quant01);
            quant.val[1] = vqmovn_s16(quant23);

            /* Reorder from raster order to scan order */
            shuffle_0 = vtbl2_s8(quant, shuffle_lo);
            shuffle_1 = vtbl2_s8(quant, shuffle_hi);
            shuffle_out = vcombine_s8(shuffle_0, shuffle_1);

            /* ABS values */
            shuffle_out_abs = vabsq_s8(shuffle_out);

            /* sign bits : Will get 0xFF if (0 > shuffle_out) */
            sign = vcgtq_s8(zero, shuffle_out);
            /* gt0 : Will get 0xFF if ( shuffle_out == 0 ) */
            eq0 = vceqq_s8(shuffle_out, zero);
            /* gt1 : Will get 0xFF if ( abs(shuffle_out) == 1 ) */
            eq1 = vceqq_s8(shuffle_out_abs, one);

            /* movemask:0 extended upper 16bits,Only low16 bits are required while storing */
            sign_flag = movemask_neon(sign);
            sig_coeff_abs_gt0_flags = movemask_neon(eq0);
            sig_coeff_abs_gt1_flags = movemask_neon(eq1);

            /* Update gt0 and gt1 based on ==0 and ==1 flag */
            sig_coeff_abs_gt0_flags = ~sig_coeff_abs_gt0_flags; /* != 0 */
            sig_coeff_abs_gt1_flags = ~sig_coeff_abs_gt1_flags; /* (abs) != 1 */
            sig_coeff_abs_gt0_flags = sig_coeff_abs_gt0_flags & 0x0000FFFF; /* Clear high Word */
            sig_coeff_abs_gt1_flags = sig_coeff_abs_gt1_flags & sig_coeff_abs_gt0_flags;

            /* Update gt1 flag based on num_gt0_flag */
            num_gt0_flag = ihevce_num_ones_popcnt(sig_coeff_abs_gt0_flags);

            /* Find the position of 9th(MAX_GT_ONE+1) 1 in sig_coeff_abs_gt0_flags from MSB and update gt1 flag */
            if(num_gt0_flag > MAX_GT_ONE)
            {
                WORD32 gt0_first_byte = sig_coeff_abs_gt0_flags & 0xFF;
                WORD32 num_gt0_second_byte =
                    ihevce_num_ones_popcnt(sig_coeff_abs_gt0_flags & 0xFF00);
                WORD32 pos_nineth_one; /* pos. of 9th one from MSB of sig_coeff_abs_gt0_flags */
                WORD32 gt0_after_nineth_one, num_gt0_first_byte_to_nine;

                /* Ones still to be skipped in the low byte to reach the 9th. */
                /* The upper byte holds at most 8 ones, so this is >= 1 and   */
                /* the loop below always sets pos_nineth_one                  */
                /* NOTE(review): assumes MAX_GT_ONE is 8 - confirm            */
                num_gt0_first_byte_to_nine = (MAX_GT_ONE + 1) - num_gt0_second_byte;

                while(num_gt0_first_byte_to_nine)
                {
                    GET_POS_MSB_32(pos_nineth_one, gt0_first_byte);
                    gt0_first_byte = CLEAR_BIT(gt0_first_byte, pos_nineth_one);
                    num_gt0_first_byte_to_nine--;
                }

                /* Every non-zero coefficient from the 9th one onwards gets   */
                /* its gt1 bit forced, so its abs value is stored explicitly  */
                gt0_after_nineth_one = SET_BIT(gt0_first_byte, pos_nineth_one);
                sig_coeff_abs_gt1_flags = sig_coeff_abs_gt1_flags | gt0_after_nineth_one;
            }

            num_gt1_flag = ihevce_num_ones_popcnt(sig_coeff_abs_gt1_flags);

            /* storing u2_sig_coeff_abs_gt0_flags 2 bytes */
            *pu2_out_data_coeff = (UWORD16)sig_coeff_abs_gt0_flags;
            pu2_out_data_coeff++;

            /* storing u2_sig_coeff_abs_gt1_flags 2 bytes */
            *pu2_out_data_coeff = (UWORD16)sig_coeff_abs_gt1_flags;
            pu2_out_data_coeff++;

            /* storing u2_sign_flags 2 bytes */
            *pu2_out_data_coeff = (UWORD16)sign_flag;
            pu2_out_data_coeff++;

            /* Store the u2_abs_coeff_remaining[] */
            for(i = 0; i < num_gt1_flag; i++)
            {
                /* NOTE(review): volatile kept from the original code; it is  */
                /* presumably a compiler workaround - confirm before removing */
                volatile WORD32 bit_pos;
                ASSERT(sig_coeff_abs_gt1_flags != 0);

                /* Consume the gt1 bits from the most significant one down */
                GET_POS_MSB_32(bit_pos, sig_coeff_abs_gt1_flags);
                sig_coeff_abs_gt1_flags = CLEAR_BIT(sig_coeff_abs_gt1_flags, bit_pos);

                x_pos = (pu1_csb_table[bit_pos] & 0x3);
                y_pos = (pu1_csb_table[bit_pos] >> 2);

                /* Read the full 16-bit value, not the saturated 8-bit copy */
                quant_coeff = pi2_temp_quant_coeff[x_pos + (y_pos * trans_size)];

                /* storing u2_abs_coeff_remaining[i] 2 bytes */
                *pu2_out_data_coeff = (UWORD16)(abs(quant_coeff) - 1);
                pu2_out_data_coeff++;
            }
        }
    }

    /* Distance between the write pointer and the start of the buffer */
    num_bytes = (WORD32)((UWORD8 *)pu2_out_data_coeff - pu1_out_data);
    return num_bytes; /* Return the number of bytes written to out_data */
}
497