xref: /aosp_15_r20/external/libhevc/encoder/arm/ihevce_itrans_recon_neon.c (revision c83a76b084498d55f252f48b2e3786804cdf24b7)
1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 *  ihevce_itrans_recon_neon.c
24 *
25 * @brief
26 *  Contains functions to inverse transform and adds residue to pred buffer
27 *
28 * @author
29 *  Ittiam
30 *
31 * @par List of Functions:
32 *  - ihevce_itrans_recon_dc_neon
33 *
34 * @remarks
35 *  None
36 *
37 ********************************************************************************
38 */
39 
40 /*****************************************************************************/
41 /* File Includes                                                             */
42 /*****************************************************************************/
43 /* System include files */
44 #include <string.h>
45 #include <arm_neon.h>
46 
47 /* User include files */
48 #include "ihevc_typedefs.h"
49 #include "itt_video_api.h"
50 #include "ihevc_cmn_utils_neon.h"
51 #include "ihevce_cmn_utils_instr_set_router.h"
52 #include "ihevc_defs.h"
53 #include "ihevc_macros.h"
54 
55 /*****************************************************************************/
56 /* Function Definitions                                                      */
57 /*****************************************************************************/
/**
*******************************************************************************
*
* @brief
*  Adds a DC residue to a 4x4 luma prediction block and writes the saturated
*  8-bit result to the destination block
*
* @param[in] pu1_pred
*  Pointer to the prediction block
*
* @param[in] pred_strd
*  Stride of the prediction buffer (passed on to load_unaligned_u8q)
*
* @param[out] pu1_dst
*  Pointer to the destination block; 4 bytes are written at each of the
*  offsets 0, dst_strd, 2 * dst_strd and 3 * dst_strd
*
* @param[in] dst_strd
*  Stride of the destination buffer
*
* @param[in] dc_value
*  DC residue added to every sample (narrowed to 16 bit by vdupq_n_s16)
*
* @remarks
*  The rows are stored with memcpy rather than through a UWORD32 pointer cast:
*  pu1_dst is a byte buffer with no 4-byte alignment guarantee, and writing it
*  through a UWORD32 lvalue violates the effective type rules.
*
*******************************************************************************
*/
static INLINE void ihevce_itrans_recon_dc_4x4_luma_neon(
    UWORD8 *pu1_pred, WORD32 pred_strd, UWORD8 *pu1_dst, WORD32 dst_strd, WORD32 dc_value)
{
    const int16x8_t dc = vdupq_n_s16(dc_value);
    /* presumably gathers the four 4-byte rows of the block into one vector */
    /* TODO confirm against ihevc_cmn_utils_neon.h                          */
    const uint8x16_t pred = load_unaligned_u8q(pu1_pred, pred_strd);
    int16x8_t sum_lo, sum_hi;
    uint32x2_t rows_01, rows_23;
    uint32_t row;

    /* Widen to 16 bit so that a negative dc_value can be added */
    sum_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred)));
    sum_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred)));
    sum_lo = vaddq_s16(sum_lo, dc);
    sum_hi = vaddq_s16(sum_hi, dc);

    /* Saturate back to [0, 255]; each 32-bit lane then holds one output row */
    rows_01 = vreinterpret_u32_u8(vqmovun_s16(sum_lo));
    rows_23 = vreinterpret_u32_u8(vqmovun_s16(sum_hi));

    row = vget_lane_u32(rows_01, 0);
    memcpy(pu1_dst, &row, sizeof(row));
    row = vget_lane_u32(rows_01, 1);
    memcpy(pu1_dst + dst_strd, &row, sizeof(row));
    row = vget_lane_u32(rows_23, 0);
    memcpy(pu1_dst + 2 * dst_strd, &row, sizeof(row));
    row = vget_lane_u32(rows_23, 1);
    memcpy(pu1_dst + 3 * dst_strd, &row, sizeof(row));
}

80 
/**
*******************************************************************************
*
* @brief
*  Adds a DC residue to one component of a chroma prediction block, 8 bytes
*  per row, and merges the result into the destination
*
* @par Description:
*  For every row, 8 bytes of prediction are widened, offset by dc_value and
*  saturated to 8 bit. Only the bytes picked by the plane mask are replaced in
*  the 8 bytes already present at the destination; the other bytes are
*  written back unchanged.
*
* @param[in] pu1_pred
*  Pointer to the prediction block
*
* @param[in] pred_strd
*  Stride of the prediction buffer
*
* @param[in,out] pu1_dst
*  Pointer to the destination block (read and written)
*
* @param[in] dst_strd
*  Stride of the destination buffer
*
* @param[in] trans_size
*  Number of rows processed
*
* @param[in] dc_value
*  DC residue added to every selected sample
*
* @param[in] e_chroma_plane
*  Plane selector; used as a shift count (plane << 3) to build the byte mask
*
*******************************************************************************
*/
static INLINE void ihevce_itrans_recon_dc_4x4_chroma_neon(
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 trans_size,
    WORD32 dc_value,
    CHROMA_PLANE_ID_T e_chroma_plane)
{
    /* 16-bit pattern is 0x00ff for plane value 0 and 0xff00 for plane value 1 */
    const uint16x4_t plane_mask_u16 = vdup_n_u16(0xff << (e_chroma_plane << 3));
    const uint8x8_t plane_mask = vreinterpret_u8_u16(plane_mask_u16);
    const int16x8_t dc = vdupq_n_s16(dc_value);
    WORD32 row;

    for(row = 0; row < trans_size; row++)
    {
        UWORD8 *pu1_pred_row = pu1_pred + row * pred_strd;
        UWORD8 *pu1_dst_row = pu1_dst + row * dst_strd;
        int16x8_t pred_s16;
        uint8x8_t recon, dst_row;

        pred_s16 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_pred_row)));
        recon = vqmovun_s16(vaddq_s16(dc, pred_s16));

        /* Keep the destination bytes that are not selected by the mask */
        dst_row = vld1_u8(pu1_dst_row);
        dst_row = vbsl_u8(plane_mask, recon, dst_row);
        vst1_u8(pu1_dst_row, dst_row);
    }
}

105 
/**
*******************************************************************************
*
* @brief
*  Adds a DC residue to an 8-sample wide luma prediction block, two rows per
*  iteration, using saturating 8-bit arithmetic
*
* @par Description:
*  The magnitude of dc_value is saturated to 8 bit once. It is then added to
*  (dc_value >= 0) or subtracted from (dc_value < 0) the prediction with
*  unsigned saturation.
*
* @param[in] pu1_pred
*  Pointer to the prediction block
*
* @param[in] pred_strd
*  Stride of the prediction buffer
*
* @param[out] pu1_dst
*  Pointer to the destination block
*
* @param[in] dst_strd
*  Stride of the destination buffer
*
* @param[in] trans_size
*  Number of rows processed (rows are consumed in pairs)
*
* @param[in] dc_value
*  DC residue applied to every sample
*
* @remarks
*  NOTE(review): vabsq_s16 does not saturate, so dc_value == -32768 would
*  yield a magnitude of 0. The caller visible in this file scales the value
*  down before calling - confirm no other caller can pass that value.
*
*******************************************************************************
*/
static INLINE void ihevce_itrans_recon_dc_8x8_luma_neon(
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 trans_size,
    WORD32 dc_value)
{
    const WORD32 is_positive = (dc_value >= 0);
    uint8x8_t dc_mag_u8;
    uint8x16_t dc_mag;
    WORD32 row;

    if(is_positive)
    {
        dc_mag_u8 = vqmovun_s16(vdupq_n_s16(dc_value));
    }
    else
    {
        dc_mag_u8 = vqmovun_s16(vabsq_s16(vdupq_n_s16(dc_value)));
    }
    dc_mag = vcombine_u8(dc_mag_u8, dc_mag_u8);

    for(row = 0; row < trans_size; row += 2)
    {
        /* Two 8-byte rows are packed into one 128-bit vector */
        const uint8x8_t pred_row0 = vld1_u8(pu1_pred + row * pred_strd);
        const uint8x8_t pred_row1 = vld1_u8(pu1_pred + (row + 1) * pred_strd);
        const uint8x16_t pred = vcombine_u8(pred_row0, pred_row1);
        uint8x16_t recon;

        if(is_positive)
        {
            recon = vqaddq_u8(dc_mag, pred);
        }
        else
        {
            recon = vqsubq_u8(pred, dc_mag);
        }

        vst1_u8(pu1_dst + row * dst_strd, vget_low_u8(recon));
        vst1_u8(pu1_dst + (row + 1) * dst_strd, vget_high_u8(recon));
    }
}

131 
/**
*******************************************************************************
*
* @brief
*  Adds a DC residue to one component of an interleaved chroma block, 16
*  bytes per row and two rows per iteration
*
* @par Description:
*  vld2_u8 de-interleaves 16 bytes into two 8-byte vectors. The vector
*  indexed by e_chroma_plane is taken from the prediction and offset by the
*  DC magnitude with unsigned saturation. The other vector is read from the
*  destination, and the two are zipped back together and stored, so the
*  untouched component keeps the value it had in the destination.
*
* @param[in] pu1_pred
*  Pointer to the prediction block
*
* @param[in] pred_strd
*  Stride of the prediction buffer
*
* @param[in,out] pu1_dst
*  Pointer to the destination block (read and written)
*
* @param[in] dst_strd
*  Stride of the destination buffer
*
* @param[in] trans_size
*  Number of rows processed (rows are consumed in pairs)
*
* @param[in] dc_value
*  DC residue applied to every sample of the selected component
*
* @param[in] e_chroma_plane
*  Index (0 or 1) of the de-interleaved component to reconstruct
*
*******************************************************************************
*/
static INLINE void ihevce_itrans_recon_dc_8x8_chroma_neon(
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 trans_size,
    WORD32 dc_value,
    CHROMA_PLANE_ID_T e_chroma_plane)
{
    const WORD32 is_positive = (dc_value >= 0);
    uint8x8_t dc_mag_u8;
    uint8x16_t dc_mag;
    WORD32 row;

    if(is_positive)
    {
        dc_mag_u8 = vqmovun_s16(vdupq_n_s16(dc_value));
    }
    else
    {
        dc_mag_u8 = vqmovun_s16(vabsq_s16(vdupq_n_s16(dc_value)));
    }
    dc_mag = vcombine_u8(dc_mag_u8, dc_mag_u8);

    for(row = 0; row < trans_size; row += 2)
    {
        UWORD8 *pu1_dst_row0 = pu1_dst + row * dst_strd;
        UWORD8 *pu1_dst_row1 = pu1_dst + (row + 1) * dst_strd;
        uint8x8_t half0, half1;
        uint8x16_t recon, other;
        uint8x16x2_t interleaved;

        /* Selected component of two prediction rows, offset by the DC */
        half0 = vld2_u8(pu1_pred + row * pred_strd).val[e_chroma_plane];
        half1 = vld2_u8(pu1_pred + (row + 1) * pred_strd).val[e_chroma_plane];
        recon = vcombine_u8(half0, half1);
        if(is_positive)
        {
            recon = vqaddq_u8(dc_mag, recon);
        }
        else
        {
            recon = vqsubq_u8(recon, dc_mag);
        }

        /* Opposite component as currently stored in the destination */
        half0 = vld2_u8(pu1_dst_row0).val[!e_chroma_plane];
        half1 = vld2_u8(pu1_dst_row1).val[!e_chroma_plane];
        other = vcombine_u8(half0, half1);

        /* Re-interleave with the component at index 0 first */
        if(e_chroma_plane == 0)
        {
            interleaved = vzipq_u8(recon, other);
        }
        else
        {
            interleaved = vzipq_u8(other, recon);
        }

        vst1q_u8(pu1_dst_row0, interleaved.val[0]);
        vst1q_u8(pu1_dst_row1, interleaved.val[1]);
    }
}

163 
/**
*******************************************************************************
*
* @brief
*  Adds a DC residue to a 16-sample wide luma prediction block, one row per
*  iteration, using saturating 8-bit arithmetic
*
* @param[in] pu1_pred
*  Pointer to the prediction block
*
* @param[in] pred_strd
*  Stride of the prediction buffer
*
* @param[out] pu1_dst
*  Pointer to the destination block; 16 bytes are written per row
*
* @param[in] dst_strd
*  Stride of the destination buffer
*
* @param[in] trans_size
*  Number of rows processed
*
* @param[in] dc_value
*  DC residue; its 8-bit saturated magnitude is added when dc_value >= 0 and
*  subtracted otherwise
*
*******************************************************************************
*/
static INLINE void ihevce_itrans_recon_dc_16x16_luma_neon(
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 trans_size,
    WORD32 dc_value)
{
    uint8x8_t dc_mag_u8;
    uint8x16_t dc_mag;
    WORD32 row;

    if(dc_value >= 0)
    {
        dc_mag_u8 = vqmovun_s16(vdupq_n_s16(dc_value));
        dc_mag = vcombine_u8(dc_mag_u8, dc_mag_u8);

        for(row = 0; row < trans_size; row++)
        {
            const uint8x16_t pred = vld1q_u8(pu1_pred + row * pred_strd);

            vst1q_u8(pu1_dst + row * dst_strd, vqaddq_u8(pred, dc_mag));
        }
    }
    else
    {
        dc_mag_u8 = vqmovun_s16(vabsq_s16(vdupq_n_s16(dc_value)));
        dc_mag = vcombine_u8(dc_mag_u8, dc_mag_u8);

        for(row = 0; row < trans_size; row++)
        {
            const uint8x16_t pred = vld1q_u8(pu1_pred + row * pred_strd);

            vst1q_u8(pu1_dst + row * dst_strd, vqsubq_u8(pred, dc_mag));
        }
    }
}

186 
/**
*******************************************************************************
*
* @brief
*  Adds a DC residue to one component of an interleaved chroma block, 32
*  bytes per row and one row per iteration
*
* @par Description:
*  vld2q_u8 de-interleaves 32 bytes into two 16-byte vectors. The vector
*  indexed by e_chroma_plane is taken from the prediction and offset by the
*  DC magnitude with unsigned saturation. The other vector is read from the
*  destination, and the two are zipped back together and stored as two
*  16-byte halves.
*
* @param[in] pu1_pred
*  Pointer to the prediction block
*
* @param[in] pred_strd
*  Stride of the prediction buffer
*
* @param[in,out] pu1_dst
*  Pointer to the destination block (read and written)
*
* @param[in] dst_strd
*  Stride of the destination buffer
*
* @param[in] trans_size
*  Number of rows processed
*
* @param[in] dc_value
*  DC residue applied to every sample of the selected component
*
* @param[in] e_chroma_plane
*  Index (0 or 1) of the de-interleaved component to reconstruct
*
*******************************************************************************
*/
static INLINE void ihevce_itrans_recon_dc_16x16_chroma_neon(
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 trans_size,
    WORD32 dc_value,
    CHROMA_PLANE_ID_T e_chroma_plane)
{
    const WORD32 is_positive = (dc_value >= 0);
    uint8x8_t dc_mag_u8;
    uint8x16_t dc_mag;
    WORD32 row;

    if(is_positive)
    {
        dc_mag_u8 = vqmovun_s16(vdupq_n_s16(dc_value));
    }
    else
    {
        dc_mag_u8 = vqmovun_s16(vabsq_s16(vdupq_n_s16(dc_value)));
    }
    dc_mag = vcombine_u8(dc_mag_u8, dc_mag_u8);

    for(row = 0; row < trans_size; row++)
    {
        UWORD8 *pu1_dst_row = pu1_dst + row * dst_strd;
        uint8x16_t recon, other;
        uint8x16x2_t interleaved;

        /* Selected component of the prediction row, offset by the DC */
        recon = vld2q_u8(pu1_pred + row * pred_strd).val[e_chroma_plane];
        if(is_positive)
        {
            recon = vqaddq_u8(recon, dc_mag);
        }
        else
        {
            recon = vqsubq_u8(recon, dc_mag);
        }

        /* Opposite component as currently stored in the destination */
        other = vld2q_u8(pu1_dst_row).val[!e_chroma_plane];

        /* Re-interleave with the component at index 0 first */
        if(e_chroma_plane == 0)
        {
            interleaved = vzipq_u8(recon, other);
        }
        else
        {
            interleaved = vzipq_u8(other, recon);
        }

        vst1q_u8(pu1_dst_row, interleaved.val[0]);
        vst1q_u8(pu1_dst_row + 16, interleaved.val[1]);
    }
}

214 
/**
*******************************************************************************
*
* @brief
*  DC-only inverse transform and reconstruction: derives a single residue
*  value from the dequantised DC coefficient and adds it to the prediction
*
* @par Description:
*  The coefficient is scaled by 64, rounded and shifted right twice (by
*  IT_SHIFT_STAGE_1, then by IT_SHIFT_STAGE_2), with a clip to 16 bit after
*  each stage. The resulting value is handed to the size-specific luma or
*  chroma helper. A size of 32 is handled as four 16-row quadrants. Sizes
*  other than 4, 8, 16 and 32 are ignored.
*
* @param[in] pu1_pred
*  Pointer to the prediction block
*
* @param[in] pred_strd
*  Stride of the prediction buffer
*
* @param[in,out] pu1_dst
*  Pointer to the destination block
*
* @param[in] dst_strd
*  Stride of the destination buffer
*
* @param[in] trans_size
*  Transform size (4, 8, 16 or 32)
*
* @param[in] i2_deq_value
*  Dequantised DC coefficient
*
* @param[in] e_chroma_plane
*  NULL_PLANE selects the luma helpers; any other value is forwarded to the
*  chroma helpers as the component selector
*
* @returns
*  None
*
*******************************************************************************
*/
void ihevce_itrans_recon_dc_neon(
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 trans_size,
    WORD16 i2_deq_value,
    CHROMA_PLANE_ID_T e_chroma_plane)
{
    const WORD32 is_luma = (NULL_PLANE == e_chroma_plane);
    WORD32 stage_shift, rnd;
    WORD32 dc_value;

    /* First stage of the DC-only inverse transform */
    stage_shift = IT_SHIFT_STAGE_1;
    rnd = 1 << (stage_shift - 1);
    dc_value = CLIP_S16((i2_deq_value * 64 + rnd) >> stage_shift);

    /* Second stage */
    stage_shift = IT_SHIFT_STAGE_2;
    rnd = 1 << (stage_shift - 1);
    dc_value = CLIP_S16((dc_value * 64 + rnd) >> stage_shift);

    switch(trans_size)
    {
    case 4:
        if(is_luma)
        {
            ihevce_itrans_recon_dc_4x4_luma_neon(pu1_pred, pred_strd, pu1_dst, dst_strd, dc_value);
        }
        else
        {
            ihevce_itrans_recon_dc_4x4_chroma_neon(
                pu1_pred, pred_strd, pu1_dst, dst_strd, trans_size, dc_value, e_chroma_plane);
        }
        break;

    case 8:
        if(is_luma)
        {
            ihevce_itrans_recon_dc_8x8_luma_neon(
                pu1_pred, pred_strd, pu1_dst, dst_strd, trans_size, dc_value);
        }
        else
        {
            ihevce_itrans_recon_dc_8x8_chroma_neon(
                pu1_pred, pred_strd, pu1_dst, dst_strd, trans_size, dc_value, e_chroma_plane);
        }
        break;

    case 16:
        if(is_luma)
        {
            ihevce_itrans_recon_dc_16x16_luma_neon(
                pu1_pred, pred_strd, pu1_dst, dst_strd, trans_size, dc_value);
        }
        else
        {
            ihevce_itrans_recon_dc_16x16_chroma_neon(
                pu1_pred, pred_strd, pu1_dst, dst_strd, trans_size, dc_value, e_chroma_plane);
        }
        break;

    case 32:
    {
        /* Four quadrants of 16 rows each; the second column of quadrants   */
        /* starts 16 bytes in for luma and 32 bytes in for chroma           */
        const WORD32 quad_rows = trans_size >> 1;
        const WORD32 col_bytes = is_luma ? 16 : 32;
        WORD32 quad;

        for(quad = 0; quad < 4; quad++)
        {
            const WORD32 quad_row = quad >> 1;
            const WORD32 quad_col = quad & 1;
            UWORD8 *pu1_pred_quad = pu1_pred + (quad_row * pred_strd * 16) + (quad_col * col_bytes);
            UWORD8 *pu1_dst_quad = pu1_dst + (quad_row * dst_strd * 16) + (quad_col * col_bytes);

            if(is_luma)
            {
                ihevce_itrans_recon_dc_16x16_luma_neon(
                    pu1_pred_quad, pred_strd, pu1_dst_quad, dst_strd, quad_rows, dc_value);
            }
            else
            {
                ihevce_itrans_recon_dc_16x16_chroma_neon(
                    pu1_pred_quad,
                    pred_strd,
                    pu1_dst_quad,
                    dst_strd,
                    quad_rows,
                    dc_value,
                    e_chroma_plane);
            }
        }
        break;
    }

    default:
        /* Unsupported sizes leave the destination untouched */
        break;
    }
}
309