1 /******************************************************************************
2 *
3 * Copyright (C) 2018 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * ihevce_itrans_recon_neon.c
24 *
25 * @brief
26 * Contains functions that add a DC-only inverse transform residue to the prediction and write the reconstruction to the destination buffer
27 *
28 * @author
29 * Ittiam
30 *
31 * @par List of Functions:
32 * - ihevce_itrans_recon_dc_neon
33 *
34 * @remarks
35 * None
36 *
37 ********************************************************************************
38 */
39
40 /*****************************************************************************/
41 /* File Includes */
42 /*****************************************************************************/
43 /* System include files */
44 #include <string.h>
45 #include <arm_neon.h>
46
47 /* User include files */
48 #include "ihevc_typedefs.h"
49 #include "itt_video_api.h"
50 #include "ihevc_cmn_utils_neon.h"
51 #include "ihevce_cmn_utils_instr_set_router.h"
52 #include "ihevc_defs.h"
53 #include "ihevc_macros.h"
54
55 /*****************************************************************************/
56 /* Function Definitions */
57 /*****************************************************************************/
ihevce_itrans_recon_dc_4x4_luma_neon(UWORD8 * pu1_pred,WORD32 pred_strd,UWORD8 * pu1_dst,WORD32 dst_strd,WORD32 dc_value)58 static INLINE void ihevce_itrans_recon_dc_4x4_luma_neon(
59 UWORD8 *pu1_pred, WORD32 pred_strd, UWORD8 *pu1_dst, WORD32 dst_strd, WORD32 dc_value)
60 {
61 uint8x16_t src_u8;
62 int16x8_t a0, a1, a2;
63 uint8x8_t a3, a4;
64
65 src_u8 = load_unaligned_u8q(pu1_pred, pred_strd);
66 a0 = vdupq_n_s16(dc_value);
67 a1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src_u8)));
68 a2 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src_u8)));
69 a1 = vaddq_s16(a1, a0);
70 a2 = vaddq_s16(a2, a0);
71 a3 = vqmovun_s16(a1);
72 a4 = vqmovun_s16(a2);
73 uint32x2_t p0 = vreinterpret_u32_u8(a3);
74 uint32x2_t p1 = vreinterpret_u32_u8(a4);
75 *(UWORD32 *)(pu1_dst) = vget_lane_u32(p0, 0);
76 *(UWORD32 *)(pu1_dst + dst_strd) = vget_lane_u32(p0, 1);
77 *(UWORD32 *)(pu1_dst + 2 * dst_strd) = vget_lane_u32(p1, 0);
78 *(UWORD32 *)(pu1_dst + 3 * dst_strd) = vget_lane_u32(p1, 1);
79 }
80
ihevce_itrans_recon_dc_4x4_chroma_neon(UWORD8 * pu1_pred,WORD32 pred_strd,UWORD8 * pu1_dst,WORD32 dst_strd,WORD32 trans_size,WORD32 dc_value,CHROMA_PLANE_ID_T e_chroma_plane)81 static INLINE void ihevce_itrans_recon_dc_4x4_chroma_neon(
82 UWORD8 *pu1_pred,
83 WORD32 pred_strd,
84 UWORD8 *pu1_dst,
85 WORD32 dst_strd,
86 WORD32 trans_size,
87 WORD32 dc_value,
88 CHROMA_PLANE_ID_T e_chroma_plane)
89 {
90 WORD32 i;
91 int16x8_t a0, a1;
92 uint8x8_t a2, a3;
93 uint16x4_t select = vdup_n_u16(0xff << (e_chroma_plane << 3));
94
95 a0 = vdupq_n_s16(dc_value);
96 for(i = 0; i < trans_size; i++)
97 {
98 a1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(pu1_pred + i * pred_strd)));
99 a2 = vqmovun_s16(vaddq_s16(a0, a1));
100 a3 = vld1_u8(pu1_dst + i * dst_strd);
101 a3 = vbsl_u8(vreinterpret_u8_u16(select), a2, a3);
102 vst1_u8(pu1_dst + i * dst_strd, a3);
103 }
104 }
105
ihevce_itrans_recon_dc_8x8_luma_neon(UWORD8 * pu1_pred,WORD32 pred_strd,UWORD8 * pu1_dst,WORD32 dst_strd,WORD32 trans_size,WORD32 dc_value)106 static INLINE void ihevce_itrans_recon_dc_8x8_luma_neon(
107 UWORD8 *pu1_pred,
108 WORD32 pred_strd,
109 UWORD8 *pu1_dst,
110 WORD32 dst_strd,
111 WORD32 trans_size,
112 WORD32 dc_value)
113 {
114 WORD32 i;
115 uint8x16_t a1, a4, a5;
116 uint8x8_t a0, a2, a3;
117
118 a0 = (dc_value >= 0) ? vqmovun_s16(vdupq_n_s16(dc_value))
119 : vqmovun_s16(vabsq_s16(vdupq_n_s16(dc_value)));
120 a1 = vcombine_u8(a0, a0);
121 for(i = 0; i < trans_size; i += 2)
122 {
123 a2 = vld1_u8(pu1_pred + i * pred_strd);
124 a3 = vld1_u8(pu1_pred + (i + 1) * pred_strd);
125 a4 = vcombine_u8(a2, a3);
126 a5 = (dc_value >= 0) ? vqaddq_u8(a1, a4) : vqsubq_u8(a4, a1);
127 vst1_u8(pu1_dst + i * dst_strd, vget_low_u8(a5));
128 vst1_u8(pu1_dst + (i + 1) * dst_strd, vget_high_u8(a5));
129 }
130 }
131
ihevce_itrans_recon_dc_8x8_chroma_neon(UWORD8 * pu1_pred,WORD32 pred_strd,UWORD8 * pu1_dst,WORD32 dst_strd,WORD32 trans_size,WORD32 dc_value,CHROMA_PLANE_ID_T e_chroma_plane)132 static INLINE void ihevce_itrans_recon_dc_8x8_chroma_neon(
133 UWORD8 *pu1_pred,
134 WORD32 pred_strd,
135 UWORD8 *pu1_dst,
136 WORD32 dst_strd,
137 WORD32 trans_size,
138 WORD32 dc_value,
139 CHROMA_PLANE_ID_T e_chroma_plane)
140 {
141 WORD32 i;
142 uint8x16_t a4, a0, a5;
143 uint8x8_t a1, a2, a3;
144 uint8x16x2_t a6;
145
146 a1 = (dc_value >= 0) ? vqmovun_s16(vdupq_n_s16(dc_value))
147 : vqmovun_s16(vabsq_s16(vdupq_n_s16(dc_value)));
148 a0 = vcombine_u8(a1, a1);
149 for(i = 0; i < trans_size; i += 2)
150 {
151 a2 = vld2_u8(pu1_pred + i * pred_strd).val[e_chroma_plane];
152 a3 = vld2_u8(pu1_pred + (i + 1) * pred_strd).val[e_chroma_plane];
153 a4 = vcombine_u8(a2, a3);
154 a4 = (dc_value >= 0) ? vqaddq_u8(a0, a4) : vqsubq_u8(a4, a0);
155 a2 = vld2_u8(pu1_dst + i * dst_strd).val[!e_chroma_plane];
156 a3 = vld2_u8(pu1_dst + (i + 1) * dst_strd).val[!e_chroma_plane];
157 a5 = vcombine_u8(a2, a3);
158 a6 = (e_chroma_plane == 0) ? vzipq_u8(a4, a5) : vzipq_u8(a5, a4);
159 vst1q_u8(pu1_dst + i * dst_strd, a6.val[0]);
160 vst1q_u8(pu1_dst + (i + 1) * dst_strd, a6.val[1]);
161 }
162 }
163
ihevce_itrans_recon_dc_16x16_luma_neon(UWORD8 * pu1_pred,WORD32 pred_strd,UWORD8 * pu1_dst,WORD32 dst_strd,WORD32 trans_size,WORD32 dc_value)164 static INLINE void ihevce_itrans_recon_dc_16x16_luma_neon(
165 UWORD8 *pu1_pred,
166 WORD32 pred_strd,
167 UWORD8 *pu1_dst,
168 WORD32 dst_strd,
169 WORD32 trans_size,
170 WORD32 dc_value)
171 {
172 WORD32 i;
173 uint8x16_t a1, a3, a2;
174 uint8x8_t a0;
175
176 a0 = (dc_value >= 0) ? vqmovun_s16(vdupq_n_s16(dc_value))
177 : vqmovun_s16(vabsq_s16(vdupq_n_s16(dc_value)));
178 a1 = vcombine_u8(a0, a0);
179 for(i = 0; i < trans_size; i++)
180 {
181 a2 = vld1q_u8(pu1_pred + i * pred_strd);
182 a3 = (dc_value >= 0) ? vqaddq_u8(a2, a1) : vqsubq_u8(a2, a1);
183 vst1q_u8(pu1_dst + i * dst_strd, a3);
184 }
185 }
186
ihevce_itrans_recon_dc_16x16_chroma_neon(UWORD8 * pu1_pred,WORD32 pred_strd,UWORD8 * pu1_dst,WORD32 dst_strd,WORD32 trans_size,WORD32 dc_value,CHROMA_PLANE_ID_T e_chroma_plane)187 static INLINE void ihevce_itrans_recon_dc_16x16_chroma_neon(
188 UWORD8 *pu1_pred,
189 WORD32 pred_strd,
190 UWORD8 *pu1_dst,
191 WORD32 dst_strd,
192 WORD32 trans_size,
193 WORD32 dc_value,
194 CHROMA_PLANE_ID_T e_chroma_plane)
195 {
196 WORD32 i;
197 uint8x8_t a0;
198 uint8x16_t a1, a2, a3;
199 uint8x16x2_t a4;
200
201 a0 = (dc_value >= 0) ? vqmovun_s16(vdupq_n_s16(dc_value))
202 : vqmovun_s16(vabsq_s16(vdupq_n_s16(dc_value)));
203 a1 = vcombine_u8(a0, a0);
204 for(i = 0; i < trans_size; i++)
205 {
206 a2 = vld2q_u8(pu1_pred + i * pred_strd).val[e_chroma_plane];
207 a2 = (dc_value >= 0) ? vqaddq_u8(a2, a1) : vqsubq_u8(a2, a1);
208 a3 = vld2q_u8(pu1_dst + i * dst_strd).val[!e_chroma_plane];
209 a4 = (e_chroma_plane == 0) ? vzipq_u8(a2, a3) : vzipq_u8(a3, a2);
210 vst1q_u8(pu1_dst + i * dst_strd, a4.val[0]);
211 vst1q_u8(pu1_dst + i * dst_strd + 16, a4.val[1]);
212 }
213 }
214
/**
 *******************************************************************************
 *
 * @brief
 *  Derives a single DC residue from i2_deq_value and adds it to a prediction
 *  block, writing the result to the destination
 *
 * @par Description:
 *  The input value is scaled in two stages: each stage multiplies by 64, adds
 *  the rounding offset 1 << (shift - 1), shifts right by IT_SHIFT_STAGE_1 or
 *  IT_SHIFT_STAGE_2 respectively and clips with CLIP_S16. The resulting value
 *  is handed to the size specific helper. A transform size of 32 is handled
 *  as four 16x16 quadrants. Sizes other than 4, 8, 16 and 32 do nothing.
 *
 * @param[in] pu1_pred
 *  Pointer to the prediction block
 *
 * @param[in] pred_strd
 *  Stride of the prediction buffer
 *
 * @param[in,out] pu1_dst
 *  Pointer to the destination block (the chroma helpers also read it)
 *
 * @param[in] dst_strd
 *  Stride of the destination buffer
 *
 * @param[in] trans_size
 *  Transform size: 4, 8, 16 or 32
 *
 * @param[in] i2_deq_value
 *  Input value of the two-stage scaling (presumably the de-quantised DC
 *  coefficient - confirm against the caller)
 *
 * @param[in] e_chroma_plane
 *  NULL_PLANE selects the luma helpers; any other value is forwarded to the
 *  chroma helpers as the plane selector
 *
 * @returns
 *  None
 *
 *******************************************************************************
 */
void ihevce_itrans_recon_dc_neon(
    UWORD8 *pu1_pred,
    WORD32 pred_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 trans_size,
    WORD16 i2_deq_value,
    CHROMA_PLANE_ID_T e_chroma_plane)
{
    const WORD32 stage_1_shift = IT_SHIFT_STAGE_1;
    const WORD32 stage_2_shift = IT_SHIFT_STAGE_2;
    const WORD32 is_luma = (NULL_PLANE == e_chroma_plane);
    WORD32 dc_value;

    /* Two-stage scaling with rounding; the result is clipped after each stage */
    dc_value = CLIP_S16((i2_deq_value * 64 + (1 << (stage_1_shift - 1))) >> stage_1_shift);
    dc_value = CLIP_S16((dc_value * 64 + (1 << (stage_2_shift - 1))) >> stage_2_shift);

    if(4 == trans_size)
    {
        if(is_luma)
        {
            ihevce_itrans_recon_dc_4x4_luma_neon(pu1_pred, pred_strd, pu1_dst, dst_strd, dc_value);
        }
        else
        {
            ihevce_itrans_recon_dc_4x4_chroma_neon(
                pu1_pred, pred_strd, pu1_dst, dst_strd, trans_size, dc_value, e_chroma_plane);
        }
    }
    else if(8 == trans_size)
    {
        if(is_luma)
        {
            ihevce_itrans_recon_dc_8x8_luma_neon(
                pu1_pred, pred_strd, pu1_dst, dst_strd, trans_size, dc_value);
        }
        else
        {
            ihevce_itrans_recon_dc_8x8_chroma_neon(
                pu1_pred, pred_strd, pu1_dst, dst_strd, trans_size, dc_value, e_chroma_plane);
        }
    }
    else if(16 == trans_size)
    {
        if(is_luma)
        {
            ihevce_itrans_recon_dc_16x16_luma_neon(
                pu1_pred, pred_strd, pu1_dst, dst_strd, trans_size, dc_value);
        }
        else
        {
            ihevce_itrans_recon_dc_16x16_chroma_neon(
                pu1_pred, pred_strd, pu1_dst, dst_strd, trans_size, dc_value, e_chroma_plane);
        }
    }
    else if(32 == trans_size)
    {
        /* Processed as four 16x16 quadrants in raster order */
        const WORD32 quad_size = trans_size >> 1;
        WORD32 quad;

        for(quad = 0; quad < 4; quad++)
        {
            const WORD32 quad_row = quad >> 1;
            const WORD32 quad_col = quad & 1;
            UWORD8 *pu1_quad_pred = pu1_pred + (quad_row * pred_strd * 16);
            UWORD8 *pu1_quad_dst = pu1_dst + (quad_row * dst_strd * 16);

            if(is_luma)
            {
                ihevce_itrans_recon_dc_16x16_luma_neon(
                    pu1_quad_pred + (quad_col * 16),
                    pred_strd,
                    pu1_quad_dst + (quad_col * 16),
                    dst_strd,
                    quad_size,
                    dc_value);
            }
            else
            {
                /* Horizontal step is 32 bytes: 16 samples of each of the */
                /* two interleaved planes                                 */
                ihevce_itrans_recon_dc_16x16_chroma_neon(
                    pu1_quad_pred + (quad_col * 32),
                    pred_strd,
                    pu1_quad_dst + (quad_col * 32),
                    dst_strd,
                    quad_size,
                    dc_value,
                    e_chroma_plane);
            }
        }
    }
}
309