1 /******************************************************************************
2 *
3 * Copyright (C) 2018 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * ihevce_copy_neon.c
24 *
25 * @brief
26 * Contains intrinsic definitions of functions for block copy
27 *
28 * @author
29 * ittiam
30 *
31 * @par List of Functions:
32 * - ihevce_2d_square_copy_luma_neon()
33 * - ihevce_copy_2d_neon()
34 * - ihevce_chroma_interleave_2d_copy_neon()
35 *
36 * @remarks
37 * None
38 *
39 *******************************************************************************
40 */
41
42 /*****************************************************************************/
43 /* File Includes */
44 /*****************************************************************************/
45 /* System include files */
46 #include <string.h>
47 #include <assert.h>
48 #include <arm_neon.h>
49
50 /* User include files */
51 #include "ihevc_typedefs.h"
52 #include "itt_video_api.h"
53 #include "ihevc_platform_macros.h"
54
55 #include "ihevce_cmn_utils_instr_set_router.h"
56
57 /*****************************************************************************/
58 /* Function Definitions */
59 /*****************************************************************************/
60
/**
 * Copies one chroma plane from an interleaved source block into an
 * interleaved destination block, leaving the other plane of the destination
 * untouched.
 *
 * Each row is treated as `w` 16-bit pairs (2 * w bytes). A per-pair byte mask
 * of 0xff << (e_chroma_plane * 8) decides which byte of every pair is taken
 * from the source; the remaining byte is kept from the destination.
 * NOTE(review): presumably U_PLANE == 0 selects the first byte of each pair
 * and V_PLANE == 1 the second - confirm against the enum definition.
 *
 * @param pu1_uv_src      source block (interleaved samples)
 * @param src_strd        source stride in bytes
 * @param pu1_uv_dst      destination block (interleaved samples), read and
 *                        written
 * @param dst_strd        destination stride in bytes
 * @param w               block width in pairs; must be 4 or a multiple of 8.
 *                        Also used as the number of rows.
 * @param h               block height; only checked (assert) to equal w
 * @param e_chroma_plane  U_PLANE or V_PLANE
 */
void ihevce_chroma_interleave_2d_copy_neon(
    UWORD8 *pu1_uv_src,
    WORD32 src_strd,
    UWORD8 *pu1_uv_dst,
    WORD32 dst_strd,
    WORD32 w,
    WORD32 h,
    CHROMA_PLANE_ID_T e_chroma_plane)
{
    WORD32 row;

    /* h is only consumed by the assert; keep release builds warning free */
    (void)h;
    assert(w == h);
    assert((e_chroma_plane == U_PLANE) || (e_chroma_plane == V_PLANE));

    if(w == 4)
    {
        /* 4 pairs per row fit exactly in one 64-bit vector */
        const uint8x8_t plane_mask =
            vreinterpret_u8_u16(vdup_n_u16(0xff << (e_chroma_plane << 3)));

        for(row = 0; row < w; row++)
        {
            const uint8x8_t src_vec = vld1_u8(pu1_uv_src);
            const uint8x8_t dst_vec = vld1_u8(pu1_uv_dst);

            /* masked bytes come from the source, the rest stay as they were */
            vst1_u8(pu1_uv_dst, vbsl_u8(plane_mask, src_vec, dst_vec));

            pu1_uv_src += src_strd;
            pu1_uv_dst += dst_strd;
        }
        return;
    }

    {
        const uint8x16_t plane_mask =
            vreinterpretq_u8_u16(vdupq_n_u16(0xff << (e_chroma_plane << 3)));
        WORD32 col;

        assert(w % 8 == 0);
        for(row = 0; row < w; row++)
        {
            UWORD8 *dst_row = pu1_uv_dst + row * dst_strd;
            UWORD8 *src_row = pu1_uv_src + row * src_strd;

            /* 8 pairs (16 bytes) per iteration */
            for(col = 0; col < w; col += 8)
            {
                UWORD8 *dst_ptr = dst_row + (col * 2);
                UWORD8 *src_ptr = src_row + (col * 2);
                const uint8x16_t src_vec = vld1q_u8(src_ptr);
                const uint8x16_t dst_vec = vld1q_u8(dst_ptr);

                vst1q_u8(dst_ptr, vbslq_u8(plane_mask, src_vec, dst_vec));
            }
        }
    }
}
117
/**
 * Copies a blk_wd x blk_ht block of bytes from pu1_src to pu1_dst, one row
 * at a time.
 *
 * Supported widths are 4, 8, 16, 32 and any multiple of 64 (asserted). Any
 * other width copies nothing when asserts are compiled out.
 *
 * @param pu1_dst   destination block
 * @param dst_strd  destination stride in bytes
 * @param pu1_src   source block
 * @param src_strd  source stride in bytes
 * @param blk_wd    block width in bytes
 * @param blk_ht    number of rows; values <= 0 copy nothing
 */
static void copy_2d_neon(
    UWORD8 *pu1_dst, WORD32 dst_strd, UWORD8 *pu1_src, WORD32 src_strd, WORD32 blk_wd, WORD32 blk_ht)
{
    WORD32 row;

    assert(blk_wd == 4 || blk_wd == 8 || blk_wd == 16 || blk_wd == 32 || (blk_wd % 64 == 0));

    switch(blk_wd)
    {
    case 4:
        /* narrower than the smallest vector used here: plain 4-byte copy */
        for(row = 0; row < blk_ht; row++)
        {
            memcpy(pu1_dst, pu1_src, 4);
            pu1_dst += dst_strd;
            pu1_src += src_strd;
        }
        break;

    case 8:
        for(row = 0; row < blk_ht; row++)
        {
            vst1_u8(pu1_dst, vld1_u8(pu1_src));
            pu1_dst += dst_strd;
            pu1_src += src_strd;
        }
        break;

    case 16:
        for(row = 0; row < blk_ht; row++)
        {
            vst1q_u8(pu1_dst, vld1q_u8(pu1_src));
            pu1_dst += dst_strd;
            pu1_src += src_strd;
        }
        break;

    case 32:
        for(row = 0; row < blk_ht; row++)
        {
            vst1q_u8(pu1_dst, vld1q_u8(pu1_src));
            vst1q_u8(pu1_dst + 16, vld1q_u8(pu1_src + 16));
            pu1_dst += dst_strd;
            pu1_src += src_strd;
        }
        break;

    default:
        if(blk_wd % 64 == 0)
        {
            WORD32 col;

            for(row = 0; row < blk_ht; row++)
            {
                /* width is a multiple of 64, hence also of the 16-byte step */
                for(col = 0; col < blk_wd; col += 16)
                {
                    vst1q_u8(pu1_dst + col, vld1q_u8(pu1_src + col));
                }
                pu1_dst += dst_strd;
                pu1_src += src_strd;
            }
        }
        break;
    }
}
196
/**
 * Copies a square block of num_cols_to_copy x num_cols_to_copy elements.
 *
 * Both strides and the row width are scaled by unit_size before being handed
 * to copy_2d_neon(), so the resulting width (num_cols_to_copy * unit_size)
 * has to be one of the widths that helper accepts: 4, 8, 16, 32 or a
 * multiple of 64.
 * NOTE(review): unit_size looks like the element size in bytes - confirm
 * with the callers.
 *
 * @param p_dst             destination block
 * @param dst_strd          destination stride, in elements
 * @param p_src             source block
 * @param src_strd          source stride, in elements
 * @param num_cols_to_copy  number of elements per row, also the row count
 * @param unit_size         scale factor applied to strides and width
 */
void ihevce_2d_square_copy_luma_neon(
    void *p_dst,
    WORD32 dst_strd,
    void *p_src,
    WORD32 src_strd,
    WORD32 num_cols_to_copy,
    WORD32 unit_size)
{
    const WORD32 dst_step = dst_strd * unit_size;
    const WORD32 src_step = src_strd * unit_size;
    const WORD32 row_width = num_cols_to_copy * unit_size;

    copy_2d_neon(
        (UWORD8 *)p_dst, dst_step, (UWORD8 *)p_src, src_step, row_width, num_cols_to_copy);
}
216
/**
 * Copies a blk_wd x blk_ht block of bytes of arbitrary width.
 *
 * The block is cut into vertical strips, left to right, each as wide as the
 * largest kernel that still fits (64, 32, 16, 8 or 4 bytes); every strip is
 * copied over the full height by copy_2d_neon(). A remainder narrower than
 * 4 bytes is forwarded to ihevce_copy_2d(), which is declared outside this
 * file.
 *
 * copy_2d_neon() works row by row and therefore handles any height, so odd
 * heights need no separate treatment here.
 *
 * @param pu1_dst   destination block
 * @param dst_strd  destination stride in bytes
 * @param pu1_src   source block
 * @param src_strd  source stride in bytes
 * @param blk_wd    block width in bytes; 0 copies nothing
 * @param blk_ht    number of rows
 */
void ihevce_copy_2d_neon(
    UWORD8 *pu1_dst, WORD32 dst_strd, UWORD8 *pu1_src, WORD32 src_strd, WORD32 blk_wd, WORD32 blk_ht)
{
    while(blk_wd >= 4)
    {
        WORD32 strip_wd;

        /* ">=" so that exact 64- and 32-byte widths use the matching kernel */
        if(blk_wd >= 64)
            strip_wd = 64;
        else if(blk_wd >= 32)
            strip_wd = 32;
        else if(blk_wd >= 16)
            strip_wd = 16;
        else if(blk_wd >= 8)
            strip_wd = 8;
        else
            strip_wd = 4;

        copy_2d_neon(pu1_dst, dst_strd, pu1_src, src_strd, strip_wd, blk_ht);

        pu1_dst += strip_wd;
        pu1_src += strip_wd;
        blk_wd -= strip_wd;
    }

    /* leftover columns that no NEON kernel covers */
    if(blk_wd != 0)
    {
        ihevce_copy_2d(pu1_dst, dst_strd, pu1_src, src_strd, blk_wd, blk_ht);
    }
}
284