xref: /aosp_15_r20/external/libhevc/encoder/arm/ihevce_copy_neon.c (revision c83a76b084498d55f252f48b2e3786804cdf24b7)
1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 *  ihevce_copy_neon.c
24 *
25 * @brief
26 *  Contains intrinsic definitions of functions for block copy
27 *
28 * @author
29 *  ittiam
30 *
31 * @par List of Functions:
32 *  - ihevce_2d_square_copy_luma_neon()
33 *  - ihevce_copy_2d_neon()
34 *  - ihevce_chroma_interleave_2d_copy_neon()
35 *
36 * @remarks
37 *  None
38 *
39 *******************************************************************************
40 */
41 
42 /*****************************************************************************/
43 /* File Includes                                                             */
44 /*****************************************************************************/
45 /* System include files */
46 #include <string.h>
47 #include <assert.h>
48 #include <arm_neon.h>
49 
50 /* User include files */
51 #include "ihevc_typedefs.h"
52 #include "itt_video_api.h"
53 #include "ihevc_platform_macros.h"
54 
55 #include "ihevce_cmn_utils_instr_set_router.h"
56 
57 /*****************************************************************************/
58 /* Function Definitions                                                      */
59 /*****************************************************************************/
60 
/**
*******************************************************************************
*
* @brief
*  Copies one chroma component of an interleaved block from source to
*  destination, leaving the other component of the destination untouched
*
* @par Description:
*  Each row is read from both buffers and merged with a bit-select: within
*  every 16-bit lane the 8 bits at bit offset (e_chroma_plane * 8) come from
*  the source and the remaining 8 bits are kept from the destination. The
*  merged row is written back to the destination.
*  NOTE(review): which byte in memory this maps to depends on lane byte
*  order; on a little-endian target plane value 0 would be the first byte of
*  each pair - confirm against the CHROMA_PLANE_ID_T definition.
*
* @param[in] pu1_uv_src
*  Pointer to the first row of the interleaved source block
*
* @param[in] src_strd
*  Offset added to the source pointer to step to the next row
*
* @param[in,out] pu1_uv_dst
*  Pointer to the first row of the interleaved destination block
*
* @param[in] dst_strd
*  Offset added to the destination pointer to step to the next row
*
* @param[in] w
*  Block width; each unit of w covers two bytes of a row. Must be 4 or a
*  multiple of 8 (asserted). It is also used as the number of rows.
*
* @param[in] h
*  Block height; only checked against w by an assert
*
* @param[in] e_chroma_plane
*  U_PLANE or V_PLANE (asserted)
*
* @returns
*  None
*
*******************************************************************************
*/
void ihevce_chroma_interleave_2d_copy_neon(
    UWORD8 *pu1_uv_src,
    WORD32 src_strd,
    UWORD8 *pu1_uv_dst,
    WORD32 dst_strd,
    WORD32 w,
    WORD32 h,
    CHROMA_PLANE_ID_T e_chroma_plane)
{
    /* h is referenced only by the assert below; keep release builds quiet */
    (void)h;
    assert(h == w);
    assert((U_PLANE == e_chroma_plane) || (V_PLANE == e_chroma_plane));

    if(4 == w)
    {
        /* One 8-byte vector holds a complete row of a width-4 block */
        const uint8x8_t keep_src =
            vreinterpret_u8_u16(vdup_n_u16(0xff << (e_chroma_plane << 3)));
        WORD32 rows_left;

        for(rows_left = w; rows_left > 0; rows_left--)
        {
            const uint8x8_t from_src = vld1_u8(pu1_uv_src);
            const uint8x8_t from_dst = vld1_u8(pu1_uv_dst);

            /* Take the selected bits from the source, the rest from the destination */
            vst1_u8(pu1_uv_dst, vbsl_u8(keep_src, from_src, from_dst));

            pu1_uv_src += src_strd;
            pu1_uv_dst += dst_strd;
        }
    }
    else
    {
        const uint8x16_t keep_src =
            vreinterpretq_u8_u16(vdupq_n_u16(0xff << (e_chroma_plane << 3)));
        const WORD32 row_bytes = 2 * w;
        WORD32 row, byte_off;

        assert(0 == (w % 8));

        for(row = 0; row < w; row++)
        {
            UWORD8 *pu1_dst_row = pu1_uv_dst + row * dst_strd;
            UWORD8 *pu1_src_row = pu1_uv_src + row * src_strd;

            /* 16 bytes (8 two-byte pairs) are merged per step */
            for(byte_off = 0; byte_off < row_bytes; byte_off += 16)
            {
                const uint8x16_t from_src = vld1q_u8(pu1_src_row + byte_off);
                const uint8x16_t from_dst = vld1q_u8(pu1_dst_row + byte_off);

                vst1q_u8(pu1_dst_row + byte_off, vbslq_u8(keep_src, from_src, from_dst));
            }
        }
    }
}
117 
/**
*******************************************************************************
*
* @brief
*  Copies a block of blk_ht rows, each blk_wd bytes wide, for a fixed set of
*  supported widths
*
* @par Description:
*  Widths 4, 8, 16 and 32 are copied row by row with a 4-byte memcpy, one
*  8-byte vector, one 16-byte vector or two 16-byte vectors respectively.
*  Any width that is a multiple of 64 is copied in 64-byte steps (four
*  16-byte vectors per step). Other widths are rejected by the assert and
*  copy nothing when asserts are compiled out.
*
* @param[out] pu1_dst
*  Pointer to the first row of the destination block
*
* @param[in] dst_strd
*  Offset added to the destination pointer to step to the next row
*
* @param[in] pu1_src
*  Pointer to the first row of the source block
*
* @param[in] src_strd
*  Offset added to the source pointer to step to the next row
*
* @param[in] blk_wd
*  Block width: 4, 8, 16, 32 or a multiple of 64
*
* @param[in] blk_ht
*  Number of rows to copy; nothing is copied when it is not positive
*
* @returns
*  None
*
*******************************************************************************
*/
static void copy_2d_neon(
    UWORD8 *pu1_dst, WORD32 dst_strd, UWORD8 *pu1_src, WORD32 src_strd, WORD32 blk_wd, WORD32 blk_ht)
{
    WORD32 row, col;

    assert((4 == blk_wd) || (8 == blk_wd) || (16 == blk_wd) || (32 == blk_wd) ||
           (0 == (blk_wd % 64)));

    switch(blk_wd)
    {
    case 4:
        for(row = 0; row < blk_ht; row++)
        {
            memcpy(pu1_dst, pu1_src, 4);
            pu1_dst += dst_strd;
            pu1_src += src_strd;
        }
        break;

    case 8:
        for(row = 0; row < blk_ht; row++)
        {
            vst1_u8(pu1_dst, vld1_u8(pu1_src));
            pu1_dst += dst_strd;
            pu1_src += src_strd;
        }
        break;

    case 16:
        for(row = 0; row < blk_ht; row++)
        {
            vst1q_u8(pu1_dst, vld1q_u8(pu1_src));
            pu1_dst += dst_strd;
            pu1_src += src_strd;
        }
        break;

    case 32:
        for(row = 0; row < blk_ht; row++)
        {
            const uint8x16_t lo = vld1q_u8(pu1_src);

            vst1q_u8(pu1_dst, lo);
            vst1q_u8(pu1_dst + 16, vld1q_u8(pu1_src + 16));
            pu1_dst += dst_strd;
            pu1_src += src_strd;
        }
        break;

    default:
        /* Only multiples of 64 are handled here; anything else copies nothing */
        if(0 != (blk_wd % 64))
        {
            break;
        }

        for(row = 0; row < blk_ht; row++)
        {
            UWORD8 *pu1_dst_row = pu1_dst + row * dst_strd;
            UWORD8 *pu1_src_row = pu1_src + row * src_strd;

            for(col = 0; col < blk_wd; col += 64)
            {
                UWORD8 *pu1_out = pu1_dst_row + col;
                UWORD8 *pu1_in = pu1_src_row + col;
                WORD32 off;

                /* Four 16-byte load/store pairs cover one 64-byte step */
                for(off = 0; off < 64; off += 16)
                {
                    vst1q_u8(pu1_out + off, vld1q_u8(pu1_in + off));
                }
            }
        }
        break;
    }
}
196 
/**
*******************************************************************************
*
* @brief
*  Copies a square block whose row width and strides are scaled by unit_size
*
* @par Description:
*  Forwards to copy_2d_neon() with both strides and the row width multiplied
*  by unit_size, and with num_cols_to_copy used as the number of rows.
*  NOTE(review): unit_size is presumably the size of one element in bytes -
*  confirm against the callers.
*
* @param[out] p_dst
*  Destination block, accessed as UWORD8
*
* @param[in] dst_strd
*  Destination stride before scaling by unit_size
*
* @param[in] p_src
*  Source block, accessed as UWORD8
*
* @param[in] src_strd
*  Source stride before scaling by unit_size
*
* @param[in] num_cols_to_copy
*  Number of columns per row (before scaling) and also the number of rows
*
* @param[in] unit_size
*  Scale factor applied to the strides and the row width
*
* @returns
*  None
*
*******************************************************************************
*/
void ihevce_2d_square_copy_luma_neon(
    void *p_dst,
    WORD32 dst_strd,
    void *p_src,
    WORD32 src_strd,
    WORD32 num_cols_to_copy,
    WORD32 unit_size)
{
    const WORD32 scaled_dst_strd = dst_strd * unit_size;
    const WORD32 scaled_src_strd = src_strd * unit_size;
    const WORD32 scaled_row_wd = num_cols_to_copy * unit_size;

    /* Square block: the row count equals the unscaled column count */
    copy_2d_neon(
        (UWORD8 *)p_dst,
        scaled_dst_strd,
        (UWORD8 *)p_src,
        scaled_src_strd,
        scaled_row_wd,
        num_cols_to_copy);
}
216 
/**
*******************************************************************************
*
* @brief
*  Copies a blk_wd x blk_ht block from pu1_src to pu1_dst for arbitrary widths
*
* @par Description:
*  The block is consumed left to right in vertical strips. Each strip is the
*  widest of 64, 32, 16, 8 or 4 columns that still fits in the remaining
*  width and is copied by copy_2d_neon(). A remainder of 1 to 3 columns is
*  handed to ihevce_copy_2d(), which is declared outside this file.
*
* @param[out] pu1_dst
*  Pointer to the first row of the destination block
*
* @param[in] dst_strd
*  Destination row stride, passed unchanged to the strip copies
*
* @param[in] pu1_src
*  Pointer to the first row of the source block
*
* @param[in] src_strd
*  Source row stride, passed unchanged to the strip copies
*
* @param[in] blk_wd
*  Block width in UWORD8 elements
*
* @param[in] blk_ht
*  Number of rows to copy
*
* @returns
*  None
*
* @remarks
*  Returns without touching either buffer when blk_wd or blk_ht is not
*  positive.
*
*******************************************************************************
*/
void ihevce_copy_2d_neon(
    UWORD8 *pu1_dst, WORD32 dst_strd, UWORD8 *pu1_src, WORD32 src_strd, WORD32 blk_wd, WORD32 blk_ht)
{
    /* Nothing to copy; also keeps negative sizes away from the copy routines */
    if((blk_wd <= 0) || (blk_ht <= 0))
    {
        return;
    }

    while(blk_wd >= 4)
    {
        WORD32 strip_wd;

        /* Use >= throughout so exact widths of 64 and 32 use the widest kernel */
        if(blk_wd >= 64)
        {
            strip_wd = 64;
        }
        else if(blk_wd >= 32)
        {
            strip_wd = 32;
        }
        else if(blk_wd >= 16)
        {
            strip_wd = 16;
        }
        else if(blk_wd >= 8)
        {
            strip_wd = 8;
        }
        else
        {
            strip_wd = 4;
        }

        /* copy_2d_neon() copies one row per iteration, so odd heights need no special case */
        copy_2d_neon(pu1_dst, dst_strd, pu1_src, src_strd, strip_wd, blk_ht);

        pu1_dst += strip_wd;
        pu1_src += strip_wd;
        blk_wd -= strip_wd;
    }

    /* Remaining 1 to 3 columns are too narrow for the NEON strips */
    if(blk_wd > 0)
    {
        ihevce_copy_2d(pu1_dst, dst_strd, pu1_src, src_strd, blk_wd, blk_ht);
    }
}
284