/*
 * Copyright (c) 2022 Samsung Electronics Co., Ltd.
 * All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * - Neither the name of the copyright owner, nor the names of its contributors
 *   may be used to endorse or promote products derived from this software
 *   without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */


#include "oapv_def.h"
#include "oapv_tq_neon.h"

#if ARM_NEON

const s32 oapv_coeff[8][4] =
{
    {64, 64, 64, 64}, // row 0 coeff
    {89, 75, 50, 18}, // row 1 coeff
    {84, 35, 84, 35}, // row 2 coeff
    {75,-18,-89,-50}, // row 3 coeff
    {64,-64, 64,-64}, // row 4 coeff
    {50,-89, 18, 75}, // row 5 coeff
    {35,-84, 35,-84}, // row 6 coeff
    {18,-50, 75,-89}  // row 7 coeff
};
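
// Each row k of oapv_coeff[] holds the four multipliers that produce output
// row k of the 8-point forward transform below: rows 1/3/5/7 are applied to
// the odd-part differences (O), rows 0/4 to the even-even terms (EE), and
// rows 2/6 to the even-odd terms (EO). Rows 0, 2, 4 and 6 repeat a 2-tap
// pattern because EE/EO carry only two distinct values per 4-lane group.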

#define multiply_s32(part1, part2, coeff, res) \
    low = vmulq_s32(part1, coeff); \
    high = vmulq_s32(part2, coeff); \
    res = vcombine_s32(vpadd_s32(vget_low_s32(low), vget_high_s32(low)), vpadd_s32(vget_low_s32(high), vget_high_s32(high)));

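// In scalar terms, multiply_s32() computes four 2-tap dot products (a sketch
// of the lane math, not separate code in the library); it assumes the caller
// has declared int32x4_t variables named low and high:
//
//   res[0] = part1[0]*coeff[0] + part1[1]*coeff[1];
//   res[1] = part1[2]*coeff[2] + part1[3]*coeff[3];
//   res[2] = part2[0]*coeff[0] + part2[1]*coeff[1];
//   res[3] = part2[2]*coeff[2] + part2[3]*coeff[3];

// Forward 8-point transform of an 8x8 block. Each loop iteration consumes
// four 8-sample input rows, folds them into even/odd butterflies (E/O, and
// EE/EO for the even half), forms dot products with the oapv_coeff[] rows,
// and stores the rounded, right-shifted output row k at dst + k * line.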
static void oapv_tx_pb8b_neon(s16 *src, s16 *dst, const int shift, int line)
{
    s16 i;
    s16 *tempSrc = src;
    int16x4_t src_part1, src_part2;
    int32x4_t coeff0, coeff1, coeff2, coeff3, coeff4, coeff5, coeff6, coeff7;
    int32x4_t add = vdupq_n_s32(1 << (shift - 1));
    int32x4_t sh = vdupq_n_s32(-shift);

    int32x4_t EE_part1, EE_part2, EO_part1, EO_part2, low, high, result0, result1, result2, result3, result4, result5, result6, result7, E1, O1, E2, O2, res1, res2, res3, res4;

    for(i = 0; i < 8; i += 4)
    {
        // load the first input row of this group as two 4-sample halves
        src_part1 = vld1_s16(tempSrc);
        tempSrc += 4;
        src_part2 = vld1_s16(tempSrc);
        tempSrc += 4;

        // reverse src_part2
        src_part2 = vrev64_s16(src_part2);

        E1 = vaddl_s16(src_part1, src_part2);
        O1 = vsubl_s16(src_part1, src_part2);

        // load the second input row of this group
        src_part1 = vld1_s16(tempSrc);
        tempSrc += 4;
        src_part2 = vld1_s16(tempSrc);
        tempSrc += 4;

        // reverse src_part2
        src_part2 = vrev64_s16(src_part2);

        E2 = vaddl_s16(src_part1, src_part2);
        O2 = vsubl_s16(src_part1, src_part2);

        int32x4_t tmp1 = vcombine_s32(vget_low_s32(E1), vget_low_s32(E2));
        int32x4_t tmp2 = vcombine_s32(vget_high_s32(E1), vget_high_s32(E2));
        tmp2 = vrev64q_s32(tmp2);

        EE_part1 = vaddq_s32(tmp1, tmp2);
        EO_part1 = vsubq_s32(tmp1, tmp2);

        coeff1 = vld1q_s32(oapv_coeff[1]);
        coeff3 = vld1q_s32(oapv_coeff[3]);
        coeff5 = vld1q_s32(oapv_coeff[5]);
        coeff7 = vld1q_s32(oapv_coeff[7]);

        multiply_s32(O1, O2, coeff1, result1);
        multiply_s32(O1, O2, coeff3, result3);
        multiply_s32(O1, O2, coeff5, result5);
        multiply_s32(O1, O2, coeff7, result7);

        res1 = vpaddq_s32(result1, result3);
        res2 = vpaddq_s32(result5, result7);

        // add and shift
        res1 = vshlq_s32(vaddq_s32(res1, add), sh);
        res2 = vshlq_s32(vaddq_s32(res2, add), sh);

        // load the third input row of this group
        src_part1 = vld1_s16(tempSrc);
        tempSrc += 4;
        src_part2 = vld1_s16(tempSrc);
        tempSrc += 4;

        // reverse src_part2
        src_part2 = vrev64_s16(src_part2);

        E1 = vaddl_s16(src_part1, src_part2);
        O1 = vsubl_s16(src_part1, src_part2);

        // load the fourth input row of this group
        src_part1 = vld1_s16(tempSrc);
        tempSrc += 4;
        src_part2 = vld1_s16(tempSrc);
        tempSrc += 4;

        // reverse src_part2
        src_part2 = vrev64_s16(src_part2);

        E2 = vaddl_s16(src_part1, src_part2);
        O2 = vsubl_s16(src_part1, src_part2);

        multiply_s32(O1, O2, coeff1, result1);
        multiply_s32(O1, O2, coeff3, result3);
        multiply_s32(O1, O2, coeff5, result5);
        multiply_s32(O1, O2, coeff7, result7);

        res3 = vpaddq_s32(result1, result3);
        res4 = vpaddq_s32(result5, result7);

        // add and shift
        res3 = vshlq_s32(vaddq_s32(res3, add), sh);
        res4 = vshlq_s32(vaddq_s32(res4, add), sh);

        // store the odd output rows in the destination
        vst1_s16(dst + 1 * line + i, vmovn_s32(vcombine_s32(vget_low_s32(res1), vget_low_s32(res3))));
        vst1_s16(dst + 3 * line + i, vmovn_s32(vcombine_s32(vget_high_s32(res1), vget_high_s32(res3))));
        vst1_s16(dst + 5 * line + i, vmovn_s32(vcombine_s32(vget_low_s32(res2), vget_low_s32(res4))));
        vst1_s16(dst + 7 * line + i, vmovn_s32(vcombine_s32(vget_high_s32(res2), vget_high_s32(res4))));

        coeff0 = vld1q_s32(oapv_coeff[0]);
        coeff2 = vld1q_s32(oapv_coeff[2]);
        coeff4 = vld1q_s32(oapv_coeff[4]);
        coeff6 = vld1q_s32(oapv_coeff[6]);

        tmp1 = vcombine_s32(vget_low_s32(E1), vget_low_s32(E2));
        tmp2 = vcombine_s32(vget_high_s32(E1), vget_high_s32(E2));
        tmp2 = vrev64q_s32(tmp2);

        EE_part2 = vaddq_s32(tmp1, tmp2);
        EO_part2 = vsubq_s32(tmp1, tmp2);

        multiply_s32(EE_part1, EE_part2, coeff0, result0);
        multiply_s32(EE_part1, EE_part2, coeff4, result4);
        multiply_s32(EO_part1, EO_part2, coeff2, result2);
        multiply_s32(EO_part1, EO_part2, coeff6, result6);

        // add and shift
        result0 = vshlq_s32(vaddq_s32(result0, add), sh);
        result2 = vshlq_s32(vaddq_s32(result2, add), sh);
        result4 = vshlq_s32(vaddq_s32(result4, add), sh);
        result6 = vshlq_s32(vaddq_s32(result6, add), sh);

        // store the even output rows in the destination
        vst1_s16(dst + 0 * line + i, vmovn_s32(result0));
        vst1_s16(dst + 2 * line + i, vmovn_s32(result2));
        vst1_s16(dst + 4 * line + i, vmovn_s32(result4));
        vst1_s16(dst + 6 * line + i, vmovn_s32(result6));
    }
}

const oapv_fn_tx_t oapv_tbl_fn_txb_neon[2] =
{
    oapv_tx_pb8b_neon,
    NULL
};

///////////////////////////////////////////////////////////////////////////////
// end of encoder code
// ENABLE_ENCODER
///////////////////////////////////////////////////////////////////////////////

// Required coefficients from oapv_tbl_tm8
#define OAPV_INVTX_COEF_0      89 // coef10, -coef32, -coef51,  coef73
#define OAPV_INVTX_COEF_1      75 // coef11,  coef30,  coef53,  coef72
#define OAPV_INVTX_COEF_2      50 // coef12, -coef33,  coef50, -coef71
#define OAPV_INVTX_COEF_3      18 // coef13, -coef31,  coef52,  coef70
#define OAPV_INVTX_COEF_5      84 // coef20, -coef61
#define OAPV_INVTX_COEF_6      35 // coef21,  coef60
#define OAPV_INVTX_COEF_4_LOG2  6 // log2(coef00), log2(coef01), log2(coef40), log2(-coef41)

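// Inverse 8-point transform of an 8x8 block, fully unrolled: pass 1 applies
// the inverse butterfly to all eight coefficient rows, pass 2 repeats it on
// the transposed intermediate, and the result is written back into src.
// Multiplications by 64 (coef00/coef40) are replaced by left shifts of
// OAPV_INVTX_COEF_4_LOG2; the vzip shuffles between and after the passes
// rearrange the 4x4 sub-blocks so each pass can work on 4-lane vectors.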
void oapv_itx_pb8b_opt_neon(s16 *src, int shift1, int shift2, int line)
{
    int32x4_t add1 = vdupq_n_s32(1 << (shift1 - 1));
    int32x4_t add2 = vdupq_n_s32(1 << (shift2 - 1));

    int32x4_t sh1 = vdupq_n_s32(-shift1);
    int32x4_t sh2 = vdupq_n_s32(-shift2);

    int16x4_t dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7, dest8, dest9, dest10, dest11, dest12, dest13, dest14, dest15;

    // Inverse DCT pass 1
    {
        int16x8_t v_src_0_8 = vld1q_s16(src);
        int16x8_t v_src_1_9 = vld1q_s16(src + line);
        int16x8_t v_src_2_10 = vld1q_s16(src + 2 * line);
        int16x8_t v_src_3_11 = vld1q_s16(src + 3 * line);
        int16x8_t v_src_4_12 = vld1q_s16(src + 4 * line);
        int16x8_t v_src_5_13 = vld1q_s16(src + 5 * line);
        int16x8_t v_src_6_14 = vld1q_s16(src + 6 * line);
        int16x8_t v_src_7_15 = vld1q_s16(src + 7 * line);

        int16x4_t v_src_0  = vget_low_s16(v_src_0_8);
        int16x4_t v_src_1  = vget_low_s16(v_src_1_9);
        int16x4_t v_src_2  = vget_low_s16(v_src_2_10);
        int16x4_t v_src_3  = vget_low_s16(v_src_3_11);
        int16x4_t v_src_4  = vget_low_s16(v_src_4_12);
        int16x4_t v_src_5  = vget_low_s16(v_src_5_13);
        int16x4_t v_src_6  = vget_low_s16(v_src_6_14);
        int16x4_t v_src_7  = vget_low_s16(v_src_7_15);
        int16x4_t v_src_8  = vget_high_s16(v_src_0_8);
        int16x4_t v_src_9  = vget_high_s16(v_src_1_9);
        int16x4_t v_src_10 = vget_high_s16(v_src_2_10);
        int16x4_t v_src_11 = vget_high_s16(v_src_3_11);
        int16x4_t v_src_12 = vget_high_s16(v_src_4_12);
        int16x4_t v_src_13 = vget_high_s16(v_src_5_13);
        int16x4_t v_src_14 = vget_high_s16(v_src_6_14);
        int16x4_t v_src_15 = vget_high_s16(v_src_7_15);

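        // odd part: temp1..temp8 combine coefficient rows 1, 3, 5, 7 for the
        // low four columns; temp9..temp16 repeat this for the high four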
        int32x4_t temp1 = vaddq_s32(vmull_n_s16(v_src_1, OAPV_INVTX_COEF_0), vmull_n_s16(v_src_3, OAPV_INVTX_COEF_1));
        int32x4_t temp2 = vsubq_s32(vmull_n_s16(v_src_1, OAPV_INVTX_COEF_1), vmull_n_s16(v_src_3, OAPV_INVTX_COEF_3));

        int32x4_t temp3 = vsubq_s32(vmull_n_s16(v_src_1, OAPV_INVTX_COEF_2), vmull_n_s16(v_src_3, OAPV_INVTX_COEF_0));
        int32x4_t temp4 = vsubq_s32(vmull_n_s16(v_src_1, OAPV_INVTX_COEF_3), vmull_n_s16(v_src_3, OAPV_INVTX_COEF_2));

        int32x4_t temp5 = vaddq_s32(vmull_n_s16(v_src_5, OAPV_INVTX_COEF_2), vmull_n_s16(v_src_7, OAPV_INVTX_COEF_3));
        int32x4_t temp6 = vaddq_s32(vmull_n_s16(v_src_5, OAPV_INVTX_COEF_0), vmull_n_s16(v_src_7, OAPV_INVTX_COEF_2));
        temp6 = vnegq_s32(temp6);

        int32x4_t temp7 = vaddq_s32(vmull_n_s16(v_src_5, OAPV_INVTX_COEF_3), vmull_n_s16(v_src_7, OAPV_INVTX_COEF_1));
        int32x4_t temp8 = vsubq_s32(vmull_n_s16(v_src_5, OAPV_INVTX_COEF_1), vmull_n_s16(v_src_7, OAPV_INVTX_COEF_0));

        int32x4_t temp9 = vaddq_s32(vmull_n_s16(v_src_9, OAPV_INVTX_COEF_0), vmull_n_s16(v_src_11, OAPV_INVTX_COEF_1));
        int32x4_t temp10 = vsubq_s32(vmull_n_s16(v_src_9, OAPV_INVTX_COEF_1), vmull_n_s16(v_src_11, OAPV_INVTX_COEF_3));

        int32x4_t temp11 = vsubq_s32(vmull_n_s16(v_src_9, OAPV_INVTX_COEF_2), vmull_n_s16(v_src_11, OAPV_INVTX_COEF_0));
        int32x4_t temp12 = vsubq_s32(vmull_n_s16(v_src_9, OAPV_INVTX_COEF_3), vmull_n_s16(v_src_11, OAPV_INVTX_COEF_2));

        int32x4_t temp13 = vaddq_s32(vmull_n_s16(v_src_13, OAPV_INVTX_COEF_2), vmull_n_s16(v_src_15, OAPV_INVTX_COEF_3));
        int32x4_t temp14 = vaddq_s32(vmull_n_s16(v_src_13, OAPV_INVTX_COEF_0), vmull_n_s16(v_src_15, OAPV_INVTX_COEF_2));
        temp14 = vnegq_s32(temp14);

        int32x4_t temp15 = vaddq_s32(vmull_n_s16(v_src_13, OAPV_INVTX_COEF_3), vmull_n_s16(v_src_15, OAPV_INVTX_COEF_1));
        int32x4_t temp16 = vsubq_s32(vmull_n_s16(v_src_13, OAPV_INVTX_COEF_1), vmull_n_s16(v_src_15, OAPV_INVTX_COEF_0));

        int32x4_t O0 = vaddq_s32(temp1, temp5);
        int32x4_t O1 = vaddq_s32(temp2, temp6);
        int32x4_t O2 = vaddq_s32(temp3, temp7);
        int32x4_t O3 = vaddq_s32(temp4, temp8);
        int32x4_t O4 = vaddq_s32(temp9, temp13);
        int32x4_t O5 = vaddq_s32(temp10, temp14);
        int32x4_t O6 = vaddq_s32(temp11, temp15);
        int32x4_t O7 = vaddq_s32(temp12, temp16);

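        // even part: EO from rows 2 and 6, EE from rows 0 and 4 (multiplies
        // by 64 are done as left shifts of OAPV_INVTX_COEF_4_LOG2)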
        int32x4_t EO0 = vaddq_s32(vmull_n_s16(v_src_2, OAPV_INVTX_COEF_5), vmull_n_s16(v_src_6, OAPV_INVTX_COEF_6));
        int32x4_t EO1 = vsubq_s32(vmull_n_s16(v_src_2, OAPV_INVTX_COEF_6), vmull_n_s16(v_src_6, OAPV_INVTX_COEF_5));
        int32x4_t EE0 = vaddq_s32(vshll_n_s16(v_src_0, OAPV_INVTX_COEF_4_LOG2), vshll_n_s16(v_src_4, OAPV_INVTX_COEF_4_LOG2));
        int32x4_t EE1 = vsubq_s32(vshll_n_s16(v_src_0, OAPV_INVTX_COEF_4_LOG2), vshll_n_s16(v_src_4, OAPV_INVTX_COEF_4_LOG2));
        int32x4_t EO2 = vaddq_s32(vmull_n_s16(v_src_10, OAPV_INVTX_COEF_5), vmull_n_s16(v_src_14, OAPV_INVTX_COEF_6));
        int32x4_t EO3 = vsubq_s32(vmull_n_s16(v_src_10, OAPV_INVTX_COEF_6), vmull_n_s16(v_src_14, OAPV_INVTX_COEF_5));
        int32x4_t EE2 = vaddq_s32(vshll_n_s16(v_src_8, OAPV_INVTX_COEF_4_LOG2), vshll_n_s16(v_src_12, OAPV_INVTX_COEF_4_LOG2));
        int32x4_t EE3 = vsubq_s32(vshll_n_s16(v_src_8, OAPV_INVTX_COEF_4_LOG2), vshll_n_s16(v_src_12, OAPV_INVTX_COEF_4_LOG2));

        int32x4_t E0 = vaddq_s32(EE0, EO0);
        int32x4_t E1 = vaddq_s32(EE1, EO1);
        int32x4_t E2 = vsubq_s32(EE1, EO1);
        int32x4_t E3 = vsubq_s32(EE0, EO0);
        int32x4_t E4 = vaddq_s32(EE2, EO2);
        int32x4_t E5 = vaddq_s32(EE3, EO3);
        int32x4_t E6 = vsubq_s32(EE3, EO3);
        int32x4_t E7 = vsubq_s32(EE2, EO2);

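        // butterfly combine E +/- O, add the rounding term, shift down by
        // shift1 and narrow to 16 bits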
        dest0 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E0, O0), add1), sh1));
        dest1 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E1, O1), add1), sh1));
        dest2 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E2, O2), add1), sh1));
        dest3 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E3, O3), add1), sh1));
        dest4 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E0, O0), add1), sh1));
        dest5 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E1, O1), add1), sh1));
        dest6 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E2, O2), add1), sh1));
        dest7 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E3, O3), add1), sh1));
        dest8 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E4, O4), add1), sh1));
        dest9 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E5, O5), add1), sh1));
        dest10 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E6, O6), add1), sh1));
        dest11 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E7, O7), add1), sh1));
        dest12 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E4, O4), add1), sh1));
        dest13 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E5, O5), add1), sh1));
        dest14 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E6, O6), add1), sh1));
        dest15 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E7, O7), add1), sh1));

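        // transpose each 4x4 quadrant via 16- and 32-bit zips so pass 2 can
        // operate on columns (the E-O mirrored quadrants are zipped in
        // reversed order)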
        int16x4_t t0 = vzip1_s16(dest0, dest1);
        int16x4_t t1 = vzip1_s16(dest2, dest3);
        int16x4_t t2 = vzip2_s16(dest0, dest1);
        int16x4_t t3 = vzip2_s16(dest2, dest3);
        int16x4_t t4 = vzip1_s16(dest8, dest9);
        int16x4_t t5 = vzip1_s16(dest10, dest11);
        int16x4_t t6 = vzip2_s16(dest8, dest9);
        int16x4_t t7 = vzip2_s16(dest10, dest11);

        dest0 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t0), vreinterpret_s32_s16(t1)));
        dest1 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t0), vreinterpret_s32_s16(t1)));
        dest2 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t2), vreinterpret_s32_s16(t3)));
        dest3 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t2), vreinterpret_s32_s16(t3)));
        dest8 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t4), vreinterpret_s32_s16(t5)));
        dest9 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t4), vreinterpret_s32_s16(t5)));
        dest10 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t6), vreinterpret_s32_s16(t7)));
        dest11 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t6), vreinterpret_s32_s16(t7)));

        int16x4_t t8 = vzip1_s16(dest5, dest4);
        int16x4_t t9 = vzip1_s16(dest7, dest6);
        int16x4_t t10 = vzip2_s16(dest5, dest4);
        int16x4_t t11 = vzip2_s16(dest7, dest6);
        int16x4_t t12 = vzip1_s16(dest13, dest12);
        int16x4_t t13 = vzip1_s16(dest15, dest14);
        int16x4_t t14 = vzip2_s16(dest13, dest12);
        int16x4_t t15 = vzip2_s16(dest15, dest14);

        dest4 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t9), vreinterpret_s32_s16(t8)));
        dest5 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t9), vreinterpret_s32_s16(t8)));
        dest6 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t11), vreinterpret_s32_s16(t10)));
        dest7 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t11), vreinterpret_s32_s16(t10)));
        dest12 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t13), vreinterpret_s32_s16(t12)));
        dest13 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t13), vreinterpret_s32_s16(t12)));
        dest14 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t15), vreinterpret_s32_s16(t14)));
        dest15 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t15), vreinterpret_s32_s16(t14)));
    }

    // Inverse DCT pass 2
    {
        int32x4_t temp1 = vaddq_s32(vmull_n_s16(dest1, OAPV_INVTX_COEF_0), vmull_n_s16(dest3, OAPV_INVTX_COEF_1));
        int32x4_t temp2 = vsubq_s32(vmull_n_s16(dest1, OAPV_INVTX_COEF_1), vmull_n_s16(dest3, OAPV_INVTX_COEF_3));

        int32x4_t temp3 = vsubq_s32(vmull_n_s16(dest1, OAPV_INVTX_COEF_2), vmull_n_s16(dest3, OAPV_INVTX_COEF_0));
        int32x4_t temp4 = vsubq_s32(vmull_n_s16(dest1, OAPV_INVTX_COEF_3), vmull_n_s16(dest3, OAPV_INVTX_COEF_2));

        int32x4_t temp5 = vaddq_s32(vmull_n_s16(dest9, OAPV_INVTX_COEF_2), vmull_n_s16(dest11, OAPV_INVTX_COEF_3));
        int32x4_t temp6 = vaddq_s32(vmull_n_s16(dest9, OAPV_INVTX_COEF_0), vmull_n_s16(dest11, OAPV_INVTX_COEF_2));
        temp6 = vnegq_s32(temp6);

        int32x4_t temp7 = vaddq_s32(vmull_n_s16(dest9, OAPV_INVTX_COEF_3), vmull_n_s16(dest11, OAPV_INVTX_COEF_1));
        int32x4_t temp8 = vsubq_s32(vmull_n_s16(dest9, OAPV_INVTX_COEF_1), vmull_n_s16(dest11, OAPV_INVTX_COEF_0));

        int32x4_t temp9 = vaddq_s32(vmull_n_s16(dest5, OAPV_INVTX_COEF_0), vmull_n_s16(dest7, OAPV_INVTX_COEF_1));
        int32x4_t temp10 = vsubq_s32(vmull_n_s16(dest5, OAPV_INVTX_COEF_1), vmull_n_s16(dest7, OAPV_INVTX_COEF_3));

        int32x4_t temp11 = vsubq_s32(vmull_n_s16(dest5, OAPV_INVTX_COEF_2), vmull_n_s16(dest7, OAPV_INVTX_COEF_0));
        int32x4_t temp12 = vsubq_s32(vmull_n_s16(dest5, OAPV_INVTX_COEF_3), vmull_n_s16(dest7, OAPV_INVTX_COEF_2));

        int32x4_t temp13 = vaddq_s32(vmull_n_s16(dest13, OAPV_INVTX_COEF_2), vmull_n_s16(dest15, OAPV_INVTX_COEF_3));
        int32x4_t temp14 = vaddq_s32(vmull_n_s16(dest13, OAPV_INVTX_COEF_0), vmull_n_s16(dest15, OAPV_INVTX_COEF_2));
        temp14 = vnegq_s32(temp14);

        int32x4_t temp15 = vaddq_s32(vmull_n_s16(dest13, OAPV_INVTX_COEF_3), vmull_n_s16(dest15, OAPV_INVTX_COEF_1));
        int32x4_t temp16 = vsubq_s32(vmull_n_s16(dest13, OAPV_INVTX_COEF_1), vmull_n_s16(dest15, OAPV_INVTX_COEF_0));

        int32x4_t O0 = vaddq_s32(temp1, temp5);
        int32x4_t O1 = vaddq_s32(temp2, temp6);
        int32x4_t O2 = vaddq_s32(temp3, temp7);
        int32x4_t O3 = vaddq_s32(temp4, temp8);
        int32x4_t O4 = vaddq_s32(temp9, temp13);
        int32x4_t O5 = vaddq_s32(temp10, temp14);
        int32x4_t O6 = vaddq_s32(temp11, temp15);
        int32x4_t O7 = vaddq_s32(temp12, temp16);

        int32x4_t EO0 = vaddq_s32(vmull_n_s16(dest2, OAPV_INVTX_COEF_5), vmull_n_s16(dest10, OAPV_INVTX_COEF_6));
        int32x4_t EO1 = vsubq_s32(vmull_n_s16(dest2, OAPV_INVTX_COEF_6), vmull_n_s16(dest10, OAPV_INVTX_COEF_5));
        int32x4_t EE0 = vaddq_s32(vshll_n_s16(dest0, OAPV_INVTX_COEF_4_LOG2), vshll_n_s16(dest8, OAPV_INVTX_COEF_4_LOG2));
        int32x4_t EE1 = vsubq_s32(vshll_n_s16(dest0, OAPV_INVTX_COEF_4_LOG2), vshll_n_s16(dest8, OAPV_INVTX_COEF_4_LOG2));
        int32x4_t EO2 = vaddq_s32(vmull_n_s16(dest6, OAPV_INVTX_COEF_5), vmull_n_s16(dest14, OAPV_INVTX_COEF_6));
        int32x4_t EO3 = vsubq_s32(vmull_n_s16(dest6, OAPV_INVTX_COEF_6), vmull_n_s16(dest14, OAPV_INVTX_COEF_5));
        int32x4_t EE2 = vaddq_s32(vshll_n_s16(dest4, OAPV_INVTX_COEF_4_LOG2), vshll_n_s16(dest12, OAPV_INVTX_COEF_4_LOG2));
        int32x4_t EE3 = vsubq_s32(vshll_n_s16(dest4, OAPV_INVTX_COEF_4_LOG2), vshll_n_s16(dest12, OAPV_INVTX_COEF_4_LOG2));

        int32x4_t E0 = vaddq_s32(EE0, EO0);
        int32x4_t E1 = vaddq_s32(EE1, EO1);
        int32x4_t E2 = vsubq_s32(EE1, EO1);
        int32x4_t E3 = vsubq_s32(EE0, EO0);
        int32x4_t E4 = vaddq_s32(EE2, EO2);
        int32x4_t E5 = vaddq_s32(EE3, EO3);
        int32x4_t E6 = vsubq_s32(EE3, EO3);
        int32x4_t E7 = vsubq_s32(EE2, EO2);

        int16x4_t v_src_0 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E0, O0), add2), sh2));
        int16x4_t v_src_1 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E1, O1), add2), sh2));
        int16x4_t v_src_2 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E2, O2), add2), sh2));
        int16x4_t v_src_3 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E3, O3), add2), sh2));
        int16x4_t v_src_4 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E0, O0), add2), sh2));
        int16x4_t v_src_5 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E1, O1), add2), sh2));
        int16x4_t v_src_6 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E2, O2), add2), sh2));
        int16x4_t v_src_7 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E3, O3), add2), sh2));
        int16x4_t v_src_8 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E4, O4), add2), sh2));
        int16x4_t v_src_9 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E5, O5), add2), sh2));
        int16x4_t v_src_10 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E6, O6), add2), sh2));
        int16x4_t v_src_11 = vmovn_s32(vshlq_s32(vaddq_s32(vaddq_s32(E7, O7), add2), sh2));
        int16x4_t v_src_12 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E4, O4), add2), sh2));
        int16x4_t v_src_13 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E5, O5), add2), sh2));
        int16x4_t v_src_14 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E6, O6), add2), sh2));
        int16x4_t v_src_15 = vmovn_s32(vshlq_s32(vaddq_s32(vsubq_s32(E7, O7), add2), sh2));

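        // transpose back, mirroring the pass-1 shuffle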
        int16x4_t t0 = vzip1_s16(v_src_0, v_src_1);
        int16x4_t t1 = vzip1_s16(v_src_2, v_src_3);
        int16x4_t t2 = vzip2_s16(v_src_0, v_src_1);
        int16x4_t t3 = vzip2_s16(v_src_2, v_src_3);
        int16x4_t t4 = vzip1_s16(v_src_8, v_src_9);
        int16x4_t t5 = vzip1_s16(v_src_10, v_src_11);
        int16x4_t t6 = vzip2_s16(v_src_8, v_src_9);
        int16x4_t t7 = vzip2_s16(v_src_10, v_src_11);

        v_src_0 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t0), vreinterpret_s32_s16(t1)));
        v_src_1 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t0), vreinterpret_s32_s16(t1)));
        v_src_2 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t2), vreinterpret_s32_s16(t3)));
        v_src_3 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t2), vreinterpret_s32_s16(t3)));
        v_src_8 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t4), vreinterpret_s32_s16(t5)));
        v_src_9 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t4), vreinterpret_s32_s16(t5)));
        v_src_10 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t6), vreinterpret_s32_s16(t7)));
        v_src_11 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t6), vreinterpret_s32_s16(t7)));

        int16x4_t t8 = vzip1_s16(v_src_5, v_src_4);
        int16x4_t t9 = vzip1_s16(v_src_7, v_src_6);
        int16x4_t t10 = vzip2_s16(v_src_5, v_src_4);
        int16x4_t t11 = vzip2_s16(v_src_7, v_src_6);
        int16x4_t t12 = vzip1_s16(v_src_13, v_src_12);
        int16x4_t t13 = vzip1_s16(v_src_15, v_src_14);
        int16x4_t t14 = vzip2_s16(v_src_13, v_src_12);
        int16x4_t t15 = vzip2_s16(v_src_15, v_src_14);

        v_src_4 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t9), vreinterpret_s32_s16(t8)));
        v_src_5 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t9), vreinterpret_s32_s16(t8)));
        v_src_6 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t11), vreinterpret_s32_s16(t10)));
        v_src_7 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t11), vreinterpret_s32_s16(t10)));
        v_src_12 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t13), vreinterpret_s32_s16(t12)));
        v_src_13 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t13), vreinterpret_s32_s16(t12)));
        v_src_14 = vreinterpret_s16_s32(vzip1_s32(vreinterpret_s32_s16(t15), vreinterpret_s32_s16(t14)));
        v_src_15 = vreinterpret_s16_s32(vzip2_s32(vreinterpret_s32_s16(t15), vreinterpret_s32_s16(t14)));

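        // repack the 4-lane halves into 8-lane rows and store the finished
        // block back into src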
        int16x8_t v_src_0_4 = vcombine_s16(v_src_0, v_src_4);
        int16x8_t v_src_1_5 = vcombine_s16(v_src_1, v_src_5);
        int16x8_t v_src_2_6 = vcombine_s16(v_src_2, v_src_6);
        int16x8_t v_src_3_7 = vcombine_s16(v_src_3, v_src_7);
        int16x8_t v_src_8_12 = vcombine_s16(v_src_8, v_src_12);
        int16x8_t v_src_9_13 = vcombine_s16(v_src_9, v_src_13);
        int16x8_t v_src_10_14 = vcombine_s16(v_src_10, v_src_14);
        int16x8_t v_src_11_15 = vcombine_s16(v_src_11, v_src_15);

        vst1q_s16(src, v_src_0_4);
        vst1q_s16(src + 8, v_src_1_5);
        vst1q_s16(src + 16, v_src_2_6);
        vst1q_s16(src + 24, v_src_3_7);
        vst1q_s16(src + 32, v_src_8_12);
        vst1q_s16(src + 40, v_src_9_13);
        vst1q_s16(src + 48, v_src_10_14);
        vst1q_s16(src + 56, v_src_11_15);
    }
}

const oapv_fn_itx_t oapv_tbl_fn_itx_neon[2] =
{
    oapv_itx_pb8b_opt_neon,
    NULL
};

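// Quantization: level = sign(coef) * ((|coef| * q_matrix + offset) >> shift),
// vectorized eight coefficients per iteration. A scalar sketch of the
// per-coefficient math below (an illustration, not a separate code path in
// the library):
//
//   s64 lev = ((s64)abs(coef[i]) * q_matrix[i] + offset) >> shift;
//   coef[i] = (coef[i] < 0) ? -(s16)lev : (s16)lev;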
static int oapv_quant_neon(s16 *coef, u8 qp, int q_matrix[OAPV_BLK_D], int log2_w, int log2_h, int bit_depth, int deadzone_offset)
{
    s64 offset;
    int shift;
    int tr_shift;

    int log2_size = (log2_w + log2_h) >> 1;
    tr_shift = MAX_TX_DYNAMIC_RANGE - bit_depth - log2_size;
    shift = QUANT_SHIFT + tr_shift + (qp / 6);
    offset = (s64)deadzone_offset << (shift - 9);
    int pixels = (1 << (log2_w + log2_h));

    int i;
    int16x8_t coef_row;
    int64x2_t offset_vector = vdupq_n_s64(offset);
    int64x2_t shift_vector  = vdupq_n_s64(-shift);
    int16x8_t zero_vector   = vdupq_n_s16(0);

    for (i = 0; i < pixels; i += 8)
    {
        // Load one coef row
        coef_row = vld1q_s16(coef + i);

        // Extract coef signs and construct abs coef-vec
        uint16x8_t sign_mask   = vcltq_s16(coef_row, zero_vector);
        int16x8_t coef_row_abs = vabsq_s16(coef_row);

        // Split abs coef-vec and unpack to s32
        int32x4_t coef_low_32b  = vmovl_s16(vget_low_s16(coef_row_abs));
        int32x4_t coef_high_32b = vmovl_high_s16(coef_row_abs);

        // Load q_matrix elements
        int32x4_t quant_matrix_low  = vld1q_s32(q_matrix + i);
        int32x4_t quant_matrix_high = vld1q_s32(q_matrix + i + 4);

        // Multiply 2X: 32-bit coef with 32-bit q_matrix and add 64-bit offset_vector to store result as 64-bit
        int64x2_t coef_low_32b_first_half   = vmlal_s32(offset_vector, vget_low_s32 (coef_low_32b), vget_low_s32 (quant_matrix_low));
        int64x2_t coef_low_32b_second_half  = vmlal_s32(offset_vector, vget_high_s32(coef_low_32b), vget_high_s32(quant_matrix_low));

        int64x2_t coef_high_32b_first_half  = vmlal_s32(offset_vector, vget_low_s32 (coef_high_32b), vget_low_s32 (quant_matrix_high));
        int64x2_t coef_high_32b_second_half = vmlal_s32(offset_vector, vget_high_s32(coef_high_32b), vget_high_s32(quant_matrix_high));

        // Shift 64-bit results
        coef_low_32b_first_half   = vshlq_s64(coef_low_32b_first_half, shift_vector);
        coef_low_32b_second_half  = vshlq_s64(coef_low_32b_second_half, shift_vector);
        coef_high_32b_first_half  = vshlq_s64(coef_high_32b_first_half, shift_vector);
        coef_high_32b_second_half = vshlq_s64(coef_high_32b_second_half, shift_vector);

        // Narrow and combine 2X: 64x2 registers into one 32x4 register
        coef_low_32b  = vcombine_s32(vmovn_s64(coef_low_32b_first_half),  vmovn_s64(coef_low_32b_second_half));
        coef_high_32b = vcombine_s32(vmovn_s64(coef_high_32b_first_half), vmovn_s64(coef_high_32b_second_half));

        // Narrow and combine 2X: 32x4 registers into one 16x8 register
        int16x8_t output_vector = vcombine_s16(vmovn_s32(coef_low_32b), vmovn_s32(coef_high_32b));

        // Apply extracted coef sign to result
        output_vector = vbslq_s16(sign_mask, vnegq_s16(output_vector), output_vector);

        // Store result row into buffer
        vst1q_s16(coef + i, output_vector);
    }
    return OAPV_OK;
}


const oapv_fn_quant_t oapv_tbl_fn_quant_neon[2] =
{
    oapv_quant_neon,
    NULL
};

#endif /* ARM_NEON */