xref: /aosp_15_r20/external/libvpx/vpx_dsp/arm/highbd_quantize_neon.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <arm_neon.h>
12 
13 #include "./vpx_config.h"
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/arm/mem_neon.h"
16 #include "vp9/common/vp9_scan.h"
17 #include "vp9/encoder/vp9_block.h"
18 
highbd_calculate_dqcoeff_and_store(const int32x4_t dqcoeff_0,const int32x4_t dqcoeff_1,tran_low_t * dqcoeff_ptr)19 static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store(
20     const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1,
21     tran_low_t *dqcoeff_ptr) {
22   vst1q_s32(dqcoeff_ptr, dqcoeff_0);
23   vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
24 }
25 
highbd_quantize_8_neon(const int32x4_t coeff_0,const int32x4_t coeff_1,const int32x4_t zbin,const int32x4_t round,const int32x4_t quant,const int32x4_t quant_shift,int32x4_t * qcoeff_0,int32x4_t * qcoeff_1)26 static VPX_FORCE_INLINE void highbd_quantize_8_neon(
27     const int32x4_t coeff_0, const int32x4_t coeff_1, const int32x4_t zbin,
28     const int32x4_t round, const int32x4_t quant, const int32x4_t quant_shift,
29     int32x4_t *qcoeff_0, int32x4_t *qcoeff_1) {
30   // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
31   const int32x4_t coeff_0_sign = vshrq_n_s32(coeff_0, 31);
32   const int32x4_t coeff_1_sign = vshrq_n_s32(coeff_1, 31);
33   const int32x4_t coeff_0_abs = vabsq_s32(coeff_0);
34   const int32x4_t coeff_1_abs = vabsq_s32(coeff_1);
35 
36   // Calculate 2 masks of elements outside the bin
37   const int32x4_t zbin_mask_0 =
38       vreinterpretq_s32_u32(vcgeq_s32(coeff_0_abs, zbin));
39   const int32x4_t zbin_mask_1 = vreinterpretq_s32_u32(
40       vcgeq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(zbin), 1)));
41 
42   // Get the rounded values
43   const int32x4_t rounded_0 = vaddq_s32(coeff_0_abs, round);
44   const int32x4_t rounded_1 =
45       vaddq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(round), 1));
46 
47   // (round * (quant << 15) * 2) >> 16 == (round * quant)
48   int32x4_t qcoeff_tmp_0 = vqdmulhq_s32(rounded_0, quant);
49   int32x4_t qcoeff_tmp_1 =
50       vqdmulhq_s32(rounded_1, vdupq_lane_s32(vget_low_s32(quant), 1));
51 
52   // Add rounded values
53   qcoeff_tmp_0 = vaddq_s32(qcoeff_tmp_0, rounded_0);
54   qcoeff_tmp_1 = vaddq_s32(qcoeff_tmp_1, rounded_1);
55 
56   // (round * (quant_shift << 15) * 2) >> 16 == (round * quant_shift)
57   qcoeff_tmp_0 = vqdmulhq_s32(qcoeff_tmp_0, quant_shift);
58   qcoeff_tmp_1 =
59       vqdmulhq_s32(qcoeff_tmp_1, vdupq_lane_s32(vget_low_s32(quant_shift), 1));
60 
61   // Restore the sign bit.
62   qcoeff_tmp_0 = veorq_s32(qcoeff_tmp_0, coeff_0_sign);
63   qcoeff_tmp_1 = veorq_s32(qcoeff_tmp_1, coeff_1_sign);
64   qcoeff_tmp_0 = vsubq_s32(qcoeff_tmp_0, coeff_0_sign);
65   qcoeff_tmp_1 = vsubq_s32(qcoeff_tmp_1, coeff_1_sign);
66 
67   // Only keep the relevant coeffs
68   *qcoeff_0 = vandq_s32(qcoeff_tmp_0, zbin_mask_0);
69   *qcoeff_1 = vandq_s32(qcoeff_tmp_1, zbin_mask_1);
70 }
71 
72 static VPX_FORCE_INLINE int16x8_t
highbd_quantize_b_neon(const tran_low_t * coeff_ptr,tran_low_t * qcoeff_ptr,tran_low_t * dqcoeff_ptr,const int32x4_t zbin,const int32x4_t round,const int32x4_t quant,const int32x4_t quant_shift,const int32x4_t dequant)73 highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
74                        tran_low_t *dqcoeff_ptr, const int32x4_t zbin,
75                        const int32x4_t round, const int32x4_t quant,
76                        const int32x4_t quant_shift, const int32x4_t dequant) {
77   int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
78 
79   // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
80   const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
81   const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
82   highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
83                          &qcoeff_0, &qcoeff_1);
84 
85   // Store the 32-bit qcoeffs
86   vst1q_s32(qcoeff_ptr, qcoeff_0);
87   vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
88 
89   // Calculate and store the dqcoeffs
90   dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
91   dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
92 
93   highbd_calculate_dqcoeff_and_store(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
94 
95   return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
96 }
97 
// Quantizes n_coeffs coefficients from coeff_ptr in groups of eight, writing
// quantized values to qcoeff_ptr and dequantized values to dqcoeff_ptr.
//
// zbin, round, quant and quant_shift are read from mb_plane and dequant from
// dequant_ptr (four 16-bit entries each). The first group of eight uses lane 0
// for its first coefficient and lane 1 for the rest; every later group uses
// lane 1 throughout.
//
// *eob_ptr receives the largest scan_order->iscan entry among positions whose
// quantized value is non-zero, or 0 when every quantized value is zero.
//
// At least two groups (16 coefficients) are always processed, because the
// trailing loop runs its body before checking the remaining count.
void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                const struct macroblock_plane *const mb_plane,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const struct ScanOrder *const scan_order) {
  const int16x8_t all_bits = vdupq_n_s16(-1);
  const int16_t *const iscan = scan_order->iscan;
  uint16x8_t eob_acc;
  intptr_t pos;

  // Widen the 16-bit parameters to 32-bit lanes. Only the low four entries
  // are loaded: lane 0 is the value for the first coefficient and lane 1 is
  // the value used for all others, so the helpers can rebuild the "all lane 1"
  // vector themselves.
  int32x4_t zbin = vmovl_s16(vld1_s16(mb_plane->zbin));
  int32x4_t round = vmovl_s16(vld1_s16(mb_plane->round));
  // quant and quant_shift are moved up by 15 bits so that vqdmulhq_s32
  // (doubling multiply, high half) performs the multiply and >> 16 in one go.
  int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15);
  int32x4_t quant_shift =
      vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 15);
  int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));

  // First group of eight: the only one whose lane 0 differs from lane 1.
  {
    const uint16x8_t scan_pos = vreinterpretq_u16_s16(vld1q_s16(iscan));

    const int16x8_t q16 =
        highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
                               quant, quant_shift, dequant);

    // vtstq against all-ones gives an all-ones mask for non-zero lanes, which
    // selects the matching iscan entries.
    eob_acc = vandq_u16(vtstq_s16(q16, all_bits), scan_pos);

    __builtin_prefetch(coeff_ptr + 64);
  }

  // From here on every lane uses the lane-1 parameter values.
  zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
  round = vdupq_lane_s32(vget_low_s32(round), 1);
  quant = vdupq_lane_s32(vget_low_s32(quant), 1);
  quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
  dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);

  pos = 8;
  do {
    const uint16x8_t scan_pos = vreinterpretq_u16_s16(vld1q_s16(iscan + pos));

    const int16x8_t q16 = highbd_quantize_b_neon(
        coeff_ptr + pos, qcoeff_ptr + pos, dqcoeff_ptr + pos, zbin, round,
        quant, quant_shift, dequant);

    // Keep a running per-lane maximum of the selected iscan entries.
    eob_acc =
        vmaxq_u16(eob_acc, vandq_u16(vtstq_s16(q16, all_bits), scan_pos));

    __builtin_prefetch(coeff_ptr + pos + 64);
    pos += 8;
  } while (pos < n_coeffs);

  // Reduce the eight per-lane maxima to a single value.
#if VPX_ARCH_AARCH64
  *eob_ptr = vmaxvq_u16(eob_acc);
#else
  {
    const uint16x4_t folded_4 =
        vmax_u16(vget_low_u16(eob_acc), vget_high_u16(eob_acc));
    const uint16x4_t folded_2 = vpmax_u16(folded_4, folded_4);
    const uint16x4_t folded_1 = vpmax_u16(folded_2, folded_2);
    vst1_lane_u16(eob_ptr, folded_1, 0);
  }
#endif  // VPX_ARCH_AARCH64
}
180 
extract_sign_bit(int32x4_t a)181 static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) {
182   return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
183 }
184 
highbd_calculate_dqcoeff_and_store_32x32(int32x4_t dqcoeff_0,int32x4_t dqcoeff_1,tran_low_t * dqcoeff_ptr)185 static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store_32x32(
186     int32x4_t dqcoeff_0, int32x4_t dqcoeff_1, tran_low_t *dqcoeff_ptr) {
187   // Add 1 if negative to round towards zero because the C uses division.
188   dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
189   dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
190 
191   dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
192   dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
193   vst1q_s32(dqcoeff_ptr, dqcoeff_0);
194   vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
195 }
196 
highbd_quantize_b_32x32_neon(const tran_low_t * coeff_ptr,tran_low_t * qcoeff_ptr,tran_low_t * dqcoeff_ptr,const int32x4_t zbin,const int32x4_t round,const int32x4_t quant,const int32x4_t quant_shift,const int32x4_t dequant)197 static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon(
198     const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
199     tran_low_t *dqcoeff_ptr, const int32x4_t zbin, const int32x4_t round,
200     const int32x4_t quant, const int32x4_t quant_shift,
201     const int32x4_t dequant) {
202   int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
203 
204   // Load coeffs as 2 vectors of 4 x 32-bit ints each, take sign and abs values
205   const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
206   const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
207   highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
208                          &qcoeff_0, &qcoeff_1);
209 
210   // Store the 32-bit qcoeffs
211   vst1q_s32(qcoeff_ptr, qcoeff_0);
212   vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
213 
214   // Calculate and store the dqcoeffs
215   dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
216   dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
217 
218   highbd_calculate_dqcoeff_and_store_32x32(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
219 
220   return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
221 }
222 
// Quantizes a fixed block of 32 * 32 coefficients from coeff_ptr in groups of
// eight, writing quantized values to qcoeff_ptr and dequantized values (halved,
// rounded toward zero) to dqcoeff_ptr.
//
// zbin, round, quant and quant_shift are read from mb_plane and dequant from
// dequant_ptr (four 16-bit entries each). The first group of eight uses lane 0
// for its first coefficient and lane 1 for the rest; every later group uses
// lane 1 throughout.
//
// *eob_ptr receives the largest scan_order->iscan entry among positions whose
// quantized value is non-zero, or 0 when every quantized value is zero.
void vpx_highbd_quantize_b_32x32_neon(
    const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane,
    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
    uint16_t *eob_ptr, const struct ScanOrder *const scan_order) {
  const int16x8_t all_bits = vdupq_n_s16(-1);
  const int16_t *const iscan = scan_order->iscan;
  uint16x8_t eob_acc;
  int pos;

  // Widen the 16-bit parameters to 32-bit lanes. Only the low four entries
  // are loaded: lane 0 is the value for the first coefficient and lane 1 is
  // the value used for all others. zbin and round are halved with rounding
  // (vrshrq by 1) for this block size.
  int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->zbin)), 1);
  int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->round)), 1);
  // quant is moved up by 15 bits so that vqdmulhq_s32 (doubling multiply, high
  // half) yields a net >> 16. quant_shift is moved up by 16 bits instead, so
  // the second multiply ends up with a net >> 15.
  int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15);
  int32x4_t quant_shift =
      vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 16);
  int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));

  // First group of eight: the only one whose lane 0 differs from lane 1.
  {
    const uint16x8_t scan_pos = vreinterpretq_u16_s16(vld1q_s16(iscan));

    const int16x8_t q16 =
        highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
                                     round, quant, quant_shift, dequant);

    // vtstq against all-ones gives an all-ones mask for non-zero lanes, which
    // selects the matching iscan entries.
    eob_acc = vandq_u16(vtstq_s16(q16, all_bits), scan_pos);

    __builtin_prefetch(coeff_ptr + 64);
  }

  // From here on every lane uses the lane-1 parameter values.
  zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
  round = vdupq_lane_s32(vget_low_s32(round), 1);
  quant = vdupq_lane_s32(vget_low_s32(quant), 1);
  quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
  dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);

  // Remaining groups: offsets 8, 16, ..., 32 * 32 - 8.
  for (pos = 8; pos < 32 * 32; pos += 8) {
    const uint16x8_t scan_pos = vreinterpretq_u16_s16(vld1q_s16(iscan + pos));

    const int16x8_t q16 = highbd_quantize_b_32x32_neon(
        coeff_ptr + pos, qcoeff_ptr + pos, dqcoeff_ptr + pos, zbin, round,
        quant, quant_shift, dequant);

    // Keep a running per-lane maximum of the selected iscan entries.
    eob_acc =
        vmaxq_u16(eob_acc, vandq_u16(vtstq_s16(q16, all_bits), scan_pos));

    __builtin_prefetch(coeff_ptr + pos + 64);
  }

  // Reduce the eight per-lane maxima to a single value.
#if VPX_ARCH_AARCH64
  *eob_ptr = vmaxvq_u16(eob_acc);
#else
  {
    const uint16x4_t folded_4 =
        vmax_u16(vget_low_u16(eob_acc), vget_high_u16(eob_acc));
    const uint16x4_t folded_2 = vpmax_u16(folded_4, folded_4);
    const uint16x4_t folded_1 = vpmax_u16(folded_2, folded_2);
    vst1_lane_u16(eob_ptr, folded_1, 0);
  }
#endif  // VPX_ARCH_AARCH64
}
301