xref: /aosp_15_r20/external/libvpx/vpx_dsp/loongarch/quantize_lsx.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include "./vpx_config.h"
12 #include "./vpx_dsp_rtcd.h"
13 #include "vpx_util/loongson_intrinsics.h"
14 #include "vp9/common/vp9_scan.h"
15 #include "vp9/encoder/vp9_block.h"
16 
calculate_qcoeff(__m128i coeff,__m128i coeff_abs,__m128i round,__m128i quant,__m128i shift,__m128i cmp_mask)17 static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs,
18                                        __m128i round, __m128i quant,
19                                        __m128i shift, __m128i cmp_mask) {
20   __m128i rounded, qcoeff;
21 
22   rounded = __lsx_vsadd_h(coeff_abs, round);
23   qcoeff = __lsx_vmuh_h(rounded, quant);
24   qcoeff = __lsx_vadd_h(rounded, qcoeff);
25   qcoeff = __lsx_vmuh_h(qcoeff, shift);
26   qcoeff = __lsx_vsigncov_h(coeff, qcoeff);
27   qcoeff = __lsx_vand_v(qcoeff, cmp_mask);
28 
29   return qcoeff;
30 }
31 
calculate_dqcoeff_and_store(__m128i qcoeff,__m128i dequant,int16_t * dqcoeff)32 static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
33                                                int16_t *dqcoeff) {
34   __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant);
35   __lsx_vst(dqcoeff16, dqcoeff, 0);
36 }
37 
calculate_dqcoeff_and_store_32x32(__m128i qcoeff,__m128i dequant,int16_t * dqcoeff)38 static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff,
39                                                      __m128i dequant,
40                                                      int16_t *dqcoeff) {
41   // Un-sign to bias rounding like C.
42   __m128i low, high, dqcoeff32_0, dqcoeff32_1, res;
43   __m128i zero = __lsx_vldi(0);
44   __m128i coeff = __lsx_vabsd_h(qcoeff, zero);
45 
46   const __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero);
47   const __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero);
48 
49   low = __lsx_vmul_h(coeff, dequant);
50   high = __lsx_vmuh_h(coeff, dequant);
51   dqcoeff32_0 = __lsx_vilvl_h(high, low);
52   dqcoeff32_1 = __lsx_vilvh_h(high, low);
53 
54   // "Divide" by 2.
55   dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1);
56   dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1);
57   dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0);
58   dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1);
59   res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0);
60   __lsx_vst(res, dqcoeff, 0);
61 }
62 
scan_for_eob(__m128i coeff0,__m128i coeff1,const int16_t * scan,int index,__m128i zero)63 static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1,
64                                    const int16_t *scan, int index,
65                                    __m128i zero) {
66   const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero);
67   const __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero);
68   __m128i scan0 = __lsx_vld(scan + index, 0);
69   __m128i scan1 = __lsx_vld(scan + index + 8, 0);
70   __m128i eob0, eob1;
71 
72   eob0 = __lsx_vandn_v(zero_coeff0, scan0);
73   eob1 = __lsx_vandn_v(zero_coeff1, scan1);
74   return __lsx_vmax_h(eob0, eob1);
75 }
76 
accumulate_eob(__m128i eob)77 static INLINE int16_t accumulate_eob(__m128i eob) {
78   __m128i eob_shuffled;
79   int16_t res_m;
80 
81   eob_shuffled = __lsx_vshuf4i_w(eob, 0xe);
82   eob = __lsx_vmax_h(eob, eob_shuffled);
83   eob_shuffled = __lsx_vshuf4i_h(eob, 0xe);
84   eob = __lsx_vmax_h(eob, eob_shuffled);
85   eob_shuffled = __lsx_vshuf4i_h(eob, 0x1);
86   eob = __lsx_vmax_h(eob, eob_shuffled);
87   res_m = __lsx_vpickve2gr_h(eob, 1);
88 
89   return res_m;
90 }
91 
92 #if !CONFIG_VP9_HIGHBITDEPTH
93 
// LSX implementation of the block quantizer.
//
// Reads n_coeffs coefficients from coeff_ptr in groups of 16, writes the
// quantized values to qcoeff_ptr and the dequantized values to dqcoeff_ptr,
// and stores the end-of-block value derived from scan_order->iscan in
// *eob_ptr. Quantizer parameters come from mb_plane (zbin, round, quant,
// quant_shift) and dequant_ptr.
//
// The first group of 16 is always processed, so at least 16 elements are
// read and written regardless of n_coeffs.
// NOTE(review): the loop steps by 16, so n_coeffs is presumably a multiple of
// 16 - confirm against callers.
void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
                        const struct macroblock_plane *const mb_plane,
                        tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                        const int16_t *dequant_ptr, uint16_t *eob_ptr,
                        const struct ScanOrder *const scan_order) {
  const __m128i zero = __lsx_vldi(0);
  const int16_t *const iscan = scan_order->iscan;
  __m128i zbin = __lsx_vld(mb_plane->zbin, 0);
  __m128i round = __lsx_vld(mb_plane->round, 0);
  __m128i quant = __lsx_vld(mb_plane->quant, 0);
  __m128i quant_shift = __lsx_vld(mb_plane->quant_shift, 0);
  __m128i dequant = __lsx_vld(dequant_ptr, 0);
  __m128i in_lo, in_hi;
  __m128i q_lo, q_hi;
  __m128i keep_lo, keep_hi;
  __m128i eob;
  int index;

  // Handle one DC and first 15 AC.
  // The last argument of vld/vst is a byte offset: 16 addresses the second
  // group of eight 16-bit values.
  in_lo = __lsx_vld(coeff_ptr, 0);
  in_hi = __lsx_vld(coeff_ptr, 16);
  q_lo = __lsx_vabsd_h(in_lo, zero);
  q_hi = __lsx_vabsd_h(in_hi, zero);

  // The first eight lanes use the parameter vectors exactly as loaded.
  keep_lo = __lsx_vsle_h(zbin, q_lo);
  q_lo = calculate_qcoeff(in_lo, q_lo, round, quant, quant_shift, keep_lo);

  // From here on every lane uses the values from the upper 64 bits of each
  // parameter vector (vilvh_d of a vector with itself duplicates that half).
  zbin = __lsx_vilvh_d(zbin, zbin);
  round = __lsx_vilvh_d(round, round);
  quant = __lsx_vilvh_d(quant, quant);
  quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);

  keep_hi = __lsx_vsle_h(zbin, q_hi);
  q_hi = calculate_qcoeff(in_hi, q_hi, round, quant, quant_shift, keep_hi);

  __lsx_vst(q_lo, qcoeff_ptr, 0);
  __lsx_vst(q_hi, qcoeff_ptr, 16);

  // dequant gets the same treatment: as loaded for the first eight lanes,
  // upper half duplicated afterwards.
  calculate_dqcoeff_and_store(q_lo, dequant, dqcoeff_ptr);
  dequant = __lsx_vilvh_d(dequant, dequant);
  calculate_dqcoeff_and_store(q_hi, dequant, dqcoeff_ptr + 8);

  eob = scan_for_eob(q_lo, q_hi, iscan, 0, zero);

  // AC only loop.
  for (index = 16; index < n_coeffs; index += 16) {
    in_lo = __lsx_vld(coeff_ptr + index, 0);
    in_hi = __lsx_vld(coeff_ptr + index + 8, 0);

    q_lo = __lsx_vabsd_h(in_lo, zero);
    q_hi = __lsx_vabsd_h(in_hi, zero);

    keep_lo = __lsx_vsle_h(zbin, q_lo);
    keep_hi = __lsx_vsle_h(zbin, q_hi);

    q_lo = calculate_qcoeff(in_lo, q_lo, round, quant, quant_shift, keep_lo);
    q_hi = calculate_qcoeff(in_hi, q_hi, round, quant, quant_shift, keep_hi);

    __lsx_vst(q_lo, qcoeff_ptr + index, 0);
    __lsx_vst(q_hi, qcoeff_ptr + index + 8, 0);

    calculate_dqcoeff_and_store(q_lo, dequant, dqcoeff_ptr + index);
    calculate_dqcoeff_and_store(q_hi, dequant, dqcoeff_ptr + index + 8);

    // Keep a running per-lane maximum; it is reduced once after the loop.
    eob = __lsx_vmax_h(eob, scan_for_eob(q_lo, q_hi, iscan, index, zero));
  }

  *eob_ptr = accumulate_eob(eob);
}
169 
// LSX implementation of the 32x32 block quantizer.
//
// Processes exactly 32 * 32 coefficients from coeff_ptr in groups of 16,
// writing quantized values to qcoeff_ptr, dequantized values to dqcoeff_ptr
// and the end-of-block value derived from scan_order->iscan to *eob_ptr.
//
// Compared with the generic quantizer in this file, zbin and round are halved
// with rounding, quant_shift is doubled, and the dequantized product is
// halved (see calculate_dqcoeff_and_store_32x32).
void vpx_quantize_b_32x32_lsx(const tran_low_t *coeff_ptr,
                              const struct macroblock_plane *const mb_plane,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
                              const struct ScanOrder *const scan_order) {
  const __m128i zero = __lsx_vldi(0);
  const int16_t *const iscan = scan_order->iscan;
  // vsrari is a rounding right shift: (x + 1) >> 1 per lane.
  __m128i zbin = __lsx_vsrari_h(__lsx_vld(mb_plane->zbin, 0), 1);
  __m128i round = __lsx_vsrari_h(__lsx_vld(mb_plane->round, 0), 1);
  __m128i quant = __lsx_vld(mb_plane->quant, 0);
  __m128i quant_shift = __lsx_vslli_h(__lsx_vld(mb_plane->quant_shift, 0), 1);
  __m128i dequant = __lsx_vld(dequant_ptr, 0);
  __m128i in_lo, in_hi;
  __m128i q_lo, q_hi;
  __m128i keep_lo, keep_hi;
  __m128i eob;
  int index;

  // Handle one DC and first 15 AC.
  // The last argument of vld/vst is a byte offset: 16 addresses the second
  // group of eight 16-bit values.
  in_lo = __lsx_vld(coeff_ptr, 0);
  in_hi = __lsx_vld(coeff_ptr, 16);
  q_lo = __lsx_vabsd_h(in_lo, zero);
  q_hi = __lsx_vabsd_h(in_hi, zero);

  // The first eight lanes use the parameter vectors exactly as prepared.
  keep_lo = __lsx_vsle_h(zbin, q_lo);
  q_lo = calculate_qcoeff(in_lo, q_lo, round, quant, quant_shift, keep_lo);

  // Remove DC from zbin, round, quant and quant_shift: vilvh_d of a vector
  // with itself duplicates its upper 64 bits into both halves.
  zbin = __lsx_vilvh_d(zbin, zbin);
  round = __lsx_vilvh_d(round, round);
  quant = __lsx_vilvh_d(quant, quant);
  quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);

  keep_hi = __lsx_vsle_h(zbin, q_hi);
  q_hi = calculate_qcoeff(in_hi, q_hi, round, quant, quant_shift, keep_hi);

  __lsx_vst(q_lo, qcoeff_ptr, 0);
  __lsx_vst(q_hi, qcoeff_ptr, 16);

  // dequant gets the same treatment: as loaded for the first eight lanes,
  // upper half duplicated afterwards.
  calculate_dqcoeff_and_store_32x32(q_lo, dequant, dqcoeff_ptr);
  dequant = __lsx_vilvh_d(dequant, dequant);
  calculate_dqcoeff_and_store_32x32(q_hi, dequant, dqcoeff_ptr + 8);

  eob = scan_for_eob(q_lo, q_hi, iscan, 0, zero);

  // AC only loop.
  for (index = 16; index < 32 * 32; index += 16) {
    in_lo = __lsx_vld(coeff_ptr + index, 0);
    in_hi = __lsx_vld(coeff_ptr + index + 8, 0);

    q_lo = __lsx_vabsd_h(in_lo, zero);
    q_hi = __lsx_vabsd_h(in_hi, zero);

    keep_lo = __lsx_vsle_h(zbin, q_lo);
    keep_hi = __lsx_vsle_h(zbin, q_hi);

    q_lo = calculate_qcoeff(in_lo, q_lo, round, quant, quant_shift, keep_lo);
    q_hi = calculate_qcoeff(in_hi, q_hi, round, quant, quant_shift, keep_hi);

    __lsx_vst(q_lo, qcoeff_ptr + index, 0);
    __lsx_vst(q_hi, qcoeff_ptr + index + 8, 0);

    calculate_dqcoeff_and_store_32x32(q_lo, dequant, dqcoeff_ptr + index);
    calculate_dqcoeff_and_store_32x32(q_hi, dequant, dqcoeff_ptr + 8 + index);

    // Keep a running per-lane maximum; it is reduced once after the loop.
    eob = __lsx_vmax_h(eob, scan_for_eob(q_lo, q_hi, iscan, index, zero));
  }

  *eob_ptr = accumulate_eob(eob);
}
244 #endif
245