1 /*
2 * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vpx_config.h"
12 #include "./vpx_dsp_rtcd.h"
13 #include "vpx_util/loongson_intrinsics.h"
14 #include "vp9/common/vp9_scan.h"
15 #include "vp9/encoder/vp9_block.h"
16
calculate_qcoeff(__m128i coeff,__m128i coeff_abs,__m128i round,__m128i quant,__m128i shift,__m128i cmp_mask)17 static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs,
18 __m128i round, __m128i quant,
19 __m128i shift, __m128i cmp_mask) {
20 __m128i rounded, qcoeff;
21
22 rounded = __lsx_vsadd_h(coeff_abs, round);
23 qcoeff = __lsx_vmuh_h(rounded, quant);
24 qcoeff = __lsx_vadd_h(rounded, qcoeff);
25 qcoeff = __lsx_vmuh_h(qcoeff, shift);
26 qcoeff = __lsx_vsigncov_h(coeff, qcoeff);
27 qcoeff = __lsx_vand_v(qcoeff, cmp_mask);
28
29 return qcoeff;
30 }
31
calculate_dqcoeff_and_store(__m128i qcoeff,__m128i dequant,int16_t * dqcoeff)32 static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
33 int16_t *dqcoeff) {
34 __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant);
35 __lsx_vst(dqcoeff16, dqcoeff, 0);
36 }
37
// 32x32 dequantize: dqcoeff = (|qcoeff| * dequant) >> 1, with the sign of
// qcoeff reapplied afterwards.  The product is widened to 32 bits before
// halving so the shift is exact, then narrowed back to eight int16 lanes
// and stored.
static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff,
                                                     __m128i dequant,
                                                     int16_t *dqcoeff) {
  // Un-sign to bias rounding like C.
  __m128i low, high, dqcoeff32_0, dqcoeff32_1, res;
  __m128i zero = __lsx_vldi(0);
  __m128i coeff = __lsx_vabsd_h(qcoeff, zero);

  // Interleave qcoeff into the high half of each 32-bit lane; only its
  // sign matters for the vsigncov_w below.
  const __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero);
  const __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero);

  // Build 32-bit products from the low/high 16-bit multiply halves.
  low = __lsx_vmul_h(coeff, dequant);
  high = __lsx_vmuh_h(coeff, dequant);
  dqcoeff32_0 = __lsx_vilvl_h(high, low);
  dqcoeff32_1 = __lsx_vilvh_h(high, low);

  // "Divide" by 2.
  dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1);
  dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1);
  // Restore signs, then pack the even (low) 16 bits of each 32-bit lane.
  dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0);
  dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1);
  res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0);
  __lsx_vst(res, dqcoeff, 0);
}
62
// For each 16-bit lane, keep the iscan position where the corresponding
// quantized coefficient is non-zero (lanes with a zero coefficient become
// 0), and max-combine the two input vectors.  The caller max-accumulates
// the result and reduces it with accumulate_eob().
// NOTE(review): no visible index->count conversion happens here or in the
// callers (cf. the "+1" the x86 helpers apply by subtracting the zbin
// mask), so a block whose only non-zero coefficient sits at iscan
// position 0 produces the same result as an all-zero block — confirm the
// off-by-one is compensated for elsewhere.
static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1,
                                   const int16_t *scan, int index,
                                   __m128i zero) {
  const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero);
  const __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero);
  // vld takes a byte offset; the element offset is applied via pointer
  // arithmetic instead.
  __m128i scan0 = __lsx_vld(scan + index, 0);
  __m128i scan1 = __lsx_vld(scan + index + 8, 0);
  __m128i eob0, eob1;

  // andn clears the lanes whose coefficient compared equal to zero.
  eob0 = __lsx_vandn_v(zero_coeff0, scan0);
  eob1 = __lsx_vandn_v(zero_coeff1, scan1);
  return __lsx_vmax_h(eob0, eob1);
}
76
// Horizontally max-reduce the eight 16-bit lanes of eob to a scalar.
static INLINE int16_t accumulate_eob(__m128i eob) {
  __m128i eob_shuffled;
  int16_t res_m;

  // Fold the upper 64 bits onto the lower 64 bits.
  eob_shuffled = __lsx_vshuf4i_w(eob, 0xe);
  eob = __lsx_vmax_h(eob, eob_shuffled);
  // Fold the upper 32 bits of each half.
  eob_shuffled = __lsx_vshuf4i_h(eob, 0xe);
  eob = __lsx_vmax_h(eob, eob_shuffled);
  // Swap the remaining pair; lanes 0 and 1 both end up holding the max.
  eob_shuffled = __lsx_vshuf4i_h(eob, 0x1);
  eob = __lsx_vmax_h(eob, eob_shuffled);
  res_m = __lsx_vpickve2gr_h(eob, 1);

  return res_m;
}
91
92 #if !CONFIG_VP9_HIGHBITDEPTH
93
// LSX implementation of vpx_quantize_b: quantizes n_coeffs coefficients,
// 16 per iteration (the first iteration covers the DC lane plus 15 AC),
// writing quantized (qcoeff_ptr) and dequantized (dqcoeff_ptr) values and
// the end-of-block position (*eob_ptr).
// Fix: coeff_ptr is declared const tran_low_t * to match the sibling
// vpx_quantize_b_32x32_lsx and the rtcd prototype; under the enclosing
// !CONFIG_VP9_HIGHBITDEPTH guard tran_low_t is int16_t, so behavior and
// ABI are unchanged.
void vpx_quantize_b_lsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        const struct macroblock_plane *const mb_plane,
                        tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                        const int16_t *dequant_ptr, uint16_t *eob_ptr,
                        const struct ScanOrder *const scan_order) {
  __m128i zero = __lsx_vldi(0);
  int index = 16;
  const int16_t *iscan = scan_order->iscan;

  __m128i zbin, round, quant, dequant, quant_shift;
  __m128i coeff0, coeff1;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1;
  __m128i eob, eob0;

  // Lane 0 of each loaded table is the DC entry; the rest are AC entries.
  zbin = __lsx_vld(mb_plane->zbin, 0);
  round = __lsx_vld(mb_plane->round, 0);
  quant = __lsx_vld(mb_plane->quant, 0);
  dequant = __lsx_vld(dequant_ptr, 0);
  quant_shift = __lsx_vld(mb_plane->quant_shift, 0);
  // Handle one DC and first 15 AC.
  DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1);
  qcoeff0 = __lsx_vabsd_h(coeff0, zero);
  qcoeff1 = __lsx_vabsd_h(coeff1, zero);

  // Mask of lanes whose |coeff| reaches the zbin threshold.
  cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
  // Duplicate the AC half of zbin so the DC lane is no longer used.
  zbin = __lsx_vilvh_d(zbin, zbin);
  cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);

  qcoeff0 =
      calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
  // Switch round/quant/quant_shift to their AC-only values as well.
  round = __lsx_vilvh_d(round, round);
  quant = __lsx_vilvh_d(quant, quant);
  quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
  qcoeff1 =
      calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);

  __lsx_vst(qcoeff0, qcoeff_ptr, 0);
  __lsx_vst(qcoeff1, qcoeff_ptr, 16);

  calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
  // Drop the DC lane of dequant for all remaining coefficients.
  dequant = __lsx_vilvh_d(dequant, dequant);
  calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);

  eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
  // AC only loop.
  while (index < n_coeffs) {
    coeff0 = __lsx_vld(coeff_ptr + index, 0);
    coeff1 = __lsx_vld(coeff_ptr + index + 8, 0);

    qcoeff0 = __lsx_vabsd_h(coeff0, zero);
    qcoeff1 = __lsx_vabsd_h(coeff1, zero);

    cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
    cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);

    qcoeff0 =
        calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
    qcoeff1 =
        calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);

    __lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
    __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);

    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);

    eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
    eob = __lsx_vmax_h(eob, eob0);

    index += 16;
  }

  *eob_ptr = accumulate_eob(eob);
}
169
// 32x32 variant of vpx_quantize_b for LSX over a fixed 32*32 coefficients.
// Differences from the regular path mirror the C reference: zbin and round
// are halved (with rounding), quant_shift is doubled (vmuh_h shifts by 16
// where the 32x32 C path shifts by 15), and the dequantized values are
// halved inside calculate_dqcoeff_and_store_32x32.
void vpx_quantize_b_32x32_lsx(const tran_low_t *coeff_ptr,
                              const struct macroblock_plane *const mb_plane,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
                              const struct ScanOrder *const scan_order) {
  __m128i zero = __lsx_vldi(0);
  int index;
  const int16_t *iscan = scan_order->iscan;

  __m128i zbin, round, quant, dequant, quant_shift;
  __m128i coeff0, coeff1, qcoeff0, qcoeff1, cmp_mask0, cmp_mask1;
  __m128i eob = zero, eob0;

  // Halve zbin and round with rounding (vsrari = rounding arithmetic
  // shift right).
  zbin = __lsx_vld(mb_plane->zbin, 0);
  zbin = __lsx_vsrari_h(zbin, 1);
  round = __lsx_vld(mb_plane->round, 0);
  round = __lsx_vsrari_h(round, 1);

  quant = __lsx_vld(mb_plane->quant, 0);
  dequant = __lsx_vld(dequant_ptr, 0);
  quant_shift = __lsx_vld(mb_plane->quant_shift, 0);
  quant_shift = __lsx_vslli_h(quant_shift, 1);
  // Handle one DC and first 15 AC.
  DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1);
  qcoeff0 = __lsx_vabsd_h(coeff0, zero);
  qcoeff1 = __lsx_vabsd_h(coeff1, zero);

  cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
  // Remove DC from zbin (duplicate the AC half over the whole vector).
  zbin = __lsx_vilvh_d(zbin, zbin);
  cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);

  qcoeff0 =
      calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
  // Remove DC from round, quant and quant_shift.
  round = __lsx_vilvh_d(round, round);
  quant = __lsx_vilvh_d(quant, quant);
  quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
  qcoeff1 =
      calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
  __lsx_vst(qcoeff0, qcoeff_ptr, 0);
  __lsx_vst(qcoeff1, qcoeff_ptr, 16);

  calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr);
  // Remove DC from dequant for all remaining coefficients.
  dequant = __lsx_vilvh_d(dequant, dequant);
  calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8);
  eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
  // AC only loop.
  for (index = 16; index < 32 * 32; index += 16) {
    coeff0 = __lsx_vld(coeff_ptr + index, 0);
    coeff1 = __lsx_vld(coeff_ptr + index + 8, 0);

    qcoeff0 = __lsx_vabsd_h(coeff0, zero);
    qcoeff1 = __lsx_vabsd_h(coeff1, zero);

    cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
    cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);

    qcoeff0 =
        calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
    qcoeff1 =
        calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
    __lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
    __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);

    calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index);
    calculate_dqcoeff_and_store_32x32(qcoeff1, dequant,
                                      dqcoeff_ptr + 8 + index);
    eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
    eob = __lsx_vmax_h(eob, eob0);
  }

  *eob_ptr = accumulate_eob(eob);
}
244 #endif
245