1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <emmintrin.h>
13 #include <xmmintrin.h>
14
15 #include "config/av1_rtcd.h"
16
17 #include "aom/aom_integer.h"
18 #include "aom_dsp/x86/quantize_x86.h"
19
read_coeff(const tran_low_t * coeff,intptr_t offset,__m128i * c0,__m128i * c1)20 static inline void read_coeff(const tran_low_t *coeff, intptr_t offset,
21 __m128i *c0, __m128i *c1) {
22 const tran_low_t *addr = coeff + offset;
23 if (sizeof(tran_low_t) == 4) {
24 const __m128i x0 = _mm_load_si128((const __m128i *)addr);
25 const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1);
26 const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2);
27 const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3);
28 *c0 = _mm_packs_epi32(x0, x1);
29 *c1 = _mm_packs_epi32(x2, x3);
30 } else {
31 *c0 = _mm_load_si128((const __m128i *)addr);
32 *c1 = _mm_load_si128((const __m128i *)addr + 1);
33 }
34 }
35
// Stores the sixteen 16-bit lanes held in *qc0 and *qc1 to qcoeff + offset.
//
// When tran_low_t is 4 bytes wide each lane is sign-extended to 32 bits
// before being written, so four vectors are stored. Otherwise the two input
// vectors are stored as-is.
//
// All stores are aligned stores, so qcoeff + offset must be 16-byte aligned.
static inline void write_qcoeff(const __m128i *qc0, const __m128i *qc1,
                                tran_low_t *qcoeff, intptr_t offset) {
  __m128i *dst = (__m128i *)(qcoeff + offset);

  if (sizeof(tran_low_t) != 4) {
    _mm_store_si128(dst, *qc0);
    _mm_store_si128(dst + 1, *qc1);
    return;
  }

  const __m128i zero = _mm_setzero_si128();

  // A "less than zero" compare yields 0xFFFF in negative lanes and 0 in the
  // rest, which is exactly the upper half of the sign-extended 32-bit value.
  // Interleaving value and mask therefore widens each lane.
  const __m128i first = *qc0;
  const __m128i first_ext = _mm_cmplt_epi16(first, zero);
  const __m128i first_lo = _mm_unpacklo_epi16(first, first_ext);
  const __m128i first_hi = _mm_unpackhi_epi16(first, first_ext);
  _mm_store_si128(dst, first_lo);
  _mm_store_si128(dst + 1, first_hi);

  const __m128i second = *qc1;
  const __m128i second_ext = _mm_cmplt_epi16(second, zero);
  const __m128i second_lo = _mm_unpacklo_epi16(second, second_ext);
  const __m128i second_hi = _mm_unpackhi_epi16(second, second_ext);
  _mm_store_si128(dst + 2, second_lo);
  _mm_store_si128(dst + 3, second_hi);
}
57
// Zero-fills the output region at qcoeff + offset: four 16-byte vectors when
// tran_low_t is 4 bytes wide, two vectors otherwise.
//
// Uses aligned stores, so qcoeff + offset must be 16-byte aligned.
static inline void write_zero(tran_low_t *qcoeff, intptr_t offset) {
  __m128i *dst = (__m128i *)(qcoeff + offset);
  const __m128i zero = _mm_setzero_si128();
  const int num_vectors = (sizeof(tran_low_t) == 4) ? 4 : 2;

  for (int i = 0; i < num_vectors; ++i) {
    _mm_store_si128(dst + i, zero);
  }
}
71
// Quantizes and dequantizes the group of 16 coefficients found at offset
// n_coeffs from coeff_ptr, writing the results at the same offset in
// qcoeff_ptr and dqcoeff_ptr.
//
// round0/quant0/dequant0/thr0 apply to the first eight coefficients of the
// group; round1/quant1/dequant1/thr1 apply to the last eight.
//
// If no coefficient magnitude in the group reaches its threshold, both
// outputs are zero-filled and *eob is left untouched. Otherwise each lane of
// *eob is raised to (iscan value + 1) wherever the dequantized value is
// nonzero; lanes are kept as a running signed 16-bit maximum.
static inline void quantize(const int16_t *iscan_ptr,
                            const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            const __m128i *round0, const __m128i *round1,
                            const __m128i *quant0, const __m128i *quant1,
                            const __m128i *dequant0, const __m128i *dequant1,
                            const __m128i *thr0, const __m128i *thr1,
                            __m128i *eob) {
  __m128i in_lo, in_hi;
  read_coeff(coeff_ptr, n_coeffs, &in_lo, &in_hi);

  // Arithmetic shift by 15 smears the sign bit: 0xFFFF for negative lanes,
  // 0 for the rest.
  const __m128i sign_lo = _mm_srai_epi16(in_lo, 15);
  const __m128i sign_hi = _mm_srai_epi16(in_hi, 15);

  // Magnitude via (x ^ sign) - sign.
  const __m128i abs_lo = _mm_sub_epi16(_mm_xor_si128(in_lo, sign_lo), sign_lo);
  const __m128i abs_hi = _mm_sub_epi16(_mm_xor_si128(in_hi, sign_hi), sign_hi);

  // Lanes whose magnitude is greater than or equal to the threshold.
  const __m128i reach_lo = _mm_or_si128(_mm_cmpgt_epi16(abs_lo, *thr0),
                                        _mm_cmpeq_epi16(abs_lo, *thr0));
  const __m128i reach_hi = _mm_or_si128(_mm_cmpgt_epi16(abs_hi, *thr1),
                                        _mm_cmpeq_epi16(abs_hi, *thr1));
  const int any_reach = _mm_movemask_epi8(_mm_or_si128(reach_lo, reach_hi));

  if (!any_reach) {
    // Whole group is below threshold: emit zeros and skip the arithmetic.
    write_zero(qcoeff_ptr, n_coeffs);
    write_zero(dqcoeff_ptr, n_coeffs);
    return;
  }

  // Quantized magnitude: high 16 bits of (|x| + round) * quant, with the
  // addition saturating.
  const __m128i mag_lo =
      _mm_mulhi_epi16(_mm_adds_epi16(abs_lo, *round0), *quant0);
  const __m128i mag_hi =
      _mm_mulhi_epi16(_mm_adds_epi16(abs_hi, *round1), *quant1);

  // Put the original signs back with the same xor/subtract trick.
  const __m128i q_lo = _mm_sub_epi16(_mm_xor_si128(mag_lo, sign_lo), sign_lo);
  const __m128i q_hi = _mm_sub_epi16(_mm_xor_si128(mag_hi, sign_hi), sign_hi);
  write_qcoeff(&q_lo, &q_hi, qcoeff_ptr, n_coeffs);

  // Dequantized value: low 16 bits of q * dequant.
  const __m128i dq_lo = _mm_mullo_epi16(q_lo, *dequant0);
  const __m128i dq_hi = _mm_mullo_epi16(q_hi, *dequant1);
  write_qcoeff(&dq_lo, &dq_hi, dqcoeff_ptr, n_coeffs);

  // End-of-block tracking. Comparing the "is zero" mask against zero flips
  // it, giving 0xFFFF in lanes with a nonzero dequantized value.
  const __m128i zero = _mm_setzero_si128();
  const __m128i nz_lo = _mm_cmpeq_epi16(_mm_cmpeq_epi16(dq_lo, zero), zero);
  const __m128i nz_hi = _mm_cmpeq_epi16(_mm_cmpeq_epi16(dq_hi, zero), zero);

  const __m128i *iscan = (const __m128i *)(iscan_ptr + n_coeffs);
  const __m128i iscan_lo = _mm_load_si128(iscan);
  const __m128i iscan_hi = _mm_load_si128(iscan + 1);

  // Subtracting the 0xFFFF (-1) mask adds one in nonzero lanes, turning a
  // scan index into a count; the AND then clears the lanes that are zero.
  const __m128i count_lo = _mm_and_si128(_mm_sub_epi16(iscan_lo, nz_lo), nz_lo);
  const __m128i count_hi = _mm_and_si128(_mm_sub_epi16(iscan_hi, nz_hi), nz_hi);

  *eob = _mm_max_epi16(*eob, _mm_max_epi16(count_lo, count_hi));
}
138
// SSE2 quantizer over n_coeffs coefficients, processed in groups of 16.
//
// coeff_ptr    input coefficients.
// n_coeffs     number of coefficients; one group of 16 is always processed,
//              further groups follow while coefficients remain.
// round_ptr, quant_ptr, dequant_ptr
//              tables of eight int16_t values, read with one aligned load
//              each. The first group uses all eight lanes for its first
//              eight coefficients; everything after that uses the upper four
//              lanes repeated (the "AC" entries, the first lane being "DC").
// qcoeff_ptr   receives quantized coefficients.
// dqcoeff_ptr  receives dequantized coefficients.
// eob_ptr      receives the largest (iscan value + 1) over all coefficients
//              whose dequantized value is nonzero, or 0 if there is none.
// iscan_ptr    per-coefficient scan positions used for the eob computation.
// zbin_ptr, quant_shift_ptr, scan_ptr are unused.
//
// A group is zeroed outright when none of its magnitudes reaches half the
// dequant value (dequant >> 1). All vector accesses are aligned, so the
// buffers and tables must be 16-byte aligned.
void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *zbin_ptr, const int16_t *round_ptr,
                          const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr,
                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan_ptr, const int16_t *iscan_ptr) {
  (void)scan_ptr;
  (void)zbin_ptr;
  (void)quant_shift_ptr;

  // Address every buffer from its end so that a single negative offset can
  // count upwards and terminate at zero.
  const tran_low_t *const coeff_end = coeff_ptr + n_coeffs;
  const int16_t *const iscan_end = iscan_ptr + n_coeffs;
  tran_low_t *const qcoeff_end = qcoeff_ptr + n_coeffs;
  tran_low_t *const dqcoeff_end = dqcoeff_ptr + n_coeffs;
  intptr_t offset = -n_coeffs;

  // "head" vectors carry all eight table lanes; "tail" vectors repeat the
  // upper four lanes in both halves.
  const __m128i round_head = _mm_load_si128((const __m128i *)round_ptr);
  const __m128i round_tail = _mm_unpackhi_epi64(round_head, round_head);
  const __m128i quant_head = _mm_load_si128((const __m128i *)quant_ptr);
  const __m128i quant_tail = _mm_unpackhi_epi64(quant_head, quant_head);
  const __m128i dequant_head = _mm_load_si128((const __m128i *)dequant_ptr);
  const __m128i dequant_tail = _mm_unpackhi_epi64(dequant_head, dequant_head);
  const __m128i thr_head = _mm_srai_epi16(dequant_head, 1);
  const __m128i thr_tail = _mm_srai_epi16(dequant_tail, 1);
  __m128i eob = _mm_setzero_si128();

  // First group: its first eight lanes use the "head" parameters.
  quantize(iscan_end, coeff_end, offset, qcoeff_end, dqcoeff_end, &round_head,
           &round_tail, &quant_head, &quant_tail, &dequant_head, &dequant_tail,
           &thr_head, &thr_tail, &eob);

  // Remaining groups use the "tail" parameters throughout.
  for (offset += 8 * 2; offset < 0; offset += 8 * 2) {
    quantize(iscan_end, coeff_end, offset, qcoeff_end, dqcoeff_end,
             &round_tail, &round_tail, &quant_tail, &quant_tail, &dequant_tail,
             &dequant_tail, &thr_tail, &thr_tail, &eob);
  }

  // Horizontal maximum of the eight eob lanes: fold the upper 64 bits onto
  // the lower, then the upper pair of words onto the lower pair, then swap
  // the two remaining words so that word 1 ends up holding the overall max.
  eob = _mm_max_epi16(eob, _mm_shuffle_epi32(eob, 0xe));
  eob = _mm_max_epi16(eob, _mm_shufflelo_epi16(eob, 0xe));
  eob = _mm_max_epi16(eob, _mm_shufflelo_epi16(eob, 0x1));
  *eob_ptr = _mm_extract_epi16(eob, 1);
}
191
// Quantizes and dequantizes the group of 16 int16_t coefficients found at
// offset n_coeffs from coeff_ptr, writing the results at the same offset in
// qcoeff_ptr and dqcoeff_ptr.
//
// round0/quant0/dequant0 apply to the first eight coefficients of the group;
// round1/quant1/dequant1 apply to the last eight. Each lane of *eob is raised
// to (iscan value + 1) wherever the dequantized value is nonzero; lanes are
// kept as a running signed 16-bit maximum.
//
// All loads and stores are aligned, so every buffer plus offset must be
// 16-byte aligned.
static inline void quantize_lp(const int16_t *iscan_ptr,
                               const int16_t *coeff_ptr, intptr_t n_coeffs,
                               int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                               const __m128i *round0, const __m128i *round1,
                               const __m128i *quant0, const __m128i *quant1,
                               const __m128i *dequant0, const __m128i *dequant1,
                               __m128i *eob) {
  const __m128i *src = (const __m128i *)(coeff_ptr + n_coeffs);
  const __m128i in_lo = _mm_load_si128(src);
  const __m128i in_hi = _mm_load_si128(src + 1);

  // Arithmetic shift by 15 smears the sign bit: 0xFFFF for negative lanes,
  // 0 for the rest.
  const __m128i sign_lo = _mm_srai_epi16(in_lo, 15);
  const __m128i sign_hi = _mm_srai_epi16(in_hi, 15);

  // Magnitude via (x ^ sign) - sign.
  const __m128i abs_lo = _mm_sub_epi16(_mm_xor_si128(in_lo, sign_lo), sign_lo);
  const __m128i abs_hi = _mm_sub_epi16(_mm_xor_si128(in_hi, sign_hi), sign_hi);

  // Quantized magnitude: high 16 bits of (|x| + round) * quant, with the
  // addition saturating.
  const __m128i mag_lo =
      _mm_mulhi_epi16(_mm_adds_epi16(abs_lo, *round0), *quant0);
  const __m128i mag_hi =
      _mm_mulhi_epi16(_mm_adds_epi16(abs_hi, *round1), *quant1);

  // Put the original signs back with the same xor/subtract trick.
  const __m128i q_lo = _mm_sub_epi16(_mm_xor_si128(mag_lo, sign_lo), sign_lo);
  const __m128i q_hi = _mm_sub_epi16(_mm_xor_si128(mag_hi, sign_hi), sign_hi);

  __m128i *q_dst = (__m128i *)(qcoeff_ptr + n_coeffs);
  _mm_store_si128(q_dst, q_lo);
  _mm_store_si128(q_dst + 1, q_hi);

  // Dequantized value: low 16 bits of q * dequant.
  const __m128i dq_lo = _mm_mullo_epi16(q_lo, *dequant0);
  const __m128i dq_hi = _mm_mullo_epi16(q_hi, *dequant1);

  __m128i *dq_dst = (__m128i *)(dqcoeff_ptr + n_coeffs);
  _mm_store_si128(dq_dst, dq_lo);
  _mm_store_si128(dq_dst + 1, dq_hi);

  // End-of-block tracking. Comparing the "is zero" mask against zero flips
  // it, giving 0xFFFF in lanes with a nonzero dequantized value.
  const __m128i zero = _mm_setzero_si128();
  const __m128i nz_lo = _mm_cmpeq_epi16(_mm_cmpeq_epi16(dq_lo, zero), zero);
  const __m128i nz_hi = _mm_cmpeq_epi16(_mm_cmpeq_epi16(dq_hi, zero), zero);

  const __m128i *iscan = (const __m128i *)(iscan_ptr + n_coeffs);
  const __m128i iscan_lo = _mm_load_si128(iscan);
  const __m128i iscan_hi = _mm_load_si128(iscan + 1);

  // Subtracting the 0xFFFF (-1) mask adds one in nonzero lanes, turning a
  // scan index into a count; the AND then clears the lanes that are zero.
  const __m128i count_lo = _mm_and_si128(_mm_sub_epi16(iscan_lo, nz_lo), nz_lo);
  const __m128i count_hi = _mm_and_si128(_mm_sub_epi16(iscan_hi, nz_hi), nz_hi);

  *eob = _mm_max_epi16(*eob, _mm_max_epi16(count_lo, count_hi));
}
253
// SSE2 quantizer for int16_t coefficients, processed in groups of 16.
//
// coeff_ptr    input coefficients.
// n_coeffs     number of coefficients; one group of 16 is always processed,
//              further groups follow while coefficients remain.
// round_ptr, quant_ptr, dequant_ptr
//              tables of eight int16_t values, read with one aligned load
//              each. The first group uses all eight lanes for its first
//              eight coefficients; everything after that uses the upper four
//              lanes repeated (the "AC" entries, the first lane being "DC").
// qcoeff_ptr   receives quantized coefficients.
// dqcoeff_ptr  receives dequantized coefficients.
// eob_ptr      receives accumulate_eob() applied to the per-lane running
//              maximum of (iscan value + 1) over nonzero dequantized values.
// iscan        per-coefficient scan positions used for the eob computation.
// scan is unused.
//
// All vector accesses are aligned, so the buffers and tables must be 16-byte
// aligned.
void av1_quantize_lp_sse2(const int16_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan, const int16_t *iscan) {
  (void)scan;

  // Address every buffer from its end so that a single negative offset can
  // count upwards and terminate at zero.
  const int16_t *const coeff_end = coeff_ptr + n_coeffs;
  const int16_t *const iscan_end = iscan + n_coeffs;
  int16_t *const qcoeff_end = qcoeff_ptr + n_coeffs;
  int16_t *const dqcoeff_end = dqcoeff_ptr + n_coeffs;
  intptr_t offset = -n_coeffs;

  // "head" vectors carry all eight table lanes; "tail" vectors repeat the
  // upper four lanes in both halves.
  const __m128i round_head = _mm_load_si128((const __m128i *)round_ptr);
  const __m128i round_tail = _mm_unpackhi_epi64(round_head, round_head);
  const __m128i quant_head = _mm_load_si128((const __m128i *)quant_ptr);
  const __m128i quant_tail = _mm_unpackhi_epi64(quant_head, quant_head);
  const __m128i dequant_head = _mm_load_si128((const __m128i *)dequant_ptr);
  const __m128i dequant_tail = _mm_unpackhi_epi64(dequant_head, dequant_head);
  __m128i eob = _mm_setzero_si128();

  // First group: its first eight lanes use the "head" parameters.
  quantize_lp(iscan_end, coeff_end, offset, qcoeff_end, dqcoeff_end,
              &round_head, &round_tail, &quant_head, &quant_tail,
              &dequant_head, &dequant_tail, &eob);

  // Remaining groups use the "tail" parameters throughout.
  for (offset += 8 * 2; offset < 0; offset += 8 * 2) {
    quantize_lp(iscan_end, coeff_end, offset, qcoeff_end, dqcoeff_end,
                &round_tail, &round_tail, &quant_tail, &quant_tail,
                &dequant_tail, &dequant_tail, &eob);
  }

  // Collapse the per-lane values into the single end-of-block result.
  *eob_ptr = accumulate_eob(eob);
}
290