1 /*
2 * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <assert.h>
13 #include <emmintrin.h>
14 #include "config/aom_dsp_rtcd.h"
15 #include "aom/aom_integer.h"
16 #include "aom_dsp/quantize.h"
17 #include "aom_dsp/x86/quantize_x86.h"
18
// Adaptive quantizer (SSE2), no log-scale.
//
// Coefficients are handled 16 at a time as two vectors of eight 16-bit lanes.
// For every group the absolute values are compared against (a) a 32-bit
// pre-scan threshold built from zbin and dequant, and (b) the zbin vector.
// Lanes that fail the zbin compare are written as zero; the rest are
// quantized, given their sign back, stored to qcoeff_ptr, and their
// dequantized value is stored to dqcoeff_ptr. After the vector pass a scalar
// tail clears quantized values in the scan range [pre-scan count, zbin count)
// and derives *eob_ptr.
//
// coeff_ptr        input coefficients.
// n_coeffs         number of coefficients. The first 16 are processed
//                  unconditionally and the loop advances by 16, so this is
//                  expected to be a multiple of 16 and at least 16.
// zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
//                  quantizer tables handed to load_b_values(); entries [0]
//                  and [1] of zbin_ptr/dequant_ptr are also read as scalars.
// qcoeff_ptr       output, quantized coefficients. Written with aligned
//                  128-bit stores, so it must be 16-byte aligned.
// dqcoeff_ptr      output, dequantized coefficients. Same alignment rule.
// eob_ptr          output, one past the last scan position holding a
//                  non-zero quantized coefficient (0 if there is none).
// scan             scan position -> coefficient index.
// iscan            forwarded to update_mask0()/update_mask1(); presumably
//                  the inverse of scan -- confirm against those helpers.
//
// NOTE(review): the "+ 4" stride of the 128-bit zero stores assumes
// tran_low_t is 32 bits wide.
void aom_quantize_b_adaptive_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  const __m128i zero = _mm_setzero_si128();
  const qm_val_t wt = (1 << AOM_QM_BITS);

  // Running state filled in by update_mask0() / update_mask1().
  __m128i prescan_mask = zero;
  __m128i zbin_mask = zero;
  int prescan_found = 0;
  int zbin_found = 0;

  // 32-bit pre-scan thresholds: entry 0 comes from table index 0, entries
  // 1..3 replicate the value from table index 1.
  int thresh32[4];
  for (int k = 0; k < 2; ++k) {
    const int prescan_add = ROUND_POWER_OF_TWO(dequant_ptr[k] * EOB_FACTOR, 7);
    thresh32[k] = (zbin_ptr[k] * wt + prescan_add) - 1;
  }
  thresh32[2] = thresh32[1];
  thresh32[3] = thresh32[1];

  __m128i thresh_vec[2];
  thresh_vec[0] = _mm_loadu_si128((__m128i *)&thresh32[0]);
  // Upper half duplicated: four copies of the index-1 threshold.
  thresh_vec[1] = _mm_unpackhi_epi64(thresh_vec[0], thresh_vec[0]);

  // Quantizer vectors; their layout is defined by load_b_values().
  __m128i zbin, round, quant, dequant, shift;
  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
                dequant_ptr, &dequant, quant_shift_ptr, &shift);

  // First group: DC plus the first 15 AC coefficients.
  {
    const __m128i in_lo = load_coefficients(coeff_ptr);
    const __m128i in_hi = load_coefficients(coeff_ptr + 8);

    // Sign masks (all ones for negative lanes) and absolute values.
    const __m128i sign_lo = _mm_srai_epi16(in_lo, 15);
    const __m128i sign_hi = _mm_srai_epi16(in_hi, 15);
    __m128i q_lo = invert_sign_sse2(in_lo, sign_lo);
    __m128i q_hi = invert_sign_sse2(in_hi, sign_hi);

    update_mask0(&q_lo, &q_hi, thresh_vec, iscan, &prescan_found,
                 &prescan_mask);
    // Every later group uses the index-1 threshold in all lanes.
    thresh_vec[0] = thresh_vec[1];

    __m128i keep_lo = _mm_cmpgt_epi16(q_lo, zbin);
    zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
    __m128i keep_hi = _mm_cmpgt_epi16(q_hi, zbin);

    update_mask1(&keep_lo, &keep_hi, iscan, &zbin_found, &zbin_mask);

    if (_mm_movemask_epi8(_mm_or_si128(keep_lo, keep_hi)) == 0) {
      // Nothing exceeds zbin: emit 16 zeros to both outputs.
      for (int k = 0; k < 16; k += 4) {
        _mm_store_si128((__m128i *)(qcoeff_ptr + k), zero);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + k), zero);
      }
      // Still move the remaining tables over to their upper-half values.
      round = _mm_unpackhi_epi64(round, round);
      quant = _mm_unpackhi_epi64(quant, quant);
      shift = _mm_unpackhi_epi64(shift, shift);
      dequant = _mm_unpackhi_epi64(dequant, dequant);
    } else {
      // The low vector still uses the mixed tables; the high vector uses
      // the upper-half tables.
      calculate_qcoeff(&q_lo, round, quant, shift);

      round = _mm_unpackhi_epi64(round, round);
      quant = _mm_unpackhi_epi64(quant, quant);
      shift = _mm_unpackhi_epi64(shift, shift);

      calculate_qcoeff(&q_hi, round, quant, shift);

      // Restore signs, then zero the lanes that failed the zbin compare.
      q_lo = _mm_and_si128(invert_sign_sse2(q_lo, sign_lo), keep_lo);
      q_hi = _mm_and_si128(invert_sign_sse2(q_hi, sign_hi), keep_hi);

      store_coefficients(q_lo, qcoeff_ptr);
      store_coefficients(q_hi, qcoeff_ptr + 8);

      const __m128i dq_lo = calculate_dqcoeff(q_lo, dequant);
      dequant = _mm_unpackhi_epi64(dequant, dequant);
      const __m128i dq_hi = calculate_dqcoeff(q_hi, dequant);

      store_coefficients(dq_lo, dqcoeff_ptr);
      store_coefficients(dq_hi, dqcoeff_ptr + 8);
    }
  }

  // Remaining groups: all lanes use the AC tables.
  for (int pos = 16; pos < n_coeffs; pos += 16) {
    const __m128i in_lo = load_coefficients(coeff_ptr + pos);
    const __m128i in_hi = load_coefficients(coeff_ptr + pos + 8);

    const __m128i sign_lo = _mm_srai_epi16(in_lo, 15);
    const __m128i sign_hi = _mm_srai_epi16(in_hi, 15);
    __m128i q_lo = invert_sign_sse2(in_lo, sign_lo);
    __m128i q_hi = invert_sign_sse2(in_hi, sign_hi);

    update_mask0(&q_lo, &q_hi, thresh_vec, iscan + pos, &prescan_found,
                 &prescan_mask);

    __m128i keep_lo = _mm_cmpgt_epi16(q_lo, zbin);
    __m128i keep_hi = _mm_cmpgt_epi16(q_hi, zbin);

    update_mask1(&keep_lo, &keep_hi, iscan + pos, &zbin_found, &zbin_mask);

    if (_mm_movemask_epi8(_mm_or_si128(keep_lo, keep_hi)) == 0) {
      for (int k = 0; k < 16; k += 4) {
        _mm_store_si128((__m128i *)(qcoeff_ptr + pos + k), zero);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + pos + k), zero);
      }
      continue;
    }

    calculate_qcoeff(&q_lo, round, quant, shift);
    calculate_qcoeff(&q_hi, round, quant, shift);

    q_lo = _mm_and_si128(invert_sign_sse2(q_lo, sign_lo), keep_lo);
    q_hi = _mm_and_si128(invert_sign_sse2(q_hi, sign_hi), keep_hi);

    store_coefficients(q_lo, qcoeff_ptr + pos);
    store_coefficients(q_hi, qcoeff_ptr + pos + 8);

    store_coefficients(calculate_dqcoeff(q_lo, dequant), dqcoeff_ptr + pos);
    store_coefficients(calculate_dqcoeff(q_hi, dequant),
                       dqcoeff_ptr + pos + 8);
  }

  // Scan-order counts derived from the two masks (0 when nothing was found).
  const int nz_prescan =
      prescan_found ? calculate_non_zero_count(prescan_mask) : 0;
  const int nz_zbin = zbin_found ? calculate_non_zero_count(zbin_mask) : 0;

  // Values that passed zbin but lie at or beyond the pre-scan count are
  // dropped.
  for (int i = nz_prescan; i < nz_zbin; ++i) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  // End of block: one past the last non-zero value in scan order.
  int eob_count = 0;
  for (int i = nz_prescan; i > 0; --i) {
    if (qcoeff_ptr[scan[i - 1]]) {
      eob_count = i;
      break;
    }
  }
  *eob_ptr = eob_count;

#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment the following loop with intrinsic by combining
  // with the quantization loop above
  int first_nz = -1;
  for (int i = 0; i < nz_prescan; ++i) {
    if (qcoeff_ptr[scan[i]]) {
      first_nz = i;
      break;
    }
  }

  // If the block has exactly one non-zero value and it is +/-1, re-test it
  // against a larger threshold and drop it (empty block) when it falls short.
  const int last_nz = *eob_ptr - 1;
  if (last_nz >= 0 && first_nz == last_nz) {
    const int rc = scan[last_nz];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      const int limit =
          zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val;
      if (abs_coeff < limit) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}
218
// Adaptive quantizer (SSE2) with log_scale = 1.
//
// Same flow as a plain adaptive quantizer, except that zbin and round are
// scaled down by log_scale (with rounding) before use and the log-scale
// helper variants perform the quantize/dequantize arithmetic. Coefficients
// are handled 16 at a time as two vectors of eight 16-bit lanes; lanes that
// fail the zbin compare are written as zero. A scalar tail then clears
// quantized values in the scan range [pre-scan count, zbin count) and
// derives *eob_ptr.
//
// coeff_ptr        input coefficients.
// n_coeffs         number of coefficients. The first 16 are processed
//                  unconditionally and the loop advances by 16, so this is
//                  expected to be a multiple of 16 and at least 16.
// zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
//                  quantizer tables. Each is read with an aligned 128-bit
//                  load, so each must be 16-byte aligned with eight readable
//                  int16_t entries.
// qcoeff_ptr       output, quantized coefficients. The zero-fill path uses
//                  aligned 128-bit stores, so it must be 16-byte aligned.
// dqcoeff_ptr      output, dequantized coefficients. Same alignment rule.
// eob_ptr          output, one past the last scan position holding a
//                  non-zero quantized coefficient (0 if there is none).
// scan             scan position -> coefficient index.
// iscan            forwarded to update_mask0()/update_mask1(); presumably
//                  the inverse of scan -- confirm against those helpers.
//
// NOTE(review): the "+ 4" stride of the 128-bit zero stores assumes
// tran_low_t is 32 bits wide.
void aom_quantize_b_32x32_adaptive_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  const int log_scale = 1;
  const __m128i zero = _mm_setzero_si128();
  // Addend for the rounded right shift. With log_scale == 1 the value
  // log_scale equals 1 << (log_scale - 1), i.e. half of the divisor.
  const __m128i rounding = _mm_set1_epi16(log_scale);
  const qm_val_t wt = (1 << AOM_QM_BITS);

  // Running state filled in by update_mask0() / update_mask1().
  __m128i prescan_mask = zero;
  __m128i zbin_mask = zero;
  int prescan_found = 0;
  int zbin_found = 0;

  // Scalar copies of the scaled zbin for table indices 0 and 1.
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };

  // 32-bit pre-scan thresholds: entry 0 comes from table index 0, entries
  // 1..3 replicate the value from table index 1.
  int thresh32[4];
  for (int k = 0; k < 2; ++k) {
    const int prescan_add = ROUND_POWER_OF_TWO(dequant_ptr[k] * EOB_FACTOR, 7);
    thresh32[k] = (zbins[k] * wt + prescan_add) - 1;
  }
  thresh32[2] = thresh32[1];
  thresh32[3] = thresh32[1];

  __m128i thresh_vec[2];
  thresh_vec[0] = _mm_loadu_si128((__m128i *)&thresh32[0]);
  // Upper half duplicated: four copies of the index-1 threshold.
  thresh_vec[1] = _mm_unpackhi_epi64(thresh_vec[0], thresh_vec[0]);

  // Quantizer tables.
  __m128i zbin = _mm_load_si128((const __m128i *)zbin_ptr);
  __m128i round = _mm_load_si128((const __m128i *)round_ptr);
  __m128i quant = _mm_load_si128((const __m128i *)quant_ptr);
  __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  __m128i shift = _mm_load_si128((const __m128i *)quant_shift_ptr);

  // Scale zbin and round down by log_scale with rounding. zbin is then
  // lowered by one so the strict greater-than compares below behave as
  // "greater than or equal to the scaled zbin".
  zbin = _mm_srli_epi16(_mm_add_epi16(zbin, rounding), log_scale);
  round = _mm_srli_epi16(_mm_add_epi16(round, rounding), log_scale);
  zbin = _mm_sub_epi16(zbin, _mm_set1_epi16(1));

  // First group: DC plus the first 15 AC coefficients.
  {
    const __m128i in_lo = load_coefficients(coeff_ptr);
    const __m128i in_hi = load_coefficients(coeff_ptr + 8);

    // Sign masks (all ones for negative lanes) and absolute values.
    const __m128i sign_lo = _mm_srai_epi16(in_lo, 15);
    const __m128i sign_hi = _mm_srai_epi16(in_hi, 15);
    __m128i q_lo = invert_sign_sse2(in_lo, sign_lo);
    __m128i q_hi = invert_sign_sse2(in_hi, sign_hi);

    update_mask0(&q_lo, &q_hi, thresh_vec, iscan, &prescan_found,
                 &prescan_mask);
    // Every later group uses the index-1 threshold in all lanes.
    thresh_vec[0] = thresh_vec[1];

    __m128i keep_lo = _mm_cmpgt_epi16(q_lo, zbin);
    zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
    __m128i keep_hi = _mm_cmpgt_epi16(q_hi, zbin);

    update_mask1(&keep_lo, &keep_hi, iscan, &zbin_found, &zbin_mask);

    if (_mm_movemask_epi8(_mm_or_si128(keep_lo, keep_hi)) == 0) {
      // Nothing exceeds zbin: emit 16 zeros to both outputs.
      for (int k = 0; k < 16; k += 4) {
        _mm_store_si128((__m128i *)(qcoeff_ptr + k), zero);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + k), zero);
      }
      // Still move the remaining tables over to their upper-half values.
      round = _mm_unpackhi_epi64(round, round);
      quant = _mm_unpackhi_epi64(quant, quant);
      shift = _mm_unpackhi_epi64(shift, shift);
      dequant = _mm_unpackhi_epi64(dequant, dequant);
    } else {
      // The low vector still uses the mixed tables; the high vector uses
      // the upper-half tables.
      calculate_qcoeff_log_scale(&q_lo, round, quant, &shift, &log_scale);
      round = _mm_unpackhi_epi64(round, round);
      quant = _mm_unpackhi_epi64(quant, quant);
      shift = _mm_unpackhi_epi64(shift, shift);
      calculate_qcoeff_log_scale(&q_hi, round, quant, &shift, &log_scale);

      // Restore signs, then zero the lanes that failed the zbin compare.
      q_lo = _mm_and_si128(invert_sign_sse2(q_lo, sign_lo), keep_lo);
      q_hi = _mm_and_si128(invert_sign_sse2(q_hi, sign_hi), keep_hi);

      store_coefficients(q_lo, qcoeff_ptr);
      store_coefficients(q_hi, qcoeff_ptr + 8);

      calculate_dqcoeff_and_store_log_scale(q_lo, dequant, zero, dqcoeff_ptr,
                                            &log_scale);
      dequant = _mm_unpackhi_epi64(dequant, dequant);
      calculate_dqcoeff_and_store_log_scale(q_hi, dequant, zero,
                                            dqcoeff_ptr + 8, &log_scale);
    }
  }

  // Remaining groups: all lanes use the AC tables.
  for (int pos = 16; pos < n_coeffs; pos += 16) {
    const __m128i in_lo = load_coefficients(coeff_ptr + pos);
    const __m128i in_hi = load_coefficients(coeff_ptr + pos + 8);

    const __m128i sign_lo = _mm_srai_epi16(in_lo, 15);
    const __m128i sign_hi = _mm_srai_epi16(in_hi, 15);
    __m128i q_lo = invert_sign_sse2(in_lo, sign_lo);
    __m128i q_hi = invert_sign_sse2(in_hi, sign_hi);

    update_mask0(&q_lo, &q_hi, thresh_vec, iscan + pos, &prescan_found,
                 &prescan_mask);

    __m128i keep_lo = _mm_cmpgt_epi16(q_lo, zbin);
    __m128i keep_hi = _mm_cmpgt_epi16(q_hi, zbin);

    update_mask1(&keep_lo, &keep_hi, iscan + pos, &zbin_found, &zbin_mask);

    if (_mm_movemask_epi8(_mm_or_si128(keep_lo, keep_hi)) == 0) {
      for (int k = 0; k < 16; k += 4) {
        _mm_store_si128((__m128i *)(qcoeff_ptr + pos + k), zero);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + pos + k), zero);
      }
      continue;
    }

    calculate_qcoeff_log_scale(&q_lo, round, quant, &shift, &log_scale);
    calculate_qcoeff_log_scale(&q_hi, round, quant, &shift, &log_scale);

    q_lo = _mm_and_si128(invert_sign_sse2(q_lo, sign_lo), keep_lo);
    q_hi = _mm_and_si128(invert_sign_sse2(q_hi, sign_hi), keep_hi);

    store_coefficients(q_lo, qcoeff_ptr + pos);
    store_coefficients(q_hi, qcoeff_ptr + pos + 8);

    calculate_dqcoeff_and_store_log_scale(q_lo, dequant, zero,
                                          dqcoeff_ptr + pos, &log_scale);
    calculate_dqcoeff_and_store_log_scale(q_hi, dequant, zero,
                                          dqcoeff_ptr + pos + 8, &log_scale);
  }

  // Scan-order counts derived from the two masks (0 when nothing was found).
  const int nz_prescan =
      prescan_found ? calculate_non_zero_count(prescan_mask) : 0;
  const int nz_zbin = zbin_found ? calculate_non_zero_count(zbin_mask) : 0;

  // Values that passed zbin but lie at or beyond the pre-scan count are
  // dropped.
  for (int i = nz_prescan; i < nz_zbin; ++i) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  // End of block: one past the last non-zero value in scan order.
  int eob_count = 0;
  for (int i = nz_prescan; i > 0; --i) {
    if (qcoeff_ptr[scan[i - 1]]) {
      eob_count = i;
      break;
    }
  }
  *eob_ptr = eob_count;

#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment the following loop with intrinsic by combining
  // with the quantization loop above
  int first_nz = -1;
  for (int i = 0; i < nz_prescan; ++i) {
    if (qcoeff_ptr[scan[i]]) {
      first_nz = i;
      break;
    }
  }

  // If the block has exactly one non-zero value and it is +/-1, re-test it
  // against a larger threshold and drop it (empty block) when it falls short.
  const int last_nz = *eob_ptr - 1;
  if (last_nz >= 0 && first_nz == last_nz) {
    const int rc = scan[last_nz];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      const int limit = zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val;
      if (abs_coeff < limit) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}
426
// Adaptive quantizer (SSE2) with log_scale = 2.
//
// Same flow as a plain adaptive quantizer, except that zbin and round are
// scaled down by log_scale (with rounding) before use and the log-scale
// helper variants perform the quantize/dequantize arithmetic. Coefficients
// are handled 16 at a time as two vectors of eight 16-bit lanes; lanes that
// fail the zbin compare are written as zero. A scalar tail then clears
// quantized values in the scan range [pre-scan count, zbin count) and
// derives *eob_ptr.
//
// coeff_ptr        input coefficients.
// n_coeffs         number of coefficients. The first 16 are processed
//                  unconditionally and the loop advances by 16, so this is
//                  expected to be a multiple of 16 and at least 16.
// zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
//                  quantizer tables. Each is read with an aligned 128-bit
//                  load, so each must be 16-byte aligned with eight readable
//                  int16_t entries.
// qcoeff_ptr       output, quantized coefficients. The zero-fill path uses
//                  aligned 128-bit stores, so it must be 16-byte aligned.
// dqcoeff_ptr      output, dequantized coefficients. Same alignment rule.
// eob_ptr          output, one past the last scan position holding a
//                  non-zero quantized coefficient (0 if there is none).
// scan             scan position -> coefficient index.
// iscan            forwarded to update_mask0()/update_mask1(); presumably
//                  the inverse of scan -- confirm against those helpers.
//
// NOTE(review): the "+ 4" stride of the 128-bit zero stores assumes
// tran_low_t is 32 bits wide.
void aom_quantize_b_64x64_adaptive_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  const int log_scale = 2;
  const __m128i zero = _mm_setzero_si128();
  // Addend for the rounded right shift. With log_scale == 2 the value
  // log_scale equals 1 << (log_scale - 1), i.e. half of the divisor.
  const __m128i rounding = _mm_set1_epi16(log_scale);
  const qm_val_t wt = (1 << AOM_QM_BITS);

  // Running state filled in by update_mask0() / update_mask1().
  __m128i prescan_mask = zero;
  __m128i zbin_mask = zero;
  int prescan_found = 0;
  int zbin_found = 0;

  // Scalar copies of the scaled zbin for table indices 0 and 1.
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };

  // 32-bit pre-scan thresholds: entry 0 comes from table index 0, entries
  // 1..3 replicate the value from table index 1.
  int thresh32[4];
  for (int k = 0; k < 2; ++k) {
    const int prescan_add = ROUND_POWER_OF_TWO(dequant_ptr[k] * EOB_FACTOR, 7);
    thresh32[k] = (zbins[k] * wt + prescan_add) - 1;
  }
  thresh32[2] = thresh32[1];
  thresh32[3] = thresh32[1];

  __m128i thresh_vec[2];
  thresh_vec[0] = _mm_loadu_si128((__m128i *)&thresh32[0]);
  // Upper half duplicated: four copies of the index-1 threshold.
  thresh_vec[1] = _mm_unpackhi_epi64(thresh_vec[0], thresh_vec[0]);

  // Quantizer tables.
  __m128i zbin = _mm_load_si128((const __m128i *)zbin_ptr);
  __m128i round = _mm_load_si128((const __m128i *)round_ptr);
  __m128i quant = _mm_load_si128((const __m128i *)quant_ptr);
  __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  __m128i shift = _mm_load_si128((const __m128i *)quant_shift_ptr);

  // Scale zbin and round down by log_scale with rounding. zbin is then
  // lowered by one so the strict greater-than compares below behave as
  // "greater than or equal to the scaled zbin".
  zbin = _mm_srli_epi16(_mm_add_epi16(zbin, rounding), log_scale);
  round = _mm_srli_epi16(_mm_add_epi16(round, rounding), log_scale);
  zbin = _mm_sub_epi16(zbin, _mm_set1_epi16(1));

  // First group: DC plus the first 15 AC coefficients.
  {
    const __m128i in_lo = load_coefficients(coeff_ptr);
    const __m128i in_hi = load_coefficients(coeff_ptr + 8);

    // Sign masks (all ones for negative lanes) and absolute values.
    const __m128i sign_lo = _mm_srai_epi16(in_lo, 15);
    const __m128i sign_hi = _mm_srai_epi16(in_hi, 15);
    __m128i q_lo = invert_sign_sse2(in_lo, sign_lo);
    __m128i q_hi = invert_sign_sse2(in_hi, sign_hi);

    update_mask0(&q_lo, &q_hi, thresh_vec, iscan, &prescan_found,
                 &prescan_mask);
    // Every later group uses the index-1 threshold in all lanes.
    thresh_vec[0] = thresh_vec[1];

    __m128i keep_lo = _mm_cmpgt_epi16(q_lo, zbin);
    zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
    __m128i keep_hi = _mm_cmpgt_epi16(q_hi, zbin);

    update_mask1(&keep_lo, &keep_hi, iscan, &zbin_found, &zbin_mask);

    if (_mm_movemask_epi8(_mm_or_si128(keep_lo, keep_hi)) == 0) {
      // Nothing exceeds zbin: emit 16 zeros to both outputs.
      for (int k = 0; k < 16; k += 4) {
        _mm_store_si128((__m128i *)(qcoeff_ptr + k), zero);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + k), zero);
      }
      // Still move the remaining tables over to their upper-half values.
      round = _mm_unpackhi_epi64(round, round);
      quant = _mm_unpackhi_epi64(quant, quant);
      shift = _mm_unpackhi_epi64(shift, shift);
      dequant = _mm_unpackhi_epi64(dequant, dequant);
    } else {
      // The low vector still uses the mixed tables; the high vector uses
      // the upper-half tables.
      calculate_qcoeff_log_scale(&q_lo, round, quant, &shift, &log_scale);
      round = _mm_unpackhi_epi64(round, round);
      quant = _mm_unpackhi_epi64(quant, quant);
      shift = _mm_unpackhi_epi64(shift, shift);
      calculate_qcoeff_log_scale(&q_hi, round, quant, &shift, &log_scale);

      // Restore signs, then zero the lanes that failed the zbin compare.
      q_lo = _mm_and_si128(invert_sign_sse2(q_lo, sign_lo), keep_lo);
      q_hi = _mm_and_si128(invert_sign_sse2(q_hi, sign_hi), keep_hi);

      store_coefficients(q_lo, qcoeff_ptr);
      store_coefficients(q_hi, qcoeff_ptr + 8);

      calculate_dqcoeff_and_store_log_scale(q_lo, dequant, zero, dqcoeff_ptr,
                                            &log_scale);
      dequant = _mm_unpackhi_epi64(dequant, dequant);
      calculate_dqcoeff_and_store_log_scale(q_hi, dequant, zero,
                                            dqcoeff_ptr + 8, &log_scale);
    }
  }

  // Remaining groups: all lanes use the AC tables.
  for (int pos = 16; pos < n_coeffs; pos += 16) {
    const __m128i in_lo = load_coefficients(coeff_ptr + pos);
    const __m128i in_hi = load_coefficients(coeff_ptr + pos + 8);

    const __m128i sign_lo = _mm_srai_epi16(in_lo, 15);
    const __m128i sign_hi = _mm_srai_epi16(in_hi, 15);
    __m128i q_lo = invert_sign_sse2(in_lo, sign_lo);
    __m128i q_hi = invert_sign_sse2(in_hi, sign_hi);

    update_mask0(&q_lo, &q_hi, thresh_vec, iscan + pos, &prescan_found,
                 &prescan_mask);

    __m128i keep_lo = _mm_cmpgt_epi16(q_lo, zbin);
    __m128i keep_hi = _mm_cmpgt_epi16(q_hi, zbin);

    update_mask1(&keep_lo, &keep_hi, iscan + pos, &zbin_found, &zbin_mask);

    if (_mm_movemask_epi8(_mm_or_si128(keep_lo, keep_hi)) == 0) {
      for (int k = 0; k < 16; k += 4) {
        _mm_store_si128((__m128i *)(qcoeff_ptr + pos + k), zero);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + pos + k), zero);
      }
      continue;
    }

    calculate_qcoeff_log_scale(&q_lo, round, quant, &shift, &log_scale);
    calculate_qcoeff_log_scale(&q_hi, round, quant, &shift, &log_scale);

    q_lo = _mm_and_si128(invert_sign_sse2(q_lo, sign_lo), keep_lo);
    q_hi = _mm_and_si128(invert_sign_sse2(q_hi, sign_hi), keep_hi);

    store_coefficients(q_lo, qcoeff_ptr + pos);
    store_coefficients(q_hi, qcoeff_ptr + pos + 8);

    calculate_dqcoeff_and_store_log_scale(q_lo, dequant, zero,
                                          dqcoeff_ptr + pos, &log_scale);
    calculate_dqcoeff_and_store_log_scale(q_hi, dequant, zero,
                                          dqcoeff_ptr + pos + 8, &log_scale);
  }

  // Scan-order counts derived from the two masks (0 when nothing was found).
  const int nz_prescan =
      prescan_found ? calculate_non_zero_count(prescan_mask) : 0;
  const int nz_zbin = zbin_found ? calculate_non_zero_count(zbin_mask) : 0;

  // Values that passed zbin but lie at or beyond the pre-scan count are
  // dropped.
  for (int i = nz_prescan; i < nz_zbin; ++i) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  // End of block: one past the last non-zero value in scan order.
  int eob_count = 0;
  for (int i = nz_prescan; i > 0; --i) {
    if (qcoeff_ptr[scan[i - 1]]) {
      eob_count = i;
      break;
    }
  }
  *eob_ptr = eob_count;

#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment the following loop with intrinsic by combining
  // with the quantization loop above
  int first_nz = -1;
  for (int i = 0; i < nz_prescan; ++i) {
    if (qcoeff_ptr[scan[i]]) {
      first_nz = i;
      break;
    }
  }

  // If the block has exactly one non-zero value and it is +/-1, re-test it
  // against a larger threshold and drop it (empty block) when it falls short.
  const int last_nz = *eob_ptr - 1;
  if (last_nz >= 0 && first_nz == last_nz) {
    const int rc = scan[last_nz];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      const int limit = zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val;
      if (abs_coeff < limit) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}
634