xref: /aosp_15_r20/external/libaom/aom_dsp/x86/adaptive_quantize_sse2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <emmintrin.h>
14 #include "config/aom_dsp_rtcd.h"
15 #include "aom/aom_integer.h"
16 #include "aom_dsp/quantize.h"
17 #include "aom_dsp/x86/quantize_x86.h"
18 
// SSE2 adaptive quantizer for blocks that need no log-scale adjustment.
//
// Walks coeff_ptr in groups of 16 coefficients. Each group produces 16
// entries in qcoeff_ptr and dqcoeff_ptr (all zero when no magnitude exceeds
// the zbin vector), and the end-of-block count is written to *eob_ptr.
//
// Index 0 of zbin_ptr / dequant_ptr feeds the threshold of the very first
// coefficient (DC) and index 1 feeds every other one (AC). The remaining
// tables are loaded by load_b_values(), which is defined elsewhere and is
// presumably laid out the same way.
//
// The first group is handled unconditionally, so n_coeffs is presumably a
// multiple of 16 and at least 16 -- confirm against callers. The aligned
// stores below require qcoeff_ptr and dqcoeff_ptr to be 16-byte aligned.
void aom_quantize_b_adaptive_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  const __m128i zero = _mm_setzero_si128();
  const qm_val_t wt = (1 << AOM_QM_BITS);

  // 32-bit prescan thresholds: lane 0 is built from the DC entries, the three
  // remaining lanes from the AC entries.
  const int prescan_add_dc =
      ROUND_POWER_OF_TWO(dequant_ptr[0] * EOB_FACTOR, 7);
  const int prescan_add_ac =
      ROUND_POWER_OF_TWO(dequant_ptr[1] * EOB_FACTOR, 7);
  int thresh32[4];
  thresh32[0] = (zbin_ptr[0] * wt + prescan_add_dc) - 1;
  thresh32[1] = (zbin_ptr[1] * wt + prescan_add_ac) - 1;
  thresh32[2] = thresh32[1];
  thresh32[3] = thresh32[1];

  // thresh_vec[0] covers lanes 0..3 of a group, thresh_vec[1] is AC only.
  __m128i thresh_vec[2];
  thresh_vec[0] = _mm_loadu_si128((const __m128i *)thresh32);
  thresh_vec[1] = _mm_unpackhi_epi64(thresh_vec[0], thresh_vec[0]);

  // Quantizer tables for the first eight coefficients ...
  __m128i zbin, round, quant, dequant, shift;
  load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
                dequant_ptr, &dequant, quant_shift_ptr, &shift);

  // ... and their AC-only counterparts (upper 64 bits duplicated), used for
  // everything after the first eight coefficients.
  __m128i zbin_ac = _mm_unpackhi_epi64(zbin, zbin);
  __m128i round_ac = _mm_unpackhi_epi64(round, round);
  __m128i quant_ac = _mm_unpackhi_epi64(quant, quant);
  __m128i shift_ac = _mm_unpackhi_epi64(shift, shift);
  __m128i dequant_ac = _mm_unpackhi_epi64(dequant, dequant);

  // State accumulated by update_mask0()/update_mask1() across all groups.
  int found0 = 0, found1 = 0;
  __m128i scan_mask0 = zero, scan_mask1 = zero;

  // DC plus the first 15 AC coefficients.
  {
    const __m128i c0 = load_coefficients(coeff_ptr);
    const __m128i c1 = load_coefficients(coeff_ptr + 8);

    // Sign masks (all ones for negative lanes) and the resulting magnitudes.
    const __m128i sign0 = _mm_srai_epi16(c0, 15);
    const __m128i sign1 = _mm_srai_epi16(c1, 15);
    __m128i q0 = invert_sign_sse2(c0, sign0);
    __m128i q1 = invert_sign_sse2(c1, sign1);

    update_mask0(&q0, &q1, thresh_vec, iscan, &found0, &scan_mask0);
    // Every later group is AC only.
    thresh_vec[0] = thresh_vec[1];

    __m128i keep0 = _mm_cmpgt_epi16(q0, zbin);
    __m128i keep1 = _mm_cmpgt_epi16(q1, zbin_ac);

    update_mask1(&keep0, &keep1, iscan, &found1, &scan_mask1);

    if (_mm_movemask_epi8(_mm_or_si128(keep0, keep1)) != 0) {
      calculate_qcoeff(&q0, round, quant, shift);
      calculate_qcoeff(&q1, round_ac, quant_ac, shift_ac);

      // Restore the signs, then clear the lanes that failed the zbin test.
      q0 = _mm_and_si128(invert_sign_sse2(q0, sign0), keep0);
      q1 = _mm_and_si128(invert_sign_sse2(q1, sign1), keep1);

      store_coefficients(q0, qcoeff_ptr);
      store_coefficients(q1, qcoeff_ptr + 8);

      const __m128i dq0 = calculate_dqcoeff(q0, dequant);
      const __m128i dq1 = calculate_dqcoeff(q1, dequant_ac);

      store_coefficients(dq0, dqcoeff_ptr);
      store_coefficients(dq1, dqcoeff_ptr + 8);
    } else {
      // Nothing passed the zbin test: the whole group quantizes to zero.
      for (int k = 0; k < 16; k += 4) {
        _mm_store_si128((__m128i *)(qcoeff_ptr + k), zero);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + k), zero);
      }
    }
  }

  // Remaining groups use the AC tables only.
  for (int offset = 16; offset < n_coeffs; offset += 16) {
    const __m128i c0 = load_coefficients(coeff_ptr + offset);
    const __m128i c1 = load_coefficients(coeff_ptr + offset + 8);

    const __m128i sign0 = _mm_srai_epi16(c0, 15);
    const __m128i sign1 = _mm_srai_epi16(c1, 15);
    __m128i q0 = invert_sign_sse2(c0, sign0);
    __m128i q1 = invert_sign_sse2(c1, sign1);

    update_mask0(&q0, &q1, thresh_vec, iscan + offset, &found0, &scan_mask0);

    __m128i keep0 = _mm_cmpgt_epi16(q0, zbin_ac);
    __m128i keep1 = _mm_cmpgt_epi16(q1, zbin_ac);

    update_mask1(&keep0, &keep1, iscan + offset, &found1, &scan_mask1);

    if (_mm_movemask_epi8(_mm_or_si128(keep0, keep1)) == 0) {
      for (int k = 0; k < 16; k += 4) {
        _mm_store_si128((__m128i *)(qcoeff_ptr + offset + k), zero);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + offset + k), zero);
      }
      continue;
    }

    calculate_qcoeff(&q0, round_ac, quant_ac, shift_ac);
    calculate_qcoeff(&q1, round_ac, quant_ac, shift_ac);

    q0 = _mm_and_si128(invert_sign_sse2(q0, sign0), keep0);
    q1 = _mm_and_si128(invert_sign_sse2(q1, sign1), keep1);

    store_coefficients(q0, qcoeff_ptr + offset);
    store_coefficients(q1, qcoeff_ptr + offset + 8);

    const __m128i dq0 = calculate_dqcoeff(q0, dequant_ac);
    const __m128i dq1 = calculate_dqcoeff(q1, dequant_ac);

    store_coefficients(dq0, dqcoeff_ptr + offset);
    store_coefficients(dq1, dqcoeff_ptr + offset + 8);
  }

  // Scan-order counts derived from the two accumulated masks; a count stays
  // zero when the matching update_mask*() call never reported a hit.
  const int nz_count = found0 ? calculate_non_zero_count(scan_mask0) : 0;
  const int nz_count_prescan =
      found1 ? calculate_non_zero_count(scan_mask1) : 0;

  // Drop every coefficient whose scan position lies in
  // [nz_count, nz_count_prescan).
  for (int i = nz_count; i < nz_count_prescan; ++i) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  // The end of block is one past the last non-zero coefficient in scan order.
  int eob_count = nz_count;
  while (eob_count > 0 && qcoeff_ptr[scan[eob_count - 1]] == 0) --eob_count;
  *eob_ptr = (uint16_t)eob_count;

#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment the following loop with intrinsic by combining
  // with the quantization loop above
  int first_nz = -1;
  for (int i = 0; i < nz_count; ++i) {
    if (qcoeff_ptr[scan[i]] != 0) {
      first_nz = i;
      break;
    }
  }

  // When the block holds exactly one non-zero coefficient and it is +/-1,
  // re-test it against a threshold built with the adjusted EOB factor and
  // drop it (leaving an empty block) if it falls short.
  const int last_nz = *eob_ptr - 1;
  if (last_nz >= 0 && first_nz == last_nz) {
    const int rc = scan[last_nz];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int is_ac = (rc != 0);
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[is_ac] * factor, 7);
      const int limit = zbin_ptr[is_ac] * (1 << AOM_QM_BITS) + prescan_add_val;
      if (abs_coeff < limit) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}
218 
// SSE2 adaptive quantizer using log_scale = 1 (the 32x32 variant).
//
// Walks coeff_ptr in groups of 16 coefficients. Each group produces 16
// entries in qcoeff_ptr and dqcoeff_ptr (all zero when no magnitude exceeds
// the scaled zbin vector), and the end-of-block count is written to
// *eob_ptr.
//
// Lane 0 of each table applies to the very first coefficient (DC); the upper
// four 16-bit lanes are reused for every other coefficient (AC). zbin and
// round are right-shifted by log_scale with rounding before use.
//
// The first group is handled unconditionally, so n_coeffs is presumably a
// multiple of 16 and at least 16 -- confirm against callers. The aligned
// loads/stores below require the five table pointers, qcoeff_ptr and
// dqcoeff_ptr to be 16-byte aligned.
void aom_quantize_b_32x32_adaptive_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  const int log_scale = 1;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  // Rounding term for a right shift by log_scale.
  const __m128i rounding = _mm_set1_epi16((1 << log_scale) >> 1);
  const qm_val_t wt = (1 << AOM_QM_BITS);

  // Scalar copies of the scaled DC/AC zbin values.
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };

  // 32-bit prescan thresholds: lane 0 is built from the DC entries, the three
  // remaining lanes from the AC entries.
  const int prescan_add_dc =
      ROUND_POWER_OF_TWO(dequant_ptr[0] * EOB_FACTOR, 7);
  const int prescan_add_ac =
      ROUND_POWER_OF_TWO(dequant_ptr[1] * EOB_FACTOR, 7);
  int thresh32[4];
  thresh32[0] = (zbins[0] * wt + prescan_add_dc) - 1;
  thresh32[1] = (zbins[1] * wt + prescan_add_ac) - 1;
  thresh32[2] = thresh32[1];
  thresh32[3] = thresh32[1];

  // thresh_vec[0] covers lanes 0..3 of a group, thresh_vec[1] is AC only.
  __m128i thresh_vec[2];
  thresh_vec[0] = _mm_loadu_si128((const __m128i *)thresh32);
  thresh_vec[1] = _mm_unpackhi_epi64(thresh_vec[0], thresh_vec[0]);

  // Quantizer tables for the first eight coefficients. zbin and round are
  // shifted with rounding; zbin is then lowered by one so that the
  // greater-than compare below accepts magnitudes equal to the scaled zbin.
  __m128i zbin = _mm_load_si128((const __m128i *)zbin_ptr);
  __m128i round = _mm_load_si128((const __m128i *)round_ptr);
  __m128i quant = _mm_load_si128((const __m128i *)quant_ptr);
  __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  __m128i shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
  zbin = _mm_srli_epi16(_mm_add_epi16(zbin, rounding), log_scale);
  zbin = _mm_sub_epi16(zbin, one);
  round = _mm_srli_epi16(_mm_add_epi16(round, rounding), log_scale);

  // AC-only counterparts (upper 64 bits duplicated), used for everything
  // after the first eight coefficients.
  __m128i zbin_ac = _mm_unpackhi_epi64(zbin, zbin);
  __m128i round_ac = _mm_unpackhi_epi64(round, round);
  __m128i quant_ac = _mm_unpackhi_epi64(quant, quant);
  __m128i shift_ac = _mm_unpackhi_epi64(shift, shift);
  __m128i dequant_ac = _mm_unpackhi_epi64(dequant, dequant);

  // State accumulated by update_mask0()/update_mask1() across all groups.
  int found0 = 0, found1 = 0;
  __m128i scan_mask0 = zero, scan_mask1 = zero;

  // DC plus the first 15 AC coefficients.
  {
    const __m128i c0 = load_coefficients(coeff_ptr);
    const __m128i c1 = load_coefficients(coeff_ptr + 8);

    // Sign masks (all ones for negative lanes) and the resulting magnitudes.
    const __m128i sign0 = _mm_srai_epi16(c0, 15);
    const __m128i sign1 = _mm_srai_epi16(c1, 15);
    __m128i q0 = invert_sign_sse2(c0, sign0);
    __m128i q1 = invert_sign_sse2(c1, sign1);

    update_mask0(&q0, &q1, thresh_vec, iscan, &found0, &scan_mask0);
    // Every later group is AC only.
    thresh_vec[0] = thresh_vec[1];

    __m128i keep0 = _mm_cmpgt_epi16(q0, zbin);
    __m128i keep1 = _mm_cmpgt_epi16(q1, zbin_ac);

    update_mask1(&keep0, &keep1, iscan, &found1, &scan_mask1);

    if (_mm_movemask_epi8(_mm_or_si128(keep0, keep1)) != 0) {
      calculate_qcoeff_log_scale(&q0, round, quant, &shift, &log_scale);
      calculate_qcoeff_log_scale(&q1, round_ac, quant_ac, &shift_ac,
                                 &log_scale);

      // Restore the signs, then clear the lanes that failed the zbin test.
      q0 = _mm_and_si128(invert_sign_sse2(q0, sign0), keep0);
      q1 = _mm_and_si128(invert_sign_sse2(q1, sign1), keep1);

      store_coefficients(q0, qcoeff_ptr);
      store_coefficients(q1, qcoeff_ptr + 8);

      calculate_dqcoeff_and_store_log_scale(q0, dequant, zero, dqcoeff_ptr,
                                            &log_scale);
      calculate_dqcoeff_and_store_log_scale(q1, dequant_ac, zero,
                                            dqcoeff_ptr + 8, &log_scale);
    } else {
      // Nothing passed the zbin test: the whole group quantizes to zero.
      for (int k = 0; k < 16; k += 4) {
        _mm_store_si128((__m128i *)(qcoeff_ptr + k), zero);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + k), zero);
      }
    }
  }

  // Remaining groups use the AC tables only.
  for (int offset = 16; offset < n_coeffs; offset += 16) {
    const __m128i c0 = load_coefficients(coeff_ptr + offset);
    const __m128i c1 = load_coefficients(coeff_ptr + offset + 8);

    const __m128i sign0 = _mm_srai_epi16(c0, 15);
    const __m128i sign1 = _mm_srai_epi16(c1, 15);
    __m128i q0 = invert_sign_sse2(c0, sign0);
    __m128i q1 = invert_sign_sse2(c1, sign1);

    update_mask0(&q0, &q1, thresh_vec, iscan + offset, &found0, &scan_mask0);

    __m128i keep0 = _mm_cmpgt_epi16(q0, zbin_ac);
    __m128i keep1 = _mm_cmpgt_epi16(q1, zbin_ac);

    update_mask1(&keep0, &keep1, iscan + offset, &found1, &scan_mask1);

    if (_mm_movemask_epi8(_mm_or_si128(keep0, keep1)) == 0) {
      for (int k = 0; k < 16; k += 4) {
        _mm_store_si128((__m128i *)(qcoeff_ptr + offset + k), zero);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + offset + k), zero);
      }
      continue;
    }

    calculate_qcoeff_log_scale(&q0, round_ac, quant_ac, &shift_ac, &log_scale);
    calculate_qcoeff_log_scale(&q1, round_ac, quant_ac, &shift_ac, &log_scale);

    q0 = _mm_and_si128(invert_sign_sse2(q0, sign0), keep0);
    q1 = _mm_and_si128(invert_sign_sse2(q1, sign1), keep1);

    store_coefficients(q0, qcoeff_ptr + offset);
    store_coefficients(q1, qcoeff_ptr + offset + 8);

    calculate_dqcoeff_and_store_log_scale(q0, dequant_ac, zero,
                                          dqcoeff_ptr + offset, &log_scale);
    calculate_dqcoeff_and_store_log_scale(q1, dequant_ac, zero,
                                          dqcoeff_ptr + offset + 8, &log_scale);
  }

  // Scan-order counts derived from the two accumulated masks; a count stays
  // zero when the matching update_mask*() call never reported a hit.
  const int nz_count = found0 ? calculate_non_zero_count(scan_mask0) : 0;
  const int nz_count_prescan =
      found1 ? calculate_non_zero_count(scan_mask1) : 0;

  // Drop every coefficient whose scan position lies in
  // [nz_count, nz_count_prescan).
  for (int i = nz_count; i < nz_count_prescan; ++i) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  // The end of block is one past the last non-zero coefficient in scan order.
  int eob_count = nz_count;
  while (eob_count > 0 && qcoeff_ptr[scan[eob_count - 1]] == 0) --eob_count;
  *eob_ptr = (uint16_t)eob_count;

#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment the following loop with intrinsic by combining
  // with the quantization loop above
  int first_nz = -1;
  for (int i = 0; i < nz_count; ++i) {
    if (qcoeff_ptr[scan[i]] != 0) {
      first_nz = i;
      break;
    }
  }

  // When the block holds exactly one non-zero coefficient and it is +/-1,
  // re-test it against a threshold built with the adjusted EOB factor and
  // drop it (leaving an empty block) if it falls short.
  const int last_nz = *eob_ptr - 1;
  if (last_nz >= 0 && first_nz == last_nz) {
    const int rc = scan[last_nz];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int is_ac = (rc != 0);
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[is_ac] * factor, 7);
      const int limit = zbins[is_ac] * (1 << AOM_QM_BITS) + prescan_add_val;
      if (abs_coeff < limit) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}
426 
// SSE2 adaptive quantizer using log_scale = 2 (the 64x64 variant).
//
// Walks coeff_ptr in groups of 16 coefficients. Each group produces 16
// entries in qcoeff_ptr and dqcoeff_ptr (all zero when no magnitude exceeds
// the scaled zbin vector), and the end-of-block count is written to
// *eob_ptr.
//
// Lane 0 of each table applies to the very first coefficient (DC); the upper
// four 16-bit lanes are reused for every other coefficient (AC). zbin and
// round are right-shifted by log_scale with rounding before use.
//
// The first group is handled unconditionally, so n_coeffs is presumably a
// multiple of 16 and at least 16 -- confirm against callers. The aligned
// loads/stores below require the five table pointers, qcoeff_ptr and
// dqcoeff_ptr to be 16-byte aligned.
void aom_quantize_b_64x64_adaptive_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  const int log_scale = 2;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  // Rounding term for a right shift by log_scale.
  const __m128i rounding = _mm_set1_epi16((1 << log_scale) >> 1);
  const qm_val_t wt = (1 << AOM_QM_BITS);

  // Scalar copies of the scaled DC/AC zbin values.
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };

  // 32-bit prescan thresholds: lane 0 is built from the DC entries, the three
  // remaining lanes from the AC entries.
  const int prescan_add_dc =
      ROUND_POWER_OF_TWO(dequant_ptr[0] * EOB_FACTOR, 7);
  const int prescan_add_ac =
      ROUND_POWER_OF_TWO(dequant_ptr[1] * EOB_FACTOR, 7);
  int thresh32[4];
  thresh32[0] = (zbins[0] * wt + prescan_add_dc) - 1;
  thresh32[1] = (zbins[1] * wt + prescan_add_ac) - 1;
  thresh32[2] = thresh32[1];
  thresh32[3] = thresh32[1];

  // thresh_vec[0] covers lanes 0..3 of a group, thresh_vec[1] is AC only.
  __m128i thresh_vec[2];
  thresh_vec[0] = _mm_loadu_si128((const __m128i *)thresh32);
  thresh_vec[1] = _mm_unpackhi_epi64(thresh_vec[0], thresh_vec[0]);

  // Quantizer tables for the first eight coefficients. zbin and round are
  // shifted with rounding; zbin is then lowered by one so that the
  // greater-than compare below accepts magnitudes equal to the scaled zbin.
  __m128i zbin = _mm_load_si128((const __m128i *)zbin_ptr);
  __m128i round = _mm_load_si128((const __m128i *)round_ptr);
  __m128i quant = _mm_load_si128((const __m128i *)quant_ptr);
  __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  __m128i shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
  zbin = _mm_srli_epi16(_mm_add_epi16(zbin, rounding), log_scale);
  zbin = _mm_sub_epi16(zbin, one);
  round = _mm_srli_epi16(_mm_add_epi16(round, rounding), log_scale);

  // AC-only counterparts (upper 64 bits duplicated), used for everything
  // after the first eight coefficients.
  __m128i zbin_ac = _mm_unpackhi_epi64(zbin, zbin);
  __m128i round_ac = _mm_unpackhi_epi64(round, round);
  __m128i quant_ac = _mm_unpackhi_epi64(quant, quant);
  __m128i shift_ac = _mm_unpackhi_epi64(shift, shift);
  __m128i dequant_ac = _mm_unpackhi_epi64(dequant, dequant);

  // State accumulated by update_mask0()/update_mask1() across all groups.
  int found0 = 0, found1 = 0;
  __m128i scan_mask0 = zero, scan_mask1 = zero;

  // DC plus the first 15 AC coefficients.
  {
    const __m128i c0 = load_coefficients(coeff_ptr);
    const __m128i c1 = load_coefficients(coeff_ptr + 8);

    // Sign masks (all ones for negative lanes) and the resulting magnitudes.
    const __m128i sign0 = _mm_srai_epi16(c0, 15);
    const __m128i sign1 = _mm_srai_epi16(c1, 15);
    __m128i q0 = invert_sign_sse2(c0, sign0);
    __m128i q1 = invert_sign_sse2(c1, sign1);

    update_mask0(&q0, &q1, thresh_vec, iscan, &found0, &scan_mask0);
    // Every later group is AC only.
    thresh_vec[0] = thresh_vec[1];

    __m128i keep0 = _mm_cmpgt_epi16(q0, zbin);
    __m128i keep1 = _mm_cmpgt_epi16(q1, zbin_ac);

    update_mask1(&keep0, &keep1, iscan, &found1, &scan_mask1);

    if (_mm_movemask_epi8(_mm_or_si128(keep0, keep1)) != 0) {
      calculate_qcoeff_log_scale(&q0, round, quant, &shift, &log_scale);
      calculate_qcoeff_log_scale(&q1, round_ac, quant_ac, &shift_ac,
                                 &log_scale);

      // Restore the signs, then clear the lanes that failed the zbin test.
      q0 = _mm_and_si128(invert_sign_sse2(q0, sign0), keep0);
      q1 = _mm_and_si128(invert_sign_sse2(q1, sign1), keep1);

      store_coefficients(q0, qcoeff_ptr);
      store_coefficients(q1, qcoeff_ptr + 8);

      calculate_dqcoeff_and_store_log_scale(q0, dequant, zero, dqcoeff_ptr,
                                            &log_scale);
      calculate_dqcoeff_and_store_log_scale(q1, dequant_ac, zero,
                                            dqcoeff_ptr + 8, &log_scale);
    } else {
      // Nothing passed the zbin test: the whole group quantizes to zero.
      for (int k = 0; k < 16; k += 4) {
        _mm_store_si128((__m128i *)(qcoeff_ptr + k), zero);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + k), zero);
      }
    }
  }

  // Remaining groups use the AC tables only.
  for (int offset = 16; offset < n_coeffs; offset += 16) {
    const __m128i c0 = load_coefficients(coeff_ptr + offset);
    const __m128i c1 = load_coefficients(coeff_ptr + offset + 8);

    const __m128i sign0 = _mm_srai_epi16(c0, 15);
    const __m128i sign1 = _mm_srai_epi16(c1, 15);
    __m128i q0 = invert_sign_sse2(c0, sign0);
    __m128i q1 = invert_sign_sse2(c1, sign1);

    update_mask0(&q0, &q1, thresh_vec, iscan + offset, &found0, &scan_mask0);

    __m128i keep0 = _mm_cmpgt_epi16(q0, zbin_ac);
    __m128i keep1 = _mm_cmpgt_epi16(q1, zbin_ac);

    update_mask1(&keep0, &keep1, iscan + offset, &found1, &scan_mask1);

    if (_mm_movemask_epi8(_mm_or_si128(keep0, keep1)) == 0) {
      for (int k = 0; k < 16; k += 4) {
        _mm_store_si128((__m128i *)(qcoeff_ptr + offset + k), zero);
        _mm_store_si128((__m128i *)(dqcoeff_ptr + offset + k), zero);
      }
      continue;
    }

    calculate_qcoeff_log_scale(&q0, round_ac, quant_ac, &shift_ac, &log_scale);
    calculate_qcoeff_log_scale(&q1, round_ac, quant_ac, &shift_ac, &log_scale);

    q0 = _mm_and_si128(invert_sign_sse2(q0, sign0), keep0);
    q1 = _mm_and_si128(invert_sign_sse2(q1, sign1), keep1);

    store_coefficients(q0, qcoeff_ptr + offset);
    store_coefficients(q1, qcoeff_ptr + offset + 8);

    calculate_dqcoeff_and_store_log_scale(q0, dequant_ac, zero,
                                          dqcoeff_ptr + offset, &log_scale);
    calculate_dqcoeff_and_store_log_scale(q1, dequant_ac, zero,
                                          dqcoeff_ptr + offset + 8, &log_scale);
  }

  // Scan-order counts derived from the two accumulated masks; a count stays
  // zero when the matching update_mask*() call never reported a hit.
  const int nz_count = found0 ? calculate_non_zero_count(scan_mask0) : 0;
  const int nz_count_prescan =
      found1 ? calculate_non_zero_count(scan_mask1) : 0;

  // Drop every coefficient whose scan position lies in
  // [nz_count, nz_count_prescan).
  for (int i = nz_count; i < nz_count_prescan; ++i) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  // The end of block is one past the last non-zero coefficient in scan order.
  int eob_count = nz_count;
  while (eob_count > 0 && qcoeff_ptr[scan[eob_count - 1]] == 0) --eob_count;
  *eob_ptr = (uint16_t)eob_count;

#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment the following loop with intrinsic by combining
  // with the quantization loop above
  int first_nz = -1;
  for (int i = 0; i < nz_count; ++i) {
    if (qcoeff_ptr[scan[i]] != 0) {
      first_nz = i;
      break;
    }
  }

  // When the block holds exactly one non-zero coefficient and it is +/-1,
  // re-test it against a threshold built with the adjusted EOB factor and
  // drop it (leaving an empty block) if it falls short.
  const int last_nz = *eob_ptr - 1;
  if (last_nz >= 0 && first_nz == last_nz) {
    const int rc = scan[last_nz];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int is_ac = (rc != 0);
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[is_ac] * factor, 7);
      const int limit = zbins[is_ac] * (1 << AOM_QM_BITS) + prescan_add_val;
      if (abs_coeff < limit) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}
634