xref: /aosp_15_r20/external/libaom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <emmintrin.h>
13 #include "config/aom_dsp_rtcd.h"
14 
15 #include "aom/aom_integer.h"
16 #include "aom_dsp/quantize.h"
17 #include "aom_dsp/x86/quantize_x86.h"
18 
// Conditionally negates each 64-bit lane of `a`.
//
// Computes (a ^ sign) - sign per 64-bit lane. A lane of `sign` that is all
// ones therefore yields the two's complement negation of the matching lane of
// `a`; a lane of `sign` that is zero leaves that lane of `a` unchanged.
static inline __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) {
  const __m128i flipped = _mm_xor_si128(a, sign);
  return _mm_sub_epi64(flipped, sign);
}
23 
highbd_mul_shift_sse2(const __m128i * x,const __m128i * y,__m128i * p,const int shift)24 static inline void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y,
25                                          __m128i *p, const int shift) {
26   __m128i sign = _mm_srai_epi32(*y, 31);
27   __m128i sign_lo = _mm_unpacklo_epi32(sign, sign);
28   __m128i sign_hi = _mm_unpackhi_epi32(sign, sign);
29   __m128i abs_y = invert_sign_32_sse2(*y, sign);
30   __m128i prod_lo = _mm_mul_epu32(*x, abs_y);
31   __m128i prod_hi = _mm_srli_epi64(*x, 32);
32   const __m128i mult_hi = _mm_srli_epi64(abs_y, 32);
33   prod_hi = _mm_mul_epu32(prod_hi, mult_hi);
34   prod_lo = highbd_invert_sign_64bit_sse2(prod_lo, sign_lo);
35   prod_hi = highbd_invert_sign_64bit_sse2(prod_hi, sign_hi);
36 
37   prod_lo = _mm_srli_epi64(prod_lo, shift);
38   const __m128i mask = _mm_set_epi32(0, -1, 0, -1);
39   prod_lo = _mm_and_si128(prod_lo, mask);
40   prod_hi = _mm_srli_epi64(prod_hi, shift);
41 
42   prod_hi = _mm_slli_epi64(prod_hi, 32);
43   *p = _mm_or_si128(prod_lo, prod_hi);
44 }
45 
// Quantizes four 32-bit coefficient lanes in place.
//
// Per lane, with t = *coeff + *round, this computes
//   *coeff = ((((t * quant) >> 16) + t) * shift) >> (16 - *log_scale)
// where both multiply-and-shift steps are done by highbd_mul_shift_sse2().
//
//   coeff     - in: coefficient lanes (callers pass absolute values);
//               out: quantized lanes.
//   round     - rounding offset added before the first multiply.
//   quant     - first multiplier.
//   shift     - second multiplier.
//   log_scale - pointer to the scale exponent subtracted from the final
//               shift count of 16.
static inline void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round,
                                           const __m128i *quant,
                                           const __m128i *shift,
                                           const int *log_scale) {
  const __m128i rounded = _mm_add_epi32(*coeff, *round);
  __m128i scaled;
  highbd_mul_shift_sse2(&rounded, quant, &scaled, 16);
  const __m128i sum = _mm_add_epi32(scaled, rounded);
  highbd_mul_shift_sse2(&sum, shift, coeff, 16 - *log_scale);
}
56 
// Folds the iscan values selected by a comparison mask into a running
// per-lane maximum.
//
//   cmp_mask0 - eight 16-bit lanes; lanes that are all ones select the
//               matching iscan entry, zero lanes contribute 0.
//   iscan_ptr - eight int16_t values, read with an aligned 128-bit load (so
//               the pointer must be 16-byte aligned). Only read when at least
//               one bit of *cmp_mask0 is set.
//   is_found  - set to 1 when any bit of *cmp_mask0 is set; otherwise left
//               untouched.
//   mask      - in/out: updated to the signed 16-bit lane-wise maximum of its
//               previous value and the selected iscan values.
static inline void highbd_update_mask1(__m128i *cmp_mask0,
                                       const int16_t *iscan_ptr, int *is_found,
                                       __m128i *mask) {
  __m128i candidate = _mm_setzero_si128();
  if (_mm_movemask_epi8(*cmp_mask0) != 0) {
    const __m128i iscan_vals = _mm_load_si128((const __m128i *)iscan_ptr);
    candidate = _mm_and_si128(*cmp_mask0, iscan_vals);
    *is_found = 1;
  }
  *mask = _mm_max_epi16(candidate, *mask);
}
69 
highbd_update_mask0(__m128i * qcoeff0,__m128i * qcoeff1,__m128i * threshold,const int16_t * iscan_ptr,int * is_found,__m128i * mask)70 static inline void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1,
71                                        __m128i *threshold,
72                                        const int16_t *iscan_ptr, int *is_found,
73                                        __m128i *mask) {
74   __m128i coeff[2], cmp_mask0, cmp_mask1;
75 
76   coeff[0] = _mm_slli_epi32(*qcoeff0, AOM_QM_BITS);
77   cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]);
78   coeff[1] = _mm_slli_epi32(*qcoeff1, AOM_QM_BITS);
79   cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]);
80 
81   cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1);
82 
83   highbd_update_mask1(&cmp_mask0, iscan_ptr, is_found, mask);
84 }
85 
highbd_calculate_dqcoeff(__m128i qcoeff,__m128i dequant,const int log_scale)86 static inline __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant,
87                                                const int log_scale) {
88   __m128i coeff_sign = _mm_srai_epi32(qcoeff, 31);
89   __m128i abs_coeff = invert_sign_32_sse2(qcoeff, coeff_sign);
90   highbd_mul_shift_sse2(&abs_coeff, &dequant, &abs_coeff, log_scale);
91   return invert_sign_32_sse2(abs_coeff, coeff_sign);
92 }
93 
// SSE2 adaptive quantizer for high-bitdepth coefficients, log_scale = 0.
//
// Coefficients are handled eight at a time (two vectors of four 32-bit
// lanes). The first group uses entry 0 of the quantizer tables for lane 0;
// all later groups use the values left in the upper half of the table
// vectors after the "switch DC to AC" step.
//
//   coeff_ptr       - input coefficients, read with aligned 128-bit loads.
//   n_coeffs        - number of coefficients. The first 8 are always
//                     processed and the loop advances in steps of 8.
//                     NOTE(review): presumably a multiple of 8 -- confirm.
//   zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
//                   - int16_t quantizer tables; 128 bits are loaded from each
//                     with aligned loads and the low four entries are used.
//   qcoeff_ptr      - output quantized coefficients (aligned stores).
//   dqcoeff_ptr     - output dequantized coefficients (aligned stores).
//   eob_ptr         - output: 1 + the highest scan position holding a
//                     non-zero quantized coefficient, or 0 if there is none.
//   scan            - maps scan position to coefficient index.
//   iscan           - per-coefficient values read eight at a time with
//                     aligned loads and accumulated into running maxima.
//
// All pointers used with _mm_load_si128/_mm_store_si128 must be 16-byte
// aligned.
void aom_highbd_quantize_b_adaptive_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  // The first 8 coefficients are handled before the loop, so it starts at 8.
  int index = 8;
  const int log_scale = 0;
  int non_zero_count = 0;
  int non_zero_count_prescan_add_zero = 0;
  int is_found0 = 0, is_found1 = 0;
  int eob = -1;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi32(1);
  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1, cmp_mask;
  __m128i all_zero;
  // mask0 accumulates iscan values of coefficients above the prescan
  // threshold; mask1 accumulates those above the plain zbin threshold.
  __m128i mask0 = zero, mask1 = zero;

  // Scalar prescan thresholds: index 0 is used for lane 0 of the first
  // group, index 1 for every other lane. The "- 1" turns the later
  // greater-than compare into a greater-or-equal test.
  int prescan_add[2];
  int thresh[4];
  const qm_val_t wt = (1 << AOM_QM_BITS);
  for (int i = 0; i < 2; ++i) {
    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
  }
  thresh[2] = thresh[3] = thresh[1];
  // threshold[0] = { t0, t1, t1, t1 }, threshold[1] = { t1, t1, t1, t1 }.
  __m128i threshold[2];
  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);

#if SKIP_EOB_FACTOR_ADJUST
  int first = -1;
#endif
  // Setup global values.
  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
  round = _mm_load_si128((const __m128i *)round_ptr);
  quant = _mm_load_si128((const __m128i *)quant_ptr);
  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);

  // Sign-extend the low four int16_t entries of each table to 32 bits by
  // interleaving them with their sign words.
  __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
  __m128i round_sign = _mm_srai_epi16(round, 15);
  __m128i quant_sign = _mm_srai_epi16(quant, 15);
  __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
  __m128i shift_sign = _mm_srai_epi16(shift, 15);

  zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
  round = _mm_unpacklo_epi16(round, round_sign);
  quant = _mm_unpacklo_epi16(quant, quant_sign);
  dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
  shift = _mm_unpacklo_epi16(shift, shift_sign);
  // zbin - 1, so that "abs(coeff) > zbin - 1" tests "abs(coeff) >= zbin".
  zbin = _mm_sub_epi32(zbin, one);

  // Do the first 8 coefficients (DC and the first 7 AC).
  coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
  coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));

  // Split into sign and magnitude; qcoeff0/1 hold the magnitudes from here.
  coeff0_sign = _mm_srai_epi32(coeff0, 31);
  coeff1_sign = _mm_srai_epi32(coeff1, 31);
  qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
  qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);

  highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);

  cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
  cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
  cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
  highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);

  // From now on both halves of every group use the AC-only threshold.
  threshold[0] = threshold[1];
  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
  if (_mm_movemask_epi8(all_zero) == 0) {
    // No coefficient in this group reaches zbin: emit zeros, but still move
    // the table vectors to their upper halves for the AC loop below.
    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);

    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
    // First four lanes use the original table vectors (lane 0 = entry 0).
    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);

    // Switch the tables to their upper halves before the second four lanes.
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);

    // Reinsert signs
    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);

    // Mask out zbin threshold coeffs
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);

    // dequant is switched to its upper half between the two halves, mirroring
    // the round/quant/shift switch above.
    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
    _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
  }

  // AC only loop.
  while (index < n_coeffs) {
    coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
    coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));

    coeff0_sign = _mm_srai_epi32(coeff0, 31);
    coeff1_sign = _mm_srai_epi32(coeff1, 31);
    qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);

    highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
                        &is_found0, &mask0);

    cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
    cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
    highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);

    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_movemask_epi8(all_zero) == 0) {
      // Whole group is below zbin: store zeros and skip the arithmetic.
      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
      index += 8;
      continue;
    }
    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);

    // Reinsert signs.
    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);

    // Zero the lanes that did not reach zbin.
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
    _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);

    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);

    _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);

    index += 8;
  }
  // NOTE(review): calculate_non_zero_count() is defined outside this file;
  // presumably it reduces the accumulated iscan maxima to a scan-order
  // count -- confirm.
  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
  if (is_found1)
    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);

  // Clear every output at scan positions in
  // [non_zero_count, non_zero_count_prescan_add_zero).
  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  // Find the last scan position below non_zero_count that is still non-zero.
  for (int i = non_zero_count - 1; i >= 0; i--) {
    const int rc = scan[i];
    if (qcoeff_ptr[rc]) {
      eob = i;
      break;
    }
  }

  *eob_ptr = eob + 1;
#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment the following loop with intrinsic by combining
  // with the quantization loop above
  // Find the first non-zero quantized coefficient in scan order.
  for (int i = 0; i < non_zero_count; i++) {
    const int rc = scan[i];
    const int qcoeff = qcoeff_ptr[rc];
    if (qcoeff) {
      first = i;
      break;
    }
  }
  // If the first and last non-zero positions coincide (a single non-zero
  // coefficient) and its quantized value is +/-1, re-test the input against a
  // threshold built with the larger factor; drop it if it falls below.
  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    const int rc = scan[(*eob_ptr - 1)];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      if (abs_coeff <
          (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}
301 
// SSE2 adaptive quantizer for high-bitdepth coefficients, log_scale = 1.
//
// Same structure as a plain adaptive quantizer, except that zbin and round
// are halved with rounding before use, the final quantization shift is
// 16 - 1, and dequantized values are shifted right by 1.
//
// Coefficients are handled eight at a time (two vectors of four 32-bit
// lanes). The first group uses entry 0 of the quantizer tables for lane 0;
// all later groups use the values left in the upper half of the table
// vectors after the "switch DC to AC" step.
//
//   coeff_ptr       - input coefficients, read with aligned 128-bit loads.
//   n_coeffs        - number of coefficients. The first 8 are always
//                     processed and the loop advances in steps of 8.
//                     NOTE(review): presumably a multiple of 8 -- confirm.
//   zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
//                   - int16_t quantizer tables; 128 bits are loaded from each
//                     with aligned loads and the low four entries are used.
//   qcoeff_ptr      - output quantized coefficients (aligned stores).
//   dqcoeff_ptr     - output dequantized coefficients (aligned stores).
//   eob_ptr         - output: 1 + the highest scan position holding a
//                     non-zero quantized coefficient, or 0 if there is none.
//   scan            - maps scan position to coefficient index.
//   iscan           - per-coefficient values read eight at a time with
//                     aligned loads and accumulated into running maxima.
//
// All pointers used with _mm_load_si128/_mm_store_si128 must be 16-byte
// aligned.
void aom_highbd_quantize_b_32x32_adaptive_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  // The first 8 coefficients are handled before the loop, so it starts at 8.
  int index = 8;
  const int log_scale = 1;
  int non_zero_count = 0;
  int non_zero_count_prescan_add_zero = 0;
  int is_found0 = 0, is_found1 = 0;
  int eob = -1;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi32(1);
  // Rounding offset for the ">> log_scale" below: 1 is half of 1 << 1.
  const __m128i log_scale_vec = _mm_set1_epi32(log_scale);
  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1, cmp_mask;
  __m128i all_zero;
  // mask0 accumulates iscan values of coefficients above the prescan
  // threshold; mask1 accumulates those above the plain zbin threshold.
  __m128i mask0 = zero, mask1 = zero;

  // Scalar copies of the scaled zbin values for entries 0 and 1.
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
  // Scalar prescan thresholds: index 0 is used for lane 0 of the first
  // group, index 1 for every other lane. The "- 1" turns the later
  // greater-than compare into a greater-or-equal test.
  int prescan_add[2];
  int thresh[4];
  const qm_val_t wt = (1 << AOM_QM_BITS);
  for (int i = 0; i < 2; ++i) {
    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
  }
  thresh[2] = thresh[3] = thresh[1];
  // threshold[0] = { t0, t1, t1, t1 }, threshold[1] = { t1, t1, t1, t1 }.
  __m128i threshold[2];
  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);

#if SKIP_EOB_FACTOR_ADJUST
  int first = -1;
#endif
  // Setup global values.
  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
  round = _mm_load_si128((const __m128i *)round_ptr);
  quant = _mm_load_si128((const __m128i *)quant_ptr);
  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);

  // Sign-extend the low four int16_t entries of each table to 32 bits by
  // interleaving them with their sign words.
  __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
  __m128i round_sign = _mm_srai_epi16(round, 15);
  __m128i quant_sign = _mm_srai_epi16(quant, 15);
  __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
  __m128i shift_sign = _mm_srai_epi16(shift, 15);

  zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
  round = _mm_unpacklo_epi16(round, round_sign);
  quant = _mm_unpacklo_epi16(quant, quant_sign);
  dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
  shift = _mm_unpacklo_epi16(shift, shift_sign);

  // Shift with rounding: (v + 1) >> 1 for zbin and round. The shift is
  // logical, so it is only a rounded halving for non-negative values.
  zbin = _mm_add_epi32(zbin, log_scale_vec);
  round = _mm_add_epi32(round, log_scale_vec);
  zbin = _mm_srli_epi32(zbin, log_scale);
  round = _mm_srli_epi32(round, log_scale);
  // zbin - 1, so that "abs(coeff) > zbin - 1" tests "abs(coeff) >= zbin".
  zbin = _mm_sub_epi32(zbin, one);

  // Do the first 8 coefficients (DC and the first 7 AC).
  coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
  coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));

  // Split into sign and magnitude; qcoeff0/1 hold the magnitudes from here.
  coeff0_sign = _mm_srai_epi32(coeff0, 31);
  coeff1_sign = _mm_srai_epi32(coeff1, 31);
  qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
  qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);

  highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);

  cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
  cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
  cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
  highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);

  // From now on both halves of every group use the AC-only threshold.
  threshold[0] = threshold[1];
  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
  if (_mm_movemask_epi8(all_zero) == 0) {
    // No coefficient in this group reaches zbin: emit zeros, but still move
    // the table vectors to their upper halves for the AC loop below.
    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);

    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
    // First four lanes use the original table vectors (lane 0 = entry 0).
    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);

    // Switch the tables to their upper halves before the second four lanes.
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);

    // Reinsert signs
    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);

    // Mask out zbin threshold coeffs
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);

    // dequant is switched to its upper half between the two halves, mirroring
    // the round/quant/shift switch above.
    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
    _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
  }

  // AC only loop.
  while (index < n_coeffs) {
    coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
    coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));

    coeff0_sign = _mm_srai_epi32(coeff0, 31);
    coeff1_sign = _mm_srai_epi32(coeff1, 31);
    qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);

    highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
                        &is_found0, &mask0);

    cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
    cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
    highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);

    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_movemask_epi8(all_zero) == 0) {
      // Whole group is below zbin: store zeros and skip the arithmetic.
      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
      index += 8;
      continue;
    }
    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);

    // Reinsert signs.
    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);

    // Zero the lanes that did not reach zbin.
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
    _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);

    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);

    _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);

    index += 8;
  }
  // NOTE(review): calculate_non_zero_count() is defined outside this file;
  // presumably it reduces the accumulated iscan maxima to a scan-order
  // count -- confirm.
  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
  if (is_found1)
    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);

  // Clear every output at scan positions in
  // [non_zero_count, non_zero_count_prescan_add_zero).
  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  // Find the last scan position below non_zero_count that is still non-zero.
  for (int i = non_zero_count - 1; i >= 0; i--) {
    const int rc = scan[i];
    if (qcoeff_ptr[rc]) {
      eob = i;
      break;
    }
  }

  *eob_ptr = eob + 1;
#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment the following loop with intrinsic by combining
  // with the quantization loop above
  // Find the first non-zero quantized coefficient in scan order.
  for (int i = 0; i < non_zero_count; i++) {
    const int rc = scan[i];
    const int qcoeff = qcoeff_ptr[rc];
    if (qcoeff) {
      first = i;
      break;
    }
  }
  // If the first and last non-zero positions coincide (a single non-zero
  // coefficient) and its quantized value is +/-1, re-test the input against a
  // threshold built with the larger factor; drop it if it falls below.
  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    const int rc = scan[(*eob_ptr - 1)];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}
517 
// SSE2 adaptive quantizer for high-bitdepth coefficients, log_scale = 2.
//
// Same structure as a plain adaptive quantizer, except that zbin and round
// are divided by 4 with rounding before use, the final quantization shift is
// 16 - 2, and dequantized values are shifted right by 2.
//
// Coefficients are handled eight at a time (two vectors of four 32-bit
// lanes). The first group uses entry 0 of the quantizer tables for lane 0;
// all later groups use the values left in the upper half of the table
// vectors after the "switch DC to AC" step.
//
//   coeff_ptr       - input coefficients, read with aligned 128-bit loads.
//   n_coeffs        - number of coefficients. The first 8 are always
//                     processed and the loop advances in steps of 8.
//                     NOTE(review): presumably a multiple of 8 -- confirm.
//   zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
//                   - int16_t quantizer tables; 128 bits are loaded from each
//                     with aligned loads and the low four entries are used.
//   qcoeff_ptr      - output quantized coefficients (aligned stores).
//   dqcoeff_ptr     - output dequantized coefficients (aligned stores).
//   eob_ptr         - output: 1 + the highest scan position holding a
//                     non-zero quantized coefficient, or 0 if there is none.
//   scan            - maps scan position to coefficient index.
//   iscan           - per-coefficient values read eight at a time with
//                     aligned loads and accumulated into running maxima.
//
// All pointers used with _mm_load_si128/_mm_store_si128 must be 16-byte
// aligned.
void aom_highbd_quantize_b_64x64_adaptive_sse2(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan) {
  // The first 8 coefficients are handled before the loop, so it starts at 8.
  int index = 8;
  const int log_scale = 2;
  int non_zero_count = 0;
  int non_zero_count_prescan_add_zero = 0;
  int is_found0 = 0, is_found1 = 0;
  int eob = -1;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi32(1);
  // Rounding offset for the ">> log_scale" below: 2 is half of 1 << 2, so
  // the value of log_scale happens to equal the required rounding term.
  const __m128i log_scale_vec = _mm_set1_epi32(log_scale);
  __m128i zbin, round, quant, dequant, shift;
  __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
  __m128i qcoeff0, qcoeff1;
  __m128i cmp_mask0, cmp_mask1, cmp_mask;
  __m128i all_zero;
  // mask0 accumulates iscan values of coefficients above the prescan
  // threshold; mask1 accumulates those above the plain zbin threshold.
  __m128i mask0 = zero, mask1 = zero;

  // Scalar copies of the scaled zbin values for entries 0 and 1.
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
  // Scalar prescan thresholds: index 0 is used for lane 0 of the first
  // group, index 1 for every other lane. The "- 1" turns the later
  // greater-than compare into a greater-or-equal test.
  int prescan_add[2];
  int thresh[4];
  const qm_val_t wt = (1 << AOM_QM_BITS);
  for (int i = 0; i < 2; ++i) {
    prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
  }
  thresh[2] = thresh[3] = thresh[1];
  // threshold[0] = { t0, t1, t1, t1 }, threshold[1] = { t1, t1, t1, t1 }.
  __m128i threshold[2];
  threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
  threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);

#if SKIP_EOB_FACTOR_ADJUST
  int first = -1;
#endif
  // Setup global values.
  zbin = _mm_load_si128((const __m128i *)zbin_ptr);
  round = _mm_load_si128((const __m128i *)round_ptr);
  quant = _mm_load_si128((const __m128i *)quant_ptr);
  dequant = _mm_load_si128((const __m128i *)dequant_ptr);
  shift = _mm_load_si128((const __m128i *)quant_shift_ptr);

  // Sign-extend the low four int16_t entries of each table to 32 bits by
  // interleaving them with their sign words.
  __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
  __m128i round_sign = _mm_srai_epi16(round, 15);
  __m128i quant_sign = _mm_srai_epi16(quant, 15);
  __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
  __m128i shift_sign = _mm_srai_epi16(shift, 15);

  zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
  round = _mm_unpacklo_epi16(round, round_sign);
  quant = _mm_unpacklo_epi16(quant, quant_sign);
  dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
  shift = _mm_unpacklo_epi16(shift, shift_sign);

  // Shift with rounding: (v + 2) >> 2 for zbin and round. The shift is
  // logical, so it is only a rounded division for non-negative values.
  zbin = _mm_add_epi32(zbin, log_scale_vec);
  round = _mm_add_epi32(round, log_scale_vec);
  zbin = _mm_srli_epi32(zbin, log_scale);
  round = _mm_srli_epi32(round, log_scale);
  // zbin - 1, so that "abs(coeff) > zbin - 1" tests "abs(coeff) >= zbin".
  zbin = _mm_sub_epi32(zbin, one);

  // Do the first 8 coefficients (DC and the first 7 AC).
  coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
  coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));

  // Split into sign and magnitude; qcoeff0/1 hold the magnitudes from here.
  coeff0_sign = _mm_srai_epi32(coeff0, 31);
  coeff1_sign = _mm_srai_epi32(coeff1, 31);
  qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
  qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);

  highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);

  cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC
  cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
  cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
  highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);

  // From now on both halves of every group use the AC-only threshold.
  threshold[0] = threshold[1];
  all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
  if (_mm_movemask_epi8(all_zero) == 0) {
    // No coefficient in this group reaches zbin: emit zeros, but still move
    // the table vectors to their upper halves for the AC loop below.
    _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);

    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  } else {
    // First four lanes use the original table vectors (lane 0 = entry 0).
    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);

    // Switch the tables to their upper halves before the second four lanes.
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);

    // Reinsert signs
    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);

    // Mask out zbin threshold coeffs
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
    _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);

    // dequant is switched to its upper half between the two halves, mirroring
    // the round/quant/shift switch above.
    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
    _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
  }

  // AC only loop.
  while (index < n_coeffs) {
    coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
    coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));

    coeff0_sign = _mm_srai_epi32(coeff0, 31);
    coeff1_sign = _mm_srai_epi32(coeff1, 31);
    qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);

    highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
                        &is_found0, &mask0);

    cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
    cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
    cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
    highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);

    all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
    if (_mm_movemask_epi8(all_zero) == 0) {
      // Whole group is below zbin: store zeros and skip the arithmetic.
      _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
      _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
      index += 8;
      continue;
    }
    highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
    highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);

    // Reinsert signs.
    qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
    qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);

    // Zero the lanes that did not reach zbin.
    qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
    qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);

    _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
    _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);

    coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
    coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);

    _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
    _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);

    index += 8;
  }
  // NOTE(review): calculate_non_zero_count() is defined outside this file;
  // presumably it reduces the accumulated iscan maxima to a scan-order
  // count -- confirm.
  if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
  if (is_found1)
    non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);

  // Clear every output at scan positions in
  // [non_zero_count, non_zero_count_prescan_add_zero).
  for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
    const int rc = scan[i];
    qcoeff_ptr[rc] = 0;
    dqcoeff_ptr[rc] = 0;
  }

  // Find the last scan position below non_zero_count that is still non-zero.
  for (int i = non_zero_count - 1; i >= 0; i--) {
    const int rc = scan[i];
    if (qcoeff_ptr[rc]) {
      eob = i;
      break;
    }
  }

  *eob_ptr = eob + 1;
#if SKIP_EOB_FACTOR_ADJUST
  // TODO(Aniket): Experiment the following loop with intrinsic by combining
  // with the quantization loop above
  // Find the first non-zero quantized coefficient in scan order.
  for (int i = 0; i < non_zero_count; i++) {
    const int rc = scan[i];
    const int qcoeff = qcoeff_ptr[rc];
    if (qcoeff) {
      first = i;
      break;
    }
  }
  // If the first and last non-zero positions coincide (a single non-zero
  // coefficient) and its quantized value is +/-1, re-test the input against a
  // threshold built with the larger factor; drop it if it falls below.
  if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
    const int rc = scan[(*eob_ptr - 1)];
    if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
      const int coeff = coeff_ptr[rc] * wt;
      const int coeff_sign = AOMSIGN(coeff);
      const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
      const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
      const int prescan_add_val =
          ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
      if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
        qcoeff_ptr[rc] = 0;
        dqcoeff_ptr[rc] = 0;
        *eob_ptr = 0;
      }
    }
  }
#endif
}
733