xref: /aosp_15_r20/external/libvpx/vpx_dsp/x86/quantize_avx2.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
/*
 *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10*fb1b10abSAndroid Build Coastguard Worker 
11*fb1b10abSAndroid Build Coastguard Worker #include <assert.h>
12*fb1b10abSAndroid Build Coastguard Worker #include <immintrin.h>
13*fb1b10abSAndroid Build Coastguard Worker 
14*fb1b10abSAndroid Build Coastguard Worker #include "./vpx_dsp_rtcd.h"
15*fb1b10abSAndroid Build Coastguard Worker #include "vpx/vpx_integer.h"
16*fb1b10abSAndroid Build Coastguard Worker #include "vp9/common/vp9_scan.h"
17*fb1b10abSAndroid Build Coastguard Worker #include "vp9/encoder/vp9_block.h"
18*fb1b10abSAndroid Build Coastguard Worker 
load_b_values_avx2(const struct macroblock_plane * mb_plane,__m256i * zbin,__m256i * round,__m256i * quant,const int16_t * dequant_ptr,__m256i * dequant,__m256i * shift,int log_scale)19*fb1b10abSAndroid Build Coastguard Worker static VPX_FORCE_INLINE void load_b_values_avx2(
20*fb1b10abSAndroid Build Coastguard Worker     const struct macroblock_plane *mb_plane, __m256i *zbin, __m256i *round,
21*fb1b10abSAndroid Build Coastguard Worker     __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant,
22*fb1b10abSAndroid Build Coastguard Worker     __m256i *shift, int log_scale) {
23*fb1b10abSAndroid Build Coastguard Worker   *zbin =
24*fb1b10abSAndroid Build Coastguard Worker       _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->zbin));
25*fb1b10abSAndroid Build Coastguard Worker   *zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
26*fb1b10abSAndroid Build Coastguard Worker   if (log_scale > 0) {
27*fb1b10abSAndroid Build Coastguard Worker     const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
28*fb1b10abSAndroid Build Coastguard Worker     *zbin = _mm256_add_epi16(*zbin, rnd);
29*fb1b10abSAndroid Build Coastguard Worker     *zbin = _mm256_srai_epi16(*zbin, log_scale);
30*fb1b10abSAndroid Build Coastguard Worker   }
31*fb1b10abSAndroid Build Coastguard Worker   // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
32*fb1b10abSAndroid Build Coastguard Worker   // calculating the zbin mask. (See quantize_b_logscale{0,1,2}_16)
33*fb1b10abSAndroid Build Coastguard Worker   *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
34*fb1b10abSAndroid Build Coastguard Worker 
35*fb1b10abSAndroid Build Coastguard Worker   *round =
36*fb1b10abSAndroid Build Coastguard Worker       _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->round));
37*fb1b10abSAndroid Build Coastguard Worker   *round = _mm256_permute4x64_epi64(*round, 0x54);
38*fb1b10abSAndroid Build Coastguard Worker   if (log_scale > 0) {
39*fb1b10abSAndroid Build Coastguard Worker     const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
40*fb1b10abSAndroid Build Coastguard Worker     *round = _mm256_add_epi16(*round, rnd);
41*fb1b10abSAndroid Build Coastguard Worker     *round = _mm256_srai_epi16(*round, log_scale);
42*fb1b10abSAndroid Build Coastguard Worker   }
43*fb1b10abSAndroid Build Coastguard Worker 
44*fb1b10abSAndroid Build Coastguard Worker   *quant =
45*fb1b10abSAndroid Build Coastguard Worker       _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->quant));
46*fb1b10abSAndroid Build Coastguard Worker   *quant = _mm256_permute4x64_epi64(*quant, 0x54);
47*fb1b10abSAndroid Build Coastguard Worker   *dequant =
48*fb1b10abSAndroid Build Coastguard Worker       _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
49*fb1b10abSAndroid Build Coastguard Worker   *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
50*fb1b10abSAndroid Build Coastguard Worker   *shift = _mm256_castsi128_si256(
51*fb1b10abSAndroid Build Coastguard Worker       _mm_load_si128((const __m128i *)mb_plane->quant_shift));
52*fb1b10abSAndroid Build Coastguard Worker   *shift = _mm256_permute4x64_epi64(*shift, 0x54);
53*fb1b10abSAndroid Build Coastguard Worker }
54*fb1b10abSAndroid Build Coastguard Worker 
55*fb1b10abSAndroid Build Coastguard Worker static VPX_FORCE_INLINE __m256i
load_coefficients_avx2(const tran_low_t * coeff_ptr)56*fb1b10abSAndroid Build Coastguard Worker load_coefficients_avx2(const tran_low_t *coeff_ptr) {
57*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
58*fb1b10abSAndroid Build Coastguard Worker   // typedef int32_t tran_low_t;
59*fb1b10abSAndroid Build Coastguard Worker   const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)coeff_ptr);
60*fb1b10abSAndroid Build Coastguard Worker   const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + 8));
61*fb1b10abSAndroid Build Coastguard Worker   return _mm256_packs_epi32(coeff1, coeff2);
62*fb1b10abSAndroid Build Coastguard Worker #else
63*fb1b10abSAndroid Build Coastguard Worker   // typedef int16_t tran_low_t;
64*fb1b10abSAndroid Build Coastguard Worker   return _mm256_loadu_si256((const __m256i *)coeff_ptr);
65*fb1b10abSAndroid Build Coastguard Worker #endif
66*fb1b10abSAndroid Build Coastguard Worker }
67*fb1b10abSAndroid Build Coastguard Worker 
store_coefficients_avx2(__m256i coeff_vals,tran_low_t * coeff_ptr)68*fb1b10abSAndroid Build Coastguard Worker static VPX_FORCE_INLINE void store_coefficients_avx2(__m256i coeff_vals,
69*fb1b10abSAndroid Build Coastguard Worker                                                      tran_low_t *coeff_ptr) {
70*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
71*fb1b10abSAndroid Build Coastguard Worker   // typedef int32_t tran_low_t;
72*fb1b10abSAndroid Build Coastguard Worker   __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
73*fb1b10abSAndroid Build Coastguard Worker   __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
74*fb1b10abSAndroid Build Coastguard Worker   __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
75*fb1b10abSAndroid Build Coastguard Worker   _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals_lo);
76*fb1b10abSAndroid Build Coastguard Worker   _mm256_storeu_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
77*fb1b10abSAndroid Build Coastguard Worker #else
78*fb1b10abSAndroid Build Coastguard Worker   // typedef int16_t tran_low_t;
79*fb1b10abSAndroid Build Coastguard Worker   _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals);
80*fb1b10abSAndroid Build Coastguard Worker #endif
81*fb1b10abSAndroid Build Coastguard Worker }
82*fb1b10abSAndroid Build Coastguard Worker 
83*fb1b10abSAndroid Build Coastguard Worker static VPX_FORCE_INLINE __m256i
quantize_b_16(const tran_low_t * coeff_ptr,tran_low_t * qcoeff_ptr,tran_low_t * dqcoeff_ptr,__m256i * v_quant,__m256i * v_dequant,__m256i * v_round,__m256i * v_zbin,__m256i * v_quant_shift)84*fb1b10abSAndroid Build Coastguard Worker quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
85*fb1b10abSAndroid Build Coastguard Worker               tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant,
86*fb1b10abSAndroid Build Coastguard Worker               __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) {
87*fb1b10abSAndroid Build Coastguard Worker   const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
88*fb1b10abSAndroid Build Coastguard Worker   const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
89*fb1b10abSAndroid Build Coastguard Worker   const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
90*fb1b10abSAndroid Build Coastguard Worker 
91*fb1b10abSAndroid Build Coastguard Worker   if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
92*fb1b10abSAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
93*fb1b10abSAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
94*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
95*fb1b10abSAndroid Build Coastguard Worker     _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
96*fb1b10abSAndroid Build Coastguard Worker     _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
97*fb1b10abSAndroid Build Coastguard Worker #endif  // CONFIG_VP9_HIGHBITDEPTH
98*fb1b10abSAndroid Build Coastguard Worker     return _mm256_setzero_si256();
99*fb1b10abSAndroid Build Coastguard Worker   }
100*fb1b10abSAndroid Build Coastguard Worker   {
101*fb1b10abSAndroid Build Coastguard Worker     // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0
102*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_tmp_rnd =
103*fb1b10abSAndroid Build Coastguard Worker         _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
104*fb1b10abSAndroid Build Coastguard Worker 
105*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
106*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
107*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift);
108*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_nz_mask =
109*fb1b10abSAndroid Build Coastguard Worker         _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
110*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
111*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
112*fb1b10abSAndroid Build Coastguard Worker     const __m256i low = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
113*fb1b10abSAndroid Build Coastguard Worker     const __m256i high = _mm256_mulhi_epi16(v_qcoeff, *v_dequant);
114*fb1b10abSAndroid Build Coastguard Worker 
115*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_dqcoeff_lo = _mm256_unpacklo_epi16(low, high);
116*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_dqcoeff_hi = _mm256_unpackhi_epi16(low, high);
117*fb1b10abSAndroid Build Coastguard Worker #else
118*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
119*fb1b10abSAndroid Build Coastguard Worker #endif
120*fb1b10abSAndroid Build Coastguard Worker 
121*fb1b10abSAndroid Build Coastguard Worker     store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
122*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
123*fb1b10abSAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
124*fb1b10abSAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
125*fb1b10abSAndroid Build Coastguard Worker #else
126*fb1b10abSAndroid Build Coastguard Worker     store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr);
127*fb1b10abSAndroid Build Coastguard Worker #endif
128*fb1b10abSAndroid Build Coastguard Worker     return v_nz_mask;
129*fb1b10abSAndroid Build Coastguard Worker   }
130*fb1b10abSAndroid Build Coastguard Worker }
131*fb1b10abSAndroid Build Coastguard Worker 
get_max_lane_eob(const int16_t * iscan,__m256i v_eobmax,__m256i v_mask)132*fb1b10abSAndroid Build Coastguard Worker static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
133*fb1b10abSAndroid Build Coastguard Worker                                                  __m256i v_eobmax,
134*fb1b10abSAndroid Build Coastguard Worker                                                  __m256i v_mask) {
135*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
136*fb1b10abSAndroid Build Coastguard Worker   const __m256i v_iscan = _mm256_permute4x64_epi64(
137*fb1b10abSAndroid Build Coastguard Worker       _mm256_loadu_si256((const __m256i *)iscan), 0xD8);
138*fb1b10abSAndroid Build Coastguard Worker #else
139*fb1b10abSAndroid Build Coastguard Worker   const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
140*fb1b10abSAndroid Build Coastguard Worker #endif
141*fb1b10abSAndroid Build Coastguard Worker   const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask);
142*fb1b10abSAndroid Build Coastguard Worker   return _mm256_max_epi16(v_eobmax, v_nz_iscan);
143*fb1b10abSAndroid Build Coastguard Worker }
144*fb1b10abSAndroid Build Coastguard Worker 
accumulate_eob256(__m256i eob256)145*fb1b10abSAndroid Build Coastguard Worker static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) {
146*fb1b10abSAndroid Build Coastguard Worker   const __m128i eob_lo = _mm256_castsi256_si128(eob256);
147*fb1b10abSAndroid Build Coastguard Worker   const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1);
148*fb1b10abSAndroid Build Coastguard Worker   __m128i eob = _mm_max_epi16(eob_lo, eob_hi);
149*fb1b10abSAndroid Build Coastguard Worker   __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
150*fb1b10abSAndroid Build Coastguard Worker   eob = _mm_max_epi16(eob, eob_shuffled);
151*fb1b10abSAndroid Build Coastguard Worker   eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
152*fb1b10abSAndroid Build Coastguard Worker   eob = _mm_max_epi16(eob, eob_shuffled);
153*fb1b10abSAndroid Build Coastguard Worker   eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
154*fb1b10abSAndroid Build Coastguard Worker   eob = _mm_max_epi16(eob, eob_shuffled);
155*fb1b10abSAndroid Build Coastguard Worker   return _mm_extract_epi16(eob, 1);
156*fb1b10abSAndroid Build Coastguard Worker }
157*fb1b10abSAndroid Build Coastguard Worker 
// AVX2 quantizer for |n_coeffs| coefficients.
//
// Quantizes |coeff_ptr| into |qcoeff_ptr|, writes the dequantized values to
// |dqcoeff_ptr|, and stores in |*eob_ptr| the largest scan_order->iscan value
// among the positions whose quantized magnitude is non-zero (0 if there is
// none).
//
// Work is done in groups of 16 and the first group is always processed, so
// |n_coeffs| is expected to be a multiple of 16 and at least 16; this is not
// checked here.
void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const struct macroblock_plane *const mb_plane,
                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
                         const struct ScanOrder *const scan_order) {
  __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
  __m256i v_eob = _mm256_setzero_si256();
  const int16_t *const iscan = scan_order->iscan;
  intptr_t i;

  load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr,
                     &v_dequant, &v_quant_shift, 0);

  // First group: lane 0 uses the first table entry (DC), the rest use AC.
  {
    const __m256i v_nz =
        quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, &v_dequant,
                      &v_round, &v_zbin, &v_quant_shift);
    v_eob = get_max_lane_eob(iscan, v_eob, v_nz);
  }

  // Replicate the upper 64 bits of each 128-bit half so that every lane now
  // carries the AC parameters for the remaining groups.
  v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
  v_round = _mm256_unpackhi_epi64(v_round, v_round);
  v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
  v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
  v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);

  for (i = 16; i < n_coeffs; i += 16) {
    const __m256i v_nz = quantize_b_16(coeff_ptr + i, qcoeff_ptr + i,
                                       dqcoeff_ptr + i, &v_quant, &v_dequant,
                                       &v_round, &v_zbin, &v_quant_shift);
    v_eob = get_max_lane_eob(iscan + i, v_eob, v_nz);
  }

  *eob_ptr = accumulate_eob256(v_eob);
}
195*fb1b10abSAndroid Build Coastguard Worker 
quantize_b_32x32_16(const tran_low_t * coeff_ptr,tran_low_t * qcoeff_ptr,tran_low_t * dqcoeff_ptr,const int16_t * iscan,__m256i * v_quant,__m256i * v_dequant,__m256i * v_round,__m256i * v_zbin,__m256i * v_quant_shift,__m256i * v_eobmax)196*fb1b10abSAndroid Build Coastguard Worker static VPX_FORCE_INLINE __m256i quantize_b_32x32_16(
197*fb1b10abSAndroid Build Coastguard Worker     const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
198*fb1b10abSAndroid Build Coastguard Worker     tran_low_t *dqcoeff_ptr, const int16_t *iscan, __m256i *v_quant,
199*fb1b10abSAndroid Build Coastguard Worker     __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin,
200*fb1b10abSAndroid Build Coastguard Worker     __m256i *v_quant_shift, __m256i *v_eobmax) {
201*fb1b10abSAndroid Build Coastguard Worker   const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
202*fb1b10abSAndroid Build Coastguard Worker   const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
203*fb1b10abSAndroid Build Coastguard Worker   const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
204*fb1b10abSAndroid Build Coastguard Worker 
205*fb1b10abSAndroid Build Coastguard Worker   if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
206*fb1b10abSAndroid Build Coastguard Worker     _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
207*fb1b10abSAndroid Build Coastguard Worker     _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
208*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
209*fb1b10abSAndroid Build Coastguard Worker     _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
210*fb1b10abSAndroid Build Coastguard Worker     _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
211*fb1b10abSAndroid Build Coastguard Worker #endif
212*fb1b10abSAndroid Build Coastguard Worker     return *v_eobmax;
213*fb1b10abSAndroid Build Coastguard Worker   }
214*fb1b10abSAndroid Build Coastguard Worker   {
215*fb1b10abSAndroid Build Coastguard Worker     // tmp = v_zbin_mask ? (int64_t)abs_coeff + round : 0
216*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_tmp_rnd =
217*fb1b10abSAndroid Build Coastguard Worker         _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
218*fb1b10abSAndroid Build Coastguard Worker     //  tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
219*fb1b10abSAndroid Build Coastguard Worker     //                 quant_shift_ptr[rc != 0]) >> 15);
220*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
221*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
222*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_tmp32_hi =
223*fb1b10abSAndroid Build Coastguard Worker         _mm256_slli_epi16(_mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), 1);
224*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_tmp32_lo =
225*fb1b10abSAndroid Build Coastguard Worker         _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 15);
226*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo);
227*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
228*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_sign_lo =
229*fb1b10abSAndroid Build Coastguard Worker         _mm256_unpacklo_epi16(_mm256_setzero_si256(), v_coeff);
230*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_sign_hi =
231*fb1b10abSAndroid Build Coastguard Worker         _mm256_unpackhi_epi16(_mm256_setzero_si256(), v_coeff);
232*fb1b10abSAndroid Build Coastguard Worker     const __m256i low = _mm256_mullo_epi16(v_tmp32, *v_dequant);
233*fb1b10abSAndroid Build Coastguard Worker     const __m256i high = _mm256_mulhi_epi16(v_tmp32, *v_dequant);
234*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_dqcoeff_lo = _mm256_sign_epi32(
235*fb1b10abSAndroid Build Coastguard Worker         _mm256_srli_epi32(_mm256_unpacklo_epi16(low, high), 1), v_sign_lo);
236*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_dqcoeff_hi = _mm256_sign_epi32(
237*fb1b10abSAndroid Build Coastguard Worker         _mm256_srli_epi32(_mm256_unpackhi_epi16(low, high), 1), v_sign_hi);
238*fb1b10abSAndroid Build Coastguard Worker     const __m256i v_nz_mask =
239*fb1b10abSAndroid Build Coastguard Worker         _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
240*fb1b10abSAndroid Build Coastguard Worker 
241*fb1b10abSAndroid Build Coastguard Worker     store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
242*fb1b10abSAndroid Build Coastguard Worker 
243*fb1b10abSAndroid Build Coastguard Worker #if CONFIG_VP9_HIGHBITDEPTH
244*fb1b10abSAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
245*fb1b10abSAndroid Build Coastguard Worker     _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
246*fb1b10abSAndroid Build Coastguard Worker #else
247*fb1b10abSAndroid Build Coastguard Worker     store_coefficients_avx2(_mm256_packs_epi32(v_dqcoeff_lo, v_dqcoeff_hi),
248*fb1b10abSAndroid Build Coastguard Worker                             dqcoeff_ptr);
249*fb1b10abSAndroid Build Coastguard Worker #endif
250*fb1b10abSAndroid Build Coastguard Worker 
251*fb1b10abSAndroid Build Coastguard Worker     return get_max_lane_eob(iscan, *v_eobmax, v_nz_mask);
252*fb1b10abSAndroid Build Coastguard Worker   }
253*fb1b10abSAndroid Build Coastguard Worker }
254*fb1b10abSAndroid Build Coastguard Worker 
// AVX2 quantizer for a 32x32 block (always 32 * 32 coefficients).
//
// Quantizes |coeff_ptr| into |qcoeff_ptr|, writes the dequantized values to
// |dqcoeff_ptr|, and stores in |*eob_ptr| the largest scan_order->iscan value
// among the positions whose quantized magnitude is non-zero (0 if there is
// none). The zbin and round tables are loaded with a log scale of 1, i.e.
// halved with rounding.
void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr,
                               const struct macroblock_plane *const mb_plane,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
                               const struct ScanOrder *const scan_order) {
  __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
  __m256i v_eob = _mm256_setzero_si256();
  const int16_t *const iscan = scan_order->iscan;
  intptr_t i;

  load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr,
                     &v_dequant, &v_quant_shift, 1);

  // First group: lane 0 uses the first table entry (DC), the rest use AC.
  v_eob = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
                              &v_quant, &v_dequant, &v_round, &v_zbin,
                              &v_quant_shift, &v_eob);

  // Replicate the upper 64 bits of each 128-bit half so that every lane now
  // carries the AC parameters for the remaining groups.
  v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
  v_round = _mm256_unpackhi_epi64(v_round, v_round);
  v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
  v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
  v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);

  for (i = 16; i < 32 * 32; i += 16) {
    v_eob = quantize_b_32x32_16(coeff_ptr + i, qcoeff_ptr + i, dqcoeff_ptr + i,
                                iscan + i, &v_quant, &v_dequant, &v_round,
                                &v_zbin, &v_quant_shift, &v_eob);
  }

  *eob_ptr = accumulate_eob256(v_eob);
}
291