xref: /aosp_15_r20/external/libaom/av1/encoder/x86/av1_highbd_quantize_sse4.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <smmintrin.h>
13 #include <stdint.h>
14 
15 #include "config/av1_rtcd.h"
16 
17 #include "aom_dsp/aom_dsp_common.h"
18 #include "aom_dsp/x86/synonyms.h"
19 
20 // Coefficient quantization phase 1
21 // param[0-2] : rounding/quan/dequan constants
quantize_coeff_phase1(__m128i * coeff,const __m128i * param,const int shift,const int scale,__m128i * qcoeff,__m128i * dquan,__m128i * sign)22 static inline void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
23                                          const int shift, const int scale,
24                                          __m128i *qcoeff, __m128i *dquan,
25                                          __m128i *sign) {
26   const __m128i zero = _mm_setzero_si128();
27   const __m128i one = _mm_set1_epi32(1);
28 
29   *sign = _mm_cmplt_epi32(*coeff, zero);
30   *sign = _mm_or_si128(*sign, one);
31   *coeff = _mm_abs_epi32(*coeff);
32 
33   qcoeff[0] = _mm_add_epi32(*coeff, param[0]);
34   qcoeff[1] = _mm_unpackhi_epi32(qcoeff[0], zero);
35   qcoeff[0] = _mm_unpacklo_epi32(qcoeff[0], zero);
36 
37   qcoeff[0] = _mm_mul_epi32(qcoeff[0], param[1]);
38   qcoeff[0] = _mm_srli_epi64(qcoeff[0], shift);
39   dquan[0] = _mm_mul_epi32(qcoeff[0], param[2]);
40   dquan[0] = _mm_srli_epi64(dquan[0], scale);
41   const __m128i abs_s = _mm_slli_epi32(*coeff, 1 + scale);
42   qcoeff[2] = _mm_cmplt_epi32(abs_s, param[3]);
43 }
44 
45 // Coefficient quantization phase 2
quantize_coeff_phase2(__m128i * qcoeff,__m128i * dquan,const __m128i * sign,const __m128i * param,const int shift,const int scale,tran_low_t * qAddr,tran_low_t * dqAddr)46 static inline void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
47                                          const __m128i *sign,
48                                          const __m128i *param, const int shift,
49                                          const int scale, tran_low_t *qAddr,
50                                          tran_low_t *dqAddr) {
51   __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
52   __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
53 
54   qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
55   qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
56   dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
57   dquan[1] = _mm_srli_epi64(dquan[1], scale);
58 
59   // combine L&H
60   qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
61   qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
62 
63   qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
64   qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
65 
66   dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
67   dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
68 
69   dquan[0] = _mm_and_si128(dquan[0], mask0H);
70   dquan[1] = _mm_and_si128(dquan[1], mask0L);
71 
72   qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
73   dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
74 
75   qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
76   dquan[0] = _mm_sign_epi32(dquan[0], *sign);
77   qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]);
78   dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]);
79   _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
80   _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
81 }
82 
// Updates the running end-of-block accumulator with eight coefficients.
//
// Reads eight 32-bit values from qcoeff_ptr and eight 16-bit values from
// iscan. For every nonzero coefficient the candidate is iscan[i] + 1; for a
// zero coefficient it is 0. *eob is replaced by the per-lane signed 16-bit
// maximum of its previous contents and these candidates.
static inline void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
                            __m128i *eob) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i coeffs_lo = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
  const __m128i coeffs_hi = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));

  // Narrow the two 32-bit "is zero" masks to a single 8 x 16-bit mask.
  // Signed saturation keeps all-ones as all-ones and zero as zero.
  const __m128i is_zero = _mm_packs_epi32(_mm_cmpeq_epi32(coeffs_lo, zero),
                                          _mm_cmpeq_epi32(coeffs_hi, zero));

  const __m128i scan_pos = _mm_loadu_si128((__m128i const *)iscan);
  const __m128i scan_pos_plus1 = _mm_add_epi16(scan_pos, _mm_set1_epi16(1));

  // Keep iscan + 1 only in lanes whose coefficient is nonzero.
  const __m128i candidate = _mm_andnot_si128(is_zero, scan_pos_plus1);
  *eob = _mm_max_epi16(*eob, candidate);
}
101 
// Reduces the eight signed 16-bit lanes of *eob to their maximum and returns
// it. *eob is overwritten with the intermediate reduction state; its lane 0
// holds the returned value.
static inline uint16_t get_accumulated_eob(__m128i *eob) {
  __m128i acc = *eob;

  // 8 lanes -> 4: fold the upper 64 bits onto the lower 64 bits.
  acc = _mm_max_epi16(acc, _mm_shuffle_epi32(acc, 0xe));
  // 4 lanes -> 2: fold lanes 2,3 onto lanes 0,1.
  acc = _mm_max_epi16(acc, _mm_shufflelo_epi16(acc, 0xe));
  // 2 lanes -> 1: fold lane 1 onto lane 0.
  acc = _mm_max_epi16(acc, _mm_shufflelo_epi16(acc, 0x1));

  *eob = acc;
  return (uint16_t)_mm_extract_epi16(acc, 0);
}
114 
// High bit-depth "fp" quantizer, SSE4.1 implementation.
//
// Quantizes `count` coefficients from coeff_ptr into qcoeff_ptr, writes the
// matching dequantized values to dqcoeff_ptr, and stores the end-of-block
// value accumulated by find_eob()/get_accumulated_eob() to *eob_ptr.
//
// Element 0 of round_ptr/quant_ptr/dequant_ptr is applied to the first
// coefficient only; element 1 is applied to every other coefficient.
// Rounding offsets are pre-scaled with ROUND_POWER_OF_TWO(., log_scale) and
// the quantizer product is shifted right by 16 - log_scale.
//
// zbin_ptr, quant_shift_ptr and scan are unused.
//
// Coefficients are handled in groups of eight and the first group is
// processed unconditionally, so callers are expected to pass a count that is
// a positive multiple of 8. NOTE(review): this is not checked here.
void av1_highbd_quantize_fp_sse4_1(
    const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, int log_scale) {
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;

  enum { LANES_PER_VEC = 4, COEFFS_PER_GROUP = 2 * LANES_PER_VEC };

  const int shift = 16 - log_scale;
  __m128i in[2], q[3], dq[2], consts[4], sgn;
  __m128i eob_lanes = _mm_setzero_si128();

  // Cursors that advance one group of eight at a time.
  const tran_low_t *in_pos = coeff_ptr;
  tran_low_t *q_pos = qcoeff_ptr;
  tran_low_t *dq_pos = dqcoeff_ptr;
  const int16_t *iscan_pos = iscan;

  memset(q_pos, 0, count * sizeof(q_pos[0]));
  memset(dq_pos, 0, count * sizeof(dq_pos[0]));

  in[0] = _mm_loadu_si128((__m128i const *)in_pos);

  const int round_ac = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
  const int round_dc = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);

  // Constants for the first vector: lane 0 takes the [0] entries, the
  // remaining lanes take the [1] entries.
  consts[0] = _mm_set_epi32(round_ac, round_ac, round_ac, round_dc);
  consts[1] = _mm_set_epi64x((uint32_t)quant_ptr[1], (uint32_t)quant_ptr[0]);
  consts[2] =
      _mm_set_epi64x((uint32_t)dequant_ptr[1], (uint32_t)dequant_ptr[0]);
  consts[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1],
                            dequant_ptr[0]);

  // First coefficient plus the following three.
  quantize_coeff_phase1(&in[0], consts, shift, log_scale, q, dq, &sgn);

  // From here on every lane uses the [1] entries.
  consts[0] = _mm_unpackhi_epi64(consts[0], consts[0]);
  consts[1] = _mm_set1_epi64x((uint32_t)quant_ptr[1]);
  consts[2] = _mm_set1_epi64x((uint32_t)dequant_ptr[1]);
  consts[3] = _mm_set1_epi32(dequant_ptr[1]);
  quantize_coeff_phase2(q, dq, &sgn, consts, shift, log_scale, q_pos, dq_pos);

  // Second half of the first group.
  in[1] = _mm_loadu_si128((__m128i const *)(in_pos + LANES_PER_VEC));
  quantize_coeff_phase1(&in[1], consts, shift, log_scale, q, dq, &sgn);
  quantize_coeff_phase2(q, dq, &sgn, consts, shift, log_scale,
                        q_pos + LANES_PER_VEC, dq_pos + LANES_PER_VEC);

  find_eob(q_pos, iscan_pos, &eob_lanes);

  // Remaining groups of eight.
  for (count -= COEFFS_PER_GROUP; count > 0; count -= COEFFS_PER_GROUP) {
    in_pos += COEFFS_PER_GROUP;
    q_pos += COEFFS_PER_GROUP;
    dq_pos += COEFFS_PER_GROUP;
    iscan_pos += COEFFS_PER_GROUP;

    in[0] = _mm_loadu_si128((__m128i const *)in_pos);
    in[1] = _mm_loadu_si128((__m128i const *)(in_pos + LANES_PER_VEC));

    quantize_coeff_phase1(&in[0], consts, shift, log_scale, q, dq, &sgn);
    quantize_coeff_phase2(q, dq, &sgn, consts, shift, log_scale, q_pos,
                          dq_pos);

    quantize_coeff_phase1(&in[1], consts, shift, log_scale, q, dq, &sgn);
    quantize_coeff_phase2(q, dq, &sgn, consts, shift, log_scale,
                          q_pos + LANES_PER_VEC, dq_pos + LANES_PER_VEC);

    find_eob(q_pos, iscan_pos, &eob_lanes);
  }

  *eob_ptr = get_accumulated_eob(&eob_lanes);
}
197