1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <smmintrin.h>
13 #include <stdint.h>
14
15 #include "config/av1_rtcd.h"
16
17 #include "aom_dsp/aom_dsp_common.h"
18 #include "aom_dsp/x86/synonyms.h"
19
// Coefficient quantization, phase 1.
//
// Works on four 32-bit coefficients held in *coeff:
//   - *sign receives -1 in every lane whose coefficient is negative and +1
//     in every other lane;
//   - *coeff is overwritten with the absolute values;
//   - the rounding offsets in param[0] are added and the sums are
//     zero-extended to 64 bits: lanes 0/1 go to qcoeff[0], lanes 2/3 go to
//     qcoeff[1];
//   - only qcoeff[0] is finished here: it is multiplied by param[1] and
//     shifted right by `shift`, and dquan[0] is that result multiplied by
//     param[2] and shifted right by `scale`. qcoeff[1] is left rounded but
//     not yet multiplied;
//   - qcoeff[2] receives an all-ones mask in every lane where
//     (abs(coeff) << (1 + scale)) is below the threshold in param[3].
//
// param[0] : rounding offsets, four 32-bit lanes
// param[1] : quantizer multipliers, low 32 bits of each 64-bit lane
// param[2] : dequantizer multipliers, low 32 bits of each 64-bit lane
// param[3] : thresholds, four 32-bit lanes
static inline void quantize_coeff_phase1(__m128i *coeff, const __m128i *param,
                                         const int shift, const int scale,
                                         __m128i *qcoeff, __m128i *dquan,
                                         __m128i *sign) {
  const __m128i zeros = _mm_setzero_si128();
  const __m128i ones = _mm_set1_epi32(1);
  const __m128i input = *coeff;

  // (-1 | 1) == -1 and (0 | 1) == 1, so no lane of the sign vector is zero.
  *sign = _mm_or_si128(_mm_cmplt_epi32(input, zeros), ones);

  const __m128i magnitude = _mm_abs_epi32(input);
  *coeff = magnitude;

  // Add the rounding offset, then widen to 64-bit lanes so the 32x32
  // multiplies below keep their full products.
  const __m128i rounded = _mm_add_epi32(magnitude, param[0]);
  const __m128i rounded_lo = _mm_unpacklo_epi32(rounded, zeros);
  qcoeff[1] = _mm_unpackhi_epi32(rounded, zeros);

  const __m128i quant_lo =
      _mm_srli_epi64(_mm_mul_epi32(rounded_lo, param[1]), shift);
  qcoeff[0] = quant_lo;
  dquan[0] = _mm_srli_epi64(_mm_mul_epi32(quant_lo, param[2]), scale);

  // Lanes flagged here are the ones below the param[3] threshold.
  const __m128i scaled_magnitude = _mm_slli_epi32(magnitude, 1 + scale);
  qcoeff[2] = _mm_cmplt_epi32(scaled_magnitude, param[3]);
}
44
45 // Coefficient quantization phase 2
quantize_coeff_phase2(__m128i * qcoeff,__m128i * dquan,const __m128i * sign,const __m128i * param,const int shift,const int scale,tran_low_t * qAddr,tran_low_t * dqAddr)46 static inline void quantize_coeff_phase2(__m128i *qcoeff, __m128i *dquan,
47 const __m128i *sign,
48 const __m128i *param, const int shift,
49 const int scale, tran_low_t *qAddr,
50 tran_low_t *dqAddr) {
51 __m128i mask0L = _mm_set_epi32(-1, -1, 0, 0);
52 __m128i mask0H = _mm_set_epi32(0, 0, -1, -1);
53
54 qcoeff[1] = _mm_mul_epi32(qcoeff[1], param[1]);
55 qcoeff[1] = _mm_srli_epi64(qcoeff[1], shift);
56 dquan[1] = _mm_mul_epi32(qcoeff[1], param[2]);
57 dquan[1] = _mm_srli_epi64(dquan[1], scale);
58
59 // combine L&H
60 qcoeff[0] = _mm_shuffle_epi32(qcoeff[0], 0xd8);
61 qcoeff[1] = _mm_shuffle_epi32(qcoeff[1], 0x8d);
62
63 qcoeff[0] = _mm_and_si128(qcoeff[0], mask0H);
64 qcoeff[1] = _mm_and_si128(qcoeff[1], mask0L);
65
66 dquan[0] = _mm_shuffle_epi32(dquan[0], 0xd8);
67 dquan[1] = _mm_shuffle_epi32(dquan[1], 0x8d);
68
69 dquan[0] = _mm_and_si128(dquan[0], mask0H);
70 dquan[1] = _mm_and_si128(dquan[1], mask0L);
71
72 qcoeff[0] = _mm_or_si128(qcoeff[0], qcoeff[1]);
73 dquan[0] = _mm_or_si128(dquan[0], dquan[1]);
74
75 qcoeff[0] = _mm_sign_epi32(qcoeff[0], *sign);
76 dquan[0] = _mm_sign_epi32(dquan[0], *sign);
77 qcoeff[0] = _mm_andnot_si128(qcoeff[2], qcoeff[0]);
78 dquan[0] = _mm_andnot_si128(qcoeff[2], dquan[0]);
79 _mm_storeu_si128((__m128i *)qAddr, qcoeff[0]);
80 _mm_storeu_si128((__m128i *)dqAddr, dquan[0]);
81 }
82
// Folds eight quantized coefficients into the running end-of-block maxima.
//
// Reads eight 32-bit values from qcoeff_ptr and eight 16-bit values from
// iscan (both unaligned). For every non-zero coefficient the candidate is
// iscan[i] + 1; for zero coefficients it is 0. *eob is updated lane-wise with
// the signed 16-bit maximum of its previous contents and the candidates.
static inline void find_eob(tran_low_t *qcoeff_ptr, const int16_t *iscan,
                            __m128i *eob) {
  const __m128i zeros = _mm_setzero_si128();
  const __m128i coeffs_lo = _mm_loadu_si128((__m128i const *)qcoeff_ptr);
  const __m128i coeffs_hi = _mm_loadu_si128((__m128i const *)(qcoeff_ptr + 4));

  // Comparing with zero twice inverts the "is zero" mask, giving all-ones in
  // exactly the non-zero lanes.
  const __m128i nonzero_lo =
      _mm_cmpeq_epi32(_mm_cmpeq_epi32(coeffs_lo, zeros), zeros);
  const __m128i nonzero_hi =
      _mm_cmpeq_epi32(_mm_cmpeq_epi32(coeffs_hi, zeros), zeros);

  // Narrow the eight 32-bit masks to eight 16-bit masks (0 or -1 each).
  const __m128i nonzero16 = _mm_packs_epi32(nonzero_lo, nonzero_hi);

  // Subtracting -1 adds one to the scan index of each non-zero lane; the AND
  // then zeroes the lanes whose coefficient is zero.
  const __m128i scan_index = _mm_loadu_si128((__m128i const *)iscan);
  const __m128i candidate =
      _mm_and_si128(_mm_sub_epi16(scan_index, nonzero16), nonzero16);

  *eob = _mm_max_epi16(*eob, candidate);
}
101
// Reduces the eight signed 16-bit lanes of *eob to their maximum and returns
// it. *eob is overwritten with the partially reduced vector, whose lane 0
// holds the returned value.
static inline uint16_t get_accumulated_eob(__m128i *eob) {
  __m128i acc = *eob;

  // Fold the upper 64 bits onto the lower 64 bits (lanes 4..7 onto 0..3).
  acc = _mm_max_epi16(acc, _mm_shuffle_epi32(acc, 0xe));
  // Fold lanes 2/3 onto lanes 0/1.
  acc = _mm_max_epi16(acc, _mm_shufflelo_epi16(acc, 0xe));
  // Fold lane 1 onto lane 0.
  acc = _mm_max_epi16(acc, _mm_shufflelo_epi16(acc, 0x1));

  *eob = acc;
  return (uint16_t)_mm_extract_epi16(acc, 0);
}
114
// SSE4.1 high-bitdepth "fp" quantizer.
//
// For every input coefficient c, with r/q/d taken from index 0 of round_ptr /
// quant_ptr / dequant_ptr for the first coefficient and from index 1 for all
// others, the helpers above compute
//   abs_q  = ((|c| + ROUND_POWER_OF_TWO(r, log_scale)) * q) >> (16 - log_scale)
//   abs_dq = (abs_q * d) >> log_scale
// give both the sign of c, and force both to zero when
//   (|c| << (1 + log_scale)) < d.
// The results are written to qcoeff_ptr and dqcoeff_ptr respectively.
//
// *eob_ptr is set to the largest (iscan[i] + 1) over all positions i whose
// quantized value is non-zero, or to 0 if there is none.
//
// Coefficients are always processed in whole groups of eight, and the first
// group is processed unconditionally, so coeff_ptr, qcoeff_ptr, dqcoeff_ptr
// and iscan are accessed for `count` rounded up to a multiple of 8 (at least
// 8) elements. Callers presumably pass a positive multiple of 8 -- TODO
// confirm against call sites.
//
// zbin_ptr, quant_shift_ptr and scan are unused.
void av1_highbd_quantize_fp_sse4_1(
    const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, int log_scale) {
  __m128i coeff[2], qcoeff[3], dequant[2], qparam[4], coeff_sign;
  __m128i eob = _mm_setzero_si128();
  const tran_low_t *src = coeff_ptr;
  tran_low_t *quanAddr = qcoeff_ptr;
  tran_low_t *dquanAddr = dqcoeff_ptr;
  const int shift = 16 - log_scale;
  const int coeff_stride = 4;
  const int quan_stride = coeff_stride;
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;

  // The output buffers are not cleared up front: every element in
  // [0, count) is overwritten by the stores in quantize_coeff_phase2() below,
  // so a preliminary zero fill would be redundant work.

  const int round1 = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
  const int round0 = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);

  // Parameters for the first group: lane 0 uses the index-0 constants, lanes
  // 1..3 use the index-1 constants. qparam[1]/qparam[2] are laid out as two
  // 64-bit lanes because phase 1 only multiplies coefficients 0 and 1 with
  // them.
  qparam[0] = _mm_set_epi32(round1, round1, round1, round0);
  qparam[1] = _mm_set_epi64x((uint32_t)quant_ptr[1], (uint32_t)quant_ptr[0]);
  qparam[2] =
      _mm_set_epi64x((uint32_t)dequant_ptr[1], (uint32_t)dequant_ptr[0]);
  qparam[3] = _mm_set_epi32(dequant_ptr[1], dequant_ptr[1], dequant_ptr[1],
                            dequant_ptr[0]);

  // DC and first 3 AC
  coeff[0] = _mm_loadu_si128((__m128i const *)src);
  quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
                        &coeff_sign);

  // From here on every lane uses the index-1 (AC) constants. The upper half
  // of qparam[0] already holds round1 in both lanes, so duplicating it yields
  // round1 in all four lanes.
  qparam[0] = _mm_unpackhi_epi64(qparam[0], qparam[0]);
  qparam[1] = _mm_set1_epi64x((uint32_t)quant_ptr[1]);
  qparam[2] = _mm_set1_epi64x((uint32_t)dequant_ptr[1]);
  qparam[3] = _mm_set1_epi32(dequant_ptr[1]);
  quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
                        quanAddr, dquanAddr);

  // next 4 AC
  coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));
  quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
                        &coeff_sign);
  quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift, log_scale,
                        quanAddr + quan_stride, dquanAddr + quan_stride);

  // Track the end-of-block position over the eight values just written.
  find_eob(quanAddr, iscan, &eob);

  count -= 8;

  // loop for the rest of AC, eight coefficients per iteration
  while (count > 0) {
    src += coeff_stride << 1;
    quanAddr += quan_stride << 1;
    dquanAddr += quan_stride << 1;
    iscan += quan_stride << 1;

    coeff[0] = _mm_loadu_si128((__m128i const *)src);
    coeff[1] = _mm_loadu_si128((__m128i const *)(src + coeff_stride));

    quantize_coeff_phase1(&coeff[0], qparam, shift, log_scale, qcoeff, dequant,
                          &coeff_sign);
    quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
                          log_scale, quanAddr, dquanAddr);

    quantize_coeff_phase1(&coeff[1], qparam, shift, log_scale, qcoeff, dequant,
                          &coeff_sign);
    quantize_coeff_phase2(qcoeff, dequant, &coeff_sign, qparam, shift,
                          log_scale, quanAddr + quan_stride,
                          dquanAddr + quan_stride);

    find_eob(quanAddr, iscan, &eob);

    count -= 8;
  }

  // Collapse the eight per-lane maxima into the final end-of-block value.
  *eob_ptr = get_accumulated_eob(&eob);
}
197