1 /*
2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <tmmintrin.h>
13
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx/vpx_integer.h"
16 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
17 #include "vpx_dsp/x86/quantize_sse2.h"
18 #include "vpx_dsp/x86/quantize_ssse3.h"
19 #include "vp9/common/vp9_scan.h"
20 #include "vp9/encoder/vp9_block.h"
21
// SSSE3 quantizer for a block of n_coeffs transform coefficients.
//
// Coefficients are handled in groups of 16 (two 8-lane vectors). For every
// group:
//   - the magnitudes are compared against the zbin vector; lanes that do not
//     compare greater are forced to zero in the output,
//   - calculate_qcoeff() is applied to the magnitudes with the round, quant
//     and shift vectors,
//   - the sign of the source coefficient is put back and the result is
//     stored to qcoeff_ptr,
//   - calculate_dqcoeff_and_store() writes the matching dqcoeff_ptr entries,
//   - scan_for_eob() is evaluated and the lane-wise maximum is kept.
// The first group is always processed; the remaining ones are visited in
// steps of 16 while the offset is below n_coeffs. On exit *eob_ptr holds
// accumulate_eob() of the running maximum.
//
// The parameter vectors come from load_b_values(). After their first use each
// one is replaced by a copy of its own upper 64 bits in both halves; the
// first 8 coefficients therefore see the vector as loaded and every later
// coefficient sees only the upper-half values (the DC -> AC switch).
void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          const struct macroblock_plane *const mb_plane,
                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const struct ScanOrder *const scan_order) {
  const __m128i zero = _mm_setzero_si128();
  const int16_t *const iscan = scan_order->iscan;
  __m128i zbin, round, quant, dequant, shift;
  __m128i src_lo, src_hi;    // Signed input coefficients.
  __m128i mag_lo, mag_hi;    // Magnitudes, later the quantized values.
  __m128i keep_lo, keep_hi;  // All-ones in lanes that compare above zbin.
  __m128i eob_max;
  int pos;

  load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift);

  // First group: DC plus the first 15 AC coefficients.
  src_lo = load_tran_low(coeff_ptr);
  src_hi = load_tran_low(coeff_ptr + 8);
  mag_lo = _mm_abs_epi16(src_lo);
  mag_hi = _mm_abs_epi16(src_hi);

  keep_lo = _mm_cmpgt_epi16(mag_lo, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC.
  keep_hi = _mm_cmpgt_epi16(mag_hi, zbin);

  // The lower vector must be quantized before round/quant/shift are switched
  // to their AC-only form.
  calculate_qcoeff(&mag_lo, round, quant, shift);
  round = _mm_unpackhi_epi64(round, round);
  quant = _mm_unpackhi_epi64(quant, quant);
  shift = _mm_unpackhi_epi64(shift, shift);
  calculate_qcoeff(&mag_hi, round, quant, shift);

  // Restore the source sign, then clear the lanes rejected by zbin.
  mag_lo = _mm_and_si128(_mm_sign_epi16(mag_lo, src_lo), keep_lo);
  mag_hi = _mm_and_si128(_mm_sign_epi16(mag_hi, src_hi), keep_hi);

  store_tran_low(mag_lo, qcoeff_ptr);
  store_tran_low(mag_hi, qcoeff_ptr + 8);

  // Same ordering constraint as above, this time for dequant.
  calculate_dqcoeff_and_store(mag_lo, dequant, dqcoeff_ptr);
  dequant = _mm_unpackhi_epi64(dequant, dequant);
  calculate_dqcoeff_and_store(mag_hi, dequant, dqcoeff_ptr + 8);

  eob_max = scan_for_eob(&mag_lo, &mag_hi, iscan, 0, zero);

  // Remaining groups use the AC-only parameters throughout.
  for (pos = 16; pos < n_coeffs; pos += 16) {
    const __m128i in_lo = load_tran_low(coeff_ptr + pos);
    const __m128i in_hi = load_tran_low(coeff_ptr + pos + 8);
    __m128i q_lo = _mm_abs_epi16(in_lo);
    __m128i q_hi = _mm_abs_epi16(in_hi);
    const __m128i pass_lo = _mm_cmpgt_epi16(q_lo, zbin);
    const __m128i pass_hi = _mm_cmpgt_epi16(q_hi, zbin);
    __m128i eob_here;

    calculate_qcoeff(&q_lo, round, quant, shift);
    calculate_qcoeff(&q_hi, round, quant, shift);

    q_lo = _mm_and_si128(_mm_sign_epi16(q_lo, in_lo), pass_lo);
    q_hi = _mm_and_si128(_mm_sign_epi16(q_hi, in_hi), pass_hi);

    store_tran_low(q_lo, qcoeff_ptr + pos);
    store_tran_low(q_hi, qcoeff_ptr + pos + 8);

    calculate_dqcoeff_and_store(q_lo, dequant, dqcoeff_ptr + pos);
    calculate_dqcoeff_and_store(q_hi, dequant, dqcoeff_ptr + pos + 8);

    eob_here = scan_for_eob(&q_lo, &q_hi, iscan, pos, zero);
    eob_max = _mm_max_epi16(eob_max, eob_here);
  }

  *eob_ptr = accumulate_eob(eob_max);
}
107
// Writes zeros over 16 consecutive coefficients starting at ptr using aligned
// 128-bit stores, so ptr must be 16-byte aligned.
// NOTE(review): presumably tran_low_t is twice as wide when
// CONFIG_VP9_HIGHBITDEPTH is set, which is why two extra stores are needed to
// cover the same 16 coefficients - confirm against the tran_low_t definition.
static void quantize_32x32_clear_16(tran_low_t *ptr) {
  const __m128i zero = _mm_setzero_si128();

  _mm_store_si128((__m128i *)(ptr), zero);
  _mm_store_si128((__m128i *)(ptr + 8), zero);
#if CONFIG_VP9_HIGHBITDEPTH
  _mm_store_si128((__m128i *)(ptr + 4), zero);
  _mm_store_si128((__m128i *)(ptr + 12), zero);
#endif  // CONFIG_VP9_HIGHBITDEPTH
}

// SSSE3 quantizer for a 32x32 block (32 * 32 coefficients).
//
// Coefficients are handled in groups of 16. For every group the magnitudes
// are compared against the zbin vector:
//   - if no lane compares greater, the 16 qcoeff_ptr and dqcoeff_ptr entries
//     are simply zeroed and the group does not contribute to the end of
//     block,
//   - otherwise the magnitudes go through calculate_qcoeff(), get the source
//     sign back, have the rejected lanes cleared, and are stored to
//     qcoeff_ptr; calculate_dqcoeff_and_store_32x32() writes dqcoeff_ptr and
//     scan_for_eob() feeds the running lane-wise maximum.
// On exit *eob_ptr holds accumulate_eob() of that maximum, which starts out
// as all zeros.
//
// The parameter vectors come from load_b_values32x32(). Each is replaced by a
// copy of its own upper 64 bits in both halves once the first 8 coefficients
// have been handled (the DC -> AC switch); this happens on both paths of the
// first group so the loop always runs with the AC-only values.
void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr,
                                const struct macroblock_plane *const mb_plane,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const struct ScanOrder *const scan_order) {
  const __m128i zero = _mm_setzero_si128();
  const int16_t *const iscan = scan_order->iscan;
  __m128i zbin, round, quant, dequant, shift;
  __m128i src_lo, src_hi;    // Signed input coefficients.
  __m128i mag_lo, mag_hi;    // Magnitudes, later the quantized values.
  __m128i keep_lo, keep_hi;  // All-ones in lanes that compare above zbin.
  __m128i eob_max = zero;
  int pos;

  load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant,
                     &shift);

  // First group: DC plus the first 15 AC coefficients.
  src_lo = load_tran_low(coeff_ptr);
  src_hi = load_tran_low(coeff_ptr + 8);
  mag_lo = _mm_abs_epi16(src_lo);
  mag_hi = _mm_abs_epi16(src_hi);

  keep_lo = _mm_cmpgt_epi16(mag_lo, zbin);
  zbin = _mm_unpackhi_epi64(zbin, zbin);  // Switch DC to AC.
  keep_hi = _mm_cmpgt_epi16(mag_hi, zbin);

  if (_mm_movemask_epi8(_mm_or_si128(keep_lo, keep_hi)) != 0) {
    // The lower vector must be quantized before round/quant/shift are
    // switched to their AC-only form.
    calculate_qcoeff(&mag_lo, round, quant, shift);
    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    calculate_qcoeff(&mag_hi, round, quant, shift);

    // Restore the source sign, then clear the lanes rejected by zbin.
    mag_lo = _mm_and_si128(_mm_sign_epi16(mag_lo, src_lo), keep_lo);
    mag_hi = _mm_and_si128(_mm_sign_epi16(mag_hi, src_hi), keep_hi);

    store_tran_low(mag_lo, qcoeff_ptr);
    store_tran_low(mag_hi, qcoeff_ptr + 8);

    // Same ordering constraint as above, this time for dequant.
    calculate_dqcoeff_and_store_32x32(mag_lo, dequant, zero, dqcoeff_ptr);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
    calculate_dqcoeff_and_store_32x32(mag_hi, dequant, zero, dqcoeff_ptr + 8);

    eob_max = scan_for_eob(&mag_lo, &mag_hi, iscan, 0, zero);
  } else {
    // Nothing in the first group survives zbin: emit zeros and only perform
    // the DC -> AC parameter switch that the loop below relies on.
    quantize_32x32_clear_16(qcoeff_ptr);
    quantize_32x32_clear_16(dqcoeff_ptr);

    round = _mm_unpackhi_epi64(round, round);
    quant = _mm_unpackhi_epi64(quant, quant);
    shift = _mm_unpackhi_epi64(shift, shift);
    dequant = _mm_unpackhi_epi64(dequant, dequant);
  }

  // Remaining groups use the AC-only parameters throughout.
  for (pos = 16; pos < 32 * 32; pos += 16) {
    const __m128i in_lo = load_tran_low(coeff_ptr + pos);
    const __m128i in_hi = load_tran_low(coeff_ptr + pos + 8);
    __m128i q_lo = _mm_abs_epi16(in_lo);
    __m128i q_hi = _mm_abs_epi16(in_hi);
    const __m128i pass_lo = _mm_cmpgt_epi16(q_lo, zbin);
    const __m128i pass_hi = _mm_cmpgt_epi16(q_hi, zbin);

    if (_mm_movemask_epi8(_mm_or_si128(pass_lo, pass_hi)) != 0) {
      __m128i eob_here;

      calculate_qcoeff(&q_lo, round, quant, shift);
      calculate_qcoeff(&q_hi, round, quant, shift);

      q_lo = _mm_and_si128(_mm_sign_epi16(q_lo, in_lo), pass_lo);
      q_hi = _mm_and_si128(_mm_sign_epi16(q_hi, in_hi), pass_hi);

      store_tran_low(q_lo, qcoeff_ptr + pos);
      store_tran_low(q_hi, qcoeff_ptr + pos + 8);

      calculate_dqcoeff_and_store_32x32(q_lo, dequant, zero,
                                        dqcoeff_ptr + pos);
      calculate_dqcoeff_and_store_32x32(q_hi, dequant, zero,
                                        dqcoeff_ptr + pos + 8);

      eob_here = scan_for_eob(&q_lo, &q_hi, iscan, pos, zero);
      eob_max = _mm_max_epi16(eob_max, eob_here);
    } else {
      // Whole group is below zbin: zero the outputs and skip the math.
      quantize_32x32_clear_16(qcoeff_ptr + pos);
      quantize_32x32_clear_16(dqcoeff_ptr + pos);
    }
  }

  *eob_ptr = accumulate_eob(eob_max);
}
229