1 /*
2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "vpx_mem/vpx_mem.h"
12 #include "vpx_ports/asmdefs_mmi.h"
13 #include "vp8/encoder/onyx_int.h"
14 #include "vp8/encoder/quantize.h"
15 #include "vp8/common/quant_common.h"
16
/*
 * Quantize a single coefficient for the regular quantizer.
 *
 *   i  - value assigned to `eob` when the coefficient quantizes to non-zero.
 *   rc - index used for every per-coefficient array access below; it is
 *        evaluated several times, so pass a side-effect-free expression.
 *
 * The macro is not self-contained: the expansion site must provide the
 * locals x, y, z, sz, zbin, eob, the pointers coeff_ptr, zbin_ptr,
 * zbin_boost_ptr, round_ptr, quant_ptr, quant_shift_ptr, qcoeff_ptr,
 * dqcoeff_ptr, dequant_ptr, the value zbin_oq_value and a pointer `b` with a
 * zrun_zbin_boost member.
 *
 * Steps:
 *   - sz = z >> 31 is used as a sign mask and x = (z ^ sz) - sz as the
 *     magnitude (assumes z is a 32-bit int with arithmetic right shift).
 *   - zbin_boost_ptr is advanced on every expansion and rewound to
 *     b->zrun_zbin_boost whenever a non-zero quantized value is produced.
 *   - qcoeff_ptr[rc] / dqcoeff_ptr[rc] are only written for non-zero results.
 *
 * Wrapped in do { } while (0) so the expansion is exactly one statement and
 * stays correct as the body of an unbraced if/else.
 */
#define REGULAR_SELECT_EOB(i, rc)                                          \
  do {                                                                     \
    z = coeff_ptr[rc];                                                     \
    sz = (z >> 31);                                                        \
    x = (z ^ sz) - sz;                                                     \
    zbin = zbin_ptr[rc] + *(zbin_boost_ptr++) + zbin_oq_value;             \
    if (x >= zbin) {                                                       \
      x += round_ptr[rc];                                                  \
      y = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16; \
      if (y) {                                                             \
        x = (y ^ sz) - sz;                                                 \
        qcoeff_ptr[rc] = x;                                                \
        dqcoeff_ptr[rc] = x * dequant_ptr[rc];                             \
        eob = (i);                                                         \
        zbin_boost_ptr = b->zrun_zbin_boost;                               \
      }                                                                    \
    }                                                                      \
  } while (0)
33
/*
 * Fast quantizer for one block of 16 coefficients, implemented with Loongson
 * MMI inline assembly.
 *
 * Inputs:  b->coeff, b->round, b->quant_fast, d->dequant and the
 *          vp8_default_inv_zig_zag table (16 int16_t entries each are read).
 * Outputs: 16 int16_t values each in d->qcoeff and d->dqcoeff, and the
 *          end-of-block value stored through d->eob.
 *
 * The block is processed as two passes of eight 16-bit lanes (byte offsets
 * 0x00-0x0f, then 0x10-0x1f). Per lane, going by the MMI mnemonics:
 *   sign    = coeff >> 15 (arithmetic)
 *   abs     = (coeff ^ sign) - sign
 *   y       = high 16 bits of the unsigned product (abs + round) * quant_fast
 *   qcoeff  = (y ^ sign) - sign
 *   dqcoeff = low 16 bits of qcoeff * dequant
 * Lanes with y != 0 contribute their vp8_default_inv_zig_zag entry to a
 * running signed maximum; the largest such entry (0 if every y is zero)
 * becomes the eob value.
 *
 * All loads/stores use the gsldlc1/gsldrc1 and gssdlc1/gssdrc1 pairs, i.e.
 * the unaligned 64-bit access idiom, so no alignment is required of the
 * buffers by this routine itself.
 */
void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
  const int16_t *coeff_ptr = b->coeff;
  const int16_t *round_ptr = b->round;
  const int16_t *quant_ptr = b->quant_fast;
  int16_t *qcoeff_ptr = d->qcoeff;
  int16_t *dqcoeff_ptr = d->dqcoeff;
  const int16_t *dequant_ptr = d->dequant;
  const int16_t *inv_zig_zag = vp8_default_inv_zig_zag;

  double ftmp[12]; /* scratch FP/MMI registers ftmp0..ftmp11 */
  uint64_t tmp[1]; /* scratch GPR used to build immediates */
  int64_t eob = 0; /* the asm stores the full 64-bit result here */
  double ones;     /* all-ones mask, built inside the asm */

  __asm__ volatile(
      // loop 0 ~ 7
      /* ftmp0 = 0, ones = ~0, ftmp9 = 15 (shift count for the sign mask);
       * ftmp1/ftmp2 = coeff[0..3] / coeff[4..7]. */
      "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
      "pcmpeqh %[ones], %[ones], %[ones] \n\t"
      "gsldlc1 %[ftmp1], 0x07(%[coeff_ptr]) \n\t"
      "gsldrc1 %[ftmp1], 0x00(%[coeff_ptr]) \n\t"
      "dli %[tmp0], 0x0f \n\t"
      "dmtc1 %[tmp0], %[ftmp9] \n\t"
      "gsldlc1 %[ftmp2], 0x0f(%[coeff_ptr]) \n\t"
      "gsldrc1 %[ftmp2], 0x08(%[coeff_ptr]) \n\t"

      /* ftmp3/ftmp4 = sign masks; ftmp1/ftmp2 = |coeff|. */
      "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t"
      "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t"
      "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
      "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t"
      "pxor %[ftmp2], %[ftmp4], %[ftmp2] \n\t"
      "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"

      /* ftmp5/ftmp6 = y = ((|coeff| + round) * quant_fast) >> 16. */
      "gsldlc1 %[ftmp5], 0x07(%[round_ptr]) \n\t"
      "gsldrc1 %[ftmp5], 0x00(%[round_ptr]) \n\t"
      "gsldlc1 %[ftmp6], 0x0f(%[round_ptr]) \n\t"
      "gsldrc1 %[ftmp6], 0x08(%[round_ptr]) \n\t"
      "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
      "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
      "gsldlc1 %[ftmp7], 0x07(%[quant_ptr]) \n\t"
      "gsldrc1 %[ftmp7], 0x00(%[quant_ptr]) \n\t"
      "gsldlc1 %[ftmp8], 0x0f(%[quant_ptr]) \n\t"
      "gsldrc1 %[ftmp8], 0x08(%[quant_ptr]) \n\t"
      "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
      "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"

      /* ftmp7/ftmp8 = y with the original sign restored -> qcoeff[0..7]. */
      "pxor %[ftmp7], %[ftmp5], %[ftmp3] \n\t"
      "pxor %[ftmp8], %[ftmp6], %[ftmp4] \n\t"
      "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
      "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
      "gssdlc1 %[ftmp7], 0x07(%[qcoeff_ptr]) \n\t"
      "gssdrc1 %[ftmp7], 0x00(%[qcoeff_ptr]) \n\t"
      "gssdlc1 %[ftmp8], 0x0f(%[qcoeff_ptr]) \n\t"
      "gssdrc1 %[ftmp8], 0x08(%[qcoeff_ptr]) \n\t"

      /* Build a (y != 0) mask, keep the inv_zig_zag entries of those lanes
       * and take the lane-wise maximum of both halves into ftmp10. */
      "gsldlc1 %[ftmp1], 0x07(%[inv_zig_zag]) \n\t"
      "gsldrc1 %[ftmp1], 0x00(%[inv_zig_zag]) \n\t"
      "gsldlc1 %[ftmp2], 0x0f(%[inv_zig_zag]) \n\t"
      "gsldrc1 %[ftmp2], 0x08(%[inv_zig_zag]) \n\t"
      "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
      "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
      "pxor %[ftmp5], %[ftmp5], %[ones] \n\t"
      "pxor %[ftmp6], %[ftmp6], %[ones] \n\t"
      "pand %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
      "pand %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
      "pmaxsh %[ftmp10], %[ftmp5], %[ftmp6] \n\t"

      /* dqcoeff[0..7] = low 16 bits of qcoeff * dequant. */
      "gsldlc1 %[ftmp5], 0x07(%[dequant_ptr]) \n\t"
      "gsldrc1 %[ftmp5], 0x00(%[dequant_ptr]) \n\t"
      "gsldlc1 %[ftmp6], 0x0f(%[dequant_ptr]) \n\t"
      "gsldrc1 %[ftmp6], 0x08(%[dequant_ptr]) \n\t"
      "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
      "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
      "gssdlc1 %[ftmp5], 0x07(%[dqcoeff_ptr]) \n\t"
      "gssdrc1 %[ftmp5], 0x00(%[dqcoeff_ptr]) \n\t"
      "gssdlc1 %[ftmp6], 0x0f(%[dqcoeff_ptr]) \n\t"
      "gssdrc1 %[ftmp6], 0x08(%[dqcoeff_ptr]) \n\t"

      // loop 8 ~ 15
      /* Same sequence for byte offsets 0x10-0x1f; ftmp9 still holds 15 and
       * the per-pass maximum goes to ftmp11. */
      "gsldlc1 %[ftmp1], 0x17(%[coeff_ptr]) \n\t"
      "gsldrc1 %[ftmp1], 0x10(%[coeff_ptr]) \n\t"
      "gsldlc1 %[ftmp2], 0x1f(%[coeff_ptr]) \n\t"
      "gsldrc1 %[ftmp2], 0x18(%[coeff_ptr]) \n\t"

      "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t"
      "pxor %[ftmp1], %[ftmp3], %[ftmp1] \n\t"
      "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
      "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t"
      "pxor %[ftmp2], %[ftmp4], %[ftmp2] \n\t"
      "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t"

      "gsldlc1 %[ftmp5], 0x17(%[round_ptr]) \n\t"
      "gsldrc1 %[ftmp5], 0x10(%[round_ptr]) \n\t"
      "gsldlc1 %[ftmp6], 0x1f(%[round_ptr]) \n\t"
      "gsldrc1 %[ftmp6], 0x18(%[round_ptr]) \n\t"
      "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
      "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
      "gsldlc1 %[ftmp7], 0x17(%[quant_ptr]) \n\t"
      "gsldrc1 %[ftmp7], 0x10(%[quant_ptr]) \n\t"
      "gsldlc1 %[ftmp8], 0x1f(%[quant_ptr]) \n\t"
      "gsldrc1 %[ftmp8], 0x18(%[quant_ptr]) \n\t"
      "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
      "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"

      "pxor %[ftmp7], %[ftmp5], %[ftmp3] \n\t"
      "pxor %[ftmp8], %[ftmp6], %[ftmp4] \n\t"
      "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t"
      "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t"
      "gssdlc1 %[ftmp7], 0x17(%[qcoeff_ptr]) \n\t"
      "gssdrc1 %[ftmp7], 0x10(%[qcoeff_ptr]) \n\t"
      "gssdlc1 %[ftmp8], 0x1f(%[qcoeff_ptr]) \n\t"
      "gssdrc1 %[ftmp8], 0x18(%[qcoeff_ptr]) \n\t"

      "gsldlc1 %[ftmp1], 0x17(%[inv_zig_zag]) \n\t"
      "gsldrc1 %[ftmp1], 0x10(%[inv_zig_zag]) \n\t"
      "gsldlc1 %[ftmp2], 0x1f(%[inv_zig_zag]) \n\t"
      "gsldrc1 %[ftmp2], 0x18(%[inv_zig_zag]) \n\t"
      "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
      "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t"
      "pxor %[ftmp5], %[ftmp5], %[ones] \n\t"
      "pxor %[ftmp6], %[ftmp6], %[ones] \n\t"
      "pand %[ftmp5], %[ftmp5], %[ftmp1] \n\t"
      "pand %[ftmp6], %[ftmp6], %[ftmp2] \n\t"
      "pmaxsh %[ftmp11], %[ftmp5], %[ftmp6] \n\t"

      "gsldlc1 %[ftmp5], 0x17(%[dequant_ptr]) \n\t"
      "gsldrc1 %[ftmp5], 0x10(%[dequant_ptr]) \n\t"
      "gsldlc1 %[ftmp6], 0x1f(%[dequant_ptr]) \n\t"
      "gsldrc1 %[ftmp6], 0x18(%[dequant_ptr]) \n\t"
      "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t"
      "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t"
      "gssdlc1 %[ftmp5], 0x17(%[dqcoeff_ptr]) \n\t"
      "gssdrc1 %[ftmp5], 0x10(%[dqcoeff_ptr]) \n\t"
      "gssdlc1 %[ftmp6], 0x1f(%[dqcoeff_ptr]) \n\t"
      "gssdrc1 %[ftmp6], 0x18(%[dqcoeff_ptr]) \n\t"

      /* Horizontal reduction: merge both passes, then fold the four halfword
       * lanes of ftmp10 into lane 0 (word shift by 16, then a 0xaa shuffle),
       * mask everything but the low 16 bits and store 8 bytes to eob. */
      "dli %[tmp0], 0x10 \n\t"
      "dmtc1 %[tmp0], %[ftmp9] \n\t"

      "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
      "psrlw %[ftmp11], %[ftmp10], %[ftmp9] \n\t"
      "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
      "dli %[tmp0], 0xaa \n\t"
      "dmtc1 %[tmp0], %[ftmp9] \n\t"
      "pshufh %[ftmp11], %[ftmp10], %[ftmp9] \n\t"
      "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t"
      "dli %[tmp0], 0xffff \n\t"
      "dmtc1 %[tmp0], %[ftmp9] \n\t"
      "pand %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
      "gssdlc1 %[ftmp10], 0x07(%[eob]) \n\t"
      "gssdrc1 %[ftmp10], 0x00(%[eob]) \n\t"
      : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]),
        [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]),
        [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]),
        [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]),
        [ftmp11] "=&f"(ftmp[11]), [tmp0] "=&r"(tmp[0]), [ones] "=&f"(ones)
      : [coeff_ptr] "r"((mips_reg)coeff_ptr),
        [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr),
        [dequant_ptr] "r"((mips_reg)dequant_ptr),
        [round_ptr] "r"((mips_reg)round_ptr),
        [quant_ptr] "r"((mips_reg)quant_ptr),
        [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr),
        [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob)
      /* The buffers are accessed through plain register operands, so the
       * "memory" clobber is what tells the compiler they are read/written. */
      : "memory");

  /* Explicit narrowing, matching vp8_regular_quantize_b_mmi. */
  *d->eob = (char)eob;
}
201
/*
 * Regular quantizer for one block of 16 coefficients.
 *
 * Clears the 32 bytes at d->qcoeff and at d->dqcoeff with MMI stores, then
 * expands REGULAR_SELECT_EOB once per coefficient in the fixed order listed
 * below and finally stores the resulting eob through d->eob.
 *
 * Inputs read: b->coeff, b->zbin, b->round, b->quant, b->quant_shift,
 * b->zrun_zbin_boost, b->zbin_extra and d->dequant.
 */
void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) {
  int eob = 0;
  /* Scratch variables and pointers below are referenced by name from inside
   * REGULAR_SELECT_EOB, so they must keep exactly these names. */
  int x, y, z, sz, zbin;
  const int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
  const int16_t *coeff_ptr = b->coeff;
  const int16_t *zbin_ptr = b->zbin;
  const int16_t *round_ptr = b->round;
  const int16_t *quant_ptr = b->quant;
  const int16_t *quant_shift_ptr = b->quant_shift;
  int16_t *qcoeff_ptr = d->qcoeff;
  int16_t *dqcoeff_ptr = d->dqcoeff;
  const int16_t *dequant_ptr = d->dequant;
  const int16_t zbin_oq_value = b->zbin_extra;
  /* Explicit register variable bound to $f0. Spelled with __asm__ (as the
   * rest of this file does) so it also builds in ISO -std=c99/c11 modes,
   * where the plain `asm` keyword is not available. */
  register double ftmp0 __asm__("$f0");

  /* Zero qcoeff[0..15] and dqcoeff[0..15] (32 bytes each) using four
   * unaligned 64-bit store pairs per buffer. */
  /* clang-format off */
  __asm__ volatile (
    "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
    "gssdlc1 %[ftmp0], 0x07(%[qcoeff_ptr]) \n\t"
    "gssdrc1 %[ftmp0], 0x00(%[qcoeff_ptr]) \n\t"
    "gssdlc1 %[ftmp0], 0x0f(%[qcoeff_ptr]) \n\t"
    "gssdrc1 %[ftmp0], 0x08(%[qcoeff_ptr]) \n\t"
    "gssdlc1 %[ftmp0], 0x17(%[qcoeff_ptr]) \n\t"
    "gssdrc1 %[ftmp0], 0x10(%[qcoeff_ptr]) \n\t"
    "gssdlc1 %[ftmp0], 0x1f(%[qcoeff_ptr]) \n\t"
    "gssdrc1 %[ftmp0], 0x18(%[qcoeff_ptr]) \n\t"

    "gssdlc1 %[ftmp0], 0x07(%[dqcoeff_ptr]) \n\t"
    "gssdrc1 %[ftmp0], 0x00(%[dqcoeff_ptr]) \n\t"
    "gssdlc1 %[ftmp0], 0x0f(%[dqcoeff_ptr]) \n\t"
    "gssdrc1 %[ftmp0], 0x08(%[dqcoeff_ptr]) \n\t"
    "gssdlc1 %[ftmp0], 0x17(%[dqcoeff_ptr]) \n\t"
    "gssdrc1 %[ftmp0], 0x10(%[dqcoeff_ptr]) \n\t"
    "gssdlc1 %[ftmp0], 0x1f(%[dqcoeff_ptr]) \n\t"
    "gssdrc1 %[ftmp0], 0x18(%[dqcoeff_ptr]) \n\t"
    : [ftmp0]"=&f"(ftmp0)
    : [qcoeff_ptr]"r"(qcoeff_ptr), [dqcoeff_ptr]"r"(dqcoeff_ptr)
    : "memory"
  );
  /* clang-format on */

  /* Arguments are (eob value on a non-zero result, coefficient index).
   * The order matters: each expansion advances or rewinds zbin_boost_ptr,
   * and the last non-zero result determines eob. */
  REGULAR_SELECT_EOB(1, 0);
  REGULAR_SELECT_EOB(2, 1);
  REGULAR_SELECT_EOB(3, 4);
  REGULAR_SELECT_EOB(4, 8);
  REGULAR_SELECT_EOB(5, 5);
  REGULAR_SELECT_EOB(6, 2);
  REGULAR_SELECT_EOB(7, 3);
  REGULAR_SELECT_EOB(8, 6);
  REGULAR_SELECT_EOB(9, 9);
  REGULAR_SELECT_EOB(10, 12);
  REGULAR_SELECT_EOB(11, 13);
  REGULAR_SELECT_EOB(12, 10);
  REGULAR_SELECT_EOB(13, 7);
  REGULAR_SELECT_EOB(14, 11);
  REGULAR_SELECT_EOB(15, 14);
  REGULAR_SELECT_EOB(16, 15);

  *d->eob = (char)eob;
}
264