1 /*
2 * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vp8_rtcd.h"
12 #include "vpx_ports/mem.h"
13 #include "vpx_ports/asmdefs_mmi.h"
14
15 /* clang-format off */
16 /* TRANSPOSE_4H: transpose a 4x4 matrix of halfwords held one row per register.
17 Input: ftmp1,ftmp2,ftmp3,ftmp4 (rows before the transpose)
18 Output: ftmp1,ftmp2,ftmp3,ftmp4 (rows after the transpose)
19 Note: ftmp0 must be 0 on entry. ftmp5~9 are used for temporary values, and tmp0 and ftmp10 are overwritten as well (they receive the pshufh control value 0x93).
20 */
21 #define TRANSPOSE_4H \
22 MMI_LI(%[tmp0], 0x93) \
23 "mtc1 %[tmp0], %[ftmp10] \n\t" \
24 "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
25 "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
26 "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
27 "por %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
28 "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \
29 "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
30 "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
31 "por %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \
32 "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
33 "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
34 "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
35 "por %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \
36 "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \
37 "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
38 "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
39 "por %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
40 "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \
41 "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \
42 "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \
43 "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
44 /* clang-format on */
45
/*
 * vp8_short_fdct4x4_mmi: 4x4 forward DCT (going by the name), implemented as
 * a single block of MMI inline assembly.
 *
 * input:  pointer to the first of four rows; four int16_t (8 bytes) are
 *         loaded from each row.
 * output: receives 16 int16_t results, written as four 8-byte stores at byte
 *         offsets 0x00, 0x08, 0x10 and 0x18.
 * pitch:  added as-is to the row pointer between row loads (MMI_ADDU), so it
 *         acts as a byte stride.
 *
 * The rows are transposed and run through a first pass, then transposed again
 * and run through a second pass that uses different rounding constants and
 * shift counts. All loads and stores use the gsldlc1/gsldrc1 and
 * gssdlc1/gssdrc1 left/right instruction pairs.
 */
vp8_short_fdct4x4_mmi(int16_t * input,int16_t * output,int pitch)46 void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
47 uint64_t tmp[1]; // integer scratch operand (tmp0) used to build immediates
48 int16_t *ip = input; // row pointer; advanced by pitch inside the asm
49 double ff_ph_op1, ff_ph_op3; // receive the packed multiplier pairs built at the top of the asm
50 // Every ftmpN is pinned to a fixed FP register; the o32 branch uses only even-numbered registers.
51 #if _MIPS_SIM == _ABIO32
52 register double ftmp0 asm("$f0");
53 register double ftmp1 asm("$f2");
54 register double ftmp2 asm("$f4");
55 register double ftmp3 asm("$f6");
56 register double ftmp4 asm("$f8");
57 register double ftmp5 asm("$f10");
58 register double ftmp6 asm("$f12");
59 register double ftmp7 asm("$f14");
60 register double ftmp8 asm("$f16");
61 register double ftmp9 asm("$f18");
62 register double ftmp10 asm("$f20");
63 register double ftmp11 asm("$f22");
64 register double ftmp12 asm("$f24");
65 #else
66 register double ftmp0 asm("$f0");
67 register double ftmp1 asm("$f1");
68 register double ftmp2 asm("$f2");
69 register double ftmp3 asm("$f3");
70 register double ftmp4 asm("$f4");
71 register double ftmp5 asm("$f5");
72 register double ftmp6 asm("$f6");
73 register double ftmp7 asm("$f7");
74 register double ftmp8 asm("$f8");
75 register double ftmp9 asm("$f9");
76 register double ftmp10 asm("$f10");
77 register double ftmp11 asm("$f11");
78 register double ftmp12 asm("$f12");
79 #endif // _MIPS_SIM == _ABIO32
80 // ff_ph_* constants repeat one value in four 16-bit lanes; ff_pw_* repeat one value in two 32-bit lanes.
81 DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
82 DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL };
83 DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL };
84 DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL };
85 DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL };
86 DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL };
87 DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL }; // NOTE(review): passed as an asm operand but never referenced in the template
88 DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL }; // NOTE(review): passed as an asm operand but never referenced in the template
89 DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL };
90
91 /* clang-format off */
92 __asm__ volatile (
93 "dli %[tmp0], 0x14e808a914e808a9 \n\t" // halfword lanes, low to high: 2217, 5352, 2217, 5352
94 "dmtc1 %[tmp0], %[ff_ph_op1] \n\t"
95 "dli %[tmp0], 0xeb1808a9eb1808a9 \n\t" // halfword lanes, low to high: 2217, -5352, 2217, -5352
96 "dmtc1 %[tmp0], %[ff_ph_op3] \n\t"
97 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" // ftmp0 = 0, as TRANSPOSE_4H requires
98 "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" // load row 0 (8 bytes); rows 1-3 follow, stepping ip by pitch
99 "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
100 MMI_ADDU(%[ip], %[ip], %[pitch])
101 "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t"
102 "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t"
103 MMI_ADDU(%[ip], %[ip], %[pitch])
104 "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t"
105 "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t"
106 MMI_ADDU(%[ip], %[ip], %[pitch])
107 "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t"
108 "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t"
109 MMI_ADDU(%[ip], %[ip], %[pitch]) // NOTE(review): ip is not read again after this final step
110 TRANSPOSE_4H
111 // First pass. ftmp11 holds 8 in every halfword lane, so a1..d1 below are the sums/differences multiplied by 8.
112 "ldc1 %[ftmp11], %[ff_ph_8] \n\t"
113 // f1 + f4
114 "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t"
115 // a1 = (f1 + f4) * 8
116 "pmullh %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
117 // f2 + f3
118 "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
119 // b1 = (f2 + f3) * 8
120 "pmullh %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
121 // f2 - f3
122 "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
123 // c1 = (f2 - f3) * 8
124 "pmullh %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
125 // f1 - f4
126 "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t"
127 // d1 = (f1 - f4) * 8
128 "pmullh %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
129 // op[0] = a1 + b1
130 "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
131 // op[2] = a1 - b1
132 "psubh %[ftmp3], %[ftmp5], %[ftmp6] \n\t"
133 // c1/d1 are interleaved so that each pmaddhw against ff_ph_op1/ff_ph_op3 forms the two-term sum for two lanes at once.
134 // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12
135 MMI_LI(%[tmp0], 0x0c) // shift count 12, moved into ftmp11 (replacing the multiplier 8)
136 "dmtc1 %[tmp0], %[ftmp11] \n\t"
137 "ldc1 %[ftmp12], %[ff_pw_14500] \n\t"
138 "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
139 "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t"
140 "punpckhhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
141 "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op1] \n\t"
142 "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
143 "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
144 "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
145 "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
146 "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
147
148 // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12
149 "ldc1 %[ftmp12], %[ff_pw_7500] \n\t"
150 "punpcklhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t"
151 "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op3] \n\t"
152 "punpckhhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t"
153 "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op3] \n\t"
154 "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
155 "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
156 "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
157 "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
158 "packsswh %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
159 TRANSPOSE_4H
160 // Second pass on the transposed first-pass results, without the *8 scaling: a1 = f1 + f4 (ftmp5), b1 = f2 + f3 (ftmp6), c1 = f2 - f3 (ftmp7), d1 = f1 - f4 (ftmp8).
161 "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t"
162 "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
163 "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
164 "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t"
165 // ftmp0 = (d1 != 0) per halfword lane: the equality mask is 0xffff where d1 == 0, and adding 1 turns that into 0 and every other lane into 1. ftmp0 is no longer zero from here on.
166 "pcmpeqh %[ftmp0], %[ftmp8], %[ftmp0] \n\t"
167 "ldc1 %[ftmp9], %[ff_ph_01] \n\t"
168 "paddh %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
169 // ftmp1 = (a1 + b1 + 7) >> 4, ftmp2 = (a1 - b1 + 7) >> 4
170 "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
171 "psubh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
172 "ldc1 %[ftmp9], %[ff_ph_07] \n\t"
173 "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
174 "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
175 MMI_LI(%[tmp0], 0x04)
176 "dmtc1 %[tmp0], %[ftmp9] \n\t"
177 "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
178 "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
179 // ftmp3 = ((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0); ftmp9 now carries the shift count 0x10
180 MMI_LI(%[tmp0], 0x10)
181 "mtc1 %[tmp0], %[ftmp9] \n\t"
182 "ldc1 %[ftmp12], %[ff_pw_12000] \n\t"
183 "punpcklhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t"
184 "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op1] \n\t"
185 "punpckhhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t"
186 "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op1] \n\t"
187 "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
188 "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
189 "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
190 "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
191 "packsswh %[ftmp3], %[ftmp10], %[ftmp11] \n\t"
192 "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
193 // ftmp4 = (d1 * 2217 - c1 * 5352 + 51000) >> 16
194 "ldc1 %[ftmp12], %[ff_pw_51000] \n\t"
195 "punpcklhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t"
196 "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op3] \n\t"
197 "punpckhhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t"
198 "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op3] \n\t"
199 "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
200 "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
201 "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
202 "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
203 "packsswh %[ftmp4], %[ftmp10], %[ftmp11] \n\t"
204 // Store order is ftmp1, ftmp3, ftmp2, ftmp4 at byte offsets 0x00, 0x08, 0x10, 0x18 of output.
205 "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t"
206 "gssdrc1 %[ftmp1], 0x00(%[output]) \n\t"
207 "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t"
208 "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t"
209 "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t"
210 "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t"
211 "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t"
212 "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t"
213
214 : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2),
215 [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5),
216 [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8),
217 [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11),
218 [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip),
219 [ff_ph_op1] "=&f"(ff_ph_op1), [ff_ph_op3] "=&f"(ff_ph_op3)
220 : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07),
221 [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500),
222 [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000),
223 [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217),
224 [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output) // NOTE(review): pitch is a plain int here, while vp8_short_walsh4x4_mmi casts it to mips_reg - confirm what MMI_ADDU expects
225 : "memory" // the asm stores through output, which is only an input register operand
226 );
227 /* clang-format on */
228 }
229
/*
 * vp8_short_fdct8x4_mmi: runs vp8_short_fdct4x4_mmi twice to cover two
 * horizontally adjacent 4x4 blocks.
 *
 * input:  first row of the left block; the right block starts 4 int16_t
 *         further along the same rows.
 * output: the left block's results go to output, the right block's to
 *         output + 16 (int16_t elements).
 * pitch:  forwarded unchanged to both calls.
 */
vp8_short_fdct8x4_mmi(int16_t * input,int16_t * output,int pitch)230 void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
231 vp8_short_fdct4x4_mmi(input, output, pitch); // left 4x4 block
232 vp8_short_fdct4x4_mmi(input + 4, output + 16, pitch); // right 4x4 block
233 }
234
/*
 * vp8_short_walsh4x4_mmi: 4x4 Walsh-Hadamard transform (going by the name),
 * implemented as a single block of MMI inline assembly.
 *
 * input:  pointer to the first of four rows; four int16_t (8 bytes) are
 *         loaded from each row. The parameter itself serves as the asm's
 *         read-write row pointer.
 * output: receives 16 int16_t results, written as four 8-byte stores at byte
 *         offsets 0x00, 0x08, 0x10 and 0x18.
 * pitch:  added as-is to the row pointer between row loads (MMI_ADDU), so it
 *         acts as a byte stride.
 *
 * The first pass works on 16-bit lanes with the inputs shifted left by 2.
 * The second pass widens to 32-bit lanes, applies
 * x = (x + (x < 0) + 3) >> 3 to every result and packs back to 16 bits.
 */
vp8_short_walsh4x4_mmi(int16_t * input,int16_t * output,int pitch)235 void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
236 double ftmp[13], ff_ph_01, ff_pw_01, ff_pw_03, ff_pw_mask; // FP scratch operands; the ff_* constants are built inside the asm
237 uint64_t tmp[1]; // integer scratch operand (tmp0) used to build immediates
238
239 /* clang-format off */
240 __asm__ volatile (
241 "dli %[tmp0], 0x0001000100010001 \n\t" // ff_ph_01: 1 in each of the four halfword lanes
242 "dmtc1 %[tmp0], %[ff_ph_01] \n\t"
243 "dli %[tmp0], 0x0000000100000001 \n\t" // ff_pw_01: 1 in each 32-bit lane, i.e. halfword lanes 1, 0, 1, 0 (low to high)
244 "dmtc1 %[tmp0], %[ff_pw_01] \n\t"
245 "dli %[tmp0], 0x0000000300000003 \n\t" // ff_pw_03: 3 in each 32-bit lane
246 "dmtc1 %[tmp0], %[ff_pw_03] \n\t"
247 "dli %[tmp0], 0x0001000000010000 \n\t" // ff_pw_mask: halfword lanes 0, 1, 0, 1 (low to high)
248 "dmtc1 %[tmp0], %[ff_pw_mask] \n\t"
249 MMI_LI(%[tmp0], 0x02) // shift count 2, moved into ftmp11 below
250 "pxor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" // ftmp0 = 0, as TRANSPOSE_4H requires; it stays zero for the whole function
251 "dmtc1 %[tmp0], %[ftmp11] \n\t"
252 // Load the four rows (8 bytes each), stepping ip by pitch between them.
253 "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
254 "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
255 MMI_ADDU(%[ip], %[ip], %[pitch])
256 "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t"
257 "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t"
258 MMI_ADDU(%[ip], %[ip], %[pitch])
259 "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t"
260 "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t"
261 MMI_ADDU(%[ip], %[ip], %[pitch])
262 "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t"
263 "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t"
264 TRANSPOSE_4H
265 // First pass on 16-bit lanes: every input is shifted left by ftmp11 = 2 first.
266 "psllh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
267 "psllh %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
268 "psllh %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
269 "psllh %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
270 // a = f1 + f3
271 "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
272 // d = f2 + f4
273 "paddh %[ftmp6], %[ftmp2], %[ftmp4] \n\t"
274 // c = f2 - f4
275 "psubh %[ftmp7], %[ftmp2], %[ftmp4] \n\t"
276 // b = f1 - f3
277 "psubh %[ftmp8], %[ftmp1], %[ftmp3] \n\t"
278
279 // a + d
280 "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
281 // b + c
282 "paddh %[ftmp2], %[ftmp8], %[ftmp7] \n\t"
283 // b - c
284 "psubh %[ftmp3], %[ftmp8], %[ftmp7] \n\t"
285 // a - d
286 "psubh %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
287 // ftmp6 = (a != 0) per halfword lane (equality mask 0xffff/0 plus 1), added to a + d only.
288 "pcmpeqh %[ftmp6], %[ftmp5], %[ftmp0] \n\t"
289 "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t"
290 "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
291 TRANSPOSE_4H
292 // Second pass on 32-bit lanes: pmaddhw against ff_pw_01 keeps the even halfwords of each register (x*1 + y*0), against ff_pw_mask the odd ones.
293 // op[2], op[0]
294 "pmaddhw %[ftmp5], %[ftmp1], %[ff_pw_01] \n\t"
295 // op[3], op[1]
296 "pmaddhw %[ftmp1], %[ftmp1], %[ff_pw_mask] \n\t"
297
298 // op[6], op[4]
299 "pmaddhw %[ftmp6], %[ftmp2], %[ff_pw_01] \n\t"
300 // op[7], op[5]
301 "pmaddhw %[ftmp2], %[ftmp2], %[ff_pw_mask] \n\t"
302
303 // op[10], op[8]
304 "pmaddhw %[ftmp7], %[ftmp3], %[ff_pw_01] \n\t"
305 // op[11], op[9]
306 "pmaddhw %[ftmp3], %[ftmp3], %[ff_pw_mask] \n\t"
307
308 // op[14], op[12]
309 "pmaddhw %[ftmp8], %[ftmp4], %[ff_pw_01] \n\t"
310 // op[15], op[13]
311 "pmaddhw %[ftmp4], %[ftmp4], %[ff_pw_mask] \n\t"
312 // Butterfly over the even-halfword registers (ftmp5..ftmp8); ftmp11 is reused as a temporary from here.
313 // a1, a3
314 "paddw %[ftmp9], %[ftmp5], %[ftmp7] \n\t"
315 // d1, d3
316 "paddw %[ftmp10], %[ftmp6], %[ftmp8] \n\t"
317 // c1, c3
318 "psubw %[ftmp11], %[ftmp6], %[ftmp8] \n\t"
319 // b1, b3
320 "psubw %[ftmp12], %[ftmp5], %[ftmp7] \n\t"
321
322 // a1 + d1, a3 + d3
323 "paddw %[ftmp5], %[ftmp9], %[ftmp10] \n\t"
324 // b1 + c1, b3 + c3
325 "paddw %[ftmp6], %[ftmp12], %[ftmp11] \n\t"
326 // b1 - c1, b3 - c3
327 "psubw %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
328 // a1 - d1, a3 - d3
329 "psubw %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
330 // Same butterfly over the odd-halfword registers (ftmp1..ftmp4).
331 // a2, a4
332 "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t"
333 // d2, d4
334 "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t"
335 // c2, c4
336 "psubw %[ftmp11], %[ftmp2], %[ftmp4] \n\t"
337 // b2, b4
338 "psubw %[ftmp12], %[ftmp1], %[ftmp3] \n\t"
339
340 // a2 + d2, a4 + d4
341 "paddw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
342 // b2 + c2, b4 + c4
343 "paddw %[ftmp2], %[ftmp12], %[ftmp11] \n\t"
344 // b2 - c2, b4 - c4
345 "psubw %[ftmp3], %[ftmp12], %[ftmp11] \n\t"
346 // a2 - d2, a4 - d4
347 "psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t"
348 // ftmp11 = shift count 3. Each of ftmp1..ftmp8 then gets, per 32-bit lane: x += (x < 0); x += 3; x >>= 3.
349 MMI_LI(%[tmp0], 0x03)
350 "dmtc1 %[tmp0], %[ftmp11] \n\t"
351
352 "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t"
353 "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
354 "paddw %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
355 "paddw %[ftmp1], %[ftmp1], %[ff_pw_03] \n\t"
356 "psraw %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
357
358 "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp2] \n\t"
359 "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
360 "paddw %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
361 "paddw %[ftmp2], %[ftmp2], %[ff_pw_03] \n\t"
362 "psraw %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
363
364 "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp3] \n\t"
365 "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
366 "paddw %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
367 "paddw %[ftmp3], %[ftmp3], %[ff_pw_03] \n\t"
368 "psraw %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
369
370 "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp4] \n\t"
371 "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
372 "paddw %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
373 "paddw %[ftmp4], %[ftmp4], %[ff_pw_03] \n\t"
374 "psraw %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
375
376 "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp5] \n\t"
377 "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
378 "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
379 "paddw %[ftmp5], %[ftmp5], %[ff_pw_03] \n\t"
380 "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
381
382 "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp6] \n\t"
383 "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
384 "paddw %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
385 "paddw %[ftmp6], %[ftmp6], %[ff_pw_03] \n\t"
386 "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
387
388 "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp7] \n\t"
389 "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
390 "paddw %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
391 "paddw %[ftmp7], %[ftmp7], %[ff_pw_03] \n\t"
392 "psraw %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
393
394 "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp8] \n\t"
395 "pand %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
396 "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t"
397 "paddw %[ftmp8], %[ftmp8], %[ff_pw_03] \n\t"
398 "psraw %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
399 // Pack back to 16 bits, pairing each odd-halfword result register with its even-halfword counterpart.
400 "packsswh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
401 "packsswh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
402 "packsswh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
403 "packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
404 // 0x72 is the pshufh control that reorders the four packed halfwords before the store - presumably back into natural lane order; confirm against the pshufh encoding.
405 MMI_LI(%[tmp0], 0x72)
406 "dmtc1 %[tmp0], %[ftmp11] \n\t"
407 "pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
408 "pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
409 "pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
410 "pshufh %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
411 // Store ftmp1..ftmp4 at byte offsets 0x00, 0x08, 0x10, 0x18 of op.
412 "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t"
413 "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t"
414 "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t"
415 "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t"
416 "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t"
417 "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t"
418 "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t"
419 "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t"
420 : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
421 [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
422 [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
423 [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
424 [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
425 [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
426 [ftmp12]"=&f"(ftmp[12]), [ff_pw_mask]"=&f"(ff_pw_mask),
427 [tmp0]"=&r"(tmp[0]), [ff_pw_01]"=&f"(ff_pw_01),
428 [ip]"+&r"(input), [ff_pw_03]"=&f"(ff_pw_03),
429 [ff_ph_01]"=&f"(ff_ph_01)
430 : [op]"r"(output), [pitch]"r"((mips_reg)pitch)
431 : "memory" // the asm stores through op, which is only an input register operand
432 );
433 /* clang-format on */
434 }
435