// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of dsp functions
//
// Author(s): Djordje Pesut ([email protected])
//            Jovan Zelincevic ([email protected])
14
15 #include "src/dsp/dsp.h"
16
17 #if defined(WEBP_USE_MIPS_DSP_R2)
18
19 #include "src/dsp/mips_macro.h"
20
21 static const int kC1 = WEBP_TRANSFORM_AC3_C1;
22 static const int kC2 = WEBP_TRANSFORM_AC3_C2;
23
TransformDC(const int16_t * in,uint8_t * dst)24 static void TransformDC(const int16_t* in, uint8_t* dst) {
25 int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
26
27 __asm__ volatile (
28 LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
29 0, 0, 0, 0,
30 0, 1, 2, 3,
31 BPS)
32 "lh %[temp5], 0(%[in]) \n\t"
33 "addiu %[temp5], %[temp5], 4 \n\t"
34 "ins %[temp5], %[temp5], 16, 16 \n\t"
35 "shra.ph %[temp5], %[temp5], 3 \n\t"
36 CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
37 temp3, temp1, temp2, temp3, temp4)
38 STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
39 temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
40 dst, 0, 1, 2, 3, BPS)
41
42 OUTPUT_EARLY_CLOBBER_REGS_10()
43 : [in]"r"(in), [dst]"r"(dst)
44 : "memory"
45 );
46 }
47
TransformAC3(const int16_t * in,uint8_t * dst)48 static void TransformAC3(const int16_t* in, uint8_t* dst) {
49 const int a = in[0] + 4;
50 int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
51 const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
52 const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
53 const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
54 int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
55 int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
56
57 __asm__ volatile (
58 "ins %[c4], %[d4], 16, 16 \n\t"
59 "replv.ph %[temp1], %[a] \n\t"
60 "replv.ph %[temp4], %[d1] \n\t"
61 ADD_SUB_HALVES(temp2, temp3, temp1, c4)
62 "replv.ph %[temp5], %[c1] \n\t"
63 SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
64 temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
65 LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
66 0, 0, 0, 0,
67 0, 1, 2, 3,
68 BPS)
69 CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
70 temp11, temp17, temp3, temp5, temp11, temp12)
71 PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
72 temp4, temp7, temp6, temp10, temp9)
73 STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
74 temp17, temp12, temp18, temp1, temp8, temp2, temp4,
75 temp7, temp6, dst, 0, 1, 2, 3, BPS)
76
77 OUTPUT_EARLY_CLOBBER_REGS_18(),
78 [c4]"+&r"(c4)
79 : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
80 : "memory"
81 );
82 }
83
TransformOne(const int16_t * in,uint8_t * dst)84 static void TransformOne(const int16_t* in, uint8_t* dst) {
85 int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
86 int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
87
88 __asm__ volatile (
89 "ulw %[temp1], 0(%[in]) \n\t"
90 "ulw %[temp2], 16(%[in]) \n\t"
91 LOAD_IN_X2(temp5, temp6, 24, 26)
92 ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
93 LOAD_IN_X2(temp1, temp2, 8, 10)
94 MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
95 temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
96 temp13, temp11, temp14, temp12)
97 INSERT_HALF_X2(temp8, temp7, temp10, temp9)
98 "ulw %[temp17], 4(%[in]) \n\t"
99 "ulw %[temp18], 20(%[in]) \n\t"
100 ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
101 ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
102 ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
103 LOAD_IN_X2(temp17, temp18, 12, 14)
104 LOAD_IN_X2(temp9, temp10, 28, 30)
105 MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
106 temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
107 temp15, temp4, temp16, temp17)
108 INSERT_HALF_X2(temp11, temp12, temp13, temp14)
109 ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
110 ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
111
112 // horizontal
113 SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
114 INSERT_HALF_X2(temp1, temp6, temp5, temp2)
115 SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
116 "repl.ph %[temp2], 0x4 \n\t"
117 INSERT_HALF_X2(temp3, temp8, temp17, temp4)
118 "addq.ph %[temp1], %[temp1], %[temp2] \n\t"
119 "addq.ph %[temp6], %[temp6], %[temp2] \n\t"
120 ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
121 ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
122 MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
123 temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
124 temp6, temp17, temp8, temp18)
125 MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
126 temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
127 temp18, temp12, temp17, temp16)
128 INSERT_HALF_X2(temp1, temp3, temp9, temp13)
129 INSERT_HALF_X2(temp6, temp8, temp11, temp15)
130 SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
131 temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
132 temp6)
133 PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
134 temp16, temp11, temp10, temp15, temp14)
135 LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
136 0, 0, 0, 0,
137 0, 1, 2, 3,
138 BPS)
139 CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
140 temp11, temp10, temp11, temp14, temp15)
141 STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
142 temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
143 dst, 0, 1, 2, 3, BPS)
144
145 OUTPUT_EARLY_CLOBBER_REGS_18()
146 : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
147 : "memory", "hi", "lo"
148 );
149 }
150
TransformTwo(const int16_t * in,uint8_t * dst,int do_two)151 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
152 TransformOne(in, dst);
153 if (do_two) {
154 TransformOne(in + 16, dst + 4);
155 }
156 }
157
FilterLoop26(uint8_t * p,int hstride,int vstride,int size,int thresh,int ithresh,int hev_thresh)158 static WEBP_INLINE void FilterLoop26(uint8_t* p,
159 int hstride, int vstride, int size,
160 int thresh, int ithresh, int hev_thresh) {
161 const int thresh2 = 2 * thresh + 1;
162 int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
163 int temp10, temp11, temp12, temp13, temp14, temp15;
164
165 __asm__ volatile (
166 ".set push \n\t"
167 ".set noreorder \n\t"
168 "1: \n\t"
169 "negu %[temp1], %[hstride] \n\t"
170 "addiu %[size], %[size], -1 \n\t"
171 "sll %[temp2], %[hstride], 1 \n\t"
172 "sll %[temp3], %[temp1], 1 \n\t"
173 "addu %[temp4], %[temp2], %[hstride] \n\t"
174 "addu %[temp5], %[temp3], %[temp1] \n\t"
175 "lbu %[temp7], 0(%[p]) \n\t"
176 "sll %[temp6], %[temp3], 1 \n\t"
177 "lbux %[temp8], %[temp5](%[p]) \n\t"
178 "lbux %[temp9], %[temp3](%[p]) \n\t"
179 "lbux %[temp10], %[temp1](%[p]) \n\t"
180 "lbux %[temp11], %[temp6](%[p]) \n\t"
181 "lbux %[temp12], %[hstride](%[p]) \n\t"
182 "lbux %[temp13], %[temp2](%[p]) \n\t"
183 "lbux %[temp14], %[temp4](%[p]) \n\t"
184 "subu %[temp1], %[temp10], %[temp7] \n\t"
185 "subu %[temp2], %[temp9], %[temp12] \n\t"
186 "absq_s.w %[temp3], %[temp1] \n\t"
187 "absq_s.w %[temp4], %[temp2] \n\t"
188 "negu %[temp1], %[temp1] \n\t"
189 "sll %[temp3], %[temp3], 2 \n\t"
190 "addu %[temp15], %[temp3], %[temp4] \n\t"
191 "subu %[temp3], %[temp15], %[thresh2] \n\t"
192 "sll %[temp6], %[temp1], 1 \n\t"
193 "bgtz %[temp3], 3f \n\t"
194 " subu %[temp4], %[temp11], %[temp8] \n\t"
195 "absq_s.w %[temp4], %[temp4] \n\t"
196 "shll_s.w %[temp2], %[temp2], 24 \n\t"
197 "subu %[temp4], %[temp4], %[ithresh] \n\t"
198 "bgtz %[temp4], 3f \n\t"
199 " subu %[temp3], %[temp8], %[temp9] \n\t"
200 "absq_s.w %[temp3], %[temp3] \n\t"
201 "subu %[temp3], %[temp3], %[ithresh] \n\t"
202 "bgtz %[temp3], 3f \n\t"
203 " subu %[temp5], %[temp9], %[temp10] \n\t"
204 "absq_s.w %[temp3], %[temp5] \n\t"
205 "absq_s.w %[temp5], %[temp5] \n\t"
206 "subu %[temp3], %[temp3], %[ithresh] \n\t"
207 "bgtz %[temp3], 3f \n\t"
208 " subu %[temp3], %[temp14], %[temp13] \n\t"
209 "absq_s.w %[temp3], %[temp3] \n\t"
210 "slt %[temp5], %[hev_thresh], %[temp5] \n\t"
211 "subu %[temp3], %[temp3], %[ithresh] \n\t"
212 "bgtz %[temp3], 3f \n\t"
213 " subu %[temp3], %[temp13], %[temp12] \n\t"
214 "absq_s.w %[temp3], %[temp3] \n\t"
215 "sra %[temp4], %[temp2], 24 \n\t"
216 "subu %[temp3], %[temp3], %[ithresh] \n\t"
217 "bgtz %[temp3], 3f \n\t"
218 " subu %[temp15], %[temp12], %[temp7] \n\t"
219 "absq_s.w %[temp3], %[temp15] \n\t"
220 "absq_s.w %[temp15], %[temp15] \n\t"
221 "subu %[temp3], %[temp3], %[ithresh] \n\t"
222 "bgtz %[temp3], 3f \n\t"
223 " slt %[temp15], %[hev_thresh], %[temp15] \n\t"
224 "addu %[temp3], %[temp6], %[temp1] \n\t"
225 "or %[temp2], %[temp5], %[temp15] \n\t"
226 "addu %[temp5], %[temp4], %[temp3] \n\t"
227 "beqz %[temp2], 4f \n\t"
228 " shra_r.w %[temp1], %[temp5], 3 \n\t"
229 "addiu %[temp2], %[temp5], 3 \n\t"
230 "sra %[temp2], %[temp2], 3 \n\t"
231 "shll_s.w %[temp1], %[temp1], 27 \n\t"
232 "shll_s.w %[temp2], %[temp2], 27 \n\t"
233 "subu %[temp3], %[p], %[hstride] \n\t"
234 "sra %[temp1], %[temp1], 27 \n\t"
235 "sra %[temp2], %[temp2], 27 \n\t"
236 "subu %[temp1], %[temp7], %[temp1] \n\t"
237 "addu %[temp2], %[temp10], %[temp2] \n\t"
238 "lbux %[temp2], %[temp2](%[VP8kclip1]) \n\t"
239 "lbux %[temp1], %[temp1](%[VP8kclip1]) \n\t"
240 "sb %[temp2], 0(%[temp3]) \n\t"
241 "j 3f \n\t"
242 " sb %[temp1], 0(%[p]) \n\t"
243 "4: \n\t"
244 "shll_s.w %[temp5], %[temp5], 24 \n\t"
245 "subu %[temp14], %[p], %[hstride] \n\t"
246 "subu %[temp11], %[temp14], %[hstride] \n\t"
247 "sra %[temp6], %[temp5], 24 \n\t"
248 "sll %[temp1], %[temp6], 3 \n\t"
249 "subu %[temp15], %[temp11], %[hstride] \n\t"
250 "addu %[temp2], %[temp6], %[temp1] \n\t"
251 "sll %[temp3], %[temp2], 1 \n\t"
252 "addu %[temp4], %[temp3], %[temp2] \n\t"
253 "addiu %[temp2], %[temp2], 63 \n\t"
254 "addiu %[temp3], %[temp3], 63 \n\t"
255 "addiu %[temp4], %[temp4], 63 \n\t"
256 "sra %[temp2], %[temp2], 7 \n\t"
257 "sra %[temp3], %[temp3], 7 \n\t"
258 "sra %[temp4], %[temp4], 7 \n\t"
259 "addu %[temp1], %[temp8], %[temp2] \n\t"
260 "addu %[temp5], %[temp9], %[temp3] \n\t"
261 "addu %[temp6], %[temp10], %[temp4] \n\t"
262 "subu %[temp8], %[temp7], %[temp4] \n\t"
263 "subu %[temp7], %[temp12], %[temp3] \n\t"
264 "addu %[temp10], %[p], %[hstride] \n\t"
265 "subu %[temp9], %[temp13], %[temp2] \n\t"
266 "addu %[temp12], %[temp10], %[hstride] \n\t"
267 "lbux %[temp2], %[temp1](%[VP8kclip1]) \n\t"
268 "lbux %[temp3], %[temp5](%[VP8kclip1]) \n\t"
269 "lbux %[temp4], %[temp6](%[VP8kclip1]) \n\t"
270 "lbux %[temp5], %[temp8](%[VP8kclip1]) \n\t"
271 "lbux %[temp6], %[temp7](%[VP8kclip1]) \n\t"
272 "lbux %[temp8], %[temp9](%[VP8kclip1]) \n\t"
273 "sb %[temp2], 0(%[temp15]) \n\t"
274 "sb %[temp3], 0(%[temp11]) \n\t"
275 "sb %[temp4], 0(%[temp14]) \n\t"
276 "sb %[temp5], 0(%[p]) \n\t"
277 "sb %[temp6], 0(%[temp10]) \n\t"
278 "sb %[temp8], 0(%[temp12]) \n\t"
279 "3: \n\t"
280 "bgtz %[size], 1b \n\t"
281 " addu %[p], %[p], %[vstride] \n\t"
282 ".set pop \n\t"
283 : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),
284 [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
285 [temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),
286 [temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),
287 [temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),
288 [size]"+&r"(size), [p]"+&r"(p)
289 : [hstride]"r"(hstride), [thresh2]"r"(thresh2),
290 [ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
291 [VP8kclip1]"r"(VP8kclip1)
292 : "memory"
293 );
294 }
295
FilterLoop24(uint8_t * p,int hstride,int vstride,int size,int thresh,int ithresh,int hev_thresh)296 static WEBP_INLINE void FilterLoop24(uint8_t* p,
297 int hstride, int vstride, int size,
298 int thresh, int ithresh, int hev_thresh) {
299 int p0, q0, p1, q1, p2, q2, p3, q3;
300 int step1, step2, temp1, temp2, temp3, temp4;
301 uint8_t* pTemp0;
302 uint8_t* pTemp1;
303 const int thresh2 = 2 * thresh + 1;
304
305 __asm__ volatile (
306 ".set push \n\t"
307 ".set noreorder \n\t"
308 "bltz %[size], 3f \n\t"
309 " nop \n\t"
310 "2: \n\t"
311 "negu %[step1], %[hstride] \n\t"
312 "lbu %[q0], 0(%[p]) \n\t"
313 "lbux %[p0], %[step1](%[p]) \n\t"
314 "subu %[step1], %[step1], %[hstride] \n\t"
315 "lbux %[q1], %[hstride](%[p]) \n\t"
316 "subu %[temp1], %[p0], %[q0] \n\t"
317 "lbux %[p1], %[step1](%[p]) \n\t"
318 "addu %[step2], %[hstride], %[hstride] \n\t"
319 "absq_s.w %[temp2], %[temp1] \n\t"
320 "subu %[temp3], %[p1], %[q1] \n\t"
321 "absq_s.w %[temp4], %[temp3] \n\t"
322 "sll %[temp2], %[temp2], 2 \n\t"
323 "addu %[temp2], %[temp2], %[temp4] \n\t"
324 "subu %[temp4], %[temp2], %[thresh2] \n\t"
325 "subu %[step1], %[step1], %[hstride] \n\t"
326 "bgtz %[temp4], 0f \n\t"
327 " lbux %[p2], %[step1](%[p]) \n\t"
328 "subu %[step1], %[step1], %[hstride] \n\t"
329 "lbux %[q2], %[step2](%[p]) \n\t"
330 "lbux %[p3], %[step1](%[p]) \n\t"
331 "subu %[temp4], %[p2], %[p1] \n\t"
332 "addu %[step2], %[step2], %[hstride] \n\t"
333 "subu %[temp2], %[p3], %[p2] \n\t"
334 "absq_s.w %[temp4], %[temp4] \n\t"
335 "absq_s.w %[temp2], %[temp2] \n\t"
336 "lbux %[q3], %[step2](%[p]) \n\t"
337 "subu %[temp4], %[temp4], %[ithresh] \n\t"
338 "negu %[temp1], %[temp1] \n\t"
339 "bgtz %[temp4], 0f \n\t"
340 " subu %[temp2], %[temp2], %[ithresh] \n\t"
341 "subu %[p3], %[p1], %[p0] \n\t"
342 "bgtz %[temp2], 0f \n\t"
343 " absq_s.w %[p3], %[p3] \n\t"
344 "subu %[temp4], %[q3], %[q2] \n\t"
345 "subu %[pTemp0], %[p], %[hstride] \n\t"
346 "absq_s.w %[temp4], %[temp4] \n\t"
347 "subu %[temp2], %[p3], %[ithresh] \n\t"
348 "sll %[step1], %[temp1], 1 \n\t"
349 "bgtz %[temp2], 0f \n\t"
350 " subu %[temp4], %[temp4], %[ithresh] \n\t"
351 "subu %[temp2], %[q2], %[q1] \n\t"
352 "bgtz %[temp4], 0f \n\t"
353 " absq_s.w %[temp2], %[temp2] \n\t"
354 "subu %[q3], %[q1], %[q0] \n\t"
355 "absq_s.w %[q3], %[q3] \n\t"
356 "subu %[temp2], %[temp2], %[ithresh] \n\t"
357 "addu %[temp1], %[temp1], %[step1] \n\t"
358 "bgtz %[temp2], 0f \n\t"
359 " subu %[temp4], %[q3], %[ithresh] \n\t"
360 "slt %[p3], %[hev_thresh], %[p3] \n\t"
361 "bgtz %[temp4], 0f \n\t"
362 " slt %[q3], %[hev_thresh], %[q3] \n\t"
363 "or %[q3], %[q3], %[p3] \n\t"
364 "bgtz %[q3], 1f \n\t"
365 " shra_r.w %[temp2], %[temp1], 3 \n\t"
366 "addiu %[temp1], %[temp1], 3 \n\t"
367 "sra %[temp1], %[temp1], 3 \n\t"
368 "shll_s.w %[temp2], %[temp2], 27 \n\t"
369 "shll_s.w %[temp1], %[temp1], 27 \n\t"
370 "addu %[pTemp1], %[p], %[hstride] \n\t"
371 "sra %[temp2], %[temp2], 27 \n\t"
372 "sra %[temp1], %[temp1], 27 \n\t"
373 "addiu %[step1], %[temp2], 1 \n\t"
374 "sra %[step1], %[step1], 1 \n\t"
375 "addu %[p0], %[p0], %[temp1] \n\t"
376 "addu %[p1], %[p1], %[step1] \n\t"
377 "subu %[q0], %[q0], %[temp2] \n\t"
378 "subu %[q1], %[q1], %[step1] \n\t"
379 "lbux %[temp2], %[p0](%[VP8kclip1]) \n\t"
380 "lbux %[temp3], %[q0](%[VP8kclip1]) \n\t"
381 "lbux %[temp4], %[q1](%[VP8kclip1]) \n\t"
382 "sb %[temp2], 0(%[pTemp0]) \n\t"
383 "lbux %[temp1], %[p1](%[VP8kclip1]) \n\t"
384 "subu %[pTemp0], %[pTemp0], %[hstride] \n\t"
385 "sb %[temp3], 0(%[p]) \n\t"
386 "sb %[temp4], 0(%[pTemp1]) \n\t"
387 "j 0f \n\t"
388 " sb %[temp1], 0(%[pTemp0]) \n\t"
389 "1: \n\t"
390 "shll_s.w %[temp3], %[temp3], 24 \n\t"
391 "sra %[temp3], %[temp3], 24 \n\t"
392 "addu %[temp1], %[temp1], %[temp3] \n\t"
393 "shra_r.w %[temp2], %[temp1], 3 \n\t"
394 "addiu %[temp1], %[temp1], 3 \n\t"
395 "shll_s.w %[temp2], %[temp2], 27 \n\t"
396 "sra %[temp1], %[temp1], 3 \n\t"
397 "shll_s.w %[temp1], %[temp1], 27 \n\t"
398 "sra %[temp2], %[temp2], 27 \n\t"
399 "sra %[temp1], %[temp1], 27 \n\t"
400 "addu %[p0], %[p0], %[temp1] \n\t"
401 "subu %[q0], %[q0], %[temp2] \n\t"
402 "lbux %[temp1], %[p0](%[VP8kclip1]) \n\t"
403 "lbux %[temp2], %[q0](%[VP8kclip1]) \n\t"
404 "sb %[temp2], 0(%[p]) \n\t"
405 "sb %[temp1], 0(%[pTemp0]) \n\t"
406 "0: \n\t"
407 "subu %[size], %[size], 1 \n\t"
408 "bgtz %[size], 2b \n\t"
409 " addu %[p], %[p], %[vstride] \n\t"
410 "3: \n\t"
411 ".set pop \n\t"
412 : [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
413 [p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
414 [step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
415 [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
416 [pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
417 [size]"+&r"(size)
418 : [vstride]"r"(vstride), [ithresh]"r"(ithresh),
419 [hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
420 [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
421 : "memory"
422 );
423 }
424
// on macroblock edges
VFilter16(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)426 static void VFilter16(uint8_t* p, int stride,
427 int thresh, int ithresh, int hev_thresh) {
428 FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
429 }
430
HFilter16(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)431 static void HFilter16(uint8_t* p, int stride,
432 int thresh, int ithresh, int hev_thresh) {
433 FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
434 }
435
// 8-pixels wide variant, for chroma filtering
VFilter8(uint8_t * u,uint8_t * v,int stride,int thresh,int ithresh,int hev_thresh)437 static void VFilter8(uint8_t* u, uint8_t* v, int stride,
438 int thresh, int ithresh, int hev_thresh) {
439 FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
440 FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
441 }
442
HFilter8(uint8_t * u,uint8_t * v,int stride,int thresh,int ithresh,int hev_thresh)443 static void HFilter8(uint8_t* u, uint8_t* v, int stride,
444 int thresh, int ithresh, int hev_thresh) {
445 FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
446 FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
447 }
448
// on three inner edges
VFilter16i(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)450 static void VFilter16i(uint8_t* p, int stride,
451 int thresh, int ithresh, int hev_thresh) {
452 int k;
453 for (k = 3; k > 0; --k) {
454 p += 4 * stride;
455 FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
456 }
457 }
458
HFilter16i(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)459 static void HFilter16i(uint8_t* p, int stride,
460 int thresh, int ithresh, int hev_thresh) {
461 int k;
462 for (k = 3; k > 0; --k) {
463 p += 4;
464 FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
465 }
466 }
467
VFilter8i(uint8_t * u,uint8_t * v,int stride,int thresh,int ithresh,int hev_thresh)468 static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
469 int thresh, int ithresh, int hev_thresh) {
470 FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
471 FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
472 }
473
HFilter8i(uint8_t * u,uint8_t * v,int stride,int thresh,int ithresh,int hev_thresh)474 static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
475 int thresh, int ithresh, int hev_thresh) {
476 FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
477 FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
478 }
479
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)

SimpleVFilter16(uint8_t * p,int stride,int thresh)483 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
484 int i;
485 const int thresh2 = 2 * thresh + 1;
486 int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
487 uint8_t* p1 = p - stride;
488 __asm__ volatile (
489 ".set push \n\t"
490 ".set noreorder \n\t"
491 "li %[i], 16 \n\t"
492 "0: \n\t"
493 "negu %[temp4], %[stride] \n\t"
494 "sll %[temp5], %[temp4], 1 \n\t"
495 "lbu %[temp2], 0(%[p]) \n\t"
496 "lbux %[temp3], %[stride](%[p]) \n\t"
497 "lbux %[temp1], %[temp4](%[p]) \n\t"
498 "lbux %[temp0], %[temp5](%[p]) \n\t"
499 "subu %[temp7], %[temp1], %[temp2] \n\t"
500 "subu %[temp6], %[temp0], %[temp3] \n\t"
501 "absq_s.w %[temp4], %[temp7] \n\t"
502 "absq_s.w %[temp5], %[temp6] \n\t"
503 "sll %[temp4], %[temp4], 2 \n\t"
504 "subu %[temp5], %[temp5], %[thresh2] \n\t"
505 "addu %[temp5], %[temp4], %[temp5] \n\t"
506 "negu %[temp8], %[temp7] \n\t"
507 "bgtz %[temp5], 1f \n\t"
508 " addiu %[i], %[i], -1 \n\t"
509 "sll %[temp4], %[temp8], 1 \n\t"
510 "shll_s.w %[temp5], %[temp6], 24 \n\t"
511 "addu %[temp3], %[temp4], %[temp8] \n\t"
512 "sra %[temp5], %[temp5], 24 \n\t"
513 "addu %[temp3], %[temp3], %[temp5] \n\t"
514 "addiu %[temp7], %[temp3], 3 \n\t"
515 "sra %[temp7], %[temp7], 3 \n\t"
516 "shra_r.w %[temp8], %[temp3], 3 \n\t"
517 "shll_s.w %[temp0], %[temp7], 27 \n\t"
518 "shll_s.w %[temp4], %[temp8], 27 \n\t"
519 "sra %[temp0], %[temp0], 27 \n\t"
520 "sra %[temp4], %[temp4], 27 \n\t"
521 "addu %[temp7], %[temp1], %[temp0] \n\t"
522 "subu %[temp2], %[temp2], %[temp4] \n\t"
523 "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
524 "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
525 "sb %[temp3], 0(%[p1]) \n\t"
526 "sb %[temp4], 0(%[p]) \n\t"
527 "1: \n\t"
528 "addiu %[p1], %[p1], 1 \n\t"
529 "bgtz %[i], 0b \n\t"
530 " addiu %[p], %[p], 1 \n\t"
531 " .set pop \n\t"
532 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
533 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
534 [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
535 [p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
536 : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
537 : "memory"
538 );
539 }
540
541 // TEMP0 = SRC[A + A1 * BPS]
542 // TEMP1 = SRC[B + B1 * BPS]
543 // TEMP2 = SRC[C + C1 * BPS]
544 // TEMP3 = SRC[D + D1 * BPS]
545 #define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3, \
546 A, A1, B, B1, C, C1, D, D1, SRC) \
547 "lbu %[" #TEMP0 "], " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
548 "lbu %[" #TEMP1 "], " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
549 "lbu %[" #TEMP2 "], " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
550 "lbu %[" #TEMP3 "], " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
551
SimpleHFilter16(uint8_t * p,int stride,int thresh)552 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
553 int i;
554 const int thresh2 = 2 * thresh + 1;
555 int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
556 __asm__ volatile (
557 ".set push \n\t"
558 ".set noreorder \n\t"
559 "li %[i], 16 \n\t"
560 "0: \n\t"
561 LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
562 "subu %[temp7], %[temp1], %[temp2] \n\t"
563 "subu %[temp6], %[temp0], %[temp3] \n\t"
564 "absq_s.w %[temp4], %[temp7] \n\t"
565 "absq_s.w %[temp5], %[temp6] \n\t"
566 "sll %[temp4], %[temp4], 2 \n\t"
567 "addu %[temp5], %[temp4], %[temp5] \n\t"
568 "subu %[temp5], %[temp5], %[thresh2] \n\t"
569 "negu %[temp8], %[temp7] \n\t"
570 "bgtz %[temp5], 1f \n\t"
571 " addiu %[i], %[i], -1 \n\t"
572 "sll %[temp4], %[temp8], 1 \n\t"
573 "shll_s.w %[temp5], %[temp6], 24 \n\t"
574 "addu %[temp3], %[temp4], %[temp8] \n\t"
575 "sra %[temp5], %[temp5], 24 \n\t"
576 "addu %[temp3], %[temp3], %[temp5] \n\t"
577 "addiu %[temp7], %[temp3], 3 \n\t"
578 "sra %[temp7], %[temp7], 3 \n\t"
579 "shra_r.w %[temp8], %[temp3], 3 \n\t"
580 "shll_s.w %[temp0], %[temp7], 27 \n\t"
581 "shll_s.w %[temp4], %[temp8], 27 \n\t"
582 "sra %[temp0], %[temp0], 27 \n\t"
583 "sra %[temp4], %[temp4], 27 \n\t"
584 "addu %[temp7], %[temp1], %[temp0] \n\t"
585 "subu %[temp2], %[temp2], %[temp4] \n\t"
586 "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
587 "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
588 "sb %[temp3], -1(%[p]) \n\t"
589 "sb %[temp4], 0(%[p]) \n\t"
590 "1: \n\t"
591 "bgtz %[i], 0b \n\t"
592 " addu %[p], %[p], %[stride] \n\t"
593 ".set pop \n\t"
594 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
595 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
596 [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
597 [p]"+&r"(p), [i]"=&r"(i)
598 : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
599 : "memory"
600 );
601 }
602
SimpleVFilter16i(uint8_t * p,int stride,int thresh)603 static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
604 int k;
605 for (k = 3; k > 0; --k) {
606 p += 4 * stride;
607 SimpleVFilter16(p, stride, thresh);
608 }
609 }
610
SimpleHFilter16i(uint8_t * p,int stride,int thresh)611 static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
612 int k;
613 for (k = 3; k > 0; --k) {
614 p += 4;
615 SimpleHFilter16(p, stride, thresh);
616 }
617 }
618
619 // DST[A * BPS] = TEMP0
620 // DST[B + C * BPS] = TEMP1
621 #define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST) \
622 "usw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #DST "]) \n\t" \
623 "usw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #DST "]) \n\t"
624
VE4(uint8_t * dst)625 static void VE4(uint8_t* dst) { // vertical
626 const uint8_t* top = dst - BPS;
627 int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
628 __asm__ volatile (
629 "ulw %[temp0], -1(%[top]) \n\t"
630 "ulh %[temp1], 3(%[top]) \n\t"
631 "preceu.ph.qbr %[temp2], %[temp0] \n\t"
632 "preceu.ph.qbl %[temp3], %[temp0] \n\t"
633 "preceu.ph.qbr %[temp4], %[temp1] \n\t"
634 "packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
635 "packrl.ph %[temp6], %[temp4], %[temp3] \n\t"
636 "shll.ph %[temp5], %[temp5], 1 \n\t"
637 "shll.ph %[temp6], %[temp6], 1 \n\t"
638 "addq.ph %[temp2], %[temp5], %[temp2] \n\t"
639 "addq.ph %[temp6], %[temp6], %[temp4] \n\t"
640 "addq.ph %[temp2], %[temp2], %[temp3] \n\t"
641 "addq.ph %[temp6], %[temp6], %[temp3] \n\t"
642 "shra_r.ph %[temp2], %[temp2], 2 \n\t"
643 "shra_r.ph %[temp6], %[temp6], 2 \n\t"
644 "precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t"
645 STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
646 STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
647 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
648 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
649 [temp6]"=&r"(temp6)
650 : [top]"r"(top), [dst]"r"(dst)
651 : "memory"
652 );
653 }
654
DC4(uint8_t * dst)655 static void DC4(uint8_t* dst) { // DC
656 int temp0, temp1, temp2, temp3, temp4;
657 __asm__ volatile (
658 "ulw %[temp0], -1*" XSTR(BPS) "(%[dst]) \n\t"
659 LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
660 "ins %[temp1], %[temp2], 8, 8 \n\t"
661 "ins %[temp1], %[temp3], 16, 8 \n\t"
662 "ins %[temp1], %[temp4], 24, 8 \n\t"
663 "raddu.w.qb %[temp0], %[temp0] \n\t"
664 "raddu.w.qb %[temp1], %[temp1] \n\t"
665 "addu %[temp0], %[temp0], %[temp1] \n\t"
666 "shra_r.w %[temp0], %[temp0], 3 \n\t"
667 "replv.qb %[temp0], %[temp0] \n\t"
668 STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
669 STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
670 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
671 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
672 : [dst]"r"(dst)
673 : "memory"
674 );
675 }
676
RD4(uint8_t * dst)677 static void RD4(uint8_t* dst) { // Down-right
678 int temp0, temp1, temp2, temp3, temp4;
679 int temp5, temp6, temp7, temp8;
680 __asm__ volatile (
681 LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
682 "ulw %[temp7], -1-" XSTR(BPS) "(%[dst]) \n\t"
683 "ins %[temp1], %[temp0], 16, 16 \n\t"
684 "preceu.ph.qbr %[temp5], %[temp7] \n\t"
685 "ins %[temp2], %[temp1], 16, 16 \n\t"
686 "preceu.ph.qbl %[temp4], %[temp7] \n\t"
687 "ins %[temp3], %[temp2], 16, 16 \n\t"
688 "shll.ph %[temp2], %[temp2], 1 \n\t"
689 "addq.ph %[temp3], %[temp3], %[temp1] \n\t"
690 "packrl.ph %[temp6], %[temp5], %[temp1] \n\t"
691 "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
692 "addq.ph %[temp1], %[temp1], %[temp5] \n\t"
693 "shll.ph %[temp6], %[temp6], 1 \n\t"
694 "addq.ph %[temp1], %[temp1], %[temp6] \n\t"
695 "packrl.ph %[temp0], %[temp4], %[temp5] \n\t"
696 "addq.ph %[temp8], %[temp5], %[temp4] \n\t"
697 "shra_r.ph %[temp3], %[temp3], 2 \n\t"
698 "shll.ph %[temp0], %[temp0], 1 \n\t"
699 "shra_r.ph %[temp1], %[temp1], 2 \n\t"
700 "addq.ph %[temp8], %[temp0], %[temp8] \n\t"
701 "lbu %[temp5], 3-" XSTR(BPS) "(%[dst]) \n\t"
702 "precrq.ph.w %[temp7], %[temp7], %[temp7] \n\t"
703 "shra_r.ph %[temp8], %[temp8], 2 \n\t"
704 "ins %[temp7], %[temp5], 0, 8 \n\t"
705 "precr.qb.ph %[temp2], %[temp1], %[temp3] \n\t"
706 "raddu.w.qb %[temp4], %[temp7] \n\t"
707 "precr.qb.ph %[temp6], %[temp8], %[temp1] \n\t"
708 "shra_r.w %[temp4], %[temp4], 2 \n\t"
709 STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
710 "prepend %[temp2], %[temp8], 8 \n\t"
711 "prepend %[temp6], %[temp4], 8 \n\t"
712 STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
713 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
714 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
715 [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
716 : [dst]"r"(dst)
717 : "memory"
718 );
719 }
720
721 // TEMP0 = SRC[A * BPS]
722 // TEMP1 = SRC[B + C * BPS]
723 #define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC) \
724 "ulw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
725 "ulw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
726
LD4(uint8_t * dst)727 static void LD4(uint8_t* dst) { // Down-Left
728 int temp0, temp1, temp2, temp3, temp4;
729 int temp5, temp6, temp7, temp8, temp9;
730 __asm__ volatile (
731 LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
732 "preceu.ph.qbl %[temp2], %[temp0] \n\t"
733 "preceu.ph.qbr %[temp3], %[temp0] \n\t"
734 "preceu.ph.qbr %[temp4], %[temp1] \n\t"
735 "preceu.ph.qbl %[temp5], %[temp1] \n\t"
736 "packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
737 "packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
738 "packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
739 "shll.ph %[temp6], %[temp6], 1 \n\t"
740 "addq.ph %[temp9], %[temp2], %[temp6] \n\t"
741 "shll.ph %[temp7], %[temp7], 1 \n\t"
742 "addq.ph %[temp9], %[temp9], %[temp3] \n\t"
743 "shll.ph %[temp8], %[temp8], 1 \n\t"
744 "shra_r.ph %[temp9], %[temp9], 2 \n\t"
745 "addq.ph %[temp3], %[temp4], %[temp7] \n\t"
746 "addq.ph %[temp0], %[temp5], %[temp8] \n\t"
747 "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
748 "addq.ph %[temp0], %[temp0], %[temp4] \n\t"
749 "shra_r.ph %[temp3], %[temp3], 2 \n\t"
750 "shra_r.ph %[temp0], %[temp0], 2 \n\t"
751 "srl %[temp1], %[temp1], 24 \n\t"
752 "sll %[temp1], %[temp1], 1 \n\t"
753 "raddu.w.qb %[temp5], %[temp5] \n\t"
754 "precr.qb.ph %[temp9], %[temp3], %[temp9] \n\t"
755 "precr.qb.ph %[temp3], %[temp0], %[temp3] \n\t"
756 "addu %[temp1], %[temp1], %[temp5] \n\t"
757 "shra_r.w %[temp1], %[temp1], 2 \n\t"
758 STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
759 "prepend %[temp9], %[temp0], 8 \n\t"
760 "prepend %[temp3], %[temp1], 8 \n\t"
761 STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
762 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
763 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
764 [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
765 [temp9]"=&r"(temp9)
766 : [dst]"r"(dst)
767 : "memory"
768 );
769 }
770
//------------------------------------------------------------------------------
// Chroma

// DC prediction for an 8x8 chroma block: fills the whole block with the
// rounded average of the 8 pixels above and the 8 pixels to the left of
// 'dst'.  Offset arguments are interpreted by the LOAD_*/STORE_* helpers
// from mips_macro.h (column, row pairs relative to 'dst').
static void DC8uv(uint8_t* dst) {  // DC
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8, temp9;
  __asm__ volatile (
    // Top row: 8 bytes as two words; left column: 8 bytes, one per row.
    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
    // raddu.w.qb sums the four unsigned bytes of each top word.
    "raddu.w.qb      %[temp0],   %[temp0]                  \n\t"
    "raddu.w.qb      %[temp1],   %[temp1]                  \n\t"
    // Pairwise reduction of the 8 left samples plus the two top sums.
    "addu            %[temp2],   %[temp2],   %[temp3]      \n\t"
    "addu            %[temp4],   %[temp4],   %[temp5]      \n\t"
    "addu            %[temp6],   %[temp6],   %[temp7]      \n\t"
    "addu            %[temp8],   %[temp8],   %[temp9]      \n\t"
    "addu            %[temp0],   %[temp0],   %[temp1]      \n\t"
    "addu            %[temp2],   %[temp2],   %[temp4]      \n\t"
    "addu            %[temp6],   %[temp6],   %[temp8]      \n\t"
    "addu            %[temp0],   %[temp0],   %[temp2]      \n\t"
    "addu            %[temp0],   %[temp0],   %[temp6]      \n\t"
    // Rounded average of the 16 samples, then replicate the byte into
    // all four lanes of the word.
    "shra_r.w        %[temp0],   %[temp0],   4             \n\t"
    "replv.qb        %[temp0],   %[temp0]                  \n\t"
    // Fill all 8 rows (8 bytes per row) with the DC value.
    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
      [temp9]"=&r"(temp9)
    : [dst]"r"(dst)
    : "memory"
  );
}
810
// DC prediction for an 8x8 chroma block when no left samples are
// available: averages only the 8 pixels above 'dst' (rounded >> 3) and
// fills the block with that value.
static void DC8uvNoLeft(uint8_t* dst) {  // DC with no left samples
  int temp0, temp1;
  __asm__ volatile (
    // Top row as two 4-byte words; raddu.w.qb sums the bytes of each.
    LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
    "raddu.w.qb      %[temp0],   %[temp0]                  \n\t"
    "raddu.w.qb      %[temp1],   %[temp1]                  \n\t"
    "addu            %[temp0],   %[temp0],   %[temp1]      \n\t"
    // Rounded average of 8 samples, replicated into all byte lanes.
    "shra_r.w        %[temp0],   %[temp0],   3             \n\t"
    "replv.qb        %[temp0],   %[temp0]                  \n\t"
    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
    : [dst]"r"(dst)
    : "memory"
  );
}
833
// DC prediction for an 8x8 chroma block when no top samples are
// available: averages only the 8 pixels in the column left of 'dst'
// (rounded >> 3) and fills the block with that value.
static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
  int temp0, temp1, temp2, temp3, temp4;
  int temp5, temp6, temp7, temp8;
  __asm__ volatile (
    // Left column, one byte per row; temp1 doubles as the 8th sample.
    LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
    // Pairwise reduction of the 8 samples into temp0.
    "addu            %[temp2],   %[temp2],   %[temp3]      \n\t"
    "addu            %[temp4],   %[temp4],   %[temp5]      \n\t"
    "addu            %[temp6],   %[temp6],   %[temp7]      \n\t"
    "addu            %[temp8],   %[temp8],   %[temp1]      \n\t"
    "addu            %[temp2],   %[temp2],   %[temp4]      \n\t"
    "addu            %[temp6],   %[temp6],   %[temp8]      \n\t"
    "addu            %[temp0],   %[temp6],   %[temp2]      \n\t"
    // Rounded average of 8 samples, replicated into all byte lanes.
    "shra_r.w        %[temp0],   %[temp0],   3             \n\t"
    "replv.qb        %[temp0],   %[temp0]                  \n\t"
    STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
    STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
    STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
    STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
    STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
    STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
    STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
    STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
      [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
    : [dst]"r"(dst)
    : "memory"
  );
}
864
865 #undef LOAD_8_BYTES
866 #undef STORE_8_BYTES
867 #undef LOAD_4_BYTES
868
// TrueMotion core for one group of 4 (or 8, when SIZE == 8) pixels:
// expands the packed top-row bytes in temp0 (and temp1) to unsigned
// halfwords, adds the per-row delta held in dst_1 (left pixel minus
// top-left pixel, replicated in both halfword lanes), and packs back to
// bytes with unsigned saturation.  The shll_s.ph-by-7 followed by
// precrqu_s.qb.ph clamps each 16-bit sum into [0, 255].  SIZE is an
// assemble-time constant selected by the ".if" directives.
// NOTE: comments cannot go inside the backslash-continued body, so the
// macro text below is unchanged.
#define CLIPPING(SIZE)                                                         \
  "preceu.ph.qbl   %[temp2],   %[temp0]                  \n\t"                 \
  "preceu.ph.qbr   %[temp0],   %[temp0]                  \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "preceu.ph.qbl   %[temp3],   %[temp1]                  \n\t"                 \
  "preceu.ph.qbr   %[temp1],   %[temp1]                  \n\t"                 \
".endif                                                  \n\t"                 \
  "addu.ph         %[temp2],   %[temp2],   %[dst_1]      \n\t"                 \
  "addu.ph         %[temp0],   %[temp0],   %[dst_1]      \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "addu.ph         %[temp3],   %[temp3],   %[dst_1]      \n\t"                 \
  "addu.ph         %[temp1],   %[temp1],   %[dst_1]      \n\t"                 \
".endif                                                  \n\t"                 \
  "shll_s.ph       %[temp2],   %[temp2],   7             \n\t"                 \
  "shll_s.ph       %[temp0],   %[temp0],   7             \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "shll_s.ph       %[temp3],   %[temp3],   7             \n\t"                 \
  "shll_s.ph       %[temp1],   %[temp1],   7             \n\t"                 \
".endif                                                  \n\t"                 \
  "precrqu_s.qb.ph %[temp0],   %[temp2],   %[temp0]      \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "precrqu_s.qb.ph %[temp1],   %[temp3],   %[temp1]      \n\t"                 \
".endif                                                  \n\t"
892
893
// Predicts one row of SIZE pixels with the TrueMotion rule
// dst[x] = clip(top[x] + dst[-1] - top[-1]):
// dst_1 holds this row's left pixel replicated into both halfword lanes;
// subtracting top_1 (top[-1], same layout, supplied by CLIP_TO_DST)
// yields the row delta consumed by CLIPPING.  SIZE selects one of three
// assemble-time paths: 4 pixels (one word), 8 (two words), or 16 (two
// passes of 8).  Unaligned loads/stores (ulw/usw) are used since rows
// need not be word-aligned.
#define CLIP_8B_TO_DST(DST, TOP, SIZE) do {                                    \
    int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1];                            \
    int temp0, temp1, temp2, temp3;                                            \
    __asm__ volatile (                                                         \
    ".if " #SIZE " < 8                                     \n\t"               \
      "ulw         %[temp0],   0(%[top])                   \n\t"               \
      "subu.ph     %[dst_1],   %[dst_1],    %[top_1]       \n\t"               \
      CLIPPING(4)                                                              \
      "usw         %[temp0],   0(%[dst])                   \n\t"               \
    ".else                                                 \n\t"               \
      "ulw         %[temp0],   0(%[top])                   \n\t"               \
      "ulw         %[temp1],   4(%[top])                   \n\t"               \
      "subu.ph     %[dst_1],   %[dst_1],    %[top_1]       \n\t"               \
      CLIPPING(8)                                                              \
      "usw         %[temp0],   0(%[dst])                   \n\t"               \
      "usw         %[temp1],   4(%[dst])                   \n\t"               \
    ".if " #SIZE " == 16                                   \n\t"               \
      "ulw         %[temp0],   8(%[top])                   \n\t"               \
      "ulw         %[temp1],   12(%[top])                  \n\t"               \
      CLIPPING(8)                                                              \
      "usw         %[temp0],   8(%[dst])                   \n\t"               \
      "usw         %[temp1],   12(%[dst])                  \n\t"               \
    ".endif                                                \n\t"               \
    ".endif                                                \n\t"               \
    : [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),           \
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                                 \
    : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST))                      \
    : "memory"                                                                 \
  );                                                                           \
} while (0)
924
// Applies TrueMotion prediction to a whole SIZE x SIZE block, one row at
// a time.  'top' is the row above the block (DST - BPS) and top_1 is the
// top-left pixel top[-1] replicated into both halfword lanes, precomputed
// once since it is invariant across rows.
#define CLIP_TO_DST(DST, SIZE) do {                                            \
  int y;                                                                       \
  const uint8_t* top = (DST) - BPS;                                            \
  const int top_1 = ((int)top[-1] << 16) + top[-1];                            \
  for (y = 0; y < (SIZE); ++y) {                                               \
    CLIP_8B_TO_DST((DST), top, (SIZE));                                        \
    (DST) += BPS;                                                              \
  }                                                                            \
} while (0)
934
// Generates the TrueMotion predictor entry points TrueMotion4,
// TrueMotion8 and TrueMotion16 (4x4 luma, 8x8 chroma, 16x16 luma).
#define TRUE_MOTION(DST, SIZE)                                                 \
static void TrueMotion##SIZE(uint8_t* (DST)) {                                 \
  CLIP_TO_DST((DST), (SIZE));                                                  \
}

TRUE_MOTION(dst, 4)
TRUE_MOTION(dst, 8)
TRUE_MOTION(dst, 16)
943
944 #undef TRUE_MOTION
945 #undef CLIP_TO_DST
946 #undef CLIP_8B_TO_DST
947 #undef CLIPPING
948
949 //------------------------------------------------------------------------------
950 // Entry point
951
952 extern void VP8DspInitMIPSdspR2(void);
953
VP8DspInitMIPSdspR2(void)954 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
955 VP8TransformDC = TransformDC;
956 VP8TransformAC3 = TransformAC3;
957 VP8Transform = TransformTwo;
958
959 VP8VFilter16 = VFilter16;
960 VP8HFilter16 = HFilter16;
961 VP8VFilter8 = VFilter8;
962 VP8HFilter8 = HFilter8;
963 VP8VFilter16i = VFilter16i;
964 VP8HFilter16i = HFilter16i;
965 VP8VFilter8i = VFilter8i;
966 VP8HFilter8i = HFilter8i;
967 VP8SimpleVFilter16 = SimpleVFilter16;
968 VP8SimpleHFilter16 = SimpleHFilter16;
969 VP8SimpleVFilter16i = SimpleVFilter16i;
970 VP8SimpleHFilter16i = SimpleHFilter16i;
971
972 VP8PredLuma4[0] = DC4;
973 VP8PredLuma4[1] = TrueMotion4;
974 VP8PredLuma4[2] = VE4;
975 VP8PredLuma4[4] = RD4;
976 VP8PredLuma4[6] = LD4;
977
978 VP8PredChroma8[0] = DC8uv;
979 VP8PredChroma8[1] = TrueMotion8;
980 VP8PredChroma8[4] = DC8uvNoTop;
981 VP8PredChroma8[5] = DC8uvNoLeft;
982
983 VP8PredLuma16[1] = TrueMotion16;
984 }
985
986 #else // !WEBP_USE_MIPS_DSP_R2
987
988 WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)
989
990 #endif // WEBP_USE_MIPS_DSP_R2
991