xref: /aosp_15_r20/external/webp/src/dsp/dec_mips_dsp_r2.c (revision b2055c353e87c8814eb2b6b1b11112a1562253bd)
1 // Copyright 2014 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // MIPS version of dsp functions
11 //
12 // Author(s):  Djordje Pesut    ([email protected])
13 //             Jovan Zelincevic ([email protected])
14 
15 #include "src/dsp/dsp.h"
16 
17 #if defined(WEBP_USE_MIPS_DSP_R2)
18 
19 #include "src/dsp/mips_macro.h"
20 
21 static const int kC1 = WEBP_TRANSFORM_AC3_C1;
22 static const int kC2 = WEBP_TRANSFORM_AC3_C2;
23 
TransformDC(const int16_t * in,uint8_t * dst)24 static void TransformDC(const int16_t* in, uint8_t* dst) {
25   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
26 
27   __asm__ volatile (
28     LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
29                         0, 0, 0, 0,
30                         0, 1, 2, 3,
31                         BPS)
32     "lh               %[temp5],  0(%[in])               \n\t"
33     "addiu            %[temp5],  %[temp5],  4           \n\t"
34     "ins              %[temp5],  %[temp5],  16, 16      \n\t"
35     "shra.ph          %[temp5],  %[temp5],  3           \n\t"
36     CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
37                             temp3, temp1, temp2, temp3, temp4)
38     STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
39                      temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
40                      dst, 0, 1, 2, 3, BPS)
41 
42     OUTPUT_EARLY_CLOBBER_REGS_10()
43     : [in]"r"(in), [dst]"r"(dst)
44     : "memory"
45   );
46 }
47 
TransformAC3(const int16_t * in,uint8_t * dst)48 static void TransformAC3(const int16_t* in, uint8_t* dst) {
49   const int a = in[0] + 4;
50   int c4 = WEBP_TRANSFORM_AC3_MUL2(in[4]);
51   const int d4 = WEBP_TRANSFORM_AC3_MUL1(in[4]);
52   const int c1 = WEBP_TRANSFORM_AC3_MUL2(in[1]);
53   const int d1 = WEBP_TRANSFORM_AC3_MUL1(in[1]);
54   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
55   int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
56 
57   __asm__ volatile (
58     "ins              %[c4],      %[d4],     16,       16    \n\t"
59     "replv.ph         %[temp1],   %[a]                       \n\t"
60     "replv.ph         %[temp4],   %[d1]                      \n\t"
61     ADD_SUB_HALVES(temp2, temp3, temp1, c4)
62     "replv.ph         %[temp5],   %[c1]                      \n\t"
63     SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
64                    temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
65     LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
66                         0, 0, 0, 0,
67                         0, 1, 2, 3,
68                         BPS)
69     CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
70                             temp11, temp17, temp3, temp5, temp11, temp12)
71     PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
72                           temp4, temp7, temp6, temp10, temp9)
73     STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
74                      temp17, temp12, temp18, temp1, temp8, temp2, temp4,
75                      temp7, temp6, dst, 0, 1, 2, 3, BPS)
76 
77     OUTPUT_EARLY_CLOBBER_REGS_18(),
78       [c4]"+&r"(c4)
79     : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
80     : "memory"
81   );
82 }
83 
TransformOne(const int16_t * in,uint8_t * dst)84 static void TransformOne(const int16_t* in, uint8_t* dst) {
85   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
86   int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
87 
88   __asm__ volatile (
89     "ulw              %[temp1],   0(%[in])                 \n\t"
90     "ulw              %[temp2],   16(%[in])                \n\t"
91     LOAD_IN_X2(temp5, temp6, 24, 26)
92     ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
93     LOAD_IN_X2(temp1, temp2, 8, 10)
94     MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
95                   temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
96                   temp13, temp11, temp14, temp12)
97     INSERT_HALF_X2(temp8, temp7, temp10, temp9)
98     "ulw              %[temp17],  4(%[in])                 \n\t"
99     "ulw              %[temp18],  20(%[in])                \n\t"
100     ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
101     ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
102     ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
103     LOAD_IN_X2(temp17, temp18, 12, 14)
104     LOAD_IN_X2(temp9, temp10, 28, 30)
105     MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
106                   temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
107                   temp15, temp4, temp16, temp17)
108     INSERT_HALF_X2(temp11, temp12, temp13, temp14)
109     ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
110     ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
111 
112     // horizontal
113     SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
114     INSERT_HALF_X2(temp1, temp6, temp5, temp2)
115     SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
116     "repl.ph          %[temp2],   0x4                      \n\t"
117     INSERT_HALF_X2(temp3, temp8, temp17, temp4)
118     "addq.ph          %[temp1],   %[temp1],  %[temp2]      \n\t"
119     "addq.ph          %[temp6],   %[temp6],  %[temp2]      \n\t"
120     ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
121     ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
122     MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
123                   temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
124                   temp6, temp17, temp8, temp18)
125     MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
126                   temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
127                   temp18, temp12, temp17, temp16)
128     INSERT_HALF_X2(temp1, temp3, temp9, temp13)
129     INSERT_HALF_X2(temp6, temp8, temp11, temp15)
130     SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
131                    temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
132                    temp6)
133     PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
134                           temp16, temp11, temp10, temp15, temp14)
135     LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
136                         0, 0, 0, 0,
137                         0, 1, 2, 3,
138                         BPS)
139     CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
140                             temp11, temp10, temp11, temp14, temp15)
141     STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
142                      temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
143                      dst, 0, 1, 2, 3, BPS)
144 
145     OUTPUT_EARLY_CLOBBER_REGS_18()
146     : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
147     : "memory", "hi", "lo"
148   );
149 }
150 
// Transforms one block, or two adjacent blocks when 'do_two' is non-zero.
// The second block takes its coefficients from in + 16 and writes to dst + 4.
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne(in, dst);
  if (do_two) {
    TransformOne(in + 16, dst + 4);
  }
}
157 
FilterLoop26(uint8_t * p,int hstride,int vstride,int size,int thresh,int ithresh,int hev_thresh)158 static WEBP_INLINE void FilterLoop26(uint8_t* p,
159                                      int hstride, int vstride, int size,
160                                      int thresh, int ithresh, int hev_thresh) {
161   const int thresh2 = 2 * thresh + 1;
162   int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
163   int temp10, temp11, temp12, temp13, temp14, temp15;
164 
165   __asm__ volatile (
166     ".set      push                                      \n\t"
167     ".set      noreorder                                 \n\t"
168   "1:                                                    \n\t"
169     "negu      %[temp1],  %[hstride]                     \n\t"
170     "addiu     %[size],   %[size],        -1             \n\t"
171     "sll       %[temp2],  %[hstride],     1              \n\t"
172     "sll       %[temp3],  %[temp1],       1              \n\t"
173     "addu      %[temp4],  %[temp2],       %[hstride]     \n\t"
174     "addu      %[temp5],  %[temp3],       %[temp1]       \n\t"
175     "lbu       %[temp7],  0(%[p])                        \n\t"
176     "sll       %[temp6],  %[temp3],       1              \n\t"
177     "lbux      %[temp8],  %[temp5](%[p])                 \n\t"
178     "lbux      %[temp9],  %[temp3](%[p])                 \n\t"
179     "lbux      %[temp10], %[temp1](%[p])                 \n\t"
180     "lbux      %[temp11], %[temp6](%[p])                 \n\t"
181     "lbux      %[temp12], %[hstride](%[p])               \n\t"
182     "lbux      %[temp13], %[temp2](%[p])                 \n\t"
183     "lbux      %[temp14], %[temp4](%[p])                 \n\t"
184     "subu      %[temp1],  %[temp10],      %[temp7]       \n\t"
185     "subu      %[temp2],  %[temp9],       %[temp12]      \n\t"
186     "absq_s.w  %[temp3],  %[temp1]                       \n\t"
187     "absq_s.w  %[temp4],  %[temp2]                       \n\t"
188     "negu      %[temp1],  %[temp1]                       \n\t"
189     "sll       %[temp3],  %[temp3],       2              \n\t"
190     "addu      %[temp15], %[temp3],       %[temp4]       \n\t"
191     "subu      %[temp3],  %[temp15],      %[thresh2]     \n\t"
192     "sll       %[temp6],  %[temp1],       1              \n\t"
193     "bgtz      %[temp3],  3f                             \n\t"
194     " subu     %[temp4],  %[temp11],      %[temp8]       \n\t"
195     "absq_s.w  %[temp4],  %[temp4]                       \n\t"
196     "shll_s.w  %[temp2],  %[temp2],       24             \n\t"
197     "subu      %[temp4],  %[temp4],       %[ithresh]     \n\t"
198     "bgtz      %[temp4],  3f                             \n\t"
199     " subu     %[temp3],  %[temp8],       %[temp9]       \n\t"
200     "absq_s.w  %[temp3],  %[temp3]                       \n\t"
201     "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
202     "bgtz      %[temp3],  3f                             \n\t"
203     " subu     %[temp5],  %[temp9],       %[temp10]      \n\t"
204     "absq_s.w  %[temp3],  %[temp5]                       \n\t"
205     "absq_s.w  %[temp5],  %[temp5]                       \n\t"
206     "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
207     "bgtz      %[temp3],  3f                             \n\t"
208     " subu     %[temp3],  %[temp14],      %[temp13]      \n\t"
209     "absq_s.w  %[temp3],  %[temp3]                       \n\t"
210     "slt       %[temp5],  %[hev_thresh],  %[temp5]       \n\t"
211     "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
212     "bgtz      %[temp3],  3f                             \n\t"
213     " subu     %[temp3],  %[temp13],      %[temp12]      \n\t"
214     "absq_s.w  %[temp3],  %[temp3]                       \n\t"
215     "sra       %[temp4],  %[temp2],       24             \n\t"
216     "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
217     "bgtz      %[temp3],  3f                             \n\t"
218     " subu     %[temp15], %[temp12],      %[temp7]       \n\t"
219     "absq_s.w  %[temp3],  %[temp15]                      \n\t"
220     "absq_s.w  %[temp15], %[temp15]                      \n\t"
221     "subu      %[temp3],  %[temp3],       %[ithresh]     \n\t"
222     "bgtz      %[temp3],  3f                             \n\t"
223     " slt      %[temp15], %[hev_thresh],  %[temp15]      \n\t"
224     "addu      %[temp3],  %[temp6],       %[temp1]       \n\t"
225     "or        %[temp2],  %[temp5],       %[temp15]      \n\t"
226     "addu      %[temp5],  %[temp4],       %[temp3]       \n\t"
227     "beqz      %[temp2],  4f                             \n\t"
228     " shra_r.w %[temp1],  %[temp5],       3              \n\t"
229     "addiu     %[temp2],  %[temp5],       3              \n\t"
230     "sra       %[temp2],  %[temp2],       3              \n\t"
231     "shll_s.w  %[temp1],  %[temp1],       27             \n\t"
232     "shll_s.w  %[temp2],  %[temp2],       27             \n\t"
233     "subu      %[temp3],  %[p],           %[hstride]     \n\t"
234     "sra       %[temp1],  %[temp1],       27             \n\t"
235     "sra       %[temp2],  %[temp2],       27             \n\t"
236     "subu      %[temp1],  %[temp7],       %[temp1]       \n\t"
237     "addu      %[temp2],  %[temp10],      %[temp2]       \n\t"
238     "lbux      %[temp2],  %[temp2](%[VP8kclip1])         \n\t"
239     "lbux      %[temp1],  %[temp1](%[VP8kclip1])         \n\t"
240     "sb        %[temp2],  0(%[temp3])                    \n\t"
241     "j         3f                                        \n\t"
242     " sb       %[temp1],  0(%[p])                        \n\t"
243   "4:                                                    \n\t"
244     "shll_s.w  %[temp5],  %[temp5],       24             \n\t"
245     "subu      %[temp14], %[p],           %[hstride]     \n\t"
246     "subu      %[temp11], %[temp14],      %[hstride]     \n\t"
247     "sra       %[temp6],  %[temp5],       24             \n\t"
248     "sll       %[temp1],  %[temp6],       3              \n\t"
249     "subu      %[temp15], %[temp11],      %[hstride]     \n\t"
250     "addu      %[temp2],  %[temp6],       %[temp1]       \n\t"
251     "sll       %[temp3],  %[temp2],       1              \n\t"
252     "addu      %[temp4],  %[temp3],       %[temp2]       \n\t"
253     "addiu     %[temp2],  %[temp2],       63             \n\t"
254     "addiu     %[temp3],  %[temp3],       63             \n\t"
255     "addiu     %[temp4],  %[temp4],       63             \n\t"
256     "sra       %[temp2],  %[temp2],       7              \n\t"
257     "sra       %[temp3],  %[temp3],       7              \n\t"
258     "sra       %[temp4],  %[temp4],       7              \n\t"
259     "addu      %[temp1],  %[temp8],       %[temp2]       \n\t"
260     "addu      %[temp5],  %[temp9],       %[temp3]       \n\t"
261     "addu      %[temp6],  %[temp10],      %[temp4]       \n\t"
262     "subu      %[temp8],  %[temp7],       %[temp4]       \n\t"
263     "subu      %[temp7],  %[temp12],      %[temp3]       \n\t"
264     "addu      %[temp10], %[p],           %[hstride]     \n\t"
265     "subu      %[temp9],  %[temp13],      %[temp2]       \n\t"
266     "addu      %[temp12], %[temp10],      %[hstride]     \n\t"
267     "lbux      %[temp2],  %[temp1](%[VP8kclip1])         \n\t"
268     "lbux      %[temp3],  %[temp5](%[VP8kclip1])         \n\t"
269     "lbux      %[temp4],  %[temp6](%[VP8kclip1])         \n\t"
270     "lbux      %[temp5],  %[temp8](%[VP8kclip1])         \n\t"
271     "lbux      %[temp6],  %[temp7](%[VP8kclip1])         \n\t"
272     "lbux      %[temp8],  %[temp9](%[VP8kclip1])         \n\t"
273     "sb        %[temp2],  0(%[temp15])                   \n\t"
274     "sb        %[temp3],  0(%[temp11])                   \n\t"
275     "sb        %[temp4],  0(%[temp14])                   \n\t"
276     "sb        %[temp5],  0(%[p])                        \n\t"
277     "sb        %[temp6],  0(%[temp10])                   \n\t"
278     "sb        %[temp8],  0(%[temp12])                   \n\t"
279   "3:                                                    \n\t"
280     "bgtz      %[size],   1b                             \n\t"
281     " addu     %[p],      %[p],           %[vstride]     \n\t"
282     ".set      pop                                       \n\t"
283     : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),
284       [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
285       [temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),
286       [temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),
287       [temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),
288       [size]"+&r"(size), [p]"+&r"(p)
289     : [hstride]"r"(hstride), [thresh2]"r"(thresh2),
290       [ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
291       [VP8kclip1]"r"(VP8kclip1)
292     : "memory"
293   );
294 }
295 
FilterLoop24(uint8_t * p,int hstride,int vstride,int size,int thresh,int ithresh,int hev_thresh)296 static WEBP_INLINE void FilterLoop24(uint8_t* p,
297                                      int hstride, int vstride, int size,
298                                      int thresh, int ithresh, int hev_thresh) {
299   int p0, q0, p1, q1, p2, q2, p3, q3;
300   int step1, step2, temp1, temp2, temp3, temp4;
301   uint8_t* pTemp0;
302   uint8_t* pTemp1;
303   const int thresh2 = 2 * thresh + 1;
304 
305   __asm__ volatile (
306     ".set      push                                   \n\t"
307     ".set      noreorder                              \n\t"
308     "bltz      %[size],    3f                         \n\t"
309     " nop                                             \n\t"
310   "2:                                                 \n\t"
311     "negu      %[step1],   %[hstride]                 \n\t"
312     "lbu       %[q0],      0(%[p])                    \n\t"
313     "lbux      %[p0],      %[step1](%[p])             \n\t"
314     "subu      %[step1],   %[step1],      %[hstride]  \n\t"
315     "lbux      %[q1],      %[hstride](%[p])           \n\t"
316     "subu      %[temp1],   %[p0],         %[q0]       \n\t"
317     "lbux      %[p1],      %[step1](%[p])             \n\t"
318     "addu      %[step2],   %[hstride],    %[hstride]  \n\t"
319     "absq_s.w  %[temp2],   %[temp1]                   \n\t"
320     "subu      %[temp3],   %[p1],         %[q1]       \n\t"
321     "absq_s.w  %[temp4],   %[temp3]                   \n\t"
322     "sll       %[temp2],   %[temp2],      2           \n\t"
323     "addu      %[temp2],   %[temp2],      %[temp4]    \n\t"
324     "subu      %[temp4],   %[temp2],      %[thresh2]  \n\t"
325     "subu      %[step1],   %[step1],      %[hstride]  \n\t"
326     "bgtz      %[temp4],   0f                         \n\t"
327     " lbux     %[p2],      %[step1](%[p])             \n\t"
328     "subu      %[step1],   %[step1],      %[hstride]  \n\t"
329     "lbux      %[q2],      %[step2](%[p])             \n\t"
330     "lbux      %[p3],      %[step1](%[p])             \n\t"
331     "subu      %[temp4],   %[p2],         %[p1]       \n\t"
332     "addu      %[step2],   %[step2],      %[hstride]  \n\t"
333     "subu      %[temp2],   %[p3],         %[p2]       \n\t"
334     "absq_s.w  %[temp4],   %[temp4]                   \n\t"
335     "absq_s.w  %[temp2],   %[temp2]                   \n\t"
336     "lbux      %[q3],      %[step2](%[p])             \n\t"
337     "subu      %[temp4],   %[temp4],      %[ithresh]  \n\t"
338     "negu      %[temp1],   %[temp1]                   \n\t"
339     "bgtz      %[temp4],   0f                         \n\t"
340     " subu     %[temp2],   %[temp2],      %[ithresh]  \n\t"
341     "subu      %[p3],      %[p1],         %[p0]       \n\t"
342     "bgtz      %[temp2],   0f                         \n\t"
343     " absq_s.w %[p3],      %[p3]                      \n\t"
344     "subu      %[temp4],   %[q3],         %[q2]       \n\t"
345     "subu      %[pTemp0],  %[p],          %[hstride]  \n\t"
346     "absq_s.w  %[temp4],   %[temp4]                   \n\t"
347     "subu      %[temp2],   %[p3],         %[ithresh]  \n\t"
348     "sll       %[step1],   %[temp1],      1           \n\t"
349     "bgtz      %[temp2],   0f                         \n\t"
350     " subu     %[temp4],   %[temp4],      %[ithresh]  \n\t"
351     "subu      %[temp2],   %[q2],         %[q1]       \n\t"
352     "bgtz      %[temp4],   0f                         \n\t"
353     " absq_s.w %[temp2],   %[temp2]                   \n\t"
354     "subu      %[q3],      %[q1],         %[q0]       \n\t"
355     "absq_s.w  %[q3],      %[q3]                      \n\t"
356     "subu      %[temp2],   %[temp2],      %[ithresh]  \n\t"
357     "addu      %[temp1],   %[temp1],      %[step1]    \n\t"
358     "bgtz      %[temp2],   0f                         \n\t"
359     " subu     %[temp4],   %[q3],         %[ithresh]  \n\t"
360     "slt       %[p3],      %[hev_thresh], %[p3]       \n\t"
361     "bgtz      %[temp4],   0f                         \n\t"
362     " slt      %[q3],      %[hev_thresh], %[q3]       \n\t"
363     "or        %[q3],      %[q3],         %[p3]       \n\t"
364     "bgtz      %[q3],      1f                         \n\t"
365     " shra_r.w %[temp2],   %[temp1],      3           \n\t"
366     "addiu     %[temp1],   %[temp1],      3           \n\t"
367     "sra       %[temp1],   %[temp1],      3           \n\t"
368     "shll_s.w  %[temp2],   %[temp2],      27          \n\t"
369     "shll_s.w  %[temp1],   %[temp1],      27          \n\t"
370     "addu      %[pTemp1],  %[p],          %[hstride]  \n\t"
371     "sra       %[temp2],   %[temp2],      27          \n\t"
372     "sra       %[temp1],   %[temp1],      27          \n\t"
373     "addiu     %[step1],   %[temp2],      1           \n\t"
374     "sra       %[step1],   %[step1],      1           \n\t"
375     "addu      %[p0],      %[p0],         %[temp1]    \n\t"
376     "addu      %[p1],      %[p1],         %[step1]    \n\t"
377     "subu      %[q0],      %[q0],         %[temp2]    \n\t"
378     "subu      %[q1],      %[q1],         %[step1]    \n\t"
379     "lbux      %[temp2],   %[p0](%[VP8kclip1])        \n\t"
380     "lbux      %[temp3],   %[q0](%[VP8kclip1])        \n\t"
381     "lbux      %[temp4],   %[q1](%[VP8kclip1])        \n\t"
382     "sb        %[temp2],   0(%[pTemp0])               \n\t"
383     "lbux      %[temp1],   %[p1](%[VP8kclip1])        \n\t"
384     "subu      %[pTemp0],  %[pTemp0],    %[hstride]   \n\t"
385     "sb        %[temp3],   0(%[p])                    \n\t"
386     "sb        %[temp4],   0(%[pTemp1])               \n\t"
387     "j         0f                                     \n\t"
388     " sb       %[temp1],   0(%[pTemp0])               \n\t"
389   "1:                                                 \n\t"
390     "shll_s.w  %[temp3],   %[temp3],      24          \n\t"
391     "sra       %[temp3],   %[temp3],      24          \n\t"
392     "addu      %[temp1],   %[temp1],      %[temp3]    \n\t"
393     "shra_r.w  %[temp2],   %[temp1],      3           \n\t"
394     "addiu     %[temp1],   %[temp1],      3           \n\t"
395     "shll_s.w  %[temp2],   %[temp2],      27          \n\t"
396     "sra       %[temp1],   %[temp1],      3           \n\t"
397     "shll_s.w  %[temp1],   %[temp1],      27          \n\t"
398     "sra       %[temp2],   %[temp2],      27          \n\t"
399     "sra       %[temp1],   %[temp1],      27          \n\t"
400     "addu      %[p0],      %[p0],         %[temp1]    \n\t"
401     "subu      %[q0],      %[q0],         %[temp2]    \n\t"
402     "lbux      %[temp1],   %[p0](%[VP8kclip1])        \n\t"
403     "lbux      %[temp2],   %[q0](%[VP8kclip1])        \n\t"
404     "sb        %[temp2],   0(%[p])                    \n\t"
405     "sb        %[temp1],   0(%[pTemp0])               \n\t"
406   "0:                                                 \n\t"
407     "subu      %[size],    %[size],       1           \n\t"
408     "bgtz      %[size],    2b                         \n\t"
409     " addu     %[p],       %[p],          %[vstride]  \n\t"
410   "3:                                                 \n\t"
411     ".set      pop                                    \n\t"
412     : [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
413       [p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
414       [step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
415       [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
416       [pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
417       [size]"+&r"(size)
418     : [vstride]"r"(vstride), [ithresh]"r"(ithresh),
419       [hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
420       [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
421     : "memory"
422   );
423 }
424 
// on macroblock edges
// Filters 16 consecutive bytes starting at 'p' with FilterLoop26; the filter
// taps are 'stride' bytes apart and successive positions are 1 byte apart.
static void VFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
430 
// Filters 16 positions spaced 'stride' bytes apart, starting at 'p', with
// FilterLoop26; the filter taps are adjacent bytes.
static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
435 
// 8-pixels wide variant, for chroma filtering
// Applies FilterLoop26 to 8 consecutive bytes of each of the two planes 'u'
// and 'v'; the filter taps are 'stride' bytes apart.
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
442 
// Applies FilterLoop26 to 8 positions spaced 'stride' bytes apart in each of
// the two planes 'u' and 'v'; the filter taps are adjacent bytes.
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
448 
// on three inner edges
// Runs FilterLoop24 three times, at p + 4 * stride, p + 8 * stride and
// p + 12 * stride, each over 16 consecutive bytes with taps 'stride' apart.
static void VFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
  }
}
458 
// Runs FilterLoop24 three times, at p + 4, p + 8 and p + 12, each over 16
// positions spaced 'stride' bytes apart with adjacent-byte taps.
static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
  }
}
467 
// Applies FilterLoop24 once per plane, at u + 4 * stride and v + 4 * stride,
// over 8 consecutive bytes with taps 'stride' apart.
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
473 
// Applies FilterLoop24 once per plane, at u + 4 and v + 4, over 8 positions
// spaced 'stride' bytes apart with adjacent-byte taps.
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
479 
480 //------------------------------------------------------------------------------
481 // Simple In-loop filtering (Paragraph 15.2)
482 
SimpleVFilter16(uint8_t * p,int stride,int thresh)483 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
484   int i;
485   const int thresh2 = 2 * thresh + 1;
486   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
487   uint8_t* p1 = p - stride;
488   __asm__ volatile (
489     ".set      push                                      \n\t"
490     ".set      noreorder                                 \n\t"
491     "li        %[i],        16                           \n\t"
492   "0:                                                    \n\t"
493     "negu      %[temp4],    %[stride]                    \n\t"
494     "sll       %[temp5],    %[temp4],       1            \n\t"
495     "lbu       %[temp2],    0(%[p])                      \n\t"
496     "lbux      %[temp3],    %[stride](%[p])              \n\t"
497     "lbux      %[temp1],    %[temp4](%[p])               \n\t"
498     "lbux      %[temp0],    %[temp5](%[p])               \n\t"
499     "subu      %[temp7],    %[temp1],       %[temp2]     \n\t"
500     "subu      %[temp6],    %[temp0],       %[temp3]     \n\t"
501     "absq_s.w  %[temp4],    %[temp7]                     \n\t"
502     "absq_s.w  %[temp5],    %[temp6]                     \n\t"
503     "sll       %[temp4],    %[temp4],       2            \n\t"
504     "subu      %[temp5],    %[temp5],       %[thresh2]   \n\t"
505     "addu      %[temp5],    %[temp4],       %[temp5]     \n\t"
506     "negu      %[temp8],    %[temp7]                     \n\t"
507     "bgtz      %[temp5],    1f                           \n\t"
508     " addiu    %[i],        %[i],           -1           \n\t"
509     "sll       %[temp4],    %[temp8],       1            \n\t"
510     "shll_s.w  %[temp5],    %[temp6],       24           \n\t"
511     "addu      %[temp3],    %[temp4],       %[temp8]     \n\t"
512     "sra       %[temp5],    %[temp5],       24           \n\t"
513     "addu      %[temp3],    %[temp3],       %[temp5]     \n\t"
514     "addiu     %[temp7],    %[temp3],       3            \n\t"
515     "sra       %[temp7],    %[temp7],       3            \n\t"
516     "shra_r.w  %[temp8],    %[temp3],       3            \n\t"
517     "shll_s.w  %[temp0],    %[temp7],       27           \n\t"
518     "shll_s.w  %[temp4],    %[temp8],       27           \n\t"
519     "sra       %[temp0],    %[temp0],       27           \n\t"
520     "sra       %[temp4],    %[temp4],       27           \n\t"
521     "addu      %[temp7],    %[temp1],       %[temp0]     \n\t"
522     "subu      %[temp2],    %[temp2],       %[temp4]     \n\t"
523     "lbux      %[temp3],    %[temp7](%[VP8kclip1])       \n\t"
524     "lbux      %[temp4],    %[temp2](%[VP8kclip1])       \n\t"
525     "sb        %[temp3],    0(%[p1])                     \n\t"
526     "sb        %[temp4],    0(%[p])                      \n\t"
527   "1:                                                    \n\t"
528     "addiu     %[p1],       %[p1],          1            \n\t"
529     "bgtz      %[i],        0b                           \n\t"
530     " addiu    %[p],        %[p],           1            \n\t"
531     " .set     pop                                       \n\t"
532     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
533       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
534       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
535       [p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
536     : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
537     : "memory"
538   );
539 }
540 
// Emits four 'lbu' instructions, each loading one byte from SRC into a named
// asm operand. The displacement of each load is assembled textually as
// "X+X1*" followed by the stringized value of BPS (via XSTR):
// TEMP0 = SRC[A + A1 * BPS]
// TEMP1 = SRC[B + B1 * BPS]
// TEMP2 = SRC[C + C1 * BPS]
// TEMP3 = SRC[D + D1 * BPS]
#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3,                               \
                     A, A1, B, B1, C, C1, D, D1, SRC)                          \
  "lbu      %[" #TEMP0 "],   " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
  "lbu      %[" #TEMP1 "],   " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
  "lbu      %[" #TEMP2 "],   " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \
  "lbu      %[" #TEMP3 "],   " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t"   \

551 
SimpleHFilter16(uint8_t * p,int stride,int thresh)552 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
553   int i;
554   const int thresh2 = 2 * thresh + 1;
555   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
556   __asm__ volatile (
557     ".set      push                                     \n\t"
558     ".set      noreorder                                \n\t"
559     "li        %[i],       16                           \n\t"
560   "0:                                                   \n\t"
561     LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
562     "subu      %[temp7],    %[temp1],       %[temp2]    \n\t"
563     "subu      %[temp6],    %[temp0],       %[temp3]    \n\t"
564     "absq_s.w  %[temp4],    %[temp7]                    \n\t"
565     "absq_s.w  %[temp5],    %[temp6]                    \n\t"
566     "sll       %[temp4],    %[temp4],       2           \n\t"
567     "addu      %[temp5],    %[temp4],       %[temp5]    \n\t"
568     "subu      %[temp5],    %[temp5],       %[thresh2]  \n\t"
569     "negu      %[temp8],    %[temp7]                    \n\t"
570     "bgtz      %[temp5],    1f                          \n\t"
571     " addiu    %[i],        %[i],           -1          \n\t"
572     "sll       %[temp4],    %[temp8],       1           \n\t"
573     "shll_s.w  %[temp5],    %[temp6],       24          \n\t"
574     "addu      %[temp3],    %[temp4],       %[temp8]    \n\t"
575     "sra       %[temp5],    %[temp5],       24          \n\t"
576     "addu      %[temp3],    %[temp3],       %[temp5]    \n\t"
577     "addiu     %[temp7],    %[temp3],       3           \n\t"
578     "sra       %[temp7],    %[temp7],       3           \n\t"
579     "shra_r.w  %[temp8],    %[temp3],       3           \n\t"
580     "shll_s.w  %[temp0],    %[temp7],       27          \n\t"
581     "shll_s.w  %[temp4],    %[temp8],       27          \n\t"
582     "sra       %[temp0],    %[temp0],       27          \n\t"
583     "sra       %[temp4],    %[temp4],       27          \n\t"
584     "addu      %[temp7],    %[temp1],       %[temp0]    \n\t"
585     "subu      %[temp2],    %[temp2],       %[temp4]    \n\t"
586     "lbux      %[temp3],    %[temp7](%[VP8kclip1])      \n\t"
587     "lbux      %[temp4],    %[temp2](%[VP8kclip1])      \n\t"
588     "sb        %[temp3],    -1(%[p])                    \n\t"
589     "sb        %[temp4],    0(%[p])                     \n\t"
590   "1:                                                   \n\t"
591     "bgtz      %[i],        0b                          \n\t"
592     " addu     %[p],        %[p],           %[stride]   \n\t"
593     ".set      pop                                      \n\t"
594     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
595       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
596       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
597       [p]"+&r"(p), [i]"=&r"(i)
598     : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
599     : "memory"
600   );
601 }
602 
// Applies SimpleVFilter16() three times, at p + 4 * stride, p + 8 * stride
// and p + 12 * stride, forwarding 'stride' and 'thresh' unchanged.
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;  // move down four rows before each call
    SimpleVFilter16(p, stride, thresh);
  }
}
610 
// Applies SimpleHFilter16() three times, at p + 4, p + 8 and p + 12,
// forwarding 'stride' and 'thresh' unchanged.
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;  // move right four columns before each call
    SimpleHFilter16(p, stride, thresh);
  }
}
618 
// Emits two "usw" (unaligned store word) instructions, four bytes each:
//   DST[A * BPS]     = TEMP0
//   DST[B + C * BPS] = TEMP1
// TEMP0, TEMP1 and DST are the symbolic names of operands of the enclosing
// __asm__ statement; the displacements are pasted into the instruction text
// and evaluated by the assembler.
#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST)                              \
  "usw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #DST "])         \n\t"     \
  "usw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #DST "])  \n\t"
624 
VE4(uint8_t * dst)625 static void VE4(uint8_t* dst) {    // vertical
626   const uint8_t* top = dst - BPS;
627   int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
628   __asm__ volatile (
629     "ulw             %[temp0],   -1(%[top])              \n\t"
630     "ulh             %[temp1],   3(%[top])               \n\t"
631     "preceu.ph.qbr   %[temp2],   %[temp0]                \n\t"
632     "preceu.ph.qbl   %[temp3],   %[temp0]                \n\t"
633     "preceu.ph.qbr   %[temp4],   %[temp1]                \n\t"
634     "packrl.ph       %[temp5],   %[temp3],    %[temp2]   \n\t"
635     "packrl.ph       %[temp6],   %[temp4],    %[temp3]   \n\t"
636     "shll.ph         %[temp5],   %[temp5],    1          \n\t"
637     "shll.ph         %[temp6],   %[temp6],    1          \n\t"
638     "addq.ph         %[temp2],   %[temp5],    %[temp2]   \n\t"
639     "addq.ph         %[temp6],   %[temp6],    %[temp4]   \n\t"
640     "addq.ph         %[temp2],   %[temp2],    %[temp3]   \n\t"
641     "addq.ph         %[temp6],   %[temp6],    %[temp3]   \n\t"
642     "shra_r.ph       %[temp2],   %[temp2],    2          \n\t"
643     "shra_r.ph       %[temp6],   %[temp6],    2          \n\t"
644     "precr.qb.ph     %[temp4],   %[temp6],    %[temp2]   \n\t"
645     STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
646     STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
647     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
648       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
649       [temp6]"=&r"(temp6)
650     : [top]"r"(top), [dst]"r"(dst)
651     : "memory"
652   );
653 }
654 
DC4(uint8_t * dst)655 static void DC4(uint8_t* dst) {   // DC
656   int temp0, temp1, temp2, temp3, temp4;
657   __asm__ volatile (
658     "ulw          %[temp0],   -1*" XSTR(BPS) "(%[dst]) \n\t"
659     LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
660     "ins          %[temp1],   %[temp2],    8,     8    \n\t"
661     "ins          %[temp1],   %[temp3],    16,    8    \n\t"
662     "ins          %[temp1],   %[temp4],    24,    8    \n\t"
663     "raddu.w.qb   %[temp0],   %[temp0]                 \n\t"
664     "raddu.w.qb   %[temp1],   %[temp1]                 \n\t"
665     "addu         %[temp0],   %[temp0],    %[temp1]    \n\t"
666     "shra_r.w     %[temp0],   %[temp0],    3           \n\t"
667     "replv.qb     %[temp0],   %[temp0]                 \n\t"
668     STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
669     STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
670     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
671       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
672     : [dst]"r"(dst)
673     : "memory"
674   );
675 }
676 
RD4(uint8_t * dst)677 static void RD4(uint8_t* dst) {   // Down-right
678   int temp0, temp1, temp2, temp3, temp4;
679   int temp5, temp6, temp7, temp8;
680   __asm__ volatile (
681     LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
682     "ulw            %[temp7],   -1-" XSTR(BPS) "(%[dst])       \n\t"
683     "ins            %[temp1],   %[temp0], 16, 16               \n\t"
684     "preceu.ph.qbr  %[temp5],   %[temp7]                       \n\t"
685     "ins            %[temp2],   %[temp1], 16, 16               \n\t"
686     "preceu.ph.qbl  %[temp4],   %[temp7]                       \n\t"
687     "ins            %[temp3],   %[temp2], 16, 16               \n\t"
688     "shll.ph        %[temp2],   %[temp2], 1                    \n\t"
689     "addq.ph        %[temp3],   %[temp3], %[temp1]             \n\t"
690     "packrl.ph      %[temp6],   %[temp5], %[temp1]             \n\t"
691     "addq.ph        %[temp3],   %[temp3], %[temp2]             \n\t"
692     "addq.ph        %[temp1],   %[temp1], %[temp5]             \n\t"
693     "shll.ph        %[temp6],   %[temp6], 1                    \n\t"
694     "addq.ph        %[temp1],   %[temp1], %[temp6]             \n\t"
695     "packrl.ph      %[temp0],   %[temp4], %[temp5]             \n\t"
696     "addq.ph        %[temp8],   %[temp5], %[temp4]             \n\t"
697     "shra_r.ph      %[temp3],   %[temp3], 2                    \n\t"
698     "shll.ph        %[temp0],   %[temp0], 1                    \n\t"
699     "shra_r.ph      %[temp1],   %[temp1], 2                    \n\t"
700     "addq.ph        %[temp8],   %[temp0], %[temp8]             \n\t"
701     "lbu            %[temp5],   3-" XSTR(BPS) "(%[dst])        \n\t"
702     "precrq.ph.w    %[temp7],   %[temp7], %[temp7]             \n\t"
703     "shra_r.ph      %[temp8],   %[temp8], 2                    \n\t"
704     "ins            %[temp7],   %[temp5], 0,  8                \n\t"
705     "precr.qb.ph    %[temp2],   %[temp1], %[temp3]             \n\t"
706     "raddu.w.qb     %[temp4],   %[temp7]                       \n\t"
707     "precr.qb.ph    %[temp6],   %[temp8], %[temp1]             \n\t"
708     "shra_r.w       %[temp4],   %[temp4], 2                    \n\t"
709     STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
710     "prepend        %[temp2],   %[temp8], 8                    \n\t"
711     "prepend        %[temp6],   %[temp4], 8                    \n\t"
712     STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
713     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
714       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
715       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
716     : [dst]"r"(dst)
717     : "memory"
718   );
719 }
720 
// Emits two "ulw" (unaligned load word) instructions, four bytes each:
//   TEMP0 = SRC[A * BPS]
//   TEMP1 = SRC[B + C * BPS]
// TEMP0, TEMP1 and SRC are the symbolic names of operands of the enclosing
// __asm__ statement; the displacements are pasted into the instruction text
// and evaluated by the assembler.
#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC)                               \
  "ulw    %[" #TEMP0 "],   " #A "*" XSTR(BPS) "(%[" #SRC "])         \n\t"     \
  "ulw    %[" #TEMP1 "],   " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "])  \n\t"
726 
LD4(uint8_t * dst)727 static void LD4(uint8_t* dst) {   // Down-Left
728   int temp0, temp1, temp2, temp3, temp4;
729   int temp5, temp6, temp7, temp8, temp9;
730   __asm__ volatile (
731     LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
732     "preceu.ph.qbl   %[temp2],    %[temp0]                     \n\t"
733     "preceu.ph.qbr   %[temp3],    %[temp0]                     \n\t"
734     "preceu.ph.qbr   %[temp4],    %[temp1]                     \n\t"
735     "preceu.ph.qbl   %[temp5],    %[temp1]                     \n\t"
736     "packrl.ph       %[temp6],    %[temp2],    %[temp3]        \n\t"
737     "packrl.ph       %[temp7],    %[temp4],    %[temp2]        \n\t"
738     "packrl.ph       %[temp8],    %[temp5],    %[temp4]        \n\t"
739     "shll.ph         %[temp6],    %[temp6],    1               \n\t"
740     "addq.ph         %[temp9],    %[temp2],    %[temp6]        \n\t"
741     "shll.ph         %[temp7],    %[temp7],    1               \n\t"
742     "addq.ph         %[temp9],    %[temp9],    %[temp3]        \n\t"
743     "shll.ph         %[temp8],    %[temp8],    1               \n\t"
744     "shra_r.ph       %[temp9],    %[temp9],    2               \n\t"
745     "addq.ph         %[temp3],    %[temp4],    %[temp7]        \n\t"
746     "addq.ph         %[temp0],    %[temp5],    %[temp8]        \n\t"
747     "addq.ph         %[temp3],    %[temp3],    %[temp2]        \n\t"
748     "addq.ph         %[temp0],    %[temp0],    %[temp4]        \n\t"
749     "shra_r.ph       %[temp3],    %[temp3],    2               \n\t"
750     "shra_r.ph       %[temp0],    %[temp0],    2               \n\t"
751     "srl             %[temp1],    %[temp1],    24              \n\t"
752     "sll             %[temp1],    %[temp1],    1               \n\t"
753     "raddu.w.qb      %[temp5],    %[temp5]                     \n\t"
754     "precr.qb.ph     %[temp9],    %[temp3],    %[temp9]        \n\t"
755     "precr.qb.ph     %[temp3],    %[temp0],    %[temp3]        \n\t"
756     "addu            %[temp1],    %[temp1],    %[temp5]        \n\t"
757     "shra_r.w        %[temp1],    %[temp1],    2               \n\t"
758     STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
759     "prepend         %[temp9],    %[temp0],    8               \n\t"
760     "prepend         %[temp3],    %[temp1],    8               \n\t"
761     STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
762     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
763       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
764       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
765       [temp9]"=&r"(temp9)
766     : [dst]"r"(dst)
767     : "memory"
768   );
769 }
770 
771 //------------------------------------------------------------------------------
772 // Chroma
773 
DC8uv(uint8_t * dst)774 static void DC8uv(uint8_t* dst) {     // DC
775   int temp0, temp1, temp2, temp3, temp4;
776   int temp5, temp6, temp7, temp8, temp9;
777   __asm__ volatile (
778     LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
779     LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
780     LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
781     "raddu.w.qb   %[temp0],   %[temp0]                   \n\t"
782     "raddu.w.qb   %[temp1],   %[temp1]                   \n\t"
783     "addu         %[temp2],   %[temp2],    %[temp3]      \n\t"
784     "addu         %[temp4],   %[temp4],    %[temp5]      \n\t"
785     "addu         %[temp6],   %[temp6],    %[temp7]      \n\t"
786     "addu         %[temp8],   %[temp8],    %[temp9]      \n\t"
787     "addu         %[temp0],   %[temp0],    %[temp1]      \n\t"
788     "addu         %[temp2],   %[temp2],    %[temp4]      \n\t"
789     "addu         %[temp6],   %[temp6],    %[temp8]      \n\t"
790     "addu         %[temp0],   %[temp0],    %[temp2]      \n\t"
791     "addu         %[temp0],   %[temp0],    %[temp6]      \n\t"
792     "shra_r.w     %[temp0],   %[temp0],    4             \n\t"
793     "replv.qb     %[temp0],   %[temp0]                   \n\t"
794     STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
795     STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
796     STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
797     STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
798     STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
799     STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
800     STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
801     STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
802     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
803       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
804       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
805       [temp9]"=&r"(temp9)
806     : [dst]"r"(dst)
807     : "memory"
808   );
809 }
810 
DC8uvNoLeft(uint8_t * dst)811 static void DC8uvNoLeft(uint8_t* dst) {   // DC with no left samples
812   int temp0, temp1;
813   __asm__ volatile (
814     LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
815     "raddu.w.qb   %[temp0],   %[temp0]                   \n\t"
816     "raddu.w.qb   %[temp1],   %[temp1]                   \n\t"
817     "addu         %[temp0],   %[temp0],    %[temp1]      \n\t"
818     "shra_r.w     %[temp0],   %[temp0],    3             \n\t"
819     "replv.qb     %[temp0],   %[temp0]                   \n\t"
820     STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
821     STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
822     STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
823     STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
824     STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
825     STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
826     STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
827     STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
828     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
829     : [dst]"r"(dst)
830     : "memory"
831   );
832 }
833 
DC8uvNoTop(uint8_t * dst)834 static void DC8uvNoTop(uint8_t* dst) {  // DC with no top samples
835   int temp0, temp1, temp2, temp3, temp4;
836   int temp5, temp6, temp7, temp8;
837   __asm__ volatile (
838     LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
839     LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
840     "addu         %[temp2],   %[temp2],    %[temp3]      \n\t"
841     "addu         %[temp4],   %[temp4],    %[temp5]      \n\t"
842     "addu         %[temp6],   %[temp6],    %[temp7]      \n\t"
843     "addu         %[temp8],   %[temp8],    %[temp1]      \n\t"
844     "addu         %[temp2],   %[temp2],    %[temp4]      \n\t"
845     "addu         %[temp6],   %[temp6],    %[temp8]      \n\t"
846     "addu         %[temp0],   %[temp6],    %[temp2]      \n\t"
847     "shra_r.w     %[temp0],   %[temp0],    3             \n\t"
848     "replv.qb     %[temp0],   %[temp0]                   \n\t"
849     STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
850     STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
851     STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
852     STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
853     STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
854     STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
855     STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
856     STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
857     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
858       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
859       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
860     : [dst]"r"(dst)
861     : "memory"
862   );
863 }
864 
865 #undef LOAD_8_BYTES
866 #undef STORE_8_BYTES
867 #undef LOAD_4_BYTES
868 
// Assembly fragment: adds the per-lane offset held in %[dst_1] to four
// (SIZE != 8) or eight (SIZE == 8) pixel bytes and saturates the results to
// unsigned bytes.
//   in : %[temp0] (and %[temp1] when SIZE == 8) = four packed bytes each;
//        %[dst_1] = the same signed 16-bit offset in both halfword lanes.
//   out: %[temp0] (and %[temp1]) = the four clipped bytes, repacked.
//   scratch: %[temp2] (and %[temp3]).
// Steps: zero-extend the bytes to 16-bit lanes, add the offset, shift left by
// 7 with signed saturation, then pack with "precrqu_s.qb.ph", which reduces
// each halfword to an unsigned byte with saturation. Together the last two
// steps clamp every sum to [0, 255].
// The ".if/.endif" lines are assembler directives: SIZE is pasted as text and
// the choice between the 4- and 8-byte variant is made at assembly time.
#define CLIPPING(SIZE)                                                         \
  "preceu.ph.qbl   %[temp2],   %[temp0]                  \n\t"                 \
  "preceu.ph.qbr   %[temp0],   %[temp0]                  \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "preceu.ph.qbl   %[temp3],   %[temp1]                  \n\t"                 \
  "preceu.ph.qbr   %[temp1],   %[temp1]                  \n\t"                 \
".endif                                                  \n\t"                 \
  "addu.ph         %[temp2],   %[temp2],   %[dst_1]      \n\t"                 \
  "addu.ph         %[temp0],   %[temp0],   %[dst_1]      \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "addu.ph         %[temp3],   %[temp3],   %[dst_1]      \n\t"                 \
  "addu.ph         %[temp1],   %[temp1],   %[dst_1]      \n\t"                 \
".endif                                                  \n\t"                 \
  "shll_s.ph       %[temp2],   %[temp2],   7             \n\t"                 \
  "shll_s.ph       %[temp0],   %[temp0],   7             \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "shll_s.ph       %[temp3],   %[temp3],   7             \n\t"                 \
  "shll_s.ph       %[temp1],   %[temp1],   7             \n\t"                 \
".endif                                                  \n\t"                 \
  "precrqu_s.qb.ph %[temp0],   %[temp2],   %[temp0]      \n\t"                 \
".if " #SIZE " == 8                                      \n\t"                 \
  "precrqu_s.qb.ph %[temp1],   %[temp3],   %[temp1]      \n\t"                 \
".endif                                                  \n\t"
892 
893 
// Writes one row of SIZE (4, 8 or 16) predicted pixels:
//   DST[x] = clamp(TOP[x] + DST[-1] - top[-1], 0, 255)   for x = 0..SIZE-1
// DST[-1] is the pixel immediately left of the row being written. The value
// top[-1] is not passed in: the macro expects a variable named 'top_1' in the
// expanding scope that holds it in both 16-bit halves (CLIP_TO_DST provides
// it). 'dst_1' is built the same way from DST[-1], turned into the per-row
// offset DST[-1] - top[-1] by "subu.ph", and then consumed by CLIPPING.
// The ".if/.else/.endif" lines are assembler directives: SIZE is pasted as
// text, so the row width is selected at assembly time. For SIZE == 16 the
// 8-byte sequence is simply run twice, on bytes 0..7 and 8..15.
#define CLIP_8B_TO_DST(DST, TOP, SIZE) do {                                    \
  int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1];                              \
  int temp0, temp1, temp2, temp3;                                              \
  __asm__ volatile (                                                           \
  ".if " #SIZE " < 8                                     \n\t"                 \
    "ulw             %[temp0],   0(%[top])               \n\t"                 \
    "subu.ph         %[dst_1],   %[dst_1],    %[top_1]   \n\t"                 \
    CLIPPING(4)                                                                \
    "usw             %[temp0],   0(%[dst])               \n\t"                 \
  ".else                                                 \n\t"                 \
    "ulw             %[temp0],   0(%[top])               \n\t"                 \
    "ulw             %[temp1],   4(%[top])               \n\t"                 \
    "subu.ph         %[dst_1],   %[dst_1],    %[top_1]   \n\t"                 \
    CLIPPING(8)                                                                \
    "usw             %[temp0],   0(%[dst])               \n\t"                 \
    "usw             %[temp1],   4(%[dst])               \n\t"                 \
  ".if " #SIZE " == 16                                   \n\t"                 \
    "ulw             %[temp0],   8(%[top])               \n\t"                 \
    "ulw             %[temp1],   12(%[top])              \n\t"                 \
    CLIPPING(8)                                                                \
    "usw             %[temp0],   8(%[dst])               \n\t"                 \
    "usw             %[temp1],   12(%[dst])              \n\t"                 \
  ".endif                                                \n\t"                 \
  ".endif                                                \n\t"                 \
    : [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),           \
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                                 \
    : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST))                      \
    : "memory"                                                                 \
  );                                                                           \
} while (0)
924 
// Fills a SIZE x SIZE block starting at DST, one row per iteration.
// 'top' is fixed to the row above the block (DST - BPS at entry) and 'top_1'
// holds top[-1] replicated in both 16-bit halves; CLIP_8B_TO_DST picks up
// 'top_1' by name. DST must be a modifiable pointer lvalue: it is advanced by
// BPS after every row, so it points SIZE rows further down on exit.
#define CLIP_TO_DST(DST, SIZE) do {                                            \
  int y;                                                                       \
  const uint8_t* top = (DST) - BPS;                                            \
  const int top_1 = ((int)top[-1] << 16) + top[-1];                            \
  for (y = 0; y < (SIZE); ++y) {                                               \
    CLIP_8B_TO_DST((DST), top, (SIZE));                                        \
    (DST) += BPS;                                                              \
  }                                                                            \
} while (0)
934 
// Defines 'static void TrueMotion<SIZE>(uint8_t* DST)', which fills the
// SIZE x SIZE block at DST through CLIP_TO_DST: every pixel becomes
// top[x] + left[y] - top[-1], clamped to [0, 255], where 'top' is the row
// above the block and 'left' the column to its left.
#define TRUE_MOTION(DST, SIZE)                                                 \
static void TrueMotion##SIZE(uint8_t* (DST)) {                                 \
  CLIP_TO_DST((DST), (SIZE));                                                  \
}

// Instantiates TrueMotion4(), TrueMotion8() and TrueMotion16().
TRUE_MOTION(dst, 4)
TRUE_MOTION(dst, 8)
TRUE_MOTION(dst, 16)
943 
944 #undef TRUE_MOTION
945 #undef CLIP_TO_DST
946 #undef CLIP_8B_TO_DST
947 #undef CLIPPING
948 
949 //------------------------------------------------------------------------------
950 // Entry point
951 
952 extern void VP8DspInitMIPSdspR2(void);
953 
VP8DspInitMIPSdspR2(void)954 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
955   VP8TransformDC = TransformDC;
956   VP8TransformAC3 = TransformAC3;
957   VP8Transform = TransformTwo;
958 
959   VP8VFilter16 = VFilter16;
960   VP8HFilter16 = HFilter16;
961   VP8VFilter8 = VFilter8;
962   VP8HFilter8 = HFilter8;
963   VP8VFilter16i = VFilter16i;
964   VP8HFilter16i = HFilter16i;
965   VP8VFilter8i = VFilter8i;
966   VP8HFilter8i = HFilter8i;
967   VP8SimpleVFilter16 = SimpleVFilter16;
968   VP8SimpleHFilter16 = SimpleHFilter16;
969   VP8SimpleVFilter16i = SimpleVFilter16i;
970   VP8SimpleHFilter16i = SimpleHFilter16i;
971 
972   VP8PredLuma4[0] = DC4;
973   VP8PredLuma4[1] = TrueMotion4;
974   VP8PredLuma4[2] = VE4;
975   VP8PredLuma4[4] = RD4;
976   VP8PredLuma4[6] = LD4;
977 
978   VP8PredChroma8[0] = DC8uv;
979   VP8PredChroma8[1] = TrueMotion8;
980   VP8PredChroma8[4] = DC8uvNoTop;
981   VP8PredChroma8[5] = DC8uvNoLeft;
982 
983   VP8PredLuma16[1] = TrueMotion16;
984 }
985 
986 #else  // !WEBP_USE_MIPS_DSP_R2
987 
988 WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)
989 
990 #endif  // WEBP_USE_MIPS_DSP_R2
991