xref: /aosp_15_r20/external/webp/src/dsp/dec_mips32.c (revision b2055c353e87c8814eb2b6b1b11112a1562253bd)
1 // Copyright 2014 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // MIPS version of dsp functions
11 //
12 // Author(s):  Djordje Pesut    ([email protected])
13 //             Jovan Zelincevic ([email protected])
14 
15 #include "src/dsp/dsp.h"
16 
17 #if defined(WEBP_USE_MIPS32)
18 
19 #include "src/dsp/mips_macro.h"
20 
// Transform constants handed to the inline assembly in TransformOne() as the
// kC1 / kC2 register operands. Their values come from the
// WEBP_TRANSFORM_AC3_C1 / WEBP_TRANSFORM_AC3_C2 macros, which are not defined
// in this file.
21 static const int kC1 = WEBP_TRANSFORM_AC3_C1;
22 static const int kC2 = WEBP_TRANSFORM_AC3_C2;
23 
abs_mips32(int x)24 static WEBP_INLINE int abs_mips32(int x) {
25   const int sign = x >> 31;
26   return (x ^ sign) - sign;
27 }
28 
29 // 4 pixels in, 2 pixels out
do_filter2(uint8_t * p,int step)30 static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
31   const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
32   const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];
33   const int a1 = VP8ksclip2[(a + 4) >> 3];
34   const int a2 = VP8ksclip2[(a + 3) >> 3];
35   p[-step] = VP8kclip1[p0 + a2];
36   p[    0] = VP8kclip1[q0 - a1];
37 }
38 
39 // 4 pixels in, 4 pixels out
do_filter4(uint8_t * p,int step)40 static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
41   const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
42   const int a = 3 * (q0 - p0);
43   const int a1 = VP8ksclip2[(a + 4) >> 3];
44   const int a2 = VP8ksclip2[(a + 3) >> 3];
45   const int a3 = (a1 + 1) >> 1;
46   p[-2 * step] = VP8kclip1[p1 + a3];
47   p[-    step] = VP8kclip1[p0 + a2];
48   p[        0] = VP8kclip1[q0 - a1];
49   p[     step] = VP8kclip1[q1 - a3];
50 }
51 
52 // 6 pixels in, 6 pixels out
do_filter6(uint8_t * p,int step)53 static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
54   const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
55   const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
56   const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
57   // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
58   const int a1 = (27 * a + 63) >> 7;  // eq. to ((3 * a + 7) * 9) >> 7
59   const int a2 = (18 * a + 63) >> 7;  // eq. to ((2 * a + 7) * 9) >> 7
60   const int a3 = (9  * a + 63) >> 7;  // eq. to ((1 * a + 7) * 9) >> 7
61   p[-3 * step] = VP8kclip1[p2 + a3];
62   p[-2 * step] = VP8kclip1[p1 + a2];
63   p[-    step] = VP8kclip1[p0 + a1];
64   p[        0] = VP8kclip1[q0 - a1];
65   p[     step] = VP8kclip1[q1 - a2];
66   p[ 2 * step] = VP8kclip1[q2 - a3];
67 }
68 
hev(const uint8_t * p,int step,int thresh)69 static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
70   const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
71   return (abs_mips32(p1 - p0) > thresh) || (abs_mips32(q1 - q0) > thresh);
72 }
73 
needs_filter(const uint8_t * p,int step,int t)74 static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
75   const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
76   return ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) <= t);
77 }
78 
needs_filter2(const uint8_t * p,int step,int t,int it)79 static WEBP_INLINE int needs_filter2(const uint8_t* p,
80                                      int step, int t, int it) {
81   const int p3 = p[-4 * step], p2 = p[-3 * step];
82   const int p1 = p[-2 * step], p0 = p[-step];
83   const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
84   if ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) > t) {
85     return 0;
86   }
87   return abs_mips32(p3 - p2) <= it && abs_mips32(p2 - p1) <= it &&
88          abs_mips32(p1 - p0) <= it && abs_mips32(q3 - q2) <= it &&
89          abs_mips32(q2 - q1) <= it && abs_mips32(q1 - q0) <= it;
90 }
91 
FilterLoop26(uint8_t * p,int hstride,int vstride,int size,int thresh,int ithresh,int hev_thresh)92 static WEBP_INLINE void FilterLoop26(uint8_t* p,
93                                      int hstride, int vstride, int size,
94                                      int thresh, int ithresh, int hev_thresh) {
95   const int thresh2 = 2 * thresh + 1;
96   while (size-- > 0) {
97     if (needs_filter2(p, hstride, thresh2, ithresh)) {
98       if (hev(p, hstride, hev_thresh)) {
99         do_filter2(p, hstride);
100       } else {
101         do_filter6(p, hstride);
102       }
103     }
104     p += vstride;
105   }
106 }
107 
FilterLoop24(uint8_t * p,int hstride,int vstride,int size,int thresh,int ithresh,int hev_thresh)108 static WEBP_INLINE void FilterLoop24(uint8_t* p,
109                                      int hstride, int vstride, int size,
110                                      int thresh, int ithresh, int hev_thresh) {
111   const int thresh2 = 2 * thresh + 1;
112   while (size-- > 0) {
113     if (needs_filter2(p, hstride, thresh2, ithresh)) {
114       if (hev(p, hstride, hev_thresh)) {
115         do_filter2(p, hstride);
116       } else {
117         do_filter4(p, hstride);
118       }
119     }
120     p += vstride;
121   }
122 }
123 
124 // on macroblock edges
VFilter16(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)125 static void VFilter16(uint8_t* p, int stride,
126                       int thresh, int ithresh, int hev_thresh) {
127   FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
128 }
129 
// Macroblock-edge filter, 16 positions: the filter taps step by one byte
// and the loop advances by 'stride' per position.
static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  const int across = 1;
  const int along = stride;
  FilterLoop26(p, across, along, 16, thresh, ithresh, hev_thresh);
}
134 
135 // 8-pixels wide variant, for chroma filtering
VFilter8(uint8_t * u,uint8_t * v,int stride,int thresh,int ithresh,int hev_thresh)136 static void VFilter8(uint8_t* u, uint8_t* v, int stride,
137                      int thresh, int ithresh, int hev_thresh) {
138   FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
139   FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
140 }
141 
// 8-position variant for the two chroma planes: runs the same pass as
// HFilter16 (taps step by one byte) on 'u' and then on 'v'.
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  const int across = 1;
  const int along = stride;
  FilterLoop26(u, across, along, 8, thresh, ithresh, hev_thresh);
  FilterLoop26(v, across, along, 8, thresh, ithresh, hev_thresh);
}
147 
// Filters the edge located 4 * stride bytes into each chroma plane, using
// the FilterLoop24 pass with taps stepping by 'stride'.
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  const int edge_offset = 4 * stride;
  FilterLoop24(u + edge_offset, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + edge_offset, stride, 1, 8, thresh, ithresh, hev_thresh);
}
153 
// Filters the edge located 4 bytes into each chroma plane, using the
// FilterLoop24 pass with taps stepping by one byte.
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  const int edge_offset = 4;
  FilterLoop24(u + edge_offset, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v + edge_offset, 1, stride, 8, thresh, ithresh, hev_thresh);
}
159 
160 // on three inner edges
VFilter16i(uint8_t * p,int stride,int thresh,int ithresh,int hev_thresh)161 static void VFilter16i(uint8_t* p, int stride,
162                        int thresh, int ithresh, int hev_thresh) {
163   int k;
164   for (k = 3; k > 0; --k) {
165     p += 4 * stride;
166     FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
167   }
168 }
169 
// Filters the three inner edges located 4, 8 and 12 bytes to the right of
// 'p', 16 positions each, using the FilterLoop24 pass.
static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    FilterLoop24(p + 4 * edge, 1, stride, 16, thresh, ithresh, hev_thresh);
  }
}
178 
179 //------------------------------------------------------------------------------
180 // Simple In-loop filtering (Paragraph 15.2)
181 
// Simple filter over 16 consecutive bytes starting at 'p': each position
// that passes needs_filter() (limit 2 * thresh + 1) gets do_filter2() with
// taps stepping by 'stride'.
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
  const int edge_limit = 2 * thresh + 1;
  int col;
  for (col = 0; col < 16; ++col) {
    uint8_t* const px = p + col;
    if (!needs_filter(px, stride, edge_limit)) {
      continue;
    }
    do_filter2(px, stride);
  }
}
191 
// Simple filter over 16 positions spaced 'stride' bytes apart starting at
// 'p': each position that passes needs_filter() (limit 2 * thresh + 1) gets
// do_filter2() with taps stepping by one byte.
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
  const int edge_limit = 2 * thresh + 1;
  int row;
  for (row = 0; row < 16; ++row) {
    uint8_t* const px = p + row * stride;
    if (!needs_filter(px, 1, edge_limit)) {
      continue;
    }
    do_filter2(px, 1);
  }
}
201 
// Applies SimpleVFilter16() on the three inner edges located 4, 8 and 12
// strides below 'p'.
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    SimpleVFilter16(p + 4 * edge * stride, stride, thresh);
  }
}
209 
// Applies SimpleHFilter16() on the three inner edges located 4, 8 and 12
// bytes to the right of 'p'.
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  int edge;
  for (edge = 1; edge <= 3; ++edge) {
    SimpleHFilter16(p + 4 * edge, stride, thresh);
  }
}
217 
// Reads the 16 int16_t coefficients at 'in' (byte offsets 0..30), runs a
// two-pass add/subtract/multiply network on them (presumably the VP8 4x4
// inverse transform -- confirm against the C reference), shifts every result
// right by 3, adds it to the matching pixel of the 4x4 block at 'dst' (byte
// offsets x + y * BPS, x and y in 0..3) and stores the sum saturated to
// [0, 255].
//
// The asm only loads from 'in' (lh) and only loads/stores bytes at 'dst'
// (lbu/sb). MUL_SHIFT_C1 and MUL_SHIFT_C1_IO are asm-fragment macros that are
// not defined in this file (mips_macro.h is included above); %[kC1] does not
// appear in the visible asm text, so presumably those macros are what
// consume the kC1 operand.
TransformOne(const int16_t * in,uint8_t * dst)218 static void TransformOne(const int16_t* in, uint8_t* dst) {
219   int temp0, temp1, temp2, temp3, temp4;
220   int temp5, temp6, temp7, temp8, temp9;
221   int temp10, temp11, temp12, temp13, temp14;
222   int temp15, temp16, temp17, temp18, temp19;
  // NOTE(review): the cast drops the const qualifier of 'in'. p_in is only
  // passed to the asm as the read-only [in] input operand, so the cast looks
  // unnecessary -- confirm before removing.
223   int16_t* p_in = (int16_t*)in;
224 
225   // loops unrolled and merged to avoid usage of tmp buffer
226   // and to reduce number of stalls. MUL macro is written
227   // in assembler and inlined
228   __asm__ volatile(
    // Load coefficients in[0], in[8], in[4] and in[12] (byte offsets
    // 0/16/8/24) and start the first pass on them; the other twelve
    // coefficients are loaded further down, interleaved with the arithmetic.
229     "lh       %[temp0],  0(%[in])                      \n\t"
230     "lh       %[temp8],  16(%[in])                     \n\t"
231     "lh       %[temp4],  8(%[in])                      \n\t"
232     "lh       %[temp12], 24(%[in])                     \n\t"
233     "addu     %[temp16], %[temp0],  %[temp8]           \n\t"
234     "subu     %[temp0],  %[temp0],  %[temp8]           \n\t"
235     "mul      %[temp8],  %[temp4],  %[kC2]             \n\t"
236     MUL_SHIFT_C1(temp17, temp12)
237     MUL_SHIFT_C1_IO(temp4, temp19)
238     "mul      %[temp12], %[temp12], %[kC2]             \n\t"
239     "lh       %[temp1],  2(%[in])                      \n\t"
240     "lh       %[temp5],  10(%[in])                     \n\t"
241     "lh       %[temp9],  18(%[in])                     \n\t"
242     "lh       %[temp13], 26(%[in])                     \n\t"
243     "sra      %[temp8],  %[temp8],  16                 \n\t"
244     "sra      %[temp12], %[temp12], 16                 \n\t"
245     "lh       %[temp2],  4(%[in])                      \n\t"
246     "lh       %[temp6],  12(%[in])                     \n\t"
247     "lh       %[temp10], 20(%[in])                     \n\t"
248     "lh       %[temp14], 28(%[in])                     \n\t"
249     "subu     %[temp17], %[temp8],  %[temp17]          \n\t"
250     "addu     %[temp4],  %[temp4],  %[temp12]          \n\t"
251     "addu     %[temp8],  %[temp16], %[temp4]           \n\t"
252     "subu     %[temp4],  %[temp16], %[temp4]           \n\t"
253     "addu     %[temp16], %[temp1],  %[temp9]           \n\t"
254     "subu     %[temp1],  %[temp1],  %[temp9]           \n\t"
255     "lh       %[temp3],  6(%[in])                      \n\t"
256     "lh       %[temp7],  14(%[in])                     \n\t"
257     "lh       %[temp11], 22(%[in])                     \n\t"
258     "lh       %[temp15], 30(%[in])                     \n\t"
259     "addu     %[temp12], %[temp0],  %[temp17]          \n\t"
260     "subu     %[temp0],  %[temp0],  %[temp17]          \n\t"
261     "mul      %[temp9],  %[temp5],  %[kC2]             \n\t"
262     MUL_SHIFT_C1(temp17, temp13)
263     MUL_SHIFT_C1_IO(temp5, temp19)
264     "mul      %[temp13], %[temp13], %[kC2]             \n\t"
265     "sra      %[temp9],  %[temp9],  16                 \n\t"
266     "subu     %[temp17], %[temp9],  %[temp17]          \n\t"
267     "sra      %[temp13], %[temp13], 16                 \n\t"
268     "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
269     "addu     %[temp13], %[temp1],  %[temp17]          \n\t"
270     "subu     %[temp1],  %[temp1],  %[temp17]          \n\t"
271     MUL_SHIFT_C1(temp17, temp14)
272     "mul      %[temp14], %[temp14], %[kC2]             \n\t"
273     "addu     %[temp9],  %[temp16], %[temp5]           \n\t"
274     "subu     %[temp5],  %[temp16], %[temp5]           \n\t"
275     "addu     %[temp16], %[temp2],  %[temp10]          \n\t"
276     "subu     %[temp2],  %[temp2],  %[temp10]          \n\t"
277     "mul      %[temp10], %[temp6],  %[kC2]             \n\t"
278     MUL_SHIFT_C1_IO(temp6, temp19)
279     "sra      %[temp14], %[temp14], 16                 \n\t"
280     "sra      %[temp10], %[temp10], 16                 \n\t"
281     "subu     %[temp17], %[temp10], %[temp17]          \n\t"
282     "addu     %[temp6],  %[temp6],  %[temp14]          \n\t"
283     "addu     %[temp10], %[temp16], %[temp6]           \n\t"
284     "subu     %[temp6],  %[temp16], %[temp6]           \n\t"
285     "addu     %[temp14], %[temp2],  %[temp17]          \n\t"
286     "subu     %[temp2],  %[temp2],  %[temp17]          \n\t"
287     MUL_SHIFT_C1(temp17, temp15)
288     "mul      %[temp15], %[temp15], %[kC2]             \n\t"
289     "addu     %[temp16], %[temp3],  %[temp11]          \n\t"
290     "subu     %[temp3],  %[temp3],  %[temp11]          \n\t"
291     "mul      %[temp11], %[temp7],  %[kC2]             \n\t"
292     MUL_SHIFT_C1_IO(temp7, temp19)
    // Add 4 to temp8, temp12, temp0 and temp4. Every second-pass result is
    // later shifted right by 3, so this presumably supplies the rounding
    // term of (x + 4) >> 3 -- confirm against the C reference.
293     "addiu    %[temp8],  %[temp8],  4                  \n\t"
294     "addiu    %[temp12], %[temp12], 4                  \n\t"
295     "addiu    %[temp0],  %[temp0],  4                  \n\t"
296     "addiu    %[temp4],  %[temp4],  4                  \n\t"
297     "sra      %[temp15], %[temp15], 16                 \n\t"
298     "sra      %[temp11], %[temp11], 16                 \n\t"
299     "subu     %[temp17], %[temp11], %[temp17]          \n\t"
300     "addu     %[temp7],  %[temp7],  %[temp15]          \n\t"
301     "addu     %[temp15], %[temp3],  %[temp17]          \n\t"
302     "subu     %[temp3],  %[temp3],  %[temp17]          \n\t"
303     "addu     %[temp11], %[temp16], %[temp7]           \n\t"
304     "subu     %[temp7],  %[temp16], %[temp7]           \n\t"
305     "addu     %[temp16], %[temp8],  %[temp10]          \n\t"
306     "subu     %[temp8],  %[temp8],  %[temp10]          \n\t"
307     "mul      %[temp10], %[temp9],  %[kC2]             \n\t"
308     MUL_SHIFT_C1(temp17, temp11)
309     MUL_SHIFT_C1_IO(temp9, temp19)
310     "mul      %[temp11], %[temp11], %[kC2]             \n\t"
311     "sra      %[temp10], %[temp10], 16                 \n\t"
312     "sra      %[temp11], %[temp11], 16                 \n\t"
313     "subu     %[temp17], %[temp10], %[temp17]          \n\t"
314     "addu     %[temp11], %[temp9],  %[temp11]          \n\t"
315     "addu     %[temp10], %[temp12], %[temp14]          \n\t"
316     "subu     %[temp12], %[temp12], %[temp14]          \n\t"
317     "mul      %[temp14], %[temp13], %[kC2]             \n\t"
318     MUL_SHIFT_C1(temp9, temp15)
319     MUL_SHIFT_C1_IO(temp13, temp19)
320     "mul      %[temp15], %[temp15], %[kC2]             \n\t"
321     "sra      %[temp14], %[temp14], 16                 \n\t"
322     "sra      %[temp15], %[temp15], 16                 \n\t"
323     "subu     %[temp9],  %[temp14], %[temp9]           \n\t"
324     "addu     %[temp15], %[temp13], %[temp15]          \n\t"
325     "addu     %[temp14], %[temp0],  %[temp2]           \n\t"
326     "subu     %[temp0],  %[temp0],  %[temp2]           \n\t"
327     "mul      %[temp2],  %[temp1],  %[kC2]             \n\t"
328     MUL_SHIFT_C1(temp13, temp3)
329     MUL_SHIFT_C1_IO(temp1, temp19)
330     "mul      %[temp3],  %[temp3],  %[kC2]             \n\t"
331     "sra      %[temp2],  %[temp2],  16                 \n\t"
332     "sra      %[temp3],  %[temp3],  16                 \n\t"
333     "subu     %[temp13], %[temp2],  %[temp13]          \n\t"
334     "addu     %[temp3],  %[temp1],  %[temp3]           \n\t"
335     "addu     %[temp2],  %[temp4],  %[temp6]           \n\t"
336     "subu     %[temp4],  %[temp4],  %[temp6]           \n\t"
337     "mul      %[temp6],  %[temp5],  %[kC2]             \n\t"
338     MUL_SHIFT_C1(temp1, temp7)
339     MUL_SHIFT_C1_IO(temp5, temp19)
340     "mul      %[temp7],  %[temp7],  %[kC2]             \n\t"
341     "sra      %[temp6],  %[temp6],  16                 \n\t"
342     "sra      %[temp7],  %[temp7],  16                 \n\t"
343     "subu     %[temp1],  %[temp6],  %[temp1]           \n\t"
344     "addu     %[temp7],  %[temp5],  %[temp7]           \n\t"
    // Final combine and >> 3. temp5/temp11/temp8/temp16 are the values added
    // to row 0 of dst below, temp17/temp15/temp12/temp10 to row 1,
    // temp9/temp3/temp0/temp14 to row 2 and temp13/temp7/temp4/temp2 to
    // row 3.
345     "addu     %[temp5],  %[temp16], %[temp11]          \n\t"
346     "subu     %[temp16], %[temp16], %[temp11]          \n\t"
347     "addu     %[temp11], %[temp8],  %[temp17]          \n\t"
348     "subu     %[temp8],  %[temp8],  %[temp17]          \n\t"
349     "sra      %[temp5],  %[temp5],  3                  \n\t"
350     "sra      %[temp16], %[temp16], 3                  \n\t"
351     "sra      %[temp11], %[temp11], 3                  \n\t"
352     "sra      %[temp8],  %[temp8],  3                  \n\t"
353     "addu     %[temp17], %[temp10], %[temp15]          \n\t"
354     "subu     %[temp10], %[temp10], %[temp15]          \n\t"
355     "addu     %[temp15], %[temp12], %[temp9]           \n\t"
356     "subu     %[temp12], %[temp12], %[temp9]           \n\t"
357     "sra      %[temp17], %[temp17], 3                  \n\t"
358     "sra      %[temp10], %[temp10], 3                  \n\t"
359     "sra      %[temp15], %[temp15], 3                  \n\t"
360     "sra      %[temp12], %[temp12], 3                  \n\t"
361     "addu     %[temp9],  %[temp14], %[temp3]           \n\t"
362     "subu     %[temp14], %[temp14], %[temp3]           \n\t"
363     "addu     %[temp3],  %[temp0],  %[temp13]          \n\t"
364     "subu     %[temp0],  %[temp0],  %[temp13]          \n\t"
365     "sra      %[temp9],  %[temp9],  3                  \n\t"
366     "sra      %[temp14], %[temp14], 3                  \n\t"
367     "sra      %[temp3],  %[temp3],  3                  \n\t"
368     "sra      %[temp0],  %[temp0],  3                  \n\t"
369     "addu     %[temp13], %[temp2],  %[temp7]           \n\t"
370     "subu     %[temp2],  %[temp2],  %[temp7]           \n\t"
371     "addu     %[temp7],  %[temp4],  %[temp1]           \n\t"
372     "subu     %[temp4],  %[temp4],  %[temp1]           \n\t"
373     "sra      %[temp13], %[temp13], 3                  \n\t"
374     "sra      %[temp2],  %[temp2],  3                  \n\t"
375     "sra      %[temp7],  %[temp7],  3                  \n\t"
376     "sra      %[temp4],  %[temp4],  3                  \n\t"
    // Row 0 of dst. temp6 holds 255, the upper saturation value.
    // Clamp pattern used for every pixel from here on:
    //   sum >> 8 == 0  -> sum is already in [0, 255]; branch over the fix-up
    //   otherwise      -> zero the register (xor with itself), then movz
    //                     replaces it with 255 when sum >> 31 is zero, i.e.
    //                     when the sum was not negative.
    // NOTE(review): this relies on the xor not being executed when the
    // branch is taken (i.e. the assembler fills the branch delay slot
    // itself) -- confirm the assembler mode used for this file.
377     "addiu    %[temp6],  $zero,     255                \n\t"
378     "lbu      %[temp1],  0+0*" XSTR(BPS) "(%[dst])     \n\t"
379     "addu     %[temp1],  %[temp1],  %[temp5]           \n\t"
380     "sra      %[temp5],  %[temp1],  8                  \n\t"
381     "sra      %[temp18], %[temp1],  31                 \n\t"
382     "beqz     %[temp5],  1f                            \n\t"
383     "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
384     "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
385   "1:                                                  \n\t"
386     "lbu      %[temp18], 1+0*" XSTR(BPS) "(%[dst])     \n\t"
387     "sb       %[temp1],  0+0*" XSTR(BPS) "(%[dst])     \n\t"
388     "addu     %[temp18], %[temp18], %[temp11]          \n\t"
389     "sra      %[temp11], %[temp18], 8                  \n\t"
390     "sra      %[temp1],  %[temp18], 31                 \n\t"
391     "beqz     %[temp11], 2f                            \n\t"
392     "xor      %[temp18], %[temp18], %[temp18]          \n\t"
393     "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
394   "2:                                                  \n\t"
395     "lbu      %[temp1],  2+0*" XSTR(BPS) "(%[dst])     \n\t"
396     "sb       %[temp18], 1+0*" XSTR(BPS) "(%[dst])     \n\t"
397     "addu     %[temp1],  %[temp1],  %[temp8]           \n\t"
398     "sra      %[temp8],  %[temp1],  8                  \n\t"
399     "sra      %[temp18], %[temp1],  31                 \n\t"
400     "beqz     %[temp8],  3f                            \n\t"
401     "xor      %[temp1],  %[temp1],  %[temp1]           \n\t"
402     "movz     %[temp1],  %[temp6],  %[temp18]          \n\t"
403   "3:                                                  \n\t"
404     "lbu      %[temp18], 3+0*" XSTR(BPS) "(%[dst])     \n\t"
405     "sb       %[temp1],  2+0*" XSTR(BPS) "(%[dst])     \n\t"
406     "addu     %[temp18], %[temp18], %[temp16]          \n\t"
407     "sra      %[temp16], %[temp18], 8                  \n\t"
408     "sra      %[temp1],  %[temp18], 31                 \n\t"
409     "beqz     %[temp16], 4f                            \n\t"
410     "xor      %[temp18], %[temp18], %[temp18]          \n\t"
411     "movz     %[temp18], %[temp6],  %[temp1]           \n\t"
412   "4:                                                  \n\t"
413     "sb       %[temp18], 3+0*" XSTR(BPS) "(%[dst])     \n\t"
    // Row 1 of dst: load four pixels, add, clamp each, store.
414     "lbu      %[temp5],  0+1*" XSTR(BPS) "(%[dst])     \n\t"
415     "lbu      %[temp8],  1+1*" XSTR(BPS) "(%[dst])     \n\t"
416     "lbu      %[temp11], 2+1*" XSTR(BPS) "(%[dst])     \n\t"
417     "lbu      %[temp16], 3+1*" XSTR(BPS) "(%[dst])     \n\t"
418     "addu     %[temp5],  %[temp5],  %[temp17]          \n\t"
419     "addu     %[temp8],  %[temp8],  %[temp15]          \n\t"
420     "addu     %[temp11], %[temp11], %[temp12]          \n\t"
421     "addu     %[temp16], %[temp16], %[temp10]          \n\t"
422     "sra      %[temp18], %[temp5],  8                  \n\t"
423     "sra      %[temp1],  %[temp5],  31                 \n\t"
424     "beqz     %[temp18], 5f                            \n\t"
425     "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
426     "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
427   "5:                                                  \n\t"
428     "sra      %[temp18], %[temp8],  8                  \n\t"
429     "sra      %[temp1],  %[temp8],  31                 \n\t"
430     "beqz     %[temp18], 6f                            \n\t"
431     "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
432     "movz     %[temp8],  %[temp6],  %[temp1]           \n\t"
433   "6:                                                  \n\t"
434     "sra      %[temp18], %[temp11], 8                  \n\t"
435     "sra      %[temp1],  %[temp11], 31                 \n\t"
436     "sra      %[temp17], %[temp16], 8                  \n\t"
437     "sra      %[temp15], %[temp16], 31                 \n\t"
438     "beqz     %[temp18], 7f                            \n\t"
439     "xor      %[temp11], %[temp11], %[temp11]          \n\t"
440     "movz     %[temp11], %[temp6],  %[temp1]           \n\t"
441   "7:                                                  \n\t"
442     "beqz     %[temp17], 8f                            \n\t"
443     "xor      %[temp16], %[temp16], %[temp16]          \n\t"
444     "movz     %[temp16], %[temp6],  %[temp15]          \n\t"
445   "8:                                                  \n\t"
446     "sb       %[temp5],  0+1*" XSTR(BPS) "(%[dst])     \n\t"
447     "sb       %[temp8],  1+1*" XSTR(BPS) "(%[dst])     \n\t"
448     "sb       %[temp11], 2+1*" XSTR(BPS) "(%[dst])     \n\t"
449     "sb       %[temp16], 3+1*" XSTR(BPS) "(%[dst])     \n\t"
    // Row 2 of dst: same load / add / clamp / store sequence.
450     "lbu      %[temp5],  0+2*" XSTR(BPS) "(%[dst])     \n\t"
451     "lbu      %[temp8],  1+2*" XSTR(BPS) "(%[dst])     \n\t"
452     "lbu      %[temp11], 2+2*" XSTR(BPS) "(%[dst])     \n\t"
453     "lbu      %[temp16], 3+2*" XSTR(BPS) "(%[dst])     \n\t"
454     "addu     %[temp5],  %[temp5],  %[temp9]           \n\t"
455     "addu     %[temp8],  %[temp8],  %[temp3]           \n\t"
456     "addu     %[temp11], %[temp11], %[temp0]           \n\t"
457     "addu     %[temp16], %[temp16], %[temp14]          \n\t"
458     "sra      %[temp18], %[temp5],  8                  \n\t"
459     "sra      %[temp1],  %[temp5],  31                 \n\t"
460     "sra      %[temp17], %[temp8],  8                  \n\t"
461     "sra      %[temp15], %[temp8],  31                 \n\t"
462     "sra      %[temp12], %[temp11], 8                  \n\t"
463     "sra      %[temp10], %[temp11], 31                 \n\t"
464     "sra      %[temp9],  %[temp16], 8                  \n\t"
465     "sra      %[temp3],  %[temp16], 31                 \n\t"
466     "beqz     %[temp18], 9f                            \n\t"
467     "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
468     "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
469   "9:                                                  \n\t"
470     "beqz     %[temp17], 10f                           \n\t"
471     "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
472     "movz     %[temp8],  %[temp6],  %[temp15]          \n\t"
473   "10:                                                 \n\t"
474     "beqz     %[temp12], 11f                           \n\t"
475     "xor      %[temp11], %[temp11], %[temp11]          \n\t"
476     "movz     %[temp11], %[temp6],  %[temp10]          \n\t"
477   "11:                                                 \n\t"
478     "beqz     %[temp9],  12f                           \n\t"
479     "xor      %[temp16], %[temp16], %[temp16]          \n\t"
480     "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
481   "12:                                                 \n\t"
482     "sb       %[temp5],  0+2*" XSTR(BPS) "(%[dst])     \n\t"
483     "sb       %[temp8],  1+2*" XSTR(BPS) "(%[dst])     \n\t"
484     "sb       %[temp11], 2+2*" XSTR(BPS) "(%[dst])     \n\t"
485     "sb       %[temp16], 3+2*" XSTR(BPS) "(%[dst])     \n\t"
    // Row 3 of dst: same load / add / clamp / store sequence.
486     "lbu      %[temp5],  0+3*" XSTR(BPS) "(%[dst])     \n\t"
487     "lbu      %[temp8],  1+3*" XSTR(BPS) "(%[dst])     \n\t"
488     "lbu      %[temp11], 2+3*" XSTR(BPS) "(%[dst])     \n\t"
489     "lbu      %[temp16], 3+3*" XSTR(BPS) "(%[dst])     \n\t"
490     "addu     %[temp5],  %[temp5],  %[temp13]          \n\t"
491     "addu     %[temp8],  %[temp8],  %[temp7]           \n\t"
492     "addu     %[temp11], %[temp11], %[temp4]           \n\t"
493     "addu     %[temp16], %[temp16], %[temp2]           \n\t"
494     "sra      %[temp18], %[temp5],  8                  \n\t"
495     "sra      %[temp1],  %[temp5],  31                 \n\t"
496     "sra      %[temp17], %[temp8],  8                  \n\t"
497     "sra      %[temp15], %[temp8],  31                 \n\t"
498     "sra      %[temp12], %[temp11], 8                  \n\t"
499     "sra      %[temp10], %[temp11], 31                 \n\t"
500     "sra      %[temp9],  %[temp16], 8                  \n\t"
501     "sra      %[temp3],  %[temp16], 31                 \n\t"
502     "beqz     %[temp18], 13f                           \n\t"
503     "xor      %[temp5],  %[temp5],  %[temp5]           \n\t"
504     "movz     %[temp5],  %[temp6],  %[temp1]           \n\t"
505   "13:                                                 \n\t"
506     "beqz     %[temp17], 14f                           \n\t"
507     "xor      %[temp8],  %[temp8],  %[temp8]           \n\t"
508     "movz     %[temp8],  %[temp6],  %[temp15]          \n\t"
509   "14:                                                 \n\t"
510     "beqz     %[temp12], 15f                           \n\t"
511     "xor      %[temp11], %[temp11], %[temp11]          \n\t"
512     "movz     %[temp11], %[temp6],  %[temp10]          \n\t"
513   "15:                                                 \n\t"
514     "beqz     %[temp9],  16f                           \n\t"
515     "xor      %[temp16], %[temp16], %[temp16]          \n\t"
516     "movz     %[temp16], %[temp6],  %[temp3]           \n\t"
517   "16:                                                 \n\t"
518     "sb       %[temp5],  0+3*" XSTR(BPS) "(%[dst])     \n\t"
519     "sb       %[temp8],  1+3*" XSTR(BPS) "(%[dst])     \n\t"
520     "sb       %[temp11], 2+3*" XSTR(BPS) "(%[dst])     \n\t"
521     "sb       %[temp16], 3+3*" XSTR(BPS) "(%[dst])     \n\t"
522 
    // Operands: all twenty temporaries are early-clobber outputs ("=&r"), so
    // none of them may share a register with an input. Inputs are the
    // coefficient pointer, the two transform constants and the destination
    // pointer. The clobber list declares memory plus the hi/lo registers.
523     : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
524       [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
525       [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
526       [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
527       [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
528       [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
529       [temp18]"=&r"(temp18), [temp19]"=&r"(temp19)
530     : [in]"r"(p_in), [kC1]"r"(kC1), [kC2]"r"(kC2), [dst]"r"(dst)
531     : "memory", "hi", "lo"
532   );
533 }
534 
// Transforms the block at 'in' into 'dst'; when 'do_two' is non-zero, also
// transforms the next 16 coefficients into the block 4 bytes to the right.
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  TransformOne(in, dst);
  if (!do_two) {
    return;
  }
  TransformOne(in + 16, dst + 4);
}
541 
542 //------------------------------------------------------------------------------
543 // Entry point
544 
545 extern void VP8DspInitMIPS32(void);
546 
VP8DspInitMIPS32(void)547 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPS32(void) {
548   VP8InitClipTables();
549 
550   VP8Transform = TransformTwo;
551 
552   VP8VFilter16 = VFilter16;
553   VP8HFilter16 = HFilter16;
554   VP8VFilter8 = VFilter8;
555   VP8HFilter8 = HFilter8;
556   VP8VFilter16i = VFilter16i;
557   VP8HFilter16i = HFilter16i;
558   VP8VFilter8i = VFilter8i;
559   VP8HFilter8i = HFilter8i;
560 
561   VP8SimpleVFilter16 = SimpleVFilter16;
562   VP8SimpleHFilter16 = SimpleHFilter16;
563   VP8SimpleVFilter16i = SimpleVFilter16i;
564   VP8SimpleHFilter16i = SimpleHFilter16i;
565 }
566 
567 #else  // !WEBP_USE_MIPS32
568 
// Builds without WEBP_USE_MIPS32: the entry point is supplied by the
// WEBP_DSP_INIT_STUB macro, which is defined outside this file (presumably
// it expands to an empty VP8DspInitMIPS32() -- confirm in dsp.h).
569 WEBP_DSP_INIT_STUB(VP8DspInitMIPS32)
570 
571 #endif  // WEBP_USE_MIPS32
572