xref: /aosp_15_r20/external/libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <stdlib.h>
12 
13 #include "./vpx_dsp_rtcd.h"
14 #include "vpx/vpx_integer.h"
15 #include "vpx_dsp/mips/common_dspr2.h"
16 #include "vpx_dsp/mips/loopfilter_filters_dspr2.h"
17 #include "vpx_dsp/mips/loopfilter_macros_dspr2.h"
18 #include "vpx_dsp/mips/loopfilter_masks_dspr2.h"
19 #include "vpx_mem/vpx_mem.h"
20 
21 #if HAVE_DSPR2
/* Apply the VP9 16-wide ("mb") loop filter across a horizontal edge.
 *
 * s      - pointer to the first pixel row BELOW the edge (row q0); the eight
 *          rows above (p7..p0) and below (q0..q7) are addressed via pitch.
 * pitch  - byte stride between successive rows.
 * blimit/limit/thresh - single-byte filter parameters, splatted to all four
 *          byte lanes of a 32-bit word so four pixels are filtered at once.
 * count  - number of 8-pixel groups; the loop runs 2*count times, handling
 *          4 pixels (one 32-bit word) per iteration.
 *
 * Per 4-pixel column group the code picks one of four paths from the
 * (mask, flat, flat2) decision words:
 *   f0      - 4-tap filter only (filter1_dspr2)
 *   f1      - 8-tap flat filter (mbfilter_dspr2)
 *   f2      - 15-tap wide filter (wide_mbfilter_dspr2)
 *   f0+f1(+f2) - mixed per-byte selection, stored byte-by-byte with `sb`.
 * The _l/_r variables hold the left/right 16-bit halves of each packed row
 * word (see PACK_LEFT_*/PACK_RIGHT_* in loopfilter_macros_dspr2.h).
 *
 * NOTE(review): the `lw`/`sw` accesses assume each row pointer is 4-byte
 * aligned - confirm this holds for all call sites. */
static void mb_lpf_horizontal_edge(unsigned char *s, int pitch,
                                   const uint8_t *blimit, const uint8_t *limit,
                                   const uint8_t *thresh, int count) {
  uint32_t mask;
  uint32_t hev, flat, flat2;
  uint8_t i;
  /* Row pointers: sp7 is the topmost row (p7), sq7 the bottommost (q7). */
  uint8_t *sp7, *sp6, *sp5, *sp4, *sp3, *sp2, *sp1, *sp0;
  uint8_t *sq0, *sq1, *sq2, *sq3, *sq4, *sq5, *sq6, *sq7;
  uint32_t thresh_vec, flimit_vec, limit_vec;
  uint32_t uflimit, ulimit, uthresh;
  /* One packed 32-bit word (4 pixels) per row. */
  uint32_t p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
  /* f0 (4-tap) filter outputs. */
  uint32_t p1_f0, p0_f0, q0_f0, q1_f0;
  /* Left/right 16-bit halves of each row word for the wider filters. */
  uint32_t p7_l, p6_l, p5_l, p4_l, p3_l, p2_l, p1_l, p0_l;
  uint32_t q0_l, q1_l, q2_l, q3_l, q4_l, q5_l, q6_l, q7_l;
  uint32_t p7_r, p6_r, p5_r, p4_r, p3_r, p2_r, p1_r, p0_r;
  uint32_t q0_r, q1_r, q2_r, q3_r, q4_r, q5_r, q6_r, q7_r;
  /* f1 (8-tap) outputs kept separately for the mixed f0+f1+f2 path. */
  uint32_t p2_l_f1, p1_l_f1, p0_l_f1, p2_r_f1, p1_r_f1, p0_r_f1;
  uint32_t q0_l_f1, q1_l_f1, q2_l_f1, q0_r_f1, q1_r_f1, q2_r_f1;

  uflimit = *blimit;
  ulimit = *limit;
  uthresh = *thresh;

  /* create quad-byte */
  __asm__ __volatile__(
      "replv.qb       %[thresh_vec],    %[uthresh]      \n\t"
      "replv.qb       %[flimit_vec],    %[uflimit]      \n\t"
      "replv.qb       %[limit_vec],     %[ulimit]       \n\t"

      : [thresh_vec] "=&r"(thresh_vec), [flimit_vec] "=&r"(flimit_vec),
        [limit_vec] "=r"(limit_vec)
      : [uthresh] "r"(uthresh), [uflimit] "r"(uflimit), [ulimit] "r"(ulimit));

  /* prefetch data for store */
  prefetch_store(s);

  /* Each iteration filters one 4-pixel-wide column group, then advances
     s by 4 bytes. */
  for (i = 0; i < (2 * count); i++) {
    /* s - 8*pitch is the top row p7; walk down one pitch per row. */
    sp7 = s - (pitch << 3);
    sp6 = sp7 + pitch;
    sp5 = sp6 + pitch;
    sp4 = sp5 + pitch;
    sp3 = sp4 + pitch;
    sp2 = sp3 + pitch;
    sp1 = sp2 + pitch;
    sp0 = sp1 + pitch;
    sq0 = s;
    sq1 = s + pitch;
    sq2 = sq1 + pitch;
    sq3 = sq2 + pitch;
    sq4 = sq3 + pitch;
    sq5 = sq4 + pitch;
    sq6 = sq5 + pitch;
    sq7 = sq6 + pitch;

    /* Load the 8 rows above the edge, 4 pixels per 32-bit load. */
    __asm__ __volatile__(
        "lw     %[p7],      (%[sp7])            \n\t"
        "lw     %[p6],      (%[sp6])            \n\t"
        "lw     %[p5],      (%[sp5])            \n\t"
        "lw     %[p4],      (%[sp4])            \n\t"
        "lw     %[p3],      (%[sp3])            \n\t"
        "lw     %[p2],      (%[sp2])            \n\t"
        "lw     %[p1],      (%[sp1])            \n\t"
        "lw     %[p0],      (%[sp0])            \n\t"

        : [p3] "=&r"(p3), [p2] "=&r"(p2), [p1] "=&r"(p1), [p0] "=&r"(p0),
          [p7] "=&r"(p7), [p6] "=&r"(p6), [p5] "=&r"(p5), [p4] "=&r"(p4)
        : [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0),
          [sp4] "r"(sp4), [sp5] "r"(sp5), [sp6] "r"(sp6), [sp7] "r"(sp7));

    /* Load the 8 rows below the edge. */
    __asm__ __volatile__(
        "lw     %[q0],      (%[sq0])            \n\t"
        "lw     %[q1],      (%[sq1])            \n\t"
        "lw     %[q2],      (%[sq2])            \n\t"
        "lw     %[q3],      (%[sq3])            \n\t"
        "lw     %[q4],      (%[sq4])            \n\t"
        "lw     %[q5],      (%[sq5])            \n\t"
        "lw     %[q6],      (%[sq6])            \n\t"
        "lw     %[q7],      (%[sq7])            \n\t"

        : [q3] "=&r"(q3), [q2] "=&r"(q2), [q1] "=&r"(q1), [q0] "=&r"(q0),
          [q7] "=&r"(q7), [q6] "=&r"(q6), [q5] "=&r"(q5), [q4] "=&r"(q4)
        : [sq3] "r"(sq3), [sq2] "r"(sq2), [sq1] "r"(sq1), [sq0] "r"(sq0),
          [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6), [sq7] "r"(sq7));

    /* Per-byte filter decisions: mask = filter at all, hev = high edge
       variance, flat = inner 8 taps are flat. */
    filter_hev_mask_flatmask4_dspr2(limit_vec, flimit_vec, thresh_vec, p1, p0,
                                    p3, p2, q0, q1, q2, q3, &hev, &mask, &flat);

    /* flat2 = outer taps (p7..p4, q4..q7) are also flat. */
    flatmask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, &flat2);

    /* f0 */
    /* flat == 0 for every byte: only the 4-tap filter applies, and it can
       be stored word-wide regardless of flat2. */
    if (((flat2 == 0) && (flat == 0) && (mask != 0)) ||
        ((flat2 != 0) && (flat == 0) && (mask != 0))) {
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      __asm__ __volatile__(
          "sw       %[p1_f0],   (%[sp1])            \n\t"
          "sw       %[p0_f0],   (%[sp0])            \n\t"
          "sw       %[q0_f0],   (%[sq0])            \n\t"
          "sw       %[q1_f0],   (%[sq1])            \n\t"

          :
          : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
            [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
            [sq1] "r"(sq1));
    } else if ((flat2 == 0XFFFFFFFF) && (flat == 0xFFFFFFFF) &&
               (mask == 0xFFFFFFFF)) {
      /* f2 */
      /* All four bytes fully flat: run the 15-tap wide filter on both
         16-bit halves and store whole words. */
      PACK_LEFT_0TO3()
      PACK_LEFT_4TO7()
      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
                          &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
                          &q6_l, &q7_l);

      PACK_RIGHT_0TO3()
      PACK_RIGHT_4TO7()
      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
                          &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
                          &q6_r, &q7_r);

      COMBINE_LEFT_RIGHT_0TO2()
      COMBINE_LEFT_RIGHT_3TO6()

      __asm__ __volatile__(
          "sw         %[p6], (%[sp6])    \n\t"
          "sw         %[p5], (%[sp5])    \n\t"
          "sw         %[p4], (%[sp4])    \n\t"
          "sw         %[p3], (%[sp3])    \n\t"
          "sw         %[p2], (%[sp2])    \n\t"
          "sw         %[p1], (%[sp1])    \n\t"
          "sw         %[p0], (%[sp0])    \n\t"

          :
          : [p6] "r"(p6), [p5] "r"(p5), [p4] "r"(p4), [p3] "r"(p3),
            [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [sp6] "r"(sp6),
            [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3), [sp2] "r"(sp2),
            [sp1] "r"(sp1), [sp0] "r"(sp0));

      __asm__ __volatile__(
          "sw         %[q6], (%[sq6])    \n\t"
          "sw         %[q5], (%[sq5])    \n\t"
          "sw         %[q4], (%[sq4])    \n\t"
          "sw         %[q3], (%[sq3])    \n\t"
          "sw         %[q2], (%[sq2])    \n\t"
          "sw         %[q1], (%[sq1])    \n\t"
          "sw         %[q0], (%[sq0])    \n\t"

          :
          : [q6] "r"(q6), [q5] "r"(q5), [q4] "r"(q4), [q3] "r"(q3),
            [q2] "r"(q2), [q1] "r"(q1), [q0] "r"(q0), [sq6] "r"(sq6),
            [sq5] "r"(sq5), [sq4] "r"(sq4), [sq3] "r"(sq3), [sq2] "r"(sq2),
            [sq1] "r"(sq1), [sq0] "r"(sq0));
    } else if ((flat2 == 0) && (flat == 0xFFFFFFFF) && (mask == 0xFFFFFFFF)) {
      /* f1 */
      /* All four bytes inner-flat only: 8-tap filter, word stores. */
      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);

      COMBINE_LEFT_RIGHT_0TO2()

      __asm__ __volatile__(
          "sw         %[p2], (%[sp2])    \n\t"
          "sw         %[p1], (%[sp1])    \n\t"
          "sw         %[p0], (%[sp0])    \n\t"
          "sw         %[q0], (%[sq0])    \n\t"
          "sw         %[q1], (%[sq1])    \n\t"
          "sw         %[q2], (%[sq2])    \n\t"

          :
          : [p2] "r"(p2), [p1] "r"(p1), [p0] "r"(p0), [q0] "r"(q0),
            [q1] "r"(q1), [q2] "r"(q2), [sp2] "r"(sp2), [sp1] "r"(sp1),
            [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2));
    } else if ((flat2 == 0) && (flat != 0) && (mask != 0)) {
      /* f0+f1 */
      /* Mixed per byte: choose the f1 result where flat is set, else the
         f0 result where only mask is set.  Bytes are stored one at a time
         (sb) at offsets 0..3; between stores the packed results are
         shifted right so the next byte sits in the low 8 bits. */
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter_dspr2(&p3_l, &p2_l, &p1_l, &p0_l, &q0_l, &q1_l, &q2_l, &q3_l);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter_dspr2(&p3_r, &p2_r, &p1_r, &p0_r, &q0_r, &q1_r, &q2_r, &q3_r);

      /* Byte 0 (lowest lane, right half low 16 bits). */
      if (mask & flat & 0x000000FF) {
        __asm__ __volatile__(
            "sb         %[p2_r],  (%[sp2])    \n\t"
            "sb         %[p1_r],  (%[sp1])    \n\t"
            "sb         %[p0_r],  (%[sp0])    \n\t"
            "sb         %[q0_r],  (%[sq0])    \n\t"
            "sb         %[q1_r],  (%[sq1])    \n\t"
            "sb         %[q2_r],  (%[sq2])    \n\t"

            :
            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
              [sq1] "r"(sq1), [sq2] "r"(sq2));
      } else if (mask & 0x000000FF) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  (%[sp1])    \n\t"
            "sb         %[p0_f0],  (%[sp0])    \n\t"
            "sb         %[q0_f0],  (%[sq0])    \n\t"
            "sb         %[q1_f0],  (%[sq1])    \n\t"

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      /* Advance: _r halves hold bytes in 16-bit lanes (shift 16), f0
         words hold bytes in 8-bit lanes (shift 8). */
      __asm__ __volatile__(
          "srl      %[p2_r],    %[p2_r],    16      \n\t"
          "srl      %[p1_r],    %[p1_r],    16      \n\t"
          "srl      %[p0_r],    %[p0_r],    16      \n\t"
          "srl      %[q0_r],    %[q0_r],    16      \n\t"
          "srl      %[q1_r],    %[q1_r],    16      \n\t"
          "srl      %[q2_r],    %[q2_r],    16      \n\t"
          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"

          : [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r), [p0_r] "+r"(p0_r),
            [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      /* Byte 1. */
      if (mask & flat & 0x0000FF00) {
        __asm__ __volatile__(
            "sb         %[p2_r],  +1(%[sp2])    \n\t"
            "sb         %[p1_r],  +1(%[sp1])    \n\t"
            "sb         %[p0_r],  +1(%[sp0])    \n\t"
            "sb         %[q0_r],  +1(%[sq0])    \n\t"
            "sb         %[q1_r],  +1(%[sq1])    \n\t"
            "sb         %[q2_r],  +1(%[sq2])    \n\t"

            :
            : [p2_r] "r"(p2_r), [p1_r] "r"(p1_r), [p0_r] "r"(p0_r),
              [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
              [sq1] "r"(sq1), [sq2] "r"(sq2));
      } else if (mask & 0x0000FF00) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  +1(%[sp1])    \n\t"
            "sb         %[p0_f0],  +1(%[sp0])    \n\t"
            "sb         %[q0_f0],  +1(%[sq0])    \n\t"
            "sb         %[q1_f0],  +1(%[sq1])    \n\t"

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      /* Bytes 2 and 3 come from the left halves; only the f0 words need
         shifting here. */
      __asm__ __volatile__(
          "srl      %[p1_f0],   %[p1_f0],   8     \n\t"
          "srl      %[p0_f0],   %[p0_f0],   8     \n\t"
          "srl      %[q0_f0],   %[q0_f0],   8     \n\t"
          "srl      %[q1_f0],   %[q1_f0],   8     \n\t"

          : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      /* Byte 2. */
      if (mask & flat & 0x00FF0000) {
        __asm__ __volatile__(
            "sb         %[p2_l],  +2(%[sp2])    \n\t"
            "sb         %[p1_l],  +2(%[sp1])    \n\t"
            "sb         %[p0_l],  +2(%[sp0])    \n\t"
            "sb         %[q0_l],  +2(%[sq0])    \n\t"
            "sb         %[q1_l],  +2(%[sq1])    \n\t"
            "sb         %[q2_l],  +2(%[sq2])    \n\t"

            :
            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
              [sq1] "r"(sq1), [sq2] "r"(sq2));
      } else if (mask & 0x00FF0000) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  +2(%[sp1])    \n\t"
            "sb         %[p0_f0],  +2(%[sp0])    \n\t"
            "sb         %[q0_f0],  +2(%[sq0])    \n\t"
            "sb         %[q1_f0],  +2(%[sq1])    \n\t"

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      __asm__ __volatile__(
          "srl      %[p2_l],    %[p2_l],    16      \n\t"
          "srl      %[p1_l],    %[p1_l],    16      \n\t"
          "srl      %[p0_l],    %[p0_l],    16      \n\t"
          "srl      %[q0_l],    %[q0_l],    16      \n\t"
          "srl      %[q1_l],    %[q1_l],    16      \n\t"
          "srl      %[q2_l],    %[q2_l],    16      \n\t"
          "srl      %[p1_f0],   %[p1_f0],   8       \n\t"
          "srl      %[p0_f0],   %[p0_f0],   8       \n\t"
          "srl      %[q0_f0],   %[q0_f0],   8       \n\t"
          "srl      %[q1_f0],   %[q1_f0],   8       \n\t"

          : [p2_l] "+r"(p2_l), [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l),
            [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      /* Byte 3 (highest lane). */
      if (mask & flat & 0xFF000000) {
        __asm__ __volatile__(
            "sb         %[p2_l],  +3(%[sp2])    \n\t"
            "sb         %[p1_l],  +3(%[sp1])    \n\t"
            "sb         %[p0_l],  +3(%[sp0])    \n\t"
            "sb         %[q0_l],  +3(%[sq0])    \n\t"
            "sb         %[q1_l],  +3(%[sq1])    \n\t"
            "sb         %[q2_l],  +3(%[sq2])    \n\t"

            :
            : [p2_l] "r"(p2_l), [p1_l] "r"(p1_l), [p0_l] "r"(p0_l),
              [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
              [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0),
              [sq1] "r"(sq1), [sq2] "r"(sq2));
      } else if (mask & 0xFF000000) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  +3(%[sp1])    \n\t"
            "sb         %[p0_f0],  +3(%[sp0])    \n\t"
            "sb         %[q0_f0],  +3(%[sq0])    \n\t"
            "sb         %[q1_f0],  +3(%[sq1])    \n\t"

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }
    } else if ((flat2 != 0) && (flat != 0) && (mask != 0)) {
      /* f0 + f1 + f2 */
      /* Fully mixed per byte: compute all three filter results, then pick
         wide (f2) where flat2&flat&mask, 8-tap (f1) where flat&mask, or
         4-tap (f0) where only mask - byte-by-byte, as above. */
      /* f0  function */
      filter1_dspr2(mask, hev, p1, p0, q0, q1, &p1_f0, &p0_f0, &q0_f0, &q1_f0);

      /* f1  function */
      /* left 2 element operation */
      PACK_LEFT_0TO3()
      mbfilter1_dspr2(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, &p2_l_f1,
                      &p1_l_f1, &p0_l_f1, &q0_l_f1, &q1_l_f1, &q2_l_f1);

      /* right 2 element operation */
      PACK_RIGHT_0TO3()
      mbfilter1_dspr2(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, &p2_r_f1,
                      &p1_r_f1, &p0_r_f1, &q0_r_f1, &q1_r_f1, &q2_r_f1);

      /* f2  function */
      PACK_LEFT_4TO7()
      wide_mbfilter_dspr2(&p7_l, &p6_l, &p5_l, &p4_l, &p3_l, &p2_l, &p1_l,
                          &p0_l, &q0_l, &q1_l, &q2_l, &q3_l, &q4_l, &q5_l,
                          &q6_l, &q7_l);

      PACK_RIGHT_4TO7()
      wide_mbfilter_dspr2(&p7_r, &p6_r, &p5_r, &p4_r, &p3_r, &p2_r, &p1_r,
                          &p0_r, &q0_r, &q1_r, &q2_r, &q3_r, &q4_r, &q5_r,
                          &q6_r, &q7_r);

      /* Byte 0. */
      if (mask & flat & flat2 & 0x000000FF) {
        __asm__ __volatile__(
            "sb         %[p6_r],  (%[sp6])    \n\t"
            "sb         %[p5_r],  (%[sp5])    \n\t"
            "sb         %[p4_r],  (%[sp4])    \n\t"
            "sb         %[p3_r],  (%[sp3])    \n\t"
            "sb         %[p2_r],  (%[sp2])    \n\t"
            "sb         %[p1_r],  (%[sp1])    \n\t"
            "sb         %[p0_r],  (%[sp0])    \n\t"

            :
            : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
              [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
              [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4), [sp3] "r"(sp3),
              [sp2] "r"(sp2), [sp1] "r"(sp1), [p0_r] "r"(p0_r), [sp0] "r"(sp0));

        __asm__ __volatile__(
            "sb         %[q0_r],  (%[sq0])    \n\t"
            "sb         %[q1_r],  (%[sq1])    \n\t"
            "sb         %[q2_r],  (%[sq2])    \n\t"
            "sb         %[q3_r],  (%[sq3])    \n\t"
            "sb         %[q4_r],  (%[sq4])    \n\t"
            "sb         %[q5_r],  (%[sq5])    \n\t"
            "sb         %[q6_r],  (%[sq6])    \n\t"

            :
            : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
              [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
              [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
              [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
      } else if (mask & flat & 0x000000FF) {
        __asm__ __volatile__(
            "sb         %[p2_r_f1],  (%[sp2])    \n\t"
            "sb         %[p1_r_f1],  (%[sp1])    \n\t"
            "sb         %[p0_r_f1],  (%[sp0])    \n\t"
            "sb         %[q0_r_f1],  (%[sq0])    \n\t"
            "sb         %[q1_r_f1],  (%[sq1])    \n\t"
            "sb         %[q2_r_f1],  (%[sq2])    \n\t"

            :
            : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
              [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
              [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
              [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
              [sq2] "r"(sq2));
      } else if (mask & 0x000000FF) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  (%[sp1])    \n\t"
            "sb         %[p0_f0],  (%[sp0])    \n\t"
            "sb         %[q0_f0],  (%[sq0])    \n\t"
            "sb         %[q1_f0],  (%[sq1])    \n\t"

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      /* Shift all right-half results to expose byte 1. */
      __asm__ __volatile__(
          "srl        %[p6_r], %[p6_r], 16     \n\t"
          "srl        %[p5_r], %[p5_r], 16     \n\t"
          "srl        %[p4_r], %[p4_r], 16     \n\t"
          "srl        %[p3_r], %[p3_r], 16     \n\t"
          "srl        %[p2_r], %[p2_r], 16     \n\t"
          "srl        %[p1_r], %[p1_r], 16     \n\t"
          "srl        %[p0_r], %[p0_r], 16     \n\t"
          "srl        %[q0_r], %[q0_r], 16     \n\t"
          "srl        %[q1_r], %[q1_r], 16     \n\t"
          "srl        %[q2_r], %[q2_r], 16     \n\t"
          "srl        %[q3_r], %[q3_r], 16     \n\t"
          "srl        %[q4_r], %[q4_r], 16     \n\t"
          "srl        %[q5_r], %[q5_r], 16     \n\t"
          "srl        %[q6_r], %[q6_r], 16     \n\t"

          : [q0_r] "+r"(q0_r), [q1_r] "+r"(q1_r), [q2_r] "+r"(q2_r),
            [q3_r] "+r"(q3_r), [q4_r] "+r"(q4_r), [q5_r] "+r"(q5_r),
            [p6_r] "+r"(p6_r), [p5_r] "+r"(p5_r), [p4_r] "+r"(p4_r),
            [p3_r] "+r"(p3_r), [p2_r] "+r"(p2_r), [p1_r] "+r"(p1_r),
            [q6_r] "+r"(q6_r), [p0_r] "+r"(p0_r)
          :);

      __asm__ __volatile__(
          "srl        %[p2_r_f1], %[p2_r_f1], 16     \n\t"
          "srl        %[p1_r_f1], %[p1_r_f1], 16     \n\t"
          "srl        %[p0_r_f1], %[p0_r_f1], 16     \n\t"
          "srl        %[q0_r_f1], %[q0_r_f1], 16     \n\t"
          "srl        %[q1_r_f1], %[q1_r_f1], 16     \n\t"
          "srl        %[q2_r_f1], %[q2_r_f1], 16     \n\t"
          "srl        %[p1_f0],   %[p1_f0],   8      \n\t"
          "srl        %[p0_f0],   %[p0_f0],   8      \n\t"
          "srl        %[q0_f0],   %[q0_f0],   8      \n\t"
          "srl        %[q1_f0],   %[q1_f0],   8      \n\t"

          : [p2_r_f1] "+r"(p2_r_f1), [p1_r_f1] "+r"(p1_r_f1),
            [p0_r_f1] "+r"(p0_r_f1), [q0_r_f1] "+r"(q0_r_f1),
            [q1_r_f1] "+r"(q1_r_f1), [q2_r_f1] "+r"(q2_r_f1),
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      /* Byte 1. */
      if (mask & flat & flat2 & 0x0000FF00) {
        __asm__ __volatile__(
            "sb         %[p6_r],  +1(%[sp6])    \n\t"
            "sb         %[p5_r],  +1(%[sp5])    \n\t"
            "sb         %[p4_r],  +1(%[sp4])    \n\t"
            "sb         %[p3_r],  +1(%[sp3])    \n\t"
            "sb         %[p2_r],  +1(%[sp2])    \n\t"
            "sb         %[p1_r],  +1(%[sp1])    \n\t"
            "sb         %[p0_r],  +1(%[sp0])    \n\t"

            :
            : [p6_r] "r"(p6_r), [p5_r] "r"(p5_r), [p4_r] "r"(p4_r),
              [p3_r] "r"(p3_r), [p2_r] "r"(p2_r), [p1_r] "r"(p1_r),
              [p0_r] "r"(p0_r), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
              [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));

        __asm__ __volatile__(
            "sb         %[q0_r],  +1(%[sq0])    \n\t"
            "sb         %[q1_r],  +1(%[sq1])    \n\t"
            "sb         %[q2_r],  +1(%[sq2])    \n\t"
            "sb         %[q3_r],  +1(%[sq3])    \n\t"
            "sb         %[q4_r],  +1(%[sq4])    \n\t"
            "sb         %[q5_r],  +1(%[sq5])    \n\t"
            "sb         %[q6_r],  +1(%[sq6])    \n\t"

            :
            : [q0_r] "r"(q0_r), [q1_r] "r"(q1_r), [q2_r] "r"(q2_r),
              [q3_r] "r"(q3_r), [q4_r] "r"(q4_r), [q5_r] "r"(q5_r),
              [q6_r] "r"(q6_r), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
              [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
      } else if (mask & flat & 0x0000FF00) {
        __asm__ __volatile__(
            "sb         %[p2_r_f1],  +1(%[sp2])    \n\t"
            "sb         %[p1_r_f1],  +1(%[sp1])    \n\t"
            "sb         %[p0_r_f1],  +1(%[sp0])    \n\t"
            "sb         %[q0_r_f1],  +1(%[sq0])    \n\t"
            "sb         %[q1_r_f1],  +1(%[sq1])    \n\t"
            "sb         %[q2_r_f1],  +1(%[sq2])    \n\t"

            :
            : [p2_r_f1] "r"(p2_r_f1), [p1_r_f1] "r"(p1_r_f1),
              [p0_r_f1] "r"(p0_r_f1), [q0_r_f1] "r"(q0_r_f1),
              [q1_r_f1] "r"(q1_r_f1), [q2_r_f1] "r"(q2_r_f1), [sp2] "r"(sp2),
              [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
              [sq2] "r"(sq2));
      } else if (mask & 0x0000FF00) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  +1(%[sp1])    \n\t"
            "sb         %[p0_f0],  +1(%[sp0])    \n\t"
            "sb         %[q0_f0],  +1(%[sq0])    \n\t"
            "sb         %[q1_f0],  +1(%[sq1])    \n\t"

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      /* Bytes 2 and 3 come from the left halves. */
      __asm__ __volatile__(
          "srl        %[p1_f0], %[p1_f0], 8     \n\t"
          "srl        %[p0_f0], %[p0_f0], 8     \n\t"
          "srl        %[q0_f0], %[q0_f0], 8     \n\t"
          "srl        %[q1_f0], %[q1_f0], 8     \n\t"

          : [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      /* Byte 2. */
      if (mask & flat & flat2 & 0x00FF0000) {
        __asm__ __volatile__(
            "sb         %[p6_l],  +2(%[sp6])    \n\t"
            "sb         %[p5_l],  +2(%[sp5])    \n\t"
            "sb         %[p4_l],  +2(%[sp4])    \n\t"
            "sb         %[p3_l],  +2(%[sp3])    \n\t"
            "sb         %[p2_l],  +2(%[sp2])    \n\t"
            "sb         %[p1_l],  +2(%[sp1])    \n\t"
            "sb         %[p0_l],  +2(%[sp0])    \n\t"

            :
            : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
              [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
              [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
              [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));

        __asm__ __volatile__(
            "sb         %[q0_l],  +2(%[sq0])    \n\t"
            "sb         %[q1_l],  +2(%[sq1])    \n\t"
            "sb         %[q2_l],  +2(%[sq2])    \n\t"
            "sb         %[q3_l],  +2(%[sq3])    \n\t"
            "sb         %[q4_l],  +2(%[sq4])    \n\t"
            "sb         %[q5_l],  +2(%[sq5])    \n\t"
            "sb         %[q6_l],  +2(%[sq6])    \n\t"

            :
            : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
              [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
              [q6_l] "r"(q6_l), [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2),
              [sq3] "r"(sq3), [sq4] "r"(sq4), [sq5] "r"(sq5), [sq6] "r"(sq6));
      } else if (mask & flat & 0x00FF0000) {
        __asm__ __volatile__(
            "sb         %[p2_l_f1],  +2(%[sp2])    \n\t"
            "sb         %[p1_l_f1],  +2(%[sp1])    \n\t"
            "sb         %[p0_l_f1],  +2(%[sp0])    \n\t"
            "sb         %[q0_l_f1],  +2(%[sq0])    \n\t"
            "sb         %[q1_l_f1],  +2(%[sq1])    \n\t"
            "sb         %[q2_l_f1],  +2(%[sq2])    \n\t"

            :
            : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
              [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
              [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
              [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
              [sq2] "r"(sq2));
      } else if (mask & 0x00FF0000) {
        __asm__ __volatile__(
            "sb         %[p1_f0],  +2(%[sp1])    \n\t"
            "sb         %[p0_f0],  +2(%[sp0])    \n\t"
            "sb         %[q0_f0],  +2(%[sq0])    \n\t"
            "sb         %[q1_f0],  +2(%[sq1])    \n\t"

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }

      /* Shift the left-half results to expose byte 3. */
      __asm__ __volatile__(
          "srl      %[p6_l],    %[p6_l],    16   \n\t"
          "srl      %[p5_l],    %[p5_l],    16   \n\t"
          "srl      %[p4_l],    %[p4_l],    16   \n\t"
          "srl      %[p3_l],    %[p3_l],    16   \n\t"
          "srl      %[p2_l],    %[p2_l],    16   \n\t"
          "srl      %[p1_l],    %[p1_l],    16   \n\t"
          "srl      %[p0_l],    %[p0_l],    16   \n\t"
          "srl      %[q0_l],    %[q0_l],    16   \n\t"
          "srl      %[q1_l],    %[q1_l],    16   \n\t"
          "srl      %[q2_l],    %[q2_l],    16   \n\t"
          "srl      %[q3_l],    %[q3_l],    16   \n\t"
          "srl      %[q4_l],    %[q4_l],    16   \n\t"
          "srl      %[q5_l],    %[q5_l],    16   \n\t"
          "srl      %[q6_l],    %[q6_l],    16   \n\t"

          : [q0_l] "+r"(q0_l), [q1_l] "+r"(q1_l), [q2_l] "+r"(q2_l),
            [q3_l] "+r"(q3_l), [q4_l] "+r"(q4_l), [q5_l] "+r"(q5_l),
            [q6_l] "+r"(q6_l), [p6_l] "+r"(p6_l), [p5_l] "+r"(p5_l),
            [p4_l] "+r"(p4_l), [p3_l] "+r"(p3_l), [p2_l] "+r"(p2_l),
            [p1_l] "+r"(p1_l), [p0_l] "+r"(p0_l)
          :);

      __asm__ __volatile__(
          "srl      %[p2_l_f1],   %[p2_l_f1],   16   \n\t"
          "srl      %[p1_l_f1],   %[p1_l_f1],   16   \n\t"
          "srl      %[p0_l_f1],   %[p0_l_f1],   16   \n\t"
          "srl      %[q0_l_f1],   %[q0_l_f1],   16   \n\t"
          "srl      %[q1_l_f1],   %[q1_l_f1],   16   \n\t"
          "srl      %[q2_l_f1],   %[q2_l_f1],   16   \n\t"
          "srl      %[p1_f0],     %[p1_f0],     8    \n\t"
          "srl      %[p0_f0],     %[p0_f0],     8    \n\t"
          "srl      %[q0_f0],     %[q0_f0],     8    \n\t"
          "srl      %[q1_f0],     %[q1_f0],     8    \n\t"

          : [p2_l_f1] "+r"(p2_l_f1), [p1_l_f1] "+r"(p1_l_f1),
            [p0_l_f1] "+r"(p0_l_f1), [q0_l_f1] "+r"(q0_l_f1),
            [q1_l_f1] "+r"(q1_l_f1), [q2_l_f1] "+r"(q2_l_f1),
            [p1_f0] "+r"(p1_f0), [p0_f0] "+r"(p0_f0), [q0_f0] "+r"(q0_f0),
            [q1_f0] "+r"(q1_f0)
          :);

      /* Byte 3. */
      if (mask & flat & flat2 & 0xFF000000) {
        __asm__ __volatile__(
            "sb     %[p6_l],    +3(%[sp6])    \n\t"
            "sb     %[p5_l],    +3(%[sp5])    \n\t"
            "sb     %[p4_l],    +3(%[sp4])    \n\t"
            "sb     %[p3_l],    +3(%[sp3])    \n\t"
            "sb     %[p2_l],    +3(%[sp2])    \n\t"
            "sb     %[p1_l],    +3(%[sp1])    \n\t"
            "sb     %[p0_l],    +3(%[sp0])    \n\t"

            :
            : [p6_l] "r"(p6_l), [p5_l] "r"(p5_l), [p4_l] "r"(p4_l),
              [p3_l] "r"(p3_l), [p2_l] "r"(p2_l), [p1_l] "r"(p1_l),
              [p0_l] "r"(p0_l), [sp6] "r"(sp6), [sp5] "r"(sp5), [sp4] "r"(sp4),
              [sp3] "r"(sp3), [sp2] "r"(sp2), [sp1] "r"(sp1), [sp0] "r"(sp0));

        __asm__ __volatile__(
            "sb     %[q0_l],    +3(%[sq0])    \n\t"
            "sb     %[q1_l],    +3(%[sq1])    \n\t"
            "sb     %[q2_l],    +3(%[sq2])    \n\t"
            "sb     %[q3_l],    +3(%[sq3])    \n\t"
            "sb     %[q4_l],    +3(%[sq4])    \n\t"
            "sb     %[q5_l],    +3(%[sq5])    \n\t"
            "sb     %[q6_l],    +3(%[sq6])    \n\t"

            :
            : [q0_l] "r"(q0_l), [q1_l] "r"(q1_l), [q2_l] "r"(q2_l),
              [q3_l] "r"(q3_l), [q4_l] "r"(q4_l), [q5_l] "r"(q5_l),
              [sq0] "r"(sq0), [sq1] "r"(sq1), [sq2] "r"(sq2), [sq3] "r"(sq3),
              [sq4] "r"(sq4), [sq5] "r"(sq5), [q6_l] "r"(q6_l), [sq6] "r"(sq6));
      } else if (mask & flat & 0xFF000000) {
        __asm__ __volatile__(
            "sb     %[p2_l_f1],     +3(%[sp2])    \n\t"
            "sb     %[p1_l_f1],     +3(%[sp1])    \n\t"
            "sb     %[p0_l_f1],     +3(%[sp0])    \n\t"
            "sb     %[q0_l_f1],     +3(%[sq0])    \n\t"
            "sb     %[q1_l_f1],     +3(%[sq1])    \n\t"
            "sb     %[q2_l_f1],     +3(%[sq2])    \n\t"

            :
            : [p2_l_f1] "r"(p2_l_f1), [p1_l_f1] "r"(p1_l_f1),
              [p0_l_f1] "r"(p0_l_f1), [q0_l_f1] "r"(q0_l_f1),
              [q1_l_f1] "r"(q1_l_f1), [q2_l_f1] "r"(q2_l_f1), [sp2] "r"(sp2),
              [sp1] "r"(sp1), [sp0] "r"(sp0), [sq0] "r"(sq0), [sq1] "r"(sq1),
              [sq2] "r"(sq2));
      } else if (mask & 0xFF000000) {
        __asm__ __volatile__(
            "sb     %[p1_f0],   +3(%[sp1])    \n\t"
            "sb     %[p0_f0],   +3(%[sp0])    \n\t"
            "sb     %[q0_f0],   +3(%[sq0])    \n\t"
            "sb     %[q1_f0],   +3(%[sq1])    \n\t"

            :
            : [p1_f0] "r"(p1_f0), [p0_f0] "r"(p0_f0), [q0_f0] "r"(q0_f0),
              [q1_f0] "r"(q1_f0), [sp1] "r"(sp1), [sp0] "r"(sp0),
              [sq0] "r"(sq0), [sq1] "r"(sq1));
      }
    }

    /* Advance to the next 4-pixel column group. */
    s = s + 4;
  }
}
719 
/* Public RTCD entry point: 16-wide horizontal loop filter over one
 * 8-pixel edge (count = 1 -> two 4-pixel groups). */
void vpx_lpf_horizontal_16_dspr2(unsigned char *s, int pitch,
                                 const uint8_t *blimit, const uint8_t *limit,
                                 const uint8_t *thresh) {
  mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 1);
}
725 
/* Public RTCD entry point: same filter applied across a 16-pixel edge
 * (count = 2 -> four 4-pixel groups), i.e. two adjacent 8-pixel edges. */
void vpx_lpf_horizontal_16_dual_dspr2(unsigned char *s, int pitch,
                                      const uint8_t *blimit,
                                      const uint8_t *limit,
                                      const uint8_t *thresh) {
  mb_lpf_horizontal_edge(s, pitch, blimit, limit, thresh, 2);
}
732 #endif  // #if HAVE_DSPR2
733