xref: /aosp_15_r20/external/libvpx/vpx_dsp/mips/convolve8_dspr2.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <stdio.h>
13 
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/mips/convolve_common_dspr2.h"
16 #include "vpx_dsp/vpx_dsp_common.h"
17 #include "vpx_dsp/vpx_filter.h"
18 #include "vpx_ports/mem.h"
19 
20 #if HAVE_DSPR2
/*
 * Horizontal 8-tap filter over a 4-pixel-wide strip, written out transposed.
 *
 * For each of the h source rows, three unaligned words are loaded from
 * src[0..11], four filtered pixels are produced, and they are stored down one
 * column of dst: dst[0], dst[dst_stride], dst[2 * dst_stride] and
 * dst[3 * dst_stride]. After each row src advances by src_stride and dst by a
 * single byte, so consecutive source rows land in consecutive output columns.
 *
 * src        - first byte read for the first row. The taps are applied
 *              starting at this address; no centering offset is applied here.
 * src_stride - byte distance between consecutive source rows.
 * dst        - top of the first output column.
 * dst_stride - byte distance between consecutive output rows.
 * filter_x0  - eight int16_t taps, read below as four 32-bit words.
 *              NOTE(review): the int32_t cast assumes the taps are 4-byte
 *              aligned - confirm against the filter tables.
 * h          - number of source rows to process.
 *
 * NOTE(review): "extp ..., 31" extracts at the position held in the
 * DSPControl pos field, which is not written anywhere in this function.
 * Presumably the caller programs it so that the extract also performs the
 * filter down-shift - confirm.
 */
convolve_horiz_4_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)21 static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
22                                               int32_t src_stride, uint8_t *dst,
23                                               int32_t dst_stride,
24                                               const int16_t *filter_x0,
25                                               int32_t h) {
26   int32_t y;
/* Byte lookup table indexed by each extracted filter sum (see the lbux loads
 * in the asm); it is declared outside the code visible here. */
27   uint8_t *cm = vpx_ff_cropTbl;
28   uint8_t *dst_ptr;
29   int32_t vector1b, vector2b, vector3b, vector4b;
30   int32_t Temp1, Temp2, Temp3, Temp4;
/* Loaded into LO of an accumulator (HI is zeroed) before each tap sum.
 * Presumably the rounding term for the final down-shift - confirm. */
31   uint32_t vector4a = 64;
32   uint32_t tp1, tp2;
33   uint32_t p1, p2, p3, p4;
34   uint32_t tn1, tn2;
35 
/* Fetch the eight 16-bit taps as four words, two taps per register, which is
 * the paired-halfword operand form dpa.w.ph consumes. */
36   vector1b = ((const int32_t *)filter_x0)[0];
37   vector2b = ((const int32_t *)filter_x0)[1];
38   vector3b = ((const int32_t *)filter_x0)[2];
39   vector4b = ((const int32_t *)filter_x0)[3];
40 
/* One iteration per source row: y starts at h and the test uses the value
 * before the decrement, so the body runs h times for h >= 0. */
41   for (y = h; y--;) {
/* dst_ptr walks down the current output column inside the asm; dst itself
 * only moves one byte to the right per row (bottom of the loop). */
42     dst_ptr = dst;
43     /* prefetch data to cache memory */
/* The addresses handed to prefetch_load are the start of the next source row
 * and 32 bytes past it. */
44     prefetch_load(src + src_stride);
45     prefetch_load(src + src_stride + 32);
46 
47     __asm__ __volatile__(
/* tp1 = word at src[0..3], tp2 = word at src[4..7]. */
48         "ulw              %[tp1],         0(%[src])                      \n\t"
49         "ulw              %[tp2],         4(%[src])                      \n\t"
50 
51         /* even 1. pixel */
/* ac3 = 64, then the bytes of tp1/tp2 are widened to halfword pairs p1..p4
 * and multiplied-accumulated against the four tap words. The third word
 * (src[8..11]) is loaded into tn2 in between. Result goes to Temp1. */
52         "mtlo             %[vector4a],    $ac3                           \n\t"
53         "mthi             $zero,          $ac3                           \n\t"
54         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
55         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
56         "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
57         "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
58         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
59         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
60         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
61         "ulw              %[tn2],         8(%[src])                      \n\t"
62         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
63         "extp             %[Temp1],       $ac3,           31             \n\t"
64 
65         /* even 2. pixel */
/* Same sum with the window moved on by one halfword pair: p2, p3, p4 and a
 * new p1 taken from tn2. Result goes to Temp3.
 *
 * The three balign instructions re-align the loaded words by one byte so
 * that tp2/tn2/tn1 can feed the odd pixels. tn1 has not been written before
 * the first balign: its old contents end up in the top byte of the result,
 * and only the two low bytes of tn1 are consumed later (preceu.ph.qbr).
 * NOTE(review): which source bytes end up where depends on byte order;
 * this looks written for a little-endian target - confirm. */
66         "mtlo             %[vector4a],    $ac2                           \n\t"
67         "mthi             $zero,          $ac2                           \n\t"
68         "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
69         "balign           %[tn1],         %[tn2],         3              \n\t"
70         "balign           %[tn2],         %[tp2],         3              \n\t"
71         "balign           %[tp2],         %[tp1],         3              \n\t"
72         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
73         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
74         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
75         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
76         "extp             %[Temp3],       $ac2,           31             \n\t"
77 
78         /* odd 1. pixel */
/* tp1 is free now, so it receives the table lookup for "even 1" (cm[Temp1]).
 * p1..p4 are rebuilt from the byte-shifted tp2/tn2. Result goes to Temp2. */
79         "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
80         "mtlo             %[vector4a],    $ac3                           \n\t"
81         "mthi             $zero,          $ac3                           \n\t"
82         "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
83         "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
84         "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
85         "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
86         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
87         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
88         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
89         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
90         "extp             %[Temp2],       $ac3,           31             \n\t"
91 
92         /* odd 2. pixel */
/* tp2 has been fully unpacked, so it receives the lookup for "even 2"
 * (cm[Temp3]). The window slides by one pair again: p2, p3, p4 and a new p1
 * from the low half of tn1. Result goes to Temp4. */
93         "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
94         "mtlo             %[vector4a],    $ac2                           \n\t"
95         "mthi             $zero,          $ac2                           \n\t"
96         "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
97         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
98         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
99         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
100         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
101         "extp             %[Temp4],       $ac2,           31             \n\t"
102 
103         /* clamp */
/* Table lookups for the two odd results: tn1 = cm[Temp2], p2 = cm[Temp4]. */
104         "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"
105         "lbux             %[p2],          %[Temp4](%[cm])                \n\t"
106 
107         /* store bytes */
/* Written top to bottom, one output row apart: even 1, odd 1, even 2, odd 2.
 * The final addu leaves dst_ptr one row past the last store; the C code
 * reloads dst_ptr at the top of the next iteration. */
108         "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
109         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
110 
111         "sb               %[tn1],         0(%[dst_ptr])                  \n\t"
112         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
113 
114         "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
115         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
116 
117         "sb               %[p2],          0(%[dst_ptr])                  \n\t"
118         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
119 
/* Scratch registers are early-clobber outputs ("=&r") so they cannot share a
 * register with an input; dst_ptr is read-write ("+r") because the asm
 * advances it.
 * NOTE(review): no clobber list is given - neither "memory" for the sb
 * stores nor the $ac2/$ac3 accumulators are declared. Confirm this is
 * intentional. */
120         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
121           [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
122           [p4] "=&r"(p4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
123           [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4), [dst_ptr] "+r"(dst_ptr)
124         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
125           [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
126           [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
127           [dst_stride] "r"(dst_stride));
128 
129     /* Next row... */
/* The next source row feeds the next output column (the transpose). */
130     src += src_stride;
131     dst += 1;
132   }
133 }
134 
/*
 * Horizontal 8-tap filter over an 8-pixel-wide strip, written out transposed.
 *
 * For each of the h source rows, words are loaded from offsets 0, 4, 8, 12
 * (even outputs) and 1, 5, 9, 13 (odd outputs), i.e. bytes src[0..16]. Eight
 * filtered pixels are produced and stored down one column of dst. The four
 * "even" results go to dst + {0, 2, 4, 6} * dst_stride through dst_ptr, and
 * the four "odd" results go to dst + {1, 3, 5, 7} * dst_stride through
 * odd_dst. After each row src advances by src_stride and dst by one byte.
 *
 * src        - first byte read for the first row. The taps are applied
 *              starting at this address; no centering offset is applied here.
 * src_stride - byte distance between consecutive source rows.
 * dst        - top of the first output column.
 * dst_stride - byte distance between consecutive output rows.
 * filter_x0  - eight int16_t taps, read below as four 32-bit words.
 *              NOTE(review): the int32_t cast assumes the taps are 4-byte
 *              aligned - confirm against the filter tables.
 * h          - number of source rows to process.
 *
 * NOTE(review): "extp ..., 31" extracts at the position held in the
 * DSPControl pos field, which is not written anywhere in this function.
 * Presumably the caller programs it - confirm.
 */
convolve_horiz_8_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)135 static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
136                                               int32_t src_stride, uint8_t *dst,
137                                               int32_t dst_stride,
138                                               const int16_t *filter_x0,
139                                               int32_t h) {
140   int32_t y;
/* Byte lookup table indexed by each extracted filter sum (see the lbux loads
 * in the asm); it is declared outside the code visible here. */
141   uint8_t *cm = vpx_ff_cropTbl;
142   uint8_t *dst_ptr;
/* Loaded into LO of an accumulator (HI is zeroed) before each tap sum.
 * Presumably the rounding term for the final down-shift - confirm. */
143   uint32_t vector4a = 64;
144   int32_t vector1b, vector2b, vector3b, vector4b;
145   int32_t Temp1, Temp2, Temp3;
146   uint32_t tp1, tp2, tp3;
147   uint32_t p1, p2, p3, p4, n1;
148   uint8_t *odd_dst;
/* Two output rows: the even and the odd results each step down the column
 * two rows at a time. */
149   uint32_t dst_pitch_2 = (dst_stride << 1);
150 
/* Fetch the eight 16-bit taps as four words, two taps per register, which is
 * the paired-halfword operand form dpa.w.ph consumes. */
151   vector1b = ((const int32_t *)filter_x0)[0];
152   vector2b = ((const int32_t *)filter_x0)[1];
153   vector3b = ((const int32_t *)filter_x0)[2];
154   vector4b = ((const int32_t *)filter_x0)[3];
155 
/* One iteration per source row: y starts at h and the test uses the value
 * before the decrement, so the body runs h times for h >= 0. */
156   for (y = h; y--;) {
157     /* prefetch data to cache memory */
/* The addresses handed to prefetch_load are the start of the next source row
 * and 32 bytes past it. */
158     prefetch_load(src + src_stride);
159     prefetch_load(src + src_stride + 32);
160 
/* Even results start at output row 0, odd results at output row 1. */
161     dst_ptr = dst;
162     odd_dst = (dst_ptr + dst_stride);
163 
164     __asm__ __volatile__(
/* tp2 = word at src[0..3], tp1 = word at src[4..7]. */
165         "ulw              %[tp2],         0(%[src])                       \n\t"
166         "ulw              %[tp1],         4(%[src])                       \n\t"
167 
168         /* even 1. pixel */
/* ac3 (this stage) and ac2 (next stage) are both primed with 64 here.
 * The loaded bytes are widened to halfword pairs p1..p4 and accumulated
 * against the four tap words. tp3 = word at src[8..11] is fetched in between.
 * Result goes to Temp1. */
169         "mtlo             %[vector4a],    $ac3                            \n\t"
170         "mthi             $zero,          $ac3                            \n\t"
171         "mtlo             %[vector4a],    $ac2                            \n\t"
172         "mthi             $zero,          $ac2                            \n\t"
173         "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
174         "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
175         "preceu.ph.qbr    %[p3],          %[tp1]                          \n\t"
176         "preceu.ph.qbl    %[p4],          %[tp1]                          \n\t"
177         "ulw              %[tp3],         8(%[src])                       \n\t"
178         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
179         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
180         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
181         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
182         "extp             %[Temp1],       $ac3,           31              \n\t"
183 
184         /* even 2. pixel */
/* Window slides by one halfword pair: p2, p3, p4 plus a new p1 from the low
 * half of tp3 (n1 keeps its high half for later). tp2 = word at
 * src[12..15] is fetched. Result goes to Temp3. */
185         "preceu.ph.qbr    %[p1],          %[tp3]                          \n\t"
186         "preceu.ph.qbl    %[n1],          %[tp3]                          \n\t"
187         "ulw              %[tp2],         12(%[src])                      \n\t"
188         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]     \n\t"
189         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]     \n\t"
190         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]     \n\t"
191         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]     \n\t"
192         "extp             %[Temp3],       $ac2,           31              \n\t"
193 
194         /* even 3. pixel */
/* Table lookups for even 1 (Temp2 = cm[Temp1]) and even 2 (tp3 = cm[Temp3])
 * are interleaved with this sum over p3, p4, p1, n1. The extracted result is
 * parked in p3, which has just been consumed by the first dpa of this stage
 * and is not rebuilt until "odd 1". */
195         "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
196         "mtlo             %[vector4a],    $ac1                            \n\t"
197         "mthi             $zero,          $ac1                            \n\t"
198         "preceu.ph.qbr    %[p2],          %[tp2]                          \n\t"
199         "dpa.w.ph         $ac1,           %[p3],          %[vector1b]     \n\t"
200         "dpa.w.ph         $ac1,           %[p4],          %[vector2b]     \n\t"
201         "dpa.w.ph         $ac1,           %[p1],          %[vector3b]     \n\t"
202         "lbux             %[tp3],         %[Temp3](%[cm])                 \n\t"
203         "dpa.w.ph         $ac1,           %[n1],          %[vector4b]     \n\t"
204         "extp             %[p3],          $ac1,           31              \n\t"
205 
206         /* even 4. pixel */
/* ac2 is re-primed for this stage and ac3 for "odd 1". Even 1 and even 2
 * are stored (output rows 0 and 2). The first two words of the odd window
 * are loaded from src+1 and src+5. The sum over p4, p1, n1, p2 goes to
 * Temp3, and the parked even 3 value in p3 is looked up into tp2. */
207         "mtlo             %[vector4a],    $ac2                            \n\t"
208         "mthi             $zero,          $ac2                            \n\t"
209         "mtlo             %[vector4a],    $ac3                            \n\t"
210         "mthi             $zero,          $ac3                            \n\t"
211         "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
212         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
213         "sb               %[tp3],         0(%[dst_ptr])                   \n\t"
214         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
215 
216         "ulw              %[tp1],         1(%[src])                       \n\t"
217         "ulw              %[tp3],         5(%[src])                       \n\t"
218 
219         "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
220         "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
221         "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
222         "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
223         "extp             %[Temp3],       $ac2,           31              \n\t"
224 
225         "lbux             %[tp2],         %[p3](%[cm])                    \n\t"
226 
227         /* odd 1. pixel */
/* ac1 is primed here for "odd 2"; this stage accumulates into ac3, which was
 * primed in the previous stage. p1..p4 are rebuilt from the src+1 / src+5
 * words. Even 3 is stored (output row 4) and tp2 is then reloaded with the
 * word at src+9. Result goes to Temp2. */
228         "mtlo             %[vector4a],    $ac1                            \n\t"
229         "mthi             $zero,          $ac1                            \n\t"
230         "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
231         "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
232         "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
233         "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
234         "sb               %[tp2],         0(%[dst_ptr])                   \n\t"
235         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
236         "ulw              %[tp2],         9(%[src])                       \n\t"
237 
238         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
239         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
240         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
241         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
242         "extp             %[Temp2],       $ac3,           31              \n\t"
243 
244         /* odd 2. pixel */
/* Even 4 is looked up (tp1 = cm[Temp3]) and stored at output row 6; the
 * addu after that store leaves dst_ptr past the last even row. ac3 and ac2
 * are re-primed for "odd 3" and "odd 4". Temp1 is reused as a plain load
 * target for the word at src+13. This stage accumulates p2, p3, p4, p1 into
 * ac1 and the result goes to Temp3. */
245         "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
246         "mtlo             %[vector4a],    $ac3                            \n\t"
247         "mthi             $zero,          $ac3                            \n\t"
248         "mtlo             %[vector4a],    $ac2                            \n\t"
249         "mthi             $zero,          $ac2                            \n\t"
250         "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
251         "preceu.ph.qbl    %[n1],          %[tp2]                          \n\t"
252         "ulw              %[Temp1],       13(%[src])                      \n\t"
253         "dpa.w.ph         $ac1,           %[p2],          %[vector1b]     \n\t"
254         "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
255         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
256         "dpa.w.ph         $ac1,           %[p3],          %[vector2b]     \n\t"
257         "dpa.w.ph         $ac1,           %[p4],          %[vector3b]     \n\t"
258         "dpa.w.ph         $ac1,           %[p1],          %[vector4b]     \n\t"
259         "extp             %[Temp3],       $ac1,           31              \n\t"
260 
261         /* odd 3. pixel */
/* Odd 1 is looked up first (tp3 = cm[Temp2]) because Temp2 is overwritten by
 * this stage's extract. The new p2 comes from the low half of the src+13
 * word held in Temp1. Sum over p3, p4, p1, n1 goes to Temp2. */
262         "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
263         "preceu.ph.qbr    %[p2],          %[Temp1]                        \n\t"
264         "dpa.w.ph         $ac3,           %[p3],          %[vector1b]     \n\t"
265         "dpa.w.ph         $ac3,           %[p4],          %[vector2b]     \n\t"
266         "dpa.w.ph         $ac3,           %[p1],          %[vector3b]     \n\t"
267         "dpa.w.ph         $ac3,           %[n1],          %[vector4b]     \n\t"
268         "extp             %[Temp2],       $ac3,           31              \n\t"
269 
270         /* odd 4. pixel */
/* Odd 1 is stored at output row 1. Sum over p4, p1, n1, p2 goes to Temp1,
 * which is free again now that p2 has been unpacked from it. */
271         "sb               %[tp3],         0(%[odd_dst])                   \n\t"
272         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
273         "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
274         "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
275         "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
276         "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
277         "extp             %[Temp1],       $ac2,           31              \n\t"
278 
279         /* clamp */
/* Table lookups for odd 2, odd 3 and odd 4 respectively. */
280         "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
281         "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
282         "lbux             %[n1],          %[Temp1](%[cm])                 \n\t"
283 
284         /* store bytes */
/* Remaining odd results go to output rows 3, 5 and 7. */
285         "sb               %[p4],          0(%[odd_dst])                   \n\t"
286         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
287 
288         "sb               %[p2],          0(%[odd_dst])                   \n\t"
289         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
290 
291         "sb               %[n1],          0(%[odd_dst])                   \n\t"
292 
/* Scratch registers are early-clobber outputs ("=&r") so they cannot share a
 * register with an input; dst_ptr and odd_dst are read-write ("+r") because
 * the asm advances them.
 * NOTE(review): no clobber list is given - neither "memory" for the sb
 * stores nor the $ac1/$ac2/$ac3 accumulators are declared. Confirm this is
 * intentional. */
293         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3), [p1] "=&r"(p1),
294           [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4), [n1] "=&r"(n1),
295           [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
296           [dst_ptr] "+r"(dst_ptr), [odd_dst] "+r"(odd_dst)
297         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
298           [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
299           [vector4a] "r"(vector4a), [cm] "r"(cm), [src] "r"(src),
300           [dst_pitch_2] "r"(dst_pitch_2));
301 
302     /* Next row... */
/* The next source row feeds the next output column (the transpose). */
303     src += src_stride;
304     dst += 1;
305   }
306 }
307 
convolve_horiz_16_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)308 static void convolve_horiz_16_transposed_dspr2(
309     const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
310     int32_t dst_stride, const int16_t *filter_x0, int32_t h, int32_t count) {
311   int32_t c, y;
312   const uint8_t *src;
313   uint8_t *dst;
314   uint8_t *cm = vpx_ff_cropTbl;
315   uint32_t vector_64 = 64;
316   int32_t filter12, filter34, filter56, filter78;
317   int32_t Temp1, Temp2, Temp3;
318   uint32_t qload1, qload2;
319   uint32_t p1, p2, p3, p4, p5;
320   uint32_t st1, st2, st3;
321   uint32_t dst_pitch_2 = (dst_stride << 1);
322   uint8_t *odd_dst;
323 
324   filter12 = ((const int32_t *)filter_x0)[0];
325   filter34 = ((const int32_t *)filter_x0)[1];
326   filter56 = ((const int32_t *)filter_x0)[2];
327   filter78 = ((const int32_t *)filter_x0)[3];
328 
329   for (y = h; y--;) {
330     /* prefetch data to cache memory */
331     prefetch_load(src_ptr + src_stride);
332     prefetch_load(src_ptr + src_stride + 32);
333 
334     src = src_ptr;
335     dst = dst_ptr;
336 
337     odd_dst = (dst + dst_stride);
338 
339     for (c = 0; c < count; c++) {
340       __asm__ __volatile__(
341           "ulw              %[qload1],        0(%[src])                       "
342           "\n\t"
343           "ulw              %[qload2],        4(%[src])                       "
344           "\n\t"
345 
346           /* even 1. pixel */
347           "mtlo             %[vector_64],     $ac1                            "
348           "\n\t" /* even 1 */
349           "mthi             $zero,            $ac1                            "
350           "\n\t"
351           "mtlo             %[vector_64],     $ac2                            "
352           "\n\t" /* even 2 */
353           "mthi             $zero,            $ac2                            "
354           "\n\t"
355           "preceu.ph.qbr    %[p3],            %[qload2]                       "
356           "\n\t"
357           "preceu.ph.qbl    %[p4],            %[qload2]                       "
358           "\n\t"
359           "preceu.ph.qbr    %[p1],            %[qload1]                       "
360           "\n\t"
361           "preceu.ph.qbl    %[p2],            %[qload1]                       "
362           "\n\t"
363           "ulw              %[qload2],        8(%[src])                       "
364           "\n\t"
365           "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
366           "\n\t" /* even 1 */
367           "dpa.w.ph         $ac1,             %[p2],          %[filter34]     "
368           "\n\t" /* even 1 */
369           "dpa.w.ph         $ac1,             %[p3],          %[filter56]     "
370           "\n\t" /* even 1 */
371           "dpa.w.ph         $ac1,             %[p4],          %[filter78]     "
372           "\n\t" /* even 1 */
373           "extp             %[Temp1],         $ac1,           31              "
374           "\n\t" /* even 1 */
375 
376           /* even 2. pixel */
377           "mtlo             %[vector_64],     $ac3                            "
378           "\n\t" /* even 3 */
379           "mthi             $zero,            $ac3                            "
380           "\n\t"
381           "preceu.ph.qbr    %[p1],            %[qload2]                       "
382           "\n\t"
383           "preceu.ph.qbl    %[p5],            %[qload2]                       "
384           "\n\t"
385           "ulw              %[qload1],        12(%[src])                      "
386           "\n\t"
387           "dpa.w.ph         $ac2,             %[p2],          %[filter12]     "
388           "\n\t" /* even 1 */
389           "dpa.w.ph         $ac2,             %[p3],          %[filter34]     "
390           "\n\t" /* even 1 */
391           "dpa.w.ph         $ac2,             %[p4],          %[filter56]     "
392           "\n\t" /* even 1 */
393           "dpa.w.ph         $ac2,             %[p1],          %[filter78]     "
394           "\n\t" /* even 1 */
395           "lbux             %[st1],           %[Temp1](%[cm])                 "
396           "\n\t" /* even 1 */
397           "extp             %[Temp2],         $ac2,           31              "
398           "\n\t" /* even 1 */
399 
400           /* even 3. pixel */
401           "mtlo             %[vector_64],     $ac1                            "
402           "\n\t" /* even 4 */
403           "mthi             $zero,            $ac1                            "
404           "\n\t"
405           "preceu.ph.qbr    %[p2],            %[qload1]                       "
406           "\n\t"
407           "sb               %[st1],           0(%[dst])                       "
408           "\n\t" /* even 1 */
409           "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
410           "          \n\t"
411           "dpa.w.ph         $ac3,             %[p3],          %[filter12]     "
412           "\n\t" /* even 3 */
413           "dpa.w.ph         $ac3,             %[p4],          %[filter34]     "
414           "\n\t" /* even 3 */
415           "dpa.w.ph         $ac3,             %[p1],          %[filter56]     "
416           "\n\t" /* even 3 */
417           "dpa.w.ph         $ac3,             %[p5],          %[filter78]     "
418           "\n\t" /* even 3 */
419           "extp             %[Temp3],         $ac3,           31              "
420           "\n\t" /* even 3 */
421           "lbux             %[st2],           %[Temp2](%[cm])                 "
422           "\n\t" /* even 1 */
423 
424           /* even 4. pixel */
425           "mtlo             %[vector_64],     $ac2                            "
426           "\n\t" /* even 5 */
427           "mthi             $zero,            $ac2                            "
428           "\n\t"
429           "preceu.ph.qbl    %[p3],            %[qload1]                       "
430           "\n\t"
431           "sb               %[st2],           0(%[dst])                       "
432           "\n\t" /* even 2 */
433           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
434           "\n\t"
435           "ulw              %[qload2],        16(%[src])                      "
436           "\n\t"
437           "dpa.w.ph         $ac1,             %[p4],          %[filter12]     "
438           "\n\t" /* even 4 */
439           "dpa.w.ph         $ac1,             %[p1],          %[filter34]     "
440           "\n\t" /* even 4 */
441           "dpa.w.ph         $ac1,             %[p5],          %[filter56]     "
442           "\n\t" /* even 4 */
443           "dpa.w.ph         $ac1,             %[p2],          %[filter78]     "
444           "\n\t" /* even 4 */
445           "extp             %[Temp1],         $ac1,           31              "
446           "\n\t" /* even 4 */
447           "lbux             %[st3],           %[Temp3](%[cm])                 "
448           "\n\t" /* even 3 */
449 
450           /* even 5. pixel */
451           "mtlo             %[vector_64],     $ac3                            "
452           "\n\t" /* even 6 */
453           "mthi             $zero,            $ac3                            "
454           "\n\t"
455           "preceu.ph.qbr    %[p4],            %[qload2]                       "
456           "\n\t"
457           "sb               %[st3],           0(%[dst])                       "
458           "\n\t" /* even 3 */
459           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
460           "\n\t"
461           "dpa.w.ph         $ac2,             %[p1],          %[filter12]     "
462           "\n\t" /* even 5 */
463           "dpa.w.ph         $ac2,             %[p5],          %[filter34]     "
464           "\n\t" /* even 5 */
465           "dpa.w.ph         $ac2,             %[p2],          %[filter56]     "
466           "\n\t" /* even 5 */
467           "dpa.w.ph         $ac2,             %[p3],          %[filter78]     "
468           "\n\t" /* even 5 */
469           "extp             %[Temp2],         $ac2,           31              "
470           "\n\t" /* even 5 */
471           "lbux             %[st1],           %[Temp1](%[cm])                 "
472           "\n\t" /* even 4 */
473 
474           /* even 6. pixel */
475           "mtlo             %[vector_64],     $ac1                            "
476           "\n\t" /* even 7 */
477           "mthi             $zero,            $ac1                            "
478           "\n\t"
479           "preceu.ph.qbl    %[p1],            %[qload2]                       "
480           "\n\t"
481           "sb               %[st1],           0(%[dst])                       "
482           "\n\t" /* even 4 */
483           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
484           "\n\t"
485           "ulw              %[qload1],        20(%[src])                      "
486           "\n\t"
487           "dpa.w.ph         $ac3,             %[p5],          %[filter12]     "
488           "\n\t" /* even 6 */
489           "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
490           "\n\t" /* even 6 */
491           "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
492           "\n\t" /* even 6 */
493           "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
494           "\n\t" /* even 6 */
495           "extp             %[Temp3],         $ac3,           31              "
496           "\n\t" /* even 6 */
497           "lbux             %[st2],           %[Temp2](%[cm])                 "
498           "\n\t" /* even 5 */
499 
500           /* even 7. pixel */
501           "mtlo             %[vector_64],     $ac2                            "
502           "\n\t" /* even 8 */
503           "mthi             $zero,            $ac2                            "
504           "\n\t"
505           "preceu.ph.qbr    %[p5],            %[qload1]                       "
506           "\n\t"
507           "sb               %[st2],           0(%[dst])                       "
508           "\n\t" /* even 5 */
509           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
510           "\n\t"
511           "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
512           "\n\t" /* even 7 */
513           "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
514           "\n\t" /* even 7 */
515           "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
516           "\n\t" /* even 7 */
517           "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
518           "\n\t" /* even 7 */
519           "extp             %[Temp1],         $ac1,           31              "
520           "\n\t" /* even 7 */
521           "lbux             %[st3],           %[Temp3](%[cm])                 "
522           "\n\t" /* even 6 */
523 
524           /* even 8. pixel */
525           "mtlo             %[vector_64],     $ac3                            "
526           "\n\t" /* odd 1 */
527           "mthi             $zero,            $ac3                            "
528           "\n\t"
529           "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
530           "\n\t" /* even 8 */
531           "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
532           "\n\t" /* even 8 */
533           "sb               %[st3],           0(%[dst])                       "
534           "\n\t" /* even 6 */
535           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
536           "\n\t"
537           "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
538           "\n\t" /* even 8 */
539           "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
540           "\n\t" /* even 8 */
541           "extp             %[Temp2],         $ac2,           31              "
542           "\n\t" /* even 8 */
543           "lbux             %[st1],           %[Temp1](%[cm])                 "
544           "\n\t" /* even 7 */
545 
546           /* ODD pixels */
547           "ulw              %[qload1],        1(%[src])                       "
548           "\n\t"
549           "ulw              %[qload2],        5(%[src])                       "
550           "\n\t"
551 
552           /* odd 1. pixel */
553           "mtlo             %[vector_64],     $ac1                            "
554           "\n\t" /* odd 2 */
555           "mthi             $zero,            $ac1                            "
556           "\n\t"
557           "preceu.ph.qbr    %[p1],            %[qload1]                       "
558           "\n\t"
559           "preceu.ph.qbl    %[p2],            %[qload1]                       "
560           "\n\t"
561           "preceu.ph.qbr    %[p3],            %[qload2]                       "
562           "\n\t"
563           "preceu.ph.qbl    %[p4],            %[qload2]                       "
564           "\n\t"
565           "sb               %[st1],           0(%[dst])                       "
566           "\n\t" /* even 7 */
567           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
568           "\n\t"
569           "ulw              %[qload2],        9(%[src])                       "
570           "\n\t"
571           "dpa.w.ph         $ac3,             %[p1],          %[filter12]     "
572           "\n\t" /* odd 1 */
573           "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
574           "\n\t" /* odd 1 */
575           "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
576           "\n\t" /* odd 1 */
577           "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
578           "\n\t" /* odd 1 */
579           "extp             %[Temp3],         $ac3,           31              "
580           "\n\t" /* odd 1 */
581           "lbux             %[st2],           %[Temp2](%[cm])                 "
582           "\n\t" /* even 8 */
583 
584           /* odd 2. pixel */
585           "mtlo             %[vector_64],     $ac2                            "
586           "\n\t" /* odd 3 */
587           "mthi             $zero,            $ac2                            "
588           "\n\t"
589           "preceu.ph.qbr    %[p1],            %[qload2]                       "
590           "\n\t"
591           "preceu.ph.qbl    %[p5],            %[qload2]                       "
592           "\n\t"
593           "sb               %[st2],           0(%[dst])                       "
594           "\n\t" /* even 8 */
595           "ulw              %[qload1],        13(%[src])                      "
596           "\n\t"
597           "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
598           "\n\t" /* odd 2 */
599           "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
600           "\n\t" /* odd 2 */
601           "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
602           "\n\t" /* odd 2 */
603           "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
604           "\n\t" /* odd 2 */
605           "extp             %[Temp1],         $ac1,           31              "
606           "\n\t" /* odd 2 */
607           "lbux             %[st3],           %[Temp3](%[cm])                 "
608           "\n\t" /* odd 1 */
609 
610           /* odd 3. pixel */
611           "mtlo             %[vector_64],     $ac3                            "
612           "\n\t" /* odd 4 */
613           "mthi             $zero,            $ac3                            "
614           "\n\t"
615           "preceu.ph.qbr    %[p2],            %[qload1]                       "
616           "\n\t"
617           "sb               %[st3],           0(%[odd_dst])                   "
618           "\n\t" /* odd 1 */
619           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
620           "\n\t"
621           "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
622           "\n\t" /* odd 3 */
623           "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
624           "\n\t" /* odd 3 */
625           "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
626           "\n\t" /* odd 3 */
627           "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
628           "\n\t" /* odd 3 */
629           "extp             %[Temp2],         $ac2,           31              "
630           "\n\t" /* odd 3 */
631           "lbux             %[st1],           %[Temp1](%[cm])                 "
632           "\n\t" /* odd 2 */
633 
634           /* odd 4. pixel */
635           "mtlo             %[vector_64],     $ac1                            "
636           "\n\t" /* odd 5 */
637           "mthi             $zero,            $ac1                            "
638           "\n\t"
639           "preceu.ph.qbl    %[p3],            %[qload1]                       "
640           "\n\t"
641           "sb               %[st1],           0(%[odd_dst])                   "
642           "\n\t" /* odd 2 */
643           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
644           "\n\t"
645           "ulw              %[qload2],        17(%[src])                      "
646           "\n\t"
647           "dpa.w.ph         $ac3,             %[p4],          %[filter12]     "
648           "\n\t" /* odd 4 */
649           "dpa.w.ph         $ac3,             %[p1],          %[filter34]     "
650           "\n\t" /* odd 4 */
651           "dpa.w.ph         $ac3,             %[p5],          %[filter56]     "
652           "\n\t" /* odd 4 */
653           "dpa.w.ph         $ac3,             %[p2],          %[filter78]     "
654           "\n\t" /* odd 4 */
655           "extp             %[Temp3],         $ac3,           31              "
656           "\n\t" /* odd 4 */
657           "lbux             %[st2],           %[Temp2](%[cm])                 "
658           "\n\t" /* odd 3 */
659 
660           /* odd 5. pixel */
661           "mtlo             %[vector_64],     $ac2                            "
662           "\n\t" /* odd 6 */
663           "mthi             $zero,            $ac2                            "
664           "\n\t"
665           "preceu.ph.qbr    %[p4],            %[qload2]                       "
666           "\n\t"
667           "sb               %[st2],           0(%[odd_dst])                   "
668           "\n\t" /* odd 3 */
669           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
670           "\n\t"
671           "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
672           "\n\t" /* odd 5 */
673           "dpa.w.ph         $ac1,             %[p5],          %[filter34]     "
674           "\n\t" /* odd 5 */
675           "dpa.w.ph         $ac1,             %[p2],          %[filter56]     "
676           "\n\t" /* odd 5 */
677           "dpa.w.ph         $ac1,             %[p3],          %[filter78]     "
678           "\n\t" /* odd 5 */
679           "extp             %[Temp1],         $ac1,           31              "
680           "\n\t" /* odd 5 */
681           "lbux             %[st3],           %[Temp3](%[cm])                 "
682           "\n\t" /* odd 4 */
683 
684           /* odd 6. pixel */
685           "mtlo             %[vector_64],     $ac3                            "
686           "\n\t" /* odd 7 */
687           "mthi             $zero,            $ac3                            "
688           "\n\t"
689           "preceu.ph.qbl    %[p1],            %[qload2]                       "
690           "\n\t"
691           "sb               %[st3],           0(%[odd_dst])                   "
692           "\n\t" /* odd 4 */
693           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
694           "\n\t"
695           "ulw              %[qload1],        21(%[src])                      "
696           "\n\t"
697           "dpa.w.ph         $ac2,             %[p5],          %[filter12]     "
698           "\n\t" /* odd 6 */
699           "dpa.w.ph         $ac2,             %[p2],          %[filter34]     "
700           "\n\t" /* odd 6 */
701           "dpa.w.ph         $ac2,             %[p3],          %[filter56]     "
702           "\n\t" /* odd 6 */
703           "dpa.w.ph         $ac2,             %[p4],          %[filter78]     "
704           "\n\t" /* odd 6 */
705           "extp             %[Temp2],         $ac2,           31              "
706           "\n\t" /* odd 6 */
707           "lbux             %[st1],           %[Temp1](%[cm])                 "
708           "\n\t" /* odd 5 */
709 
710           /* odd 7. pixel */
711           "mtlo             %[vector_64],     $ac1                            "
712           "\n\t" /* odd 8 */
713           "mthi             $zero,            $ac1                            "
714           "\n\t"
715           "preceu.ph.qbr    %[p5],            %[qload1]                       "
716           "\n\t"
717           "sb               %[st1],           0(%[odd_dst])                   "
718           "\n\t" /* odd 5 */
719           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
720           "\n\t"
721           "dpa.w.ph         $ac3,             %[p2],          %[filter12]     "
722           "\n\t" /* odd 7 */
723           "dpa.w.ph         $ac3,             %[p3],          %[filter34]     "
724           "\n\t" /* odd 7 */
725           "dpa.w.ph         $ac3,             %[p4],          %[filter56]     "
726           "\n\t" /* odd 7 */
727           "dpa.w.ph         $ac3,             %[p1],          %[filter78]     "
728           "\n\t" /* odd 7 */
729           "extp             %[Temp3],         $ac3,           31              "
730           "\n\t" /* odd 7 */
731 
732           /* odd 8. pixel */
733           "dpa.w.ph         $ac1,             %[p3],          %[filter12]     "
734           "\n\t" /* odd 8 */
735           "dpa.w.ph         $ac1,             %[p4],          %[filter34]     "
736           "\n\t" /* odd 8 */
737           "dpa.w.ph         $ac1,             %[p1],          %[filter56]     "
738           "\n\t" /* odd 8 */
739           "dpa.w.ph         $ac1,             %[p5],          %[filter78]     "
740           "\n\t" /* odd 8 */
741           "extp             %[Temp1],         $ac1,           31              "
742           "\n\t" /* odd 8 */
743 
744           "lbux             %[st2],           %[Temp2](%[cm])                 "
745           "\n\t" /* odd 6 */
746           "lbux             %[st3],           %[Temp3](%[cm])                 "
747           "\n\t" /* odd 7 */
748           "lbux             %[st1],           %[Temp1](%[cm])                 "
749           "\n\t" /* odd 8 */
750 
751           "sb               %[st2],           0(%[odd_dst])                   "
752           "\n\t" /* odd 6 */
753           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
754           "\n\t"
755 
756           "sb               %[st3],           0(%[odd_dst])                   "
757           "\n\t" /* odd 7 */
758           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
759           "\n\t"
760 
761           "sb               %[st1],           0(%[odd_dst])                   "
762           "\n\t" /* odd 8 */
763 
764           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
765             [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
766             [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
767             [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
768             [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
769           : [filter12] "r"(filter12), [filter34] "r"(filter34),
770             [filter56] "r"(filter56), [filter78] "r"(filter78),
771             [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
772             [dst_pitch_2] "r"(dst_pitch_2));
773 
774       src += 16;
775       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
776       odd_dst = (dst + dst_stride);
777     }
778 
779     /* Next row... */
780     src_ptr += src_stride;
781 
782     dst_ptr += 1;
783   }
784 }
785 
convolve_horiz_64_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)786 static void convolve_horiz_64_transposed_dspr2(
787     const uint8_t *src_ptr, int32_t src_stride, uint8_t *dst_ptr,
788     int32_t dst_stride, const int16_t *filter_x0, int32_t h) {
789   int32_t c, y;
790   const uint8_t *src;
791   uint8_t *dst;
792   uint8_t *cm = vpx_ff_cropTbl;
793   uint32_t vector_64 = 64;
794   int32_t filter12, filter34, filter56, filter78;
795   int32_t Temp1, Temp2, Temp3;
796   uint32_t qload1, qload2;
797   uint32_t p1, p2, p3, p4, p5;
798   uint32_t st1, st2, st3;
799   uint32_t dst_pitch_2 = (dst_stride << 1);
800   uint8_t *odd_dst;
801 
802   filter12 = ((const int32_t *)filter_x0)[0];
803   filter34 = ((const int32_t *)filter_x0)[1];
804   filter56 = ((const int32_t *)filter_x0)[2];
805   filter78 = ((const int32_t *)filter_x0)[3];
806 
807   for (y = h; y--;) {
808     /* prefetch data to cache memory */
809     prefetch_load(src_ptr + src_stride);
810     prefetch_load(src_ptr + src_stride + 32);
811     prefetch_load(src_ptr + src_stride + 64);
812 
813     src = src_ptr;
814     dst = dst_ptr;
815 
816     odd_dst = (dst + dst_stride);
817 
818     for (c = 0; c < 4; c++) {
819       __asm__ __volatile__(
820           "ulw              %[qload1],        0(%[src])                       "
821           "\n\t"
822           "ulw              %[qload2],        4(%[src])                       "
823           "\n\t"
824 
825           /* even 1. pixel */
826           "mtlo             %[vector_64],     $ac1                            "
827           "\n\t" /* even 1 */
828           "mthi             $zero,            $ac1                            "
829           "\n\t"
830           "mtlo             %[vector_64],     $ac2                            "
831           "\n\t" /* even 2 */
832           "mthi             $zero,            $ac2                            "
833           "\n\t"
834           "preceu.ph.qbr    %[p3],            %[qload2]                       "
835           "\n\t"
836           "preceu.ph.qbl    %[p4],            %[qload2]                       "
837           "\n\t"
838           "preceu.ph.qbr    %[p1],            %[qload1]                       "
839           "\n\t"
840           "preceu.ph.qbl    %[p2],            %[qload1]                       "
841           "\n\t"
842           "ulw              %[qload2],        8(%[src])                       "
843           "\n\t"
844           "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
845           "\n\t" /* even 1 */
846           "dpa.w.ph         $ac1,             %[p2],          %[filter34]     "
847           "\n\t" /* even 1 */
848           "dpa.w.ph         $ac1,             %[p3],          %[filter56]     "
849           "\n\t" /* even 1 */
850           "dpa.w.ph         $ac1,             %[p4],          %[filter78]     "
851           "\n\t" /* even 1 */
852           "extp             %[Temp1],         $ac1,           31              "
853           "\n\t" /* even 1 */
854 
855           /* even 2. pixel */
856           "mtlo             %[vector_64],     $ac3                            "
857           "\n\t" /* even 3 */
858           "mthi             $zero,            $ac3                            "
859           "\n\t"
860           "preceu.ph.qbr    %[p1],            %[qload2]                       "
861           "\n\t"
862           "preceu.ph.qbl    %[p5],            %[qload2]                       "
863           "\n\t"
864           "ulw              %[qload1],        12(%[src])                      "
865           "\n\t"
866           "dpa.w.ph         $ac2,             %[p2],          %[filter12]     "
867           "\n\t" /* even 2 */
868           "dpa.w.ph         $ac2,             %[p3],          %[filter34]     "
869           "\n\t" /* even 2 */
870           "dpa.w.ph         $ac2,             %[p4],          %[filter56]     "
871           "\n\t" /* even 2 */
872           "dpa.w.ph         $ac2,             %[p1],          %[filter78]     "
873           "\n\t" /* even 2 */
874           "lbux             %[st1],           %[Temp1](%[cm])                 "
875           "\n\t" /* even 1 */
876           "extp             %[Temp2],         $ac2,           31              "
877           "\n\t" /* even 2 */
878 
879           /* even 3. pixel */
880           "mtlo             %[vector_64],     $ac1                            "
881           "\n\t" /* even 4 */
882           "mthi             $zero,            $ac1                            "
883           "\n\t"
884           "preceu.ph.qbr    %[p2],            %[qload1]                       "
885           "\n\t"
886           "sb               %[st1],           0(%[dst])                       "
887           "\n\t" /* even 1 */
888           "addu             %[dst],           %[dst],         %[dst_pitch_2]   "
889           "          \n\t"
890           "dpa.w.ph         $ac3,             %[p3],          %[filter12]     "
891           "\n\t" /* even 3 */
892           "dpa.w.ph         $ac3,             %[p4],          %[filter34]     "
893           "\n\t" /* even 3 */
894           "dpa.w.ph         $ac3,             %[p1],          %[filter56]     "
895           "\n\t" /* even 3 */
896           "dpa.w.ph         $ac3,             %[p5],          %[filter78]     "
897           "\n\t" /* even 3 */
898           "extp             %[Temp3],         $ac3,           31              "
899           "\n\t" /* even 3 */
900           "lbux             %[st2],           %[Temp2](%[cm])                 "
901           "\n\t" /* even 2 */
902 
903           /* even 4. pixel */
904           "mtlo             %[vector_64],     $ac2                            "
905           "\n\t" /* even 5 */
906           "mthi             $zero,            $ac2                            "
907           "\n\t"
908           "preceu.ph.qbl    %[p3],            %[qload1]                       "
909           "\n\t"
910           "sb               %[st2],           0(%[dst])                       "
911           "\n\t" /* even 2 */
912           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
913           "\n\t"
914           "ulw              %[qload2],        16(%[src])                      "
915           "\n\t"
916           "dpa.w.ph         $ac1,             %[p4],          %[filter12]     "
917           "\n\t" /* even 4 */
918           "dpa.w.ph         $ac1,             %[p1],          %[filter34]     "
919           "\n\t" /* even 4 */
920           "dpa.w.ph         $ac1,             %[p5],          %[filter56]     "
921           "\n\t" /* even 4 */
922           "dpa.w.ph         $ac1,             %[p2],          %[filter78]     "
923           "\n\t" /* even 4 */
924           "extp             %[Temp1],         $ac1,           31              "
925           "\n\t" /* even 4 */
926           "lbux             %[st3],           %[Temp3](%[cm])                 "
927           "\n\t" /* even 3 */
928 
929           /* even 5. pixel */
930           "mtlo             %[vector_64],     $ac3                            "
931           "\n\t" /* even 6 */
932           "mthi             $zero,            $ac3                            "
933           "\n\t"
934           "preceu.ph.qbr    %[p4],            %[qload2]                       "
935           "\n\t"
936           "sb               %[st3],           0(%[dst])                       "
937           "\n\t" /* even 3 */
938           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
939           "\n\t"
940           "dpa.w.ph         $ac2,             %[p1],          %[filter12]     "
941           "\n\t" /* even 5 */
942           "dpa.w.ph         $ac2,             %[p5],          %[filter34]     "
943           "\n\t" /* even 5 */
944           "dpa.w.ph         $ac2,             %[p2],          %[filter56]     "
945           "\n\t" /* even 5 */
946           "dpa.w.ph         $ac2,             %[p3],          %[filter78]     "
947           "\n\t" /* even 5 */
948           "extp             %[Temp2],         $ac2,           31              "
949           "\n\t" /* even 5 */
950           "lbux             %[st1],           %[Temp1](%[cm])                 "
951           "\n\t" /* even 4 */
952 
953           /* even 6. pixel */
954           "mtlo             %[vector_64],     $ac1                            "
955           "\n\t" /* even 7 */
956           "mthi             $zero,            $ac1                            "
957           "\n\t"
958           "preceu.ph.qbl    %[p1],            %[qload2]                       "
959           "\n\t"
960           "sb               %[st1],           0(%[dst])                       "
961           "\n\t" /* even 4 */
962           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
963           "\n\t"
964           "ulw              %[qload1],        20(%[src])                      "
965           "\n\t"
966           "dpa.w.ph         $ac3,             %[p5],          %[filter12]     "
967           "\n\t" /* even 6 */
968           "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
969           "\n\t" /* even 6 */
970           "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
971           "\n\t" /* even 6 */
972           "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
973           "\n\t" /* even 6 */
974           "extp             %[Temp3],         $ac3,           31              "
975           "\n\t" /* even 6 */
976           "lbux             %[st2],           %[Temp2](%[cm])                 "
977           "\n\t" /* even 5 */
978 
979           /* even 7. pixel */
980           "mtlo             %[vector_64],     $ac2                            "
981           "\n\t" /* even 8 */
982           "mthi             $zero,            $ac2                            "
983           "\n\t"
984           "preceu.ph.qbr    %[p5],            %[qload1]                       "
985           "\n\t"
986           "sb               %[st2],           0(%[dst])                       "
987           "\n\t" /* even 5 */
988           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
989           "\n\t"
990           "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
991           "\n\t" /* even 7 */
992           "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
993           "\n\t" /* even 7 */
994           "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
995           "\n\t" /* even 7 */
996           "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
997           "\n\t" /* even 7 */
998           "extp             %[Temp1],         $ac1,           31              "
999           "\n\t" /* even 7 */
1000           "lbux             %[st3],           %[Temp3](%[cm])                 "
1001           "\n\t" /* even 6 */
1002 
1003           /* even 8. pixel */
1004           "mtlo             %[vector_64],     $ac3                            "
1005           "\n\t" /* odd 1 */
1006           "mthi             $zero,            $ac3                            "
1007           "\n\t"
1008           "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
1009           "\n\t" /* even 8 */
1010           "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
1011           "\n\t" /* even 8 */
1012           "sb               %[st3],           0(%[dst])                       "
1013           "\n\t" /* even 6 */
1014           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
1015           "\n\t"
1016           "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
1017           "\n\t" /* even 8 */
1018           "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
1019           "\n\t" /* even 8 */
1020           "extp             %[Temp2],         $ac2,           31              "
1021           "\n\t" /* even 8 */
1022           "lbux             %[st1],           %[Temp1](%[cm])                 "
1023           "\n\t" /* even 7 */
1024 
1025           /* ODD pixels */
1026           "ulw              %[qload1],        1(%[src])                       "
1027           "\n\t"
1028           "ulw              %[qload2],        5(%[src])                       "
1029           "\n\t"
1030 
1031           /* odd 1. pixel */
1032           "mtlo             %[vector_64],     $ac1                            "
1033           "\n\t" /* odd 2 */
1034           "mthi             $zero,            $ac1                            "
1035           "\n\t"
1036           "preceu.ph.qbr    %[p1],            %[qload1]                       "
1037           "\n\t"
1038           "preceu.ph.qbl    %[p2],            %[qload1]                       "
1039           "\n\t"
1040           "preceu.ph.qbr    %[p3],            %[qload2]                       "
1041           "\n\t"
1042           "preceu.ph.qbl    %[p4],            %[qload2]                       "
1043           "\n\t"
1044           "sb               %[st1],           0(%[dst])                       "
1045           "\n\t" /* even 7 */
1046           "addu             %[dst],           %[dst],         %[dst_pitch_2]  "
1047           "\n\t"
1048           "ulw              %[qload2],        9(%[src])                       "
1049           "\n\t"
1050           "dpa.w.ph         $ac3,             %[p1],          %[filter12]     "
1051           "\n\t" /* odd 1 */
1052           "dpa.w.ph         $ac3,             %[p2],          %[filter34]     "
1053           "\n\t" /* odd 1 */
1054           "dpa.w.ph         $ac3,             %[p3],          %[filter56]     "
1055           "\n\t" /* odd 1 */
1056           "dpa.w.ph         $ac3,             %[p4],          %[filter78]     "
1057           "\n\t" /* odd 1 */
1058           "extp             %[Temp3],         $ac3,           31              "
1059           "\n\t" /* odd 1 */
1060           "lbux             %[st2],           %[Temp2](%[cm])                 "
1061           "\n\t" /* even 8 */
1062 
1063           /* odd 2. pixel */
1064           "mtlo             %[vector_64],     $ac2                            "
1065           "\n\t" /* odd 3 */
1066           "mthi             $zero,            $ac2                            "
1067           "\n\t"
1068           "preceu.ph.qbr    %[p1],            %[qload2]                       "
1069           "\n\t"
1070           "preceu.ph.qbl    %[p5],            %[qload2]                       "
1071           "\n\t"
1072           "sb               %[st2],           0(%[dst])                       "
1073           "\n\t" /* even 8 */
1074           "ulw              %[qload1],        13(%[src])                      "
1075           "\n\t"
1076           "dpa.w.ph         $ac1,             %[p2],          %[filter12]     "
1077           "\n\t" /* odd 2 */
1078           "dpa.w.ph         $ac1,             %[p3],          %[filter34]     "
1079           "\n\t" /* odd 2 */
1080           "dpa.w.ph         $ac1,             %[p4],          %[filter56]     "
1081           "\n\t" /* odd 2 */
1082           "dpa.w.ph         $ac1,             %[p1],          %[filter78]     "
1083           "\n\t" /* odd 2 */
1084           "extp             %[Temp1],         $ac1,           31              "
1085           "\n\t" /* odd 2 */
1086           "lbux             %[st3],           %[Temp3](%[cm])                 "
1087           "\n\t" /* odd 1 */
1088 
1089           /* odd 3. pixel */
1090           "mtlo             %[vector_64],     $ac3                            "
1091           "\n\t" /* odd 4 */
1092           "mthi             $zero,            $ac3                            "
1093           "\n\t"
1094           "preceu.ph.qbr    %[p2],            %[qload1]                       "
1095           "\n\t"
1096           "sb               %[st3],           0(%[odd_dst])                   "
1097           "\n\t" /* odd 1 */
1098           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1099           "\n\t"
1100           "dpa.w.ph         $ac2,             %[p3],          %[filter12]     "
1101           "\n\t" /* odd 3 */
1102           "dpa.w.ph         $ac2,             %[p4],          %[filter34]     "
1103           "\n\t" /* odd 3 */
1104           "dpa.w.ph         $ac2,             %[p1],          %[filter56]     "
1105           "\n\t" /* odd 3 */
1106           "dpa.w.ph         $ac2,             %[p5],          %[filter78]     "
1107           "\n\t" /* odd 3 */
1108           "extp             %[Temp2],         $ac2,           31              "
1109           "\n\t" /* odd 3 */
1110           "lbux             %[st1],           %[Temp1](%[cm])                 "
1111           "\n\t" /* odd 2 */
1112 
1113           /* odd 4. pixel */
1114           "mtlo             %[vector_64],     $ac1                            "
1115           "\n\t" /* odd 5 */
1116           "mthi             $zero,            $ac1                            "
1117           "\n\t"
1118           "preceu.ph.qbl    %[p3],            %[qload1]                       "
1119           "\n\t"
1120           "sb               %[st1],           0(%[odd_dst])                   "
1121           "\n\t" /* odd 2 */
1122           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1123           "\n\t"
1124           "ulw              %[qload2],        17(%[src])                      "
1125           "\n\t"
1126           "dpa.w.ph         $ac3,             %[p4],          %[filter12]     "
1127           "\n\t" /* odd 4 */
1128           "dpa.w.ph         $ac3,             %[p1],          %[filter34]     "
1129           "\n\t" /* odd 4 */
1130           "dpa.w.ph         $ac3,             %[p5],          %[filter56]     "
1131           "\n\t" /* odd 4 */
1132           "dpa.w.ph         $ac3,             %[p2],          %[filter78]     "
1133           "\n\t" /* odd 4 */
1134           "extp             %[Temp3],         $ac3,           31              "
1135           "\n\t" /* odd 4 */
1136           "lbux             %[st2],           %[Temp2](%[cm])                 "
1137           "\n\t" /* odd 3 */
1138 
1139           /* odd 5. pixel */
1140           "mtlo             %[vector_64],     $ac2                            "
1141           "\n\t" /* odd 6 */
1142           "mthi             $zero,            $ac2                            "
1143           "\n\t"
1144           "preceu.ph.qbr    %[p4],            %[qload2]                       "
1145           "\n\t"
1146           "sb               %[st2],           0(%[odd_dst])                   "
1147           "\n\t" /* odd 3 */
1148           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1149           "\n\t"
1150           "dpa.w.ph         $ac1,             %[p1],          %[filter12]     "
1151           "\n\t" /* odd 5 */
1152           "dpa.w.ph         $ac1,             %[p5],          %[filter34]     "
1153           "\n\t" /* odd 5 */
1154           "dpa.w.ph         $ac1,             %[p2],          %[filter56]     "
1155           "\n\t" /* odd 5 */
1156           "dpa.w.ph         $ac1,             %[p3],          %[filter78]     "
1157           "\n\t" /* odd 5 */
1158           "extp             %[Temp1],         $ac1,           31              "
1159           "\n\t" /* odd 5 */
1160           "lbux             %[st3],           %[Temp3](%[cm])                 "
1161           "\n\t" /* odd 4 */
1162 
1163           /* odd 6. pixel */
1164           "mtlo             %[vector_64],     $ac3                            "
1165           "\n\t" /* odd 7 */
1166           "mthi             $zero,            $ac3                            "
1167           "\n\t"
1168           "preceu.ph.qbl    %[p1],            %[qload2]                       "
1169           "\n\t"
1170           "sb               %[st3],           0(%[odd_dst])                   "
1171           "\n\t" /* odd 4 */
1172           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1173           "\n\t"
1174           "ulw              %[qload1],        21(%[src])                      "
1175           "\n\t"
1176           "dpa.w.ph         $ac2,             %[p5],          %[filter12]     "
1177           "\n\t" /* odd 6 */
1178           "dpa.w.ph         $ac2,             %[p2],          %[filter34]     "
1179           "\n\t" /* odd 6 */
1180           "dpa.w.ph         $ac2,             %[p3],          %[filter56]     "
1181           "\n\t" /* odd 6 */
1182           "dpa.w.ph         $ac2,             %[p4],          %[filter78]     "
1183           "\n\t" /* odd 6 */
1184           "extp             %[Temp2],         $ac2,           31              "
1185           "\n\t" /* odd 6 */
1186           "lbux             %[st1],           %[Temp1](%[cm])                 "
1187           "\n\t" /* odd 5 */
1188 
1189           /* odd 7. pixel */
1190           "mtlo             %[vector_64],     $ac1                            "
1191           "\n\t" /* odd 8 */
1192           "mthi             $zero,            $ac1                            "
1193           "\n\t"
1194           "preceu.ph.qbr    %[p5],            %[qload1]                       "
1195           "\n\t"
1196           "sb               %[st1],           0(%[odd_dst])                   "
1197           "\n\t" /* odd 5 */
1198           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1199           "\n\t"
1200           "dpa.w.ph         $ac3,             %[p2],          %[filter12]     "
1201           "\n\t" /* odd 7 */
1202           "dpa.w.ph         $ac3,             %[p3],          %[filter34]     "
1203           "\n\t" /* odd 7 */
1204           "dpa.w.ph         $ac3,             %[p4],          %[filter56]     "
1205           "\n\t" /* odd 7 */
1206           "dpa.w.ph         $ac3,             %[p1],          %[filter78]     "
1207           "\n\t" /* odd 7 */
1208           "extp             %[Temp3],         $ac3,           31              "
1209           "\n\t" /* odd 7 */
1210 
1211           /* odd 8. pixel */
1212           "dpa.w.ph         $ac1,             %[p3],          %[filter12]     "
1213           "\n\t" /* odd 8 */
1214           "dpa.w.ph         $ac1,             %[p4],          %[filter34]     "
1215           "\n\t" /* odd 8 */
1216           "dpa.w.ph         $ac1,             %[p1],          %[filter56]     "
1217           "\n\t" /* odd 8 */
1218           "dpa.w.ph         $ac1,             %[p5],          %[filter78]     "
1219           "\n\t" /* odd 8 */
1220           "extp             %[Temp1],         $ac1,           31              "
1221           "\n\t" /* odd 8 */
1222 
1223           "lbux             %[st2],           %[Temp2](%[cm])                 "
1224           "\n\t" /* odd 6 */
1225           "lbux             %[st3],           %[Temp3](%[cm])                 "
1226           "\n\t" /* odd 7 */
1227           "lbux             %[st1],           %[Temp1](%[cm])                 "
1228           "\n\t" /* odd 8 */
1229 
1230           "sb               %[st2],           0(%[odd_dst])                   "
1231           "\n\t" /* odd 6 */
1232           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1233           "\n\t"
1234 
1235           "sb               %[st3],           0(%[odd_dst])                   "
1236           "\n\t" /* odd 7 */
1237           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  "
1238           "\n\t"
1239 
1240           "sb               %[st1],           0(%[odd_dst])                   "
1241           "\n\t" /* odd 8 */
1242 
1243           : [qload1] "=&r"(qload1), [qload2] "=&r"(qload2), [p5] "=&r"(p5),
1244             [st1] "=&r"(st1), [st2] "=&r"(st2), [st3] "=&r"(st3),
1245             [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
1246             [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
1247             [dst] "+r"(dst), [odd_dst] "+r"(odd_dst)
1248           : [filter12] "r"(filter12), [filter34] "r"(filter34),
1249             [filter56] "r"(filter56), [filter78] "r"(filter78),
1250             [vector_64] "r"(vector_64), [cm] "r"(cm), [src] "r"(src),
1251             [dst_pitch_2] "r"(dst_pitch_2));
1252 
1253       src += 16;
1254       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
1255       odd_dst = (dst + dst_stride);
1256     }
1257 
1258     /* Next row... */
1259     src_ptr += src_stride;
1260 
1261     dst_ptr += 1;
1262   }
1263 }
1264 
convolve_horiz_transposed(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter,int w,int h)1265 void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
1266                                uint8_t *dst, ptrdiff_t dst_stride,
1267                                const int16_t *filter, int w, int h) {
1268   int x, y, k;
1269 
1270   for (y = 0; y < h; ++y) {
1271     for (x = 0; x < w; ++x) {
1272       int sum = 0;
1273 
1274       for (k = 0; k < 8; ++k) sum += src[x + k] * filter[k];
1275 
1276       dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
1277     }
1278 
1279     src += src_stride;
1280     dst += 1;
1281   }
1282 }
1283 
/*
 * Transposing byte copy.
 *
 * Reads h source rows of w bytes each (rows are src_stride bytes apart) and
 * writes the byte at source row y, column x to dst[x * dst_stride + y].
 * Nothing is written when w or h is not positive.
 */
void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  int row;

  for (row = 0; row < h; ++row) {
    const uint8_t *const in_row = src + row * src_stride;
    /* Source row `row` becomes output column `row`. */
    uint8_t *const out_col = dst + row;
    int col = 0;

    while (col < w) {
      out_col[col * dst_stride] = in_row[col];
      ++col;
    }
  }
}
1297 
vpx_convolve8_dspr2(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const InterpKernel * filter,int x0_q4,int32_t x_step_q4,int y0_q4,int y_step_q4,int w,int h)1298 void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
1299                          ptrdiff_t dst_stride, const InterpKernel *filter,
1300                          int x0_q4, int32_t x_step_q4, int y0_q4, int y_step_q4,
1301                          int w, int h) {
1302   const int16_t *const filter_x = filter[x0_q4];
1303   const int16_t *const filter_y = filter[y0_q4];
1304   DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
1305   int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
1306   uint32_t pos = 38;
1307 
1308   assert(x_step_q4 == 16);
1309   assert(y_step_q4 == 16);
1310   assert(((const int32_t *)filter_x)[1] != 0x800000);
1311   assert(((const int32_t *)filter_y)[1] != 0x800000);
1312   (void)x_step_q4;
1313 
1314   /* bit positon for extract from acc */
1315   __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
1316                        :
1317                        : [pos] "r"(pos));
1318 
1319   if (intermediate_height < h) intermediate_height = h;
1320 
1321   /* copy the src to dst */
1322   if (filter_x[3] == 0x80) {
1323     copy_horiz_transposed(src - src_stride * 3, src_stride, temp,
1324                           intermediate_height, w, intermediate_height);
1325   } else if (vpx_get_filter_taps(filter_x) == 2) {
1326     vpx_convolve2_dspr2(src - src_stride * 3, src_stride, temp,
1327                         intermediate_height, filter_x, w, intermediate_height);
1328   } else {
1329     src -= (src_stride * 3 + 3);
1330 
1331     /* prefetch data to cache memory */
1332     prefetch_load(src);
1333     prefetch_load(src + 32);
1334 
1335     switch (w) {
1336       case 4:
1337         convolve_horiz_4_transposed_dspr2(src, src_stride, temp,
1338                                           intermediate_height, filter_x,
1339                                           intermediate_height);
1340         break;
1341       case 8:
1342         convolve_horiz_8_transposed_dspr2(src, src_stride, temp,
1343                                           intermediate_height, filter_x,
1344                                           intermediate_height);
1345         break;
1346       case 16:
1347       case 32:
1348         convolve_horiz_16_transposed_dspr2(src, src_stride, temp,
1349                                            intermediate_height, filter_x,
1350                                            intermediate_height, (w / 16));
1351         break;
1352       case 64:
1353         prefetch_load(src + 32);
1354         convolve_horiz_64_transposed_dspr2(src, src_stride, temp,
1355                                            intermediate_height, filter_x,
1356                                            intermediate_height);
1357         break;
1358       default:
1359         convolve_horiz_transposed(src, src_stride, temp, intermediate_height,
1360                                   filter_x, w, intermediate_height);
1361         break;
1362     }
1363   }
1364 
1365   /* copy the src to dst */
1366   if (filter_y[3] == 0x80) {
1367     copy_horiz_transposed(temp + 3, intermediate_height, dst, dst_stride, h, w);
1368   } else if (vpx_get_filter_taps(filter_y) == 2) {
1369     vpx_convolve2_dspr2(temp + 3, intermediate_height, dst, dst_stride,
1370                         filter_y, h, w);
1371   } else {
1372     switch (h) {
1373       case 4:
1374         convolve_horiz_4_transposed_dspr2(temp, intermediate_height, dst,
1375                                           dst_stride, filter_y, w);
1376         break;
1377       case 8:
1378         convolve_horiz_8_transposed_dspr2(temp, intermediate_height, dst,
1379                                           dst_stride, filter_y, w);
1380         break;
1381       case 16:
1382       case 32:
1383         convolve_horiz_16_transposed_dspr2(temp, intermediate_height, dst,
1384                                            dst_stride, filter_y, w, (h / 16));
1385         break;
1386       case 64:
1387         convolve_horiz_64_transposed_dspr2(temp, intermediate_height, dst,
1388                                            dst_stride, filter_y, w);
1389         break;
1390       default:
1391         convolve_horiz_transposed(temp, intermediate_height, dst, dst_stride,
1392                                   filter_y, h, w);
1393         break;
1394     }
1395   }
1396 }
1397 
/*
 * Copies a w x h block of bytes from src to dst.
 *
 * The filter and subpixel/step arguments are part of the signature but are
 * ignored here. Widths 4, 8, 16, 32 and 64 use unrolled inline assembly that
 * reads each row with unaligned word loads (ulw) and writes it with word
 * stores (sw); any other width falls back to a byte-by-byte C loop.
 *
 * NOTE(review): because dst is written with plain `sw`, the assembly paths
 * presumably expect every dst row to be 4-byte aligned -- confirm against
 * callers.
 *
 * Every asm statement carries a "memory" clobber: the loads from src and the
 * stores to dst go through register operands only, so without the clobber the
 * compiler has no way to know the statement reads or writes memory and could
 * cache or reorder ordinary memory accesses around it.
 */
void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  int x, y;
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4: {
      uint32_t tp1;

      /* 1 word storage */
      for (y = h; y--;) {
        /* Prefetch the next row while the current one is copied. */
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         (%[src])      \n\t"
            "sw               %[tp1],         (%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 8: {
      uint32_t tp1, tp2;

      /* 2 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 16: {
      uint32_t tp1, tp2, tp3, tp4;

      /* 4 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 32: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      /* 8 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    case 64: {
      uint32_t tp1, tp2, tp3, tp4;
      uint32_t tp5, tp6, tp7, tp8;

      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        /* The row is copied as two 32-byte halves reusing the same eight
         * temporaries. */
        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         4(%[src])      \n\t"
            "ulw              %[tp3],         8(%[src])      \n\t"
            "ulw              %[tp4],         12(%[src])     \n\t"
            "ulw              %[tp5],         16(%[src])     \n\t"
            "ulw              %[tp6],         20(%[src])     \n\t"
            "ulw              %[tp7],         24(%[src])     \n\t"
            "ulw              %[tp8],         28(%[src])     \n\t"

            "sw               %[tp1],         0(%[dst])      \n\t" /* store */
            "sw               %[tp2],         4(%[dst])      \n\t" /* store */
            "sw               %[tp3],         8(%[dst])      \n\t" /* store */
            "sw               %[tp4],         12(%[dst])     \n\t" /* store */
            "sw               %[tp5],         16(%[dst])     \n\t" /* store */
            "sw               %[tp6],         20(%[dst])     \n\t" /* store */
            "sw               %[tp7],         24(%[dst])     \n\t" /* store */
            "sw               %[tp8],         28(%[dst])     \n\t" /* store */

            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         36(%[src])     \n\t"
            "ulw              %[tp3],         40(%[src])     \n\t"
            "ulw              %[tp4],         44(%[src])     \n\t"
            "ulw              %[tp5],         48(%[src])     \n\t"
            "ulw              %[tp6],         52(%[src])     \n\t"
            "ulw              %[tp7],         56(%[src])     \n\t"
            "ulw              %[tp8],         60(%[src])     \n\t"

            "sw               %[tp1],         32(%[dst])     \n\t" /* store */
            "sw               %[tp2],         36(%[dst])     \n\t" /* store */
            "sw               %[tp3],         40(%[dst])     \n\t" /* store */
            "sw               %[tp4],         44(%[dst])     \n\t" /* store */
            "sw               %[tp5],         48(%[dst])     \n\t" /* store */
            "sw               %[tp6],         52(%[dst])     \n\t" /* store */
            "sw               %[tp7],         56(%[dst])     \n\t" /* store */
            "sw               %[tp8],         60(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tp5] "=&r"(tp5), [tp6] "=&r"(tp6),
              [tp7] "=&r"(tp7), [tp8] "=&r"(tp8)
            : [src] "r"(src), [dst] "r"(dst)
            : "memory");

        src += src_stride;
        dst += dst_stride;
      }
      break;
    }
    default:
      /* Any other width: plain byte-by-byte copy of each row. */
      for (y = h; y--;) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
1602 #endif
1603