/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_ports/mem.h"

#if HAVE_DSPR2
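/* Vertical 8-tap filter with a rounded average against dst, for widths that
 * are multiples of 4.  filter_y is reinterpreted as four int32 words, each
 * packing two adjacent 16-bit taps so dpa.w.ph can accumulate two products
 * per instruction.  A rough scalar sketch of what the assembly computes for
 * each output pixel (assuming the usual FILTER_BITS == 7, hence the rounding
 * term of 64 preloaded into each accumulator):
 *
 *   int32_t sum = 64;
 *   for (int k = 0; k < 8; ++k)
 *     sum += src[(k - 3) * src_stride] * filter_y[k];
 *   pred = clamp(sum >> 7, 0, 255);     // extp + vpx_ff_cropTbl lookup
 *   *dst = (*dst + pred + 1) >> 1;      // addqh_r.w, rounding average
 */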
static void convolve_avg_vert_4_dspr2(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int16_t *filter_y, int32_t w,
                                      int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector4a = 64;
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__(
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

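/* Width-64 variant of convolve_avg_vert_4_dspr2: the inner loop body is
 * identical, but the column count is hard-coded and a second prefetch_store
 * covers the upper 32 bytes of each destination row. */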
static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride,
                                       uint8_t *dst, int32_t dst_stride,
                                       const int16_t *filter_y, int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vpx_ff_cropTbl;
  uint32_t vector4a = 64;
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    prefetch_store(dst + dst_stride);
    prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__(
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
            [load4] "=&r"(load4), [p1] "=&r"(p1), [p2] "=&r"(p2),
            [n1] "=&r"(n1), [n2] "=&r"(n2), [scratch1] "=&r"(scratch1),
            [scratch2] "=&r"(scratch2), [Temp1] "=&r"(Temp1),
            [Temp2] "=&r"(Temp2), [store1] "=&r"(store1),
            [store2] "=&r"(store2), [src_ptr] "+r"(src_ptr)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4b] "r"(vector4b),
            [vector4a] "r"(vector4a), [src_stride] "r"(src_stride),
            [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

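/* Dispatcher for the averaging vertical convolution.  The wrdsp below sets
 * the DSP control register's "pos" field to 38, which together with the
 * extp size argument of 31 used above effectively makes extp return
 * acc >> 7, i.e. the filter-precision shift.  Bilinear (2-tap) kernels take
 * the cheaper vpx_convolve2 path, and unsupported widths fall back to the C
 * reference implementation. */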
void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const InterpKernel *filter, int x0_q4,
                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
                                  int w, int h) {
  const int16_t *const filter_y = filter[y0_q4];
  assert(y_step_q4 == 16);
  assert(((const int32_t *)filter_y)[1] != 0x800000);

  if (vpx_get_filter_taps(filter_y) == 2) {
    vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter,
                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
  } else {
    uint32_t pos = 38;

    /* bit position for extract from acc */
    __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                         :
                         : [pos] "r"(pos));

    prefetch_store(dst);

    switch (w) {
      case 4:
      case 8:
      case 16:
      case 32:
        convolve_avg_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w,
                                  h);
        break;
      case 64:
        prefetch_store(dst + 32);
        convolve_avg_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y,
                                   h);
        break;
      default:
        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
        break;
    }
  }
}

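/* Full 8-tap 2-D convolve-and-average, built from the library's separable
 * passes: filter horizontally into a 64-byte-stride intermediate buffer,
 * then run the averaging vertical pass over it.  With y_step_q4 == 16 the
 * vertical filter needs h + 7 intermediate rows (3 above the block, 4
 * below), e.g. 71 rows for h == 64; temp is sized 64 * 135 to match the
 * worst-case bound used by the shared convolve code. */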
void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const InterpKernel *filter, int x0_q4,
                             int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
                             int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]);
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;

  assert(w <= 64);
  assert(h <= 64);
  assert(x_step_q4 == 16);
  assert(y_step_q4 == 16);

  if (intermediate_height < h) intermediate_height = h;

  vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter,
                      x0_q4, x_step_q4, y0_q4, y_step_q4, w,
                      intermediate_height);

  vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter, x0_q4,
                         x_step_q4, y0_q4, y_step_q4, w, h);
}

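/* Unfiltered averaging: dst[x] = (dst[x] + src[x] + 1) >> 1, exactly as in
 * the scalar default case at the bottom.  adduh_r.qb performs that rounded
 * halving add on four packed bytes at once, so each ulw/ulw/adduh_r.qb/sw
 * group averages four pixels; the wider cases just unroll more of these
 * groups per row. */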
void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel *filter, int x0_q4,
                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
                            int h) {
  int x, y;
  uint32_t tp1, tp2, tn1, tp3, tp4, tn2;
  (void)filter;
  (void)x0_q4;
  (void)x_step_q4;
  (void)y0_q4;
  (void)y_step_q4;

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      /* 1 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "sw               %[tn1],         0(%[dst])      \n\t" /* store */

            : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [tp2] "=&r"(tp2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 8:
      /* 2 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         4(%[dst])      \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 16:
      /* 4 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         4(%[dst])      \n\t" /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "sw               %[tn1],         8(%[dst])      \n\t" /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         12(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 32:
      /* 8 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_store(dst + dst_stride);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         4(%[dst])      \n\t" /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "ulw              %[tp1],         16(%[src])     \n\t"
            "ulw              %[tp2],         16(%[dst])     \n\t"
            "sw               %[tn1],         8(%[dst])      \n\t" /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         12(%[dst])     \n\t" /* store */
            "ulw              %[tp3],         20(%[src])     \n\t"
            "ulw              %[tp4],         20(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "ulw              %[tp1],         24(%[src])     \n\t"
            "ulw              %[tp2],         24(%[dst])     \n\t"
            "sw               %[tn1],         16(%[dst])     \n\t" /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         20(%[dst])     \n\t" /* store */
            "ulw              %[tp3],         28(%[src])     \n\t"
            "ulw              %[tp4],         28(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "sw               %[tn1],         24(%[dst])     \n\t" /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         28(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 64:
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--;) {
        prefetch_load(src + src_stride);
        prefetch_load(src + src_stride + 32);
        prefetch_load(src + src_stride + 64);
        prefetch_store(dst + dst_stride);
        prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__(
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t" /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         4(%[dst])      \n\t" /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "ulw              %[tp1],         16(%[src])     \n\t"
            "ulw              %[tp2],         16(%[dst])     \n\t"
            "sw               %[tn1],         8(%[dst])      \n\t" /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         12(%[dst])     \n\t" /* store */
            "ulw              %[tp3],         20(%[src])     \n\t"
            "ulw              %[tp4],         20(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "ulw              %[tp1],         24(%[src])     \n\t"
            "ulw              %[tp2],         24(%[dst])     \n\t"
            "sw               %[tn1],         16(%[dst])     \n\t" /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         20(%[dst])     \n\t" /* store */
            "ulw              %[tp3],         28(%[src])     \n\t"
            "ulw              %[tp4],         28(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         32(%[dst])     \n\t"
            "sw               %[tn1],         24(%[dst])     \n\t" /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         28(%[dst])     \n\t" /* store */
            "ulw              %[tp3],         36(%[src])     \n\t"
            "ulw              %[tp4],         36(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "ulw              %[tp1],         40(%[src])     \n\t"
            "ulw              %[tp2],         40(%[dst])     \n\t"
            "sw               %[tn1],         32(%[dst])     \n\t" /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         36(%[dst])     \n\t" /* store */
            "ulw              %[tp3],         44(%[src])     \n\t"
            "ulw              %[tp4],         44(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "ulw              %[tp1],         48(%[src])     \n\t"
            "ulw              %[tp2],         48(%[dst])     \n\t"
            "sw               %[tn1],         40(%[dst])     \n\t" /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         44(%[dst])     \n\t" /* store */
            "ulw              %[tp3],         52(%[src])     \n\t"
            "ulw              %[tp4],         52(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "ulw              %[tp1],         56(%[src])     \n\t"
            "ulw              %[tp2],         56(%[dst])     \n\t"
            "sw               %[tn1],         48(%[dst])     \n\t" /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         52(%[dst])     \n\t" /* store */
            "ulw              %[tp3],         60(%[src])     \n\t"
            "ulw              %[tp4],         60(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t" /* average */
            "sw               %[tn1],         56(%[dst])     \n\t" /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t" /* average */
            "sw               %[tn2],         60(%[dst])     \n\t" /* store */

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
              [tp4] "=&r"(tp4), [tn1] "=&r"(tn1), [tn2] "=&r"(tn2)
            : [src] "r"(src), [dst] "r"(dst));

        src += src_stride;
        dst += dst_stride;
      }
      break;
    default:
      for (y = h; y > 0; --y) {
        for (x = 0; x < w; ++x) {
          dst[x] = (dst[x] + src[x] + 1) >> 1;
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif