xref: /aosp_15_r20/external/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  * Copyright (c) 2021 Loongson Technology Corporation Limited
3  * Contributed by Lu Wang <[email protected]>
4  *
5  * Use of this source code is governed by a BSD-style license
6  * that can be found in the LICENSE file in the root of the source
7  * tree. An additional intellectual property rights grant can be found
8  * in the file PATENTS.  All contributing project authors may
9  * be found in the AUTHORS file in the root of the source tree.
10  */
11 
12 #include "./vp8_rtcd.h"
13 #include "vp8/common/filter.h"
14 #include "vpx_ports/mem.h"
15 #include "vpx_util/loongson_intrinsics.h"
16 
/*
 * Six-tap sub-pixel filter kernels stored as signed bytes: 7 rows of 8 bytes,
 * each row holding 6 taps followed by two zero pad bytes. The taps of every
 * row sum to 128. The routines in this file consume a row as three 16-bit
 * tap pairs taken from byte offsets 0, 2 and 4.
 * NOTE(review): rows are presumably indexed by (fractional offset - 1);
 * the callers that do the indexing are not in this part of the file.
 */
DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_lsx[7][8]) = {
  { 0, -6, 123, 12, -1, 0, 0, 0 },
  { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
  { 0, -9, 93, 50, -6, 0, 0, 0 },
  { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
  { 0, -6, 50, 93, -9, 0, 0, 0 },
  { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
  { 0, -1, 12, 123, -6, 0, 0, 0 },
};
26 
/*
 * Byte-shuffle index tables used to gather adjacent pixel pairs (i, i+1) for
 * the horizontal filters. Three 16-byte rows:
 *   - row 0 (byte offset 0):  loaded by the 8- and 16-wide horizontal paths.
 *   - row 1 (byte offset 16): loaded by the 4-wide horizontal paths.
 *   - row 2 (byte offset 32): same pair pattern as row 1 but starting at
 *     byte 8; not referenced by the routines visible in this part of the file.
 * The masks for the later tap pairs are derived at run time by adding 2 and 4
 * to every index of the loaded row.
 * NOTE(review): indices >= 16 presumably select bytes from the second shuffle
 * source (the next row packed into the same operation) -- confirm against the
 * __lsx_vshuf_b definition.
 */
static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
  /* 8 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
  /* 4 width cases */
  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
  /* 4 width cases, pairs taken from byte 8 onward */
  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
35 
dpadd_h3(__m128i in0,__m128i in1,__m128i in2,__m128i coeff0,__m128i coeff1,__m128i coeff2)36 static INLINE __m128i dpadd_h3(__m128i in0, __m128i in1, __m128i in2,
37                                __m128i coeff0, __m128i coeff1, __m128i coeff2) {
38   __m128i out0_m;
39 
40   out0_m = __lsx_vdp2_h_b(in0, coeff0);
41   out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1);
42   out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2);
43 
44   return out0_m;
45 }
46 
horiz_6tap_filt(__m128i src0,__m128i src1,__m128i mask0,__m128i mask1,__m128i mask2,__m128i filt_h0,__m128i filt_h1,__m128i filt_h2)47 static INLINE __m128i horiz_6tap_filt(__m128i src0, __m128i src1, __m128i mask0,
48                                       __m128i mask1, __m128i mask2,
49                                       __m128i filt_h0, __m128i filt_h1,
50                                       __m128i filt_h2) {
51   __m128i vec0_m, vec1_m, vec2_m;
52   __m128i hz_out_m;
53 
54   DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m,
55             vec1_m);
56   vec2_m = __lsx_vshuf_b(src1, src0, mask2);
57   hz_out_m = dpadd_h3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2);
58   hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT);
59   hz_out_m = __lsx_vsat_h(hz_out_m, 7);
60 
61   return hz_out_m;
62 }
63 
filt_4tap_dpadd_h(__m128i vec0,__m128i vec1,__m128i filt0,__m128i filt1)64 static INLINE __m128i filt_4tap_dpadd_h(__m128i vec0, __m128i vec1,
65                                         __m128i filt0, __m128i filt1) {
66   __m128i tmp_m;
67 
68   tmp_m = __lsx_vdp2_h_b(vec0, filt0);
69   tmp_m = __lsx_vdp2add_h_b(tmp_m, vec1, filt1);
70 
71   return tmp_m;
72 }
73 
horiz_4tap_filt(__m128i src0,__m128i src1,__m128i mask0,__m128i mask1,__m128i filt_h0,__m128i filt_h1)74 static INLINE __m128i horiz_4tap_filt(__m128i src0, __m128i src1, __m128i mask0,
75                                       __m128i mask1, __m128i filt_h0,
76                                       __m128i filt_h1) {
77   __m128i vec0_m, vec1_m, hz_out_m;
78 
79   DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m,
80             vec1_m);
81   hz_out_m = filt_4tap_dpadd_h(vec0_m, vec1_m, filt_h0, filt_h1);
82   hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT);
83   hz_out_m = __lsx_vsat_h(hz_out_m, 7);
84 
85   return hz_out_m;
86 }
87 
/*
 * Six-tap horizontal filter core for 4-wide blocks, four input rows at once.
 * Rows are processed in pairs: (src0, src1) feed out0 and (src2, src3) feed
 * out1, each pair being gathered by a single shuffle. mask0/mask1/mask2 pick
 * the pixel pairs for tap pairs filt0/filt1/filt2 respectively.
 * out0/out1 receive the raw 16-bit sums; rounding, shifting and saturation
 * are left to the caller.
 * Arguments are expanded several times, so pass plain variables only, and do
 * not pass variables named vec0_m..vec5_m (they would clash with the macro's
 * own temporaries).
 */
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
                                   mask2, filt0, filt1, filt2, out0, out1) \
  do {                                                                     \
    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m;                \
                                                                           \
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \
              vec1_m);                                                     \
    DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1);   \
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \
              vec3_m);                                                     \
    DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \
              out0, out1);                                                 \
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src3, src2, mask2, vec4_m, \
              vec5_m);                                                     \
    DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \
              out0, out1);                                                 \
  } while (0)
105 
/*
 * Six-tap horizontal filter core for 8-wide data, four input vectors at once.
 * Each source vector is shuffled against itself, so src0..src3 map one-to-one
 * onto out0..out3. mask0/mask1/mask2 pick the pixel pairs for tap pairs
 * filt0/filt1/filt2 respectively.
 * out0..out3 receive the raw 16-bit sums; rounding, shifting and saturation
 * are left to the caller.
 * Arguments are expanded several times, so pass plain variables only, and do
 * not pass variables named vec0_m..vec7_m (they would clash with the macro's
 * own temporaries).
 */
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,    \
                                   mask2, filt0, filt1, filt2, out0, out1,  \
                                   out2, out3)                              \
  do {                                                                      \
    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
                                                                            \
    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m,  \
              vec1_m);                                                      \
    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m,  \
              vec3_m);                                                      \
    DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0,  \
              vec3_m, filt0, out0, out1, out2, out3);                       \
    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m,  \
              vec1_m);                                                      \
    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m,  \
              vec3_m);                                                      \
    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, vec4_m,  \
              vec5_m);                                                      \
    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2, vec6_m,  \
              vec7_m);                                                      \
    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1,  \
              out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2,   \
              out3);                                                        \
    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2,  \
              out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2,   \
              out3);                                                        \
  } while (0)
133 
/*
 * Four-tap horizontal filter core for 4-wide blocks, four input rows at once.
 * Rows are processed in pairs: (src0, src1) feed out0 and (src2, src3) feed
 * out1. mask0/mask1 pick the pixel pairs for tap pairs filt0/filt1.
 * out0/out1 receive the raw 16-bit sums; rounding, shifting and saturation
 * are left to the caller.
 * Arguments are expanded several times, so pass plain variables only, and do
 * not pass variables named vec0_m..vec3_m (they would clash with the macro's
 * own temporaries).
 */
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
                                   filt0, filt1, out0, out1)               \
  do {                                                                     \
    __m128i vec0_m, vec1_m, vec2_m, vec3_m;                                \
                                                                           \
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \
              vec1_m);                                                     \
    DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1);   \
    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \
              vec3_m);                                                     \
    DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \
              out0, out1);                                                 \
  } while (0)
147 
/*
 * Four-tap horizontal filter core for 8-wide data, four input vectors at once.
 * Each source vector is shuffled against itself, so src0..src3 map one-to-one
 * onto out0..out3. mask0/mask1 pick the pixel pairs for tap pairs
 * filt0/filt1.
 * out0..out3 receive the raw 16-bit sums; rounding, shifting and saturation
 * are left to the caller.
 * Arguments are expanded several times, so pass plain variables only, and do
 * not pass variables named vec0_m..vec3_m (they would clash with the macro's
 * own temporaries).
 */
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
                                   filt0, filt1, out0, out1, out2, out3)   \
  do {                                                                     \
    __m128i vec0_m, vec1_m, vec2_m, vec3_m;                                \
                                                                           \
    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \
              vec1_m);                                                     \
    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m, \
              vec3_m);                                                     \
    DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \
              vec3_m, filt0, out0, out1, out2, out3);                      \
    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m, \
              vec1_m);                                                     \
    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m, \
              vec3_m);                                                     \
    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \
              out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2,  \
              out3);                                                       \
  } while (0)
167 
common_hz_6t_4x4_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter)168 static inline void common_hz_6t_4x4_lsx(uint8_t *RESTRICT src,
169                                         int32_t src_stride,
170                                         uint8_t *RESTRICT dst,
171                                         int32_t dst_stride,
172                                         const int8_t *filter) {
173   __m128i src0, src1, src2, src3, filt0, filt1, filt2;
174   __m128i mask0, mask1, mask2, out0, out1;
175   int32_t src_stride_x2 = src_stride << 1;
176   int32_t src_stride_x3 = src_stride_x2 + src_stride;
177 
178   mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
179   src -= 2;
180 
181   DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
182   filt2 = __lsx_vldrepl_h(filter, 4);
183 
184   DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
185   src0 = __lsx_vld(src, 0);
186   DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
187   src3 = __lsx_vldx(src, src_stride_x3);
188 
189   DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
190             src1, src2, src3);
191   HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
192                              filt1, filt2, out0, out1);
193   out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
194   out0 = __lsx_vxori_b(out0, 128);
195 
196   __lsx_vstelm_w(out0, dst, 0, 0);
197   dst += dst_stride;
198   __lsx_vstelm_w(out0, dst, 0, 1);
199   dst += dst_stride;
200   __lsx_vstelm_w(out0, dst, 0, 2);
201   dst += dst_stride;
202   __lsx_vstelm_w(out0, dst, 0, 3);
203 }
204 
/*
 * Horizontal six-tap filter for one 4x8 block, handled as two groups of four
 * rows. Reads start 2 bytes to the left of src and fetch 16 bytes per row;
 * 4 bytes are written to each of the 8 destination rows.
 */
static void common_hz_6t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  const int32_t stride2 = src_stride << 1;
  const int32_t stride3 = stride2 + src_stride;
  const int32_t stride4 = stride2 << 1;
  /* 4-wide shuffle pattern plus its +2 / +4 variants for the later taps. */
  const __m128i shuf0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
  const __m128i shuf1 = __lsx_vaddi_bu(shuf0, 2);
  const __m128i shuf2 = __lsx_vaddi_bu(shuf0, 4);
  /* Tap pairs broadcast to every 16-bit lane. */
  const __m128i taps0 = __lsx_vldrepl_h(filter, 0);
  const __m128i taps1 = __lsx_vldrepl_h(filter, 2);
  const __m128i taps2 = __lsx_vldrepl_h(filter, 4);
  __m128i row0, row1, row2, row3;
  __m128i acc0, acc1, acc2, acc3;
  __m128i upper, lower;

  src -= 2;

  /* Rows 0..3. */
  row0 = __lsx_vld(src, 0);
  row1 = __lsx_vldx(src, src_stride);
  row2 = __lsx_vldx(src, stride2);
  row3 = __lsx_vldx(src, stride3);
  src += stride4;
  row0 = __lsx_vxori_b(row0, 128);
  row1 = __lsx_vxori_b(row1, 128);
  row2 = __lsx_vxori_b(row2, 128);
  row3 = __lsx_vxori_b(row3, 128);
  HORIZ_6TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, shuf2, taps0,
                             taps1, taps2, acc0, acc1);

  /* Rows 4..7. */
  row0 = __lsx_vld(src, 0);
  row1 = __lsx_vldx(src, src_stride);
  row2 = __lsx_vldx(src, stride2);
  row3 = __lsx_vldx(src, stride3);
  row0 = __lsx_vxori_b(row0, 128);
  row1 = __lsx_vxori_b(row1, 128);
  row2 = __lsx_vxori_b(row2, 128);
  row3 = __lsx_vxori_b(row3, 128);
  HORIZ_6TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, shuf2, taps0,
                             taps1, taps2, acc2, acc3);

  /* Round/shift/saturate/pack each group of four rows, then undo the bias. */
  upper = __lsx_vssrarni_b_h(acc1, acc0, VP8_FILTER_SHIFT);
  lower = __lsx_vssrarni_b_h(acc3, acc2, VP8_FILTER_SHIFT);
  upper = __lsx_vxori_b(upper, 128);
  lower = __lsx_vxori_b(lower, 128);

  /* One 32-bit lane per output row. */
  __lsx_vstelm_w(upper, dst, 0, 0);
  dst += dst_stride;
  __lsx_vstelm_w(upper, dst, 0, 1);
  dst += dst_stride;
  __lsx_vstelm_w(upper, dst, 0, 2);
  dst += dst_stride;
  __lsx_vstelm_w(upper, dst, 0, 3);
  dst += dst_stride;

  __lsx_vstelm_w(lower, dst, 0, 0);
  dst += dst_stride;
  __lsx_vstelm_w(lower, dst, 0, 1);
  dst += dst_stride;
  __lsx_vstelm_w(lower, dst, 0, 2);
  dst += dst_stride;
  __lsx_vstelm_w(lower, dst, 0, 3);
}
258 
/*
 * Dispatch the 4-wide horizontal six-tap filter by block height.
 * Only heights 4 and 8 are handled; any other height writes nothing.
 */
static void common_hz_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  switch (height) {
    case 4:
      common_hz_6t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
      break;
    case 8:
      common_hz_6t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
      break;
    default: break;
  }
}
268 
/*
 * Horizontal six-tap filter for 8-wide blocks, four rows per iteration.
 *
 * src        - first pixel of the block; reads start 2 bytes to its left and
 *              each row is fetched with a full 16-byte vector load.
 * src_stride - byte distance between source rows.
 * dst        - destination; 8 bytes are written per row.
 * dst_stride - byte distance between destination rows.
 * filter     - six signed taps, read as three 16-bit pairs at byte offsets
 *              0, 2 and 4 (nothing past byte 5 is touched).
 * height     - number of rows; processed in groups of four (height >> 2
 *              groups), so a height below 4 writes nothing.
 */
static void common_hz_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  int32_t src_stride_x2 = src_stride << 1;
  int32_t src_stride_x3 = src_stride_x2 + src_stride;
  int32_t src_stride_x4 = src_stride << 2;
  int32_t dst_stride_x2 = dst_stride << 1;
  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
  int32_t dst_stride_x4 = dst_stride << 2;
  __m128i src0, src1, src2, src3, filt0, filt1, filt2;
  __m128i mask0, mask1, mask2, tmp0, tmp1;
  __m128i out0, out1, out2, out3;

  /* 8-wide shuffle pattern lives in the first 16-byte row of the table. */
  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
  src -= 2;

  /* Load-and-broadcast each tap pair directly. A full 16-byte load of the
   * filter would read past the 6 taps actually needed (and past the end of
   * an 8-byte filter row). */
  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
  filt2 = __lsx_vldrepl_h(filter, 4);
  /* Masks for the 2nd and 3rd tap pairs are mask0 advanced by 2 and 4. */
  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
              src_stride_x3, src0, src1, src2, src3);
    /* Flip the top bit of every pixel before the signed dot products. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
              src1, src2, src3);
    src += src_stride_x4;
    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               filt0, filt1, filt2, out0, out1, out2, out3);
    /* Round, shift, saturate and pack two rows per vector, then undo the
     * bias. */
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
              VP8_FILTER_SHIFT, tmp0, tmp1);
    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
    /* One 64-bit lane per output row. */
    __lsx_vstelm_d(tmp0, dst, 0, 0);
    __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
    __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
    __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
    dst += dst_stride_x4;
  }
}
325 
/*
 * Horizontal six-tap filter for 16-wide blocks, four rows per iteration.
 *
 * src        - first pixel of the block; reads start 2 bytes to its left.
 *              Each row is fetched as two 16-byte vector loads, the second
 *              one 8 bytes after the first.
 * src_stride - byte distance between source rows.
 * dst        - destination; 16 bytes are written per row.
 * dst_stride - byte distance between destination rows.
 * filter     - six signed taps, read as three 16-bit pairs at byte offsets
 *              0, 2 and 4 (nothing past byte 5 is touched).
 * height     - number of rows; processed in groups of four (height >> 2
 *              groups).
 */
static void common_hz_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  int32_t src_stride_x2 = src_stride << 1;
  int32_t src_stride_x3 = src_stride_x2 + src_stride;
  int32_t src_stride_x4 = src_stride << 2;
  int32_t dst_stride_x2 = dst_stride << 1;
  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
  int32_t dst_stride_x4 = dst_stride << 2;
  __m128i src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
  __m128i mask0, mask1, mask2, out;
  __m128i out0, out1, out2, out3, out4, out5, out6, out7;

  /* 8-wide shuffle pattern; each 16-wide row is filtered as two halves. */
  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
  src -= 2;

  /* Load-and-broadcast each tap pair directly. A full 16-byte load of the
   * filter would read past the 6 taps actually needed (and past the end of
   * an 8-byte filter row). */
  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
  filt2 = __lsx_vldrepl_h(filter, 4);
  /* Masks for the 2nd and 3rd tap pairs are mask0 advanced by 2 and 4. */
  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* Even-numbered vectors hold the left half of each row ... */
    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
              src_stride_x3, src0, src2, src4, src6);
    src += 8;
    /* ... odd-numbered vectors the right half, 8 bytes further on. */
    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
              src_stride_x3, src1, src3, src5, src7);
    /* Flip the top bit of every pixel before the signed dot products. */
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
              src1, src2, src3);
    DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4,
              src5, src6, src7);
    /* Advance four rows and undo the +8 applied above. */
    src += src_stride_x4 - 8;

    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               filt0, filt1, filt2, out0, out1, out2, out3);
    HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                               filt0, filt1, filt2, out4, out5, out6, out7);
    DUP4_ARG2(__lsx_vsrari_h, out0, VP8_FILTER_SHIFT, out1, VP8_FILTER_SHIFT,
              out2, VP8_FILTER_SHIFT, out3, VP8_FILTER_SHIFT, out0, out1, out2,
              out3);
    DUP4_ARG2(__lsx_vsrari_h, out4, VP8_FILTER_SHIFT, out5, VP8_FILTER_SHIFT,
              out6, VP8_FILTER_SHIFT, out7, VP8_FILTER_SHIFT, out4, out5, out6,
              out7);
    DUP4_ARG2(__lsx_vsat_h, out0, 7, out1, 7, out2, 7, out3, 7, out0, out1,
              out2, out3);
    DUP4_ARG2(__lsx_vsat_h, out4, 7, out5, 7, out6, 7, out7, 7, out4, out5,
              out6, out7);
    /* Join the two halves of each row into bytes, undo the bias, store. */
    out = __lsx_vpickev_b(out1, out0);
    out = __lsx_vxori_b(out, 128);
    __lsx_vst(out, dst, 0);
    out = __lsx_vpickev_b(out3, out2);
    out = __lsx_vxori_b(out, 128);
    __lsx_vstx(out, dst, dst_stride);
    out = __lsx_vpickev_b(out5, out4);
    out = __lsx_vxori_b(out, 128);
    __lsx_vstx(out, dst, dst_stride_x2);
    out = __lsx_vpickev_b(out7, out6);
    out = __lsx_vxori_b(out, 128);
    __lsx_vstx(out, dst, dst_stride_x3);
    dst += dst_stride_x4;
  }
}
389 
/*
 * Vertical six-tap filter for 4-wide blocks, four output rows per iteration.
 * Reads start two rows above src; every row is fetched with a 16-byte vector
 * load. Vertically adjacent rows are byte-interleaved, and two interleaved
 * pairs are packed into one vector so a single dot product covers two output
 * rows. 4 bytes are written per destination row. height is processed in
 * groups of four rows (height >> 2 groups).
 */
static void common_vt_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  const int32_t stride2 = src_stride << 1;
  const int32_t stride3 = stride2 + src_stride;
  const int32_t stride4 = src_stride << 2;
  /* Tap pairs broadcast to every 16-bit lane. */
  const __m128i taps0 = __lsx_vldrepl_h(filter, 0);
  const __m128i taps1 = __lsx_vldrepl_h(filter, 2);
  const __m128i taps2 = __lsx_vldrepl_h(filter, 4);
  __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8;
  __m128i pair10, pair21, pair32, pair43;
  __m128i pair54, pair65, pair76, pair87;
  __m128i quad2110, quad4332, quad6554, quad8776;
  __m128i sum_a, sum_b;
  uint32_t groups;

  /* Prime the window with the five rows src[-2] .. src[+2]. */
  r0 = __lsx_vldx(src, -stride2);
  r1 = __lsx_vldx(src, -src_stride);
  r2 = __lsx_vld(src, 0);
  r3 = __lsx_vldx(src, src_stride);
  r4 = __lsx_vldx(src, stride2);
  src += stride3;

  pair10 = __lsx_vilvl_b(r1, r0);
  pair21 = __lsx_vilvl_b(r2, r1);
  pair32 = __lsx_vilvl_b(r3, r2);
  pair43 = __lsx_vilvl_b(r4, r3);
  quad2110 = __lsx_vilvl_d(pair21, pair10);
  quad4332 = __lsx_vilvl_d(pair43, pair32);
  /* Flip the top bit of every pixel before the signed dot products. */
  quad2110 = __lsx_vxori_b(quad2110, 128);
  quad4332 = __lsx_vxori_b(quad4332, 128);

  for (groups = (height >> 2); groups != 0; --groups) {
    r5 = __lsx_vld(src, 0);
    r6 = __lsx_vldx(src, src_stride);
    r7 = __lsx_vldx(src, stride2);
    r8 = __lsx_vldx(src, stride3);
    src += stride4;

    pair54 = __lsx_vilvl_b(r5, r4);
    pair65 = __lsx_vilvl_b(r6, r5);
    pair76 = __lsx_vilvl_b(r7, r6);
    pair87 = __lsx_vilvl_b(r8, r7);
    quad6554 = __lsx_vilvl_d(pair65, pair54);
    quad8776 = __lsx_vilvl_d(pair87, pair76);
    quad6554 = __lsx_vxori_b(quad6554, 128);
    quad8776 = __lsx_vxori_b(quad8776, 128);

    sum_a = dpadd_h3(quad2110, quad4332, quad6554, taps0, taps1, taps2);
    sum_b = dpadd_h3(quad4332, quad6554, quad8776, taps0, taps1, taps2);

    /* Round/shift/saturate/pack all four rows, then undo the bias. */
    sum_a = __lsx_vssrarni_b_h(sum_b, sum_a, VP8_FILTER_SHIFT);
    sum_a = __lsx_vxori_b(sum_a, 128);

    /* One 32-bit lane per output row. */
    __lsx_vstelm_w(sum_a, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(sum_a, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(sum_a, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(sum_a, dst, 0, 3);
    dst += dst_stride;

    /* Slide the window down by four rows. */
    quad2110 = quad6554;
    quad4332 = quad8776;
    r4 = r8;
  }
}
447 
/*
 * Vertical six-tap filter for 8-wide blocks, four output rows per iteration.
 *
 * src        - first pixel of the block; reads start two rows above it and
 *              each row is fetched with a full 16-byte vector load.
 * src_stride - byte distance between source rows.
 * dst        - destination; 8 bytes are written per row.
 * dst_stride - byte distance between destination rows.
 * filter     - six signed taps, read as three 16-bit pairs at byte offsets
 *              0, 2 and 4 (nothing past byte 5 is touched).
 * height     - number of rows; processed in groups of four (height >> 2
 *              groups).
 */
static void common_vt_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  int32_t src_stride_x2 = src_stride << 1;
  int32_t src_stride_x3 = src_stride_x2 + src_stride;
  int32_t src_stride_x4 = src_stride << 2;
  int32_t dst_stride_x2 = dst_stride << 1;
  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
  int32_t dst_stride_x4 = dst_stride << 2;
  __m128i src0, src1, src2, src3, src4, src7, src8, src9, src10;
  __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
  __m128i src109_r, filt0, filt1, filt2;
  __m128i tmp0, tmp1;
  __m128i out0_r, out1_r, out2_r, out3_r;

  src -= src_stride_x2;
  /* Load-and-broadcast each tap pair directly. A full 16-byte load of the
   * filter would read past the 6 taps actually needed (and past the end of
   * an 8-byte filter row). */
  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
  filt2 = __lsx_vldrepl_h(filter, 4);

  /* Prime the window with the first five rows. */
  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
            src_stride_x3, src0, src1, src2, src3);
  src += src_stride_x4;
  src4 = __lsx_vld(src, 0);
  src += src_stride;

  /* Flip the top bit of every pixel before the signed dot products. */
  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
            src1, src2, src3);
  src4 = __lsx_vxori_b(src4, 128);
  /* Byte-interleave vertically adjacent rows (low 8 bytes of each). */
  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src2, src1, src4, src3,
            src10_r, src32_r, src21_r, src43_r);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
              src_stride_x3, src7, src8, src9, src10);
    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
              src8, src9, src10);
    src += src_stride_x4;

    DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10, src9,
              src76_r, src87_r, src98_r, src109_r);
    out0_r = dpadd_h3(src10_r, src32_r, src76_r, filt0, filt1, filt2);
    out1_r = dpadd_h3(src21_r, src43_r, src87_r, filt0, filt1, filt2);
    out2_r = dpadd_h3(src32_r, src76_r, src98_r, filt0, filt1, filt2);
    out3_r = dpadd_h3(src43_r, src87_r, src109_r, filt0, filt1, filt2);
    /* Round, shift, saturate and pack two rows per vector, then undo the
     * bias. */
    DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r,
              out2_r, VP8_FILTER_SHIFT, tmp0, tmp1);
    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
    /* One 64-bit lane per output row. */
    __lsx_vstelm_d(tmp0, dst, 0, 0);
    __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
    __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
    __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
    dst += dst_stride_x4;

    /* Slide the window down by four rows. */
    src10_r = src76_r;
    src32_r = src98_r;
    src21_r = src87_r;
    src43_r = src109_r;
    src4 = src10;
  }
}
510 
/*
 * Vertical six-tap filter for 16-wide blocks, four output rows per iteration.
 *
 * src        - first pixel of the block; reads start two rows above it and
 *              each row is fetched with a 16-byte vector load.
 * src_stride - byte distance between source rows.
 * dst        - destination; 16 bytes are written per row.
 * dst_stride - byte distance between destination rows.
 * filter     - six signed taps, read as three 16-bit pairs at byte offsets
 *              0, 2 and 4 (nothing past byte 5 is touched).
 * height     - number of rows; processed in groups of four (height >> 2
 *              groups).
 */
static void common_vt_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  uint32_t loop_cnt;
  int32_t src_stride_x2 = src_stride << 1;
  int32_t src_stride_x3 = src_stride_x2 + src_stride;
  int32_t src_stride_x4 = src_stride << 2;
  int32_t dst_stride_x2 = dst_stride << 1;
  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
  int32_t dst_stride_x4 = dst_stride << 2;
  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
  __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
  __m128i src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
  __m128i src65_l, src87_l, filt0, filt1, filt2;
  __m128i tmp0, tmp1, tmp2, tmp3;
  __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

  src -= src_stride_x2;
  /* Load-and-broadcast each tap pair directly. A full 16-byte load of the
   * filter would read past the 6 taps actually needed (and past the end of
   * an 8-byte filter row). */
  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
  filt2 = __lsx_vldrepl_h(filter, 4);

  /* Prime the window with the first five rows. */
  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
            src_stride_x3, src0, src1, src2, src3);
  src += src_stride_x4;
  src4 = __lsx_vld(src, 0);
  src += src_stride;

  /* Flip the top bit of every pixel before the signed dot products. */
  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
            src1, src2, src3);
  src4 = __lsx_vxori_b(src4, 128);
  /* Byte-interleave vertically adjacent rows: _r from the low 8 bytes,
   * _l from the high 8 bytes. */
  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src4, src3, src2, src1,
            src10_r, src32_r, src43_r, src21_r);
  DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src4, src3, src2, src1,
            src10_l, src32_l, src43_l, src21_l);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
              src_stride_x3, src5, src6, src7, src8);
    src += src_stride_x4;

    DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
              src6, src7, src8);
    DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
              src54_r, src65_r, src76_r, src87_r);
    DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7,
              src54_l, src65_l, src76_l, src87_l);
    out0_r = dpadd_h3(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    out1_r = dpadd_h3(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    out2_r = dpadd_h3(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    out3_r = dpadd_h3(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    out0_l = dpadd_h3(src10_l, src32_l, src54_l, filt0, filt1, filt2);
    out1_l = dpadd_h3(src21_l, src43_l, src65_l, filt0, filt1, filt2);
    out2_l = dpadd_h3(src32_l, src54_l, src76_l, filt0, filt1, filt2);
    out3_l = dpadd_h3(src43_l, src65_l, src87_l, filt0, filt1, filt2);
    /* Round, shift, saturate and pack the two halves of each row into one
     * vector, then undo the bias. */
    DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l,
              out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT,
              out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0,
              tmp1, tmp2, tmp3);
    __lsx_vstx(tmp0, dst, 0);
    __lsx_vstx(tmp1, dst, dst_stride);
    __lsx_vstx(tmp2, dst, dst_stride_x2);
    __lsx_vstx(tmp3, dst, dst_stride_x3);
    dst += dst_stride_x4;

    /* Slide the window down by four rows. */
    src10_r = src54_r;
    src32_r = src76_r;
    src21_r = src65_r;
    src43_r = src87_r;
    src10_l = src54_l;
    src32_l = src76_l;
    src21_l = src65_l;
    src43_l = src87_l;
    src4 = src8;
  }
}
588 
/*
 * 2-D sub-pixel filter (6-tap horizontal followed by 6-tap vertical) for a
 * block that is 4 pixels wide.
 *
 * src, src_stride   Source block origin and row pitch. Rows -2 .. height + 2
 *                   are loaded, each as one 16-byte vector starting two pixels
 *                   to the left of src.
 * dst, dst_stride   Destination block origin and row pitch; 4 bytes are
 *                   stored per row.
 * filter_horiz      Horizontal taps: three 16-bit coefficient pairs are read
 *                   at byte offsets 0, 2 and 4 and replicated across a vector.
 * filter_vert       Vertical taps, read the same way.
 * height            Number of rows to produce. The loop runs height >> 2
 *                   times and emits four rows per pass, so a remainder that is
 *                   not a multiple of 4 is not written.
 */
static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, tmp0, tmp1;
  __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
  __m128i hz_out7, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
  /* Selects bytes 8..15 of the second vshuf_b operand followed by bytes 0..7
   * of the first one, i.e. it glues the upper half of one horizontal result
   * to the lower half of the next. */
  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
  int32_t src_stride_x2 = src_stride << 1;
  int32_t src_stride_x3 = src_stride_x2 + src_stride;

  /* Shuffle control taken from the first "4 width cases" row of the table. */
  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
  src -= 2;

  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
            filt_hz1);
  filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);
  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
            filt_vt1);
  filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);

  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);

  /* Prime the pipeline with source rows -2 .. 2. */
  DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1);
  src2 = __lsx_vld(src, 0);
  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4);
  src += src_stride_x3;

  /* Flip the top bit of every byte; the dot-product helpers work on signed
   * bytes. The bias is undone after the final narrowing below. */
  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
            src1, src2, src3);
  src4 = __lsx_vxori_b(src4, 128);

  /* Each horizontal call filters two source rows at once. */
  hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out2 = horiz_6tap_filt(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
  hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
                            filt_hz2);
  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    src5 = __lsx_vld(src, 0);
    src6 = __lsx_vldx(src, src_stride);
    src += src_stride_x2;

    DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6);
    hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);

    src7 = __lsx_vld(src, 0);
    src8 = __lsx_vldx(src, src_stride);
    src += src_stride_x2;

    DUP2_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src7, src8);
    hz_out7 = horiz_6tap_filt(src7, src8, mask0, mask1, mask2, filt_hz0,
                              filt_hz1, filt_hz2);
    hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff);

    /* Vertical pass: three interleaved row pairs feed each output pair. */
    out2 = __lsx_vpackev_b(hz_out5, hz_out4);
    tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);

    out3 = __lsx_vpackev_b(hz_out7, hz_out6);
    tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);

    /* Round, shift and saturate the 16-bit sums to bytes, then remove the
     * sign bias. The shift is the shared VP8 filter precision rather than a
     * bare literal, matching the other kernels in this file. */
    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, VP8_FILTER_SHIFT);
    tmp0 = __lsx_vxori_b(tmp0, 128);
    __lsx_vstelm_w(tmp0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(tmp0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(tmp0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(tmp0, dst, 0, 3);
    dst += dst_stride;

    /* Carry the most recent rows over to the next pass. */
    hz_out3 = hz_out7;
    out0 = out2;
    out1 = out3;
  }
}
674 
/*
 * 2-D sub-pixel filter (6-tap horizontal, then 6-tap vertical) for a block
 * that is 8 pixels wide.
 *
 * Source rows -2 .. height + 2 are fetched, each as one 16-byte vector that
 * starts two pixels to the left of src. Every loop pass turns four new source
 * rows into four 8-byte destination rows; the loop runs height >> 2 times.
 *
 * Naming: hN is the horizontally filtered row N of the current window and
 * pXY is the byte interleave of hX and hY (packev(hY, hX)).
 */
static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  const int32_t src_stride_x2 = src_stride << 1;
  const int32_t src_stride_x3 = src_stride_x2 + src_stride;
  const int32_t src_stride_x4 = src_stride << 2;
  const int32_t dst_stride_x2 = dst_stride << 1;
  const int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
  const int32_t dst_stride_x4 = dst_stride << 2;
  uint32_t groups_left;
  __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
  __m128i shuf0, shuf1, shuf2;
  __m128i coeffs, hz_c0, hz_c1, hz_c2, vt_c0, vt_c1, vt_c2;
  __m128i h0, h1, h2, h3, h4, h5, h6, h7, h8;
  __m128i p01, p12, p23, p34, p45, p56, p67, p78;
  __m128i sum0, sum1, sum2, sum3, pix01, pix23;

  /* Shuffle controls: the "8 width cases" row and its +2 / +4 shifted forms. */
  shuf0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
  shuf1 = __lsx_vaddi_bu(shuf0, 2);
  shuf2 = __lsx_vaddi_bu(shuf0, 4);

  /* Broadcast the three 16-bit coefficient pairs of each filter. */
  coeffs = __lsx_vld(filter_horiz, 0);
  hz_c0 = __lsx_vreplvei_h(coeffs, 0);
  hz_c1 = __lsx_vreplvei_h(coeffs, 1);
  hz_c2 = __lsx_vreplvei_h(coeffs, 2);

  coeffs = __lsx_vld(filter_vert, 0);
  vt_c0 = __lsx_vreplvei_h(coeffs, 0);
  vt_c1 = __lsx_vreplvei_h(coeffs, 1);
  vt_c2 = __lsx_vreplvei_h(coeffs, 2);

  /* Step back two columns and two rows, then load the five priming rows. */
  src -= (2 + src_stride_x2);
  row0 = __lsx_vldx(src, 0);
  row1 = __lsx_vldx(src, src_stride);
  row2 = __lsx_vldx(src, src_stride_x2);
  row3 = __lsx_vldx(src, src_stride_x3);
  src += src_stride_x4;
  row4 = __lsx_vldx(src, 0);
  src += src_stride;

  /* Toggle the top bit of each byte ahead of the signed dot products. */
  row0 = __lsx_vxori_b(row0, 128);
  row1 = __lsx_vxori_b(row1, 128);
  row2 = __lsx_vxori_b(row2, 128);
  row3 = __lsx_vxori_b(row3, 128);
  row4 = __lsx_vxori_b(row4, 128);

  h0 = horiz_6tap_filt(row0, row0, shuf0, shuf1, shuf2, hz_c0, hz_c1, hz_c2);
  h1 = horiz_6tap_filt(row1, row1, shuf0, shuf1, shuf2, hz_c0, hz_c1, hz_c2);
  h2 = horiz_6tap_filt(row2, row2, shuf0, shuf1, shuf2, hz_c0, hz_c1, hz_c2);
  h3 = horiz_6tap_filt(row3, row3, shuf0, shuf1, shuf2, hz_c0, hz_c1, hz_c2);
  h4 = horiz_6tap_filt(row4, row4, shuf0, shuf1, shuf2, hz_c0, hz_c1, hz_c2);

  p01 = __lsx_vpackev_b(h1, h0);
  p23 = __lsx_vpackev_b(h3, h2);
  p12 = __lsx_vpackev_b(h2, h1);
  p34 = __lsx_vpackev_b(h4, h3);

  for (groups_left = (height >> 2); groups_left != 0; --groups_left) {
    row5 = __lsx_vldx(src, 0);
    row6 = __lsx_vldx(src, src_stride);
    row7 = __lsx_vldx(src, src_stride_x2);
    row8 = __lsx_vldx(src, src_stride_x3);
    src += src_stride_x4;

    row5 = __lsx_vxori_b(row5, 128);
    row6 = __lsx_vxori_b(row6, 128);
    row7 = __lsx_vxori_b(row7, 128);
    row8 = __lsx_vxori_b(row8, 128);

    /* Output row 0 of this group. */
    h5 = horiz_6tap_filt(row5, row5, shuf0, shuf1, shuf2, hz_c0, hz_c1, hz_c2);
    p45 = __lsx_vpackev_b(h5, h4);
    sum0 = dpadd_h3(p01, p23, p45, vt_c0, vt_c1, vt_c2);

    /* Output row 1. */
    h6 = horiz_6tap_filt(row6, row6, shuf0, shuf1, shuf2, hz_c0, hz_c1, hz_c2);
    p56 = __lsx_vpackev_b(h6, h5);
    sum1 = dpadd_h3(p12, p34, p56, vt_c0, vt_c1, vt_c2);

    /* Output row 2. */
    h7 = horiz_6tap_filt(row7, row7, shuf0, shuf1, shuf2, hz_c0, hz_c1, hz_c2);
    p67 = __lsx_vpackev_b(h7, h6);
    sum2 = dpadd_h3(p23, p45, p67, vt_c0, vt_c1, vt_c2);

    /* Output row 3. */
    h8 = horiz_6tap_filt(row8, row8, shuf0, shuf1, shuf2, hz_c0, hz_c1, hz_c2);
    p78 = __lsx_vpackev_b(h8, h7);
    sum3 = dpadd_h3(p34, p56, p78, vt_c0, vt_c1, vt_c2);

    /* Narrow with rounding and saturation, two rows per vector, and undo the
     * sign bias. */
    pix01 = __lsx_vssrarni_b_h(sum1, sum0, VP8_FILTER_SHIFT);
    pix23 = __lsx_vssrarni_b_h(sum3, sum2, VP8_FILTER_SHIFT);
    pix01 = __lsx_vxori_b(pix01, 128);
    pix23 = __lsx_vxori_b(pix23, 128);

    __lsx_vstelm_d(pix01, dst, 0, 0);
    __lsx_vstelm_d(pix01, dst + dst_stride, 0, 1);
    __lsx_vstelm_d(pix23, dst + dst_stride_x2, 0, 0);
    __lsx_vstelm_d(pix23, dst + dst_stride_x3, 0, 1);
    dst += dst_stride_x4;

    /* Slide the window down by four rows. */
    h4 = h8;
    p01 = p45;
    p23 = p67;
    p12 = p56;
    p34 = p78;
  }
}
774 
/*
 * 16-pixel-wide 6-tap/6-tap 2-D filter: the block is processed as two
 * independent 8-pixel-wide halves, columns 0..7 first and then columns 8..15.
 */
static void common_hv_6ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_6ht_6vt_8w_lsx(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
785 
/*
 * 4-tap horizontal filter for a single 4x4 block.
 *
 * Four source rows are loaded (16 bytes each, starting one pixel to the left
 * of src), filtered together, and the four 4-byte results are written to
 * consecutive destination rows.
 */
static void common_hz_4t_4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  const int32_t src_stride_x2 = src_stride << 1;
  const int32_t src_stride_x3 = src_stride_x2 + src_stride;
  __m128i row0, row1, row2, row3;
  __m128i shuf0, shuf1, taps01, taps23;
  __m128i acc0, acc1, pixels;

  /* Shuffle controls from the first "4 width cases" row of the mask table. */
  shuf0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
  shuf1 = __lsx_vaddi_bu(shuf0, 2);

  /* Two replicated 16-bit coefficient pairs (byte offsets 0 and 2). */
  taps01 = __lsx_vldrepl_h(filter, 0);
  taps23 = __lsx_vldrepl_h(filter, 2);

  src -= 1;
  row0 = __lsx_vld(src, 0);
  row1 = __lsx_vldx(src, src_stride);
  row2 = __lsx_vldx(src, src_stride_x2);
  row3 = __lsx_vldx(src, src_stride_x3);

  /* Toggle the top bit of each byte before filtering. */
  row0 = __lsx_vxori_b(row0, 128);
  row1 = __lsx_vxori_b(row1, 128);
  row2 = __lsx_vxori_b(row2, 128);
  row3 = __lsx_vxori_b(row3, 128);
  HORIZ_4TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, taps01,
                             taps23, acc0, acc1);

  /* Round/shift/saturate to bytes and remove the bias again. */
  pixels = __lsx_vssrarni_b_h(acc1, acc0, VP8_FILTER_SHIFT);
  pixels = __lsx_vxori_b(pixels, 128);

  /* One 32-bit lane per destination row. */
  __lsx_vstelm_w(pixels, dst, 0, 0);
  dst += dst_stride;
  __lsx_vstelm_w(pixels, dst, 0, 1);
  dst += dst_stride;
  __lsx_vstelm_w(pixels, dst, 0, 2);
  dst += dst_stride;
  __lsx_vstelm_w(pixels, dst, 0, 3);
}
819 
/*
 * 4-tap horizontal filter for a single 4x8 block.
 *
 * The eight source rows are handled as two groups of four. Each row is loaded
 * as a 16-byte vector starting one pixel to the left of src; eight 4-byte
 * results are written to consecutive destination rows.
 */
static void common_hz_4t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter) {
  const int32_t src_stride_x2 = src_stride << 1;
  const int32_t src_stride_x3 = src_stride_x2 + src_stride;
  const int32_t src_stride_x4 = src_stride << 2;
  __m128i row0, row1, row2, row3;
  __m128i shuf0, shuf1, taps01, taps23;
  __m128i acc0, acc1, acc2, acc3, upper, lower;

  /* Shuffle controls from the first "4 width cases" row of the mask table. */
  shuf0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
  shuf1 = __lsx_vaddi_bu(shuf0, 2);

  taps01 = __lsx_vldrepl_h(filter, 0);
  taps23 = __lsx_vldrepl_h(filter, 2);

  src -= 1;

  /* First group: rows 0..3. */
  row0 = __lsx_vld(src, 0);
  row1 = __lsx_vldx(src, src_stride);
  row2 = __lsx_vldx(src, src_stride_x2);
  row3 = __lsx_vldx(src, src_stride_x3);
  src += src_stride_x4;
  row0 = __lsx_vxori_b(row0, 128);
  row1 = __lsx_vxori_b(row1, 128);
  row2 = __lsx_vxori_b(row2, 128);
  row3 = __lsx_vxori_b(row3, 128);
  HORIZ_4TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, taps01,
                             taps23, acc0, acc1);

  /* Second group: rows 4..7. */
  row0 = __lsx_vld(src, 0);
  row1 = __lsx_vldx(src, src_stride);
  row2 = __lsx_vldx(src, src_stride_x2);
  row3 = __lsx_vldx(src, src_stride_x3);
  row0 = __lsx_vxori_b(row0, 128);
  row1 = __lsx_vxori_b(row1, 128);
  row2 = __lsx_vxori_b(row2, 128);
  row3 = __lsx_vxori_b(row3, 128);
  HORIZ_4TAP_4WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, taps01,
                             taps23, acc2, acc3);

  /* Narrow each group to bytes and remove the sign bias. */
  upper = __lsx_vssrarni_b_h(acc1, acc0, VP8_FILTER_SHIFT);
  lower = __lsx_vssrarni_b_h(acc3, acc2, VP8_FILTER_SHIFT);
  upper = __lsx_vxori_b(upper, 128);
  lower = __lsx_vxori_b(lower, 128);

  __lsx_vstelm_w(upper, dst, 0, 0);
  dst += dst_stride;
  __lsx_vstelm_w(upper, dst, 0, 1);
  dst += dst_stride;
  __lsx_vstelm_w(upper, dst, 0, 2);
  dst += dst_stride;
  __lsx_vstelm_w(upper, dst, 0, 3);
  dst += dst_stride;

  __lsx_vstelm_w(lower, dst, 0, 0);
  dst += dst_stride;
  __lsx_vstelm_w(lower, dst, 0, 1);
  dst += dst_stride;
  __lsx_vstelm_w(lower, dst, 0, 2);
  dst += dst_stride;
  __lsx_vstelm_w(lower, dst, 0, 3);
}
871 
/*
 * Dispatch for the 4-pixel-wide, 4-tap horizontal filter: heights 4 and 8
 * have dedicated kernels. Any other height is ignored and nothing is written.
 */
static void common_hz_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  switch (height) {
    case 4:
      common_hz_4t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
      break;
    case 8:
      common_hz_4t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
      break;
    default:
      /* No kernel for this height. */
      break;
  }
}
881 
/*
 * 4-tap horizontal filter for an 8-pixel-wide block.
 *
 * Each pass loads four source rows (16 bytes each, starting one pixel to the
 * left of src) and stores four 8-byte destination rows. The loop runs
 * height >> 2 times.
 */
static void common_hz_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  const int32_t src_stride_x2 = src_stride << 1;
  const int32_t src_stride_x3 = src_stride_x2 + src_stride;
  const int32_t src_stride_x4 = src_stride << 2;
  const int32_t dst_stride_x2 = dst_stride << 1;
  const int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
  const int32_t dst_stride_x4 = dst_stride << 2;
  uint32_t groups_left;
  __m128i row0, row1, row2, row3;
  __m128i shuf0, shuf1, coeffs, taps01, taps23;
  __m128i acc0, acc1, acc2, acc3, pix01, pix23;

  /* Shuffle controls from the "8 width cases" row of the mask table. */
  shuf0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
  shuf1 = __lsx_vaddi_bu(shuf0, 2);

  /* Broadcast the two 16-bit coefficient pairs. */
  coeffs = __lsx_vld(filter, 0);
  taps01 = __lsx_vreplvei_h(coeffs, 0);
  taps23 = __lsx_vreplvei_h(coeffs, 1);

  src -= 1;

  for (groups_left = (height >> 2); groups_left != 0; --groups_left) {
    row0 = __lsx_vldx(src, 0);
    row1 = __lsx_vldx(src, src_stride);
    row2 = __lsx_vldx(src, src_stride_x2);
    row3 = __lsx_vldx(src, src_stride_x3);
    src += src_stride_x4;

    /* Toggle the top bit of each byte before filtering. */
    row0 = __lsx_vxori_b(row0, 128);
    row1 = __lsx_vxori_b(row1, 128);
    row2 = __lsx_vxori_b(row2, 128);
    row3 = __lsx_vxori_b(row3, 128);
    HORIZ_4TAP_8WID_4VECS_FILT(row0, row1, row2, row3, shuf0, shuf1, taps01,
                               taps23, acc0, acc1, acc2, acc3);

    /* Two rows per vector after narrowing; then remove the bias. */
    pix01 = __lsx_vssrarni_b_h(acc1, acc0, VP8_FILTER_SHIFT);
    pix23 = __lsx_vssrarni_b_h(acc3, acc2, VP8_FILTER_SHIFT);
    pix01 = __lsx_vxori_b(pix01, 128);
    pix23 = __lsx_vxori_b(pix23, 128);

    __lsx_vstelm_d(pix01, dst, 0, 0);
    __lsx_vstelm_d(pix01, dst + dst_stride, 0, 1);
    __lsx_vstelm_d(pix23, dst + dst_stride_x2, 0, 0);
    __lsx_vstelm_d(pix23, dst + dst_stride_x3, 0, 1);
    dst += dst_stride_x4;
  }
}
922 
/*
 * 4-tap horizontal filter for a 16-pixel-wide block.
 *
 * For each of four rows per pass, two overlapping 16-byte vectors are loaded:
 * one starting one pixel to the left of src ("left") and one 8 bytes further
 * on ("right"). Four full 16-byte destination rows are stored per pass; the
 * loop runs height >> 2 times.
 */
static void common_hz_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  const int32_t src_stride_x2 = src_stride << 1;
  const int32_t src_stride_x3 = src_stride_x2 + src_stride;
  const int32_t src_stride_x4 = src_stride << 2;
  const int32_t dst_stride_x2 = dst_stride << 1;
  const int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
  const int32_t dst_stride_x4 = dst_stride << 2;
  uint32_t groups_left;
  __m128i left0, left1, left2, left3;
  __m128i right0, right1, right2, right3;
  __m128i shuf0, shuf1, coeffs, taps01, taps23;
  __m128i acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  __m128i line0, line1, line2, line3;

  /* Shuffle controls from the "8 width cases" row of the mask table. */
  shuf0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
  shuf1 = __lsx_vaddi_bu(shuf0, 2);

  coeffs = __lsx_vld(filter, 0);
  taps01 = __lsx_vreplvei_h(coeffs, 0);
  taps23 = __lsx_vreplvei_h(coeffs, 1);

  src -= 1;

  for (groups_left = (height >> 2); groups_left != 0; --groups_left) {
    left0 = __lsx_vldx(src, 0);
    left1 = __lsx_vldx(src, src_stride);
    left2 = __lsx_vldx(src, src_stride_x2);
    left3 = __lsx_vldx(src, src_stride_x3);

    right0 = __lsx_vldx(src + 8, 0);
    right1 = __lsx_vldx(src + 8, src_stride);
    right2 = __lsx_vldx(src + 8, src_stride_x2);
    right3 = __lsx_vldx(src + 8, src_stride_x3);
    src += src_stride_x4;

    /* Toggle the top bit of each byte before filtering. */
    left0 = __lsx_vxori_b(left0, 128);
    right0 = __lsx_vxori_b(right0, 128);
    left1 = __lsx_vxori_b(left1, 128);
    right1 = __lsx_vxori_b(right1, 128);
    left2 = __lsx_vxori_b(left2, 128);
    right2 = __lsx_vxori_b(right2, 128);
    left3 = __lsx_vxori_b(left3, 128);
    right3 = __lsx_vxori_b(right3, 128);

    HORIZ_4TAP_8WID_4VECS_FILT(left0, right0, left1, right1, shuf0, shuf1,
                               taps01, taps23, acc0, acc1, acc2, acc3);
    HORIZ_4TAP_8WID_4VECS_FILT(left2, right2, left3, right3, shuf0, shuf1,
                               taps01, taps23, acc4, acc5, acc6, acc7);

    /* Merge the left/right halves of every row while narrowing to bytes. */
    line0 = __lsx_vssrarni_b_h(acc1, acc0, VP8_FILTER_SHIFT);
    line1 = __lsx_vssrarni_b_h(acc3, acc2, VP8_FILTER_SHIFT);
    line2 = __lsx_vssrarni_b_h(acc5, acc4, VP8_FILTER_SHIFT);
    line3 = __lsx_vssrarni_b_h(acc7, acc6, VP8_FILTER_SHIFT);

    line0 = __lsx_vxori_b(line0, 128);
    line1 = __lsx_vxori_b(line1, 128);
    line2 = __lsx_vxori_b(line2, 128);
    line3 = __lsx_vxori_b(line3, 128);

    __lsx_vstx(line0, dst, 0);
    __lsx_vstx(line1, dst, dst_stride);
    __lsx_vstx(line2, dst, dst_stride_x2);
    __lsx_vstx(line3, dst, dst_stride_x3);
    dst += dst_stride_x4;
  }
}
972 
/*
 * 4-tap vertical filter for a 4-pixel-wide block.
 *
 * Source rows -1 .. height + 1 are loaded as 16-byte vectors at src's column.
 * Adjacent rows are byte-interleaved and two such interleaves are packed into
 * one vector, so each filter call yields two output rows. Four 4-byte rows
 * are stored per pass; the loop runs height >> 2 times.
 */
static void common_vt_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  const int32_t src_stride_x2 = src_stride << 1;
  const int32_t src_stride_x3 = src_stride_x2 + src_stride;
  uint32_t groups_left;
  __m128i row_above, row_cur, row_last;
  __m128i row_a, row_b, row_c;
  __m128i ilv0, ilv1;
  __m128i quad_prev, quad_mid;
  __m128i taps01, taps23, sum_lo, sum_hi, pixels;

  taps01 = __lsx_vldrepl_h(filter, 0);
  taps23 = __lsx_vldrepl_h(filter, 2);

  /* Priming rows -1, 0 and 1. */
  row_above = __lsx_vldx(src, -src_stride);
  row_last = __lsx_vldx(src, src_stride);
  row_cur = __lsx_vld(src, 0);
  src += src_stride_x2;

  ilv0 = __lsx_vilvl_b(row_cur, row_above);
  ilv1 = __lsx_vilvl_b(row_last, row_cur);
  quad_prev = __lsx_vilvl_d(ilv1, ilv0);
  quad_prev = __lsx_vxori_b(quad_prev, 128);

  for (groups_left = (height >> 2); groups_left != 0; --groups_left) {
    row_a = __lsx_vld(src, 0);
    row_b = __lsx_vldx(src, src_stride);
    row_c = __lsx_vldx(src, src_stride_x2);
    src += src_stride_x3;

    /* First two output rows of the group. */
    ilv0 = __lsx_vilvl_b(row_a, row_last);
    ilv1 = __lsx_vilvl_b(row_b, row_a);
    quad_mid = __lsx_vilvl_d(ilv1, ilv0);
    quad_mid = __lsx_vxori_b(quad_mid, 128);
    sum_lo = filt_4tap_dpadd_h(quad_prev, quad_mid, taps01, taps23);

    /* One more row is needed for the last two output rows; it also becomes
     * the carried row for the next pass. */
    row_last = __lsx_vld(src, 0);
    src += src_stride;
    ilv0 = __lsx_vilvl_b(row_c, row_b);
    ilv1 = __lsx_vilvl_b(row_last, row_c);
    quad_prev = __lsx_vilvl_d(ilv1, ilv0);
    quad_prev = __lsx_vxori_b(quad_prev, 128);
    sum_hi = filt_4tap_dpadd_h(quad_mid, quad_prev, taps01, taps23);

    /* Narrow to bytes and remove the sign bias. */
    pixels = __lsx_vssrarni_b_h(sum_hi, sum_lo, VP8_FILTER_SHIFT);
    pixels = __lsx_vxori_b(pixels, 128);

    __lsx_vstelm_w(pixels, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(pixels, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(pixels, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(pixels, dst, 0, 3);
    dst += dst_stride;
  }
}
1021 
/*
 * 4-tap vertical filter for an 8-pixel-wide block.
 *
 * Source rows -1 .. height + 1 are loaded as 16-byte vectors; only the low
 * eight bytes of each take part (low-half interleaves). pairXY below is the
 * byte interleave of window rows Y and X. Four 8-byte rows are stored per
 * pass; the loop runs height >> 2 times.
 */
static void common_vt_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                uint8_t *RESTRICT dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height) {
  const int32_t src_stride_x2 = src_stride << 1;
  const int32_t src_stride_x3 = src_stride_x2 + src_stride;
  const int32_t src_stride_x4 = src_stride << 2;
  const int32_t dst_stride_x2 = dst_stride << 1;
  const int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
  const int32_t dst_stride_x4 = dst_stride << 2;
  uint32_t groups_left;
  __m128i row0, row1, row2, row3, row4, row5, row6;
  __m128i pair10, pair21, pair32, pair43, pair54, pair65;
  __m128i coeffs, taps01, taps23;
  __m128i sum0, sum1, sum2, sum3, pix01, pix23;

  coeffs = __lsx_vld(filter, 0);
  taps01 = __lsx_vreplvei_h(coeffs, 0);
  taps23 = __lsx_vreplvei_h(coeffs, 1);

  /* Start one row above the block and load the three priming rows. */
  src -= src_stride;
  row0 = __lsx_vldx(src, 0);
  row1 = __lsx_vldx(src, src_stride);
  row2 = __lsx_vldx(src, src_stride_x2);
  src += src_stride_x3;

  row0 = __lsx_vxori_b(row0, 128);
  row1 = __lsx_vxori_b(row1, 128);
  row2 = __lsx_vxori_b(row2, 128);
  pair10 = __lsx_vilvl_b(row1, row0);
  pair21 = __lsx_vilvl_b(row2, row1);

  for (groups_left = (height >> 2); groups_left != 0; --groups_left) {
    row3 = __lsx_vldx(src, 0);
    row4 = __lsx_vldx(src, src_stride);
    row5 = __lsx_vldx(src, src_stride_x2);
    row6 = __lsx_vldx(src, src_stride_x3);
    src += src_stride_x4;

    row3 = __lsx_vxori_b(row3, 128);
    row4 = __lsx_vxori_b(row4, 128);
    row5 = __lsx_vxori_b(row5, 128);
    row6 = __lsx_vxori_b(row6, 128);

    pair32 = __lsx_vilvl_b(row3, row2);
    pair43 = __lsx_vilvl_b(row4, row3);
    pair54 = __lsx_vilvl_b(row5, row4);
    pair65 = __lsx_vilvl_b(row6, row5);

    sum0 = filt_4tap_dpadd_h(pair10, pair32, taps01, taps23);
    sum1 = filt_4tap_dpadd_h(pair21, pair43, taps01, taps23);
    sum2 = filt_4tap_dpadd_h(pair32, pair54, taps01, taps23);
    sum3 = filt_4tap_dpadd_h(pair43, pair65, taps01, taps23);

    /* Two rows per vector after narrowing; then remove the sign bias. */
    pix01 = __lsx_vssrarni_b_h(sum1, sum0, VP8_FILTER_SHIFT);
    pix23 = __lsx_vssrarni_b_h(sum3, sum2, VP8_FILTER_SHIFT);
    pix01 = __lsx_vxori_b(pix01, 128);
    pix23 = __lsx_vxori_b(pix23, 128);

    __lsx_vstelm_d(pix01, dst, 0, 0);
    __lsx_vstelm_d(pix01, dst + dst_stride, 0, 1);
    __lsx_vstelm_d(pix23, dst + dst_stride_x2, 0, 0);
    __lsx_vstelm_d(pix23, dst + dst_stride_x3, 0, 1);
    dst += dst_stride_x4;

    /* Slide the window down by four rows. */
    pair10 = pair54;
    pair21 = pair65;
    row2 = row6;
  }
}
1076 
/*
 * 4-tap vertical filter for a 16-pixel-wide block.
 *
 * Source rows -1 .. height + 1 are loaded as 16-byte vectors. Adjacent rows
 * are byte-interleaved separately for the low (loXY) and high (hiXY) eight
 * columns, filtered, and recombined into full 16-byte destination rows. Four
 * rows are stored per pass; the loop runs height >> 2 times.
 */
static void common_vt_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                 uint8_t *RESTRICT dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height) {
  const int32_t src_stride_x2 = src_stride << 1;
  const int32_t src_stride_x3 = src_stride_x2 + src_stride;
  const int32_t src_stride_x4 = src_stride << 2;
  const int32_t dst_stride_x2 = dst_stride << 1;
  const int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
  const int32_t dst_stride_x4 = dst_stride << 2;
  uint32_t groups_left;
  __m128i row0, row1, row2, row3, row4, row5, row6;
  __m128i lo10, lo21, lo32, lo43, lo54, lo65;
  __m128i hi10, hi21, hi32, hi43, hi54, hi65;
  __m128i coeffs, taps01, taps23;
  __m128i sum0_lo, sum1_lo, sum2_lo, sum3_lo;
  __m128i sum0_hi, sum1_hi, sum2_hi, sum3_hi;
  __m128i line0, line1, line2, line3;

  coeffs = __lsx_vld(filter, 0);
  taps01 = __lsx_vreplvei_h(coeffs, 0);
  taps23 = __lsx_vreplvei_h(coeffs, 1);

  /* Start one row above the block and load the three priming rows. */
  src -= src_stride;
  row0 = __lsx_vldx(src, 0);
  row1 = __lsx_vldx(src, src_stride);
  row2 = __lsx_vldx(src, src_stride_x2);
  src += src_stride_x3;

  row0 = __lsx_vxori_b(row0, 128);
  row1 = __lsx_vxori_b(row1, 128);
  row2 = __lsx_vxori_b(row2, 128);

  lo10 = __lsx_vilvl_b(row1, row0);
  lo21 = __lsx_vilvl_b(row2, row1);
  hi10 = __lsx_vilvh_b(row1, row0);
  hi21 = __lsx_vilvh_b(row2, row1);

  for (groups_left = (height >> 2); groups_left != 0; --groups_left) {
    row3 = __lsx_vldx(src, 0);
    row4 = __lsx_vldx(src, src_stride);
    row5 = __lsx_vldx(src, src_stride_x2);
    row6 = __lsx_vldx(src, src_stride_x3);
    src += src_stride_x4;

    row3 = __lsx_vxori_b(row3, 128);
    row4 = __lsx_vxori_b(row4, 128);
    row5 = __lsx_vxori_b(row5, 128);
    row6 = __lsx_vxori_b(row6, 128);

    lo32 = __lsx_vilvl_b(row3, row2);
    lo43 = __lsx_vilvl_b(row4, row3);
    lo54 = __lsx_vilvl_b(row5, row4);
    lo65 = __lsx_vilvl_b(row6, row5);

    hi32 = __lsx_vilvh_b(row3, row2);
    hi43 = __lsx_vilvh_b(row4, row3);
    hi54 = __lsx_vilvh_b(row5, row4);
    hi65 = __lsx_vilvh_b(row6, row5);

    sum0_lo = filt_4tap_dpadd_h(lo10, lo32, taps01, taps23);
    sum1_lo = filt_4tap_dpadd_h(lo21, lo43, taps01, taps23);
    sum2_lo = filt_4tap_dpadd_h(lo32, lo54, taps01, taps23);
    sum3_lo = filt_4tap_dpadd_h(lo43, lo65, taps01, taps23);

    sum0_hi = filt_4tap_dpadd_h(hi10, hi32, taps01, taps23);
    sum1_hi = filt_4tap_dpadd_h(hi21, hi43, taps01, taps23);
    sum2_hi = filt_4tap_dpadd_h(hi32, hi54, taps01, taps23);
    sum3_hi = filt_4tap_dpadd_h(hi43, hi65, taps01, taps23);

    /* Join the low/high column halves of each row while narrowing. */
    line0 = __lsx_vssrarni_b_h(sum0_hi, sum0_lo, VP8_FILTER_SHIFT);
    line1 = __lsx_vssrarni_b_h(sum1_hi, sum1_lo, VP8_FILTER_SHIFT);
    line2 = __lsx_vssrarni_b_h(sum2_hi, sum2_lo, VP8_FILTER_SHIFT);
    line3 = __lsx_vssrarni_b_h(sum3_hi, sum3_lo, VP8_FILTER_SHIFT);

    line0 = __lsx_vxori_b(line0, 128);
    line1 = __lsx_vxori_b(line1, 128);
    line2 = __lsx_vxori_b(line2, 128);
    line3 = __lsx_vxori_b(line3, 128);

    __lsx_vstx(line0, dst, 0);
    __lsx_vstx(line1, dst, dst_stride);
    __lsx_vstx(line2, dst, dst_stride_x2);
    __lsx_vstx(line3, dst, dst_stride_x3);
    dst += dst_stride_x4;

    /* Slide the window down by four rows. */
    lo10 = lo54;
    lo21 = lo65;
    hi10 = hi54;
    hi21 = hi65;
    row2 = row6;
  }
}
1143 
/*
 * 2-D sub-pixel filter (4-tap horizontal followed by 4-tap vertical) for a
 * block that is 4 pixels wide.
 *
 * src, src_stride   Source block origin and row pitch. Rows -1 .. height + 1
 *                   are loaded, each as one 16-byte vector starting one pixel
 *                   to the left of src.
 * dst, dst_stride   Destination block origin and row pitch; 4 bytes are
 *                   stored per row.
 * filter_horiz      Horizontal taps: two 16-bit coefficient pairs are read at
 *                   byte offsets 0 and 2 and replicated across a vector.
 * filter_vert       Vertical taps, read the same way.
 * height            Number of rows to produce. The loop runs height >> 2
 *                   times and emits four rows per pass, so a remainder that is
 *                   not a multiple of 4 is not written.
 */
static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  uint32_t loop_cnt;
  __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
  __m128i mask0, mask1, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
  /* Selects bytes 8..15 of the second vshuf_b operand followed by bytes 0..7
   * of the first one, i.e. it glues the upper half of one horizontal result
   * to the lower half of the next. */
  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
  int32_t src_stride_x2 = src_stride << 1;
  int32_t src_stride_x3 = src_stride_x2 + src_stride;
  int32_t src_stride_x4 = src_stride << 2;

  /* Shuffle control taken from the first "4 width cases" row of the table. */
  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
  src -= 1;

  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
            filt_hz1);
  mask1 = __lsx_vaddi_bu(mask0, 2);

  /* Prime the pipeline with source rows -1, 0 and 1. */
  src1 = __lsx_vld(src, 0);
  DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2);
  src += src_stride_x2;

  /* Flip the top bit of every byte before filtering; the bias is undone
   * after the final narrowing below. */
  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
  src2 = __lsx_vxori_b(src2, 128);
  /* Each horizontal call filters two source rows at once. */
  hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
  hz_out1 = horiz_4tap_filt(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);

  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
            filt_vt1);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    src3 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5);
    src6 = __lsx_vldx(src, src_stride_x3);
    src += src_stride_x4;

    DUP2_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src3, src4);
    hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
    hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff);
    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);

    DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6);
    hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
    vec2 = __lsx_vpackev_b(hz_out5, hz_out4);
    tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);

    /* Round, shift and saturate the 16-bit sums to bytes, then remove the
     * sign bias. The shift is the shared VP8 filter precision rather than a
     * bare literal, matching the other kernels in this file. */
    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, VP8_FILTER_SHIFT);
    tmp0 = __lsx_vxori_b(tmp0, 128);
    __lsx_vstelm_w(tmp0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(tmp0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(tmp0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(tmp0, dst, 0, 3);
    dst += dst_stride;

    /* Carry the most recent rows over to the next pass. */
    hz_out1 = hz_out5;
    vec0 = vec2;
  }
}
1211 
common_hv_4ht_4vt_8w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter_horiz,const int8_t * filter_vert,int32_t height)1212 static inline void common_hv_4ht_4vt_8w_lsx(
1213     uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
1214     int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
1215     int32_t height) {
1216   uint32_t loop_cnt;
1217   int32_t src_stride_x2 = src_stride << 1;
1218   int32_t src_stride_x3 = src_stride_x2 + src_stride;
1219   int32_t src_stride_x4 = src_stride << 2;
1220   int32_t dst_stride_x2 = dst_stride << 1;
1221   int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
1222   int32_t dst_stride_x4 = dst_stride << 2;
1223   __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1224   __m128i mask0, mask1, out0, out1;
1225   __m128i filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
1226   __m128i hz_out0, hz_out1, hz_out2, hz_out3;
1227   __m128i vec0, vec1, vec2, vec3, vec4;
1228 
1229   mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
1230   src -= 1 + src_stride;
1231 
1232   filt = __lsx_vld(filter_horiz, 0);
1233   DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
1234   mask1 = __lsx_vaddi_bu(mask0, 2);
1235 
1236   DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
1237   src2 = __lsx_vldx(src, src_stride_x2);
1238   src += src_stride_x3;
1239 
1240   DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
1241   src2 = __lsx_vxori_b(src2, 128);
1242   hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
1243   hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
1244   hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
1245   DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);
1246 
1247   filt = __lsx_vld(filter_vert, 0);
1248   DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
1249 
1250   for (loop_cnt = (height >> 2); loop_cnt--;) {
1251     DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
1252               src_stride_x3, src3, src4, src5, src6);
1253     src += src_stride_x4;
1254 
1255     DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
1256               src4, src5, src6);
1257     hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1258     vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
1259     tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
1260 
1261     hz_out0 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1262     vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
1263     tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1);
1264 
1265     hz_out1 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1266     vec4 = __lsx_vpackev_b(hz_out1, hz_out0);
1267     tmp2 = filt_4tap_dpadd_h(vec1, vec4, filt_vt0, filt_vt1);
1268 
1269     hz_out2 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1270     DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec0, vec1);
1271     tmp3 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
1272 
1273     DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
1274     DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
1275     __lsx_vstelm_d(out0, dst, 0, 0);
1276     __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
1277     __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0);
1278     __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1);
1279     dst += dst_stride_x4;
1280 
1281     vec0 = vec4;
1282     vec2 = vec1;
1283   }
1284 }
1285 
/*
 * 16-pixel-wide variant of the 4-tap horizontal / 4-tap vertical predictor.
 * The work is delegated to the 8-wide routine, once for columns 0..7 and
 * once for columns 8..15; every other argument is forwarded unchanged.
 */
static void common_hv_4ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_4ht_4vt_8w_lsx(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
1296 
/*
 * 4-pixel-wide two-pass predictor: 6-tap horizontal filter followed by a
 * 4-tap vertical filter.
 *
 * src, src_stride  source pixels.  Rows -1 .. height + 1 (relative to src)
 *                  are read, each as a 16-byte vector starting two bytes to
 *                  the left of src.
 * dst, dst_stride  destination; one 32-bit word is stored per output row.
 * filter_horiz     horizontal coefficients, read as 16-bit pairs at byte
 *                  offsets 0, 2 and 4.
 * filter_vert      vertical coefficients, read as 16-bit pairs at byte
 *                  offsets 0 and 2.
 * height           rows to produce; handled four at a time (height >> 2
 *                  loop iterations).
 */
static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  const int32_t stride2 = src_stride << 1;
  const int32_t stride3 = stride2 + src_stride;
  const int32_t stride4 = src_stride << 2;
  uint32_t groups = height >> 2;
  __m128i row_a, row_b, row_c, row_d, row_e, row_f, row_g;
  __m128i hcoef0, hcoef1, hcoef2, vcoef0, vcoef1;
  __m128i sel0, sel1, sel2;
  __m128i hz_first, hz_carry, hz_mid0, hz_new0, hz_mid1, hz_new1;
  __m128i pair_prev, pair_mid, pair_next, res0, res1;
  /* Byte indices 8..23 of a two-vector concatenation: used with vshuf_b to
   * splice the upper half of one filtered vector onto the lower half of the
   * next one. */
  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };

  /* Second row of the mask table ("4 width cases"); the +2 / +4 variants
   * address the next two tap pairs. */
  sel0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
  sel1 = __lsx_vaddi_bu(sel0, 2);
  sel2 = __lsx_vaddi_bu(sel0, 4);

  hcoef0 = __lsx_vldrepl_h(filter_horiz, 0);
  hcoef1 = __lsx_vldrepl_h(filter_horiz, 2);
  hcoef2 = __lsx_vldrepl_h(filter_horiz, 4);
  vcoef0 = __lsx_vldrepl_h(filter_vert, 0);
  vcoef1 = __lsx_vldrepl_h(filter_vert, 2);

  /* Prologue: the row above, the current row and the row below. */
  src -= 2;
  row_a = __lsx_vldx(src, -src_stride);
  row_b = __lsx_vld(src, 0);
  row_c = __lsx_vldx(src, src_stride);
  src += stride2;

  /* Flip the top bit of every byte; the same XOR is applied again to the
   * narrowed results before they are stored. */
  row_a = __lsx_vxori_b(row_a, 128);
  row_b = __lsx_vxori_b(row_b, 128);
  row_c = __lsx_vxori_b(row_c, 128);

  hz_first = horiz_6tap_filt(row_a, row_b, sel0, sel1, sel2, hcoef0, hcoef1,
                             hcoef2);
  hz_carry = horiz_6tap_filt(row_b, row_c, sel0, sel1, sel2, hcoef0, hcoef1,
                             hcoef2);
  pair_prev = __lsx_vpackev_b(hz_carry, hz_first);

  while (groups--) {
    row_d = __lsx_vld(src, 0);
    row_e = __lsx_vldx(src, src_stride);
    row_f = __lsx_vldx(src, stride2);
    row_g = __lsx_vldx(src, stride3);
    src += stride4;

    row_d = __lsx_vxori_b(row_d, 128);
    row_e = __lsx_vxori_b(row_e, 128);
    row_f = __lsx_vxori_b(row_f, 128);
    row_g = __lsx_vxori_b(row_g, 128);

    /* First half of the group (stored to output rows 0 and 1). */
    hz_new0 = horiz_6tap_filt(row_d, row_e, sel0, sel1, sel2, hcoef0, hcoef1,
                              hcoef2);
    hz_mid0 = __lsx_vshuf_b(hz_new0, hz_carry, shuff);
    pair_mid = __lsx_vpackev_b(hz_new0, hz_mid0);
    res0 = filt_4tap_dpadd_h(pair_prev, pair_mid, vcoef0, vcoef1);

    /* Second half of the group (stored to output rows 2 and 3). */
    hz_new1 = horiz_6tap_filt(row_f, row_g, sel0, sel1, sel2, hcoef0, hcoef1,
                              hcoef2);
    hz_mid1 = __lsx_vshuf_b(hz_new1, hz_new0, shuff);
    pair_next = __lsx_vpackev_b(hz_new1, hz_mid1);
    res1 = filt_4tap_dpadd_h(pair_mid, pair_next, vcoef0, vcoef1);

    /* Rounding shift by 7 with saturating narrow to bytes, then undo the
     * XOR bias. */
    res0 = __lsx_vssrarni_b_h(res0, res0, 7);
    res1 = __lsx_vssrarni_b_h(res1, res1, 7);
    res0 = __lsx_vxori_b(res0, 128);
    res1 = __lsx_vxori_b(res1, 128);

    __lsx_vstelm_w(res0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(res0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(res1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(res1, dst, 0, 1);
    dst += dst_stride;

    /* Carry the newest filtered rows into the next group. */
    hz_carry = hz_new1;
    pair_prev = pair_next;
  }
}
1372 
/*
 * 8-pixel-wide two-pass predictor: 6-tap horizontal filter followed by a
 * 4-tap vertical filter.
 *
 * src, src_stride  source pixels.  Rows -1 .. height + 1 (relative to src)
 *                  are read, each as a 16-byte vector starting two bytes to
 *                  the left of src.
 * dst, dst_stride  destination; one 64-bit element is stored per output row.
 * filter_horiz     horizontal coefficients; a full 16-byte vector is loaded
 *                  and 16-bit elements 0, 1 and 2 are broadcast.
 * filter_vert      vertical coefficients; a full 16-byte vector is loaded
 *                  and 16-bit elements 0 and 1 are broadcast.
 * height           rows to produce; handled four at a time (height >> 2
 *                  loop iterations).
 */
static inline void common_hv_6ht_4vt_8w_lsx(
    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
    int32_t height) {
  const int32_t stride2 = src_stride << 1;
  const int32_t stride3 = stride2 + src_stride;
  const int32_t stride4 = src_stride << 2;
  uint32_t groups = height >> 2;
  __m128i row0, row1, row2, row3, row4, row5, row6;
  __m128i coef, hcoef0, hcoef1, hcoef2, vcoef0, vcoef1;
  __m128i sel0, sel1, sel2;
  __m128i hz_a, hz_b, hz_last, hz_r0, hz_r1, hz_r2;
  __m128i pair_lo, pair_hi, mid0, mid1;
  __m128i acc0, acc1, acc2, acc3, packed01, packed23;

  /* First row of the mask table ("8 width cases"); the +2 / +4 variants
   * address the next two tap pairs. */
  sel0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
  sel1 = __lsx_vaddi_bu(sel0, 2);
  sel2 = __lsx_vaddi_bu(sel0, 4);

  coef = __lsx_vld(filter_horiz, 0);
  hcoef0 = __lsx_vreplvei_h(coef, 0);
  hcoef1 = __lsx_vreplvei_h(coef, 1);
  hcoef2 = __lsx_vreplvei_h(coef, 2);

  coef = __lsx_vld(filter_vert, 0);
  vcoef0 = __lsx_vreplvei_h(coef, 0);
  vcoef1 = __lsx_vreplvei_h(coef, 1);

  /* Prologue: start one row up and two columns left, load three rows. */
  src -= (2 + src_stride);
  row0 = __lsx_vldx(src, 0);
  row1 = __lsx_vldx(src, src_stride);
  row2 = __lsx_vldx(src, stride2);
  src += stride3;

  /* Flip the top bit of every byte; undone after narrowing below. */
  row0 = __lsx_vxori_b(row0, 128);
  row1 = __lsx_vxori_b(row1, 128);
  row2 = __lsx_vxori_b(row2, 128);

  hz_a = horiz_6tap_filt(row0, row0, sel0, sel1, sel2, hcoef0, hcoef1, hcoef2);
  hz_b = horiz_6tap_filt(row1, row1, sel0, sel1, sel2, hcoef0, hcoef1, hcoef2);
  hz_last =
      horiz_6tap_filt(row2, row2, sel0, sel1, sel2, hcoef0, hcoef1, hcoef2);

  /* Byte-interleaved pairs of consecutive filtered rows feeding the
   * vertical taps: pair_lo serves output rows 0/2, pair_hi rows 1/3. */
  pair_lo = __lsx_vpackev_b(hz_b, hz_a);
  pair_hi = __lsx_vpackev_b(hz_last, hz_b);

  while (groups--) {
    row3 = __lsx_vldx(src, 0);
    row4 = __lsx_vldx(src, src_stride);
    row5 = __lsx_vldx(src, stride2);
    row6 = __lsx_vldx(src, stride3);
    src += stride4;

    row3 = __lsx_vxori_b(row3, 128);
    row4 = __lsx_vxori_b(row4, 128);
    row5 = __lsx_vxori_b(row5, 128);
    row6 = __lsx_vxori_b(row6, 128);

    /* Output row 0 of the group. */
    hz_r0 =
        horiz_6tap_filt(row3, row3, sel0, sel1, sel2, hcoef0, hcoef1, hcoef2);
    mid0 = __lsx_vpackev_b(hz_r0, hz_last);
    acc0 = filt_4tap_dpadd_h(pair_lo, mid0, vcoef0, vcoef1);

    /* Output row 1. */
    hz_r1 =
        horiz_6tap_filt(row4, row4, sel0, sel1, sel2, hcoef0, hcoef1, hcoef2);
    mid1 = __lsx_vpackev_b(hz_r1, hz_r0);
    acc1 = filt_4tap_dpadd_h(pair_hi, mid1, vcoef0, vcoef1);

    /* Output row 2; the new pair also becomes next group's pair_lo. */
    hz_r2 =
        horiz_6tap_filt(row5, row5, sel0, sel1, sel2, hcoef0, hcoef1, hcoef2);
    pair_lo = __lsx_vpackev_b(hz_r2, hz_r1);
    acc2 = filt_4tap_dpadd_h(mid0, pair_lo, vcoef0, vcoef1);

    /* Output row 3; the new pair also becomes next group's pair_hi. */
    hz_last =
        horiz_6tap_filt(row6, row6, sel0, sel1, sel2, hcoef0, hcoef1, hcoef2);
    pair_hi = __lsx_vpackev_b(hz_last, hz_r2);
    acc3 = filt_4tap_dpadd_h(mid1, pair_hi, vcoef0, vcoef1);

    /* Rounding shift by 7 with saturating narrow (two rows per vector),
     * then undo the XOR bias. */
    packed01 = __lsx_vssrarni_b_h(acc1, acc0, 7);
    packed23 = __lsx_vssrarni_b_h(acc3, acc2, 7);
    packed01 = __lsx_vxori_b(packed01, 128);
    packed23 = __lsx_vxori_b(packed23, 128);

    __lsx_vstelm_d(packed01, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(packed01, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_d(packed23, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(packed23, dst, 0, 1);
    dst += dst_stride;
  }
}
1452 
/*
 * 16-pixel-wide variant of the 6-tap horizontal / 4-tap vertical predictor.
 * Runs the 8-wide routine on columns 0..7 and then on columns 8..15 with
 * otherwise identical arguments.
 */
static void common_hv_6ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_6ht_4vt_8w_lsx(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
1463 
/*
 * 4-pixel-wide two-pass predictor: 4-tap horizontal filter followed by a
 * 6-tap vertical filter.
 *
 * src, src_stride  source pixels.  Rows -2 .. height + 2 (relative to src)
 *                  are read, each as a 16-byte vector starting one byte to
 *                  the left of src.
 * dst, dst_stride  destination; one 32-bit word is stored per output row.
 * filter_horiz     horizontal coefficients, read as 16-bit pairs at byte
 *                  offsets 0 and 2.
 * filter_vert      vertical coefficients, read as 16-bit pairs at byte
 *                  offsets 0, 2 and 4.
 * height           rows to produce; handled four at a time (height >> 2
 *                  loop iterations).
 */
static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                     uint8_t *RESTRICT dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
                                     int32_t height) {
  const int32_t stride2 = src_stride << 1;
  const int32_t stride3 = stride2 + src_stride;
  const int32_t stride4 = src_stride << 2;
  uint32_t groups = height >> 2;
  __m128i row_m2, row_m1, row_0, row_p1, row_p2;
  __m128i row5, row6, row7, row8;
  __m128i hcoef0, hcoef1, vcoef0, vcoef1, vcoef2, sel0, sel1;
  __m128i hz_a, hz_b, hz_c, hz_carry, hz_e, hz_f, hz_g, hz_h;
  __m128i pair0, pair1, pair2, pair3, acc0, acc1;
  /* Byte indices 8..23 of a two-vector concatenation: used with vshuf_b to
   * splice the upper half of one filtered vector onto the lower half of the
   * next one. */
  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };

  /* Second row of the mask table ("4 width cases"). */
  sel0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
  sel1 = __lsx_vaddi_bu(sel0, 2);

  hcoef0 = __lsx_vldrepl_h(filter_horiz, 0);
  hcoef1 = __lsx_vldrepl_h(filter_horiz, 2);
  vcoef0 = __lsx_vldrepl_h(filter_vert, 0);
  vcoef1 = __lsx_vldrepl_h(filter_vert, 2);
  vcoef2 = __lsx_vldrepl_h(filter_vert, 4);

  /* Prologue: five rows centred on the current one. */
  src -= 1;
  row_m2 = __lsx_vldx(src, -stride2);
  row_m1 = __lsx_vldx(src, -src_stride);
  row_0 = __lsx_vld(src, 0);
  row_p1 = __lsx_vldx(src, src_stride);
  row_p2 = __lsx_vldx(src, stride2);
  src += stride3;

  /* Flip the top bit of every byte; undone after narrowing below. */
  row_m2 = __lsx_vxori_b(row_m2, 128);
  row_m1 = __lsx_vxori_b(row_m1, 128);
  row_0 = __lsx_vxori_b(row_0, 128);
  row_p1 = __lsx_vxori_b(row_p1, 128);
  row_p2 = __lsx_vxori_b(row_p2, 128);

  hz_a = horiz_4tap_filt(row_m2, row_m1, sel0, sel1, hcoef0, hcoef1);
  hz_c = horiz_4tap_filt(row_0, row_p1, sel0, sel1, hcoef0, hcoef1);
  hz_carry = horiz_4tap_filt(row_p1, row_p2, sel0, sel1, hcoef0, hcoef1);
  hz_b = __lsx_vshuf_b(hz_c, hz_a, shuff);
  pair0 = __lsx_vpackev_b(hz_b, hz_a);
  pair1 = __lsx_vpackev_b(hz_carry, hz_c);

  while (groups--) {
    row5 = __lsx_vld(src, 0);
    row6 = __lsx_vldx(src, src_stride);
    row7 = __lsx_vldx(src, stride2);
    row8 = __lsx_vldx(src, stride3);
    src += stride4;

    row5 = __lsx_vxori_b(row5, 128);
    row6 = __lsx_vxori_b(row6, 128);
    row7 = __lsx_vxori_b(row7, 128);
    row8 = __lsx_vxori_b(row8, 128);

    /* First half of the group (stored to output rows 0 and 1). */
    hz_f = horiz_4tap_filt(row5, row6, sel0, sel1, hcoef0, hcoef1);
    hz_e = __lsx_vshuf_b(hz_f, hz_carry, shuff);
    pair2 = __lsx_vpackev_b(hz_f, hz_e);
    acc0 = dpadd_h3(pair0, pair1, pair2, vcoef0, vcoef1, vcoef2);

    /* Second half of the group (stored to output rows 2 and 3). */
    hz_h = horiz_4tap_filt(row7, row8, sel0, sel1, hcoef0, hcoef1);
    hz_g = __lsx_vshuf_b(hz_h, hz_f, shuff);
    pair3 = __lsx_vpackev_b(hz_h, hz_g);
    acc1 = dpadd_h3(pair1, pair2, pair3, vcoef0, vcoef1, vcoef2);

    /* Rounding shift by 7 with saturating narrow of both halves into one
     * vector (four 32-bit words, one per row), then undo the XOR bias. */
    acc0 = __lsx_vssrarni_b_h(acc1, acc0, 7);
    acc0 = __lsx_vxori_b(acc0, 128);

    __lsx_vstelm_w(acc0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(acc0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(acc0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(acc0, dst, 0, 3);
    dst += dst_stride;

    /* Slide the vertical window down by four rows. */
    hz_carry = hz_h;
    pair0 = pair2;
    pair1 = pair3;
  }
}
1539 
/*
 * 8-pixel-wide two-pass predictor: 4-tap horizontal filter followed by a
 * 6-tap vertical filter.
 *
 * src, src_stride  source pixels.  Rows -2 .. height + 2 (relative to src)
 *                  are read, each as a 16-byte vector starting one byte to
 *                  the left of src.
 * dst, dst_stride  destination; one 64-bit element is stored per output row.
 * filter_horiz     horizontal coefficients; a full 16-byte vector is loaded
 *                  and 16-bit elements 0 and 1 are broadcast.
 * filter_vert      vertical coefficients; a full 16-byte vector is loaded
 *                  and 16-bit elements 0, 1 and 2 are broadcast.
 * height           rows to produce; handled four at a time (height >> 2
 *                  loop iterations).
 */
static inline void common_hv_4ht_6vt_8w_lsx(
    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
    int32_t height) {
  const int32_t stride2 = src_stride << 1;
  const int32_t stride3 = stride2 + src_stride;
  const int32_t stride4 = src_stride << 2;
  uint32_t groups = height >> 2;
  __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
  __m128i coef, hcoef0, hcoef1, vcoef0, vcoef1, vcoef2, sel0, sel1;
  __m128i hz0, hz1, hz2, hz3, hz_prev, hz5, hz6, hz7, hz8;
  __m128i even0, even1, even2, even3, odd0, odd1, odd2, odd3;
  __m128i acc0, acc1, acc2, acc3, packed01, packed23;

  /* First row of the mask table ("8 width cases"). */
  sel0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
  sel1 = __lsx_vaddi_bu(sel0, 2);

  coef = __lsx_vld(filter_horiz, 0);
  hcoef0 = __lsx_vreplvei_h(coef, 0);
  hcoef1 = __lsx_vreplvei_h(coef, 1);

  coef = __lsx_vld(filter_vert, 0);
  vcoef0 = __lsx_vreplvei_h(coef, 0);
  vcoef1 = __lsx_vreplvei_h(coef, 1);
  vcoef2 = __lsx_vreplvei_h(coef, 2);

  /* Prologue: start two rows up and one column left, load five rows. */
  src -= 1 + stride2;
  row0 = __lsx_vldx(src, 0);
  row1 = __lsx_vldx(src, src_stride);
  row2 = __lsx_vldx(src, stride2);
  row3 = __lsx_vldx(src, stride3);
  src += stride4;
  row4 = __lsx_vld(src, 0);
  src += src_stride;

  /* Flip the top bit of every byte; undone after narrowing below. */
  row0 = __lsx_vxori_b(row0, 128);
  row1 = __lsx_vxori_b(row1, 128);
  row2 = __lsx_vxori_b(row2, 128);
  row3 = __lsx_vxori_b(row3, 128);
  row4 = __lsx_vxori_b(row4, 128);

  hz0 = horiz_4tap_filt(row0, row0, sel0, sel1, hcoef0, hcoef1);
  hz1 = horiz_4tap_filt(row1, row1, sel0, sel1, hcoef0, hcoef1);
  hz2 = horiz_4tap_filt(row2, row2, sel0, sel1, hcoef0, hcoef1);
  hz3 = horiz_4tap_filt(row3, row3, sel0, sel1, hcoef0, hcoef1);
  hz_prev = horiz_4tap_filt(row4, row4, sel0, sel1, hcoef0, hcoef1);

  /* Two chains of byte-interleaved row pairs: "even" feeds output rows
   * 0 and 2 of each group, "odd" feeds output rows 1 and 3. */
  even0 = __lsx_vpackev_b(hz1, hz0);
  even1 = __lsx_vpackev_b(hz3, hz2);
  odd0 = __lsx_vpackev_b(hz2, hz1);
  odd1 = __lsx_vpackev_b(hz_prev, hz3);

  while (groups--) {
    row5 = __lsx_vldx(src, 0);
    row6 = __lsx_vldx(src, src_stride);
    row7 = __lsx_vldx(src, stride2);
    row8 = __lsx_vldx(src, stride3);
    src += stride4;

    row5 = __lsx_vxori_b(row5, 128);
    row6 = __lsx_vxori_b(row6, 128);
    row7 = __lsx_vxori_b(row7, 128);
    row8 = __lsx_vxori_b(row8, 128);

    /* Output row 0 of the group. */
    hz5 = horiz_4tap_filt(row5, row5, sel0, sel1, hcoef0, hcoef1);
    even2 = __lsx_vpackev_b(hz5, hz_prev);
    acc0 = dpadd_h3(even0, even1, even2, vcoef0, vcoef1, vcoef2);

    /* Output row 1. */
    hz6 = horiz_4tap_filt(row6, row6, sel0, sel1, hcoef0, hcoef1);
    odd2 = __lsx_vpackev_b(hz6, hz5);
    acc1 = dpadd_h3(odd0, odd1, odd2, vcoef0, vcoef1, vcoef2);

    /* Output row 2. */
    hz7 = horiz_4tap_filt(row7, row7, sel0, sel1, hcoef0, hcoef1);
    even3 = __lsx_vpackev_b(hz7, hz6);
    acc2 = dpadd_h3(even1, even2, even3, vcoef0, vcoef1, vcoef2);

    /* Output row 3. */
    hz8 = horiz_4tap_filt(row8, row8, sel0, sel1, hcoef0, hcoef1);
    odd3 = __lsx_vpackev_b(hz8, hz7);
    acc3 = dpadd_h3(odd1, odd2, odd3, vcoef0, vcoef1, vcoef2);

    /* Rounding shift by 7 with saturating narrow (two rows per vector),
     * then undo the XOR bias. */
    packed01 = __lsx_vssrarni_b_h(acc1, acc0, 7);
    packed23 = __lsx_vssrarni_b_h(acc3, acc2, 7);
    packed01 = __lsx_vxori_b(packed01, 128);
    packed23 = __lsx_vxori_b(packed23, 128);

    __lsx_vstelm_d(packed01, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(packed01, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_d(packed23, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(packed23, dst, 0, 1);
    dst += dst_stride;

    /* Slide both pair chains and the last filtered row down by four. */
    hz_prev = hz8;
    even0 = even2;
    even1 = even3;
    odd0 = odd2;
    odd1 = odd3;
  }
}
1622 
/*
 * 16-pixel-wide variant of the 4-tap horizontal / 6-tap vertical predictor.
 * Runs the 8-wide routine on columns 0..7 and then on columns 8..15 with
 * otherwise identical arguments.
 */
static void common_hv_4ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                      uint8_t *RESTRICT dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height) {
  int32_t col;

  for (col = 0; col < 16; col += 8) {
    common_hv_4ht_6vt_8w_lsx(src + col, src_stride, dst + col, dst_stride,
                             filter_horiz, filter_vert, height);
  }
}
1633 
1634 typedef void (*PVp8SixtapPredictFunc1)(
1635     uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
1636     int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
1637     int32_t height);
1638 
1639 typedef void (*PVp8SixtapPredictFunc2)(uint8_t *RESTRICT src,
1640                                        int32_t src_stride,
1641                                        uint8_t *RESTRICT dst,
1642                                        int32_t dst_stride, const int8_t *filter,
1643                                        int32_t height);
1644 
/*
 * 4x4 sub-pixel prediction entry point.
 *
 * xoffset / yoffset select a row of vp8_subpel_filters_lsx (row = offset - 1).
 * An offset of 0 means no filtering in that direction.  Even offsets use the
 * 6-tap routines with the row as-is; odd offsets use the 4-tap routines with
 * the row pointer advanced by one coefficient (those rows start with a 0).
 *
 * Offsets outside 0..7 have no table row; dst is left untouched for them.
 *
 * The filter rows are looked up only after the offset is known to be in
 * 1..7, so no pointer outside the table is ever formed (the table has no
 * row for offset 0).
 */
void vp8_sixtap_predict4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                               int32_t xoffset, int32_t yoffset,
                               uint8_t *RESTRICT dst, int32_t dst_stride) {
  /* Indexed by ((xoffset & 1) << 1) | (yoffset & 1). */
  static PVp8SixtapPredictFunc1 Predict4x4Funcs1[4] = {
    common_hv_6ht_6vt_4w_lsx,
    common_hv_6ht_4vt_4w_lsx,
    common_hv_4ht_6vt_4w_lsx,
    common_hv_4ht_4vt_4w_lsx,
  };

  /* [0..1]: vertical only, indexed by (yoffset & 1);
   * [2..3]: horizontal only, indexed by 2 + (xoffset & 1). */
  static PVp8SixtapPredictFunc2 Predict4x4Funcs2[4] = { common_vt_6t_4w_lsx,
                                                        common_vt_4t_4w_lsx,
                                                        common_hz_6t_4w_lsx,
                                                        common_hz_4t_4w_lsx };

  if (xoffset < 0 || xoffset > 7 || yoffset < 0 || yoffset > 7) return;

  if (xoffset && yoffset) {
    const int8_t *h_filter =
        vp8_subpel_filters_lsx[xoffset - 1] + (xoffset & 1);
    const int8_t *v_filter =
        vp8_subpel_filters_lsx[yoffset - 1] + (yoffset & 1);

    Predict4x4Funcs1[((xoffset & 1) << 1) | (yoffset & 1)](
        src, src_stride, dst, dst_stride, h_filter, v_filter, 4);
  } else if (yoffset) {
    const int8_t *v_filter =
        vp8_subpel_filters_lsx[yoffset - 1] + (yoffset & 1);

    Predict4x4Funcs2[yoffset & 1](src, src_stride, dst, dst_stride, v_filter,
                                  4);
  } else if (xoffset) {
    const int8_t *h_filter =
        vp8_subpel_filters_lsx[xoffset - 1] + (xoffset & 1);

    Predict4x4Funcs2[2 + (xoffset & 1)](src, src_stride, dst, dst_stride,
                                        h_filter, 4);
  } else {
    /* Both offsets are zero: plain copy, one 32-bit word per row. */
    int32_t row;
    __m128i tp0;

    for (row = 0; row < 4; ++row) {
      tp0 = __lsx_vldrepl_w(src + row * src_stride, 0);
      __lsx_vstelm_w(tp0, dst + row * dst_stride, 0, 0);
    }
  }
}
1742 
/*
 * 8x8 sub-pixel prediction entry point.
 *
 * xoffset / yoffset select a row of vp8_subpel_filters_lsx (row = offset - 1).
 * An offset of 0 means no filtering in that direction.  Even offsets use the
 * 6-tap routines with the row as-is; odd offsets use the 4-tap routines with
 * the row pointer advanced by one coefficient (those rows start with a 0).
 *
 * Offsets outside 0..7 have no table row; dst is left untouched for them.
 *
 * The filter rows are looked up only after the offset is known to be in
 * 1..7, so no pointer outside the table is ever formed (the table has no
 * row for offset 0).
 */
void vp8_sixtap_predict8x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                               int32_t xoffset, int32_t yoffset,
                               uint8_t *RESTRICT dst, int32_t dst_stride) {
  /* Indexed by ((xoffset & 1) << 1) | (yoffset & 1). */
  static PVp8SixtapPredictFunc1 Predict8x8Funcs1[4] = {
    common_hv_6ht_6vt_8w_lsx,
    common_hv_6ht_4vt_8w_lsx,
    common_hv_4ht_6vt_8w_lsx,
    common_hv_4ht_4vt_8w_lsx,
  };

  /* [0..1]: vertical only, indexed by (yoffset & 1);
   * [2..3]: horizontal only, indexed by 2 + (xoffset & 1). */
  static PVp8SixtapPredictFunc2 Predict8x8Funcs2[4] = { common_vt_6t_8w_lsx,
                                                        common_vt_4t_8w_lsx,
                                                        common_hz_6t_8w_lsx,
                                                        common_hz_4t_8w_lsx };

  if (xoffset < 0 || xoffset > 7 || yoffset < 0 || yoffset > 7) return;

  if (xoffset && yoffset) {
    const int8_t *h_filter =
        vp8_subpel_filters_lsx[xoffset - 1] + (xoffset & 1);
    const int8_t *v_filter =
        vp8_subpel_filters_lsx[yoffset - 1] + (yoffset & 1);

    Predict8x8Funcs1[((xoffset & 1) << 1) | (yoffset & 1)](
        src, src_stride, dst, dst_stride, h_filter, v_filter, 8);
  } else if (yoffset) {
    const int8_t *v_filter =
        vp8_subpel_filters_lsx[yoffset - 1] + (yoffset & 1);

    Predict8x8Funcs2[yoffset & 1](src, src_stride, dst, dst_stride, v_filter,
                                  8);
  } else if (xoffset) {
    const int8_t *h_filter =
        vp8_subpel_filters_lsx[xoffset - 1] + (xoffset & 1);

    Predict8x8Funcs2[2 + (xoffset & 1)](src, src_stride, dst, dst_stride,
                                        h_filter, 8);
  } else {
    /* Both offsets are zero: plain block copy. */
    vp8_copy_mem8x8(src, src_stride, dst, dst_stride);
  }
}
1823 
/*
 * 16x16 sub-pixel prediction entry point.
 *
 * xoffset / yoffset select a row of vp8_subpel_filters_lsx (row = offset - 1).
 * An offset of 0 means no filtering in that direction.  Even offsets use the
 * 6-tap routines with the row as-is; odd offsets use the 4-tap routines with
 * the row pointer advanced by one coefficient (those rows start with a 0).
 *
 * Offsets outside 0..7 have no table row; dst is left untouched for them.
 *
 * The filter rows are looked up only after the offset is known to be in
 * 1..7, so no pointer outside the table is ever formed (the table has no
 * row for offset 0).
 */
void vp8_sixtap_predict16x16_lsx(uint8_t *RESTRICT src, int32_t src_stride,
                                 int32_t xoffset, int32_t yoffset,
                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
  /* Indexed by ((xoffset & 1) << 1) | (yoffset & 1). */
  static PVp8SixtapPredictFunc1 Predict16x16Funcs1[4] = {
    common_hv_6ht_6vt_16w_lsx,
    common_hv_6ht_4vt_16w_lsx,
    common_hv_4ht_6vt_16w_lsx,
    common_hv_4ht_4vt_16w_lsx,
  };

  /* [0..1]: vertical only, indexed by (yoffset & 1);
   * [2..3]: horizontal only, indexed by 2 + (xoffset & 1). */
  static PVp8SixtapPredictFunc2 Predict16x16Funcs2[4] = {
    common_vt_6t_16w_lsx, common_vt_4t_16w_lsx, common_hz_6t_16w_lsx,
    common_hz_4t_16w_lsx
  };

  if (xoffset < 0 || xoffset > 7 || yoffset < 0 || yoffset > 7) return;

  if (xoffset && yoffset) {
    const int8_t *h_filter =
        vp8_subpel_filters_lsx[xoffset - 1] + (xoffset & 1);
    const int8_t *v_filter =
        vp8_subpel_filters_lsx[yoffset - 1] + (yoffset & 1);

    Predict16x16Funcs1[((xoffset & 1) << 1) | (yoffset & 1)](
        src, src_stride, dst, dst_stride, h_filter, v_filter, 16);
  } else if (yoffset) {
    const int8_t *v_filter =
        vp8_subpel_filters_lsx[yoffset - 1] + (yoffset & 1);

    Predict16x16Funcs2[yoffset & 1](src, src_stride, dst, dst_stride, v_filter,
                                    16);
  } else if (xoffset) {
    const int8_t *h_filter =
        vp8_subpel_filters_lsx[xoffset - 1] + (xoffset & 1);

    Predict16x16Funcs2[2 + (xoffset & 1)](src, src_stride, dst, dst_stride,
                                          h_filter, 16);
  } else {
    /* Both offsets are zero: plain block copy. */
    vp8_copy_mem16x16(src, src_stride, dst, dst_stride);
  }
}
1905