1 /*
2 * Copyright (c) 2021 Loongson Technology Corporation Limited
3 * Contributed by Lu Wang <[email protected]>
4 *
5 * Use of this source code is governed by a BSD-style license
6 * that can be found in the LICENSE file in the root of the source
7 * tree. An additional intellectual property rights grant can be found
8 * in the file PATENTS. All contributing project authors may
9 * be found in the AUTHORS file in the root of the source tree.
10 */
11
12 #include "./vp8_rtcd.h"
13 #include "vp8/common/filter.h"
14 #include "vpx_ports/mem.h"
15 #include "vpx_util/loongson_intrinsics.h"
16
17 DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_lsx[7][8]) = {
18 { 0, -6, 123, 12, -1, 0, 0, 0 },
19 { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
20 { 0, -9, 93, 50, -6, 0, 0, 0 },
21 { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
22 { 0, -6, 50, 93, -9, 0, 0, 0 },
23 { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
24 { 0, -1, 12, 123, -6, 0, 0, 0 },
25 };
26
27 static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
28 /* 8 width cases */
29 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
30 /* 4 width cases */
31 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
32 /* 4 width cases */
33 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
34 };
35
dpadd_h3(__m128i in0,__m128i in1,__m128i in2,__m128i coeff0,__m128i coeff1,__m128i coeff2)36 static INLINE __m128i dpadd_h3(__m128i in0, __m128i in1, __m128i in2,
37 __m128i coeff0, __m128i coeff1, __m128i coeff2) {
38 __m128i out0_m;
39
40 out0_m = __lsx_vdp2_h_b(in0, coeff0);
41 out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1);
42 out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2);
43
44 return out0_m;
45 }
46
horiz_6tap_filt(__m128i src0,__m128i src1,__m128i mask0,__m128i mask1,__m128i mask2,__m128i filt_h0,__m128i filt_h1,__m128i filt_h2)47 static INLINE __m128i horiz_6tap_filt(__m128i src0, __m128i src1, __m128i mask0,
48 __m128i mask1, __m128i mask2,
49 __m128i filt_h0, __m128i filt_h1,
50 __m128i filt_h2) {
51 __m128i vec0_m, vec1_m, vec2_m;
52 __m128i hz_out_m;
53
54 DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m,
55 vec1_m);
56 vec2_m = __lsx_vshuf_b(src1, src0, mask2);
57 hz_out_m = dpadd_h3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2);
58 hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT);
59 hz_out_m = __lsx_vsat_h(hz_out_m, 7);
60
61 return hz_out_m;
62 }
63
filt_4tap_dpadd_h(__m128i vec0,__m128i vec1,__m128i filt0,__m128i filt1)64 static INLINE __m128i filt_4tap_dpadd_h(__m128i vec0, __m128i vec1,
65 __m128i filt0, __m128i filt1) {
66 __m128i tmp_m;
67
68 tmp_m = __lsx_vdp2_h_b(vec0, filt0);
69 tmp_m = __lsx_vdp2add_h_b(tmp_m, vec1, filt1);
70
71 return tmp_m;
72 }
73
horiz_4tap_filt(__m128i src0,__m128i src1,__m128i mask0,__m128i mask1,__m128i filt_h0,__m128i filt_h1)74 static INLINE __m128i horiz_4tap_filt(__m128i src0, __m128i src1, __m128i mask0,
75 __m128i mask1, __m128i filt_h0,
76 __m128i filt_h1) {
77 __m128i vec0_m, vec1_m, hz_out_m;
78
79 DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m,
80 vec1_m);
81 hz_out_m = filt_4tap_dpadd_h(vec0_m, vec1_m, filt_h0, filt_h1);
82 hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT);
83 hz_out_m = __lsx_vsat_h(hz_out_m, 7);
84
85 return hz_out_m;
86 }
87
88 #define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
89 mask2, filt0, filt1, filt2, out0, out1) \
90 do { \
91 __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \
92 \
93 DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \
94 vec1_m); \
95 DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1); \
96 DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \
97 vec3_m); \
98 DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \
99 out0, out1); \
100 DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src3, src2, mask2, vec4_m, \
101 vec5_m); \
102 DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \
103 out0, out1); \
104 } while (0)
105
106 #define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
107 mask2, filt0, filt1, filt2, out0, out1, \
108 out2, out3) \
109 do { \
110 __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
111 \
112 DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \
113 vec1_m); \
114 DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m, \
115 vec3_m); \
116 DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \
117 vec3_m, filt0, out0, out1, out2, out3); \
118 DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m, \
119 vec1_m); \
120 DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m, \
121 vec3_m); \
122 DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, vec4_m, \
123 vec5_m); \
124 DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2, vec6_m, \
125 vec7_m); \
126 DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \
127 out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, \
128 out3); \
129 DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \
130 out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2, \
131 out3); \
132 } while (0)
133
134 #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
135 filt0, filt1, out0, out1) \
136 do { \
137 __m128i vec0_m, vec1_m, vec2_m, vec3_m; \
138 \
139 DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \
140 vec1_m); \
141 DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1); \
142 DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \
143 vec3_m); \
144 DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \
145 out0, out1); \
146 } while (0)
147
148 #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \
149 filt0, filt1, out0, out1, out2, out3) \
150 do { \
151 __m128i vec0_m, vec1_m, vec2_m, vec3_m; \
152 \
153 DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \
154 vec1_m); \
155 DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m, \
156 vec3_m); \
157 DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \
158 vec3_m, filt0, out0, out1, out2, out3); \
159 DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m, \
160 vec1_m); \
161 DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m, \
162 vec3_m); \
163 DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \
164 out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2, \
165 out3); \
166 } while (0)
167
common_hz_6t_4x4_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter)168 static inline void common_hz_6t_4x4_lsx(uint8_t *RESTRICT src,
169 int32_t src_stride,
170 uint8_t *RESTRICT dst,
171 int32_t dst_stride,
172 const int8_t *filter) {
173 __m128i src0, src1, src2, src3, filt0, filt1, filt2;
174 __m128i mask0, mask1, mask2, out0, out1;
175 int32_t src_stride_x2 = src_stride << 1;
176 int32_t src_stride_x3 = src_stride_x2 + src_stride;
177
178 mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
179 src -= 2;
180
181 DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
182 filt2 = __lsx_vldrepl_h(filter, 4);
183
184 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
185 src0 = __lsx_vld(src, 0);
186 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
187 src3 = __lsx_vldx(src, src_stride_x3);
188
189 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
190 src1, src2, src3);
191 HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
192 filt1, filt2, out0, out1);
193 out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
194 out0 = __lsx_vxori_b(out0, 128);
195
196 __lsx_vstelm_w(out0, dst, 0, 0);
197 dst += dst_stride;
198 __lsx_vstelm_w(out0, dst, 0, 1);
199 dst += dst_stride;
200 __lsx_vstelm_w(out0, dst, 0, 2);
201 dst += dst_stride;
202 __lsx_vstelm_w(out0, dst, 0, 3);
203 }
204
common_hz_6t_4x8_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter)205 static void common_hz_6t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
206 uint8_t *RESTRICT dst, int32_t dst_stride,
207 const int8_t *filter) {
208 __m128i src0, src1, src2, src3, filt0, filt1, filt2;
209 __m128i mask0, mask1, mask2, out0, out1, out2, out3;
210 int32_t src_stride_x2 = src_stride << 1;
211 int32_t src_stride_x3 = src_stride_x2 + src_stride;
212 int32_t src_stride_x4 = src_stride_x2 << 1;
213
214 mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
215 src -= 2;
216
217 DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
218 filt2 = __lsx_vldrepl_h(filter, 4);
219 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
220
221 src0 = __lsx_vld(src, 0);
222 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
223 src3 = __lsx_vldx(src, src_stride_x3);
224 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
225 src1, src2, src3);
226 src += src_stride_x4;
227 HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
228 filt1, filt2, out0, out1);
229
230 src0 = __lsx_vld(src, 0);
231 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
232 src3 = __lsx_vldx(src, src_stride_x3);
233 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
234 src1, src2, src3);
235 HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
236 filt1, filt2, out2, out3);
237
238 DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
239 VP8_FILTER_SHIFT, out0, out1);
240 DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
241 __lsx_vstelm_w(out0, dst, 0, 0);
242 dst += dst_stride;
243 __lsx_vstelm_w(out0, dst, 0, 1);
244 dst += dst_stride;
245 __lsx_vstelm_w(out0, dst, 0, 2);
246 dst += dst_stride;
247 __lsx_vstelm_w(out0, dst, 0, 3);
248 dst += dst_stride;
249
250 __lsx_vstelm_w(out1, dst, 0, 0);
251 dst += dst_stride;
252 __lsx_vstelm_w(out1, dst, 0, 1);
253 dst += dst_stride;
254 __lsx_vstelm_w(out1, dst, 0, 2);
255 dst += dst_stride;
256 __lsx_vstelm_w(out1, dst, 0, 3);
257 }
258
common_hz_6t_4w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter,int32_t height)259 static void common_hz_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
260 uint8_t *RESTRICT dst, int32_t dst_stride,
261 const int8_t *filter, int32_t height) {
262 if (height == 4) {
263 common_hz_6t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
264 } else if (height == 8) {
265 common_hz_6t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
266 }
267 }
268
common_hz_6t_8w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter,int32_t height)269 static void common_hz_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
270 uint8_t *RESTRICT dst, int32_t dst_stride,
271 const int8_t *filter, int32_t height) {
272 uint32_t loop_cnt;
273 int32_t src_stride_x2 = src_stride << 1;
274 int32_t src_stride_x3 = src_stride_x2 + src_stride;
275 int32_t src_stride_x4 = src_stride << 2;
276 int32_t dst_stride_x2 = dst_stride << 1;
277 int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
278 int32_t dst_stride_x4 = dst_stride << 2;
279 __m128i src0, src1, src2, src3, filt0, filt1, filt2;
280 __m128i mask0, mask1, mask2, tmp0, tmp1;
281 __m128i filt, out0, out1, out2, out3;
282
283 mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
284 src -= 2;
285
286 filt = __lsx_vld(filter, 0);
287 DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
288 filt2 = __lsx_vreplvei_h(filt, 2);
289 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
290
291 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
292 src_stride_x3, src0, src1, src2, src3);
293 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
294 src1, src2, src3);
295 src += src_stride_x4;
296 HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
297 filt1, filt2, out0, out1, out2, out3);
298 DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
299 VP8_FILTER_SHIFT, tmp0, tmp1);
300 DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
301 __lsx_vstelm_d(tmp0, dst, 0, 0);
302 __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
303 __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
304 __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
305 dst += dst_stride_x4;
306
307 for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
308 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
309 src_stride_x3, src0, src1, src2, src3);
310 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
311 src1, src2, src3);
312 src += src_stride_x4;
313 HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
314 filt0, filt1, filt2, out0, out1, out2, out3);
315 DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
316 VP8_FILTER_SHIFT, tmp0, tmp1);
317 DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
318 __lsx_vstelm_d(tmp0, dst, 0, 0);
319 __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
320 __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
321 __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
322 dst += dst_stride_x4;
323 }
324 }
325
common_hz_6t_16w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter,int32_t height)326 static void common_hz_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
327 uint8_t *RESTRICT dst, int32_t dst_stride,
328 const int8_t *filter, int32_t height) {
329 uint32_t loop_cnt;
330 int32_t src_stride_x2 = src_stride << 1;
331 int32_t src_stride_x3 = src_stride_x2 + src_stride;
332 int32_t src_stride_x4 = src_stride << 2;
333 int32_t dst_stride_x2 = dst_stride << 1;
334 int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
335 int32_t dst_stride_x4 = dst_stride << 2;
336 __m128i src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
337 __m128i mask0, mask1, mask2, out;
338 __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7;
339
340 mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
341 src -= 2;
342
343 filt = __lsx_vld(filter, 0);
344 DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
345 filt2 = __lsx_vreplvei_h(filt, 2);
346 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
347
348 for (loop_cnt = (height >> 2); loop_cnt--;) {
349 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
350 src_stride_x3, src0, src2, src4, src6);
351 src += 8;
352 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
353 src_stride_x3, src1, src3, src5, src7);
354 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
355 src1, src2, src3);
356 DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4,
357 src5, src6, src7);
358 src += src_stride_x4 - 8;
359
360 HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
361 filt0, filt1, filt2, out0, out1, out2, out3);
362 HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
363 filt0, filt1, filt2, out4, out5, out6, out7);
364 DUP4_ARG2(__lsx_vsrari_h, out0, VP8_FILTER_SHIFT, out1, VP8_FILTER_SHIFT,
365 out2, VP8_FILTER_SHIFT, out3, VP8_FILTER_SHIFT, out0, out1, out2,
366 out3);
367 DUP4_ARG2(__lsx_vsrari_h, out4, VP8_FILTER_SHIFT, out5, VP8_FILTER_SHIFT,
368 out6, VP8_FILTER_SHIFT, out7, VP8_FILTER_SHIFT, out4, out5, out6,
369 out7);
370 DUP4_ARG2(__lsx_vsat_h, out0, 7, out1, 7, out2, 7, out3, 7, out0, out1,
371 out2, out3);
372 DUP4_ARG2(__lsx_vsat_h, out4, 7, out5, 7, out6, 7, out7, 7, out4, out5,
373 out6, out7);
374 out = __lsx_vpickev_b(out1, out0);
375 out = __lsx_vxori_b(out, 128);
376 __lsx_vst(out, dst, 0);
377 out = __lsx_vpickev_b(out3, out2);
378 out = __lsx_vxori_b(out, 128);
379 __lsx_vstx(out, dst, dst_stride);
380 out = __lsx_vpickev_b(out5, out4);
381 out = __lsx_vxori_b(out, 128);
382 __lsx_vstx(out, dst, dst_stride_x2);
383 out = __lsx_vpickev_b(out7, out6);
384 out = __lsx_vxori_b(out, 128);
385 __lsx_vstx(out, dst, dst_stride_x3);
386 dst += dst_stride_x4;
387 }
388 }
389
common_vt_6t_4w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter,int32_t height)390 static void common_vt_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
391 uint8_t *RESTRICT dst, int32_t dst_stride,
392 const int8_t *filter, int32_t height) {
393 uint32_t loop_cnt;
394 __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
395 __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
396 __m128i src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
397 __m128i out0, out1;
398 int32_t src_stride_x2 = src_stride << 1;
399 int32_t src_stride_x3 = src_stride_x2 + src_stride;
400 int32_t src_stride_x4 = src_stride << 2;
401
402 DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
403 filt2 = __lsx_vldrepl_h(filter, 4);
404
405 DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1);
406 src2 = __lsx_vld(src, 0);
407 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4);
408 src += src_stride_x3;
409
410 DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
411 src10_r, src21_r, src32_r, src43_r);
412 DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110,
413 src4332);
414 DUP2_ARG2(__lsx_vxori_b, src2110, 128, src4332, 128, src2110, src4332);
415
416 for (loop_cnt = (height >> 2); loop_cnt--;) {
417 src5 = __lsx_vld(src, 0);
418 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7);
419 src8 = __lsx_vldx(src, src_stride_x3);
420 src += src_stride_x4;
421
422 DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
423 src54_r, src65_r, src76_r, src87_r);
424 DUP2_ARG2(__lsx_vilvl_d, src65_r, src54_r, src87_r, src76_r, src6554,
425 src8776);
426 DUP2_ARG2(__lsx_vxori_b, src6554, 128, src8776, 128, src6554, src8776);
427 out0 = dpadd_h3(src2110, src4332, src6554, filt0, filt1, filt2);
428 out1 = dpadd_h3(src4332, src6554, src8776, filt0, filt1, filt2);
429
430 out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
431 out0 = __lsx_vxori_b(out0, 128);
432
433 __lsx_vstelm_w(out0, dst, 0, 0);
434 dst += dst_stride;
435 __lsx_vstelm_w(out0, dst, 0, 1);
436 dst += dst_stride;
437 __lsx_vstelm_w(out0, dst, 0, 2);
438 dst += dst_stride;
439 __lsx_vstelm_w(out0, dst, 0, 3);
440 dst += dst_stride;
441
442 src2110 = src6554;
443 src4332 = src8776;
444 src4 = src8;
445 }
446 }
447
common_vt_6t_8w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter,int32_t height)448 static void common_vt_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
449 uint8_t *RESTRICT dst, int32_t dst_stride,
450 const int8_t *filter, int32_t height) {
451 uint32_t loop_cnt;
452 int32_t src_stride_x2 = src_stride << 1;
453 int32_t src_stride_x3 = src_stride_x2 + src_stride;
454 int32_t src_stride_x4 = src_stride << 2;
455 int32_t dst_stride_x2 = dst_stride << 1;
456 int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
457 int32_t dst_stride_x4 = dst_stride << 2;
458 __m128i src0, src1, src2, src3, src4, src7, src8, src9, src10;
459 __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
460 __m128i src109_r, filt0, filt1, filt2;
461 __m128i tmp0, tmp1;
462 __m128i filt, out0_r, out1_r, out2_r, out3_r;
463
464 src -= src_stride_x2;
465 filt = __lsx_vld(filter, 0);
466 DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
467 filt2 = __lsx_vreplvei_h(filt, 2);
468
469 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
470 src_stride_x3, src0, src1, src2, src3);
471 src += src_stride_x4;
472 src4 = __lsx_vld(src, 0);
473 src += src_stride;
474
475 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
476 src1, src2, src3);
477 src4 = __lsx_vxori_b(src4, 128);
478 DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src2, src1, src4, src3,
479 src10_r, src32_r, src21_r, src43_r);
480
481 for (loop_cnt = (height >> 2); loop_cnt--;) {
482 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
483 src_stride_x3, src7, src8, src9, src10);
484 DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
485 src8, src9, src10);
486 src += src_stride_x4;
487
488 DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10, src9,
489 src76_r, src87_r, src98_r, src109_r);
490 out0_r = dpadd_h3(src10_r, src32_r, src76_r, filt0, filt1, filt2);
491 out1_r = dpadd_h3(src21_r, src43_r, src87_r, filt0, filt1, filt2);
492 out2_r = dpadd_h3(src32_r, src76_r, src98_r, filt0, filt1, filt2);
493 out3_r = dpadd_h3(src43_r, src87_r, src109_r, filt0, filt1, filt2);
494 DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r,
495 out2_r, VP8_FILTER_SHIFT, tmp0, tmp1);
496 DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
497 __lsx_vstelm_d(tmp0, dst, 0, 0);
498 __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
499 __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
500 __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
501 dst += dst_stride_x4;
502
503 src10_r = src76_r;
504 src32_r = src98_r;
505 src21_r = src87_r;
506 src43_r = src109_r;
507 src4 = src10;
508 }
509 }
510
common_vt_6t_16w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter,int32_t height)511 static void common_vt_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
512 uint8_t *RESTRICT dst, int32_t dst_stride,
513 const int8_t *filter, int32_t height) {
514 uint32_t loop_cnt;
515 int32_t src_stride_x2 = src_stride << 1;
516 int32_t src_stride_x3 = src_stride_x2 + src_stride;
517 int32_t src_stride_x4 = src_stride << 2;
518 int32_t dst_stride_x2 = dst_stride << 1;
519 int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
520 int32_t dst_stride_x4 = dst_stride << 2;
521 __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
522 __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
523 __m128i src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
524 __m128i src65_l, src87_l, filt0, filt1, filt2;
525 __m128i tmp0, tmp1, tmp2, tmp3;
526 __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
527
528 src -= src_stride_x2;
529 filt = __lsx_vld(filter, 0);
530 DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
531 filt2 = __lsx_vreplvei_h(filt, 2);
532
533 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
534 src_stride_x3, src0, src1, src2, src3);
535 src += src_stride_x4;
536 src4 = __lsx_vldx(src, 0);
537 src += src_stride;
538
539 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
540 src1, src2, src3);
541 src4 = __lsx_vxori_b(src4, 128);
542 DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src4, src3, src2, src1,
543 src10_r, src32_r, src43_r, src21_r);
544 DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src4, src3, src2, src1,
545 src10_l, src32_l, src43_l, src21_l);
546
547 for (loop_cnt = (height >> 2); loop_cnt--;) {
548 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
549 src_stride_x3, src5, src6, src7, src8);
550 src += src_stride_x4;
551
552 DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
553 src6, src7, src8);
554 DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
555 src54_r, src65_r, src76_r, src87_r);
556 DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7,
557 src54_l, src65_l, src76_l, src87_l);
558 out0_r = dpadd_h3(src10_r, src32_r, src54_r, filt0, filt1, filt2);
559 out1_r = dpadd_h3(src21_r, src43_r, src65_r, filt0, filt1, filt2);
560 out2_r = dpadd_h3(src32_r, src54_r, src76_r, filt0, filt1, filt2);
561 out3_r = dpadd_h3(src43_r, src65_r, src87_r, filt0, filt1, filt2);
562 out0_l = dpadd_h3(src10_l, src32_l, src54_l, filt0, filt1, filt2);
563 out1_l = dpadd_h3(src21_l, src43_l, src65_l, filt0, filt1, filt2);
564 out2_l = dpadd_h3(src32_l, src54_l, src76_l, filt0, filt1, filt2);
565 out3_l = dpadd_h3(src43_l, src65_l, src87_l, filt0, filt1, filt2);
566 DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l,
567 out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT,
568 out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3);
569 DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0,
570 tmp1, tmp2, tmp3);
571 __lsx_vstx(tmp0, dst, 0);
572 __lsx_vstx(tmp1, dst, dst_stride);
573 __lsx_vstx(tmp2, dst, dst_stride_x2);
574 __lsx_vstx(tmp3, dst, dst_stride_x3);
575 dst += dst_stride_x4;
576
577 src10_r = src54_r;
578 src32_r = src76_r;
579 src21_r = src65_r;
580 src43_r = src87_r;
581 src10_l = src54_l;
582 src32_l = src76_l;
583 src21_l = src65_l;
584 src43_l = src87_l;
585 src4 = src8;
586 }
587 }
588
common_hv_6ht_6vt_4w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter_horiz,const int8_t * filter_vert,int32_t height)589 static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
590 uint8_t *RESTRICT dst, int32_t dst_stride,
591 const int8_t *filter_horiz,
592 const int8_t *filter_vert,
593 int32_t height) {
594 uint32_t loop_cnt;
595 __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, tmp0, tmp1;
596 __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
597 __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
598 __m128i hz_out7, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
599 __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
600 int32_t src_stride_x2 = src_stride << 1;
601 int32_t src_stride_x3 = src_stride_x2 + src_stride;
602
603 mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
604 src -= 2;
605
606 DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
607 filt_hz1);
608 filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);
609 DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
610 filt_vt1);
611 filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);
612
613 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
614
615 DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1);
616 src2 = __lsx_vld(src, 0);
617 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4);
618 src += src_stride_x3;
619
620 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
621 src1, src2, src3);
622 src4 = __lsx_vxori_b(src4, 128);
623
624 hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
625 filt_hz2);
626 hz_out2 = horiz_6tap_filt(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
627 filt_hz2);
628 hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
629 hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
630 filt_hz2);
631 DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
632
633 for (loop_cnt = (height >> 2); loop_cnt--;) {
634 src5 = __lsx_vld(src, 0);
635 src6 = __lsx_vldx(src, src_stride);
636 src += src_stride_x2;
637
638 DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6);
639 hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0,
640 filt_hz1, filt_hz2);
641 hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
642
643 src7 = __lsx_vld(src, 0);
644 src8 = __lsx_vldx(src, src_stride);
645 src += src_stride_x2;
646
647 DUP2_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src7, src8);
648 hz_out7 = horiz_6tap_filt(src7, src8, mask0, mask1, mask2, filt_hz0,
649 filt_hz1, filt_hz2);
650 hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff);
651
652 out2 = __lsx_vpackev_b(hz_out5, hz_out4);
653 tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
654
655 out3 = __lsx_vpackev_b(hz_out7, hz_out6);
656 tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
657
658 tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
659 tmp0 = __lsx_vxori_b(tmp0, 128);
660 __lsx_vstelm_w(tmp0, dst, 0, 0);
661 dst += dst_stride;
662 __lsx_vstelm_w(tmp0, dst, 0, 1);
663 dst += dst_stride;
664 __lsx_vstelm_w(tmp0, dst, 0, 2);
665 dst += dst_stride;
666 __lsx_vstelm_w(tmp0, dst, 0, 3);
667 dst += dst_stride;
668
669 hz_out3 = hz_out7;
670 out0 = out2;
671 out1 = out3;
672 }
673 }
674
common_hv_6ht_6vt_8w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter_horiz,const int8_t * filter_vert,int32_t height)675 static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
676 uint8_t *RESTRICT dst, int32_t dst_stride,
677 const int8_t *filter_horiz,
678 const int8_t *filter_vert,
679 int32_t height) {
680 uint32_t loop_cnt;
681 int32_t src_stride_x2 = src_stride << 1;
682 int32_t src_stride_x3 = src_stride_x2 + src_stride;
683 int32_t src_stride_x4 = src_stride << 2;
684 int32_t dst_stride_x2 = dst_stride << 1;
685 int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
686 int32_t dst_stride_x4 = dst_stride << 2;
687 __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
688 __m128i filt_hz0, filt_hz1, filt_hz2;
689 __m128i mask0, mask1, mask2, vec0, vec1;
690 __m128i filt, filt_vt0, filt_vt1, filt_vt2;
691 __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
692 __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
693 __m128i tmp0, tmp1, tmp2, tmp3;
694
695 mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
696 src -= (2 + src_stride_x2);
697
698 filt = __lsx_vld(filter_horiz, 0);
699 DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
700 filt_hz2 = __lsx_vreplvei_h(filt, 2);
701
702 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
703 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
704 src_stride_x3, src0, src1, src2, src3);
705 src += src_stride_x4;
706 src4 = __lsx_vldx(src, 0);
707 src += src_stride;
708
709 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
710 src1, src2, src3);
711 src4 = __lsx_vxori_b(src4, 128);
712
713 hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
714 filt_hz2);
715 hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
716 filt_hz2);
717 hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
718 filt_hz2);
719 hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
720 filt_hz2);
721 hz_out4 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
722 filt_hz2);
723 filt = __lsx_vld(filter_vert, 0);
724 DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
725 filt_vt2 = __lsx_vreplvei_h(filt, 2);
726
727 DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out2,
728 hz_out1, hz_out4, hz_out3, out0, out1, out3, out4);
729
730 for (loop_cnt = (height >> 2); loop_cnt--;) {
731 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
732 src_stride_x3, src5, src6, src7, src8);
733 src += src_stride_x4;
734
735 DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
736 src6, src7, src8);
737 hz_out5 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0,
738 filt_hz1, filt_hz2);
739 out2 = __lsx_vpackev_b(hz_out5, hz_out4);
740 tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
741
742 hz_out6 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0,
743 filt_hz1, filt_hz2);
744 out5 = __lsx_vpackev_b(hz_out6, hz_out5);
745 tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
746
747 hz_out7 = horiz_6tap_filt(src7, src7, mask0, mask1, mask2, filt_hz0,
748 filt_hz1, filt_hz2);
749 out7 = __lsx_vpackev_b(hz_out7, hz_out6);
750 tmp2 = dpadd_h3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
751
752 hz_out8 = horiz_6tap_filt(src8, src8, mask0, mask1, mask2, filt_hz0,
753 filt_hz1, filt_hz2);
754 out6 = __lsx_vpackev_b(hz_out8, hz_out7);
755 tmp3 = dpadd_h3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
756
757 DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2,
758 VP8_FILTER_SHIFT, vec0, vec1);
759 DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1);
760
761 __lsx_vstelm_d(vec0, dst, 0, 0);
762 __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
763 __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0);
764 __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1);
765 dst += dst_stride_x4;
766
767 hz_out4 = hz_out8;
768 out0 = out2;
769 out1 = out7;
770 out3 = out5;
771 out4 = out6;
772 }
773 }
774
common_hv_6ht_6vt_16w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter_horiz,const int8_t * filter_vert,int32_t height)775 static void common_hv_6ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
776 uint8_t *RESTRICT dst, int32_t dst_stride,
777 const int8_t *filter_horiz,
778 const int8_t *filter_vert,
779 int32_t height) {
780 common_hv_6ht_6vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
781 filter_vert, height);
782 common_hv_6ht_6vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride,
783 filter_horiz, filter_vert, height);
784 }
785
common_hz_4t_4x4_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter)786 static void common_hz_4t_4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride,
787 uint8_t *RESTRICT dst, int32_t dst_stride,
788 const int8_t *filter) {
789 __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1;
790 __m128i out0, out1;
791 int32_t src_stride_x2 = src_stride << 1;
792 int32_t src_stride_x3 = src_stride_x2 + src_stride;
793
794 mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
795 src -= 1;
796
797 DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
798 mask1 = __lsx_vaddi_bu(mask0, 2);
799
800 src0 = __lsx_vld(src, 0);
801 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
802 src3 = __lsx_vldx(src, src_stride_x3);
803 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
804 src1, src2, src3);
805 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
806 out0, out1);
807
808 out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
809 out0 = __lsx_vxori_b(out0, 128);
810
811 __lsx_vstelm_w(out0, dst, 0, 0);
812 dst += dst_stride;
813 __lsx_vstelm_w(out0, dst, 0, 1);
814 dst += dst_stride;
815 __lsx_vstelm_w(out0, dst, 0, 2);
816 dst += dst_stride;
817 __lsx_vstelm_w(out0, dst, 0, 3);
818 }
819
common_hz_4t_4x8_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter)820 static void common_hz_4t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
821 uint8_t *RESTRICT dst, int32_t dst_stride,
822 const int8_t *filter) {
823 __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1;
824 __m128i out0, out1, out2, out3;
825 int32_t src_stride_x2 = src_stride << 1;
826 int32_t src_stride_x3 = src_stride_x2 + src_stride;
827 int32_t src_stride_x4 = src_stride << 2;
828
829 mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
830 src -= 1;
831
832 DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
833 mask1 = __lsx_vaddi_bu(mask0, 2);
834
835 src0 = __lsx_vld(src, 0);
836 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
837 src3 = __lsx_vldx(src, src_stride_x3);
838 src += src_stride_x4;
839 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
840 src1, src2, src3);
841 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
842 out0, out1);
843
844 src0 = __lsx_vld(src, 0);
845 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
846 src3 = __lsx_vldx(src, src_stride_x3);
847 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
848 src1, src2, src3);
849 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
850 out2, out3);
851 DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
852 VP8_FILTER_SHIFT, out0, out1);
853 DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
854 __lsx_vstelm_w(out0, dst, 0, 0);
855 dst += dst_stride;
856 __lsx_vstelm_w(out0, dst, 0, 1);
857 dst += dst_stride;
858 __lsx_vstelm_w(out0, dst, 0, 2);
859 dst += dst_stride;
860 __lsx_vstelm_w(out0, dst, 0, 3);
861 dst += dst_stride;
862
863 __lsx_vstelm_w(out1, dst, 0, 0);
864 dst += dst_stride;
865 __lsx_vstelm_w(out1, dst, 0, 1);
866 dst += dst_stride;
867 __lsx_vstelm_w(out1, dst, 0, 2);
868 dst += dst_stride;
869 __lsx_vstelm_w(out1, dst, 0, 3);
870 }
871
common_hz_4t_4w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter,int32_t height)872 static void common_hz_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
873 uint8_t *RESTRICT dst, int32_t dst_stride,
874 const int8_t *filter, int32_t height) {
875 if (height == 4) {
876 common_hz_4t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
877 } else if (height == 8) {
878 common_hz_4t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
879 }
880 }
881
common_hz_4t_8w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter,int32_t height)882 static void common_hz_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
883 uint8_t *RESTRICT dst, int32_t dst_stride,
884 const int8_t *filter, int32_t height) {
885 uint32_t loop_cnt;
886 int32_t src_stride_x2 = src_stride << 1;
887 int32_t src_stride_x3 = src_stride_x2 + src_stride;
888 int32_t src_stride_x4 = src_stride << 2;
889 int32_t dst_stride_x2 = dst_stride << 1;
890 int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
891 int32_t dst_stride_x4 = dst_stride << 2;
892 __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1;
893 __m128i tmp0, tmp1;
894 __m128i filt, out0, out1, out2, out3;
895
896 mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
897 src -= 1;
898
899 filt = __lsx_vld(filter, 0);
900 DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
901 mask1 = __lsx_vaddi_bu(mask0, 2);
902
903 for (loop_cnt = (height >> 2); loop_cnt--;) {
904 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
905 src_stride_x3, src0, src1, src2, src3);
906 src += src_stride_x4;
907
908 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
909 src1, src2, src3);
910 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
911 filt1, out0, out1, out2, out3);
912 DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
913 VP8_FILTER_SHIFT, tmp0, tmp1);
914 DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
915 __lsx_vstelm_d(tmp0, dst, 0, 0);
916 __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
917 __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
918 __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
919 dst += dst_stride_x4;
920 }
921 }
922
common_hz_4t_16w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter,int32_t height)923 static void common_hz_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
924 uint8_t *RESTRICT dst, int32_t dst_stride,
925 const int8_t *filter, int32_t height) {
926 uint32_t loop_cnt;
927 int32_t src_stride_x2 = src_stride << 1;
928 int32_t src_stride_x3 = src_stride_x2 + src_stride;
929 int32_t src_stride_x4 = src_stride << 2;
930 int32_t dst_stride_x2 = dst_stride << 1;
931 int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
932 int32_t dst_stride_x4 = dst_stride << 2;
933 __m128i src0, src1, src2, src3, src4, src5, src6, src7;
934 __m128i filt0, filt1, mask0, mask1;
935 __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7;
936
937 mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
938 src -= 1;
939
940 filt = __lsx_vld(filter, 0);
941 DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
942 mask1 = __lsx_vaddi_bu(mask0, 2);
943
944 for (loop_cnt = (height >> 2); loop_cnt--;) {
945 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
946 src_stride_x3, src0, src2, src4, src6);
947 src += 8;
948 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
949 src_stride_x3, src1, src3, src5, src7);
950 src += src_stride_x4 - 8;
951
952 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
953 src1, src2, src3);
954 DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4,
955 src5, src6, src7);
956 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
957 filt1, out0, out1, out2, out3);
958 HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
959 filt1, out4, out5, out6, out7);
960 DUP4_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
961 VP8_FILTER_SHIFT, out5, out4, VP8_FILTER_SHIFT, out7, out6,
962 VP8_FILTER_SHIFT, out0, out1, out2, out3);
963 DUP4_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out2, 128, out3, 128, out0,
964 out1, out2, out3);
965 __lsx_vstx(out0, dst, 0);
966 __lsx_vstx(out1, dst, dst_stride);
967 __lsx_vstx(out2, dst, dst_stride_x2);
968 __lsx_vstx(out3, dst, dst_stride_x3);
969 dst += dst_stride_x4;
970 }
971 }
972
common_vt_4t_4w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter,int32_t height)973 static void common_vt_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
974 uint8_t *RESTRICT dst, int32_t dst_stride,
975 const int8_t *filter, int32_t height) {
976 uint32_t loop_cnt;
977 __m128i src0, src1, src2, src3, src4, src5;
978 __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
979 __m128i src2110, src4332, filt0, filt1, out0, out1;
980 int32_t src_stride_x2 = src_stride << 1;
981 int32_t src_stride_x3 = src_stride_x2 + src_stride;
982
983 DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
984 DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2);
985 src1 = __lsx_vld(src, 0);
986 src += src_stride_x2;
987
988 DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
989
990 src2110 = __lsx_vilvl_d(src21_r, src10_r);
991 src2110 = __lsx_vxori_b(src2110, 128);
992
993 for (loop_cnt = (height >> 2); loop_cnt--;) {
994 src3 = __lsx_vld(src, 0);
995 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5);
996 src += src_stride_x3;
997 DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
998 src4332 = __lsx_vilvl_d(src43_r, src32_r);
999 src4332 = __lsx_vxori_b(src4332, 128);
1000 out0 = filt_4tap_dpadd_h(src2110, src4332, filt0, filt1);
1001
1002 src2 = __lsx_vld(src, 0);
1003 src += src_stride;
1004 DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src54_r, src65_r);
1005 src2110 = __lsx_vilvl_d(src65_r, src54_r);
1006 src2110 = __lsx_vxori_b(src2110, 128);
1007 out1 = filt_4tap_dpadd_h(src4332, src2110, filt0, filt1);
1008 out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
1009 out0 = __lsx_vxori_b(out0, 128);
1010
1011 __lsx_vstelm_w(out0, dst, 0, 0);
1012 dst += dst_stride;
1013 __lsx_vstelm_w(out0, dst, 0, 1);
1014 dst += dst_stride;
1015 __lsx_vstelm_w(out0, dst, 0, 2);
1016 dst += dst_stride;
1017 __lsx_vstelm_w(out0, dst, 0, 3);
1018 dst += dst_stride;
1019 }
1020 }
1021
common_vt_4t_8w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter,int32_t height)1022 static void common_vt_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
1023 uint8_t *RESTRICT dst, int32_t dst_stride,
1024 const int8_t *filter, int32_t height) {
1025 uint32_t loop_cnt;
1026 int32_t src_stride_x2 = src_stride << 1;
1027 int32_t src_stride_x3 = src_stride_x2 + src_stride;
1028 int32_t src_stride_x4 = src_stride << 2;
1029 int32_t dst_stride_x2 = dst_stride << 1;
1030 int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
1031 int32_t dst_stride_x4 = dst_stride << 2;
1032 __m128i src0, src1, src2, src7, src8, src9, src10;
1033 __m128i src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
1034 __m128i tmp0, tmp1;
1035 __m128i filt, out0_r, out1_r, out2_r, out3_r;
1036
1037 src -= src_stride;
1038 filt = __lsx_vld(filter, 0);
1039 DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
1040
1041 DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
1042 src2 = __lsx_vldx(src, src_stride_x2);
1043 src += src_stride_x3;
1044
1045 DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
1046 src2 = __lsx_vxori_b(src2, 128);
1047 DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
1048
1049 for (loop_cnt = (height >> 2); loop_cnt--;) {
1050 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
1051 src_stride_x3, src7, src8, src9, src10);
1052 src += src_stride_x4;
1053
1054 DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
1055 src8, src9, src10);
1056 DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9,
1057 src72_r, src87_r, src98_r, src109_r);
1058 out0_r = filt_4tap_dpadd_h(src10_r, src72_r, filt0, filt1);
1059 out1_r = filt_4tap_dpadd_h(src21_r, src87_r, filt0, filt1);
1060 out2_r = filt_4tap_dpadd_h(src72_r, src98_r, filt0, filt1);
1061 out3_r = filt_4tap_dpadd_h(src87_r, src109_r, filt0, filt1);
1062 DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r,
1063 out2_r, VP8_FILTER_SHIFT, tmp0, tmp1);
1064 DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
1065 __lsx_vstelm_d(tmp0, dst, 0, 0);
1066 __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
1067 __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
1068 __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
1069 dst += dst_stride_x4;
1070
1071 src10_r = src98_r;
1072 src21_r = src109_r;
1073 src2 = src10;
1074 }
1075 }
1076
common_vt_4t_16w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter,int32_t height)1077 static void common_vt_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
1078 uint8_t *RESTRICT dst, int32_t dst_stride,
1079 const int8_t *filter, int32_t height) {
1080 uint32_t loop_cnt;
1081 int32_t src_stride_x2 = src_stride << 1;
1082 int32_t src_stride_x3 = src_stride_x2 + src_stride;
1083 int32_t src_stride_x4 = src_stride << 2;
1084 int32_t dst_stride_x2 = dst_stride << 1;
1085 int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
1086 int32_t dst_stride_x4 = dst_stride << 2;
1087 __m128i src0, src1, src2, src3, src4, src5, src6;
1088 __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
1089 __m128i src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
1090 __m128i tmp0, tmp1, tmp2, tmp3;
1091 __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1092
1093 src -= src_stride;
1094 filt = __lsx_vld(filter, 0);
1095 DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
1096
1097 DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
1098 src2 = __lsx_vldx(src, src_stride_x2);
1099 src += src_stride_x3;
1100
1101 DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
1102 src2 = __lsx_vxori_b(src2, 128);
1103 DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
1104 DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
1105
1106 for (loop_cnt = (height >> 2); loop_cnt--;) {
1107 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
1108 src_stride_x3, src3, src4, src5, src6);
1109 src += src_stride_x4;
1110
1111 DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
1112 src4, src5, src6);
1113 DUP4_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src5, src4, src6, src5,
1114 src32_r, src43_r, src54_r, src65_r);
1115 DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6, src5,
1116 src32_l, src43_l, src54_l, src65_l);
1117 out0_r = filt_4tap_dpadd_h(src10_r, src32_r, filt0, filt1);
1118 out1_r = filt_4tap_dpadd_h(src21_r, src43_r, filt0, filt1);
1119 out2_r = filt_4tap_dpadd_h(src32_r, src54_r, filt0, filt1);
1120 out3_r = filt_4tap_dpadd_h(src43_r, src65_r, filt0, filt1);
1121 out0_l = filt_4tap_dpadd_h(src10_l, src32_l, filt0, filt1);
1122 out1_l = filt_4tap_dpadd_h(src21_l, src43_l, filt0, filt1);
1123 out2_l = filt_4tap_dpadd_h(src32_l, src54_l, filt0, filt1);
1124 out3_l = filt_4tap_dpadd_h(src43_l, src65_l, filt0, filt1);
1125 DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l,
1126 out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT,
1127 out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3);
1128 DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0,
1129 tmp1, tmp2, tmp3);
1130 __lsx_vstx(tmp0, dst, 0);
1131 __lsx_vstx(tmp1, dst, dst_stride);
1132 __lsx_vstx(tmp2, dst, dst_stride_x2);
1133 __lsx_vstx(tmp3, dst, dst_stride_x3);
1134 dst += dst_stride_x4;
1135
1136 src10_r = src54_r;
1137 src21_r = src65_r;
1138 src10_l = src54_l;
1139 src21_l = src65_l;
1140 src2 = src6;
1141 }
1142 }
1143
common_hv_4ht_4vt_4w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter_horiz,const int8_t * filter_vert,int32_t height)1144 static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
1145 uint8_t *RESTRICT dst, int32_t dst_stride,
1146 const int8_t *filter_horiz,
1147 const int8_t *filter_vert,
1148 int32_t height) {
1149 uint32_t loop_cnt;
1150 __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1151 __m128i mask0, mask1, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
1152 __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
1153 __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
1154 int32_t src_stride_x2 = src_stride << 1;
1155 int32_t src_stride_x3 = src_stride_x2 + src_stride;
1156 int32_t src_stride_x4 = src_stride << 2;
1157
1158 mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
1159 src -= 1;
1160
1161 DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
1162 filt_hz1);
1163 mask1 = __lsx_vaddi_bu(mask0, 2);
1164
1165 src1 = __lsx_vld(src, 0);
1166 DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2);
1167 src += src_stride_x2;
1168
1169 DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
1170 src2 = __lsx_vxori_b(src2, 128);
1171 hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
1172 hz_out1 = horiz_4tap_filt(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
1173 vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
1174
1175 DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
1176 filt_vt1);
1177
1178 for (loop_cnt = (height >> 2); loop_cnt--;) {
1179 src3 = __lsx_vld(src, 0);
1180 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5);
1181 src6 = __lsx_vldx(src, src_stride_x3);
1182 src += src_stride_x4;
1183
1184 DUP2_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src3, src4);
1185 hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
1186 hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff);
1187 vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
1188 tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
1189
1190 DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6);
1191 hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
1192 hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
1193 vec2 = __lsx_vpackev_b(hz_out5, hz_out4);
1194 tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
1195
1196 tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
1197 tmp0 = __lsx_vxori_b(tmp0, 128);
1198 __lsx_vstelm_w(tmp0, dst, 0, 0);
1199 dst += dst_stride;
1200 __lsx_vstelm_w(tmp0, dst, 0, 1);
1201 dst += dst_stride;
1202 __lsx_vstelm_w(tmp0, dst, 0, 2);
1203 dst += dst_stride;
1204 __lsx_vstelm_w(tmp0, dst, 0, 3);
1205 dst += dst_stride;
1206
1207 hz_out1 = hz_out5;
1208 vec0 = vec2;
1209 }
1210 }
1211
common_hv_4ht_4vt_8w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter_horiz,const int8_t * filter_vert,int32_t height)1212 static inline void common_hv_4ht_4vt_8w_lsx(
1213 uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
1214 int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
1215 int32_t height) {
1216 uint32_t loop_cnt;
1217 int32_t src_stride_x2 = src_stride << 1;
1218 int32_t src_stride_x3 = src_stride_x2 + src_stride;
1219 int32_t src_stride_x4 = src_stride << 2;
1220 int32_t dst_stride_x2 = dst_stride << 1;
1221 int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
1222 int32_t dst_stride_x4 = dst_stride << 2;
1223 __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1224 __m128i mask0, mask1, out0, out1;
1225 __m128i filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
1226 __m128i hz_out0, hz_out1, hz_out2, hz_out3;
1227 __m128i vec0, vec1, vec2, vec3, vec4;
1228
1229 mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
1230 src -= 1 + src_stride;
1231
1232 filt = __lsx_vld(filter_horiz, 0);
1233 DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
1234 mask1 = __lsx_vaddi_bu(mask0, 2);
1235
1236 DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
1237 src2 = __lsx_vldx(src, src_stride_x2);
1238 src += src_stride_x3;
1239
1240 DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
1241 src2 = __lsx_vxori_b(src2, 128);
1242 hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
1243 hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
1244 hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
1245 DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);
1246
1247 filt = __lsx_vld(filter_vert, 0);
1248 DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
1249
1250 for (loop_cnt = (height >> 2); loop_cnt--;) {
1251 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
1252 src_stride_x3, src3, src4, src5, src6);
1253 src += src_stride_x4;
1254
1255 DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
1256 src4, src5, src6);
1257 hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1258 vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
1259 tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
1260
1261 hz_out0 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1262 vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
1263 tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1);
1264
1265 hz_out1 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1266 vec4 = __lsx_vpackev_b(hz_out1, hz_out0);
1267 tmp2 = filt_4tap_dpadd_h(vec1, vec4, filt_vt0, filt_vt1);
1268
1269 hz_out2 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1270 DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec0, vec1);
1271 tmp3 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
1272
1273 DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
1274 DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
1275 __lsx_vstelm_d(out0, dst, 0, 0);
1276 __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
1277 __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0);
1278 __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1);
1279 dst += dst_stride_x4;
1280
1281 vec0 = vec4;
1282 vec2 = vec1;
1283 }
1284 }
1285
common_hv_4ht_4vt_16w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter_horiz,const int8_t * filter_vert,int32_t height)1286 static void common_hv_4ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
1287 uint8_t *RESTRICT dst, int32_t dst_stride,
1288 const int8_t *filter_horiz,
1289 const int8_t *filter_vert,
1290 int32_t height) {
1291 common_hv_4ht_4vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
1292 filter_vert, height);
1293 common_hv_4ht_4vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride,
1294 filter_horiz, filter_vert, height);
1295 }
1296
common_hv_6ht_4vt_4w_lsx(uint8_t * RESTRICT src,int32_t src_stride,uint8_t * RESTRICT dst,int32_t dst_stride,const int8_t * filter_horiz,const int8_t * filter_vert,int32_t height)1297 static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
1298 uint8_t *RESTRICT dst, int32_t dst_stride,
1299 const int8_t *filter_horiz,
1300 const int8_t *filter_vert,
1301 int32_t height) {
1302 uint32_t loop_cnt;
1303 __m128i src0, src1, src2, src3, src4, src5, src6;
1304 __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
1305 __m128i filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
1306 __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
1307 __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
1308 int32_t src_stride_x2 = src_stride << 1;
1309 int32_t src_stride_x3 = src_stride_x2 + src_stride;
1310 int32_t src_stride_x4 = src_stride << 2;
1311
1312 mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
1313 src -= 2;
1314
1315 DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
1316 filt_hz1);
1317 filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);
1318 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
1319
1320 src1 = __lsx_vld(src, 0);
1321 DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2);
1322 src += src_stride_x2;
1323
1324 DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
1325 src2 = __lsx_vxori_b(src2, 128);
1326
1327 hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
1328 filt_hz2);
1329 hz_out1 = horiz_6tap_filt(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
1330 filt_hz2);
1331 vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
1332
1333 DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
1334 filt_vt1);
1335
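  /* Four rows per iteration: filter two new row pairs horizontally, rebuild
     the in-between pairs with the shuff byte-select, apply the 4-tap vertical
     kernel, then round, re-bias and store four 4-pixel rows. */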
1336 for (loop_cnt = (height >> 2); loop_cnt--;) {
1337 src3 = __lsx_vld(src, 0);
1338 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5);
1339 src6 = __lsx_vldx(src, src_stride_x3);
1340 src += src_stride_x4;
1341 DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
1342 src4, src5, src6);
1343
1344 hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0,
1345 filt_hz1, filt_hz2);
1346 hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff);
1347 vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
1348 tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
1349
1350 hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0,
1351 filt_hz1, filt_hz2);
1352 hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
1353 vec2 = __lsx_vpackev_b(hz_out5, hz_out4);
1354 tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
1355
1356 DUP2_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp0, tmp1);
1357 DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
1358
1359 __lsx_vstelm_w(tmp0, dst, 0, 0);
1360 dst += dst_stride;
1361 __lsx_vstelm_w(tmp0, dst, 0, 1);
1362 dst += dst_stride;
1363 __lsx_vstelm_w(tmp1, dst, 0, 0);
1364 dst += dst_stride;
1365 __lsx_vstelm_w(tmp1, dst, 0, 1);
1366 dst += dst_stride;
1367
1368 hz_out1 = hz_out5;
1369 vec0 = vec2;
1370 }
1371 }
1372
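/* 6-tap horizontal plus 4-tap vertical filtering for 8-pixel-wide blocks. */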
1373 static inline void common_hv_6ht_4vt_8w_lsx(
1374 uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
1375 int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
1376 int32_t height) {
1377 uint32_t loop_cnt;
1378 int32_t src_stride_x2 = src_stride << 1;
1379 int32_t src_stride_x3 = src_stride_x2 + src_stride;
1380 int32_t src_stride_x4 = src_stride << 2;
1381 int32_t dst_stride_x2 = dst_stride << 1;
1382 int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
1383 int32_t dst_stride_x4 = dst_stride << 2;
1384
1385 __m128i src0, src1, src2, src3, src4, src5, src6;
1386 __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
1387 __m128i filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
1388 __m128i tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
1389 __m128i out0, out1;
1390
1391 mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
1392 src -= (2 + src_stride);
1393
1394 filt = __lsx_vld(filter_horiz, 0);
1395 DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
1396 filt_hz2 = __lsx_vreplvei_h(filt, 2);
1397 DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
1398
1399 DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
1400 src2 = __lsx_vldx(src, src_stride_x2);
1401 src += src_stride_x3;
1402
1403 DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
1404 src2 = __lsx_vxori_b(src2, 128);
1405 hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
1406 filt_hz2);
1407 hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
1408 filt_hz2);
1409 hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
1410 filt_hz2);
1411 DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);
1412
1413 filt = __lsx_vld(filter_vert, 0);
1414 DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
1415
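  /* Four rows per iteration: each new row is filtered with the 6-tap
     horizontal kernel, packed against the previous filtered row, and run
     through the 4-tap vertical kernel before rounding and storing. */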
1416 for (loop_cnt = (height >> 2); loop_cnt--;) {
1417 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
1418 src_stride_x3, src3, src4, src5, src6);
1419 src += src_stride_x4;
1420 DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
1421 src4, src5, src6);
1422
1423 hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0,
1424 filt_hz1, filt_hz2);
1425 vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
1426 tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
1427
1428 hz_out0 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0,
1429 filt_hz1, filt_hz2);
1430 vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
1431 tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1);
1432
1433 hz_out1 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0,
1434 filt_hz1, filt_hz2);
1435 vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
1436 tmp2 = filt_4tap_dpadd_h(vec1, vec0, filt_vt0, filt_vt1);
1437
1438 hz_out2 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0,
1439 filt_hz1, filt_hz2);
1440 DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2);
1441 tmp3 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
1442
1443 DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
1444 DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
1445 __lsx_vstelm_d(out0, dst, 0, 0);
1446 __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
1447 __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0);
1448 __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1);
1449 dst += dst_stride_x4;
1450 }
1451 }
1452
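/* 16-wide 6-tap horizontal / 4-tap vertical case: two 8-wide passes over the
   left and right halves of the block. */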
1453 static void common_hv_6ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
1454 uint8_t *RESTRICT dst, int32_t dst_stride,
1455 const int8_t *filter_horiz,
1456 const int8_t *filter_vert,
1457 int32_t height) {
1458 common_hv_6ht_4vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
1459 filter_vert, height);
1460 common_hv_6ht_4vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride,
1461 filter_horiz, filter_vert, height);
1462 }
1463
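/* 4-tap horizontal plus 6-tap vertical filtering for 4-pixel-wide blocks.
   The wider vertical support needs two rows above the block, fetched with
   negative stride offsets. */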
1464 static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
1465 uint8_t *RESTRICT dst, int32_t dst_stride,
1466 const int8_t *filter_horiz,
1467 const int8_t *filter_vert,
1468 int32_t height) {
1469 uint32_t loop_cnt;
1470 __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
1471 __m128i filt_hz0, filt_hz1, filt_vt0, filt_vt1, filt_vt2, mask0, mask1;
1472 __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1473 __m128i hz_out7, tmp0, tmp1, out0, out1, out2, out3;
1474 __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
1475 int32_t src_stride_x2 = src_stride << 1;
1476 int32_t src_stride_x3 = src_stride_x2 + src_stride;
1477 int32_t src_stride_x4 = src_stride << 2;
1478
1479 mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
1480
1481 src -= 1;
1482
1483 DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
1484 filt_hz1);
1485 mask1 = __lsx_vaddi_bu(mask0, 2);
1486
1487 DUP4_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src, src_stride,
1488 src, src_stride_x2, src0, src1, src3, src4);
1489 src2 = __lsx_vld(src, 0);
1490 src += src_stride_x3;
1491
1492 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
1493 src1, src2, src3);
1494 src4 = __lsx_vxori_b(src4, 128);
1495 hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
1496 hz_out2 = horiz_4tap_filt(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
1497 hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
1498 hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
1499 DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
1500
1501 DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
1502 filt_vt1);
1503 filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);
1504
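  /* Four rows per iteration: filter two new row pairs horizontally, advance
     the three-term (6-tap) vertical window with dpadd_h3, then round,
     re-bias and store four 4-pixel rows. */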
1505 for (loop_cnt = (height >> 2); loop_cnt--;) {
1506 src5 = __lsx_vld(src, 0);
1507 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7);
1508 src8 = __lsx_vldx(src, src_stride_x3);
1509 DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
1510 src6, src7, src8);
1511 src += src_stride_x4;
1512
1513 hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
1514 hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
1515 out2 = __lsx_vpackev_b(hz_out5, hz_out4);
1516 tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1517
1518 hz_out7 = horiz_4tap_filt(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
1519 hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff);
1520 out3 = __lsx_vpackev_b(hz_out7, hz_out6);
1521 tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
1522
1523 tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
1524 tmp0 = __lsx_vxori_b(tmp0, 128);
1525 __lsx_vstelm_w(tmp0, dst, 0, 0);
1526 dst += dst_stride;
1527 __lsx_vstelm_w(tmp0, dst, 0, 1);
1528 dst += dst_stride;
1529 __lsx_vstelm_w(tmp0, dst, 0, 2);
1530 dst += dst_stride;
1531 __lsx_vstelm_w(tmp0, dst, 0, 3);
1532 dst += dst_stride;
1533
1534 hz_out3 = hz_out7;
1535 out0 = out2;
1536 out1 = out3;
1537 }
1538 }
1539
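/* 4-tap horizontal plus 6-tap vertical filtering for 8-pixel-wide blocks. */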
1540 static inline void common_hv_4ht_6vt_8w_lsx(
1541 uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
1542 int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
1543 int32_t height) {
1544 uint32_t loop_cnt;
1545 int32_t src_stride_x2 = src_stride << 1;
1546 int32_t src_stride_x3 = src_stride_x2 + src_stride;
1547 int32_t src_stride_x4 = src_stride << 2;
1548 int32_t dst_stride_x2 = dst_stride << 1;
1549 int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
1550 int32_t dst_stride_x4 = dst_stride << 2;
1551 __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
1552 __m128i filt_hz0, filt_hz1, mask0, mask1;
1553 __m128i filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
1554 __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1555 __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
1556 __m128i vec0, vec1;
1557
1558 mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
1559 src -= 1 + src_stride_x2;
1560
1561 filt = __lsx_vld(filter_horiz, 0);
1562 DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
1563 mask1 = __lsx_vaddi_bu(mask0, 2);
1564
1565 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
1566 src_stride_x3, src0, src1, src2, src3);
1567 src += src_stride_x4;
1568 src4 = __lsx_vld(src, 0);
1569 src += src_stride;
1570
1571 DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
1572 src1, src2, src3);
1573 src4 = __lsx_vxori_b(src4, 128);
1574 hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
1575 hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
1576 hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
1577 hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1578 hz_out4 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1579 DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
1580 DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4);
1581
1582 filt = __lsx_vld(filter_vert, 0);
1583 DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
1584 filt_vt2 = __lsx_vreplvei_h(filt, 2);
1585
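  /* Four rows per iteration: each new row is filtered horizontally, packed
     with its predecessor, and accumulated through the three-term (6-tap)
     vertical dot product before rounding and storing. */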
1586 for (loop_cnt = (height >> 2); loop_cnt--;) {
1587 DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
1588 src_stride_x3, src5, src6, src7, src8);
1589 src += src_stride_x4;
1590
1591 DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
1592 src6, src7, src8);
1593 hz_out5 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1594 out2 = __lsx_vpackev_b(hz_out5, hz_out4);
1595 tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1596
1597 hz_out6 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1598 out5 = __lsx_vpackev_b(hz_out6, hz_out5);
1599 tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
1600
1601 hz_out7 = horiz_4tap_filt(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
1602 out6 = __lsx_vpackev_b(hz_out7, hz_out6);
1603 tmp2 = dpadd_h3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
1604
1605 hz_out8 = horiz_4tap_filt(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
1606 out7 = __lsx_vpackev_b(hz_out8, hz_out7);
1607 tmp3 = dpadd_h3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
1608 DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, vec0, vec1);
1609 DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1);
1610 __lsx_vstelm_d(vec0, dst, 0, 0);
1611 __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
1612 __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0);
1613 __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1);
1614 dst += dst_stride_x4;
1615 hz_out4 = hz_out8;
1616 out0 = out2;
1617 out1 = out6;
1618 out3 = out5;
1619 out4 = out7;
1620 }
1621 }
1622
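/* 16-wide 4-tap horizontal / 6-tap vertical case: two 8-wide passes over the
   left and right halves of the block. */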
1623 static void common_hv_4ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
1624 uint8_t *RESTRICT dst, int32_t dst_stride,
1625 const int8_t *filter_horiz,
1626 const int8_t *filter_vert,
1627 int32_t height) {
1628 common_hv_4ht_6vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
1629 filter_vert, height);
1630 common_hv_4ht_6vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride,
1631 filter_horiz, filter_vert, height);
1632 }
1633
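/* Dispatch-table signatures: Func1 takes separate horizontal and vertical
   filters (two-dimensional cases), Func2 takes a single filter for the
   horizontal-only or vertical-only cases. */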
1634 typedef void (*PVp8SixtapPredictFunc1)(
1635 uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
1636 int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
1637 int32_t height);
1638
1639 typedef void (*PVp8SixtapPredictFunc2)(uint8_t *RESTRICT src,
1640 int32_t src_stride,
1641 uint8_t *RESTRICT dst,
1642 int32_t dst_stride, const int8_t *filter,
1643 int32_t height);
1644
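/* Sub-pel offsets run from 0 to 7.  Non-zero even offsets keep all six
   filter taps, while odd offsets only have four significant taps (the
   filter pointer is advanced by one to skip the leading zero), so the low
   bit of each offset selects between the 6-tap and 4-tap kernels. */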
1645 void vp8_sixtap_predict4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride,
1646 int32_t xoffset, int32_t yoffset,
1647 uint8_t *RESTRICT dst, int32_t dst_stride) {
1648 const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1];
1649 const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1];
1650
1651 static PVp8SixtapPredictFunc1 Predict4x4Funcs1[4] = {
1652 common_hv_6ht_6vt_4w_lsx,
1653 common_hv_6ht_4vt_4w_lsx,
1654 common_hv_4ht_6vt_4w_lsx,
1655 common_hv_4ht_4vt_4w_lsx,
1656 };
1657
1658 static PVp8SixtapPredictFunc2 Predict4x4Funcs2[4] = { common_vt_6t_4w_lsx,
1659 common_vt_4t_4w_lsx,
1660 common_hz_6t_4w_lsx,
1661 common_hz_4t_4w_lsx };
1662 if (yoffset < 8 && xoffset < 8) {
1663 if (yoffset) {
1664 if (xoffset) {
1665 switch (xoffset & 1) {
1666 case 0:
1667 switch (yoffset & 1) {
1668 case 0:
1669 Predict4x4Funcs1[0](src, src_stride, dst, dst_stride, h_filter,
1670 v_filter, 4);
1671 break;
1672 case 1:
1673 Predict4x4Funcs1[1](src, src_stride, dst, dst_stride, h_filter,
1674 v_filter + 1, 4);
1675 break;
1676 }
1677 break;
1678
1679 case 1:
1680 switch (yoffset & 1) {
1681 case 0:
1682 Predict4x4Funcs1[2](src, src_stride, dst, dst_stride,
1683 h_filter + 1, v_filter, 4);
1684 break;
1685
1686 case 1:
1687 Predict4x4Funcs1[3](src, src_stride, dst, dst_stride,
1688 h_filter + 1, v_filter + 1, 4);
1689 break;
1690 }
1691 break;
1692 }
1693 } else {
1694 switch (yoffset & 1) {
1695 case 0:
1696 Predict4x4Funcs2[0](src, src_stride, dst, dst_stride, v_filter, 4);
1697 break;
1698
1699 case 1:
1700 Predict4x4Funcs2[1](src, src_stride, dst, dst_stride, v_filter + 1,
1701 4);
1702 break;
1703 }
1704 }
1705 } else {
1706 switch (xoffset) {
1707 case 0: {
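        /* xoffset == 0 and yoffset == 0: no filtering is needed, copy the
           4x4 block row by row. */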
1708 __m128i tp0;
1709
1710 tp0 = __lsx_vldrepl_w(src, 0);
1711 src += src_stride;
1712 __lsx_vstelm_w(tp0, dst, 0, 0);
1713 dst += dst_stride;
1714 tp0 = __lsx_vldrepl_w(src, 0);
1715 src += src_stride;
1716 __lsx_vstelm_w(tp0, dst, 0, 0);
1717 dst += dst_stride;
1718 tp0 = __lsx_vldrepl_w(src, 0);
1719 src += src_stride;
1720 __lsx_vstelm_w(tp0, dst, 0, 0);
1721 dst += dst_stride;
1722 tp0 = __lsx_vldrepl_w(src, 0);
1723 __lsx_vstelm_w(tp0, dst, 0, 0);
1724
1725 break;
1726 }
1727 case 2:
1728 case 4:
1729 case 6:
1730 Predict4x4Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 4);
1731 break;
1732 }
1733 switch (xoffset & 1) {
1734 case 1:
1735 Predict4x4Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1,
1736 4);
1737 break;
1738 }
1739 }
1740 }
1741 }
1742
1743 void vp8_sixtap_predict8x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
1744 int32_t xoffset, int32_t yoffset,
1745 uint8_t *RESTRICT dst, int32_t dst_stride) {
1746 const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1];
1747 const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1];
1748
1749 static PVp8SixtapPredictFunc1 Predict8x8Funcs1[4] = {
1750 common_hv_6ht_6vt_8w_lsx,
1751 common_hv_6ht_4vt_8w_lsx,
1752 common_hv_4ht_6vt_8w_lsx,
1753 common_hv_4ht_4vt_8w_lsx,
1754 };
1755
1756 static PVp8SixtapPredictFunc2 Predict8x8Funcs2[4] = { common_vt_6t_8w_lsx,
1757 common_vt_4t_8w_lsx,
1758 common_hz_6t_8w_lsx,
1759 common_hz_4t_8w_lsx };
1760
1761 if (yoffset < 8 && xoffset < 8) {
1762 if (yoffset) {
1763 if (xoffset) {
1764 switch (xoffset & 1) {
1765 case 0:
1766 switch (yoffset & 1) {
1767 case 0:
1768 Predict8x8Funcs1[0](src, src_stride, dst, dst_stride, h_filter,
1769 v_filter, 8);
1770 break;
1771
1772 case 1:
1773 Predict8x8Funcs1[1](src, src_stride, dst, dst_stride, h_filter,
1774 v_filter + 1, 8);
1775 break;
1776 }
1777 break;
1778
1779 case 1:
1780 switch (yoffset & 1) {
1781 case 0:
1782 Predict8x8Funcs1[2](src, src_stride, dst, dst_stride,
1783 h_filter + 1, v_filter, 8);
1784 break;
1785
1786 case 1:
1787 Predict8x8Funcs1[3](src, src_stride, dst, dst_stride,
1788 h_filter + 1, v_filter + 1, 8);
1789 break;
1790 }
1791 break;
1792 }
1793 } else {
1794 switch (yoffset & 1) {
1795 case 0:
1796 Predict8x8Funcs2[0](src, src_stride, dst, dst_stride, v_filter, 8);
1797 break;
1798
1799 case 1:
1800 Predict8x8Funcs2[1](src, src_stride, dst, dst_stride, v_filter + 1,
1801 8);
1802 break;
1803 }
1804 }
1805 } else {
1806 switch (xoffset & 1) {
1807 case 1:
1808 Predict8x8Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1,
1809 8);
1810 break;
1811 }
1812 switch (xoffset) {
1813 case 0: vp8_copy_mem8x8(src, src_stride, dst, dst_stride); break;
1814 case 2:
1815 case 4:
1816 case 6:
1817 Predict8x8Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 8);
1818 break;
1819 }
1820 }
1821 }
1822 }
1823
1824 void vp8_sixtap_predict16x16_lsx(uint8_t *RESTRICT src, int32_t src_stride,
1825 int32_t xoffset, int32_t yoffset,
1826 uint8_t *RESTRICT dst, int32_t dst_stride) {
1827 const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1];
1828 const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1];
1829
1830 static PVp8SixtapPredictFunc1 Predict16x16Funcs1[4] = {
1831 common_hv_6ht_6vt_16w_lsx,
1832 common_hv_6ht_4vt_16w_lsx,
1833 common_hv_4ht_6vt_16w_lsx,
1834 common_hv_4ht_4vt_16w_lsx,
1835 };
1836
1837 static PVp8SixtapPredictFunc2 Predict16x16Funcs2[4] = {
1838 common_vt_6t_16w_lsx, common_vt_4t_16w_lsx, common_hz_6t_16w_lsx,
1839 common_hz_4t_16w_lsx
1840 };
1841
1842 if (yoffset < 8 && xoffset < 8) {
1843 if (yoffset) {
1844 if (xoffset) {
1845 switch (xoffset & 1) {
1846 case 0:
1847 switch (yoffset & 1) {
1848 case 0:
1849 Predict16x16Funcs1[0](src, src_stride, dst, dst_stride,
1850 h_filter, v_filter, 16);
1851 break;
1852
1853 case 1:
1854 Predict16x16Funcs1[1](src, src_stride, dst, dst_stride,
1855 h_filter, v_filter + 1, 16);
1856 break;
1857 }
1858 break;
1859
1860 case 1:
1861 switch (yoffset & 1) {
1862 case 0:
1863 Predict16x16Funcs1[2](src, src_stride, dst, dst_stride,
1864 h_filter + 1, v_filter, 16);
1865 break;
1866
1867 case 1:
1868 Predict16x16Funcs1[3](src, src_stride, dst, dst_stride,
1869 h_filter + 1, v_filter + 1, 16);
1870 break;
1871 }
1872 break;
1873 }
1874 } else {
1875 switch (yoffset & 1) {
1876 case 0:
1877 Predict16x16Funcs2[0](src, src_stride, dst, dst_stride, v_filter,
1878 16);
1879 break;
1880
1881 case 1:
1882 Predict16x16Funcs2[1](src, src_stride, dst, dst_stride,
1883 v_filter + 1, 16);
1884 break;
1885 }
1886 }
1887 } else {
1888 switch (xoffset & 1) {
1889 case 1:
1890 Predict16x16Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1,
1891 16);
1892 break;
1893 }
1894 switch (xoffset) {
1895 case 0: vp8_copy_mem16x16(src, src_stride, dst, dst_stride); break;
1896 case 2:
1897 case 4:
1898 case 6:
1899 Predict16x16Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 16);
1900 break;
1901 }
1902 }
1903 }
1904 }
1905