/*
 * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#ifndef VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
#define VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_

#include "./vpx_config.h"
#include "vpx_dsp/vpx_filter.h"
#include "vpx_util/loongson_intrinsics.h"

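/* Apply an 8-tap filter as four pairwise signed-byte dot products: each
 * (_regN, _filterN) pair is reduced to halfword partial sums with
 * __lsx_vdp2_h_b / __lsx_vdp2add_h_b, and the two accumulators are
 * combined with a saturating halfword add.
 */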
static INLINE __m128i filt_8tap_dpadd_s_h(__m128i _reg0, __m128i _reg1,
                                          __m128i _reg2, __m128i _reg3,
                                          __m128i _filter0, __m128i _filter1,
                                          __m128i _filter2, __m128i _filter3) {
  __m128i _vec0, _vec1;

  _vec0 = __lsx_vdp2_h_b(_reg0, _filter0);
  _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1);
  _vec1 = __lsx_vdp2_h_b(_reg2, _filter2);
  _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3);
  return __lsx_vsadd_h(_vec0, _vec1);
}

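/* Horizontal 8-tap filter for one row: _mask0.._mask3 gather the sliding
 * pixel windows from the (_src0, _src1) pair, the taps are applied with
 * filt_8tap_dpadd_s_h(), and the result is rounded by FILTER_BITS and
 * saturated to the signed 8-bit range.
 */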
static INLINE __m128i horiz_8tap_filt(__m128i _src0, __m128i _src1,
                                      __m128i _mask0, __m128i _mask1,
                                      __m128i _mask2, __m128i _mask3,
                                      __m128i _filt_h0, __m128i _filt_h1,
                                      __m128i _filt_h2, __m128i _filt_h3) {
  __m128i _tmp0, _tmp1, _tmp2, _tmp3;
  __m128i _out;

  DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1,
            _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3);
  _out = filt_8tap_dpadd_s_h(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1,
                             _filt_h2, _filt_h3);
  _out = __lsx_vsrari_h(_out, FILTER_BITS);
  return __lsx_vsat_h(_out, 7);
}

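/* Horizontal 2-tap (bilinear) filter: gathers adjacent pixel pairs with a
 * shuffle mask, forms the unsigned dot product with the two-tap
 * coefficients, and rounds by FILTER_BITS.
 */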
static INLINE __m128i horiz_2tap_filt_uh(__m128i in0, __m128i in1, __m128i mask,
                                         __m128i coeff) {
  __m128i tmp0_m, tmp1_m;

  tmp0_m = __lsx_vshuf_b(in1, in0, mask);
  tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff);
  return __lsx_vsrari_h(tmp1_m, FILTER_BITS);
}

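/* Load four 16-byte rows starting at _src; _src is advanced by _stride
 * between loads and is left pointing at the last row loaded.  A minimal
 * usage sketch (variable names are illustrative only):
 *
 *   uint8_t *src_tmp = src;
 *   __m128i row0, row1, row2, row3;
 *   LSX_LD_4(src_tmp, src_stride, row0, row1, row2, row3);
 */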
#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \
  do {                                                      \
    _src0 = __lsx_vld(_src, 0);                             \
    _src += _stride;                                        \
    _src1 = __lsx_vld(_src, 0);                             \
    _src += _stride;                                        \
    _src2 = __lsx_vld(_src, 0);                             \
    _src += _stride;                                        \
    _src3 = __lsx_vld(_src, 0);                             \
  } while (0)

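/* 8-tap horizontal filter for 4-wide blocks: operates on the row pairs
 * (_src0, _src1) and (_src2, _src3), gathering the tap windows across each
 * pair with _mask0.._mask3, accumulating each tap's dot products, and
 * combining the partial sums with saturating adds into _out0 and _out1.
 */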
#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0, _mask1, \
                                   _mask2, _mask3, _filter0, _filter1,         \
                                   _filter2, _filter3, _out0, _out1)           \
  do {                                                                         \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;            \
    __m128i _reg0, _reg1, _reg2, _reg3;                                        \
                                                                               \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src3, _src2, _mask0,       \
              _tmp0, _tmp1);                                                   \
    DUP2_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _reg0, _reg1); \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask1, _src3, _src2, _mask1,       \
              _tmp2, _tmp3);                                                   \
    DUP2_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp2, _filter1, _reg1, _tmp3,         \
              _filter1, _reg0, _reg1);                                         \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask2, _src3, _src2, _mask2,       \
              _tmp4, _tmp5);                                                   \
    DUP2_ARG2(__lsx_vdp2_h_b, _tmp4, _filter2, _tmp5, _filter2, _reg2, _reg3); \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask3, _src3, _src2, _mask3,       \
              _tmp6, _tmp7);                                                   \
    DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7,         \
              _filter3, _reg2, _reg3);                                         \
    DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1);        \
  } while (0)

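/* 8-tap horizontal filter for 8-wide blocks: each of the four source rows
 * is shuffled against itself to build the tap windows, and saturating adds
 * of the partial sums yield the four halfword outputs _out0.._out3.
 */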
#define HORIZ_8TAP_8WID_4VECS_FILT(                                        \
    _src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0,  \
    _filter1, _filter2, _filter3, _out0, _out1, _out2, _out3)              \
  do {                                                                     \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;        \
    __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7;        \
                                                                           \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask0, _src1, _src1, _mask0,   \
              _src2, _src2, _mask0, _src3, _src3, _mask0, _tmp0, _tmp1,    \
              _tmp2, _tmp3);                                               \
    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _tmp2,     \
              _filter0, _tmp3, _filter0, _reg0, _reg1, _reg2, _reg3);      \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask2, _src1, _src1, _mask2,   \
              _src2, _src2, _mask2, _src3, _src3, _mask2, _tmp0, _tmp1,    \
              _tmp2, _tmp3);                                               \
    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter2, _tmp1, _filter2, _tmp2,     \
              _filter2, _tmp3, _filter2, _reg4, _reg5, _reg6, _reg7);      \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask1, _src1, _src1, _mask1,   \
              _src2, _src2, _mask1, _src3, _src3, _mask1, _tmp4, _tmp5,    \
              _tmp6, _tmp7);                                               \
    DUP4_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp4, _filter1, _reg1, _tmp5,     \
              _filter1, _reg2, _tmp6, _filter1, _reg3, _tmp7, _filter1,    \
              _reg0, _reg1, _reg2, _reg3);                                 \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask3, _src1, _src1, _mask3,   \
              _src2, _src2, _mask3, _src3, _src3, _mask3, _tmp4, _tmp5,    \
              _tmp6, _tmp7);                                               \
    DUP4_ARG3(__lsx_vdp2add_h_b, _reg4, _tmp4, _filter3, _reg5, _tmp5,     \
              _filter3, _reg6, _tmp6, _filter3, _reg7, _tmp7, _filter3,    \
              _reg4, _reg5, _reg6, _reg7);                                 \
    DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6,     \
              _reg3, _reg7, _out0, _out1, _out2, _out3);                   \
  } while (0)

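/* Rounded unsigned average of the filtered vectors (in0, in1) with the
 * destination vectors (dst0, dst1), stored as four 8-byte rows at pdst;
 * pdst is advanced by stride between rows.
 */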
#define AVG_ST4_D(in0, in1, dst0, dst1, pdst, stride)                \
  do {                                                               \
    __m128i tmp0_m, tmp1_m;                                          \
                                                                     \
    DUP2_ARG2(__lsx_vavgr_bu, in0, dst0, in1, dst1, tmp0_m, tmp1_m); \
    __lsx_vstelm_d(tmp0_m, pdst, 0, 0);                              \
    pdst += stride;                                                  \
    __lsx_vstelm_d(tmp0_m, pdst, 0, 1);                              \
    pdst += stride;                                                  \
    __lsx_vstelm_d(tmp1_m, pdst, 0, 0);                              \
    pdst += stride;                                                  \
    __lsx_vstelm_d(tmp1_m, pdst, 0, 1);                              \
  } while (0)

#endif  // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_