1*b2055c35SXin Li // Copyright 2016 Google Inc. All Rights Reserved.
2*b2055c35SXin Li //
3*b2055c35SXin Li // Use of this source code is governed by a BSD-style license
4*b2055c35SXin Li // that can be found in the COPYING file in the root of the source
5*b2055c35SXin Li // tree. An additional intellectual property rights grant can be found
6*b2055c35SXin Li // in the file PATENTS. All contributing project authors may
7*b2055c35SXin Li // be found in the AUTHORS file in the root of the source tree.
8*b2055c35SXin Li // -----------------------------------------------------------------------------
9*b2055c35SXin Li //
10*b2055c35SXin Li // MSA variant of Image transform methods for lossless encoder.
11*b2055c35SXin Li //
12*b2055c35SXin Li // Authors: Prashant Patil ([email protected])
13*b2055c35SXin Li
14*b2055c35SXin Li #include "src/dsp/dsp.h"
15*b2055c35SXin Li
16*b2055c35SXin Li #if defined(WEBP_USE_MSA)
17*b2055c35SXin Li
18*b2055c35SXin Li #include "src/dsp/lossless.h"
19*b2055c35SXin Li #include "src/dsp/msa_macro.h"
20*b2055c35SXin Li
// Forward color transform of eight ARGB pixels held in the two vectors
// 'src0' and 'src1'; the transformed pixels are written to 'dst0' and 'dst1'.
//   c0    : per-pixel multiplier bytes applied to the green channel.
//   c1    : per-pixel multiplier bytes applied to the red channel.
//   mask0 : shuffle control that replicates each pixel's green byte.
//   mask1 : shuffle control that merges the updated bytes back with the
//           untouched bytes of the source pixel.
// Each signed product is shifted right by 5 before being subtracted from the
// 16-bit lanes of the source.
// NOTE(review): the exact lane semantics of VSHF_B2_*, DOTP_SB2_SH, SRAI_H2_SH
// and SUB2 come from msa_macro.h, which is not visible here.
#define TRANSFORM_COLOR_8(src0, src1, dst0, dst1, c0, c1, mask0, mask1) do {  \
  v8i16 grn0, grn1, acc0, acc1, rprod0, rprod1;                               \
  v4i32 red0, red1;                                                           \
  red0 = __msa_srli_w((v4i32)src0, 16);                                       \
  red1 = __msa_srli_w((v4i32)src1, 16);                                       \
  VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, grn0, grn1);               \
  DOTP_SB2_SH(grn0, grn1, c0, c0, acc0, acc1);                                \
  SRAI_H2_SH(acc0, acc1, 5);                                                  \
  acc0 = __msa_subv_h((v8i16)src0, acc0);                                     \
  acc1 = __msa_subv_h((v8i16)src1, acc1);                                     \
  DOTP_SB2_SH(red0, red1, c1, c1, rprod0, rprod1);                            \
  SRAI_H2_SH(rprod0, rprod1, 5);                                              \
  SUB2(acc0, rprod0, acc1, rprod1, acc0, acc1);                               \
  VSHF_B2_UB(src0, acc0, src1, acc1, mask1, mask1, dst0, dst1);               \
} while (0)
36*b2055c35SXin Li
// Forward color transform of four ARGB pixels held in the vector 'src'; the
// transformed pixels are written to 'dst'. Single-vector counterpart of
// TRANSFORM_COLOR_8, taking the same multiplier vectors (c0: applied to green,
// c1: applied to red) and shuffle controls (mask0, mask1).
// NOTE(review): VSHF_SB, VSHF_UB and SRAI_H are defined in msa_macro.h, which
// is not visible here; they are kept as separate statements on purpose.
#define TRANSFORM_COLOR_4(src, dst, c0, c1, mask0, mask1) do {  \
  const v16i8 grn = VSHF_SB(src, src, mask0);                   \
  const v4i32 red = __msa_srli_w((v4i32)src, 16);               \
  v8i16 acc = __msa_dotp_s_h(c0, grn);                          \
  v8i16 rprod;                                                  \
  acc = SRAI_H(acc, 5);                                         \
  acc = __msa_subv_h((v8i16)src, acc);                          \
  rprod = __msa_dotp_s_h(c1, (v16i8)red);                       \
  rprod = SRAI_H(rprod, 5);                                     \
  acc = acc - rprod;                                            \
  dst = VSHF_UB(src, acc, mask1);                               \
} while (0)
50*b2055c35SXin Li
TransformColor_MSA(const VP8LMultipliers * const m,uint32_t * data,int num_pixels)51*b2055c35SXin Li static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data,
52*b2055c35SXin Li int num_pixels) {
53*b2055c35SXin Li v16u8 src0, dst0;
54*b2055c35SXin Li const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
55*b2055c35SXin Li (m->green_to_red_ << 16));
56*b2055c35SXin Li const v16i8 r2b = (v16i8)__msa_fill_w(m->red_to_blue_);
57*b2055c35SXin Li const v16u8 mask0 = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
58*b2055c35SXin Li 13, 255, 13, 255 };
59*b2055c35SXin Li const v16u8 mask1 = { 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11,
60*b2055c35SXin Li 28, 13, 30, 15 };
61*b2055c35SXin Li
62*b2055c35SXin Li while (num_pixels >= 8) {
63*b2055c35SXin Li v16u8 src1, dst1;
64*b2055c35SXin Li LD_UB2(data, 4, src0, src1);
65*b2055c35SXin Li TRANSFORM_COLOR_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1);
66*b2055c35SXin Li ST_UB2(dst0, dst1, data, 4);
67*b2055c35SXin Li data += 8;
68*b2055c35SXin Li num_pixels -= 8;
69*b2055c35SXin Li }
70*b2055c35SXin Li if (num_pixels > 0) {
71*b2055c35SXin Li if (num_pixels >= 4) {
72*b2055c35SXin Li src0 = LD_UB(data);
73*b2055c35SXin Li TRANSFORM_COLOR_4(src0, dst0, g2br, r2b, mask0, mask1);
74*b2055c35SXin Li ST_UB(dst0, data);
75*b2055c35SXin Li data += 4;
76*b2055c35SXin Li num_pixels -= 4;
77*b2055c35SXin Li }
78*b2055c35SXin Li if (num_pixels > 0) {
79*b2055c35SXin Li src0 = LD_UB(data);
80*b2055c35SXin Li TRANSFORM_COLOR_4(src0, dst0, g2br, r2b, mask0, mask1);
81*b2055c35SXin Li if (num_pixels == 3) {
82*b2055c35SXin Li const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
83*b2055c35SXin Li const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);
84*b2055c35SXin Li SD(pix_d, data + 0);
85*b2055c35SXin Li SW(pix_w, data + 2);
86*b2055c35SXin Li } else if (num_pixels == 2) {
87*b2055c35SXin Li const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
88*b2055c35SXin Li SD(pix_d, data);
89*b2055c35SXin Li } else {
90*b2055c35SXin Li const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0);
91*b2055c35SXin Li SW(pix_w, data);
92*b2055c35SXin Li }
93*b2055c35SXin Li }
94*b2055c35SXin Li }
95*b2055c35SXin Li }
96*b2055c35SXin Li
SubtractGreenFromBlueAndRed_MSA(uint32_t * argb_data,int num_pixels)97*b2055c35SXin Li static void SubtractGreenFromBlueAndRed_MSA(uint32_t* argb_data,
98*b2055c35SXin Li int num_pixels) {
99*b2055c35SXin Li int i;
100*b2055c35SXin Li uint8_t* ptemp_data = (uint8_t*)argb_data;
101*b2055c35SXin Li v16u8 src0, dst0, tmp0;
102*b2055c35SXin Li const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
103*b2055c35SXin Li 13, 255, 13, 255 };
104*b2055c35SXin Li
105*b2055c35SXin Li while (num_pixels >= 8) {
106*b2055c35SXin Li v16u8 src1, dst1, tmp1;
107*b2055c35SXin Li LD_UB2(ptemp_data, 16, src0, src1);
108*b2055c35SXin Li VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1);
109*b2055c35SXin Li SUB2(src0, tmp0, src1, tmp1, dst0, dst1);
110*b2055c35SXin Li ST_UB2(dst0, dst1, ptemp_data, 16);
111*b2055c35SXin Li ptemp_data += 8 * 4;
112*b2055c35SXin Li num_pixels -= 8;
113*b2055c35SXin Li }
114*b2055c35SXin Li if (num_pixels > 0) {
115*b2055c35SXin Li if (num_pixels >= 4) {
116*b2055c35SXin Li src0 = LD_UB(ptemp_data);
117*b2055c35SXin Li tmp0 = VSHF_UB(src0, src0, mask);
118*b2055c35SXin Li dst0 = src0 - tmp0;
119*b2055c35SXin Li ST_UB(dst0, ptemp_data);
120*b2055c35SXin Li ptemp_data += 4 * 4;
121*b2055c35SXin Li num_pixels -= 4;
122*b2055c35SXin Li }
123*b2055c35SXin Li for (i = 0; i < num_pixels; i++) {
124*b2055c35SXin Li const uint8_t b = ptemp_data[0];
125*b2055c35SXin Li const uint8_t g = ptemp_data[1];
126*b2055c35SXin Li const uint8_t r = ptemp_data[2];
127*b2055c35SXin Li ptemp_data[0] = (b - g) & 0xff;
128*b2055c35SXin Li ptemp_data[2] = (r - g) & 0xff;
129*b2055c35SXin Li ptemp_data += 4;
130*b2055c35SXin Li }
131*b2055c35SXin Li }
132*b2055c35SXin Li }
133*b2055c35SXin Li
134*b2055c35SXin Li //------------------------------------------------------------------------------
135*b2055c35SXin Li // Entry point
136*b2055c35SXin Li
137*b2055c35SXin Li extern void VP8LEncDspInitMSA(void);
138*b2055c35SXin Li
VP8LEncDspInitMSA(void)139*b2055c35SXin Li WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMSA(void) {
140*b2055c35SXin Li VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MSA;
141*b2055c35SXin Li VP8LTransformColor = TransformColor_MSA;
142*b2055c35SXin Li }
143*b2055c35SXin Li
144*b2055c35SXin Li #else // !WEBP_USE_MSA
145*b2055c35SXin Li
146*b2055c35SXin Li WEBP_DSP_INIT_STUB(VP8LEncDspInitMSA)
147*b2055c35SXin Li
148*b2055c35SXin Li #endif // WEBP_USE_MSA
149