/*
 *  Copyright 2023 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

/*
 * Copyright (c) 2023 SiFive, Inc. All rights reserved.
 *
 * Contributed by Darren Hsieh <[email protected]>
 * Contributed by Bruce Lai <[email protected]>
 */

#include "libyuv/row.h"

// This module is for clang RVV. GCC does not yet support segment load & store.
#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \
    defined(__clang__)
#include <assert.h>
#include <riscv_vector.h>

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// Fill YUV -> RGB conversion constants into vectors.
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode (0).
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
  {                                                              \
    asm volatile("csrwi vxrm, 0");                               \
    ub = yuvconst->kUVCoeff[0];                                  \
    vr = yuvconst->kUVCoeff[1];                                  \
    ug = yuvconst->kUVCoeff[2];                                  \
    vg = yuvconst->kUVCoeff[3];                                  \
    yg = yuvconst->kRGBCoeffBias[0];                             \
    bb = yuvconst->kRGBCoeffBias[1] + 32;                        \
    bg = yuvconst->kRGBCoeffBias[2] - 32;                        \
    br = yuvconst->kRGBCoeffBias[3] + 32;                        \
  }

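// Note: the +32/-32 on the bias terms lower each intermediate sum by 32,
// i.e. half of the final >> 6 in RGBTORGB8; the round-to-nearest-up vnclipu
// there adds the 32 back, so the net result behaves like a truncating >> 6
// of the unbiased expression.
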
// Read [2*VLEN/8] Y, [VLEN/8] U and [VLEN/8] V from 422
#define READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \
  {                                                              \
    vuint8m1_t v_tmp0, v_tmp1;                                   \
    vuint8m2_t v_y;                                              \
    vuint16m2_t v_u_16, v_v_16;                                  \
    vl = __riscv_vsetvl_e8m1((w + 1) / 2);                       \
    v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl);                     \
    v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl);             \
    v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl);                     \
    v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl);             \
    v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl);          \
    v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl);          \
    v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16);             \
    v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16);             \
    vl = __riscv_vsetvl_e8m2(w);                                 \
    v_y = __riscv_vle8_v_u8m2(src_y, vl);                        \
    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);                \
  }

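// The 0x0101 multiplies above duplicate each chroma sample: a U/V byte
// widened into a u16 lane (0x00ab) times 0x0101 gives 0xabab, so after the
// reinterpret to u8 the same chroma value sits in two adjacent byte lanes.
// This upsamples the 4:2:2 chroma to one U/V sample per Y for the wider
// vector length selected for the luma load.
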
// Read [2*VLEN/8] Y, [2*VLEN/8] U, and [2*VLEN/8] V from 444
#define READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \
  {                                                              \
    vuint8m2_t v_y;                                              \
    vl = __riscv_vsetvl_e8m2(w);                                 \
    v_y = __riscv_vle8_v_u8m2(src_y, vl);                        \
    v_u = __riscv_vle8_v_u8m2(src_u, vl);                        \
    v_v = __riscv_vle8_v_u8m2(src_v, vl);                        \
    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);                \
  }

// Convert from YUV to fixed point RGB
#define YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, \
                 v_b_16, v_r_16)                                               \
  {                                                                            \
    vuint16m4_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4;                        \
    vuint32m8_t v_tmp5;                                                        \
    v_tmp0 = __riscv_vwmulu_vx_u16m4(v_u, ug, vl);                             \
    v_y_16 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl);                        \
    v_tmp0 = __riscv_vwmaccu_vx_u16m4(v_tmp0, vg, v_v, vl);                    \
    v_tmp1 = __riscv_vwmulu_vx_u16m4(v_u, ub, vl);                             \
    v_tmp5 = __riscv_vwmulu_vx_u32m8(v_y_16, yg, vl);                          \
    v_tmp2 = __riscv_vnsrl_wx_u16m4(v_tmp5, 16, vl);                           \
    v_tmp3 = __riscv_vadd_vx_u16m4(v_tmp2, bg, vl);                            \
    v_tmp4 = __riscv_vadd_vv_u16m4(v_tmp2, v_tmp1, vl);                        \
    v_tmp2 = __riscv_vwmaccu_vx_u16m4(v_tmp2, vr, v_v, vl);                    \
    v_g_16 = __riscv_vssubu_vv_u16m4(v_tmp3, v_tmp0, vl);                      \
    v_b_16 = __riscv_vssubu_vx_u16m4(v_tmp4, bb, vl);                          \
    v_r_16 = __riscv_vssubu_vx_u16m4(v_tmp2, br, vl);                          \
  }

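// Roughly, per pixel, with y1 = ((y * 0x0101) * yg) >> 16 (the upper half of
// the 32-bit widening product), YUVTORGB computes:
//   v_b_16 = sat0(y1 + u * ub - bb)
//   v_g_16 = sat0(y1 + bg - (u * ug + v * vg))
//   v_r_16 = sat0(y1 + v * vr - br)
// where sat0() is the unsigned saturating subtract (vssubu) flooring at 0.
// The results carry 6 fractional bits and are narrowed to 8 bits by
// RGBTORGB8 below.
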
// Convert from fixed point RGB to 8 bit RGB
#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \
  {                                                          \
    v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, vl);            \
    v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, vl);            \
    v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl);            \
  }

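// In RGBTORGB8, vnclipu shifts the 16-bit intermediates right by 6 and
// saturates to [0, 255]; with vxrm = 0 (set in YUVTORGB_SETUP) the shift
// rounds to nearest-up.
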
// Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv
#define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16)   \
  {                                                        \
    vuint8m1_t v_tmp0, v_tmp1;                             \
    vuint8m2_t v_y;                                        \
    vuint16m2_t v_u_16, v_v_16;                            \
    vl = __riscv_vsetvl_e8m1((w + 1) / 2);                 \
    __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_uv, vl); \
    v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl);       \
    v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl);       \
    v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl);    \
    v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl);    \
    v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16);       \
    v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16);       \
    vl = __riscv_vsetvl_e8m2(w);                           \
    v_y = __riscv_vle8_v_u8m2(src_y, vl);                  \
    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);          \
  }

// Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_vu
#define READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16)   \
  {                                                        \
    vuint8m1_t v_tmp0, v_tmp1;                             \
    vuint8m2_t v_y;                                        \
    vuint16m2_t v_u_16, v_v_16;                            \
    vl = __riscv_vsetvl_e8m1((w + 1) / 2);                 \
    __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_vu, vl); \
    v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl);       \
    v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl);       \
    v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl);    \
    v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl);    \
    v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16);       \
    v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16);       \
    vl = __riscv_vsetvl_e8m2(w);                           \
    v_y = __riscv_vle8_v_u8m2(src_y, vl);                  \
    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);          \
  }

#ifdef HAS_ARGBTOAR64ROW_RVV
void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
  size_t avl = (size_t)4 * width;
  do {
    vuint16m8_t v_ar64;
    vuint8m4_t v_argb;
    size_t vl = __riscv_vsetvl_e8m4(avl);
    v_argb = __riscv_vle8_v_u8m4(src_argb, vl);
    v_ar64 = __riscv_vwaddu_vx_u16m8(v_argb, 0, vl);
    v_ar64 = __riscv_vmul_vx_u16m8(v_ar64, 0x0101, vl);
    __riscv_vse16_v_u16m8(dst_ar64, v_ar64, vl);
    avl -= vl;
    src_argb += vl;
    dst_ar64 += vl;
  } while (avl > 0);
}
#endif

#ifdef HAS_ARGBTOAB64ROW_RVV
void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) {
  size_t avl = (size_t)width;
  do {
    vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16;
    vuint8m1_t v_b, v_g, v_r, v_a;
    size_t vl = __riscv_vsetvl_e8m1(avl);
    __riscv_vlseg4e8_v_u8m1(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    v_b_16 = __riscv_vwaddu_vx_u16m2(v_b, 0, vl);
    v_g_16 = __riscv_vwaddu_vx_u16m2(v_g, 0, vl);
    v_r_16 = __riscv_vwaddu_vx_u16m2(v_r, 0, vl);
    v_a_16 = __riscv_vwaddu_vx_u16m2(v_a, 0, vl);
    v_b_16 = __riscv_vmul_vx_u16m2(v_b_16, 0x0101, vl);
    v_g_16 = __riscv_vmul_vx_u16m2(v_g_16, 0x0101, vl);
    v_r_16 = __riscv_vmul_vx_u16m2(v_r_16, 0x0101, vl);
    v_a_16 = __riscv_vmul_vx_u16m2(v_a_16, 0x0101, vl);
    __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r_16, v_g_16, v_b_16, v_a_16, vl);
    avl -= vl;
    src_argb += 4 * vl;
    dst_ab64 += 4 * vl;
  } while (avl > 0);
}
#endif

#ifdef HAS_AR64TOARGBROW_RVV
void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) {
  size_t avl = (size_t)4 * width;
  do {
    vuint16m8_t v_ar64;
    vuint8m4_t v_argb;
    size_t vl = __riscv_vsetvl_e16m8(avl);
    v_ar64 = __riscv_vle16_v_u16m8(src_ar64, vl);
    v_argb = __riscv_vnsrl_wx_u8m4(v_ar64, 8, vl);
    __riscv_vse8_v_u8m4(dst_argb, v_argb, vl);
    avl -= vl;
    src_ar64 += vl;
    dst_argb += vl;
  } while (avl > 0);
}
#endif

#ifdef HAS_AR64TOAB64ROW_RVV
void AR64ToAB64Row_RVV(const uint16_t* src_ar64,
                       uint16_t* dst_ab64,
                       int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e16m2(w);
    vuint16m2_t v_b, v_g, v_r, v_a;
    __riscv_vlseg4e16_v_u16m2(&v_b, &v_g, &v_r, &v_a, src_ar64, vl);
    __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r, v_g, v_b, v_a, vl);
    w -= vl;
    src_ar64 += vl * 4;
    dst_ab64 += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_AB64TOARGBROW_RVV
void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
  size_t avl = (size_t)width;
  do {
    vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16;
    vuint8m1_t v_b, v_g, v_r, v_a;
    size_t vl = __riscv_vsetvl_e16m2(avl);
    __riscv_vlseg4e16_v_u16m2(&v_r_16, &v_g_16, &v_b_16, &v_a_16, src_ab64, vl);
    v_b = __riscv_vnsrl_wx_u8m1(v_b_16, 8, vl);
    v_g = __riscv_vnsrl_wx_u8m1(v_g_16, 8, vl);
    v_r = __riscv_vnsrl_wx_u8m1(v_r_16, 8, vl);
    v_a = __riscv_vnsrl_wx_u8m1(v_a_16, 8, vl);
    __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl);
    avl -= vl;
    src_ab64 += 4 * vl;
    dst_argb += 4 * vl;
  } while (avl > 0);
}
#endif

#ifdef HAS_RAWTOARGBROW_RVV
void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    vuint8m2_t v_b, v_g, v_r;
    __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_raw += vl * 3;
    dst_argb += vl * 4;
    vl = __riscv_vsetvl_e8m2(w);
  } while (w > 0);
}
#endif

#ifdef HAS_RAWTORGBAROW_RVV
void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    vuint8m2_t v_b, v_g, v_r;
    __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl);
    __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl);
    w -= vl;
    src_raw += vl * 3;
    dst_rgba += vl * 4;
    vl = __riscv_vsetvl_e8m2(w);
  } while (w > 0);
}
#endif

#ifdef HAS_RAWTORGB24ROW_RVV
void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  size_t w = (size_t)width;
  do {
    vuint8m2_t v_b, v_g, v_r;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_raw, vl);
    __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_r, v_g, v_b, vl);
    w -= vl;
    src_raw += vl * 3;
    dst_rgb24 += vl * 3;
  } while (w > 0);
}
#endif

#ifdef HAS_ARGBTORAWROW_RVV
void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
  size_t w = (size_t)width;
  do {
    vuint8m2_t v_b, v_g, v_r, v_a;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    __riscv_vsseg3e8_v_u8m2(dst_raw, v_r, v_g, v_b, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_raw += vl * 3;
  } while (w > 0);
}
#endif

#ifdef HAS_ARGBTORGB24ROW_RVV
void ARGBToRGB24Row_RVV(const uint8_t* src_argb,
                        uint8_t* dst_rgb24,
                        int width) {
  size_t w = (size_t)width;
  do {
    vuint8m2_t v_b, v_g, v_r, v_a;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_rgb24 += vl * 3;
  } while (w > 0);
}
#endif

#ifdef HAS_ARGBTOABGRROW_RVV
void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_a, v_r, v_g, v_b;
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    __riscv_vsseg4e8_v_u8m2(dst_abgr, v_r, v_g, v_b, v_a, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_abgr += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_ARGBTOBGRAROW_RVV
void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_a, v_r, v_g, v_b;
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    __riscv_vsseg4e8_v_u8m2(dst_bgra, v_a, v_r, v_g, v_b, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_bgra += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_ARGBTORGBAROW_RVV
void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_a, v_r, v_g, v_b;
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_rgba += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_RGBATOARGBROW_RVV
void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_a, v_r, v_g, v_b;
    __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_rgba += vl * 4;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_RGB24TOARGBROW_RVV
void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24,
                        uint8_t* dst_argb,
                        int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    vuint8m2_t v_b, v_g, v_r;
    __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb24, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_rgb24 += vl * 3;
    dst_argb += vl * 4;
    vl = __riscv_vsetvl_e8m2(w);
  } while (w > 0);
}
#endif

#ifdef HAS_I444TOARGBROW_RVV
void I444ToARGBRow_RVV(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_y += vl;
    src_u += vl;
    src_v += vl;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_I444ALPHATOARGBROW_RVV
void I444AlphaToARGBRow_RVV(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            const uint8_t* src_a,
                            uint8_t* dst_argb,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  size_t vl;
  size_t w = (size_t)width;
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  do {
    READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    v_a = __riscv_vle8_v_u8m2(src_a, vl);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_y += vl;
    src_a += vl;
    src_u += vl;
    src_v += vl;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_I444TORGB24ROW_RVV
void I444ToRGB24Row_RVV(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgb24,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  size_t vl;
  size_t w = (size_t)width;
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  do {
    READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
    w -= vl;
    src_y += vl;
    src_u += vl;
    src_v += vl;
    dst_rgb24 += vl * 3;
  } while (w > 0);
}
#endif

#ifdef HAS_I422TOARGBROW_RVV
void I422ToARGBRow_RVV(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_y += vl;
    src_u += vl / 2;
    src_v += vl / 2;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_I422ALPHATOARGBROW_RVV
void I422AlphaToARGBRow_RVV(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            const uint8_t* src_a,
                            uint8_t* dst_argb,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  size_t vl;
  size_t w = (size_t)width;
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  do {
    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    v_a = __riscv_vle8_v_u8m2(src_a, vl);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_y += vl;
    src_a += vl;
    src_u += vl / 2;
    src_v += vl / 2;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_I422TORGBAROW_RVV
void I422ToRGBARow_RVV(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_rgba,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl);
    w -= vl;
    src_y += vl;
    src_u += vl / 2;
    src_v += vl / 2;
    dst_rgba += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_I422TORGB24ROW_RVV
void I422ToRGB24Row_RVV(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgb24,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  size_t vl;
  size_t w = (size_t)width;
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  do {
    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
    w -= vl;
    src_y += vl;
    src_u += vl / 2;
    src_v += vl / 2;
    dst_rgb24 += vl * 3;
  } while (w > 0);
}
#endif

#ifdef HAS_I400TOARGBROW_RVV
void I400ToARGBRow_RVV(const uint8_t* src_y,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  const bool is_yb_positive = (yuvconstants->kRGBCoeffBias[4] >= 0);
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  vuint16m4_t v_yb;
  vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl);
  // To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode (0).
  asm volatile("csrwi vxrm, 0");
  if (is_yb_positive) {
    v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl);
  } else {
    v_yb = __riscv_vmv_v_x_u16m4(-yuvconstants->kRGBCoeffBias[4] + 32, vl);
  }
  do {
    vuint8m2_t v_y, v_out;
    vuint16m4_t v_y_16, v_tmp0, v_tmp1, v_tmp2;
    vl = __riscv_vsetvl_e8m2(w);
    v_y = __riscv_vle8_v_u8m2(src_y, vl);
    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);
    v_tmp0 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl);  // 257 * v_y
    v_tmp1 = __riscv_vmulhu_vv_u16m4(v_tmp0, v_yg, vl);
    if (is_yb_positive) {
      v_tmp2 = __riscv_vsaddu_vv_u16m4(v_tmp1, v_yb, vl);
    } else {
      v_tmp2 = __riscv_vssubu_vv_u16m4(v_tmp1, v_yb, vl);
    }
    v_out = __riscv_vnclipu_wx_u8m2(v_tmp2, 6, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_out, v_out, v_out, v_a, vl);
    w -= vl;
    src_y += vl;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_J400TOARGBROW_RVV
void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    vuint8m2_t v_y;
    v_y = __riscv_vle8_v_u8m2(src_y, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_y, v_y, v_y, v_a, vl);
    w -= vl;
    src_y += vl;
    dst_argb += vl * 4;
    vl = __riscv_vsetvl_e8m2(w);
  } while (w > 0);
}
#endif

#ifdef HAS_COPYROW_RVV
void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m8(w);
    vuint8m8_t v_data = __riscv_vle8_v_u8m8(src, vl);
    __riscv_vse8_v_u8m8(dst, v_data, vl);
    w -= vl;
    src += vl;
    dst += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_NV12TOARGBROW_RVV
void NV12ToARGBRow_RVV(const uint8_t* src_y,
                       const uint8_t* src_uv,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_y += vl;
    src_uv += vl;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_NV12TORGB24ROW_RVV
void NV12ToRGB24Row_RVV(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_rgb24,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  do {
    READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
    w -= vl;
    src_y += vl;
    src_uv += vl;
    dst_rgb24 += vl * 3;
  } while (w > 0);
}
#endif

#ifdef HAS_NV21TOARGBROW_RVV
void NV21ToARGBRow_RVV(const uint8_t* src_y,
                       const uint8_t* src_vu,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_y += vl;
    src_vu += vl;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_NV21TORGB24ROW_RVV
void NV21ToRGB24Row_RVV(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_rgb24,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  do {
    READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
    w -= vl;
    src_y += vl;
    src_vu += vl;
    dst_rgb24 += vl * 3;
  } while (w > 0);
}
#endif

// Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1

#ifdef HAS_INTERPOLATEROW_RVV
void InterpolateRow_RVV(uint8_t* dst_ptr,
                        const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        int dst_width,
                        int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  size_t dst_w = (size_t)dst_width;
  assert(source_y_fraction >= 0);
  assert(source_y_fraction < 256);
  // Blend 100 / 0 - Copy row unchanged.
  if (y1_fraction == 0) {
    do {
      size_t vl = __riscv_vsetvl_e8m8(dst_w);
      __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl);
      dst_w -= vl;
      src_ptr += vl;
      dst_ptr += vl;
    } while (dst_w > 0);
    return;
  }
  // To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up (0).
  asm volatile("csrwi vxrm, 0");
  // Blend 50 / 50.
  if (y1_fraction == 128) {
    do {
      size_t vl = __riscv_vsetvl_e8m8(dst_w);
      vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl);
      vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl);
      // Use round-to-nearest-up mode for averaging add
      vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl);
      __riscv_vse8_v_u8m8(dst_ptr, row_out, vl);
      dst_w -= vl;
      src_ptr += vl;
      src_ptr1 += vl;
      dst_ptr += vl;
    } while (dst_w > 0);
    return;
  }
  // General purpose row blend.
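  // dst = (row0 * y0_fraction + row1 * y1_fraction) >> 8, where the two
  // fractions sum to 256. vnclipu narrows with vxrm = 0, so the shift is in
  // effect (acc + 128) >> 8 with saturation to 255.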
  do {
    size_t vl = __riscv_vsetvl_e8m4(dst_w);
    vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl);
    vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl);
    vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl);
    acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl);
    // Use round-to-nearest-up mode for vnclip
    __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl);
    dst_w -= vl;
    src_ptr += vl;
    src_ptr1 += vl;
    dst_ptr += vl;
  } while (dst_w > 0);
}
#endif

#ifdef HAS_SPLITRGBROW_RVV
void SplitRGBRow_RVV(const uint8_t* src_rgb,
                     uint8_t* dst_r,
                     uint8_t* dst_g,
                     uint8_t* dst_b,
                     int width) {
  size_t w = (size_t)width;
  do {
    vuint8m2_t v_b, v_g, v_r;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_rgb, vl);
    __riscv_vse8_v_u8m2(dst_r, v_r, vl);
    __riscv_vse8_v_u8m2(dst_g, v_g, vl);
    __riscv_vse8_v_u8m2(dst_b, v_b, vl);
    w -= vl;
    dst_r += vl;
    dst_g += vl;
    dst_b += vl;
    src_rgb += vl * 3;
  } while (w > 0);
}
#endif

#ifdef HAS_MERGERGBROW_RVV
void MergeRGBRow_RVV(const uint8_t* src_r,
                     const uint8_t* src_g,
                     const uint8_t* src_b,
                     uint8_t* dst_rgb,
                     int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl);
    vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl);
    vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl);
    __riscv_vsseg3e8_v_u8m2(dst_rgb, v_r, v_g, v_b, vl);
    w -= vl;
    src_r += vl;
    src_g += vl;
    src_b += vl;
    dst_rgb += vl * 3;
  } while (w > 0);
}
#endif

#ifdef HAS_SPLITARGBROW_RVV
void SplitARGBRow_RVV(const uint8_t* src_argb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
                      uint8_t* dst_a,
                      int width) {
  size_t w = (size_t)width;
  do {
    vuint8m2_t v_b, v_g, v_r, v_a;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    __riscv_vse8_v_u8m2(dst_a, v_a, vl);
    __riscv_vse8_v_u8m2(dst_r, v_r, vl);
    __riscv_vse8_v_u8m2(dst_g, v_g, vl);
    __riscv_vse8_v_u8m2(dst_b, v_b, vl);
    w -= vl;
    dst_a += vl;
    dst_r += vl;
    dst_g += vl;
    dst_b += vl;
    src_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_MERGEARGBROW_RVV
void MergeARGBRow_RVV(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
                      const uint8_t* src_a,
                      uint8_t* dst_argb,
                      int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl);
    vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl);
    vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl);
    vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_r += vl;
    src_g += vl;
    src_b += vl;
    src_a += vl;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_SPLITXRGBROW_RVV
void SplitXRGBRow_RVV(const uint8_t* src_argb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
                      int width) {
  size_t w = (size_t)width;
  do {
    vuint8m2_t v_b, v_g, v_r, v_a;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    __riscv_vse8_v_u8m2(dst_r, v_r, vl);
    __riscv_vse8_v_u8m2(dst_g, v_g, vl);
    __riscv_vse8_v_u8m2(dst_b, v_b, vl);
    w -= vl;
    dst_r += vl;
    dst_g += vl;
    dst_b += vl;
    src_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_MERGEXRGBROW_RVV
void MergeXRGBRow_RVV(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
                      uint8_t* dst_argb,
                      int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    vuint8m2_t v_r, v_g, v_b;
    v_r = __riscv_vle8_v_u8m2(src_r, vl);
    v_g = __riscv_vle8_v_u8m2(src_g, vl);
    v_b = __riscv_vle8_v_u8m2(src_b, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_r += vl;
    src_g += vl;
    src_b += vl;
    dst_argb += vl * 4;
    vl = __riscv_vsetvl_e8m2(w);
  } while (w > 0);
}
#endif

#ifdef HAS_SPLITUVROW_RVV
void SplitUVRow_RVV(const uint8_t* src_uv,
                    uint8_t* dst_u,
                    uint8_t* dst_v,
                    int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m4(w);
    vuint8m4_t v_u, v_v;
    __riscv_vlseg2e8_v_u8m4(&v_u, &v_v, src_uv, vl);
    __riscv_vse8_v_u8m4(dst_u, v_u, vl);
    __riscv_vse8_v_u8m4(dst_v, v_v, vl);
    w -= vl;
    dst_u += vl;
    dst_v += vl;
    src_uv += 2 * vl;
  } while (w > 0);
}
#endif

#ifdef HAS_MERGEUVROW_RVV
void MergeUVRow_RVV(const uint8_t* src_u,
                    const uint8_t* src_v,
                    uint8_t* dst_uv,
                    int width) {
  size_t w = (size_t)width;
  do {
    vuint8m4_t v_u, v_v;
    size_t vl = __riscv_vsetvl_e8m4(w);
    v_u = __riscv_vle8_v_u8m4(src_u, vl);
    v_v = __riscv_vle8_v_u8m4(src_v, vl);
    __riscv_vsseg2e8_v_u8m4(dst_uv, v_u, v_v, vl);
    w -= vl;
    src_u += vl;
    src_v += vl;
    dst_uv += 2 * vl;
  } while (w > 0);
}
#endif

struct RgbConstants {
  uint8_t kRGBToY[4];
  uint16_t kAddY;
  uint16_t pad;
};

// RGB to JPEG coefficients
// B * 0.1140 coefficient = 29
// G * 0.5870 coefficient = 150
// R * 0.2990 coefficient = 77
// Add 0.5 = 0x80
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
                                                        128,
                                                        0};

static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};

// RGB to BT.601 coefficients
// B * 0.1016 coefficient = 25
// G * 0.5078 coefficient = 129
// R * 0.2578 coefficient = 66
// Add 16.5 = 0x1080

static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
                                                        0x1080,
                                                        0};

static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
                                                      0x1080,
                                                      0};

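// For example, with kRgb24I601Constants (kRGBToY = {25, 129, 66} in B, G, R
// order and kAddY = 0x1080 = 16.5 * 256), the helpers below compute
//   Y = (66 * R + 129 * G + 25 * B + 0x1080) >> 8,
// i.e. the limited-range BT.601 luma with a +16 offset and 0.5 rounding.
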
// ARGB expects the first 3 values to contain RGB; the 4th value is ignored.
#ifdef HAS_ARGBTOYMATRIXROW_RVV
void ARGBToYMatrixRow_RVV(const uint8_t* src_argb,
                          uint8_t* dst_y,
                          int width,
                          const struct RgbConstants* rgbconstants) {
  assert(width != 0);
  size_t w = (size_t)width;
  vuint8m2_t v_by, v_gy, v_ry;  // vectors to hold the RGBToY constants
  vuint16m4_t v_addy;           // vector to hold kAddY
  size_t vl = __riscv_vsetvl_e8m2(w);
  v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl);
  v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl);
  v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl);
  v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl);
  do {
    vuint8m2_t v_b, v_g, v_r, v_a, v_y;
    vuint16m4_t v_y_u16;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
    v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl);
    v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl);
    __riscv_vse8_v_u8m2(dst_y, v_y, vl);
    w -= vl;
    src_argb += 4 * vl;
    dst_y += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_ARGBTOYROW_RVV
void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_RVV(src_argb, dst_y, width, &kRgb24I601Constants);
}
#endif

#ifdef HAS_ARGBTOYJROW_RVV
void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_RVV(src_argb, dst_yj, width, &kRgb24JPEGConstants);
}
#endif

#ifdef HAS_ABGRTOYROW_RVV
void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_RVV(src_abgr, dst_y, width, &kRawI601Constants);
}
#endif

#ifdef HAS_ABGRTOYJROW_RVV
void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_RVV(src_abgr, dst_yj, width, &kRawJPEGConstants);
}
#endif

// RGBA expects the first value to be A (ignored), then 3 values containing RGB.
#ifdef HAS_RGBATOYMATRIXROW_RVV
void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba,
                          uint8_t* dst_y,
                          int width,
                          const struct RgbConstants* rgbconstants) {
  assert(width != 0);
  size_t w = (size_t)width;
  vuint8m2_t v_by, v_gy, v_ry;  // vectors to hold the RGBToY constants
  vuint16m4_t v_addy;           // vector to hold kAddY
  size_t vl = __riscv_vsetvl_e8m2(w);
  v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl);
  v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl);
  v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl);
  v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl);
  do {
    vuint8m2_t v_b, v_g, v_r, v_a, v_y;
    vuint16m4_t v_y_u16;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl);
    v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
    v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl);
    v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl);
    __riscv_vse8_v_u8m2(dst_y, v_y, vl);
    w -= vl;
    src_rgba += 4 * vl;
    dst_y += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_RGBATOYROW_RVV
void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  RGBAToYMatrixRow_RVV(src_rgba, dst_y, width, &kRgb24I601Constants);
}
#endif

#ifdef HAS_RGBATOYJROW_RVV
void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
  RGBAToYMatrixRow_RVV(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
}
#endif

#ifdef HAS_BGRATOYROW_RVV
void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  RGBAToYMatrixRow_RVV(src_bgra, dst_y, width, &kRawI601Constants);
}
#endif

#ifdef HAS_RGBTOYMATRIXROW_RVV
void RGBToYMatrixRow_RVV(const uint8_t* src_rgb,
                         uint8_t* dst_y,
                         int width,
                         const struct RgbConstants* rgbconstants) {
  assert(width != 0);
  size_t w = (size_t)width;
  vuint8m2_t v_by, v_gy, v_ry;  // vectors to hold the RGBToY constants
  vuint16m4_t v_addy;           // vector to hold kAddY
  size_t vl = __riscv_vsetvl_e8m2(w);
  v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl);
  v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl);
  v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl);
  v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl);
  do {
    vuint8m2_t v_b, v_g, v_r, v_y;
    vuint16m4_t v_y_u16;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb, vl);
    v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
    v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl);
    v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl);
    __riscv_vse8_v_u8m2(dst_y, v_y, vl);
    w -= vl;
    src_rgb += 3 * vl;
    dst_y += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_RGB24TOYJROW_RVV
void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
  RGBToYMatrixRow_RVV(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
}
#endif

#ifdef HAS_RAWTOYJROW_RVV
void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
  RGBToYMatrixRow_RVV(src_raw, dst_yj, width, &kRawJPEGConstants);
}
#endif

#ifdef HAS_RGB24TOYROW_RVV
void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
  RGBToYMatrixRow_RVV(src_rgb24, dst_y, width, &kRgb24I601Constants);
}
#endif

#ifdef HAS_RAWTOYROW_RVV
void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) {
  RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants);
}
#endif

// Blend src_argb over src_argb1 and store to dst_argb.
// dst_argb may be src_argb or src_argb1.
// src_argb: RGB values have already been pre-multiplied by alpha.
#ifdef HAS_ARGBBLENDROW_RVV
void ARGBBlendRow_RVV(const uint8_t* src_argb,
                      const uint8_t* src_argb1,
                      uint8_t* dst_argb,
                      int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvlmax_e8m2();
  // clamp255((((256 - a) * b) >> 8) + f)
  // = b * (256 - a) / 256 + f
  // = b - (b * a / 256) + f
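  // vmulhu yields the high byte of the 8x8-bit product, i.e. (b * a) >> 8,
  // and the final vsaddu clamps the sum to 255 (clamp255).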
  vuint8m2_t v_255 = __riscv_vmv_v_x_u8m2(255, vl);
  do {
    vuint8m2_t v_src0_b, v_src0_g, v_src0_r, v_src0_a;
    vuint8m2_t v_src1_b, v_src1_g, v_src1_r, v_src1_a;
    vuint8m2_t v_tmp_b, v_tmp_g, v_tmp_r;
    vuint8m2_t v_dst_b, v_dst_g, v_dst_r;
    vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_src0_b, &v_src0_g, &v_src0_r, &v_src0_a,
                            src_argb, vl);
    __riscv_vlseg4e8_v_u8m2(&v_src1_b, &v_src1_g, &v_src1_r, &v_src1_a,
                            src_argb1, vl);

    v_tmp_b = __riscv_vmulhu_vv_u8m2(v_src1_b, v_src0_a, vl);
    v_tmp_g = __riscv_vmulhu_vv_u8m2(v_src1_g, v_src0_a, vl);
    v_tmp_r = __riscv_vmulhu_vv_u8m2(v_src1_r, v_src0_a, vl);

    v_dst_b = __riscv_vsub_vv_u8m2(v_src1_b, v_tmp_b, vl);
    v_dst_g = __riscv_vsub_vv_u8m2(v_src1_g, v_tmp_g, vl);
    v_dst_r = __riscv_vsub_vv_u8m2(v_src1_r, v_tmp_r, vl);

    v_dst_b = __riscv_vsaddu_vv_u8m2(v_dst_b, v_src0_b, vl);
    v_dst_g = __riscv_vsaddu_vv_u8m2(v_dst_g, v_src0_g, vl);
    v_dst_r = __riscv_vsaddu_vv_u8m2(v_dst_r, v_src0_r, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_dst_b, v_dst_g, v_dst_r, v_255, vl);

    w -= vl;
    src_argb += 4 * vl;
    src_argb1 += 4 * vl;
    dst_argb += 4 * vl;
  } while (w > 0);
}
#endif

#ifdef HAS_BLENDPLANEROW_RVV
void BlendPlaneRow_RVV(const uint8_t* src0,
                       const uint8_t* src1,
                       const uint8_t* alpha,
                       uint8_t* dst,
                       int width) {
  size_t w = (size_t)width;
  do {
    vuint16m8_t v_dst_u16;
    vuint8m4_t v_dst;
    size_t vl = __riscv_vsetvl_e8m4(w);
    vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl);
    vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl);
    vuint8m4_t v_alpha = __riscv_vle8_v_u8m4(alpha, vl);
    vuint8m4_t v_255_minus_alpha = __riscv_vrsub_vx_u8m4(v_alpha, 255u, vl);

    // dst = (alpha * src0 + (255 - alpha) * src1 + 255) >> 8
    v_dst_u16 = __riscv_vwmulu_vv_u16m8(v_alpha, v_src0, vl);
    v_dst_u16 =
        __riscv_vwmaccu_vv_u16m8(v_dst_u16, v_255_minus_alpha, v_src1, vl);
    v_dst_u16 = __riscv_vadd_vx_u16m8(v_dst_u16, 255u, vl);
    v_dst = __riscv_vnsrl_wx_u8m4(v_dst_u16, 8, vl);

    __riscv_vse8_v_u8m4(dst, v_dst, vl);
    w -= vl;
    src0 += vl;
    src1 += vl;
    alpha += vl;
    dst += vl;
  } while (w > 0);
}
#endif

// Attenuate: (f * a + 255) >> 8
#ifdef HAS_ARGBATTENUATEROW_RVV
void ARGBAttenuateRow_RVV(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          int width) {
  size_t w = (size_t)width;
  do {
    vuint8m2_t v_b, v_g, v_r, v_a;
    vuint16m4_t v_ba_16, v_ga_16, v_ra_16;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    // f * a
    v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl);
    v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl);
    v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl);
    // f * a + 255
    v_ba_16 = __riscv_vadd_vx_u16m4(v_ba_16, 255u, vl);
    v_ga_16 = __riscv_vadd_vx_u16m4(v_ga_16, 255u, vl);
    v_ra_16 = __riscv_vadd_vx_u16m4(v_ra_16, 255u, vl);
    // (f * a + 255) >> 8
    v_b = __riscv_vnsrl_wx_u8m2(v_ba_16, 8, vl);
    v_g = __riscv_vnsrl_wx_u8m2(v_ga_16, 8, vl);
    v_r = __riscv_vnsrl_wx_u8m2(v_ra_16, 8, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_ARGBEXTRACTALPHAROW_RVV
void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb,
                             uint8_t* dst_a,
                             int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_b, v_g, v_r, v_a;
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    __riscv_vse8_v_u8m2(dst_a, v_a, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_a += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_ARGBCOPYYTOALPHAROW_RVV
void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
  size_t w = (size_t)width;
  const ptrdiff_t dst_stride = 4;
  dst += 3;
  do {
    size_t vl = __riscv_vsetvl_e8m8(w);
    vuint8m8_t v_a = __riscv_vle8_v_u8m8(src, vl);
    __riscv_vsse8_v_u8m8(dst, dst_stride, v_a, vl);
    w -= vl;
    src += vl;
    dst += vl * dst_stride;
  } while (w > 0);
}
#endif

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) &&
        // defined(__clang__)