/*
 * Copyright 2023 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

/*
 * Copyright (c) 2023 SiFive, Inc. All rights reserved.
 *
 * Contributed by Darren Hsieh <[email protected]>
 * Contributed by Bruce Lai <[email protected]>
 */

#include "libyuv/row.h"

// This module is for clang RVV; GCC does not yet support the segment load &
// store intrinsics.
#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \
    defined(__clang__)
#include <assert.h>
#include <riscv_vector.h>

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// Fill YUV -> RGB conversion constants into vectors.
// NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up mode (0).
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
  {                                                              \
    asm volatile("csrwi vxrm, 0");                               \
    ub = yuvconst->kUVCoeff[0];                                  \
    vr = yuvconst->kUVCoeff[1];                                  \
    ug = yuvconst->kUVCoeff[2];                                  \
    vg = yuvconst->kUVCoeff[3];                                  \
    yg = yuvconst->kRGBCoeffBias[0];                             \
    bb = yuvconst->kRGBCoeffBias[1] + 32;                        \
    bg = yuvconst->kRGBCoeffBias[2] - 32;                        \
    br = yuvconst->kRGBCoeffBias[3] + 32;                        \
  }
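// NOTE: The +/-32 adjustments above offset the +32 rounding increment that the
// round-to-nearest-up vnclipu in RGBTORGB8 adds during the final narrowing
// shift by 6, keeping results consistent with other platforms.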

// Read [2*VLEN/8] Y, [VLEN/8] U and [VLEN/8] V from 422
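// Each U/V byte is multiplied by 0x0101 to replicate it into both bytes of a
// 16-bit lane; reinterpreting the result as bytes yields every chroma sample
// twice, upsampling the 4:2:2 chroma horizontally to match Y.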
#define READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \
  {                                                              \
    vuint8m1_t v_tmp0, v_tmp1;                                   \
    vuint8m2_t v_y;                                              \
    vuint16m2_t v_u_16, v_v_16;                                  \
    vl = __riscv_vsetvl_e8m1((w + 1) / 2);                       \
    v_tmp0 = __riscv_vle8_v_u8m1(src_u, vl);                     \
    v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl);             \
    v_tmp1 = __riscv_vle8_v_u8m1(src_v, vl);                     \
    v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl);             \
    v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl);          \
    v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl);          \
    v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16);             \
    v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16);             \
    vl = __riscv_vsetvl_e8m2(w);                                 \
    v_y = __riscv_vle8_v_u8m2(src_y, vl);                        \
    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);                \
  }

// Read [2*VLEN/8] Y, [2*VLEN/8] U, and [2*VLEN/8] V from 444
#define READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16) \
  {                                                              \
    vuint8m2_t v_y;                                              \
    vl = __riscv_vsetvl_e8m2(w);                                 \
    v_y = __riscv_vle8_v_u8m2(src_y, vl);                        \
    v_u = __riscv_vle8_v_u8m2(src_u, vl);                        \
    v_v = __riscv_vle8_v_u8m2(src_v, vl);                        \
    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);                \
  }

// Convert from YUV to fixed-point RGB
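// Y is scaled by 0x0101 (257) so that the widening multiply by yg and the
// 16-bit narrowing shift below compute (y * 0x0101 * yg) >> 16.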
#define YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16, \
                 v_b_16, v_r_16)                                               \
  {                                                                            \
    vuint16m4_t v_tmp0, v_tmp1, v_tmp2, v_tmp3, v_tmp4;                        \
    vuint32m8_t v_tmp5;                                                        \
    v_tmp0 = __riscv_vwmulu_vx_u16m4(v_u, ug, vl);                             \
    v_y_16 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl);                        \
    v_tmp0 = __riscv_vwmaccu_vx_u16m4(v_tmp0, vg, v_v, vl);                    \
    v_tmp1 = __riscv_vwmulu_vx_u16m4(v_u, ub, vl);                             \
    v_tmp5 = __riscv_vwmulu_vx_u32m8(v_y_16, yg, vl);                          \
    v_tmp2 = __riscv_vnsrl_wx_u16m4(v_tmp5, 16, vl);                           \
    v_tmp3 = __riscv_vadd_vx_u16m4(v_tmp2, bg, vl);                            \
    v_tmp4 = __riscv_vadd_vv_u16m4(v_tmp2, v_tmp1, vl);                        \
    v_tmp2 = __riscv_vwmaccu_vx_u16m4(v_tmp2, vr, v_v, vl);                    \
    v_g_16 = __riscv_vssubu_vv_u16m4(v_tmp3, v_tmp0, vl);                      \
    v_b_16 = __riscv_vssubu_vx_u16m4(v_tmp4, bb, vl);                          \
    v_r_16 = __riscv_vssubu_vx_u16m4(v_tmp2, br, vl);                          \
  }

// Convert from fixed-point RGB to 8-bit RGB
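// vnclipu is a saturating narrowing shift; with vxrm set to
// round-to-nearest-up it computes clamp255((v + 32) >> 6).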
#define RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r) \
  {                                                          \
    v_g = __riscv_vnclipu_wx_u8m2(v_g_16, 6, vl);            \
    v_b = __riscv_vnclipu_wx_u8m2(v_b_16, 6, vl);            \
    v_r = __riscv_vnclipu_wx_u8m2(v_r_16, 6, vl);            \
  }

// Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_uv
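// vlseg2e8 de-interleaves the packed UV plane into separate U and V vectors.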
#define READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16)   \
  {                                                        \
    vuint8m1_t v_tmp0, v_tmp1;                             \
    vuint8m2_t v_y;                                        \
    vuint16m2_t v_u_16, v_v_16;                            \
    vl = __riscv_vsetvl_e8m1((w + 1) / 2);                 \
    __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_uv, vl); \
    v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl);       \
    v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl);       \
    v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl);    \
    v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl);    \
    v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16);       \
    v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16);       \
    vl = __riscv_vsetvl_e8m2(w);                           \
    v_y = __riscv_vle8_v_u8m2(src_y, vl);                  \
    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);          \
  }

// Read [2*VLEN/8] Y from src_y; Read [VLEN/8] U and [VLEN/8] V from src_vu
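// NV21 stores chroma as VU, so the first segment loaded below is V and the
// second is U.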
#define READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16)   \
  {                                                        \
    vuint8m1_t v_tmp0, v_tmp1;                             \
    vuint8m2_t v_y;                                        \
    vuint16m2_t v_u_16, v_v_16;                            \
    vl = __riscv_vsetvl_e8m1((w + 1) / 2);                 \
    __riscv_vlseg2e8_v_u8m1(&v_tmp0, &v_tmp1, src_vu, vl); \
    v_u_16 = __riscv_vwaddu_vx_u16m2(v_tmp1, 0, vl);       \
    v_v_16 = __riscv_vwaddu_vx_u16m2(v_tmp0, 0, vl);       \
    v_v_16 = __riscv_vmul_vx_u16m2(v_v_16, 0x0101, vl);    \
    v_u_16 = __riscv_vmul_vx_u16m2(v_u_16, 0x0101, vl);    \
    v_v = __riscv_vreinterpret_v_u16m2_u8m2(v_v_16);       \
    v_u = __riscv_vreinterpret_v_u16m2_u8m2(v_u_16);       \
    vl = __riscv_vsetvl_e8m2(w);                           \
    v_y = __riscv_vle8_v_u8m2(src_y, vl);                  \
    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);          \
  }

#ifdef HAS_ARGBTOAR64ROW_RVV
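// Widen each 8-bit channel to 16 bits by replicating the byte:
// v * 0x0101 turns 0xAB into 0xABAB.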
void ARGBToAR64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ar64, int width) {
  size_t avl = (size_t)4 * width;
  do {
    vuint16m8_t v_ar64;
    vuint8m4_t v_argb;
    size_t vl = __riscv_vsetvl_e8m4(avl);
    v_argb = __riscv_vle8_v_u8m4(src_argb, vl);
    v_ar64 = __riscv_vwaddu_vx_u16m8(v_argb, 0, vl);
    v_ar64 = __riscv_vmul_vx_u16m8(v_ar64, 0x0101, vl);
    __riscv_vse16_v_u16m8(dst_ar64, v_ar64, vl);
    avl -= vl;
    src_argb += vl;
    dst_ar64 += vl;
  } while (avl > 0);
}
#endif

#ifdef HAS_ARGBTOAB64ROW_RVV
void ARGBToAB64Row_RVV(const uint8_t* src_argb, uint16_t* dst_ab64, int width) {
  size_t avl = (size_t)width;
  do {
    vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16;
    vuint8m1_t v_b, v_g, v_r, v_a;
    size_t vl = __riscv_vsetvl_e8m1(avl);
    __riscv_vlseg4e8_v_u8m1(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    v_b_16 = __riscv_vwaddu_vx_u16m2(v_b, 0, vl);
    v_g_16 = __riscv_vwaddu_vx_u16m2(v_g, 0, vl);
    v_r_16 = __riscv_vwaddu_vx_u16m2(v_r, 0, vl);
    v_a_16 = __riscv_vwaddu_vx_u16m2(v_a, 0, vl);
    v_b_16 = __riscv_vmul_vx_u16m2(v_b_16, 0x0101, vl);
    v_g_16 = __riscv_vmul_vx_u16m2(v_g_16, 0x0101, vl);
    v_r_16 = __riscv_vmul_vx_u16m2(v_r_16, 0x0101, vl);
    v_a_16 = __riscv_vmul_vx_u16m2(v_a_16, 0x0101, vl);
    __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r_16, v_g_16, v_b_16, v_a_16, vl);
    avl -= vl;
    src_argb += 4 * vl;
    dst_ab64 += 4 * vl;
  } while (avl > 0);
}
#endif

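// Narrow each 16-bit channel to 8 bits by keeping the high byte (vnsrl by 8).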
#ifdef HAS_AR64TOARGBROW_RVV
void AR64ToARGBRow_RVV(const uint16_t* src_ar64, uint8_t* dst_argb, int width) {
  size_t avl = (size_t)4 * width;
  do {
    vuint16m8_t v_ar64;
    vuint8m4_t v_argb;
    size_t vl = __riscv_vsetvl_e16m8(avl);
    v_ar64 = __riscv_vle16_v_u16m8(src_ar64, vl);
    v_argb = __riscv_vnsrl_wx_u8m4(v_ar64, 8, vl);
    __riscv_vse8_v_u8m4(dst_argb, v_argb, vl);
    avl -= vl;
    src_ar64 += vl;
    dst_argb += vl;
  } while (avl > 0);
}
#endif

#ifdef HAS_AR64TOAB64ROW_RVV
void AR64ToAB64Row_RVV(const uint16_t* src_ar64,
                       uint16_t* dst_ab64,
                       int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e16m2(w);
    vuint16m2_t v_b, v_g, v_r, v_a;
    __riscv_vlseg4e16_v_u16m2(&v_b, &v_g, &v_r, &v_a, src_ar64, vl);
    __riscv_vsseg4e16_v_u16m2(dst_ab64, v_r, v_g, v_b, v_a, vl);
    w -= vl;
    src_ar64 += vl * 4;
    dst_ab64 += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_AB64TOARGBROW_RVV
void AB64ToARGBRow_RVV(const uint16_t* src_ab64, uint8_t* dst_argb, int width) {
  size_t avl = (size_t)width;
  do {
    vuint16m2_t v_b_16, v_g_16, v_r_16, v_a_16;
    vuint8m1_t v_b, v_g, v_r, v_a;
    size_t vl = __riscv_vsetvl_e16m2(avl);
    __riscv_vlseg4e16_v_u16m2(&v_r_16, &v_g_16, &v_b_16, &v_a_16, src_ab64, vl);
    v_b = __riscv_vnsrl_wx_u8m1(v_b_16, 8, vl);
    v_g = __riscv_vnsrl_wx_u8m1(v_g_16, 8, vl);
    v_r = __riscv_vnsrl_wx_u8m1(v_r_16, 8, vl);
    v_a = __riscv_vnsrl_wx_u8m1(v_a_16, 8, vl);
    __riscv_vsseg4e8_v_u8m1(dst_argb, v_b, v_g, v_r, v_a, vl);
    avl -= vl;
    src_ab64 += 4 * vl;
    dst_argb += 4 * vl;
  } while (avl > 0);
}
#endif

#ifdef HAS_RAWTOARGBROW_RVV
void RAWToARGBRow_RVV(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    vuint8m2_t v_b, v_g, v_r;
    __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_raw += vl * 3;
    dst_argb += vl * 4;
    vl = __riscv_vsetvl_e8m2(w);
  } while (w > 0);
}
#endif

#ifdef HAS_RAWTORGBAROW_RVV
void RAWToRGBARow_RVV(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    vuint8m2_t v_b, v_g, v_r;
    __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_raw, vl);
    __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl);
    w -= vl;
    src_raw += vl * 3;
    dst_rgba += vl * 4;
    vl = __riscv_vsetvl_e8m2(w);
  } while (w > 0);
}
#endif

#ifdef HAS_RAWTORGB24ROW_RVV
void RAWToRGB24Row_RVV(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
  size_t w = (size_t)width;
  do {
    vuint8m2_t v_b, v_g, v_r;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_raw, vl);
    __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_r, v_g, v_b, vl);
    w -= vl;
    src_raw += vl * 3;
    dst_rgb24 += vl * 3;
  } while (w > 0);
}
#endif

#ifdef HAS_ARGBTORAWROW_RVV
void ARGBToRAWRow_RVV(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
  size_t w = (size_t)width;
  do {
    vuint8m2_t v_b, v_g, v_r, v_a;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    __riscv_vsseg3e8_v_u8m2(dst_raw, v_r, v_g, v_b, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_raw += vl * 3;
  } while (w > 0);
}
#endif

#ifdef HAS_ARGBTORGB24ROW_RVV
void ARGBToRGB24Row_RVV(const uint8_t* src_argb,
                        uint8_t* dst_rgb24,
                        int width) {
  size_t w = (size_t)width;
  do {
    vuint8m2_t v_b, v_g, v_r, v_a;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_rgb24 += vl * 3;
  } while (w > 0);
}
#endif

#ifdef HAS_ARGBTOABGRROW_RVV
void ARGBToABGRRow_RVV(const uint8_t* src_argb, uint8_t* dst_abgr, int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_a, v_r, v_g, v_b;
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    __riscv_vsseg4e8_v_u8m2(dst_abgr, v_r, v_g, v_b, v_a, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_abgr += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_ARGBTOBGRAROW_RVV
void ARGBToBGRARow_RVV(const uint8_t* src_argb, uint8_t* dst_bgra, int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_a, v_r, v_g, v_b;
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    __riscv_vsseg4e8_v_u8m2(dst_bgra, v_a, v_r, v_g, v_b, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_bgra += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_ARGBTORGBAROW_RVV
void ARGBToRGBARow_RVV(const uint8_t* src_argb, uint8_t* dst_rgba, int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_a, v_r, v_g, v_b;
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_rgba += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_RGBATOARGBROW_RVV
void RGBAToARGBRow_RVV(const uint8_t* src_rgba, uint8_t* dst_argb, int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_a, v_r, v_g, v_b;
    __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_rgba += vl * 4;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_RGB24TOARGBROW_RVV
void RGB24ToARGBRow_RVV(const uint8_t* src_rgb24,
                        uint8_t* dst_argb,
                        int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    vuint8m2_t v_b, v_g, v_r;
    __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb24, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_rgb24 += vl * 3;
    dst_argb += vl * 4;
    vl = __riscv_vsetvl_e8m2(w);
  } while (w > 0);
}
#endif

#ifdef HAS_I444TOARGBROW_RVV
void I444ToARGBRow_RVV(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_y += vl;
    src_u += vl;
    src_v += vl;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_I444ALPHATOARGBROW_RVV
void I444AlphaToARGBRow_RVV(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            const uint8_t* src_a,
                            uint8_t* dst_argb,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  size_t vl;
  size_t w = (size_t)width;
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  do {
    READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    v_a = __riscv_vle8_v_u8m2(src_a, vl);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_y += vl;
    src_a += vl;
    src_u += vl;
    src_v += vl;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_I444TORGB24ROW_RVV
void I444ToRGB24Row_RVV(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgb24,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  size_t vl;
  size_t w = (size_t)width;
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  do {
    READYUV444(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
    w -= vl;
    src_y += vl;
    src_u += vl;
    src_v += vl;
    dst_rgb24 += vl * 3;
  } while (w > 0);
}
#endif

#ifdef HAS_I422TOARGBROW_RVV
void I422ToARGBRow_RVV(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_y += vl;
    src_u += vl / 2;
    src_v += vl / 2;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_I422ALPHATOARGBROW_RVV
void I422AlphaToARGBRow_RVV(const uint8_t* src_y,
                            const uint8_t* src_u,
                            const uint8_t* src_v,
                            const uint8_t* src_a,
                            uint8_t* dst_argb,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  size_t vl;
  size_t w = (size_t)width;
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  do {
    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    v_a = __riscv_vle8_v_u8m2(src_a, vl);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_y += vl;
    src_a += vl;
    src_u += vl / 2;
    src_v += vl / 2;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_I422TORGBAROW_RVV
void I422ToRGBARow_RVV(const uint8_t* src_y,
                       const uint8_t* src_u,
                       const uint8_t* src_v,
                       uint8_t* dst_rgba,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg4e8_v_u8m2(dst_rgba, v_a, v_b, v_g, v_r, vl);
    w -= vl;
    src_y += vl;
    src_u += vl / 2;
    src_v += vl / 2;
    dst_rgba += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_I422TORGB24ROW_RVV
void I422ToRGB24Row_RVV(const uint8_t* src_y,
                        const uint8_t* src_u,
                        const uint8_t* src_v,
                        uint8_t* dst_rgb24,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  size_t vl;
  size_t w = (size_t)width;
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  do {
    READYUV422(vl, w, src_y, src_u, src_v, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
    w -= vl;
    src_y += vl;
    src_u += vl / 2;
    src_v += vl / 2;
    dst_rgb24 += vl * 3;
  } while (w > 0);
}
#endif

#ifdef HAS_I400TOARGBROW_RVV
void I400ToARGBRow_RVV(const uint8_t* src_y,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  const bool is_yb_positive = (yuvconstants->kRGBCoeffBias[4] >= 0);
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  vuint16m4_t v_yb;
  vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl);
  // To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode (0).
  asm volatile("csrwi vxrm, 0");
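  // kRGBCoeffBias[4] may be negative; its magnitude (pre-adjusted by 32 to
  // offset the rounding added by the final vnclipu) is kept in an unsigned
  // vector and applied with a saturating add or subtract below.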
  if (is_yb_positive) {
    v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl);
  } else {
    v_yb = __riscv_vmv_v_x_u16m4(-yuvconstants->kRGBCoeffBias[4] + 32, vl);
  }
  do {
    vuint8m2_t v_y, v_out;
    vuint16m4_t v_y_16, v_tmp0, v_tmp1, v_tmp2;
    vl = __riscv_vsetvl_e8m2(w);
    v_y = __riscv_vle8_v_u8m2(src_y, vl);
    v_y_16 = __riscv_vwaddu_vx_u16m4(v_y, 0, vl);
    v_tmp0 = __riscv_vmul_vx_u16m4(v_y_16, 0x0101, vl);  // 257 * v_y
    v_tmp1 = __riscv_vmulhu_vv_u16m4(v_tmp0, v_yg, vl);
    if (is_yb_positive) {
      v_tmp2 = __riscv_vsaddu_vv_u16m4(v_tmp1, v_yb, vl);
    } else {
      v_tmp2 = __riscv_vssubu_vv_u16m4(v_tmp1, v_yb, vl);
    }
    v_out = __riscv_vnclipu_wx_u8m2(v_tmp2, 6, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_out, v_out, v_out, v_a, vl);
    w -= vl;
    src_y += vl;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_J400TOARGBROW_RVV
void J400ToARGBRow_RVV(const uint8_t* src_y, uint8_t* dst_argb, int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    vuint8m2_t v_y;
    v_y = __riscv_vle8_v_u8m2(src_y, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_y, v_y, v_y, v_a, vl);
    w -= vl;
    src_y += vl;
    dst_argb += vl * 4;
    vl = __riscv_vsetvl_e8m2(w);
  } while (w > 0);
}
#endif

#ifdef HAS_COPYROW_RVV
void CopyRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m8(w);
    vuint8m8_t v_data = __riscv_vle8_v_u8m8(src, vl);
    __riscv_vse8_v_u8m8(dst, v_data, vl);
    w -= vl;
    src += vl;
    dst += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_NV12TOARGBROW_RVV
void NV12ToARGBRow_RVV(const uint8_t* src_y,
                       const uint8_t* src_uv,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_y += vl;
    src_uv += vl;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_NV12TORGB24ROW_RVV
void NV12ToRGB24Row_RVV(const uint8_t* src_y,
                        const uint8_t* src_uv,
                        uint8_t* dst_rgb24,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  do {
    READNV12(vl, w, src_y, src_uv, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
    w -= vl;
    src_y += vl;
    src_uv += vl;
    dst_rgb24 += vl * 3;
  } while (w > 0);
}
#endif

#ifdef HAS_NV21TOARGBROW_RVV
void NV21ToARGBRow_RVV(const uint8_t* src_y,
                       const uint8_t* src_vu,
                       uint8_t* dst_argb,
                       const struct YuvConstants* yuvconstants,
                       int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r, v_a;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_y += vl;
    src_vu += vl;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_NV21TORGB24ROW_RVV
void NV21ToRGB24Row_RVV(const uint8_t* src_y,
                        const uint8_t* src_vu,
                        uint8_t* dst_rgb24,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  uint8_t ub, vr, ug, vg;
  int16_t yg, bb, bg, br;
  vuint8m2_t v_u, v_v;
  vuint8m2_t v_b, v_g, v_r;
  vuint16m4_t v_y_16, v_g_16, v_b_16, v_r_16;
  YUVTORGB_SETUP(yuvconstants, ub, vr, ug, vg, yg, bb, bg, br);
  do {
    READNV21(vl, w, src_y, src_vu, v_u, v_v, v_y_16);
    YUVTORGB(vl, v_u, v_v, ub, vr, ug, vg, yg, bb, bg, br, v_y_16, v_g_16,
             v_b_16, v_r_16);
    RGBTORGB8(vl, v_g_16, v_b_16, v_r_16, v_g, v_b, v_r);
    __riscv_vsseg3e8_v_u8m2(dst_rgb24, v_b, v_g, v_r, vl);
    w -= vl;
    src_y += vl;
    src_vu += vl;
    dst_rgb24 += vl * 3;
  } while (w > 0);
}
#endif

// Bilinear filter [VLEN/8]x2 -> [VLEN/8]x1

#ifdef HAS_INTERPOLATEROW_RVV
void InterpolateRow_RVV(uint8_t* dst_ptr,
                        const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        int dst_width,
                        int source_y_fraction) {
  int y1_fraction = source_y_fraction;
  int y0_fraction = 256 - y1_fraction;
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  size_t dst_w = (size_t)dst_width;
  assert(source_y_fraction >= 0);
  assert(source_y_fraction < 256);
  // Blend 100 / 0 - Copy row unchanged.
  if (y1_fraction == 0) {
    do {
      size_t vl = __riscv_vsetvl_e8m8(dst_w);
      __riscv_vse8_v_u8m8(dst_ptr, __riscv_vle8_v_u8m8(src_ptr, vl), vl);
      dst_w -= vl;
      src_ptr += vl;
      dst_ptr += vl;
    } while (dst_w > 0);
    return;
  }
  // To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode (0).
  asm volatile("csrwi vxrm, 0");
  // Blend 50 / 50.
  if (y1_fraction == 128) {
    do {
      size_t vl = __riscv_vsetvl_e8m8(dst_w);
      vuint8m8_t row0 = __riscv_vle8_v_u8m8(src_ptr, vl);
      vuint8m8_t row1 = __riscv_vle8_v_u8m8(src_ptr1, vl);
      // Use round-to-nearest-up mode for the averaging add.
      vuint8m8_t row_out = __riscv_vaaddu_vv_u8m8(row0, row1, vl);
      __riscv_vse8_v_u8m8(dst_ptr, row_out, vl);
      dst_w -= vl;
      src_ptr += vl;
      src_ptr1 += vl;
      dst_ptr += vl;
    } while (dst_w > 0);
    return;
  }
  // General purpose row blend.
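  // dst = (row0 * y0_fraction + row1 * y1_fraction + 128) >> 8; the +128
  // rounding term comes from the round-to-nearest-up vnclipu below.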
  do {
    size_t vl = __riscv_vsetvl_e8m4(dst_w);
    vuint8m4_t row0 = __riscv_vle8_v_u8m4(src_ptr, vl);
    vuint16m8_t acc = __riscv_vwmulu_vx_u16m8(row0, y0_fraction, vl);
    vuint8m4_t row1 = __riscv_vle8_v_u8m4(src_ptr1, vl);
    acc = __riscv_vwmaccu_vx_u16m8(acc, y1_fraction, row1, vl);
    // Use round-to-nearest-up mode for vnclip
    __riscv_vse8_v_u8m4(dst_ptr, __riscv_vnclipu_wx_u8m4(acc, 8, vl), vl);
    dst_w -= vl;
    src_ptr += vl;
    src_ptr1 += vl;
    dst_ptr += vl;
  } while (dst_w > 0);
}
#endif

#ifdef HAS_SPLITRGBROW_RVV
void SplitRGBRow_RVV(const uint8_t* src_rgb,
                     uint8_t* dst_r,
                     uint8_t* dst_g,
                     uint8_t* dst_b,
                     int width) {
  size_t w = (size_t)width;
  do {
    vuint8m2_t v_b, v_g, v_r;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg3e8_v_u8m2(&v_r, &v_g, &v_b, src_rgb, vl);
    __riscv_vse8_v_u8m2(dst_r, v_r, vl);
    __riscv_vse8_v_u8m2(dst_g, v_g, vl);
    __riscv_vse8_v_u8m2(dst_b, v_b, vl);
    w -= vl;
    dst_r += vl;
    dst_g += vl;
    dst_b += vl;
    src_rgb += vl * 3;
  } while (w > 0);
}
#endif

#ifdef HAS_MERGERGBROW_RVV
void MergeRGBRow_RVV(const uint8_t* src_r,
                     const uint8_t* src_g,
                     const uint8_t* src_b,
                     uint8_t* dst_rgb,
                     int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl);
    vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl);
    vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl);
    __riscv_vsseg3e8_v_u8m2(dst_rgb, v_r, v_g, v_b, vl);
    w -= vl;
    src_r += vl;
    src_g += vl;
    src_b += vl;
    dst_rgb += vl * 3;
  } while (w > 0);
}
#endif

#ifdef HAS_SPLITARGBROW_RVV
void SplitARGBRow_RVV(const uint8_t* src_argb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
                      uint8_t* dst_a,
                      int width) {
  size_t w = (size_t)width;
  do {
    vuint8m2_t v_b, v_g, v_r, v_a;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    __riscv_vse8_v_u8m2(dst_a, v_a, vl);
    __riscv_vse8_v_u8m2(dst_r, v_r, vl);
    __riscv_vse8_v_u8m2(dst_g, v_g, vl);
    __riscv_vse8_v_u8m2(dst_b, v_b, vl);
    w -= vl;
    dst_a += vl;
    dst_r += vl;
    dst_g += vl;
    dst_b += vl;
    src_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_MERGEARGBROW_RVV
void MergeARGBRow_RVV(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
                      const uint8_t* src_a,
                      uint8_t* dst_argb,
                      int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_r = __riscv_vle8_v_u8m2(src_r, vl);
    vuint8m2_t v_g = __riscv_vle8_v_u8m2(src_g, vl);
    vuint8m2_t v_b = __riscv_vle8_v_u8m2(src_b, vl);
    vuint8m2_t v_a = __riscv_vle8_v_u8m2(src_a, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_r += vl;
    src_g += vl;
    src_b += vl;
    src_a += vl;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_SPLITXRGBROW_RVV
void SplitXRGBRow_RVV(const uint8_t* src_argb,
                      uint8_t* dst_r,
                      uint8_t* dst_g,
                      uint8_t* dst_b,
                      int width) {
  size_t w = (size_t)width;
  do {
    vuint8m2_t v_b, v_g, v_r, v_a;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    __riscv_vse8_v_u8m2(dst_r, v_r, vl);
    __riscv_vse8_v_u8m2(dst_g, v_g, vl);
    __riscv_vse8_v_u8m2(dst_b, v_b, vl);
    w -= vl;
    dst_r += vl;
    dst_g += vl;
    dst_b += vl;
    src_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_MERGEXRGBROW_RVV
void MergeXRGBRow_RVV(const uint8_t* src_r,
                      const uint8_t* src_g,
                      const uint8_t* src_b,
                      uint8_t* dst_argb,
                      int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvl_e8m2(w);
  vuint8m2_t v_a = __riscv_vmv_v_x_u8m2(255u, vl);
  do {
    vuint8m2_t v_r, v_g, v_b;
    v_r = __riscv_vle8_v_u8m2(src_r, vl);
    v_g = __riscv_vle8_v_u8m2(src_g, vl);
    v_b = __riscv_vle8_v_u8m2(src_b, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_r += vl;
    src_g += vl;
    src_b += vl;
    dst_argb += vl * 4;
    vl = __riscv_vsetvl_e8m2(w);
  } while (w > 0);
}
#endif

#ifdef HAS_SPLITUVROW_RVV
void SplitUVRow_RVV(const uint8_t* src_uv,
                    uint8_t* dst_u,
                    uint8_t* dst_v,
                    int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m4(w);
    vuint8m4_t v_u, v_v;
    __riscv_vlseg2e8_v_u8m4(&v_u, &v_v, src_uv, vl);
    __riscv_vse8_v_u8m4(dst_u, v_u, vl);
    __riscv_vse8_v_u8m4(dst_v, v_v, vl);
    w -= vl;
    dst_u += vl;
    dst_v += vl;
    src_uv += 2 * vl;
  } while (w > 0);
}
#endif

#ifdef HAS_MERGEUVROW_RVV
void MergeUVRow_RVV(const uint8_t* src_u,
                    const uint8_t* src_v,
                    uint8_t* dst_uv,
                    int width) {
  size_t w = (size_t)width;
  do {
    vuint8m4_t v_u, v_v;
    size_t vl = __riscv_vsetvl_e8m4(w);
    v_u = __riscv_vle8_v_u8m4(src_u, vl);
    v_v = __riscv_vle8_v_u8m4(src_v, vl);
    __riscv_vsseg2e8_v_u8m4(dst_uv, v_u, v_v, vl);
    w -= vl;
    src_u += vl;
    src_v += vl;
    dst_uv += 2 * vl;
  } while (w > 0);
}
#endif

struct RgbConstants {
  uint8_t kRGBToY[4];
  uint16_t kAddY;
  uint16_t pad;
};

// RGB to JPEG coefficients
// B * 0.1140 coefficient = 29
// G * 0.5870 coefficient = 150
// R * 0.2990 coefficient = 77
// Add 0.5 = 0x80
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
                                                        128,
                                                        0};

static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0};

// RGB to BT.601 coefficients
// B * 0.1016 coefficient = 25
// G * 0.5078 coefficient = 129
// R * 0.2578 coefficient = 66
// Add 16.5 = 0x1080

static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
                                                        0x1080,
                                                        0};

static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0},
                                                      0x1080,
                                                      0};

// ARGB expects the first 3 values to contain RGB; the 4th value is ignored.
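// y = (b * kRGBToY[0] + g * kRGBToY[1] + r * kRGBToY[2] + kAddY) >> 8,
// computed with widening multiply-accumulates.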
#ifdef HAS_ARGBTOYMATRIXROW_RVV
void ARGBToYMatrixRow_RVV(const uint8_t* src_argb,
                          uint8_t* dst_y,
                          int width,
                          const struct RgbConstants* rgbconstants) {
  assert(width != 0);
  size_t w = (size_t)width;
  vuint8m2_t v_by, v_gy, v_ry;  // vectors to hold the RGBToY coefficients
  vuint16m4_t v_addy;           // vector to hold kAddY
  size_t vl = __riscv_vsetvl_e8m2(w);
  v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl);
  v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl);
  v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl);
  v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl);
  do {
    vuint8m2_t v_b, v_g, v_r, v_a, v_y;
    vuint16m4_t v_y_u16;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
    v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl);
    v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl);
    __riscv_vse8_v_u8m2(dst_y, v_y, vl);
    w -= vl;
    src_argb += 4 * vl;
    dst_y += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_ARGBTOYROW_RVV
void ARGBToYRow_RVV(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_RVV(src_argb, dst_y, width, &kRgb24I601Constants);
}
#endif

#ifdef HAS_ARGBTOYJROW_RVV
void ARGBToYJRow_RVV(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_RVV(src_argb, dst_yj, width, &kRgb24JPEGConstants);
}
#endif

#ifdef HAS_ABGRTOYROW_RVV
void ABGRToYRow_RVV(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_RVV(src_abgr, dst_y, width, &kRawI601Constants);
}
#endif

#ifdef HAS_ABGRTOYJROW_RVV
void ABGRToYJRow_RVV(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_RVV(src_abgr, dst_yj, width, &kRawJPEGConstants);
}
#endif

// RGBA expects the first value to be A (ignored), followed by 3 values
// containing RGB.
#ifdef HAS_RGBATOYMATRIXROW_RVV
void RGBAToYMatrixRow_RVV(const uint8_t* src_rgba,
                          uint8_t* dst_y,
                          int width,
                          const struct RgbConstants* rgbconstants) {
  assert(width != 0);
  size_t w = (size_t)width;
  vuint8m2_t v_by, v_gy, v_ry;  // vectors to hold the RGBToY coefficients
  vuint16m4_t v_addy;           // vector to hold kAddY
  size_t vl = __riscv_vsetvl_e8m2(w);
  v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl);
  v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl);
  v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl);
  v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl);
  do {
    vuint8m2_t v_b, v_g, v_r, v_a, v_y;
    vuint16m4_t v_y_u16;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_a, &v_b, &v_g, &v_r, src_rgba, vl);
    v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
    v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl);
    v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl);
    __riscv_vse8_v_u8m2(dst_y, v_y, vl);
    w -= vl;
    src_rgba += 4 * vl;
    dst_y += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_RGBATOYROW_RVV
void RGBAToYRow_RVV(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  RGBAToYMatrixRow_RVV(src_rgba, dst_y, width, &kRgb24I601Constants);
}
#endif

#ifdef HAS_RGBATOYJROW_RVV
void RGBAToYJRow_RVV(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
  RGBAToYMatrixRow_RVV(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
}
#endif

#ifdef HAS_BGRATOYROW_RVV
void BGRAToYRow_RVV(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  RGBAToYMatrixRow_RVV(src_bgra, dst_y, width, &kRawI601Constants);
}
#endif

#ifdef HAS_RGBTOYMATRIXROW_RVV
void RGBToYMatrixRow_RVV(const uint8_t* src_rgb,
                         uint8_t* dst_y,
                         int width,
                         const struct RgbConstants* rgbconstants) {
  assert(width != 0);
  size_t w = (size_t)width;
  vuint8m2_t v_by, v_gy, v_ry;  // vectors to hold the RGBToY coefficients
  vuint16m4_t v_addy;           // vector to hold kAddY
  size_t vl = __riscv_vsetvl_e8m2(w);
  v_by = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[0], vl);
  v_gy = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[1], vl);
  v_ry = __riscv_vmv_v_x_u8m2(rgbconstants->kRGBToY[2], vl);
  v_addy = __riscv_vmv_v_x_u16m4(rgbconstants->kAddY, vl);
  do {
    vuint8m2_t v_b, v_g, v_r, v_y;
    vuint16m4_t v_y_u16;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg3e8_v_u8m2(&v_b, &v_g, &v_r, src_rgb, vl);
    v_y_u16 = __riscv_vwmulu_vv_u16m4(v_r, v_ry, vl);
    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_gy, v_g, vl);
    v_y_u16 = __riscv_vwmaccu_vv_u16m4(v_y_u16, v_by, v_b, vl);
    v_y_u16 = __riscv_vadd_vv_u16m4(v_y_u16, v_addy, vl);
    v_y = __riscv_vnsrl_wx_u8m2(v_y_u16, 8, vl);
    __riscv_vse8_v_u8m2(dst_y, v_y, vl);
    w -= vl;
    src_rgb += 3 * vl;
    dst_y += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_RGB24TOYJROW_RVV
void RGB24ToYJRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
  RGBToYMatrixRow_RVV(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
}
#endif

#ifdef HAS_RAWTOYJROW_RVV
void RAWToYJRow_RVV(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
  RGBToYMatrixRow_RVV(src_raw, dst_yj, width, &kRawJPEGConstants);
}
#endif

#ifdef HAS_RGB24TOYROW_RVV
void RGB24ToYRow_RVV(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
  RGBToYMatrixRow_RVV(src_rgb24, dst_y, width, &kRgb24I601Constants);
}
#endif

#ifdef HAS_RAWTOYROW_RVV
void RAWToYRow_RVV(const uint8_t* src_raw, uint8_t* dst_y, int width) {
  RGBToYMatrixRow_RVV(src_raw, dst_y, width, &kRawI601Constants);
}
#endif

// Blend src_argb over src_argb1 and store to dst_argb.
// dst_argb may be src_argb or src_argb1.
// src_argb: RGB values have already been pre-multiplied by the alpha.
#ifdef HAS_ARGBBLENDROW_RVV
void ARGBBlendRow_RVV(const uint8_t* src_argb,
                      const uint8_t* src_argb1,
                      uint8_t* dst_argb,
                      int width) {
  size_t w = (size_t)width;
  size_t vl = __riscv_vsetvlmax_e8m2();
  // clamp255((((256 - a) * b) >> 8) + f)
  // = b * (256 - a) / 256 + f
  // = b - (b * a / 256) + f
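  // vmulhu on 8-bit elements returns the high byte of the 16-bit product,
  // i.e. (b * a) >> 8, which approximates b * a / 256.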
  vuint8m2_t v_255 = __riscv_vmv_v_x_u8m2(255, vl);
  do {
    vuint8m2_t v_src0_b, v_src0_g, v_src0_r, v_src0_a;
    vuint8m2_t v_src1_b, v_src1_g, v_src1_r, v_src1_a;
    vuint8m2_t v_tmp_b, v_tmp_g, v_tmp_r;
    vuint8m2_t v_dst_b, v_dst_g, v_dst_r;
    vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_src0_b, &v_src0_g, &v_src0_r, &v_src0_a,
                            src_argb, vl);
    __riscv_vlseg4e8_v_u8m2(&v_src1_b, &v_src1_g, &v_src1_r, &v_src1_a,
                            src_argb1, vl);

    v_tmp_b = __riscv_vmulhu_vv_u8m2(v_src1_b, v_src0_a, vl);
    v_tmp_g = __riscv_vmulhu_vv_u8m2(v_src1_g, v_src0_a, vl);
    v_tmp_r = __riscv_vmulhu_vv_u8m2(v_src1_r, v_src0_a, vl);

    v_dst_b = __riscv_vsub_vv_u8m2(v_src1_b, v_tmp_b, vl);
    v_dst_g = __riscv_vsub_vv_u8m2(v_src1_g, v_tmp_g, vl);
    v_dst_r = __riscv_vsub_vv_u8m2(v_src1_r, v_tmp_r, vl);

    v_dst_b = __riscv_vsaddu_vv_u8m2(v_dst_b, v_src0_b, vl);
    v_dst_g = __riscv_vsaddu_vv_u8m2(v_dst_g, v_src0_g, vl);
    v_dst_r = __riscv_vsaddu_vv_u8m2(v_dst_r, v_src0_r, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_dst_b, v_dst_g, v_dst_r, v_255, vl);

    w -= vl;
    src_argb += 4 * vl;
    src_argb1 += 4 * vl;
    dst_argb += 4 * vl;
  } while (w > 0);
}
#endif

#ifdef HAS_BLENDPLANEROW_RVV
void BlendPlaneRow_RVV(const uint8_t* src0,
                       const uint8_t* src1,
                       const uint8_t* alpha,
                       uint8_t* dst,
                       int width) {
  size_t w = (size_t)width;
  do {
    vuint16m8_t v_dst_u16;
    vuint8m4_t v_dst;
    size_t vl = __riscv_vsetvl_e8m4(w);
    vuint8m4_t v_src0 = __riscv_vle8_v_u8m4(src0, vl);
    vuint8m4_t v_src1 = __riscv_vle8_v_u8m4(src1, vl);
    vuint8m4_t v_alpha = __riscv_vle8_v_u8m4(alpha, vl);
    vuint8m4_t v_255_minus_alpha = __riscv_vrsub_vx_u8m4(v_alpha, 255u, vl);

    // dst = (a * foreground + (255 - a) * background + 255) >> 8
    v_dst_u16 = __riscv_vwmulu_vv_u16m8(v_alpha, v_src0, vl);
    v_dst_u16 =
        __riscv_vwmaccu_vv_u16m8(v_dst_u16, v_255_minus_alpha, v_src1, vl);
    v_dst_u16 = __riscv_vadd_vx_u16m8(v_dst_u16, 255u, vl);
    v_dst = __riscv_vnsrl_wx_u8m4(v_dst_u16, 8, vl);

    __riscv_vse8_v_u8m4(dst, v_dst, vl);
    w -= vl;
    src0 += vl;
    src1 += vl;
    alpha += vl;
    dst += vl;
  } while (w > 0);
}
#endif

// Attenuate: (f * a + 255) >> 8
#ifdef HAS_ARGBATTENUATEROW_RVV
void ARGBAttenuateRow_RVV(const uint8_t* src_argb,
                          uint8_t* dst_argb,
                          int width) {
  size_t w = (size_t)width;
  do {
    vuint8m2_t v_b, v_g, v_r, v_a;
    vuint16m4_t v_ba_16, v_ga_16, v_ra_16;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    // f * a
    v_ba_16 = __riscv_vwmulu_vv_u16m4(v_b, v_a, vl);
    v_ga_16 = __riscv_vwmulu_vv_u16m4(v_g, v_a, vl);
    v_ra_16 = __riscv_vwmulu_vv_u16m4(v_r, v_a, vl);
    // f * a + 255
    v_ba_16 = __riscv_vadd_vx_u16m4(v_ba_16, 255u, vl);
    v_ga_16 = __riscv_vadd_vx_u16m4(v_ga_16, 255u, vl);
    v_ra_16 = __riscv_vadd_vx_u16m4(v_ra_16, 255u, vl);
    // (f * a + 255) >> 8
    v_b = __riscv_vnsrl_wx_u8m2(v_ba_16, 8, vl);
    v_g = __riscv_vnsrl_wx_u8m2(v_ga_16, 8, vl);
    v_r = __riscv_vnsrl_wx_u8m2(v_ra_16, 8, vl);
    __riscv_vsseg4e8_v_u8m2(dst_argb, v_b, v_g, v_r, v_a, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_ARGBEXTRACTALPHAROW_RVV
void ARGBExtractAlphaRow_RVV(const uint8_t* src_argb,
                             uint8_t* dst_a,
                             int width) {
  size_t w = (size_t)width;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_b, v_g, v_r, v_a;
    __riscv_vlseg4e8_v_u8m2(&v_b, &v_g, &v_r, &v_a, src_argb, vl);
    __riscv_vse8_v_u8m2(dst_a, v_a, vl);
    w -= vl;
    src_argb += vl * 4;
    dst_a += vl;
  } while (w > 0);
}
#endif

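// Copy a Y plane into the alpha channel of ARGB: a strided store with a
// 4-byte stride writes only every fourth byte of the destination.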
#ifdef HAS_ARGBCOPYYTOALPHAROW_RVV
void ARGBCopyYToAlphaRow_RVV(const uint8_t* src, uint8_t* dst, int width) {
  size_t w = (size_t)width;
  const ptrdiff_t dst_stride = 4;
  dst += 3;
  do {
    size_t vl = __riscv_vsetvl_e8m8(w);
    vuint8m8_t v_a = __riscv_vle8_v_u8m8(src, vl);
    __riscv_vsse8_v_u8m8(dst, dst_stride, v_a, vl);
    w -= vl;
    src += vl;
    dst += vl * dst_stride;
  } while (w > 0);
}
#endif

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) &&
        // defined(__clang__)