/*
 *  Copyright 2023 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

/*
 * Copyright (c) 2023 SiFive, Inc. All rights reserved.
 *
 * Contributed by Darren Hsieh <[email protected]>
 * Contributed by Bruce Lai <[email protected]>
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

// This module is for clang rvv. GCC does not yet support segment load & store.
#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \
    defined(__clang__)
#include <assert.h>
#include <riscv_vector.h>
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#ifdef HAS_SCALEADDROW_RVV
void ScaleAddRow_RVV(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
  size_t w = (size_t)src_width;
  do {
    size_t vl = __riscv_vsetvl_e8m4(w);
    vuint8m4_t v_src = __riscv_vle8_v_u8m4(src_ptr, vl);
    vuint16m8_t v_dst = __riscv_vle16_v_u16m8(dst_ptr, vl);
    // Use widening multiply-add instead of widening + add
    v_dst = __riscv_vwmaccu_vx_u16m8(v_dst, 1, v_src, vl);
    __riscv_vse16_v_u16m8(dst_ptr, v_dst, vl);
    w -= vl;
    src_ptr += vl;
    dst_ptr += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEARGBROWDOWN2_RVV
void ScaleARGBRowDown2_RVV(const uint8_t* src_argb,
                           ptrdiff_t src_stride,
                           uint8_t* dst_argb,
                           int dst_width) {
  (void)src_stride;
  size_t w = (size_t)dst_width;
  const uint64_t* src = (const uint64_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  do {
    size_t vl = __riscv_vsetvl_e64m8(w);
    vuint64m8_t v_data = __riscv_vle64_v_u64m8(src, vl);
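    // Each 64-bit element holds two ARGB pixels; the narrowing shift by 32
    // keeps the high word, i.e. the odd (second) pixel of each pair
    // (little-endian).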
    vuint32m4_t v_dst = __riscv_vnsrl_wx_u32m4(v_data, 32, vl);
    __riscv_vse32_v_u32m4(dst, v_dst, vl);
    w -= vl;
    src += vl;
    dst += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEARGBROWDOWN2LINEAR_RVV
void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_argb,
                                 int dst_width) {
  (void)src_stride;
  size_t w = (size_t)dst_width;
  const uint32_t* src = (const uint32_t*)(src_argb);
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m4_t v_odd, v_even, v_dst;
    vuint32m4_t v_odd_32, v_even_32;
    size_t vl = __riscv_vsetvl_e32m4(w);
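    // Segmented load de-interleaves even- and odd-indexed ARGB pixels.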
    __riscv_vlseg2e32_v_u32m4(&v_even_32, &v_odd_32, src, vl);
    v_even = __riscv_vreinterpret_v_u32m4_u8m4(v_even_32);
    v_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_odd_32);
    // Use round-to-nearest-up mode for averaging add
    v_dst = __riscv_vaaddu_vv_u8m4(v_even, v_odd, vl * 4);
    __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
    w -= vl;
    src += vl * 2;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEARGBROWDOWN2BOX_RVV
void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb,
                              ptrdiff_t src_stride,
                              uint8_t* dst_argb,
                              int dst_width) {
  size_t w = (size_t)dst_width;
  const uint32_t* src0 = (const uint32_t*)(src_argb);
  const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst;
    vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16;
    vuint32m4_t v_row0_odd_32, v_row0_even_32, v_row1_odd_32, v_row1_even_32;
    size_t vl = __riscv_vsetvl_e32m4(w);
    __riscv_vlseg2e32_v_u32m4(&v_row0_even_32, &v_row0_odd_32, src0, vl);
    __riscv_vlseg2e32_v_u32m4(&v_row1_even_32, &v_row1_odd_32, src1, vl);
    v_row0_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_even_32);
    v_row0_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_odd_32);
    v_row1_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_even_32);
    v_row1_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_odd_32);
    v_row0_sum = __riscv_vwaddu_vv_u16m8(v_row0_even, v_row0_odd, vl * 4);
    v_row1_sum = __riscv_vwaddu_vv_u16m8(v_row1_even, v_row1_odd, vl * 4);
    v_dst_16 = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4);
    // Use round-to-nearest-up mode for vnclip
    v_dst = __riscv_vnclipu_wx_u8m4(v_dst_16, 2, vl * 4);
    __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
    w -= vl;
    src0 += vl * 2;
    src1 += vl * 2;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEARGBROWDOWNEVEN_RVV
void ScaleARGBRowDownEven_RVV(const uint8_t* src_argb,
                              ptrdiff_t src_stride,
                              int src_stepx,
                              uint8_t* dst_argb,
                              int dst_width) {
  size_t w = (size_t)dst_width;
  const uint32_t* src = (const uint32_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  const int stride_byte = src_stepx * 4;
  do {
    size_t vl = __riscv_vsetvl_e32m8(w);
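    // Strided load picks one ARGB pixel every src_stepx pixels (stride given
    // in bytes).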
    vuint32m8_t v_row = __riscv_vlse32_v_u32m8(src, stride_byte, vl);
    __riscv_vse32_v_u32m8(dst, v_row, vl);
    w -= vl;
    src += vl * src_stepx;
    dst += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEARGBROWDOWNEVENBOX_RVV
void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb,
                                 ptrdiff_t src_stride,
                                 int src_stepx,
                                 uint8_t* dst_argb,
                                 int dst_width) {
  size_t w = (size_t)dst_width;
  const uint32_t* src0 = (const uint32_t*)(src_argb);
  const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
  const int stride_byte = src_stepx * 4;
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst;
    vuint16m8_t v_row0_sum, v_row1_sum, v_sum;
    vuint32m4_t v_row0_low_32, v_row0_high_32, v_row1_low_32, v_row1_high_32;
    size_t vl = __riscv_vsetvl_e32m4(w);
    __riscv_vlsseg2e32_v_u32m4(&v_row0_low_32, &v_row0_high_32, src0,
                               stride_byte, vl);
    __riscv_vlsseg2e32_v_u32m4(&v_row1_low_32, &v_row1_high_32, src1,
                               stride_byte, vl);
    v_row0_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_low_32);
    v_row0_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_high_32);
    v_row1_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_low_32);
    v_row1_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_high_32);
    v_row0_sum = __riscv_vwaddu_vv_u16m8(v_row0_low, v_row0_high, vl * 4);
    v_row1_sum = __riscv_vwaddu_vv_u16m8(v_row1_low, v_row1_high, vl * 4);
    v_sum = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4);
    // Use round-to-nearest-up mode for vnclip
    v_dst = __riscv_vnclipu_wx_u8m4(v_sum, 2, vl * 4);
    __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
    w -= vl;
    src0 += vl * src_stepx;
    src1 += vl * src_stepx;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN2_RVV
void ScaleRowDown2_RVV(const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
                       uint8_t* dst,
                       int dst_width) {
  size_t w = (size_t)dst_width;
  const uint16_t* src = (const uint16_t*)src_ptr;
  (void)src_stride;
  do {
    size_t vl = __riscv_vsetvl_e16m8(w);
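    // Each 16-bit element holds two pixels; the narrowing shift by 8 keeps the
    // high byte, i.e. the odd (second) pixel of each pair (little-endian).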
    vuint16m8_t v_src = __riscv_vle16_v_u16m8(src, vl);
    vuint8m4_t v_dst = __riscv_vnsrl_wx_u8m4(v_src, 8, vl);
    __riscv_vse8_v_u8m4(dst, v_dst, vl);
    w -= vl;
    src += vl;
    dst += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN2LINEAR_RVV
void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst,
                             int dst_width) {
  size_t w = (size_t)dst_width;
  (void)src_stride;
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m4_t v_s0, v_s1, v_dst;
    size_t vl = __riscv_vsetvl_e8m4(w);
    __riscv_vlseg2e8_v_u8m4(&v_s0, &v_s1, src_ptr, vl);
    // Use round-to-nearest-up mode for averaging add
    v_dst = __riscv_vaaddu_vv_u8m4(v_s0, v_s1, vl);
    __riscv_vse8_v_u8m4(dst, v_dst, vl);
    w -= vl;
    src_ptr += 2 * vl;
    dst += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN2BOX_RVV
void ScaleRowDown2Box_RVV(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst,
                          int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  size_t w = (size_t)dst_width;
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    size_t vl = __riscv_vsetvl_e8m4(w);
    vuint8m4_t v_s0, v_s1, v_t0, v_t1;
    vuint16m8_t v_s01, v_t01, v_st01;
    vuint8m4_t v_dst;
    __riscv_vlseg2e8_v_u8m4(&v_s0, &v_s1, s, vl);
    __riscv_vlseg2e8_v_u8m4(&v_t0, &v_t1, t, vl);
    v_s01 = __riscv_vwaddu_vv_u16m8(v_s0, v_s1, vl);
    v_t01 = __riscv_vwaddu_vv_u16m8(v_t0, v_t1, vl);
    v_st01 = __riscv_vadd_vv_u16m8(v_s01, v_t01, vl);
    // Use round-to-nearest-up mode for vnclip
    v_dst = __riscv_vnclipu_wx_u8m4(v_st01, 2, vl);
    __riscv_vse8_v_u8m4(dst, v_dst, vl);
    w -= vl;
    s += 2 * vl;
    t += 2 * vl;
    dst += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN4_RVV
void ScaleRowDown4_RVV(const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
                       uint8_t* dst_ptr,
                       int dst_width) {
  size_t w = (size_t)dst_width;
  (void)src_stride;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl);
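    // Store pixel 2 of each group of four.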
    __riscv_vse8_v_u8m2(dst_ptr, v_s2, vl);
    w -= vl;
    src_ptr += (4 * vl);
    dst_ptr += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN4BOX_RVV
void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
  size_t w = (size_t)dst_width;
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
    vuint8m2_t v_t0, v_t1, v_t2, v_t3;
    vuint8m2_t v_u0, v_u1, v_u2, v_u3;
    vuint8m2_t v_v0, v_v1, v_v2, v_v3;
    vuint16m4_t v_s01, v_s23, v_t01, v_t23;
    vuint16m4_t v_u01, v_u23, v_v01, v_v23;
    vuint16m4_t v_st01, v_st23, v_uv01, v_uv23;
    vuint16m4_t v_st0123, v_uv0123, v_stuv0123;
    vuint8m2_t v_dst;
    size_t vl = __riscv_vsetvl_e8m2(w);

    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl);
    v_s01 = __riscv_vwaddu_vv_u16m4(v_s0, v_s1, vl);

    __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, src_ptr1, vl);
    v_t01 = __riscv_vwaddu_vv_u16m4(v_t0, v_t1, vl);

    __riscv_vlseg4e8_v_u8m2(&v_u0, &v_u1, &v_u2, &v_u3, src_ptr2, vl);
    v_u01 = __riscv_vwaddu_vv_u16m4(v_u0, v_u1, vl);
    v_u23 = __riscv_vwaddu_vv_u16m4(v_u2, v_u3, vl);

    v_s23 = __riscv_vwaddu_vv_u16m4(v_s2, v_s3, vl);
    v_t23 = __riscv_vwaddu_vv_u16m4(v_t2, v_t3, vl);
    v_st01 = __riscv_vadd_vv_u16m4(v_s01, v_t01, vl);
    v_st23 = __riscv_vadd_vv_u16m4(v_s23, v_t23, vl);

    __riscv_vlseg4e8_v_u8m2(&v_v0, &v_v1, &v_v2, &v_v3, src_ptr3, vl);

    v_v01 = __riscv_vwaddu_vv_u16m4(v_v0, v_v1, vl);
    v_v23 = __riscv_vwaddu_vv_u16m4(v_v2, v_v3, vl);

    v_uv01 = __riscv_vadd_vv_u16m4(v_u01, v_v01, vl);
    v_uv23 = __riscv_vadd_vv_u16m4(v_u23, v_v23, vl);

    v_st0123 = __riscv_vadd_vv_u16m4(v_st01, v_st23, vl);
    v_uv0123 = __riscv_vadd_vv_u16m4(v_uv01, v_uv23, vl);
    v_stuv0123 = __riscv_vadd_vv_u16m4(v_st0123, v_uv0123, vl);
    // Use round-to-nearest-up mode for vnclip
    v_dst = __riscv_vnclipu_wx_u8m2(v_stuv0123, 4, vl);
    __riscv_vse8_v_u8m2(dst_ptr, v_dst, vl);
    w -= vl;
    src_ptr += 4 * vl;
    src_ptr1 += 4 * vl;
    src_ptr2 += 4 * vl;
    src_ptr3 += 4 * vl;
    dst_ptr += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN34_RVV
void ScaleRowDown34_RVV(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl);
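    // Keep pixels 0, 1 and 3 of each group of four (3/4 point sampling).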
    __riscv_vsseg3e8_v_u8m2(dst_ptr, v_s0, v_s1, v_s3, vl);
    w -= vl;
    src_ptr += 4 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN34_0_BOX_RVV
void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
    vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16;
    vuint8m2_t v_u0, v_u1, v_u2, v_u3;
    vuint16m4_t v_u1_u16;
    vuint8m2_t v_a0, v_a1, v_a2;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl);

    if (src_stride == 0) {
      v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
      v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
      v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_s2, 2, vl);
      v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_s3, 2, vl);
    } else {
      vuint8m2_t v_t0, v_t1, v_t2, v_t3;
      __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl);
      v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 0, vl);
      v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 0, vl);
      v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_t2, 0, vl);
      v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_t3, 0, vl);
      t += 4 * vl;
    }

    v_t0_u16 = __riscv_vwmaccu_vx_u16m4(v_t0_u16, 3, v_s0, vl);
    v_t1_u16 = __riscv_vwmaccu_vx_u16m4(v_t1_u16, 3, v_s1, vl);
    v_t2_u16 = __riscv_vwmaccu_vx_u16m4(v_t2_u16, 3, v_s2, vl);
    v_t3_u16 = __riscv_vwmaccu_vx_u16m4(v_t3_u16, 3, v_s3, vl);

    // Use round-to-nearest-up mode for vnclip & averaging add
    v_u0 = __riscv_vnclipu_wx_u8m2(v_t0_u16, 2, vl);
    v_u1 = __riscv_vnclipu_wx_u8m2(v_t1_u16, 2, vl);
    v_u2 = __riscv_vnclipu_wx_u8m2(v_t2_u16, 2, vl);
    v_u3 = __riscv_vnclipu_wx_u8m2(v_t3_u16, 2, vl);

    // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u1, 0, vl);
    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u0, vl);
    v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);

    // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
    v_a1 = __riscv_vaaddu_vv_u8m2(v_u1, v_u2, vl);

    // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u2, 0, vl);
    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u3, vl);
    v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);

    __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl);

    w -= vl;
    s += 4 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN34_1_BOX_RVV
void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
    vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3;
    vuint16m4_t v_u1_u16;
    vuint8m2_t v_a0, v_a1, v_a2;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl);

    // Use round-to-nearest-up mode for vnclip & averaging add
    if (src_stride == 0) {
      v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_s0, vl);
      v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_s1, vl);
      v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_s2, vl);
      v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_s3, vl);
    } else {
      vuint8m2_t v_t0, v_t1, v_t2, v_t3;
      __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl);
      v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_t0, vl);
      v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_t1, vl);
      v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_t2, vl);
      v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_t3, vl);
      t += 4 * vl;
    }
    // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave1, 0, vl);
    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave0, vl);
    v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);

    // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
    v_a1 = __riscv_vaaddu_vv_u8m2(v_ave1, v_ave2, vl);

    // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave2, 0, vl);
    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave3, vl);
    v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);

    __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl);

    w -= vl;
    s += 4 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN38_RVV
void ScaleRowDown38_RVV(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  (void)src_stride;
  assert(dst_width % 3 == 0);
  do {
    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
    size_t vl = __riscv_vsetvl_e8m1(w);
    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
                            &v_s7, src_ptr, vl);
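    // Keep pixels 0, 3 and 6 of each group of eight (3/8 point sampling).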
    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_s0, v_s3, v_s6, vl);
    w -= vl;
    src_ptr += 8 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN38_2_BOX_RVV
void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  const uint16_t coeff_a = (65536u / 6u);
  const uint16_t coeff_b = (65536u / 4u);
  assert((dst_width % 3 == 0) && (dst_width > 0));
  do {
    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
    vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
    vuint16m2_t v_e0, v_e1, v_e2, v_e;
    vuint16m2_t v_f0, v_f1, v_f2, v_f;
    vuint16m2_t v_g0, v_g1, v_g;
    vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
    size_t vl = __riscv_vsetvl_e8m1(w);
    // s: e00, e10, e20, f00, f10, f20, g00, g10
    // t: e01, e11, e21, f01, f11, f21, g01, g11
    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
                            &v_s7, src_ptr, vl);
    __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
                            &v_t7, src_ptr + src_stride, vl);
    // Calculate sum of [e00, e21] to v_e
    // Calculate sum of [f00, f21] to v_f
    // Calculate sum of [g00, g11] to v_g
    v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
    v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
    v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
    v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
    v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
    v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
    v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
    v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);

    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
    v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
    v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
    v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);

    // Average in 16-bit fixed-point
    v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
    v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
    v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);

    v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
    v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
    v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);

    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
    w -= vl;
    src_ptr += 8 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN38_3_BOX_RVV
void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  const uint16_t coeff_a = (65536u / 9u);
  const uint16_t coeff_b = (65536u / 6u);
  assert((dst_width % 3 == 0) && (dst_width > 0));
  do {
    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
    vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
    vuint8m1_t v_u0, v_u1, v_u2, v_u3, v_u4, v_u5, v_u6, v_u7;
    vuint16m2_t v_e0, v_e1, v_e2, v_e3, v_e4, v_e;
    vuint16m2_t v_f0, v_f1, v_f2, v_f3, v_f4, v_f;
    vuint16m2_t v_g0, v_g1, v_g2, v_g;
    vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
    size_t vl = __riscv_vsetvl_e8m1(w);
    // s: e00, e10, e20, f00, f10, f20, g00, g10
    // t: e01, e11, e21, f01, f11, f21, g01, g11
    // u: e02, e12, e22, f02, f12, f22, g02, g12
    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
                            &v_s7, src_ptr, vl);
    __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
                            &v_t7, src_ptr + src_stride, vl);
    __riscv_vlseg8e8_v_u8m1(&v_u0, &v_u1, &v_u2, &v_u3, &v_u4, &v_u5, &v_u6,
                            &v_u7, src_ptr + 2 * src_stride, vl);
    // Calculate sum of [e00, e22]
    v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
    v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
    v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
    v_e3 = __riscv_vwaddu_vv_u16m2(v_u0, v_u1, vl);
    v_e4 = __riscv_vwaddu_vx_u16m2(v_u2, 0, vl);

    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
    v_e2 = __riscv_vadd_vv_u16m2(v_e2, v_e3, vl);
    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e4, vl);
    v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
    // Calculate sum of [f00, f22]
    v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
    v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
    v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
    v_f3 = __riscv_vwaddu_vv_u16m2(v_u3, v_u4, vl);
    v_f4 = __riscv_vwaddu_vx_u16m2(v_u5, 0, vl);

    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
    v_f2 = __riscv_vadd_vv_u16m2(v_f2, v_f3, vl);
    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f4, vl);
    v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
    // Calculate sum of [g00, g12]
    v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
    v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
    v_g2 = __riscv_vwaddu_vv_u16m2(v_u6, v_u7, vl);

    v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
    v_g = __riscv_vadd_vv_u16m2(v_g, v_g2, vl);

    // Average in 16-bit fixed-point
    v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
    v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
    v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);

    v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
    v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
    v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
    w -= vl;
    src_ptr += 8 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}
#endif

// ScaleRowUp2_(Bi)linear_RVV functions are equivalent to other platforms'
// ScaleRowUp2_(Bi)linear_Any_XXX. They process the entire row here, while
// other platforms only handle the non-edge part and process the edges with
// scalar code.

#ifdef HAS_SCALEROWUP2_LINEAR_RVV
void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
                            int dst_width) {
  size_t work_width = (size_t)dst_width - 1u;
  size_t src_width = work_width >> 1u;
  const uint8_t* work_src_ptr = src_ptr;
  uint8_t* work_dst_ptr = dst_ptr + 1;
  size_t vl = __riscv_vsetvlmax_e8m4();
  vuint8m4_t v_3 = __riscv_vmv_v_x_u8m4(3, vl);
  dst_ptr[0] = src_ptr[0];
  while (src_width > 0) {
    vuint8m4_t v_src0, v_src1, v_dst_odd, v_dst_even;
    vuint16m8_t v_src0_u16, v_src1_u16;
    size_t vl = __riscv_vsetvl_e8m4(src_width);
    v_src0 = __riscv_vle8_v_u8m4(work_src_ptr, vl);
    v_src1 = __riscv_vle8_v_u8m4(work_src_ptr + 1, vl);

    v_src0_u16 = __riscv_vwaddu_vx_u16m8(v_src0, 2, vl);
    v_src1_u16 = __riscv_vwaddu_vx_u16m8(v_src1, 2, vl);
    v_src0_u16 = __riscv_vwmaccu_vv_u16m8(v_src0_u16, v_3, v_src1, vl);
    v_src1_u16 = __riscv_vwmaccu_vv_u16m8(v_src1_u16, v_3, v_src0, vl);

    v_dst_odd = __riscv_vnsrl_wx_u8m4(v_src0_u16, 2, vl);
    v_dst_even = __riscv_vnsrl_wx_u8m4(v_src1_u16, 2, vl);

    __riscv_vsseg2e8_v_u8m4(work_dst_ptr, v_dst_even, v_dst_odd, vl);

    src_width -= vl;
    work_src_ptr += vl;
    work_dst_ptr += 2 * vl;
  }
  dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2];
}
#endif

#ifdef HAS_SCALEROWUP2_BILINEAR_RVV
void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              ptrdiff_t dst_stride,
                              int dst_width) {
  size_t work_width = ((size_t)dst_width - 1u) & ~1u;
  size_t src_width = work_width >> 1u;
  const uint8_t* work_s = src_ptr;
  const uint8_t* work_t = src_ptr + src_stride;
  const uint8_t* s = work_s;
  const uint8_t* t = work_t;
  uint8_t* d = dst_ptr;
  uint8_t* e = dst_ptr + dst_stride;
  uint8_t* work_d = d + 1;
  uint8_t* work_e = e + 1;
  size_t vl = __riscv_vsetvlmax_e16m4();
  vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl);
  vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl);
  d[0] = (3 * s[0] + t[0] + 2) >> 2;
  e[0] = (s[0] + 3 * t[0] + 2) >> 2;
  while (src_width > 0) {
    vuint8m2_t v_s0, v_s1, v_t0, v_t1;
    vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16;
    vuint16m4_t v_t0_u16_, v_t1_u16_;
    vuint8m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd;
    size_t vl = __riscv_vsetvl_e8m2(src_width);
    v_s0 = __riscv_vle8_v_u8m2(work_s, vl);
    v_s1 = __riscv_vle8_v_u8m2(work_s + 1, vl);

    v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
    v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
    v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl);
    v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl);

    v_t0 = __riscv_vle8_v_u8m2(work_t, vl);
    v_t1 = __riscv_vle8_v_u8m2(work_t + 1, vl);

    v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl);
    v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl);
    v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl);
    v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl);

    v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl);
    v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl);

    v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl);
    v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl);
    v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl);
    v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl);

    v_dst0_odd = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl);
    v_dst0_even = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl);
    v_dst1_odd = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl);
    v_dst1_even = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl);

    __riscv_vsseg2e8_v_u8m2(work_d, v_dst0_even, v_dst0_odd, vl);
    __riscv_vsseg2e8_v_u8m2(work_e, v_dst1_even, v_dst1_odd, vl);

    src_width -= vl;
    work_s += vl;
    work_t += vl;
    work_d += 2 * vl;
    work_e += 2 * vl;
  }
  d[dst_width - 1] =
      (3 * s[(dst_width - 1) / 2] + t[(dst_width - 1) / 2] + 2) >> 2;
  e[dst_width - 1] =
      (s[(dst_width - 1) / 2] + 3 * t[(dst_width - 1) / 2] + 2) >> 2;
}
#endif

#ifdef HAS_SCALEUVROWDOWN2_RVV
void ScaleUVRowDown2_RVV(const uint8_t* src_uv,
                         ptrdiff_t src_stride,
                         uint8_t* dst_uv,
                         int dst_width) {
  size_t w = (size_t)dst_width;
  const uint32_t* src = (const uint32_t*)src_uv;
  uint16_t* dst = (uint16_t*)dst_uv;
  (void)src_stride;
  do {
    size_t vl = __riscv_vsetvl_e32m8(w);
    vuint32m8_t v_data = __riscv_vle32_v_u32m8(src, vl);
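    // Each 32-bit element holds two UV pairs; the narrowing shift by 16 keeps
    // the second pair (little-endian).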
    vuint16m4_t v_u1v1 = __riscv_vnsrl_wx_u16m4(v_data, 16, vl);
    __riscv_vse16_v_u16m4(dst, v_u1v1, vl);
    w -= vl;
    src += vl;
    dst += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEUVROWDOWN2LINEAR_RVV
void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv,
                               ptrdiff_t src_stride,
                               uint8_t* dst_uv,
                               int dst_width) {
  size_t w = (size_t)dst_width;
  const uint16_t* src = (const uint16_t*)src_uv;
  (void)src_stride;
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m4_t v_u0v0, v_u1v1, v_avg;
    vuint16m4_t v_u0v0_16, v_u1v1_16;
    size_t vl = __riscv_vsetvl_e16m4(w);
    __riscv_vlseg2e16_v_u16m4(&v_u0v0_16, &v_u1v1_16, src, vl);
    v_u0v0 = __riscv_vreinterpret_v_u16m4_u8m4(v_u0v0_16);
    v_u1v1 = __riscv_vreinterpret_v_u16m4_u8m4(v_u1v1_16);
    // Use round-to-nearest-up mode for averaging add
    v_avg = __riscv_vaaddu_vv_u8m4(v_u0v0, v_u1v1, vl * 2);
    __riscv_vse8_v_u8m4(dst_uv, v_avg, vl * 2);
    w -= vl;
    src += vl * 2;
    dst_uv += vl * 2;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEUVROWDOWN2BOX_RVV
void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv,
                            ptrdiff_t src_stride,
                            uint8_t* dst_uv,
                            int dst_width) {
  const uint8_t* src_uv_row1 = src_uv + src_stride;
  size_t w = (size_t)dst_width;
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0;
    vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1;
    vuint16m4_t v_u0u1_row0, v_u0u1_row1, v_v0v1_row0, v_v0v1_row1;
    vuint16m4_t v_sum0, v_sum1;
    vuint8m2_t v_dst_u, v_dst_v;
    size_t vl = __riscv_vsetvl_e8m2(w);

    __riscv_vlseg4e8_v_u8m2(&v_u0_row0, &v_v0_row0, &v_u1_row0, &v_v1_row0,
                            src_uv, vl);
    __riscv_vlseg4e8_v_u8m2(&v_u0_row1, &v_v0_row1, &v_u1_row1, &v_v1_row1,
                            src_uv_row1, vl);

    v_u0u1_row0 = __riscv_vwaddu_vv_u16m4(v_u0_row0, v_u1_row0, vl);
    v_u0u1_row1 = __riscv_vwaddu_vv_u16m4(v_u0_row1, v_u1_row1, vl);
    v_v0v1_row0 = __riscv_vwaddu_vv_u16m4(v_v0_row0, v_v1_row0, vl);
    v_v0v1_row1 = __riscv_vwaddu_vv_u16m4(v_v0_row1, v_v1_row1, vl);

    v_sum0 = __riscv_vadd_vv_u16m4(v_u0u1_row0, v_u0u1_row1, vl);
    v_sum1 = __riscv_vadd_vv_u16m4(v_v0v1_row0, v_v0v1_row1, vl);
    // Use round-to-nearest-up mode for vnclip
    v_dst_u = __riscv_vnclipu_wx_u8m2(v_sum0, 2, vl);
    v_dst_v = __riscv_vnclipu_wx_u8m2(v_sum1, 2, vl);

    __riscv_vsseg2e8_v_u8m2(dst_uv, v_dst_u, v_dst_v, vl);

    dst_uv += 2 * vl;
    src_uv += 4 * vl;
    w -= vl;
    src_uv_row1 += 4 * vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEUVROWDOWN4_RVV
void ScaleUVRowDown4_RVV(const uint8_t* src_uv,
                         ptrdiff_t src_stride,
                         int src_stepx,
                         uint8_t* dst_uv,
                         int dst_width) {
  // Overflow cannot happen here: sizeof(size_t) is twice sizeof(int), and
  // dst_width = src_width / 4, where src_width is also an int.
  size_t w = (size_t)dst_width * 8;
  (void)src_stride;
  (void)src_stepx;
  do {
    size_t vl = __riscv_vsetvl_e8m8(w);
    vuint8m8_t v_row = __riscv_vle8_v_u8m8(src_uv, vl);
    vuint64m8_t v_row_64 = __riscv_vreinterpret_v_u8m8_u64m8(v_row);
    // Narrowing without clipping
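    // Two truncating narrows keep the low 16 bits of each 64-bit element,
    // i.e. the first UV pair of every group of four (little-endian).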
    vuint32m4_t v_tmp = __riscv_vncvt_x_x_w_u32m4(v_row_64, vl / 8);
    vuint16m2_t v_dst_16 = __riscv_vncvt_x_x_w_u16m2(v_tmp, vl / 8);
    vuint8m2_t v_dst = __riscv_vreinterpret_v_u16m2_u8m2(v_dst_16);
    __riscv_vse8_v_u8m2(dst_uv, v_dst, vl / 4);
    w -= vl;
    src_uv += vl;
    dst_uv += vl / 4;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEUVROWDOWNEVEN_RVV
void ScaleUVRowDownEven_RVV(const uint8_t* src_uv,
                            ptrdiff_t src_stride,
                            int src_stepx,
                            uint8_t* dst_uv,
                            int dst_width) {
  size_t w = (size_t)dst_width;
  const ptrdiff_t stride_byte = (ptrdiff_t)src_stepx * 2;
  const uint16_t* src = (const uint16_t*)(src_uv);
  uint16_t* dst = (uint16_t*)(dst_uv);
  (void)src_stride;
  do {
    size_t vl = __riscv_vsetvl_e16m8(w);
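    // Strided load picks one UV pair every src_stepx pairs (stride given in
    // bytes).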
    vuint16m8_t v_row = __riscv_vlse16_v_u16m8(src, stride_byte, vl);
    __riscv_vse16_v_u16m8(dst, v_row, vl);
    w -= vl;
    src += vl * src_stepx;
    dst += vl;
  } while (w > 0);
}
#endif

// ScaleUVRowUp2_(Bi)linear_RVV functions are equivalent to other platforms'
// ScaleUVRowUp2_(Bi)linear_Any_XXX. They process the entire row here, while
// other platforms only handle the non-edge part and process the edges with
// scalar code.

#ifdef HAS_SCALEUVROWUP2_LINEAR_RVV
void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int dst_width) {
  size_t work_width = ((size_t)dst_width - 1u) & ~1u;
  uint16_t* work_dst_ptr = (uint16_t*)dst_ptr + 1;
  const uint8_t* work_src_ptr = src_ptr;
  size_t vl = __riscv_vsetvlmax_e8m4();
  vuint8m4_t v_3_u8 = __riscv_vmv_v_x_u8m4(3, vl);
  dst_ptr[0] = src_ptr[0];
  dst_ptr[1] = src_ptr[1];
  while (work_width > 0) {
    vuint8m4_t v_uv0, v_uv1, v_dst_odd_u8, v_dst_even_u8;
    vuint16m4_t v_dst_odd, v_dst_even;
    vuint16m8_t v_uv0_u16, v_uv1_u16;
    size_t vl = __riscv_vsetvl_e8m4(work_width);
    v_uv0 = __riscv_vle8_v_u8m4(work_src_ptr, vl);
    v_uv1 = __riscv_vle8_v_u8m4(work_src_ptr + 2, vl);

    v_uv0_u16 = __riscv_vwaddu_vx_u16m8(v_uv0, 2, vl);
    v_uv1_u16 = __riscv_vwaddu_vx_u16m8(v_uv1, 2, vl);

    v_uv0_u16 = __riscv_vwmaccu_vv_u16m8(v_uv0_u16, v_3_u8, v_uv1, vl);
    v_uv1_u16 = __riscv_vwmaccu_vv_u16m8(v_uv1_u16, v_3_u8, v_uv0, vl);

    v_dst_odd_u8 = __riscv_vnsrl_wx_u8m4(v_uv0_u16, 2, vl);
    v_dst_even_u8 = __riscv_vnsrl_wx_u8m4(v_uv1_u16, 2, vl);

    v_dst_even = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_even_u8);
    v_dst_odd = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_odd_u8);

    __riscv_vsseg2e16_v_u16m4(work_dst_ptr, v_dst_even, v_dst_odd, vl / 2);

    work_width -= vl;
    work_src_ptr += vl;
    work_dst_ptr += vl;
  }
  dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2];
  dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1];
}
#endif

#ifdef HAS_SCALEUVROWUP2_BILINEAR_RVV
void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                ptrdiff_t dst_stride,
                                int dst_width) {
  size_t work_width = ((size_t)dst_width - 1u) & ~1u;
  const uint8_t* work_s = src_ptr;
  const uint8_t* work_t = src_ptr + src_stride;
  const uint8_t* s = work_s;
  const uint8_t* t = work_t;
  uint8_t* d = dst_ptr;
  uint8_t* e = dst_ptr + dst_stride;
  uint16_t* work_d = (uint16_t*)d + 1;
  uint16_t* work_e = (uint16_t*)e + 1;
  size_t vl = __riscv_vsetvlmax_e16m4();
  vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl);
  vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl);
  d[0] = (3 * s[0] + t[0] + 2) >> 2;
  e[0] = (s[0] + 3 * t[0] + 2) >> 2;
  d[1] = (3 * s[1] + t[1] + 2) >> 2;
  e[1] = (s[1] + 3 * t[1] + 2) >> 2;
  while (work_width > 0) {
    vuint8m2_t v_s0, v_s1, v_t0, v_t1;
    vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16;
    vuint16m4_t v_t0_u16_, v_t1_u16_;
    vuint8m2_t v_dst0_odd_u8, v_dst0_even_u8, v_dst1_odd_u8, v_dst1_even_u8;
    vuint16m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd;
    size_t vl = __riscv_vsetvl_e8m2(work_width);
    v_s0 = __riscv_vle8_v_u8m2(work_s, vl);
    v_s1 = __riscv_vle8_v_u8m2(work_s + 2, vl);

    v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
    v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
    v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl);
    v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl);

    v_t0 = __riscv_vle8_v_u8m2(work_t, vl);
    v_t1 = __riscv_vle8_v_u8m2(work_t + 2, vl);

    v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl);
    v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl);
    v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl);
    v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl);

    v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl);
    v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl);

    v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl);
    v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl);
    v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl);
    v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl);

    v_dst0_odd_u8 = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl);
    v_dst0_even_u8 = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl);
    v_dst1_odd_u8 = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl);
    v_dst1_even_u8 = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl);

    v_dst0_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_even_u8);
    v_dst0_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_odd_u8);
    v_dst1_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_even_u8);
    v_dst1_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_odd_u8);

    __riscv_vsseg2e16_v_u16m2(work_d, v_dst0_even, v_dst0_odd, vl / 2);
    __riscv_vsseg2e16_v_u16m2(work_e, v_dst1_even, v_dst1_odd, vl / 2);

    work_width -= vl;
    work_s += vl;
    work_t += vl;
    work_d += vl;
    work_e += vl;
  }
  d[2 * dst_width - 2] =
      (3 * s[((dst_width + 1) & ~1) - 2] + t[((dst_width + 1) & ~1) - 2] + 2) >>
      2;
  e[2 * dst_width - 2] =
      (s[((dst_width + 1) & ~1) - 2] + 3 * t[((dst_width + 1) & ~1) - 2] + 2) >>
      2;
  d[2 * dst_width - 1] =
      (3 * s[((dst_width + 1) & ~1) - 1] + t[((dst_width + 1) & ~1) - 1] + 2) >>
      2;
  e[2 * dst_width - 1] =
      (s[((dst_width + 1) & ~1) - 1] + 3 * t[((dst_width + 1) & ~1) - 1] + 2) >>
      2;
}
#endif

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) &&
        // defined(__clang__)