/*
 * Copyright 2023 The LibYuv Project Authors. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

/*
 * Copyright (c) 2023 SiFive, Inc. All rights reserved.
 *
 * Contributed by Darren Hsieh <[email protected]>
 * Contributed by Bruce Lai <[email protected]>
 */

#include "libyuv/row.h"
#include "libyuv/scale_row.h"

// This module is for clang RVV. GCC does not yet support the segment load &
// store intrinsics.
#if !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) && \
    defined(__clang__)
#include <assert.h>
#include <riscv_vector.h>
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#ifdef HAS_SCALEADDROW_RVV
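// Adds each source byte into the 16-bit accumulator row: dst[i] += src[i].
// The widening multiply-accumulate with a multiplier of 1 widens and adds in
// a single instruction.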
void ScaleAddRow_RVV(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
  size_t w = (size_t)src_width;
  do {
    size_t vl = __riscv_vsetvl_e8m4(w);
    vuint8m4_t v_src = __riscv_vle8_v_u8m4(src_ptr, vl);
    vuint16m8_t v_dst = __riscv_vle16_v_u16m8(dst_ptr, vl);
    // Use widening multiply-add instead of widening + add
    v_dst = __riscv_vwmaccu_vx_u16m8(v_dst, 1, v_src, vl);
    __riscv_vse16_v_u16m8(dst_ptr, v_dst, vl);
    w -= vl;
    src_ptr += vl;
    dst_ptr += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEARGBROWDOWN2_RVV
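// Halves an ARGB row by keeping the second pixel of each horizontal pair:
// each 64-bit element holds two ARGB pixels, and the narrowing shift by 32
// extracts the upper (odd-indexed) one.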
void ScaleARGBRowDown2_RVV(const uint8_t* src_argb,
                           ptrdiff_t src_stride,
                           uint8_t* dst_argb,
                           int dst_width) {
  (void)src_stride;
  size_t w = (size_t)dst_width;
  const uint64_t* src = (const uint64_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  do {
    size_t vl = __riscv_vsetvl_e64m8(w);
    vuint64m8_t v_data = __riscv_vle64_v_u64m8(src, vl);
    vuint32m4_t v_dst = __riscv_vnsrl_wx_u32m4(v_data, 32, vl);
    __riscv_vse32_v_u32m4(dst, v_dst, vl);
    w -= vl;
    src += vl;
    dst += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEARGBROWDOWN2LINEAR_RVV
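// Halves an ARGB row by averaging each horizontal pair of pixels channel-wise.
// The segment load de-interleaves even and odd pixels, and vaaddu performs a
// rounding average on their bytes.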
void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
                                 ptrdiff_t src_stride,
                                 uint8_t* dst_argb,
                                 int dst_width) {
  (void)src_stride;
  size_t w = (size_t)dst_width;
  const uint32_t* src = (const uint32_t*)(src_argb);
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m4_t v_odd, v_even, v_dst;
    vuint32m4_t v_odd_32, v_even_32;
    size_t vl = __riscv_vsetvl_e32m4(w);
    __riscv_vlseg2e32_v_u32m4(&v_even_32, &v_odd_32, src, vl);
    v_even = __riscv_vreinterpret_v_u32m4_u8m4(v_even_32);
    v_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_odd_32);
    // Use round-to-nearest-up mode for averaging add
    v_dst = __riscv_vaaddu_vv_u8m4(v_even, v_odd, vl * 4);
    __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
    w -= vl;
    src += vl * 2;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEARGBROWDOWN2BOX_RVV
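// Averages each 2x2 block of ARGB pixels into one destination pixel: the
// per-channel sums are widened to 16 bits and (sum + 2) >> 2 is computed by
// vnclipu in round-to-nearest-up mode.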
void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb,
                              ptrdiff_t src_stride,
                              uint8_t* dst_argb,
                              int dst_width) {
  size_t w = (size_t)dst_width;
  const uint32_t* src0 = (const uint32_t*)(src_argb);
  const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst;
    vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16;
    vuint32m4_t v_row0_odd_32, v_row0_even_32, v_row1_odd_32, v_row1_even_32;
    size_t vl = __riscv_vsetvl_e32m4(w);
    __riscv_vlseg2e32_v_u32m4(&v_row0_even_32, &v_row0_odd_32, src0, vl);
    __riscv_vlseg2e32_v_u32m4(&v_row1_even_32, &v_row1_odd_32, src1, vl);
    v_row0_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_even_32);
    v_row0_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_odd_32);
    v_row1_even = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_even_32);
    v_row1_odd = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_odd_32);
    v_row0_sum = __riscv_vwaddu_vv_u16m8(v_row0_even, v_row0_odd, vl * 4);
    v_row1_sum = __riscv_vwaddu_vv_u16m8(v_row1_even, v_row1_odd, vl * 4);
    v_dst_16 = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4);
    // Use round-to-nearest-up mode for vnclip
    v_dst = __riscv_vnclipu_wx_u8m4(v_dst_16, 2, vl * 4);
    __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
    w -= vl;
    src0 += vl * 2;
    src1 += vl * 2;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEARGBROWDOWNEVEN_RVV
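// Selects every src_stepx-th ARGB pixel from the row using a strided 32-bit
// load (stride in bytes = src_stepx * 4).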
void ScaleARGBRowDownEven_RVV(const uint8_t* src_argb,
                              ptrdiff_t src_stride,
                              int src_stepx,
                              uint8_t* dst_argb,
                              int dst_width) {
  size_t w = (size_t)dst_width;
  const uint32_t* src = (const uint32_t*)(src_argb);
  uint32_t* dst = (uint32_t*)(dst_argb);
  const int stride_byte = src_stepx * 4;
  do {
    size_t vl = __riscv_vsetvl_e32m8(w);
    vuint32m8_t v_row = __riscv_vlse32_v_u32m8(src, stride_byte, vl);
    __riscv_vse32_v_u32m8(dst, v_row, vl);
    w -= vl;
    src += vl * src_stepx;
    dst += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEARGBROWDOWNEVENBOX_RVV
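// Box-filters every src_stepx-th 2x2 block of ARGB pixels: for each selected
// column it averages the pixel, its right neighbor, and the two pixels below
// them, with rounding.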
void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb,
                                 ptrdiff_t src_stride,
                                 int src_stepx,
                                 uint8_t* dst_argb,
                                 int dst_width) {
  size_t w = (size_t)dst_width;
  const uint32_t* src0 = (const uint32_t*)(src_argb);
  const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
  const int stride_byte = src_stepx * 4;
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst;
    vuint16m8_t v_row0_sum, v_row1_sum, v_sum;
    vuint32m4_t v_row0_low_32, v_row0_high_32, v_row1_low_32, v_row1_high_32;
    size_t vl = __riscv_vsetvl_e32m4(w);
    __riscv_vlsseg2e32_v_u32m4(&v_row0_low_32, &v_row0_high_32, src0,
                               stride_byte, vl);
    __riscv_vlsseg2e32_v_u32m4(&v_row1_low_32, &v_row1_high_32, src1,
                               stride_byte, vl);
    v_row0_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_low_32);
    v_row0_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row0_high_32);
    v_row1_low = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_low_32);
    v_row1_high = __riscv_vreinterpret_v_u32m4_u8m4(v_row1_high_32);
    v_row0_sum = __riscv_vwaddu_vv_u16m8(v_row0_low, v_row0_high, vl * 4);
    v_row1_sum = __riscv_vwaddu_vv_u16m8(v_row1_low, v_row1_high, vl * 4);
    v_sum = __riscv_vadd_vv_u16m8(v_row0_sum, v_row1_sum, vl * 4);
    // Use round-to-nearest-up mode for vnclip
    v_dst = __riscv_vnclipu_wx_u8m4(v_sum, 2, vl * 4);
    __riscv_vse8_v_u8m4(dst_argb, v_dst, vl * 4);
    w -= vl;
    src0 += vl * src_stepx;
    src1 += vl * src_stepx;
    dst_argb += vl * 4;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN2_RVV
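// Halves a row of bytes by keeping the odd-indexed byte of each pair: each
// 16-bit element holds two source bytes, and the narrowing shift by 8 extracts
// the upper one.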
void ScaleRowDown2_RVV(const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
                       uint8_t* dst,
                       int dst_width) {
  size_t w = (size_t)dst_width;
  const uint16_t* src = (const uint16_t*)src_ptr;
  (void)src_stride;
  do {
    size_t vl = __riscv_vsetvl_e16m8(w);
    vuint16m8_t v_src = __riscv_vle16_v_u16m8(src, vl);
    vuint8m4_t v_dst = __riscv_vnsrl_wx_u8m4(v_src, 8, vl);
    __riscv_vse8_v_u8m4(dst, v_dst, vl);
    w -= vl;
    src += vl;
    dst += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN2LINEAR_RVV
void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr,
                             ptrdiff_t src_stride,
                             uint8_t* dst,
                             int dst_width) {
  size_t w = (size_t)dst_width;
  (void)src_stride;
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m4_t v_s0, v_s1, v_dst;
    size_t vl = __riscv_vsetvl_e8m4(w);
    __riscv_vlseg2e8_v_u8m4(&v_s0, &v_s1, src_ptr, vl);
    // Use round-to-nearest-up mode for averaging add
    v_dst = __riscv_vaaddu_vv_u8m4(v_s0, v_s1, vl);
    __riscv_vse8_v_u8m4(dst, v_dst, vl);
    w -= vl;
    src_ptr += 2 * vl;
    dst += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN2BOX_RVV
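// Averages each 2x2 block of source bytes into one destination byte:
// (s0 + s1 + t0 + t1 + 2) >> 2 with round-to-nearest-up.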
void ScaleRowDown2Box_RVV(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst,
                          int dst_width) {
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  size_t w = (size_t)dst_width;
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    size_t vl = __riscv_vsetvl_e8m4(w);
    vuint8m4_t v_s0, v_s1, v_t0, v_t1;
    vuint16m8_t v_s01, v_t01, v_st01;
    vuint8m4_t v_dst;
    __riscv_vlseg2e8_v_u8m4(&v_s0, &v_s1, s, vl);
    __riscv_vlseg2e8_v_u8m4(&v_t0, &v_t1, t, vl);
    v_s01 = __riscv_vwaddu_vv_u16m8(v_s0, v_s1, vl);
    v_t01 = __riscv_vwaddu_vv_u16m8(v_t0, v_t1, vl);
    v_st01 = __riscv_vadd_vv_u16m8(v_s01, v_t01, vl);
    // Use round-to-nearest-up mode for vnclip
    v_dst = __riscv_vnclipu_wx_u8m4(v_st01, 2, vl);
    __riscv_vse8_v_u8m4(dst, v_dst, vl);
    w -= vl;
    s += 2 * vl;
    t += 2 * vl;
    dst += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN4_RVV
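// Quarters a row of bytes by keeping the byte at index 2 of each group of
// four (the segment load splits the row into four interleaved lanes).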
void ScaleRowDown4_RVV(const uint8_t* src_ptr,
                       ptrdiff_t src_stride,
                       uint8_t* dst_ptr,
                       int dst_width) {
  size_t w = (size_t)dst_width;
  (void)src_stride;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl);
    __riscv_vse8_v_u8m2(dst_ptr, v_s2, vl);
    w -= vl;
    src_ptr += (4 * vl);
    dst_ptr += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN4BOX_RVV
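// Averages each 4x4 block of source bytes into one destination byte: the 16
// samples are summed in 16 bits and (sum + 8) >> 4 is computed by vnclipu in
// round-to-nearest-up mode.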
void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
                          ptrdiff_t src_stride,
                          uint8_t* dst_ptr,
                          int dst_width) {
  const uint8_t* src_ptr1 = src_ptr + src_stride;
  const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
  const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
  size_t w = (size_t)dst_width;
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
    vuint8m2_t v_t0, v_t1, v_t2, v_t3;
    vuint8m2_t v_u0, v_u1, v_u2, v_u3;
    vuint8m2_t v_v0, v_v1, v_v2, v_v3;
    vuint16m4_t v_s01, v_s23, v_t01, v_t23;
    vuint16m4_t v_u01, v_u23, v_v01, v_v23;
    vuint16m4_t v_st01, v_st23, v_uv01, v_uv23;
    vuint16m4_t v_st0123, v_uv0123, v_stuv0123;
    vuint8m2_t v_dst;
    size_t vl = __riscv_vsetvl_e8m2(w);

    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl);
    v_s01 = __riscv_vwaddu_vv_u16m4(v_s0, v_s1, vl);

    __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, src_ptr1, vl);
    v_t01 = __riscv_vwaddu_vv_u16m4(v_t0, v_t1, vl);

    __riscv_vlseg4e8_v_u8m2(&v_u0, &v_u1, &v_u2, &v_u3, src_ptr2, vl);
    v_u01 = __riscv_vwaddu_vv_u16m4(v_u0, v_u1, vl);
    v_u23 = __riscv_vwaddu_vv_u16m4(v_u2, v_u3, vl);

    v_s23 = __riscv_vwaddu_vv_u16m4(v_s2, v_s3, vl);
    v_t23 = __riscv_vwaddu_vv_u16m4(v_t2, v_t3, vl);
    v_st01 = __riscv_vadd_vv_u16m4(v_s01, v_t01, vl);
    v_st23 = __riscv_vadd_vv_u16m4(v_s23, v_t23, vl);

    __riscv_vlseg4e8_v_u8m2(&v_v0, &v_v1, &v_v2, &v_v3, src_ptr3, vl);

    v_v01 = __riscv_vwaddu_vv_u16m4(v_v0, v_v1, vl);
    v_v23 = __riscv_vwaddu_vv_u16m4(v_v2, v_v3, vl);

    v_uv01 = __riscv_vadd_vv_u16m4(v_u01, v_v01, vl);
    v_uv23 = __riscv_vadd_vv_u16m4(v_u23, v_v23, vl);

    v_st0123 = __riscv_vadd_vv_u16m4(v_st01, v_st23, vl);
    v_uv0123 = __riscv_vadd_vv_u16m4(v_uv01, v_uv23, vl);
    v_stuv0123 = __riscv_vadd_vv_u16m4(v_st0123, v_uv0123, vl);
    // Use round-to-nearest-up mode for vnclip
    v_dst = __riscv_vnclipu_wx_u8m2(v_stuv0123, 4, vl);
    __riscv_vse8_v_u8m2(dst_ptr, v_dst, vl);
    w -= vl;
    src_ptr += 4 * vl;
    src_ptr1 += 4 * vl;
    src_ptr2 += 4 * vl;
    src_ptr3 += 4 * vl;
    dst_ptr += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN34_RVV
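// Scales a row down by 3/4: from each group of four bytes, keeps bytes 0, 1
// and 3 (unfiltered).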
void ScaleRowDown34_RVV(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  do {
    size_t vl = __riscv_vsetvl_e8m2(w);
    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, src_ptr, vl);
    __riscv_vsseg3e8_v_u8m2(dst_ptr, v_s0, v_s1, v_s3, vl);
    w -= vl;
    src_ptr += 4 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN34_0_BOX_RVV
void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
    vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16;
    vuint8m2_t v_u0, v_u1, v_u2, v_u3;
    vuint16m4_t v_u1_u16;
    vuint8m2_t v_a0, v_a1, v_a2;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl);

    if (src_stride == 0) {
      v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
      v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
      v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_s2, 2, vl);
      v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_s3, 2, vl);
    } else {
      vuint8m2_t v_t0, v_t1, v_t2, v_t3;
      __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl);
      v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 0, vl);
      v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 0, vl);
      v_t2_u16 = __riscv_vwaddu_vx_u16m4(v_t2, 0, vl);
      v_t3_u16 = __riscv_vwaddu_vx_u16m4(v_t3, 0, vl);
      t += 4 * vl;
    }

    v_t0_u16 = __riscv_vwmaccu_vx_u16m4(v_t0_u16, 3, v_s0, vl);
    v_t1_u16 = __riscv_vwmaccu_vx_u16m4(v_t1_u16, 3, v_s1, vl);
    v_t2_u16 = __riscv_vwmaccu_vx_u16m4(v_t2_u16, 3, v_s2, vl);
    v_t3_u16 = __riscv_vwmaccu_vx_u16m4(v_t3_u16, 3, v_s3, vl);

    // Use round-to-nearest-up mode for vnclip & averaging add
    v_u0 = __riscv_vnclipu_wx_u8m2(v_t0_u16, 2, vl);
    v_u1 = __riscv_vnclipu_wx_u8m2(v_t1_u16, 2, vl);
    v_u2 = __riscv_vnclipu_wx_u8m2(v_t2_u16, 2, vl);
    v_u3 = __riscv_vnclipu_wx_u8m2(v_t3_u16, 2, vl);

    // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u1, 0, vl);
    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u0, vl);
    v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);

    // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
    v_a1 = __riscv_vaaddu_vv_u8m2(v_u1, v_u2, vl);

    // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_u2, 0, vl);
    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_u3, vl);
    v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);

    __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl);

    w -= vl;
    s += 4 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN34_1_BOX_RVV
void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  const uint8_t* s = src_ptr;
  const uint8_t* t = src_ptr + src_stride;
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m2_t v_s0, v_s1, v_s2, v_s3;
    vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3;
    vuint16m4_t v_u1_u16;
    vuint8m2_t v_a0, v_a1, v_a2;
    size_t vl = __riscv_vsetvl_e8m2(w);
    __riscv_vlseg4e8_v_u8m2(&v_s0, &v_s1, &v_s2, &v_s3, s, vl);

    // Use round-to-nearest-up mode for vnclip & averaging add
    if (src_stride == 0) {
      v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_s0, vl);
      v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_s1, vl);
      v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_s2, vl);
      v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_s3, vl);
    } else {
      vuint8m2_t v_t0, v_t1, v_t2, v_t3;
      __riscv_vlseg4e8_v_u8m2(&v_t0, &v_t1, &v_t2, &v_t3, t, vl);
      v_ave0 = __riscv_vaaddu_vv_u8m2(v_s0, v_t0, vl);
      v_ave1 = __riscv_vaaddu_vv_u8m2(v_s1, v_t1, vl);
      v_ave2 = __riscv_vaaddu_vv_u8m2(v_s2, v_t2, vl);
      v_ave3 = __riscv_vaaddu_vv_u8m2(v_s3, v_t3, vl);
      t += 4 * vl;
    }
    // a0 = (src[0] * 3 + s[1] * 1 + 2) >> 2
    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave1, 0, vl);
    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave0, vl);
    v_a0 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);

    // a1 = (src[1] * 1 + s[2] * 1 + 1) >> 1
    v_a1 = __riscv_vaaddu_vv_u8m2(v_ave1, v_ave2, vl);

    // a2 = (src[2] * 1 + s[3] * 3 + 2) >> 2
    v_u1_u16 = __riscv_vwaddu_vx_u16m4(v_ave2, 0, vl);
    v_u1_u16 = __riscv_vwmaccu_vx_u16m4(v_u1_u16, 3, v_ave3, vl);
    v_a2 = __riscv_vnclipu_wx_u8m2(v_u1_u16, 2, vl);

    __riscv_vsseg3e8_v_u8m2(dst_ptr, v_a0, v_a1, v_a2, vl);

    w -= vl;
    s += 4 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN38_RVV
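// Scales a row down by 3/8: from each group of eight bytes, keeps bytes 0, 3
// and 6 (unfiltered).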
void ScaleRowDown38_RVV(const uint8_t* src_ptr,
                        ptrdiff_t src_stride,
                        uint8_t* dst_ptr,
                        int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  (void)src_stride;
  assert(dst_width % 3 == 0);
  do {
    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
    size_t vl = __riscv_vsetvl_e8m1(w);
    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
                            &v_s7, src_ptr, vl);
    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_s0, v_s3, v_s6, vl);
    w -= vl;
    src_ptr += 8 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN38_2_BOX_RVV
void ScaleRowDown38_2_Box_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  const uint16_t coeff_a = (65536u / 6u);
  const uint16_t coeff_b = (65536u / 4u);
  assert((dst_width % 3 == 0) && (dst_width > 0));
  do {
    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
    vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
    vuint16m2_t v_e0, v_e1, v_e2, v_e;
    vuint16m2_t v_f0, v_f1, v_f2, v_f;
    vuint16m2_t v_g0, v_g1, v_g;
    vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
    size_t vl = __riscv_vsetvl_e8m1(w);
    // s: e00, e10, e20, f00, f10, f20, g00, g10
    // t: e01, e11, e21, f01, f11, f21, g01, g11
    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
                            &v_s7, src_ptr, vl);
    __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
                            &v_t7, src_ptr + src_stride, vl);
    // Calculate sum of [e00, e21] to v_e
    // Calculate sum of [f00, f21] to v_f
    // Calculate sum of [g00, g11] to v_g
    v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
    v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
    v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
    v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
    v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
    v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
    v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
    v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);

    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
    v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
    v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
    v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);

    // Average in 16-bit fixed-point
    v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
    v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
    v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);

    v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
    v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
    v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);

    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
    w -= vl;
    src_ptr += 8 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEROWDOWN38_3_BOX_RVV
void ScaleRowDown38_3_Box_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              int dst_width) {
  size_t w = (size_t)dst_width / 3u;
  const uint16_t coeff_a = (65536u / 9u);
  const uint16_t coeff_b = (65536u / 6u);
  assert((dst_width % 3 == 0) && (dst_width > 0));
  do {
    vuint8m1_t v_s0, v_s1, v_s2, v_s3, v_s4, v_s5, v_s6, v_s7;
    vuint8m1_t v_t0, v_t1, v_t2, v_t3, v_t4, v_t5, v_t6, v_t7;
    vuint8m1_t v_u0, v_u1, v_u2, v_u3, v_u4, v_u5, v_u6, v_u7;
    vuint16m2_t v_e0, v_e1, v_e2, v_e3, v_e4, v_e;
    vuint16m2_t v_f0, v_f1, v_f2, v_f3, v_f4, v_f;
    vuint16m2_t v_g0, v_g1, v_g2, v_g;
    vuint8m1_t v_dst_e, v_dst_f, v_dst_g;
    size_t vl = __riscv_vsetvl_e8m1(w);
    // s: e00, e10, e20, f00, f10, f20, g00, g10
    // t: e01, e11, e21, f01, f11, f21, g01, g11
    // u: e02, e12, e22, f02, f12, f22, g02, g12
    __riscv_vlseg8e8_v_u8m1(&v_s0, &v_s1, &v_s2, &v_s3, &v_s4, &v_s5, &v_s6,
                            &v_s7, src_ptr, vl);
    __riscv_vlseg8e8_v_u8m1(&v_t0, &v_t1, &v_t2, &v_t3, &v_t4, &v_t5, &v_t6,
                            &v_t7, src_ptr + src_stride, vl);
    __riscv_vlseg8e8_v_u8m1(&v_u0, &v_u1, &v_u2, &v_u3, &v_u4, &v_u5, &v_u6,
                            &v_u7, src_ptr + 2 * src_stride, vl);
    // Calculate sum of [e00, e22]
    v_e0 = __riscv_vwaddu_vv_u16m2(v_s0, v_t0, vl);
    v_e1 = __riscv_vwaddu_vv_u16m2(v_s1, v_t1, vl);
    v_e2 = __riscv_vwaddu_vv_u16m2(v_s2, v_t2, vl);
    v_e3 = __riscv_vwaddu_vv_u16m2(v_u0, v_u1, vl);
    v_e4 = __riscv_vwaddu_vx_u16m2(v_u2, 0, vl);

    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e1, vl);
    v_e2 = __riscv_vadd_vv_u16m2(v_e2, v_e3, vl);
    v_e0 = __riscv_vadd_vv_u16m2(v_e0, v_e4, vl);
    v_e = __riscv_vadd_vv_u16m2(v_e0, v_e2, vl);
    // Calculate sum of [f00, f22]
    v_f0 = __riscv_vwaddu_vv_u16m2(v_s3, v_t3, vl);
    v_f1 = __riscv_vwaddu_vv_u16m2(v_s4, v_t4, vl);
    v_f2 = __riscv_vwaddu_vv_u16m2(v_s5, v_t5, vl);
    v_f3 = __riscv_vwaddu_vv_u16m2(v_u3, v_u4, vl);
    v_f4 = __riscv_vwaddu_vx_u16m2(v_u5, 0, vl);

    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f1, vl);
    v_f2 = __riscv_vadd_vv_u16m2(v_f2, v_f3, vl);
    v_f0 = __riscv_vadd_vv_u16m2(v_f0, v_f4, vl);
    v_f = __riscv_vadd_vv_u16m2(v_f0, v_f2, vl);
    // Calculate sum of [g00, g12]
    v_g0 = __riscv_vwaddu_vv_u16m2(v_s6, v_t6, vl);
    v_g1 = __riscv_vwaddu_vv_u16m2(v_s7, v_t7, vl);
    v_g2 = __riscv_vwaddu_vv_u16m2(v_u6, v_u7, vl);

    v_g = __riscv_vadd_vv_u16m2(v_g0, v_g1, vl);
    v_g = __riscv_vadd_vv_u16m2(v_g, v_g2, vl);

    // Average in 16-bit fixed-point
    v_e = __riscv_vmulhu_vx_u16m2(v_e, coeff_a, vl);
    v_f = __riscv_vmulhu_vx_u16m2(v_f, coeff_a, vl);
    v_g = __riscv_vmulhu_vx_u16m2(v_g, coeff_b, vl);

    v_dst_e = __riscv_vnsrl_wx_u8m1(v_e, 0, vl);
    v_dst_f = __riscv_vnsrl_wx_u8m1(v_f, 0, vl);
    v_dst_g = __riscv_vnsrl_wx_u8m1(v_g, 0, vl);
    __riscv_vsseg3e8_v_u8m1(dst_ptr, v_dst_e, v_dst_f, v_dst_g, vl);
    w -= vl;
    src_ptr += 8 * vl;
    dst_ptr += 3 * vl;
  } while (w > 0);
}
#endif

// The ScaleRowUp2_(Bi)linear_RVV functions are equivalent to other platforms'
// ScaleRowUp2_(Bi)linear_Any_XXX. These functions process the entire row;
// other platforms only implement the non-edge part of the image and handle
// the edges with scalar code.

#ifdef HAS_SCALEROWUP2_LINEAR_RVV
void ScaleRowUp2_Linear_RVV(const uint8_t* src_ptr,
                            uint8_t* dst_ptr,
                            int dst_width) {
  size_t work_width = (size_t)dst_width - 1u;
  size_t src_width = work_width >> 1u;
  const uint8_t* work_src_ptr = src_ptr;
  uint8_t* work_dst_ptr = dst_ptr + 1;
  size_t vl = __riscv_vsetvlmax_e8m4();
  vuint8m4_t v_3 = __riscv_vmv_v_x_u8m4(3, vl);
  dst_ptr[0] = src_ptr[0];
  while (src_width > 0) {
    vuint8m4_t v_src0, v_src1, v_dst_odd, v_dst_even;
    vuint16m8_t v_src0_u16, v_src1_u16;
    size_t vl = __riscv_vsetvl_e8m4(src_width);
    v_src0 = __riscv_vle8_v_u8m4(work_src_ptr, vl);
    v_src1 = __riscv_vle8_v_u8m4(work_src_ptr + 1, vl);

    v_src0_u16 = __riscv_vwaddu_vx_u16m8(v_src0, 2, vl);
    v_src1_u16 = __riscv_vwaddu_vx_u16m8(v_src1, 2, vl);
    v_src0_u16 = __riscv_vwmaccu_vv_u16m8(v_src0_u16, v_3, v_src1, vl);
    v_src1_u16 = __riscv_vwmaccu_vv_u16m8(v_src1_u16, v_3, v_src0, vl);

    v_dst_odd = __riscv_vnsrl_wx_u8m4(v_src0_u16, 2, vl);
    v_dst_even = __riscv_vnsrl_wx_u8m4(v_src1_u16, 2, vl);

    __riscv_vsseg2e8_v_u8m4(work_dst_ptr, v_dst_even, v_dst_odd, vl);

    src_width -= vl;
    work_src_ptr += vl;
    work_dst_ptr += 2 * vl;
  }
  dst_ptr[dst_width - 1] = src_ptr[(dst_width - 1) / 2];
}
#endif

#ifdef HAS_SCALEROWUP2_BILINEAR_RVV
void ScaleRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
                              ptrdiff_t src_stride,
                              uint8_t* dst_ptr,
                              ptrdiff_t dst_stride,
                              int dst_width) {
  size_t work_width = ((size_t)dst_width - 1u) & ~1u;
  size_t src_width = work_width >> 1u;
  const uint8_t* work_s = src_ptr;
  const uint8_t* work_t = src_ptr + src_stride;
  const uint8_t* s = work_s;
  const uint8_t* t = work_t;
  uint8_t* d = dst_ptr;
  uint8_t* e = dst_ptr + dst_stride;
  uint8_t* work_d = d + 1;
  uint8_t* work_e = e + 1;
  size_t vl = __riscv_vsetvlmax_e16m4();
  vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl);
  vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl);
  d[0] = (3 * s[0] + t[0] + 2) >> 2;
  e[0] = (s[0] + 3 * t[0] + 2) >> 2;
  while (src_width > 0) {
    vuint8m2_t v_s0, v_s1, v_t0, v_t1;
    vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16;
    vuint16m4_t v_t0_u16_, v_t1_u16_;
    vuint8m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd;
    size_t vl = __riscv_vsetvl_e8m2(src_width);
    v_s0 = __riscv_vle8_v_u8m2(work_s, vl);
    v_s1 = __riscv_vle8_v_u8m2(work_s + 1, vl);

    v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
    v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
    v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl);
    v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl);

    v_t0 = __riscv_vle8_v_u8m2(work_t, vl);
    v_t1 = __riscv_vle8_v_u8m2(work_t + 1, vl);

    v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl);
    v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl);
    v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl);
    v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl);

    v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl);
    v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl);

    v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl);
    v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl);
    v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl);
    v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl);

    v_dst0_odd = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl);
    v_dst0_even = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl);
    v_dst1_odd = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl);
    v_dst1_even = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl);

    __riscv_vsseg2e8_v_u8m2(work_d, v_dst0_even, v_dst0_odd, vl);
    __riscv_vsseg2e8_v_u8m2(work_e, v_dst1_even, v_dst1_odd, vl);

    src_width -= vl;
    work_s += vl;
    work_t += vl;
    work_d += 2 * vl;
    work_e += 2 * vl;
  }
  d[dst_width - 1] =
      (3 * s[(dst_width - 1) / 2] + t[(dst_width - 1) / 2] + 2) >> 2;
  e[dst_width - 1] =
      (s[(dst_width - 1) / 2] + 3 * t[(dst_width - 1) / 2] + 2) >> 2;
}
#endif

#ifdef HAS_SCALEUVROWDOWN2_RVV
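// Halves a UV row by keeping the second UV pixel of each horizontal pair:
// each 32-bit element holds two interleaved UV samples, and the narrowing
// shift by 16 extracts the upper one.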
void ScaleUVRowDown2_RVV(const uint8_t* src_uv,
                         ptrdiff_t src_stride,
                         uint8_t* dst_uv,
                         int dst_width) {
  size_t w = (size_t)dst_width;
  const uint32_t* src = (const uint32_t*)src_uv;
  uint16_t* dst = (uint16_t*)dst_uv;
  (void)src_stride;
  do {
    size_t vl = __riscv_vsetvl_e32m8(w);
    vuint32m8_t v_data = __riscv_vle32_v_u32m8(src, vl);
    vuint16m4_t v_u1v1 = __riscv_vnsrl_wx_u16m4(v_data, 16, vl);
    __riscv_vse16_v_u16m4(dst, v_u1v1, vl);
    w -= vl;
    src += vl;
    dst += vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEUVROWDOWN2LINEAR_RVV
void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv,
                               ptrdiff_t src_stride,
                               uint8_t* dst_uv,
                               int dst_width) {
  size_t w = (size_t)dst_width;
  const uint16_t* src = (const uint16_t*)src_uv;
  (void)src_stride;
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m4_t v_u0v0, v_u1v1, v_avg;
    vuint16m4_t v_u0v0_16, v_u1v1_16;
    size_t vl = __riscv_vsetvl_e16m4(w);
    __riscv_vlseg2e16_v_u16m4(&v_u0v0_16, &v_u1v1_16, src, vl);
    v_u0v0 = __riscv_vreinterpret_v_u16m4_u8m4(v_u0v0_16);
    v_u1v1 = __riscv_vreinterpret_v_u16m4_u8m4(v_u1v1_16);
    // Use round-to-nearest-up mode for averaging add
    v_avg = __riscv_vaaddu_vv_u8m4(v_u0v0, v_u1v1, vl * 2);
    __riscv_vse8_v_u8m4(dst_uv, v_avg, vl * 2);
    w -= vl;
    src += vl * 2;
    dst_uv += vl * 2;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEUVROWDOWN2BOX_RVV
void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv,
                            ptrdiff_t src_stride,
                            uint8_t* dst_uv,
                            int dst_width) {
  const uint8_t* src_uv_row1 = src_uv + src_stride;
  size_t w = (size_t)dst_width;
  // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
  // register) is set to round-to-nearest-up mode(0).
  asm volatile("csrwi vxrm, 0");
  do {
    vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0;
    vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1;
    vuint16m4_t v_u0u1_row0, v_u0u1_row1, v_v0v1_row0, v_v0v1_row1;
    vuint16m4_t v_sum0, v_sum1;
    vuint8m2_t v_dst_u, v_dst_v;
    size_t vl = __riscv_vsetvl_e8m2(w);

    __riscv_vlseg4e8_v_u8m2(&v_u0_row0, &v_v0_row0, &v_u1_row0, &v_v1_row0,
                            src_uv, vl);
    __riscv_vlseg4e8_v_u8m2(&v_u0_row1, &v_v0_row1, &v_u1_row1, &v_v1_row1,
                            src_uv_row1, vl);

    v_u0u1_row0 = __riscv_vwaddu_vv_u16m4(v_u0_row0, v_u1_row0, vl);
    v_u0u1_row1 = __riscv_vwaddu_vv_u16m4(v_u0_row1, v_u1_row1, vl);
    v_v0v1_row0 = __riscv_vwaddu_vv_u16m4(v_v0_row0, v_v1_row0, vl);
    v_v0v1_row1 = __riscv_vwaddu_vv_u16m4(v_v0_row1, v_v1_row1, vl);

    v_sum0 = __riscv_vadd_vv_u16m4(v_u0u1_row0, v_u0u1_row1, vl);
    v_sum1 = __riscv_vadd_vv_u16m4(v_v0v1_row0, v_v0v1_row1, vl);
    // Use round-to-nearest-up mode for vnclip
    v_dst_u = __riscv_vnclipu_wx_u8m2(v_sum0, 2, vl);
    v_dst_v = __riscv_vnclipu_wx_u8m2(v_sum1, 2, vl);

    __riscv_vsseg2e8_v_u8m2(dst_uv, v_dst_u, v_dst_v, vl);

    dst_uv += 2 * vl;
    src_uv += 4 * vl;
    w -= vl;
    src_uv_row1 += 4 * vl;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEUVROWDOWN4_RVV
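// Quarters a UV row by keeping the first UV pixel of each group of four:
// each 64-bit element holds four interleaved UV samples, and two narrowing
// conversions keep only the lowest 16 bits of each group.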
void ScaleUVRowDown4_RVV(const uint8_t* src_uv,
                         ptrdiff_t src_stride,
                         int src_stepx,
                         uint8_t* dst_uv,
                         int dst_width) {
  // Overflow will never happen here, since sizeof(size_t) / sizeof(int) = 2
  // and dst_width = src_width / 4, where src_width is also an int.
  size_t w = (size_t)dst_width * 8;
  (void)src_stride;
  (void)src_stepx;
  do {
    size_t vl = __riscv_vsetvl_e8m8(w);
    vuint8m8_t v_row = __riscv_vle8_v_u8m8(src_uv, vl);
    vuint64m8_t v_row_64 = __riscv_vreinterpret_v_u8m8_u64m8(v_row);
    // Narrowing without clipping
    vuint32m4_t v_tmp = __riscv_vncvt_x_x_w_u32m4(v_row_64, vl / 8);
    vuint16m2_t v_dst_16 = __riscv_vncvt_x_x_w_u16m2(v_tmp, vl / 8);
    vuint8m2_t v_dst = __riscv_vreinterpret_v_u16m2_u8m2(v_dst_16);
    __riscv_vse8_v_u8m2(dst_uv, v_dst, vl / 4);
    w -= vl;
    src_uv += vl;
    dst_uv += vl / 4;
  } while (w > 0);
}
#endif

#ifdef HAS_SCALEUVROWDOWNEVEN_RVV
void ScaleUVRowDownEven_RVV(const uint8_t* src_uv,
                            ptrdiff_t src_stride,
                            int src_stepx,
                            uint8_t* dst_uv,
                            int dst_width) {
  size_t w = (size_t)dst_width;
  const ptrdiff_t stride_byte = (ptrdiff_t)src_stepx * 2;
  const uint16_t* src = (const uint16_t*)(src_uv);
  uint16_t* dst = (uint16_t*)(dst_uv);
  (void)src_stride;
  do {
    size_t vl = __riscv_vsetvl_e16m8(w);
    vuint16m8_t v_row = __riscv_vlse16_v_u16m8(src, stride_byte, vl);
    __riscv_vse16_v_u16m8(dst, v_row, vl);
    w -= vl;
    src += vl * src_stepx;
    dst += vl;
  } while (w > 0);
}
#endif

// The ScaleUVRowUp2_(Bi)linear_RVV functions are equivalent to other
// platforms' ScaleUVRowUp2_(Bi)linear_Any_XXX. These functions process the
// entire row; other platforms only implement the non-edge part of the image
// and handle the edges with scalar code.

#ifdef HAS_SCALEUVROWUP2_LINEAR_RVV
void ScaleUVRowUp2_Linear_RVV(const uint8_t* src_ptr,
                              uint8_t* dst_ptr,
                              int dst_width) {
  size_t work_width = ((size_t)dst_width - 1u) & ~1u;
  uint16_t* work_dst_ptr = (uint16_t*)dst_ptr + 1;
  const uint8_t* work_src_ptr = src_ptr;
  size_t vl = __riscv_vsetvlmax_e8m4();
  vuint8m4_t v_3_u8 = __riscv_vmv_v_x_u8m4(3, vl);
  dst_ptr[0] = src_ptr[0];
  dst_ptr[1] = src_ptr[1];
  while (work_width > 0) {
    vuint8m4_t v_uv0, v_uv1, v_dst_odd_u8, v_dst_even_u8;
    vuint16m4_t v_dst_odd, v_dst_even;
    vuint16m8_t v_uv0_u16, v_uv1_u16;
    size_t vl = __riscv_vsetvl_e8m4(work_width);
    v_uv0 = __riscv_vle8_v_u8m4(work_src_ptr, vl);
    v_uv1 = __riscv_vle8_v_u8m4(work_src_ptr + 2, vl);

    v_uv0_u16 = __riscv_vwaddu_vx_u16m8(v_uv0, 2, vl);
    v_uv1_u16 = __riscv_vwaddu_vx_u16m8(v_uv1, 2, vl);

    v_uv0_u16 = __riscv_vwmaccu_vv_u16m8(v_uv0_u16, v_3_u8, v_uv1, vl);
    v_uv1_u16 = __riscv_vwmaccu_vv_u16m8(v_uv1_u16, v_3_u8, v_uv0, vl);

    v_dst_odd_u8 = __riscv_vnsrl_wx_u8m4(v_uv0_u16, 2, vl);
    v_dst_even_u8 = __riscv_vnsrl_wx_u8m4(v_uv1_u16, 2, vl);

    v_dst_even = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_even_u8);
    v_dst_odd = __riscv_vreinterpret_v_u8m4_u16m4(v_dst_odd_u8);

    __riscv_vsseg2e16_v_u16m4(work_dst_ptr, v_dst_even, v_dst_odd, vl / 2);

    work_width -= vl;
    work_src_ptr += vl;
    work_dst_ptr += vl;
  }
  dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2];
  dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1];
}
#endif

#ifdef HAS_SCALEUVROWUP2_BILINEAR_RVV
void ScaleUVRowUp2_Bilinear_RVV(const uint8_t* src_ptr,
                                ptrdiff_t src_stride,
                                uint8_t* dst_ptr,
                                ptrdiff_t dst_stride,
                                int dst_width) {
  size_t work_width = ((size_t)dst_width - 1u) & ~1u;
  const uint8_t* work_s = src_ptr;
  const uint8_t* work_t = src_ptr + src_stride;
  const uint8_t* s = work_s;
  const uint8_t* t = work_t;
  uint8_t* d = dst_ptr;
  uint8_t* e = dst_ptr + dst_stride;
  uint16_t* work_d = (uint16_t*)d + 1;
  uint16_t* work_e = (uint16_t*)e + 1;
  size_t vl = __riscv_vsetvlmax_e16m4();
  vuint16m4_t v_3_u16 = __riscv_vmv_v_x_u16m4(3, vl);
  vuint8m2_t v_3_u8 = __riscv_vmv_v_x_u8m2(3, vl);
  d[0] = (3 * s[0] + t[0] + 2) >> 2;
  e[0] = (s[0] + 3 * t[0] + 2) >> 2;
  d[1] = (3 * s[1] + t[1] + 2) >> 2;
  e[1] = (s[1] + 3 * t[1] + 2) >> 2;
  while (work_width > 0) {
    vuint8m2_t v_s0, v_s1, v_t0, v_t1;
    vuint16m4_t v_s0_u16, v_s1_u16, v_t0_u16, v_t1_u16;
    vuint16m4_t v_t0_u16_, v_t1_u16_;
    vuint8m2_t v_dst0_odd_u8, v_dst0_even_u8, v_dst1_odd_u8, v_dst1_even_u8;
    vuint16m2_t v_dst0_even, v_dst0_odd, v_dst1_even, v_dst1_odd;
    size_t vl = __riscv_vsetvl_e8m2(work_width);
    v_s0 = __riscv_vle8_v_u8m2(work_s, vl);
    v_s1 = __riscv_vle8_v_u8m2(work_s + 2, vl);

    v_s0_u16 = __riscv_vwaddu_vx_u16m4(v_s0, 2, vl);
    v_s1_u16 = __riscv_vwaddu_vx_u16m4(v_s1, 2, vl);
    v_s0_u16 = __riscv_vwmaccu_vv_u16m4(v_s0_u16, v_3_u8, v_s1, vl);
    v_s1_u16 = __riscv_vwmaccu_vv_u16m4(v_s1_u16, v_3_u8, v_s0, vl);

    v_t0 = __riscv_vle8_v_u8m2(work_t, vl);
    v_t1 = __riscv_vle8_v_u8m2(work_t + 2, vl);

    v_t0_u16 = __riscv_vwaddu_vx_u16m4(v_t0, 2, vl);
    v_t1_u16 = __riscv_vwaddu_vx_u16m4(v_t1, 2, vl);
    v_t0_u16 = __riscv_vwmaccu_vv_u16m4(v_t0_u16, v_3_u8, v_t1, vl);
    v_t1_u16 = __riscv_vwmaccu_vv_u16m4(v_t1_u16, v_3_u8, v_t0, vl);

    v_t0_u16_ = __riscv_vmv_v_v_u16m4(v_t0_u16, vl);
    v_t1_u16_ = __riscv_vmv_v_v_u16m4(v_t1_u16, vl);

    v_t0_u16 = __riscv_vmacc_vv_u16m4(v_t0_u16, v_3_u16, v_s0_u16, vl);
    v_t1_u16 = __riscv_vmacc_vv_u16m4(v_t1_u16, v_3_u16, v_s1_u16, vl);
    v_s0_u16 = __riscv_vmacc_vv_u16m4(v_s0_u16, v_3_u16, v_t0_u16_, vl);
    v_s1_u16 = __riscv_vmacc_vv_u16m4(v_s1_u16, v_3_u16, v_t1_u16_, vl);

    v_dst0_odd_u8 = __riscv_vnsrl_wx_u8m2(v_t0_u16, 4, vl);
    v_dst0_even_u8 = __riscv_vnsrl_wx_u8m2(v_t1_u16, 4, vl);
    v_dst1_odd_u8 = __riscv_vnsrl_wx_u8m2(v_s0_u16, 4, vl);
    v_dst1_even_u8 = __riscv_vnsrl_wx_u8m2(v_s1_u16, 4, vl);

    v_dst0_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_even_u8);
    v_dst0_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst0_odd_u8);
    v_dst1_even = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_even_u8);
    v_dst1_odd = __riscv_vreinterpret_v_u8m2_u16m2(v_dst1_odd_u8);

    __riscv_vsseg2e16_v_u16m2(work_d, v_dst0_even, v_dst0_odd, vl / 2);
    __riscv_vsseg2e16_v_u16m2(work_e, v_dst1_even, v_dst1_odd, vl / 2);

    work_width -= vl;
    work_s += vl;
    work_t += vl;
    work_d += vl;
    work_e += vl;
  }
  d[2 * dst_width - 2] =
      (3 * s[((dst_width + 1) & ~1) - 2] + t[((dst_width + 1) & ~1) - 2] + 2) >>
      2;
  e[2 * dst_width - 2] =
      (s[((dst_width + 1) & ~1) - 2] + 3 * t[((dst_width + 1) & ~1) - 2] + 2) >>
      2;
  d[2 * dst_width - 1] =
      (3 * s[((dst_width + 1) & ~1) - 1] + t[((dst_width + 1) & ~1) - 1] + 2) >>
      2;
  e[2 * dst_width - 1] =
      (s[((dst_width + 1) & ~1) - 1] + 3 * t[((dst_width + 1) & ~1) - 1] + 2) >>
      2;
}
#endif

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif

#endif  // !defined(LIBYUV_DISABLE_RVV) && defined(__riscv_vector) &&
        // defined(__clang__)