xref: /aosp_15_r20/external/skia/src/core/SkBlitRow_D32.cpp (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
1 /*
2  * Copyright 2011 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkColor.h"
9 #include "include/core/SkColorPriv.h"
10 #include "include/core/SkTypes.h"
11 #include "include/private/SkColorData.h"
12 #include "include/private/base/SkCPUTypes.h"
13 #include "src/core/SkBlitRow.h"
14 #include "src/core/SkMemset.h"
15 
16 #include <cstring>
17 #include <iterator>
18 
19 // Everyone agrees memcpy() is the best way to do this.
blit_row_s32_opaque(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)20 static void blit_row_s32_opaque(SkPMColor* dst,
21                                 const SkPMColor* src,
22                                 int count,
23                                 U8CPU alpha) {
24     SkASSERT(255 == alpha);
25     memcpy(dst, src, count * sizeof(SkPMColor));
26 }
27 
28 // We have SSE2, NEON, and portable implementations of
29 // blit_row_s32_blend() and blit_row_s32a_blend().
30 
31 // TODO(mtklein): can we do better in NEON than 2 pixels at a time?
32 
33 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
34     #include <emmintrin.h>
35     #include <xmmintrin.h>
36 
    // Lerps four packed 32-bit pixels toward src, per 8-bit channel:
    //   result = dst + (((src - dst) * src_scale) >> 8)
    // Callers pass src_scale = SkAlpha255To256(alpha), so it is in [1, 256].
    // NOTE(review): when src < dst the subtract wraps in its 16-bit lane; the
    // final per-byte (mod-256) add appears to cancel that wraparound -- confirm
    // against the scalar SkPMLerp if this is ever modified.
    static inline __m128i SkPMLerp_SSE2(const __m128i& src,
                                        const __m128i& dst,
                                        const unsigned src_scale) {
        // Computes dst + (((src - dst)*src_scale)>>8)
        const __m128i mask = _mm_set1_epi32(0x00FF00FF);

        // Unpack the 16x8-bit source into 2 8x16-bit splayed halves:
        // _rb keeps the even bytes (red/blue), _ag the odd bytes (alpha/green).
        __m128i src_rb = _mm_and_si128(mask, src);
        __m128i src_ag = _mm_srli_epi16(src, 8);
        __m128i dst_rb = _mm_and_si128(mask, dst);
        __m128i dst_ag = _mm_srli_epi16(dst, 8);

        // Compute scaled differences (16-bit lanes; may wrap, see note above).
        __m128i diff_rb = _mm_sub_epi16(src_rb, dst_rb);
        __m128i diff_ag = _mm_sub_epi16(src_ag, dst_ag);
        __m128i s = _mm_set1_epi16(src_scale);
        diff_rb = _mm_mullo_epi16(diff_rb, s);
        diff_ag = _mm_mullo_epi16(diff_ag, s);

        // Pack the differences back together: keep the high byte of each
        // 16-bit lane (the >>8) and merge into packed-pixel layout.
        diff_rb = _mm_srli_epi16(diff_rb, 8);
        diff_ag = _mm_andnot_si128(mask, diff_ag);
        __m128i diff = _mm_or_si128(diff_rb, diff_ag);

        // Add difference to destination (per byte, mod 256).
        return _mm_add_epi8(dst, diff);
    }
64 
65 
blit_row_s32_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)66     static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
67         SkASSERT(alpha <= 255);
68 
69         auto src4 = (const __m128i*)src;
70         auto dst4 = (      __m128i*)dst;
71 
72         while (count >= 4) {
73             _mm_storeu_si128(dst4, SkPMLerp_SSE2(_mm_loadu_si128(src4),
74                                                  _mm_loadu_si128(dst4),
75                                                  SkAlpha255To256(alpha)));
76             src4++;
77             dst4++;
78             count -= 4;
79         }
80 
81         src = (const SkPMColor*)src4;
82         dst = (      SkPMColor*)dst4;
83 
84         while (count --> 0) {
85             *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
86             src++;
87             dst++;
88         }
89     }
90 
    // SIMD analogue of SkBlendARGB32(): blends four premultiplied src pixels
    // over four dst pixels, with src additionally scaled by the global alpha
    // 'aa' in [0, 255]:
    //   result = (src*src_scale + dst*SkAlphaMulInv256(src.a, src_scale)) >> 8
    static inline __m128i SkBlendARGB32_SSE2(const __m128i& src,
                                             const __m128i& dst,
                                             const unsigned aa) {
        unsigned alpha = SkAlpha255To256(aa);
        __m128i src_scale = _mm_set1_epi16(alpha);
        // SkAlphaMulInv256(SkGetPackedA32(src), src_scale)
        __m128i dst_scale = _mm_srli_epi32(src, 24);
        // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.
        dst_scale = _mm_mullo_epi16(dst_scale, src_scale);
        dst_scale = _mm_sub_epi32(_mm_set1_epi32(0xFFFF), dst_scale);
        dst_scale = _mm_add_epi32(dst_scale, _mm_srli_epi32(dst_scale, 8));
        dst_scale = _mm_srli_epi32(dst_scale, 8);
        // Duplicate scales into 2x16-bit pattern per pixel.
        dst_scale = _mm_shufflelo_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));
        dst_scale = _mm_shufflehi_epi16(dst_scale, _MM_SHUFFLE(2, 2, 0, 0));

        const __m128i mask = _mm_set1_epi32(0x00FF00FF);

        // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed
        // halves: _rb keeps the even (red/blue) bytes, _ag the odd
        // (alpha/green) bytes.
        __m128i src_rb = _mm_and_si128(mask, src);
        __m128i src_ag = _mm_srli_epi16(src, 8);
        __m128i dst_rb = _mm_and_si128(mask, dst);
        __m128i dst_ag = _mm_srli_epi16(dst, 8);

        // Scale them.
        src_rb = _mm_mullo_epi16(src_rb, src_scale);
        src_ag = _mm_mullo_epi16(src_ag, src_scale);
        dst_rb = _mm_mullo_epi16(dst_rb, dst_scale);
        dst_ag = _mm_mullo_epi16(dst_ag, dst_scale);

        // Add the scaled source and destination.
        dst_rb = _mm_add_epi16(src_rb, dst_rb);
        dst_ag = _mm_add_epi16(src_ag, dst_ag);

        // Unsplay the halves back together, keeping the high byte (the >>8)
        // of each 16-bit lane.
        dst_rb = _mm_srli_epi16(dst_rb, 8);
        dst_ag = _mm_andnot_si128(mask, dst_ag);
        return _mm_or_si128(dst_rb, dst_ag);
    }
130 
blit_row_s32a_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)131     static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
132         SkASSERT(alpha <= 255);
133 
134         auto src4 = (const __m128i*)src;
135         auto dst4 = (      __m128i*)dst;
136 
137         while (count >= 4) {
138             _mm_storeu_si128(dst4, SkBlendARGB32_SSE2(_mm_loadu_si128(src4),
139                                                       _mm_loadu_si128(dst4),
140                                                       alpha));
141             src4++;
142             dst4++;
143             count -= 4;
144         }
145 
146         src = (const SkPMColor*)src4;
147         dst = (      SkPMColor*)dst4;
148 
149         while (count --> 0) {
150             *dst = SkBlendARGB32(*src, *dst, alpha);
151             src++;
152             dst++;
153         }
154     }
155 
156 #elif defined(SK_ARM_HAS_NEON)
157     #include <arm_neon.h>
158 
    // Lerps src over dst with a constant global alpha:
    //   dst = (src*src_scale + dst*(256 - src_scale)) >> 8
    // Two pixels per NEON (64-bit register) iteration, plus a one-pixel tail.
    static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
        SkASSERT(alpha <= 255);

        // src_scale is in [1, 256]; dst_scale complements it so the two
        // weights always sum to 256.
        uint16_t src_scale = SkAlpha255To256(alpha);
        uint16_t dst_scale = 256 - src_scale;

        while (count >= 2) {
            uint8x8_t vsrc, vdst, vres;
            uint16x8_t vsrc_wide, vdst_wide;

            // Load two 32-bit pixels into each 64-bit register.
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Widen src to 16-bit lanes, then apply its weight.
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

            // vmull_u8 widens and multiplies the dst side in one step.
            vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

            // Sum the weighted pixels and narrow back to 8 bits (the >>8).
            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        }

        if (count == 1) {
            // Same math for a final odd pixel, loaded/stored lane-wise so we
            // never touch memory past the end of the row.
            uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
            uint16x8_t vsrc_wide, vdst_wide;

            vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
            vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
            vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);

            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        }
    }
203 
    // Blends premultiplied src over dst with an extra global alpha:
    //   dst = (src*alpha256 + dst*SkAlphaMulInv256(src.a, alpha256)) >> 8
    // Requires alpha < 255: the 0xFF00 approximation in the main loop is only
    // valid for alpha256 <= 255, i.e. alpha <= 254 (see the comment below).
    static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
        SkASSERT(alpha < 255);

        unsigned alpha256 = SkAlpha255To256(alpha);

        // Peel one pixel if the count is odd so the main loop always handles
        // exactly two pixels per iteration.
        if (count & 1) {
            uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
            uint16x8_t vdst_wide, vsrc_wide;
            unsigned dst_scale;

            // Lane-wise load of a single 32-bit pixel into lane 0.
            vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
            vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

            // Byte 3 of the loaded pixel is its alpha (the top byte of the
            // 32-bit value); compute dst's weight from it exactly.
            dst_scale = vget_lane_u8(vsrc, 3);
            dst_scale = SkAlphaMulInv256(dst_scale, alpha256);

            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

            vdst_wide = vmovl_u8(vdst);
            vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

            // Sum the weighted pixels and narrow back to 8 bits (the >>8).
            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);

            vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
            dst++;
            src++;
            count--;
        }

        // vtbl1 with this table replicates byte 3 (pixel 0's alpha) across
        // lanes 0-3 and byte 7 (pixel 1's alpha) across lanes 4-7.
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        while (count) {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            // Prefetch 32 pixels (128 bytes) ahead of the current position.
            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            vsrc_scale = vdupq_n_u16(alpha256);

            // Broadcast each pixel's alpha to all four of its 16-bit lanes.
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            // Calculate SkAlphaMulInv256(vdst_scale, vsrc_scale).
            // A 16-bit lane would overflow if we used 0xFFFF here,
            // so use an approximation with 0xFF00 that is off by 1,
            // and add back 1 after to get the correct value.
            // This is valid if alpha256 <= 255.
            vdst_scale = vmlsq_u16(vdupq_n_u16(0xFF00), vdst_scale, vsrc_scale);
            vdst_scale = vsraq_n_u16(vdst_scale, vdst_scale, 8);
            vdst_scale = vsraq_n_u16(vdupq_n_u16(1), vdst_scale, 8);

            // Widen both pixels, apply the weights, sum, and narrow back to
            // 8 bits (the >>8).
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            vdst_wide += vsrc_wide;
            vres = vshrn_n_u16(vdst_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        }
    }
279 
280 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
281     #include <lasxintrin.h>
282 
    // 256-bit LASX version of SkPMLerp: per 8-bit channel of eight packed
    // 32-bit pixels, computes dst + (((src - dst) * src_scale) >> 8).
    // Callers pass src_scale = SkAlpha255To256(alpha), so it is in [1, 256].
    static inline __m256i SkPMLerp_LASX(const __m256i& src,
                                        const __m256i& dst,
                                        const unsigned src_scale) {
        // Computes dst + (((src - dst)*src_scale)>>8)
        const __m256i mask = __lasx_xvreplgr2vr_w(0x00FF00FF);

        // Unpack the 16x16-bit source into 4 8x16-bit splayed halves:
        // _rb keeps the even (red/blue) bytes, _ag the odd (alpha/green) bytes.
        __m256i src_rb = __lasx_xvand_v(mask, src);
        __m256i src_ag = __lasx_xvsrli_h(src, 8);
        __m256i dst_rb = __lasx_xvand_v(mask, dst);
        __m256i dst_ag = __lasx_xvsrli_h(dst, 8);

        // Compute scaled differences (16-bit lane arithmetic).
        __m256i diff_rb = __lasx_xvsub_h(src_rb, dst_rb);
        __m256i diff_ag = __lasx_xvsub_h(src_ag, dst_ag);
        __m256i s = __lasx_xvreplgr2vr_h(src_scale);
        diff_rb = __lasx_xvmul_h(diff_rb, s);
        diff_ag = __lasx_xvmul_h(diff_ag, s);

        // Pack the differences back together, keeping the high byte (the >>8)
        // of each 16-bit lane.
        diff_rb = __lasx_xvsrli_h(diff_rb, 8);
        diff_ag = __lasx_xvandn_v(mask, diff_ag);
        __m256i diff = __lasx_xvor_v(diff_rb, diff_ag);

        // Add difference to destination (per byte, mod 256).
        return __lasx_xvadd_b(dst, diff);
    }
310 
311 
blit_row_s32_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)312     static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
313         SkASSERT(alpha <= 255);
314 
315         auto src8 = (const __m256i*)src;
316         auto dst8 = (      __m256i*)dst;
317 
318         while (count >= 8) {
319             __lasx_xvst(SkPMLerp_LASX(__lasx_xvld(src8, 0),
320                                       __lasx_xvld(dst8, 0),
321                                       SkAlpha255To256(alpha)), dst8, 0);
322             src8++;
323             dst8++;
324             count -= 8;
325         }
326 
327         src = (const SkPMColor*)src8;
328         dst = (      SkPMColor*)dst8;
329 
330         while (count --> 0) {
331             *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
332             src++;
333             dst++;
334         }
335     }
336 
    // LASX analogue of SkBlendARGB32(): blends eight premultiplied src pixels
    // over eight dst pixels, with src additionally scaled by the global alpha
    // 'aa' in [0, 255]:
    //   result = (src*src_scale + dst*SkAlphaMulInv256(src.a, src_scale)) >> 8
    static inline __m256i SkBlendARGB32_LASX(const __m256i& src,
                                             const __m256i& dst,
                                             const unsigned aa) {
        unsigned alpha = SkAlpha255To256(aa);
        __m256i src_scale = __lasx_xvreplgr2vr_h(alpha);
        // SkAlphaMulInv256(SkGetPackedA32(src), src_scale), vectorized: the
        // per-pixel alpha lives in the top byte of each 32-bit lane.
        __m256i dst_scale = __lasx_xvsrli_w(src, 24);
        // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.
        dst_scale = __lasx_xvmul_h(dst_scale, src_scale);
        dst_scale = __lasx_xvsub_w(__lasx_xvreplgr2vr_w(0xFFFF), dst_scale);
        dst_scale = __lasx_xvadd_w(dst_scale, __lasx_xvsrli_w(dst_scale, 8));
        dst_scale = __lasx_xvsrli_w(dst_scale, 8);
        // Duplicate scales into 2x16-bit pattern per pixel
        // (0xA0 selects 16-bit lanes 0,0,2,2 within each group of four).
        dst_scale = __lasx_xvshuf4i_h(dst_scale, 0xA0);

        const __m256i mask = __lasx_xvreplgr2vr_w(0x00FF00FF);

        // Unpack the 16x16-bit source/destination into 4 8x16-bit splayed
        // halves: _rb keeps the even (red/blue) bytes, _ag the odd
        // (alpha/green) bytes.
        __m256i src_rb = __lasx_xvand_v(mask, src);
        __m256i src_ag = __lasx_xvsrli_h(src, 8);
        __m256i dst_rb = __lasx_xvand_v(mask, dst);
        __m256i dst_ag = __lasx_xvsrli_h(dst, 8);

        // Scale them.
        src_rb = __lasx_xvmul_h(src_rb, src_scale);
        src_ag = __lasx_xvmul_h(src_ag, src_scale);
        dst_rb = __lasx_xvmul_h(dst_rb, dst_scale);
        dst_ag = __lasx_xvmul_h(dst_ag, dst_scale);

        // Add the scaled source and destination.
        dst_rb = __lasx_xvadd_h(src_rb, dst_rb);
        dst_ag = __lasx_xvadd_h(src_ag, dst_ag);

        // Unsplay the halves back together, keeping the high byte (the >>8)
        // of each 16-bit lane.
        dst_rb = __lasx_xvsrli_h(dst_rb, 8);
        dst_ag = __lasx_xvandn_v(mask, dst_ag);
        return __lasx_xvor_v(dst_rb, dst_ag);
    }
374 
blit_row_s32a_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)375     static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
376         SkASSERT(alpha <= 255);
377 
378         auto src8 = (const __m256i*)src;
379         auto dst8 = (      __m256i*)dst;
380 
381         while (count >= 8) {
382             __lasx_xvst(SkBlendARGB32_LASX(__lasx_xvld(src8, 0),
383                                            __lasx_xvld(dst8, 0),
384                                            alpha), dst8, 0);
385             src8++;
386             dst8++;
387             count -= 8;
388         }
389 
390         src = (const SkPMColor*)src8;
391         dst = (      SkPMColor*)dst8;
392 
393         while (count --> 0) {
394             *dst = SkBlendARGB32(*src, *dst, alpha);
395             src++;
396             dst++;
397         }
398     }
399 
400 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
401     #include <lsxintrin.h>
402 
    // 128-bit LSX version of SkPMLerp: per 8-bit channel of four packed
    // 32-bit pixels, computes dst + (((src - dst) * src_scale) >> 8).
    // Callers pass src_scale = SkAlpha255To256(alpha), so it is in [1, 256].
    static inline __m128i SkPMLerp_LSX(const __m128i& src,
                                       const __m128i& dst,
                                       const unsigned src_scale) {
        // Computes dst + (((src - dst)*src_scale)>>8)
        const __m128i mask = __lsx_vreplgr2vr_w(0x00FF00FF);

        // Unpack the 16x8-bit source into 2 8x16-bit splayed halves:
        // _rb keeps the even (red/blue) bytes, _ag the odd (alpha/green) bytes.
        __m128i src_rb = __lsx_vand_v(mask, src);
        __m128i src_ag = __lsx_vsrli_h(src, 8);
        __m128i dst_rb = __lsx_vand_v(mask, dst);
        __m128i dst_ag = __lsx_vsrli_h(dst, 8);

        // Compute scaled differences (16-bit lane arithmetic).
        __m128i diff_rb = __lsx_vsub_h(src_rb, dst_rb);
        __m128i diff_ag = __lsx_vsub_h(src_ag, dst_ag);
        __m128i s = __lsx_vreplgr2vr_h(src_scale);
        diff_rb = __lsx_vmul_h(diff_rb, s);
        diff_ag = __lsx_vmul_h(diff_ag, s);

        // Pack the differences back together, keeping the high byte (the >>8)
        // of each 16-bit lane.
        diff_rb = __lsx_vsrli_h(diff_rb, 8);
        diff_ag = __lsx_vandn_v(mask, diff_ag);
        __m128i diff = __lsx_vor_v(diff_rb, diff_ag);

        // Add difference to destination (per byte, mod 256).
        return __lsx_vadd_b(dst, diff);
    }
430 
431 
blit_row_s32_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)432     static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
433         SkASSERT(alpha <= 255);
434 
435         auto src4 = (const __m128i*)src;
436         auto dst4 = (      __m128i*)dst;
437 
438         while (count >= 4) {
439             __lsx_vst(SkPMLerp_LSX(__lsx_vld(src4, 0),
440                                    __lsx_vld(dst4, 0),
441                                    SkAlpha255To256(alpha)), dst4, 0);
442             src4++;
443             dst4++;
444             count -= 4;
445         }
446 
447         src = (const SkPMColor*)src4;
448         dst = (      SkPMColor*)dst4;
449 
450         while (count --> 0) {
451             *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
452             src++;
453             dst++;
454         }
455     }
456 
    // LSX analogue of SkBlendARGB32(): blends four premultiplied src pixels
    // over four dst pixels, with src additionally scaled by the global alpha
    // 'aa' in [0, 255]:
    //   result = (src*src_scale + dst*SkAlphaMulInv256(src.a, src_scale)) >> 8
    static inline __m128i SkBlendARGB32_LSX(const __m128i& src,
                                            const __m128i& dst,
                                            const unsigned aa) {
        unsigned alpha = SkAlpha255To256(aa);
        __m128i src_scale = __lsx_vreplgr2vr_h(alpha);
        // SkAlphaMulInv256(SkGetPackedA32(src), src_scale), vectorized: the
        // per-pixel alpha lives in the top byte of each 32-bit lane.
        __m128i dst_scale = __lsx_vsrli_w(src, 24);
        // High words in dst_scale are 0, so it's safe to multiply with 16-bit src_scale.
        dst_scale = __lsx_vmul_h(dst_scale, src_scale);
        dst_scale = __lsx_vsub_w(__lsx_vreplgr2vr_w(0xFFFF), dst_scale);
        dst_scale = __lsx_vadd_w(dst_scale, __lsx_vsrli_w(dst_scale, 8));
        dst_scale = __lsx_vsrli_w(dst_scale, 8);
        // Duplicate scales into 2x16-bit pattern per pixel
        // (0xA0 selects 16-bit lanes 0,0,2,2 within each group of four).
        dst_scale = __lsx_vshuf4i_h(dst_scale, 0xA0);

        const __m128i mask = __lsx_vreplgr2vr_w(0x00FF00FF);

        // Unpack the 16x8-bit source/destination into 2 8x16-bit splayed
        // halves: _rb keeps the even (red/blue) bytes, _ag the odd
        // (alpha/green) bytes.
        __m128i src_rb = __lsx_vand_v(mask, src);
        __m128i src_ag = __lsx_vsrli_h(src, 8);
        __m128i dst_rb = __lsx_vand_v(mask, dst);
        __m128i dst_ag = __lsx_vsrli_h(dst, 8);

        // Scale them.
        src_rb = __lsx_vmul_h(src_rb, src_scale);
        src_ag = __lsx_vmul_h(src_ag, src_scale);
        dst_rb = __lsx_vmul_h(dst_rb, dst_scale);
        dst_ag = __lsx_vmul_h(dst_ag, dst_scale);

        // Add the scaled source and destination.
        dst_rb = __lsx_vadd_h(src_rb, dst_rb);
        dst_ag = __lsx_vadd_h(src_ag, dst_ag);

        // Unsplay the halves back together, keeping the high byte (the >>8)
        // of each 16-bit lane.
        dst_rb = __lsx_vsrli_h(dst_rb, 8);
        dst_ag = __lsx_vandn_v(mask, dst_ag);
        return __lsx_vor_v(dst_rb, dst_ag);
    }
494 
blit_row_s32a_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)495     static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
496         SkASSERT(alpha <= 255);
497 
498         auto src4 = (const __m128i*)src;
499         auto dst4 = (      __m128i*)dst;
500 
501         while (count >= 4) {
502             __lsx_vst(SkBlendARGB32_LSX(__lsx_vld(src4, 0),
503                                         __lsx_vld(dst4, 0),
504                                         alpha), dst4, 0);
505             src4++;
506             dst4++;
507             count -= 4;
508         }
509 
510         src = (const SkPMColor*)src4;
511         dst = (      SkPMColor*)dst4;
512 
513         while (count --> 0) {
514             *dst = SkBlendARGB32(*src, *dst, alpha);
515             src++;
516             dst++;
517         }
518     }
519 
520 #else
blit_row_s32_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)521     static void blit_row_s32_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
522         SkASSERT(alpha <= 255);
523         while (count --> 0) {
524             *dst = SkPMLerp(*src, *dst, SkAlpha255To256(alpha));
525             src++;
526             dst++;
527         }
528     }
529 
blit_row_s32a_blend(SkPMColor * dst,const SkPMColor * src,int count,U8CPU alpha)530     static void blit_row_s32a_blend(SkPMColor* dst, const SkPMColor* src, int count, U8CPU alpha) {
531         SkASSERT(alpha <= 255);
532         while (count --> 0) {
533             *dst = SkBlendARGB32(*src, *dst, alpha);
534             src++;
535             dst++;
536         }
537     }
538 #endif
539 
Factory32(unsigned flags)540 SkBlitRow::Proc32 SkBlitRow::Factory32(unsigned flags) {
541     static const SkBlitRow::Proc32 kProcs[] = {
542         blit_row_s32_opaque,
543         blit_row_s32_blend,
544         nullptr,  // blit_row_s32a_opaque is in SkOpts
545         blit_row_s32a_blend
546     };
547 
548     SkASSERT(flags < std::size(kProcs));
549     flags &= std::size(kProcs) - 1;  // just to be safe
550 
551     return flags == 2 ? SkOpts::blit_row_s32a_opaque
552                       : kProcs[flags];
553 }
554 
Color32(SkPMColor dst[],int count,SkPMColor color)555 void SkBlitRow::Color32(SkPMColor dst[], int count, SkPMColor color) {
556     switch (SkGetPackedA32(color)) {
557         case   0: /* Nothing to do */                  return;
558         case 255: SkOpts::memset32(dst, color, count); return;
559     }
560     return SkOpts::blit_row_color32(dst, count, color);
561 }
562