xref: /aosp_15_r20/external/skia/src/core/SkBlitter_ARGB32.cpp (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
1 /*
2  * Copyright 2006 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "include/core/SkColor.h"
9 #include "include/core/SkColorPriv.h"
10 #include "include/core/SkColorType.h"
11 #include "include/core/SkPaint.h"
12 #include "include/core/SkPixmap.h"
13 #include "include/core/SkRect.h"
14 #include "include/core/SkTypes.h"
15 #include "include/private/SkColorData.h"
16 #include "include/private/base/SkAlign.h"
17 #include "include/private/base/SkCPUTypes.h"
18 #include "include/private/base/SkDebug.h"
19 #include "include/private/base/SkMalloc.h"
20 #include "include/private/base/SkTo.h"
21 #include "src/base/SkUtils.h"
22 #include "src/base/SkVx.h"
23 #include "src/core/SkBlitMask.h"
24 #include "src/core/SkBlitRow.h"
25 #include "src/core/SkCoreBlitters.h"
26 #include "src/core/SkMask.h"
27 #include "src/core/SkMemset.h"
28 #include "src/shaders/SkShaderBase.h"
29 
30 #include <algorithm>
31 #include <cstddef>
32 #include <cstdint>
33 
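// upscale_31_to_32() maps 5-bit coverage (0..31) onto 0..32 so that the later
// ">> 5" can stand in for a divide by the coverage range: value + (value >> 4)
// leaves 0..15 unchanged and maps 16..31 to 17..32 (e.g. 31 -> 31 + 1 = 32).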
34 static inline int upscale_31_to_32(int value) {
35     SkASSERT((unsigned)value <= 31);
36     return value + (value >> 4);
37 }
38 
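// blend_32() is a linear interpolation between dst and src with a weight in
// 0..32: result = dst + (src - dst) * scale / 32, so scale == 0 returns dst
// unchanged and scale == 32 returns src exactly.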
39 static inline int blend_32(int src, int dst, int scale) {
40     SkASSERT((unsigned)src <= 0xFF);
41     SkASSERT((unsigned)dst <= 0xFF);
42     SkASSERT((unsigned)scale <= 32);
43     return dst + ((src - dst) * scale >> 5);
44 }
45 
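// blend_lcd16() blends one source color into one destination pixel using an
// LCD16 coverage value: the 16-bit mask packs separate R, G and B coverage in
// a 565-style layout, and srcA is expected to have been biased to 0..256
// already (see the SkAlpha255To256() calls in the blit_row_* functions below).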
46 static inline SkPMColor blend_lcd16(int srcA, int srcR, int srcG, int srcB,
47                                      SkPMColor dst, uint16_t mask) {
48     if (mask == 0) {
49         return dst;
50     }
51 
52     /*  We want all of these in 5bits, hence the shifts in case one of them
53      *  (green) is 6bits.
54      */
55     int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
56     int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
57     int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
58 
59     // Now upscale them to 0..32, so we can use blend32
60     maskR = upscale_31_to_32(maskR);
61     maskG = upscale_31_to_32(maskG);
62     maskB = upscale_31_to_32(maskB);
63 
64     // srcA has been upscaled to 256 before passed into this function
65     maskR = maskR * srcA >> 8;
66     maskG = maskG * srcA >> 8;
67     maskB = maskB * srcA >> 8;
68 
69     int dstA = SkGetPackedA32(dst);
70     int dstR = SkGetPackedR32(dst);
71     int dstG = SkGetPackedG32(dst);
72     int dstB = SkGetPackedB32(dst);
73 
74     // Subtract 1 from srcA to bring it back to [0-255] to compare against dstA; alpha needs to
75     // use either the min or the max of the LCD coverages. See https://skbug.com/40037823
76     int maskA = (srcA-1) < dstA ? std::min(maskR, std::min(maskG, maskB))
77                                 : std::max(maskR, std::max(maskG, maskB));
78 
79     return SkPackARGB32(blend_32(0xFF, dstA, maskA),
80                         blend_32(srcR, dstR, maskR),
81                         blend_32(srcG, dstG, maskG),
82                         blend_32(srcB, dstB, maskB));
83 }
84 
85 static inline SkPMColor blend_lcd16_opaque(int srcR, int srcG, int srcB,
86                                            SkPMColor dst, uint16_t mask,
87                                            SkPMColor opaqueDst) {
88     if (mask == 0) {
89         return dst;
90     }
91 
92     if (0xFFFF == mask) {
93         return opaqueDst;
94     }
95 
96     /*  We want all of these in 5bits, hence the shifts in case one of them
97      *  (green) is 6bits.
98      */
99     int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
100     int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
101     int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
102 
103     // Now upscale them to 0..32, so we can use blend32
104     maskR = upscale_31_to_32(maskR);
105     maskG = upscale_31_to_32(maskG);
106     maskB = upscale_31_to_32(maskB);
107 
108     int dstA = SkGetPackedA32(dst);
109     int dstR = SkGetPackedR32(dst);
110     int dstG = SkGetPackedG32(dst);
111     int dstB = SkGetPackedB32(dst);
112 
113     // Opaque src alpha always uses the max of the LCD coverages.
114     int maskA = std::max(maskR, std::max(maskG, maskB));
115 
116     // LCD blitting is only supported if the dst is known/required
117     // to be opaque
118     return SkPackARGB32(blend_32(0xFF, dstA, maskA),
119                         blend_32(srcR, dstR, maskR),
120                         blend_32(srcG, dstG, maskG),
121                         blend_32(srcB, dstB, maskB));
122 }
123 
124 
125 // TODO: rewrite at least the SSE code here.  It's miserable.
126 
127 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
128     #include <emmintrin.h>
129 
130     // The following (left) shifts cause the top 5 bits of the mask components to
131     // line up with the corresponding components in an SkPMColor.
132     // Note that the mask's RGB16 order may differ from the SkPMColor order.
133     #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
134     #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
135     #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
136 
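    // For example, with the common BGRA byte-order config (SK_R32_SHIFT == 16,
    // SK_G32_SHIFT == 8, SK_B32_SHIFT == 0) and the usual 565 mask layout
    // (SK_R16_SHIFT == 11, SK_G16_SHIFT == 5, SK_B16_SHIFT == 0), red needs a
    // shift of 16 - 11 - 5 + 5 == 5, green 8 - 5 - 6 + 5 == 2, and blue 0, so
    // the macros below select a left shift, a right shift, or no shift for
    // each channel at compile time.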
137     #if SK_R16x5_R32x5_SHIFT == 0
138         #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
139     #elif SK_R16x5_R32x5_SHIFT > 0
140         #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
141     #else
142         #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
143     #endif
144 
145     #if SK_G16x5_G32x5_SHIFT == 0
146         #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
147     #elif SK_G16x5_G32x5_SHIFT > 0
148         #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
149     #else
150         #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
151     #endif
152 
153     #if SK_B16x5_B32x5_SHIFT == 0
154         #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
155     #elif SK_B16x5_B32x5_SHIFT > 0
156         #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
157     #else
158         #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
159     #endif
160 
161     static __m128i blend_lcd16_sse2(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) {
162         // In the following comments, the components of src, dst and mask are
163         // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
164         // by an R, G, B, or A suffix. Components of one of the four pixels that
165         // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
166         // example is the blue channel of the second destination pixel. Memory
167         // layout is shown for an ARGB byte order in a color value.
168 
169         // src and srcA store 8-bit values interleaved with zeros.
170         // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
171         // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
172         //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
173         // mask stores 16-bit values (compressed three channels) interleaved with zeros.
174         // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
175         // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
176         //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
177 
178         // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
179         // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
180         __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
181                                   _mm_set1_epi32(0x1F << SK_R32_SHIFT));
182 
183         // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
184         __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
185                                   _mm_set1_epi32(0x1F << SK_G32_SHIFT));
186 
187         // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
188         __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
189                                   _mm_set1_epi32(0x1F << SK_B32_SHIFT));
190 
191         // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA
192         __m128i aMin = _mm_min_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
193                        _mm_min_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
194                                     _mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));
195         __m128i aMax = _mm_max_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
196                        _mm_max_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
197                                     _mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));
198         // srcA has been biased to [0-256], so compare srcA against (dstA+1)
199         __m128i a = _mm_cmplt_epi32(srcA,
200                                     _mm_and_si128(
201                                             _mm_add_epi32(dst, _mm_set1_epi32(1 << SK_A32_SHIFT)),
202                                             _mm_set1_epi32(SK_A32_MASK)));
203         // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
204         a = _mm_or_si128(_mm_and_si128(a, aMin), _mm_andnot_si128(a, aMax));
205 
206         // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
207         // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
208         // 8-bit position
209         // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
210         //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
211         mask = _mm_or_si128(_mm_or_si128(a, r), _mm_or_si128(g, b));
212 
213         // Interleave R,G,B into the lower byte of word.
214         // i.e. split the sixteen 8-bit values from mask into two sets of eight
215         // 16-bit values, padded by zero.
216         __m128i maskLo, maskHi;
217         // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
218         maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
219         // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
220         maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
221 
222         // Upscale from 0..31 to 0..32
223         // (allows the division further down to be replaced by a right-shift)
224         // Left-shift each component by 4 and add the result back to that component,
225         // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
226         maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
227         maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
228 
229         // Multiply each component of maskLo and maskHi by srcA
230         maskLo = _mm_mullo_epi16(maskLo, srcA);
231         maskHi = _mm_mullo_epi16(maskHi, srcA);
232 
233         // Right shift mask components by 8 (divide by 256)
234         maskLo = _mm_srli_epi16(maskLo, 8);
235         maskHi = _mm_srli_epi16(maskHi, 8);
236 
237         // Interleave R,G,B into the lower byte of the word
238         // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
239         __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
240         // dstHi = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
241         __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
242 
243         // mask = (src - dst) * mask
244         maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
245         maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
246 
247         // mask = (src - dst) * mask >> 5
248         maskLo = _mm_srai_epi16(maskLo, 5);
249         maskHi = _mm_srai_epi16(maskHi, 5);
250 
251         // Add two pixels into result.
252         // result = dst + ((src - dst) * mask >> 5)
253         __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
254         __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
255 
256         // Pack into 4 32bit dst pixels.
257         // resultLo and resultHi contain eight 16-bit components (two pixels) each.
258         // Merge into one SSE register with sixteen 8-bit values (four pixels),
259         // clamping to 255 if necessary.
260         return _mm_packus_epi16(resultLo, resultHi);
261     }
262 
263     static __m128i blend_lcd16_opaque_sse2(__m128i &src, __m128i &dst, __m128i &mask) {
264         // In the following comments, the components of src, dst and mask are
265         // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
266         // by an R, G, B, or A suffix. Components of one of the four pixels that
267         // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
268         // example is the blue channel of the second destination pixel. Memory
269         // layout is shown for an ARGB byte order in a color value.
270 
271         // src and srcA store 8-bit values interleaved with zeros.
272         // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
273         // mask stores 16-bit values (shown as high and low bytes) interleaved with
274         // zeros
275         // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
276         //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
277 
278         // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
279         // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
280         __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
281                                   _mm_set1_epi32(0x1F << SK_R32_SHIFT));
282 
283         // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
284         __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
285                                   _mm_set1_epi32(0x1F << SK_G32_SHIFT));
286 
287         // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
288         __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
289                                   _mm_set1_epi32(0x1F << SK_B32_SHIFT));
290 
291         // a = max(r, g, b) since opaque src alpha uses max of LCD coverages
292         __m128i a = _mm_max_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
293                     _mm_max_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
294                                  _mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));
295 
296         // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
297         // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
298         // 8-bit position
299         // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
300         //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
301         mask = _mm_or_si128(_mm_or_si128(a, r), _mm_or_si128(g, b));
302 
303         // Interleave R,G,B into the lower byte of word.
304         // i.e. split the sixteen 8-bit values from mask into two sets of eight
305         // 16-bit values, padded by zero.
306         __m128i maskLo, maskHi;
307         // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
308         maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
309         // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
310         maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());
311 
312         // Upscale from 0..31 to 0..32
313         // (allows the division further down to be replaced by a right-shift)
314         // Left-shift each component by 4 and add the result back to that component,
315         // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
316         maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
317         maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));
318 
319         // Interleave R,G,B into the lower byte of the word
320         // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
321         __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
322         // dstHi = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
323         __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());
324 
325         // mask = (src - dst) * mask
326         maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
327         maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));
328 
329         // mask = (src - dst) * mask >> 5
330         maskLo = _mm_srai_epi16(maskLo, 5);
331         maskHi = _mm_srai_epi16(maskHi, 5);
332 
333         // Add two pixels into result.
334         // result = dst + ((src - dst) * mask >> 5)
335         __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
336         __m128i resultHi = _mm_add_epi16(dstHi, maskHi);
337 
338         // Merge into one SSE register with sixteen 8-bit values (four pixels),
339         // clamping to 255 if necessary.
340         return _mm_packus_epi16(resultLo, resultHi);
341     }
342 
343     void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
344         if (width <= 0) {
345             return;
346         }
347 
348         int srcA = SkColorGetA(src);
349         int srcR = SkColorGetR(src);
350         int srcG = SkColorGetG(src);
351         int srcB = SkColorGetB(src);
352 
353         srcA = SkAlpha255To256(srcA);
354 
355         if (width >= 4) {
356             SkASSERT(SkIsAlign4((uintptr_t) dst));
357             while (!SkIsAlign16((uintptr_t) dst)) {
358                 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
359                 mask++;
360                 dst++;
361                 width--;
362             }
363 
364             __m128i *d = reinterpret_cast<__m128i*>(dst);
365             // Set alpha to 0xFF and replicate source four times in SSE register.
366             __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
367             // Interleave with zeros to get two sets of four 16-bit values.
368             src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
369             // Set srcA_sse to contain eight copies of srcA, padded with zero.
370             // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
371             __m128i srcA_sse = _mm_set1_epi16(srcA);
372             while (width >= 4) {
373                 // Load four destination pixels into dst_sse.
374                 __m128i dst_sse = _mm_load_si128(d);
375                 // Load four 16-bit masks into lower half of mask_sse.
376                 // mask does *not* actually need to be 16 byte aligned to use this command
377                 __m128i mask_sse = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(mask));
378 
379                 // Compare the masks against 0 and take the highest bit of each
380                 // byte of the comparison result; if all four masks are zero,
381                 // pack_cmp will be 0xFFFF
382                 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
383                                                  _mm_setzero_si128()));
384 
385                 // if mask pixels are not all zero, we will blend the dst pixels
386                 if (pack_cmp != 0xFFFF) {
387                     // Unpack 4 16bit mask pixels to
388                     // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
389                     //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
390                     mask_sse = _mm_unpacklo_epi16(mask_sse,
391                                                   _mm_setzero_si128());
392 
393                     // Process 4 32bit dst pixels
394                     __m128i result = blend_lcd16_sse2(src_sse, dst_sse, mask_sse, srcA_sse);
395                     _mm_store_si128(d, result);
396                 }
397 
398                 d++;
399                 mask += 4;
400                 width -= 4;
401             }
402 
403             dst = reinterpret_cast<SkPMColor*>(d);
404         }
405 
406         while (width > 0) {
407             *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
408             mask++;
409             dst++;
410             width--;
411         }
412     }
413 
414     void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
415                                SkColor src, int width, SkPMColor opaqueDst) {
416         if (width <= 0) {
417             return;
418         }
419 
420         int srcR = SkColorGetR(src);
421         int srcG = SkColorGetG(src);
422         int srcB = SkColorGetB(src);
423 
424         if (width >= 4) {
425             SkASSERT(SkIsAlign4((uintptr_t) dst));
426             while (!SkIsAlign16((uintptr_t) dst)) {
427                 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
428                 mask++;
429                 dst++;
430                 width--;
431             }
432 
433             __m128i *d = reinterpret_cast<__m128i*>(dst);
434             // Set alpha to 0xFF and replicate source four times in SSE register.
435             __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
436             // Set srcA_sse to contain eight copies of srcA, padded with zero.
437             // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
438             src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
439             while (width >= 4) {
440                 // Load four destination pixels into dst_sse.
441                 __m128i dst_sse = _mm_load_si128(d);
442                 // Load four 16-bit masks into lower half of mask_sse.
443                 // mask does *not* actually need to be 16 byte aligned to use this command
444                 __m128i mask_sse = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(mask));
445 
446                 // Compare the masks against 0 and take the highest bit of each
447                 // byte of the comparison result; if all four masks are zero,
448                 // pack_cmp will be 0xFFFF
449                 int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
450                                                  _mm_setzero_si128()));
451 
452                 // if mask pixels are not all zero, we will blend the dst pixels
453                 if (pack_cmp != 0xFFFF) {
454                     // Unpack 4 16bit mask pixels to
455                     // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
456                     //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
457                     mask_sse = _mm_unpacklo_epi16(mask_sse,
458                                                   _mm_setzero_si128());
459 
460                     // Process 4 32bit dst pixels
461                     __m128i result = blend_lcd16_opaque_sse2(src_sse, dst_sse, mask_sse);
462                     _mm_store_si128(d, result);
463                 }
464 
465                 d++;
466                 mask += 4;
467                 width -= 4;
468             }
469 
470             dst = reinterpret_cast<SkPMColor*>(d);
471         }
472 
473         while (width > 0) {
474             *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
475             mask++;
476             dst++;
477             width--;
478         }
479     }
480 
481 #elif defined(SK_ARM_HAS_NEON)
482     #include <arm_neon.h>
483 
484     #define NEON_A (SK_A32_SHIFT / 8)
485     #define NEON_R (SK_R32_SHIFT / 8)
486     #define NEON_G (SK_G32_SHIFT / 8)
487     #define NEON_B (SK_B32_SHIFT / 8)
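    // NEON_A/R/G/B are the byte offsets of each channel within a 32-bit
    // SkPMColor; they index the per-channel planes produced by vld4_u8() and
    // written back by vst4_u8() in the loops below.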
488 
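    // blend_32_neon() is the vector counterpart of blend_32() above: per lane,
    // dst + ((src - dst) * scale >> 5), with scale expected to be in 0..32.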
489     static inline uint8x8_t blend_32_neon(uint8x8_t src, uint8x8_t dst, uint16x8_t scale) {
490         int16x8_t src_wide, dst_wide;
491 
492         src_wide = vreinterpretq_s16_u16(vmovl_u8(src));
493         dst_wide = vreinterpretq_s16_u16(vmovl_u8(dst));
494 
495         src_wide = (src_wide - dst_wide) * vreinterpretq_s16_u16(scale);
496 
497         dst_wide += vshrq_n_s16(src_wide, 5);
498 
499         return vmovn_u16(vreinterpretq_u16_s16(dst_wide));
500     }
501 
502     void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t src[],
503                                SkColor color, int width,
504                                SkPMColor opaqueDst) {
505         int colR = SkColorGetR(color);
506         int colG = SkColorGetG(color);
507         int colB = SkColorGetB(color);
508 
509         uint8x8_t vcolA = vdup_n_u8(0xFF);
510         uint8x8_t vcolR = vdup_n_u8(colR);
511         uint8x8_t vcolG = vdup_n_u8(colG);
512         uint8x8_t vcolB = vdup_n_u8(colB);
513 
514         while (width >= 8) {
515             uint8x8x4_t vdst;
516             uint16x8_t vmask;
517             uint16x8_t vmaskR, vmaskG, vmaskB, vmaskA;
518 
519             vdst = vld4_u8((uint8_t*)dst);
520             vmask = vld1q_u16(src);
521 
522             // Get all the color masks on 5 bits
523             vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
524             vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
525                                  SK_B16_BITS + SK_R16_BITS + 1);
526             vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
527 
528             // Upscale to 0..32
529             vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
530             vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
531             vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
532             // Opaque srcAlpha always uses the max of the 3 LCD coverage values
533             vmaskA = vmaxq_u16(vmaskR, vmaxq_u16(vmaskG, vmaskB));
534 
535             vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
536             vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
537             vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);
538             vdst.val[NEON_A] = blend_32_neon(vcolA, vdst.val[NEON_A], vmaskA);
539 
540             vst4_u8((uint8_t*)dst, vdst);
541 
542             dst += 8;
543             src += 8;
544             width -= 8;
545         }
546 
547         // Leftovers
548         for (int i = 0; i < width; i++) {
549             dst[i] = blend_lcd16_opaque(colR, colG, colB, dst[i], src[i], opaqueDst);
550         }
551     }
552 
553     void blit_row_lcd16(SkPMColor dst[], const uint16_t src[],
554                         SkColor color, int width, SkPMColor) {
555         int colA = SkColorGetA(color);
556         int colR = SkColorGetR(color);
557         int colG = SkColorGetG(color);
558         int colB = SkColorGetB(color);
559 
560         // srcA in [0-255] to compare vs dstA
561         uint16x8_t vcolACmp = vdupq_n_u16(colA);
562         colA = SkAlpha255To256(colA);
563 
564         uint16x8_t vcolA = vdupq_n_u16(colA); // srcA in [0-256] to combine with coverage
565         uint8x8_t vcolR = vdup_n_u8(colR);
566         uint8x8_t vcolG = vdup_n_u8(colG);
567         uint8x8_t vcolB = vdup_n_u8(colB);
568 
569         while (width >= 8) {
570             uint8x8x4_t vdst;
571             uint16x8_t vmask;
572             uint16x8_t vmaskR, vmaskG, vmaskB, vmaskA;
573 
574             vdst = vld4_u8((uint8_t*)dst);
575             vmask = vld1q_u16(src);
576 
577             // Get all the color masks on 5 bits
578             vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
579             vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
580                                  SK_B16_BITS + SK_R16_BITS + 1);
581             vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
582 
583             // Upscale to 0..32
584             vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
585             vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
586             vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
587 
588             vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
589             vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
590             vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);
591 
592             // Select either the min or the max of the RGB mask values, depending on if the src
593             // alpha is less than the dst alpha.
594             vmaskA = vbslq_u16(vcleq_u16(vcolACmp, vmovl_u8(vdst.val[NEON_A])), // srcA < dstA
595                                vminq_u16(vmaskR, vminq_u16(vmaskG, vmaskB)),    // ? min(r,g,b)
596                                vmaxq_u16(vmaskR, vmaxq_u16(vmaskG, vmaskB)));   // : max(r,g,b)
597 
598             vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
599             vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
600             vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);
601             // vmaskA already includes vcolA so blend against 0xFF
602             vdst.val[NEON_A] = blend_32_neon(vdup_n_u8(0xFF), vdst.val[NEON_A], vmaskA);
603             vst4_u8((uint8_t*)dst, vdst);
604 
605             dst += 8;
606             src += 8;
607             width -= 8;
608         }
609 
610         for (int i = 0; i < width; i++) {
611             dst[i] = blend_lcd16(colA, colR, colG, colB, dst[i], src[i]);
612         }
613     }
614 
615 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
616 
617     // The following (left) shifts cause the top 5 bits of the mask components to
618     // line up with the corresponding components in an SkPMColor.
619     // Note that the mask's RGB16 order may differ from the SkPMColor order.
620     #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
621     #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
622     #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
623 
624     #if SK_R16x5_R32x5_SHIFT == 0
625         #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (x)
626     #elif SK_R16x5_R32x5_SHIFT > 0
627         #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (__lasx_xvslli_w(x, SK_R16x5_R32x5_SHIFT))
628     #else
629         #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_R16x5_R32x5_SHIFT))
630     #endif
631 
632     #if SK_G16x5_G32x5_SHIFT == 0
633         #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (x)
634     #elif SK_G16x5_G32x5_SHIFT > 0
635         #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (__lasx_xvslli_w(x, SK_G16x5_G32x5_SHIFT))
636     #else
637         #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_G16x5_G32x5_SHIFT))
638     #endif
639 
640     #if SK_B16x5_B32x5_SHIFT == 0
641         #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (x)
642     #elif SK_B16x5_B32x5_SHIFT > 0
643         #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (__lasx_xvslli_w(x, SK_B16x5_B32x5_SHIFT))
644     #else
645         #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_B16x5_B32x5_SHIFT))
646     #endif
647 
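    // The LASX path below mirrors the SSE2 code above, but uses 256-bit
    // vectors and therefore blends eight 32-bit pixels per iteration instead
    // of four.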
648     static __m256i blend_lcd16_lasx(__m256i &src, __m256i &dst, __m256i &mask, __m256i &srcA) {
649         // In the following comments, the components of src, dst and mask are
650         // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
651         // by an R, G, B, or A suffix. Components of one of the eight pixels that
652         // are processed in parallel are marked with 0 through 7. "d1B", for
653         // example, is the blue channel of the second destination pixel. Memory
654         // layout is shown for an ARGB byte order in a color value.
655 
656         // src and srcA store 8-bit values interleaved with zeros.
657         // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
658         //         0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
659         // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
660         //         srcA, 0, srcA, 0, srcA, 0, srcA, 0,
661         //         srcA, 0, srcA, 0, srcA, 0, srcA, 0,
662         //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
663         // mask stores 16-bit values (compressed three channels) interleaved with zeros.
664         // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
665         // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
666         //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
667         //         m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
668         //         m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)
669 
670         __m256i xv_zero = __lasx_xvldi(0);
671 
672         // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
673         // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0,
674         //      0, m4R, 0, 0, 0, m5R, 0, 0, 0, m6R, 0, 0, 0, m7R, 0, 0)
675         __m256i r = __lasx_xvand_v(SkPackedR16x5ToUnmaskedR32x5_LASX(mask),
676                                    __lasx_xvreplgr2vr_w(0x1F << SK_R32_SHIFT));
677 
678         // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0,
679         //      0, 0, m4G, 0, 0, 0, m5G, 0, 0, 0, m6G, 0, 0, 0, m7G, 0)
680         __m256i g = __lasx_xvand_v(SkPackedG16x5ToUnmaskedG32x5_LASX(mask),
681                                    __lasx_xvreplgr2vr_w(0x1F << SK_G32_SHIFT));
682 
683         // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
684         //      0, 0, 0, m4B, 0, 0, 0, m5B, 0, 0, 0, m6B, 0, 0, 0, m7B)
685         __m256i b = __lasx_xvand_v(SkPackedB16x5ToUnmaskedB32x5_LASX(mask),
686                                    __lasx_xvreplgr2vr_w(0x1F << SK_B32_SHIFT));
687 
688         // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA
689         __m256i aMin = __lasx_xvmin_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
690                        __lasx_xvmin_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
691                                       __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
692         __m256i aMax = __lasx_xvmax_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
693                        __lasx_xvmax_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
694                                       __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
695         // srcA has been biased to [0-256], so compare srcA against (dstA+1)
696         __m256i a = __lasx_xvmskltz_w(srcA -
697                                     __lasx_xvand_v(
698                                            __lasx_xvadd_w(dst,
699                                                           __lasx_xvreplgr2vr_w(1 << SK_A32_SHIFT)),
700                                            __lasx_xvreplgr2vr_w(SK_A32_MASK)));
701         // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
702         a = __lasx_xvor_v(__lasx_xvand_v(a, aMin), __lasx_xvandn_v(a, aMax));
703 
704         // Pack the 8 16bit mask pixels into 8 32bit pixels, (p0, p1, p2, p3, p4, p5, p6, p7)
705         // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
706         // 8-bit position
707         // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
708         //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B,
709         //         m4A, m4R, m4G, m4B, m5A, m5R, m5G, m5B,
710         //         m6A, m6R, m6G, m6B, m7A, m7R, m7G, m7B)
711         mask = __lasx_xvor_v(__lasx_xvor_v(a, r), __lasx_xvor_v(g, b));
712 
713         // Interleave R,G,B into the lower byte of word.
714         // i.e. split the 32 8-bit values from mask into two sets of sixteen
715         // 16-bit values, padded by zero.
716         __m256i maskLo, maskHi;
717         // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0,
718         //           m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
719         maskLo = __lasx_xvilvl_b(xv_zero, mask);
720         // maskHi = (m4A, 0, m4R, 0, m4G, 0, m4B, 0, m5A, 0, m5R, 0, m5G, 0, m5B, 0,
721         //           m6A, 0, m6R, 0, m6G, 0, m6B, 0, m7A, 0, m7R, 0, m7G, 0, m7B, 0)
722         maskHi = __lasx_xvilvh_b(xv_zero, mask);
723 
724         // Upscale from 0..31 to 0..32
725         // (allows the division further down to be replaced by a right-shift)
726         // Left-shift each component by 4 and add the result back to that component,
727         // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
728         maskLo = __lasx_xvadd_h(maskLo, __lasx_xvsrli_h(maskLo, 4));
729         maskHi = __lasx_xvadd_h(maskHi, __lasx_xvsrli_h(maskHi, 4));
730 
731         // Multiply each component of maskLo and maskHi by srcA
732         maskLo = __lasx_xvmul_h(maskLo, srcA);
733         maskHi = __lasx_xvmul_h(maskHi, srcA);
734 
735         // Right shift mask components by 8 (divide by 256)
736         maskLo = __lasx_xvsrli_h(maskLo, 8);
737         maskHi = __lasx_xvsrli_h(maskHi, 8);
738 
739         // Interleave R,G,B into the lower byte of the word
740         // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
741         //          d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
742         __m256i dstLo = __lasx_xvilvl_b(xv_zero, dst);
743         // dstHi = (d4A, 0, d4R, 0, d4G, 0, d4B, 0, d5A, 0, d5R, 0, d5G, 0, d5B, 0)
744         //          d6A, 0, d6R, 0, d6G, 0, d6B, 0, d7A, 0, d7R, 0, d7G, 0, d7B, 0)
745         __m256i dstHi = __lasx_xvilvh_b(xv_zero, dst);
746 
747         // mask = (src - dst) * mask
748         maskLo = __lasx_xvmul_h(maskLo, __lasx_xvsub_h(src, dstLo));
749         maskHi = __lasx_xvmul_h(maskHi, __lasx_xvsub_h(src, dstHi));
750 
751         // mask = (src - dst) * mask >> 5
752         maskLo = __lasx_xvsrai_h(maskLo, 5);
753         maskHi = __lasx_xvsrai_h(maskHi, 5);
754 
755         // Add two pixels into result.
756         // result = dst + ((src - dst) * mask >> 5)
757         __m256i resultLo = __lasx_xvadd_h(dstLo, maskLo);
758         __m256i resultHi = __lasx_xvadd_h(dstHi, maskHi);
759 
760         // Pack into 8 32bit dst pixels.
761         // resultLo and resultHi contain sixteen 16-bit components (four pixels) each.
762         // Merge into one LASX register with 32 8-bit values (eight pixels),
763         // clamping to 255 if necessary.
764         __m256i tmpl = __lasx_xvsat_hu(resultLo, 7);
765         __m256i tmph = __lasx_xvsat_hu(resultHi, 7);
766         return __lasx_xvpickev_b(tmph, tmpl);
767     }
768 
769     static __m256i blend_lcd16_opaque_lasx(__m256i &src, __m256i &dst, __m256i &mask) {
770         // In the following comments, the components of src, dst and mask are
771         // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
772         // by an R, G, B, or A suffix. Components of one of the eight pixels that
773         // are processed in parallel are marked with 0 through 7. "d1B", for
774         // example, is the blue channel of the second destination pixel. Memory
775         // layout is shown for an ARGB byte order in a color value.
776 
777         // src and srcA store 8-bit values interleaved with zeros.
778         // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
779         //         0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
780         // mask stores 16-bit values (shown as high and low bytes) interleaved with
781         // zeros
782         // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
783         //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
784         //         m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
785         //         m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)
786 
787         __m256i xv_zero = __lasx_xvldi(0);
788 
789         // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
790         // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0,
791         //      0, m4R, 0, 0, 0, m5R, 0, 0, 0, m6R, 0, 0, 0, m7R, 0, 0)
792         __m256i r = __lasx_xvand_v(SkPackedR16x5ToUnmaskedR32x5_LASX(mask),
793                                    __lasx_xvreplgr2vr_w(0x1F << SK_R32_SHIFT));
794 
795         // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0,
796         //      0, 0, m4G, 0, 0, 0, m5G, 0, 0, 0, m6G, 0, 0, 0, m7G, 0)
797         __m256i g = __lasx_xvand_v(SkPackedG16x5ToUnmaskedG32x5_LASX(mask),
798                                    __lasx_xvreplgr2vr_w(0x1F << SK_G32_SHIFT));
799 
800         // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B,
801         //      0, 0, 0, m4B, 0, 0, 0, m5B, 0, 0, 0, m6B, 0, 0, 0, m7B)
802         __m256i b = __lasx_xvand_v(SkPackedB16x5ToUnmaskedB32x5_LASX(mask),
803                                    __lasx_xvreplgr2vr_w(0x1F << SK_B32_SHIFT));
804 
805         // a = max(r, g, b) since opaque src alpha uses max of LCD coverages
806         __m256i a = __lasx_xvmax_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
807                     __lasx_xvmax_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
808                                    __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
809 
810         // Pack the 8 16bit mask pixels into 8 32bit pixels, (p0, p1, p2, p3,
811         // p4, p5, p6, p7)
812         // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
813         // 8-bit position
814         // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
815         //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B,
816         //         m4A, m4R, m4G, m4B, m5A, m5R, m5G, m5B,
817         //         m6A, m6R, m6G, m6B, m7A, m7R, m7G, m7B)
818         mask = __lasx_xvor_v(__lasx_xvor_v(a, r), __lasx_xvor_v(g, b));
819 
820         // Interleave R,G,B into the lower byte of word.
821         // i.e. split the 32 8-bit values from mask into two sets of sixteen
822         // 16-bit values, padded by zero.
823         __m256i maskLo, maskHi;
824         // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0,
825         //           m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
826         maskLo = __lasx_xvilvl_b(xv_zero, mask);
827         // maskHi = (m4A, 0, m4R, 0, m4G, 0, m4B, 0, m5A, 0, m5R, 0, m5G, 0, m5B, 0,
828         //           m6A, 0, m6R, 0, m6G, 0, m6B, 0, m7A, 0, m7R, 0, m7G, 0, m7B, 0)
829         maskHi = __lasx_xvilvh_b(xv_zero, mask);
830 
831         // Upscale from 0..31 to 0..32
832         // (allows the division further down to be replaced by a right-shift)
833         // Left-shift each component by 4 and add the result back to that component,
834         // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
835         maskLo = __lasx_xvadd_h(maskLo, __lasx_xvsrli_h(maskLo, 4));
836         maskHi = __lasx_xvadd_h(maskHi, __lasx_xvsrli_h(maskHi, 4));
837 
838         // Interleave R,G,B into the lower byte of the word
839         // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0,
840         //          d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
841         __m256i dstLo = __lasx_xvilvl_b(xv_zero, dst);
842         // dstHi = (d4A, 0, d4R, 0, d4G, 0, d4B, 0, d5A, 0, d5R, 0, d5G, 0, d5B, 0,
843         //          d6A, 0, d6R, 0, d6G, 0, d6B, 0, d7A, 0, d7R, 0, d7G, 0, d7B, 0)
844         __m256i dstHi = __lasx_xvilvh_b(xv_zero, dst);
845 
846         // mask = (src - dst) * mask
847         maskLo = __lasx_xvmul_h(maskLo, __lasx_xvsub_h(src, dstLo));
848         maskHi = __lasx_xvmul_h(maskHi, __lasx_xvsub_h(src, dstHi));
849 
850         // mask = (src - dst) * mask >> 5
851         maskLo = __lasx_xvsrai_h(maskLo, 5);
852         maskHi = __lasx_xvsrai_h(maskHi, 5);
853 
854         // Add two pixels into result.
855         // result = dst + ((src - dst) * mask >> 5)
856         __m256i resultLo = __lasx_xvadd_h(dstLo, maskLo);
857         __m256i resultHi = __lasx_xvadd_h(dstHi, maskHi);
858 
859         // Merge into one LASX register with 32 8-bit values (eight pixels),
860         // clamping to 255 if necessary.
861         __m256i tmpl = __lasx_xvsat_hu(resultLo, 7);
862         __m256i tmph = __lasx_xvsat_hu(resultHi, 7);
863 
864         return __lasx_xvpickev_b(tmph, tmpl);
865     }
866 
867     void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
868         if (width <= 0) {
869             return;
870         }
871 
872         int srcA = SkColorGetA(src);
873         int srcR = SkColorGetR(src);
874         int srcG = SkColorGetG(src);
875         int srcB = SkColorGetB(src);
876         __m256i xv_zero = __lasx_xvldi(0);
877 
878         srcA = SkAlpha255To256(srcA);
879         if (width >= 8) {
880             SkASSERT(SkIsAlign4((uintptr_t) dst));
881             while (!SkIsAlign16((uintptr_t) dst)) {
882                 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
883                 mask++;
884                 dst++;
885                 width--;
886             }
887 
888             __m256i *d = reinterpret_cast<__m256i*>(dst);
889             // Set alpha to 0xFF and replicate source eight times in LASX register.
890             unsigned int skpackargb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
891             __m256i src_lasx = __lasx_xvreplgr2vr_w(skpackargb32);
892             // Interleave with zeros to get two sets of eight 16-bit values.
893             src_lasx = __lasx_xvilvl_b(xv_zero, src_lasx);
894             // Set srcA_lasx to contain sixteen copies of srcA, padded with zero.
895             // src_lasx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
896             //           0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
897             __m256i srcA_lasx = __lasx_xvreplgr2vr_h(srcA);
898 
899             while (width >= 8) {
900                 // Load eight destination pixels into dst_lasx.
901                 __m256i dst_lasx = __lasx_xvld(d, 0);
902                 // Load eight 16-bit masks into lower half of mask_lasx.
903                 __m256i mask_lasx = __lasx_xvld(mask, 0);
904                 mask_lasx = (__m256i){mask_lasx[0], 0, mask_lasx[1], 0};
905 
906                 int pack_cmp = __lasx_xbz_v(mask_lasx);
907                 // if mask pixels are not all zero, we will blend the dst pixels
908                 if (pack_cmp != 1) {
909                     // Unpack 8 16bit mask pixels to
910                     // mask_lasx = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
911                     //              m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
912                     //              m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
913                     //              m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)
914                     mask_lasx = __lasx_xvilvl_h(xv_zero, mask_lasx);
915 
916                     // Process 8 32bit dst pixels
917                     __m256i result = blend_lcd16_lasx(src_lasx, dst_lasx, mask_lasx, srcA_lasx);
918                     __lasx_xvst(result, d, 0);
919                 }
920                 d++;
921                 mask += 8;
922                 width -= 8;
923             }
924             dst = reinterpret_cast<SkPMColor*>(d);
925         }
926 
927         while (width > 0) {
928             *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
929             mask++;
930             dst++;
931             width--;
932         }
933     }
934 
935     void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
936                                SkColor src, int width, SkPMColor opaqueDst) {
937         if (width <= 0) {
938             return;
939         }
940 
941         int srcR = SkColorGetR(src);
942         int srcG = SkColorGetG(src);
943         int srcB = SkColorGetB(src);
944         __m256i xv_zero = __lasx_xvldi(0);
945 
946         if (width >= 8) {
947             SkASSERT(SkIsAlign4((uintptr_t) dst));
948             while (!SkIsAlign16((uintptr_t) dst)) {
949                 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
950                 mask++;
951                 dst++;
952                 width--;
953             }
954 
955             __m256i *d = reinterpret_cast<__m256i*>(dst);
956             // Set alpha to 0xFF and replicate source eight times in LASX register.
957             unsigned int sk_pack_argb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
958             __m256i src_lasx = __lasx_xvreplgr2vr_w(sk_pack_argb32);
959             // Set srcA_lasx to contain sixteen copies of srcA, padded with zero.
960             // src_lasx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
961             //           0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
962             src_lasx = __lasx_xvilvl_b(xv_zero, src_lasx);
963 
964             while (width >= 8) {
965                 // Load eight destination pixels into dst_lasx.
966                 __m256i dst_lasx = __lasx_xvld(d, 0);
967                 // Load eight 16-bit masks into lower half of mask_lasx.
968                 __m256i mask_lasx = __lasx_xvld(mask, 0);
969                 mask_lasx = (__m256i){mask_lasx[0], 0, mask_lasx[1], 0};
970 
971                 int32_t pack_cmp = __lasx_xbz_v(mask_lasx);
972                 // if mask pixels are not all zero, we will blend the dst pixels
973                 if (pack_cmp != 1) {
974                     // Unpack 8 16bit mask pixels to
975                     // mask_lasx = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
976                     //              m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
977                     //              m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
978                     //              m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)
979                     mask_lasx = __lasx_xvilvl_h(xv_zero, mask_lasx);
980                     // Process 8 32bit dst pixels
981                     __m256i result = blend_lcd16_opaque_lasx(src_lasx, dst_lasx, mask_lasx);
982                     __lasx_xvst(result, d, 0);
983                 }
984                 d++;
985                 mask += 8;
986                 width -= 8;
987             }
988 
989             dst = reinterpret_cast<SkPMColor*>(d);
990         }
991 
992         while (width > 0) {
993             *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
994             mask++;
995             dst++;
996             width--;
997         }
998     }
999 
1000 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
1001 
1002     // The following (left) shifts cause the top 5 bits of the mask components to
1003     // line up with the corresponding components in an SkPMColor.
1004     // Note that the mask's RGB16 order may differ from the SkPMColor order.
1005     #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
1006     #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
1007     #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
1008 
1009     #if SK_R16x5_R32x5_SHIFT == 0
1010         #define SkPackedR16x5ToUnmaskedR32x5_LSX(x) (x)
1011     #elif SK_R16x5_R32x5_SHIFT > 0
1012         #define SkPackedR16x5ToUnmaskedR32x5_LSX(x) (__lsx_vslli_w(x, SK_R16x5_R32x5_SHIFT))
1013     #else
1014         #define SkPackedR16x5ToUnmaskedR32x5_LSX(x) (__lsx_vsrli_w(x, -SK_R16x5_R32x5_SHIFT))
1015     #endif
1016 
1017     #if SK_G16x5_G32x5_SHIFT == 0
1018         #define SkPackedG16x5ToUnmaskedG32x5_LSX(x) (x)
1019     #elif SK_G16x5_G32x5_SHIFT > 0
1020         #define SkPackedG16x5ToUnmaskedG32x5_LSX(x) (__lsx_vslli_w(x, SK_G16x5_G32x5_SHIFT))
1021     #else
1022         #define SkPackedG16x5ToUnmaskedG32x5_LSX(x) (__lsx_vsrli_w(x, -SK_G16x5_G32x5_SHIFT))
1023     #endif
1024 
1025     #if SK_B16x5_B32x5_SHIFT == 0
1026         #define SkPackedB16x5ToUnmaskedB32x5_LSX(x) (x)
1027     #elif SK_B16x5_B32x5_SHIFT > 0
1028         #define SkPackedB16x5ToUnmaskedB32x5_LSX(x) (__lsx_vslli_w(x, SK_B16x5_B32x5_SHIFT))
1029     #else
1030         #define SkPackedB16x5ToUnmaskedB32x5_LSX(x) (__lsx_vsrli_w(x, -SK_B16x5_B32x5_SHIFT))
1031     #endif
1032 
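    // The 128-bit LSX path is the LoongArch counterpart of the SSE2 code above
    // and, like it, processes four 32-bit pixels per iteration.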
1033     static __m128i blend_lcd16_lsx(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) {
1034         // In the following comments, the components of src, dst and mask are
1035         // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
1036         // by an R, G, B, or A suffix. Components of one of the four pixels that
1037         // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
1038         // example is the blue channel of the second destination pixel. Memory
1039         // layout is shown for an ARGB byte order in a color value.
1040 
1041         // src and srcA store 8-bit values interleaved with zeros.
1042         // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
1043         // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
1044         //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
1045         // mask stores 16-bit values (compressed three channels) interleaved with zeros.
1046         // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
1047         // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
1048         //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
1049 
1050         __m128i v_zero = __lsx_vldi(0);
1051 
1052         // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
1053         // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
1054         __m128i r = __lsx_vand_v(SkPackedR16x5ToUnmaskedR32x5_LSX(mask),
1055                                  __lsx_vreplgr2vr_w(0x1F << SK_R32_SHIFT));
1056 
1057         // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
1058         __m128i g = __lsx_vand_v(SkPackedG16x5ToUnmaskedG32x5_LSX(mask),
1059                                  __lsx_vreplgr2vr_w(0x1F << SK_G32_SHIFT));
1060 
1061         // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
1062         __m128i b = __lsx_vand_v(SkPackedB16x5ToUnmaskedB32x5_LSX(mask),
1063                                  __lsx_vreplgr2vr_w(0x1F << SK_B32_SHIFT));
1064 
1065         // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA
1066         __m128i aMin = __lsx_vmin_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
1067                        __lsx_vmin_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
1068                                     __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
1069         __m128i aMax = __lsx_vmax_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
1070                        __lsx_vmax_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
1071                                     __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
1072         // srcA has been biased to [0-256], so compare srcA against (dstA+1)
1073         __m128i a = __lsx_vmskltz_w(srcA -
1074                                     __lsx_vand_v(
1075                                           __lsx_vadd_w(dst,
1076                                                        __lsx_vreplgr2vr_w(1 << SK_A32_SHIFT)),
1077                                           __lsx_vreplgr2vr_w(SK_A32_MASK)));
1078         // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
1079         a = __lsx_vor_v(__lsx_vand_v(a, aMin), __lsx_vandn_v(a, aMax));
1080 
1081         // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
1082         // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
1083         // 8-bit position
1084         // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
1085         //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
1086         mask = __lsx_vor_v(__lsx_vor_v(a, r), __lsx_vor_v(g, b));
1087 
1088         // Interleave the A,R,G,B mask bytes with zeros, i.e. split the sixteen
1089         // 8-bit values from mask into two sets of eight 16-bit values, each
1090         // padded by a zero byte.
1091         __m128i maskLo, maskHi;
1092         // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
1093         maskLo = __lsx_vilvl_b(v_zero, mask);
1094         // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
1095         maskHi = __lsx_vilvh_b(v_zero, mask);
1096 
1097         // Upscale from 0..31 to 0..32
1098         // (so the divide further down can be done with a simple right shift)
1099         // Right-shift each component by 4 and add the result back to that component,
1100         // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
1101         maskLo = __lsx_vadd_h(maskLo, __lsx_vsrli_h(maskLo, 4));
1102         maskHi = __lsx_vadd_h(maskHi, __lsx_vsrli_h(maskHi, 4));
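        // e.g. a full 5-bit coverage of 31 becomes 31 + (31 >> 4) = 32, while 15
        // stays 15, so full coverage maps to a full weight of 32.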
1103 
1104         // Multiply each component of maskLo and maskHi by srcA
1105         maskLo = __lsx_vmul_h(maskLo, srcA);
1106         maskHi = __lsx_vmul_h(maskHi, srcA);
1107 
1108         // Right shift mask components by 8 (divide by 256)
1109         maskLo = __lsx_vsrli_h(maskLo, 8);
1110         maskHi = __lsx_vsrli_h(maskHi, 8);
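        // maskLo/maskHi now hold the per-channel blend weights in 0..32, already
        // scaled by the source alpha.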
1111 
1112         // Widen the destination pixels the same way, interleaving each byte with zero.
1113         // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
1114         __m128i dstLo = __lsx_vilvl_b(v_zero, dst);
1115         // dstHi = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
1116         __m128i dstHi = __lsx_vilvh_b(v_zero, dst);
1117 
1118         // mask = (src - dst) * mask
1119         maskLo = __lsx_vmul_h(maskLo, __lsx_vsub_h(src, dstLo));
1120         maskHi = __lsx_vmul_h(maskHi, __lsx_vsub_h(src, dstHi));
1121 
1122         // mask = (src - dst) * mask >> 5
1123         maskLo = __lsx_vsrai_h(maskLo, 5);
1124         maskHi = __lsx_vsrai_h(maskHi, 5);
1125 
1126         // Add the scaled difference back to dst (two pixels per register).
1127         // result = dst + ((src - dst) * mask >> 5)
1128         __m128i resultLo = __lsx_vadd_h(dstLo, maskLo);
1129         __m128i resultHi = __lsx_vadd_h(dstHi, maskHi);
1130 
1131         // Pack into 4 32bit dst pixels.
1132         // resultLo and resultHi contain eight 16-bit components (two pixels) each.
1133         // Merge into one LSX register with sixteen 8-bit values (four pixels),
1134         // clamping to 255 if necessary.
1135         __m128i tmpl = __lsx_vsat_hu(resultLo, 7);
1136         __m128i tmph = __lsx_vsat_hu(resultHi, 7);
1137         return __lsx_vpickev_b(tmph, tmpl);
1138     }
1139 
blend_lcd16_opaque_lsx(__m128i & src,__m128i & dst,__m128i & mask)1140     static __m128i blend_lcd16_opaque_lsx(__m128i &src, __m128i &dst, __m128i &mask) {
1141         // In the following comments, the components of src, dst and mask are
1142         // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
1143         // by an R, G, B, or A suffix. Components of one of the four pixels that
1144         // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
1145         // example, is the blue channel of the second destination pixel. Memory
1146         // layout is shown for an ARGB byte order in a color value.
1147 
1148         // src and srcA store 8-bit values interleaved with zeros.
1149         // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
1150         // mask stores 16-bit values (shown as high and low bytes) interleaved with
1151         // zeros
1152         // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
1153         //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
1154 
1155         __m128i v_zero = __lsx_vldi(0);
1156 
1157         // Get the R,G,B of each 16-bit mask pixel; we want all of them in 5 bits.
1158         // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
1159         __m128i r = __lsx_vand_v(SkPackedR16x5ToUnmaskedR32x5_LSX(mask),
1160                                  __lsx_vreplgr2vr_w(0x1F << SK_R32_SHIFT));
1161 
1162         // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
1163         __m128i g = __lsx_vand_v(SkPackedG16x5ToUnmaskedG32x5_LSX(mask),
1164                                  __lsx_vreplgr2vr_w(0x1F << SK_G32_SHIFT));
1165 
1166         // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
1167         __m128i b = __lsx_vand_v(SkPackedB16x5ToUnmaskedB32x5_LSX(mask),
1168                                  __lsx_vreplgr2vr_w(0x1F << SK_B32_SHIFT));
1169 
1170         // a = max(r, g, b) since opaque src alpha uses max of LCD coverages
1171         __m128i a = __lsx_vmax_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
1172                     __lsx_vmax_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
1173                                  __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
1174 
1175         // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
1176         // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
1177         // 8-bit position
1178         // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
1179         //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
1180         mask = __lsx_vor_v(__lsx_vor_v(a, r), __lsx_vor_v(g, b));
1181 
1182         // Interleave the A,R,G,B mask bytes with zeros, i.e. split the sixteen
1183         // 8-bit values from mask into two sets of eight 16-bit values, each
1184         // padded by a zero byte.
1185         __m128i maskLo, maskHi;
1186         // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
1187         maskLo = __lsx_vilvl_b(v_zero, mask);
1188         // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
1189         maskHi = __lsx_vilvh_b(v_zero, mask);
1190 
1191         // Upscale from 0..31 to 0..32
1192         // (so the divide further down can be done with a simple right shift)
1193         // Right-shift each component by 4 and add the result back to that component,
1194         // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
1195         maskLo = __lsx_vadd_h(maskLo, __lsx_vsrli_h(maskLo, 4));
1196         maskHi = __lsx_vadd_h(maskHi, __lsx_vsrli_h(maskHi, 4));
1197 
1198         // Widen the destination pixels the same way, interleaving each byte with zero.
1199         // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
1200         __m128i dstLo = __lsx_vilvl_b(v_zero, dst);
1201         // dstHi = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
1202         __m128i dstHi = __lsx_vilvh_b(v_zero, dst);
1203 
1204         // mask = (src - dst) * mask
1205         maskLo = __lsx_vmul_h(maskLo, __lsx_vsub_h(src, dstLo));
1206         maskHi = __lsx_vmul_h(maskHi, __lsx_vsub_h(src, dstHi));
1207 
1208         // mask = (src - dst) * mask >> 5
1209         maskLo = __lsx_vsrai_h(maskLo, 5);
1210         maskHi = __lsx_vsrai_h(maskHi, 5);
1211 
1212         // Add the scaled difference back to dst (two pixels per register).
1213         // result = dst + ((src - dst) * mask >> 5)
1214         __m128i resultLo = __lsx_vadd_h(dstLo, maskLo);
1215         __m128i resultHi = __lsx_vadd_h(dstHi, maskHi);
1216 
1217         // Merge into one LSX register with sixteen 8-bit values (four pixels),
1218         // clamping to 255 if necessary.
1219         __m128i tmpl = __lsx_vsat_hu(resultLo, 7);
1220         __m128i tmph = __lsx_vsat_hu(resultHi, 7);
1221         return __lsx_vpickev_b(tmph, tmpl);
1222     }
1223 
blit_row_lcd16(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)1224     void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
1225         if (width <= 0) {
1226             return;
1227         }
1228 
1229         int srcA = SkColorGetA(src);
1230         int srcR = SkColorGetR(src);
1231         int srcG = SkColorGetG(src);
1232         int srcB = SkColorGetB(src);
1233         __m128i v_zero = __lsx_vldi(0);
1234 
1235         srcA = SkAlpha255To256(srcA);
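        // SkAlpha255To256() biases alpha into 0..256 so that the ">> 8" inside the
        // blend routines can stand in for a divide by 255.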
1236         if (width >= 4) {
1237             SkASSERT(SkIsAlign4((uintptr_t) dst));
1238             while (!SkIsAlign16((uintptr_t) dst)) {
1239                 *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
1240                 mask++;
1241                 dst++;
1242                 width--;
1243             }
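            // dst is now 16-byte aligned; the loop below blends four pixels per
            // 128-bit LSX load/store.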
1244 
1245             __m128i *d = reinterpret_cast<__m128i*>(dst);
1246             // Set alpha to 0xFF and replicate the source pixel four times in an LSX register.
1247             unsigned int skpackargb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
1248             __m128i src_lsx = __lsx_vreplgr2vr_w(skpackargb32);
1249             // Interleave the low bytes with zeros to get eight 16-bit values (two copies of the source pixel).
1250             src_lsx = __lsx_vilvl_b(v_zero, src_lsx);
1251             // src_lsx = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
1252             // Set srcA_lsx to contain eight 16-bit copies of srcA.
1253             __m128i srcA_lsx = __lsx_vreplgr2vr_h(srcA);
1254 
1255             while (width >= 4) {
1256                 // Load four destination pixels into dst_lsx.
1257                 __m128i dst_lsx = __lsx_vld(d, 0);
1258                 // Load four 16-bit masks into lower half of mask_lsx.
1259                 __m128i mask_lsx = __lsx_vldrepl_d((void *)mask, 0);
1260                 mask_lsx =  __lsx_vilvl_d(v_zero, mask_lsx);
1261 
1262                 int pack_cmp = __lsx_bz_v(mask_lsx);
1263                 // if mask pixels are not all zero, we will blend the dst pixels
1264                 if (pack_cmp != 1) {
1265                     // Unpack 4 16bit mask pixels to
1266                     // mask_lsx = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
1267                     //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
1268                     mask_lsx = __lsx_vilvl_h(v_zero, mask_lsx);
1269 
1270                     // Process the four 32-bit dst pixels
1271                     __m128i result = blend_lcd16_lsx(src_lsx, dst_lsx, mask_lsx, srcA_lsx);
1272                     __lsx_vst(result, d, 0);
1273                 }
1274 
1275                 d++;
1276                 mask += 4;
1277                 width -= 4;
1278             }
1279 
1280             dst = reinterpret_cast<SkPMColor*>(d);
1281         }
1282 
1283         while (width > 0) {
1284             *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
1285             mask++;
1286             dst++;
1287             width--;
1288         }
1289     }
1290 
blit_row_lcd16_opaque(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)1291     void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
1292                                SkColor src, int width, SkPMColor opaqueDst) {
1293         if (width <= 0) {
1294             return;
1295         }
1296 
1297         int srcR = SkColorGetR(src);
1298         int srcG = SkColorGetG(src);
1299         int srcB = SkColorGetB(src);
1300         __m128i v_zero = __lsx_vldi(0);
1301 
1302         if (width >= 4) {
1303             SkASSERT(SkIsAlign4((uintptr_t) dst));
1304             while (!SkIsAlign16((uintptr_t) dst)) {
1305                 *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
1306                 mask++;
1307                 dst++;
1308                 width--;
1309             }
1310 
1311             __m128i *d = reinterpret_cast<__m128i*>(dst);
1312             // Set alpha to 0xFF and replicate source four times in LSX register.
1313             unsigned int sk_pack_argb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
1314             __m128i src_lsx = __lsx_vreplgr2vr_w(sk_pack_argb32);
1315             // Interleave the low bytes with zeros to widen two copies of the source pixel to 16-bit lanes:
1316             // src_lsx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
1317             src_lsx = __lsx_vilvl_b(v_zero, src_lsx);
1318 
1319             while (width >= 4) {
1320                 // Load four destination pixels into dst_lsx.
1321                 __m128i dst_lsx = __lsx_vld(d, 0);
1322                 // Load four 16-bit masks into lower half of mask_lsx.
1323                 __m128i mask_lsx = __lsx_vldrepl_d((void *)(mask), 0);
1324                 mask_lsx =  __lsx_vilvl_d(v_zero, mask_lsx);
1325 
1326                 int pack_cmp = __lsx_bz_v(mask_lsx);
1327                 // if mask pixels are not all zero, we will blend the dst pixels
1328                 if (pack_cmp != 1) {
1329                     // Unpack the 4 16-bit mask pixels into 32-bit lanes.
1330                     mask_lsx = __lsx_vilvl_h(v_zero, mask_lsx);
1331 
1332                     // Process the four 32-bit dst pixels
1333                     __m128i result = blend_lcd16_opaque_lsx(src_lsx, dst_lsx, mask_lsx);
1334                     __lsx_vst(result, d, 0);
1335                 }
1336                 d++;
1337                 mask += 4;
1338                 width -= 4;
1339             }
1340 
1341             dst = reinterpret_cast<SkPMColor*>(d);
1342         }
1343 
1344         while (width > 0) {
1345             *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
1346             mask++;
1347             dst++;
1348             width--;
1349         }
1350     }
1351 
1352 #else
1353 
blit_row_lcd16(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)1354     static inline void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[],
1355                                       SkColor src, int width, SkPMColor) {
1356         int srcA = SkColorGetA(src);
1357         int srcR = SkColorGetR(src);
1358         int srcG = SkColorGetG(src);
1359         int srcB = SkColorGetB(src);
1360 
1361         srcA = SkAlpha255To256(srcA);
1362 
1363         for (int i = 0; i < width; i++) {
1364             dst[i] = blend_lcd16(srcA, srcR, srcG, srcB, dst[i], mask[i]);
1365         }
1366     }
1367 
blit_row_lcd16_opaque(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)1368     static inline void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
1369                                              SkColor src, int width,
1370                                              SkPMColor opaqueDst) {
1371         int srcR = SkColorGetR(src);
1372         int srcG = SkColorGetG(src);
1373         int srcB = SkColorGetB(src);
1374 
1375         for (int i = 0; i < width; i++) {
1376             dst[i] = blend_lcd16_opaque(srcR, srcG, srcB, dst[i], mask[i], opaqueDst);
1377         }
1378     }
1379 
1380 #endif
1381 
blit_color(const SkPixmap & device,const SkMask & mask,const SkIRect & clip,SkColor color)1382 static bool blit_color(const SkPixmap& device,
1383                        const SkMask& mask,
1384                        const SkIRect& clip,
1385                        SkColor color) {
1386     int x = clip.fLeft,
1387         y = clip.fTop;
1388 
1389     if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kA8_Format) {
1390         SkOpts::blit_mask_d32_a8(device.writable_addr32(x,y), device.rowBytes(),
1391                                  (const SkAlpha*)mask.getAddr(x,y), mask.fRowBytes,
1392                                  color, clip.width(), clip.height());
1393         return true;
1394     }
1395 
1396     if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kLCD16_Format) {
1397         auto dstRow  = device.writable_addr32(x,y);
1398         auto maskRow = (const uint16_t*)mask.getAddr(x,y);
1399 
1400         auto blit_row = blit_row_lcd16;
1401         SkPMColor opaqueDst = 0;  // ignored unless opaque
1402 
1403         if (0xff == SkColorGetA(color)) {
1404             blit_row  = blit_row_lcd16_opaque;
1405             opaqueDst = SkPreMultiplyColor(color);
1406         }
1407 
1408         for (int height = clip.height(); height --> 0; ) {
1409             blit_row(dstRow, maskRow, color, clip.width(), opaqueDst);
1410 
1411             dstRow  = (SkPMColor*)     ((      char*) dstRow + device.rowBytes());
1412             maskRow = (const uint16_t*)((const char*)maskRow +  mask.fRowBytes);
1413         }
1414         return true;
1415     }
1416 
1417     return false;
1418 }
1419 
1420 ///////////////////////////////////////////////////////////////////////////////
1421 
SkARGB32_Blit32(const SkPixmap & device,const SkMask & mask,const SkIRect & clip,SkPMColor srcColor)1422 static void SkARGB32_Blit32(const SkPixmap& device, const SkMask& mask,
1423                             const SkIRect& clip, SkPMColor srcColor) {
1424     U8CPU alpha = SkGetPackedA32(srcColor);
1425     unsigned flags = SkBlitRow::kSrcPixelAlpha_Flag32;
1426     if (alpha != 255) {
1427         flags |= SkBlitRow::kGlobalAlpha_Flag32;
1428     }
1429     SkBlitRow::Proc32 proc = SkBlitRow::Factory32(flags);
1430 
1431     int x = clip.fLeft;
1432     int y = clip.fTop;
1433     int width = clip.width();
1434     int height = clip.height();
1435 
1436     SkPMColor* dstRow = device.writable_addr32(x, y);
1437     const SkPMColor* srcRow = reinterpret_cast<const SkPMColor*>(mask.getAddr8(x, y));
1438 
1439     do {
1440         proc(dstRow, srcRow, width, alpha);
1441         dstRow = (SkPMColor*)((char*)dstRow + device.rowBytes());
1442         srcRow = (const SkPMColor*)((const char*)srcRow + mask.fRowBytes);
1443     } while (--height != 0);
1444 }
1445 
1446 //////////////////////////////////////////////////////////////////////////////////////
1447 
SkARGB32_Blitter(const SkPixmap & device,const SkPaint & paint)1448 SkARGB32_Blitter::SkARGB32_Blitter(const SkPixmap& device, const SkPaint& paint)
1449         : INHERITED(device) {
1450     SkColor color = paint.getColor();
1451     fColor = color;
1452 
1453     fSrcA = SkColorGetA(color);
1454     unsigned scale = SkAlpha255To256(fSrcA);
1455     fSrcR = SkAlphaMul(SkColorGetR(color), scale);
1456     fSrcG = SkAlphaMul(SkColorGetG(color), scale);
1457     fSrcB = SkAlphaMul(SkColorGetB(color), scale);
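    // fSrcR/G/B are premultiplied by the bias-corrected alpha, so fPMColor below
    // is a valid premultiplied color.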
1458 
1459     fPMColor = SkPackARGB32(fSrcA, fSrcR, fSrcG, fSrcB);
1460 }
1461 
1462 #if defined _WIN32  // disable warning : local variable used without having been initialized
1463 #pragma warning ( push )
1464 #pragma warning ( disable : 4701 )
1465 #endif
1466 
blitH(int x,int y,int width)1467 void SkARGB32_Blitter::blitH(int x, int y, int width) {
1468     SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());
1469 
1470     uint32_t* device = fDevice.writable_addr32(x, y);
1471     SkBlitRow::Color32(device, width, fPMColor);
1472 }
1473 
blitAntiH(int x,int y,const SkAlpha antialias[],const int16_t runs[])1474 void SkARGB32_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
1475                                  const int16_t runs[]) {
1476     if (fSrcA == 0) {
1477         return;
1478     }
1479 
1480     uint32_t    color = fPMColor;
1481     uint32_t*   device = fDevice.writable_addr32(x, y);
1482     unsigned    opaqueMask = fSrcA; // if fSrcA is 0xFF, then we will catch the fast opaque case
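    // (opaqueMask & aa) == 255 only when both fSrcA and aa are 0xFF, so the
    // memset32 fast path below is reserved for a fully opaque color under full
    // coverage.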
1483 
1484     for (;;) {
1485         int count = runs[0];
1486         SkASSERT(count >= 0);
1487         if (count <= 0) {
1488             return;
1489         }
1490         unsigned aa = antialias[0];
1491         if (aa) {
1492             if ((opaqueMask & aa) == 255) {
1493                 SkOpts::memset32(device, color, count);
1494             } else {
1495                 uint32_t sc = SkAlphaMulQ(color, SkAlpha255To256(aa));
1496                 SkBlitRow::Color32(device, count, sc);
1497             }
1498         }
1499         runs += count;
1500         antialias += count;
1501         device += count;
1502     }
1503 }
1504 
blitAntiH2(int x,int y,U8CPU a0,U8CPU a1)1505 void SkARGB32_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
1506     uint32_t* device = fDevice.writable_addr32(x, y);
1507     SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
1508 
1509     device[0] = SkBlendARGB32(fPMColor, device[0], a0);
1510     device[1] = SkBlendARGB32(fPMColor, device[1], a1);
1511 }
1512 
blitAntiV2(int x,int y,U8CPU a0,U8CPU a1)1513 void SkARGB32_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
1514     uint32_t* device = fDevice.writable_addr32(x, y);
1515     SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
1516 
1517     device[0] = SkBlendARGB32(fPMColor, device[0], a0);
1518     device = (uint32_t*)((char*)device + fDevice.rowBytes());
1519     device[0] = SkBlendARGB32(fPMColor, device[0], a1);
1520 }
1521 
1522 //////////////////////////////////////////////////////////////////////////////////////
1523 
1524 #define solid_8_pixels(mask, dst, color)    \
1525     do {                                    \
1526         if (mask & 0x80) dst[0] = color;    \
1527         if (mask & 0x40) dst[1] = color;    \
1528         if (mask & 0x20) dst[2] = color;    \
1529         if (mask & 0x10) dst[3] = color;    \
1530         if (mask & 0x08) dst[4] = color;    \
1531         if (mask & 0x04) dst[5] = color;    \
1532         if (mask & 0x02) dst[6] = color;    \
1533         if (mask & 0x01) dst[7] = color;    \
1534     } while (0)
1535 
1536 #define SK_BLITBWMASK_NAME                  SkARGB32_BlitBW
1537 #define SK_BLITBWMASK_ARGS                  , SkPMColor color
1538 #define SK_BLITBWMASK_BLIT8(mask, dst)      solid_8_pixels(mask, dst, color)
1539 #define SK_BLITBWMASK_GETADDR               writable_addr32
1540 #define SK_BLITBWMASK_DEVTYPE               uint32_t
1541 #include "src/core/SkBlitBWMaskTemplate.h"
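// If SkBlitBWMaskTemplate.h behaves as its parameter macros suggest, the include
// above stamps out SkARGB32_BlitBW(), walking the 1-bit mask one byte (8 pixels)
// at a time and calling solid_8_pixels() for each mask byte.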
1542 
1543 #define blend_8_pixels(mask, dst, sc, dst_scale)                            \
1544     do {                                                                    \
1545         if (mask & 0x80) { dst[0] = sc + SkAlphaMulQ(dst[0], dst_scale); }  \
1546         if (mask & 0x40) { dst[1] = sc + SkAlphaMulQ(dst[1], dst_scale); }  \
1547         if (mask & 0x20) { dst[2] = sc + SkAlphaMulQ(dst[2], dst_scale); }  \
1548         if (mask & 0x10) { dst[3] = sc + SkAlphaMulQ(dst[3], dst_scale); }  \
1549         if (mask & 0x08) { dst[4] = sc + SkAlphaMulQ(dst[4], dst_scale); }  \
1550         if (mask & 0x04) { dst[5] = sc + SkAlphaMulQ(dst[5], dst_scale); }  \
1551         if (mask & 0x02) { dst[6] = sc + SkAlphaMulQ(dst[6], dst_scale); }  \
1552         if (mask & 0x01) { dst[7] = sc + SkAlphaMulQ(dst[7], dst_scale); }  \
1553     } while (0)
1554 
1555 #define SK_BLITBWMASK_NAME                  SkARGB32_BlendBW
1556 #define SK_BLITBWMASK_ARGS                  , uint32_t sc, unsigned dst_scale
1557 #define SK_BLITBWMASK_BLIT8(mask, dst)      blend_8_pixels(mask, dst, sc, dst_scale)
1558 #define SK_BLITBWMASK_GETADDR               writable_addr32
1559 #define SK_BLITBWMASK_DEVTYPE               uint32_t
1560 #include "src/core/SkBlitBWMaskTemplate.h"
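// Likewise, this second include generates SkARGB32_BlendBW(), blending 8 pixels
// per mask byte via blend_8_pixels().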
1561 
blitMask(const SkMask & mask,const SkIRect & clip)1562 void SkARGB32_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) {
1563     SkASSERT(mask.fBounds.contains(clip));
1564     SkASSERT(fSrcA != 0xFF);
1565 
1566     if (fSrcA == 0) {
1567         return;
1568     }
1569 
1570     if (blit_color(fDevice, mask, clip, fColor)) {
1571         return;
1572     }
1573 
1574     switch (mask.fFormat) {
1575         case SkMask::kBW_Format:
1576             SkARGB32_BlendBW(fDevice, mask, clip, fPMColor, SkAlpha255To256(255 - fSrcA));
1577             break;
1578         case SkMask::kARGB32_Format:
1579             SkARGB32_Blit32(fDevice, mask, clip, fPMColor);
1580             break;
1581         default:
1582             SK_ABORT("Mask format not handled.");
1583     }
1584 }
1585 
blitMask(const SkMask & mask,const SkIRect & clip)1586 void SkARGB32_Opaque_Blitter::blitMask(const SkMask& mask,
1587                                        const SkIRect& clip) {
1588     SkASSERT(mask.fBounds.contains(clip));
1589 
1590     if (blit_color(fDevice, mask, clip, fColor)) {
1591         return;
1592     }
1593 
1594     switch (mask.fFormat) {
1595         case SkMask::kBW_Format:
1596             SkARGB32_BlitBW(fDevice, mask, clip, fPMColor);
1597             break;
1598         case SkMask::kARGB32_Format:
1599             SkARGB32_Blit32(fDevice, mask, clip, fPMColor);
1600             break;
1601         default:
1602             SK_ABORT("Mask format not handled.");
1603     }
1604 }
1605 
blitAntiH2(int x,int y,U8CPU a0,U8CPU a1)1606 void SkARGB32_Opaque_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
1607     uint32_t* device = fDevice.writable_addr32(x, y);
1608     SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
1609 
1610     device[0] = SkFastFourByteInterp(fPMColor, device[0], a0);
1611     device[1] = SkFastFourByteInterp(fPMColor, device[1], a1);
1612 }
1613 
blitAntiV2(int x,int y,U8CPU a0,U8CPU a1)1614 void SkARGB32_Opaque_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
1615     uint32_t* device = fDevice.writable_addr32(x, y);
1616     SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
1617 
1618     device[0] = SkFastFourByteInterp(fPMColor, device[0], a0);
1619     device = (uint32_t*)((char*)device + fDevice.rowBytes());
1620     device[0] = SkFastFourByteInterp(fPMColor, device[0], a1);
1621 }
1622 
1623 ///////////////////////////////////////////////////////////////////////////////
1624 
blitV(int x,int y,int height,SkAlpha alpha)1625 void SkARGB32_Blitter::blitV(int x, int y, int height, SkAlpha alpha) {
1626     if (alpha == 0 || fSrcA == 0) {
1627         return;
1628     }
1629 
1630     uint32_t* device = fDevice.writable_addr32(x, y);
1631     uint32_t  color = fPMColor;
1632 
1633     if (alpha != 255) {
1634         color = SkAlphaMulQ(color, SkAlpha255To256(alpha));
1635     }
1636 
1637     unsigned dst_scale = SkAlpha255To256(255 - SkGetPackedA32(color));
1638     size_t rowBytes = fDevice.rowBytes();
1639     while (--height >= 0) {
1640         device[0] = color + SkAlphaMulQ(device[0], dst_scale);
1641         device = (uint32_t*)((char*)device + rowBytes);
1642     }
1643 }
1644 
blitRect(int x,int y,int width,int height)1645 void SkARGB32_Blitter::blitRect(int x, int y, int width, int height) {
1646     SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width() && y + height <= fDevice.height());
1647 
1648     if (fSrcA == 0) {
1649         return;
1650     }
1651 
1652     uint32_t*   device = fDevice.writable_addr32(x, y);
1653     uint32_t    color = fPMColor;
1654     size_t      rowBytes = fDevice.rowBytes();
1655 
1656     if (SkGetPackedA32(fPMColor) == 0xFF) {
1657         SkOpts::rect_memset32(device, color, width, rowBytes, height);
1658     } else {
1659         while (height --> 0) {
1660             SkBlitRow::Color32(device, width, color);
1661             device = (uint32_t*)((char*)device + rowBytes);
1662         }
1663     }
1664 }
1665 
1666 #if defined _WIN32
1667 #pragma warning ( pop )
1668 #endif
1669 
1670 ///////////////////////////////////////////////////////////////////////
1671 
blitAntiH(int x,int y,const SkAlpha antialias[],const int16_t runs[])1672 void SkARGB32_Black_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
1673                                        const int16_t runs[]) {
1674     uint32_t*   device = fDevice.writable_addr32(x, y);
1675     SkPMColor   black = (SkPMColor)(SK_A32_MASK << SK_A32_SHIFT);
1676 
1677     for (;;) {
1678         int count = runs[0];
1679         SkASSERT(count >= 0);
1680         if (count <= 0) {
1681             return;
1682         }
1683         unsigned aa = antialias[0];
1684         if (aa) {
1685             if (aa == 255) {
1686                 SkOpts::memset32(device, black, count);
1687             } else {
1688                 SkPMColor src = aa << SK_A32_SHIFT;
1689                 unsigned dst_scale = 256 - aa;
1690                 int n = count;
1691                 do {
1692                     --n;
1693                     device[n] = src + SkAlphaMulQ(device[n], dst_scale);
1694                 } while (n > 0);
1695             }
1696         }
1697         runs += count;
1698         antialias += count;
1699         device += count;
1700     }
1701 }
1702 
blitAntiH2(int x,int y,U8CPU a0,U8CPU a1)1703 void SkARGB32_Black_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
1704     uint32_t* device = fDevice.writable_addr32(x, y);
1705     SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
1706 
1707     device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0);
1708     device[1] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[1], 256 - a1);
1709 }
1710 
blitAntiV2(int x,int y,U8CPU a0,U8CPU a1)1711 void SkARGB32_Black_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
1712     uint32_t* device = fDevice.writable_addr32(x, y);
1713     SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
1714 
1715     device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0);
1716     device = (uint32_t*)((char*)device + fDevice.rowBytes());
1717     device[0] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a1);
1718 }
1719 
1720 ///////////////////////////////////////////////////////////////////////////////
1721 
SkARGB32_Shader_Blitter(const SkPixmap & device,const SkPaint & paint,SkShaderBase::Context * shaderContext)1722 SkARGB32_Shader_Blitter::SkARGB32_Shader_Blitter(const SkPixmap& device,
1723         const SkPaint& paint, SkShaderBase::Context* shaderContext)
1724     : INHERITED(device, paint, shaderContext)
1725 {
1726     fBuffer = (SkPMColor*)sk_malloc_throw(device.width() * (sizeof(SkPMColor)));
1727 
1728     SkASSERT(paint.isSrcOver());
1729 
1730     int flags = 0;
1731     if (!(shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) {
1732         flags |= SkBlitRow::kSrcPixelAlpha_Flag32;
1733     }
1734     // we call this on the output from the shader
1735     fProc32 = SkBlitRow::Factory32(flags);
1736     // we call this on the output from the shader + alpha from the aa buffer
1737     fProc32Blend = SkBlitRow::Factory32(flags | SkBlitRow::kGlobalAlpha_Flag32);
1738 
1739     fShadeDirectlyIntoDevice =
1740             SkToBool(shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag);
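    // With opaque shader output, src-over amounts to copying the shader's pixels,
    // so spans can be shaded straight into the device without a blend pass.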
1741 }
1742 
~SkARGB32_Shader_Blitter()1743 SkARGB32_Shader_Blitter::~SkARGB32_Shader_Blitter() {
1744     sk_free(fBuffer);
1745 }
1746 
blitH(int x,int y,int width)1747 void SkARGB32_Shader_Blitter::blitH(int x, int y, int width) {
1748     SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());
1749 
1750     uint32_t* device = fDevice.writable_addr32(x, y);
1751 
1752     if (fShadeDirectlyIntoDevice) {
1753         fShaderContext->shadeSpan(x, y, device, width);
1754     } else {
1755         SkPMColor*  span = fBuffer;
1756         fShaderContext->shadeSpan(x, y, span, width);
1757         fProc32(device, span, width, 255);
1758     }
1759 }
1760 
blitRect(int x,int y,int width,int height)1761 void SkARGB32_Shader_Blitter::blitRect(int x, int y, int width, int height) {
1762     SkASSERT(x >= 0 && y >= 0 &&
1763              x + width <= fDevice.width() && y + height <= fDevice.height());
1764 
1765     uint32_t*  device = fDevice.writable_addr32(x, y);
1766     size_t     deviceRB = fDevice.rowBytes();
1767     auto*      shaderContext = fShaderContext;
1768     SkPMColor* span = fBuffer;
1769 
1770     if (fShadeDirectlyIntoDevice) {
1771         do {
1772             shaderContext->shadeSpan(x, y, device, width);
1773             y += 1;
1774             device = (uint32_t*)((char*)device + deviceRB);
1775         } while (--height > 0);
1776     } else {
1777         SkBlitRow::Proc32 proc = fProc32;
1778         do {
1779             shaderContext->shadeSpan(x, y, span, width);
1780             proc(device, span, width, 255);
1781             y += 1;
1782             device = (uint32_t*)((char*)device + deviceRB);
1783         } while (--height > 0);
1784     }
1785 }
1786 
blitAntiH(int x,int y,const SkAlpha antialias[],const int16_t runs[])1787 void SkARGB32_Shader_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
1788                                         const int16_t runs[]) {
1789     SkPMColor* span = fBuffer;
1790     uint32_t*  device = fDevice.writable_addr32(x, y);
1791     auto*      shaderContext = fShaderContext;
1792 
1793     if (fShadeDirectlyIntoDevice || (shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) {
1794         for (;;) {
1795             int count = *runs;
1796             if (count <= 0) {
1797                 break;
1798             }
1799             int aa = *antialias;
1800             if (aa) {
1801                 if (aa == 255) {
1802                     // cool, have the shader draw right into the device
1803                     shaderContext->shadeSpan(x, y, device, count);
1804                 } else {
1805                     shaderContext->shadeSpan(x, y, span, count);
1806                     fProc32Blend(device, span, count, aa);
1807                 }
1808             }
1809             device += count;
1810             runs += count;
1811             antialias += count;
1812             x += count;
1813         }
1814     } else {
1815         for (;;) {
1816             int count = *runs;
1817             if (count <= 0) {
1818                 break;
1819             }
1820             int aa = *antialias;
1821             if (aa) {
1822                 shaderContext->shadeSpan(x, y, span, count);
1823                 if (aa == 255) {
1824                     fProc32(device, span, count, 255);
1825                 } else {
1826                     fProc32Blend(device, span, count, aa);
1827                 }
1828             }
1829             device += count;
1830             runs += count;
1831             antialias += count;
1832             x += count;
1833         }
1834     }
1835 }
1836 
1837 using U32  = skvx::Vec< 4, uint32_t>;
1838 using U8x4 = skvx::Vec<16, uint8_t>;
1839 using U8   = skvx::Vec< 4, uint8_t>;
1840 
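// drive() applies a per-pixel blend kernel to four pixels at a time using skvx
// vectors (splatting each coverage byte across its pixel's four channels), then
// finishes any remainder pixel by pixel.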
drive(SkPMColor * dst,const SkPMColor * src,const uint8_t * cov,int n,U8x4 (* kernel)(U8x4,U8x4,U8x4))1841 static void drive(SkPMColor* dst, const SkPMColor* src, const uint8_t* cov, int n,
1842                   U8x4 (*kernel)(U8x4,U8x4,U8x4)) {
1843 
1844     auto apply = [kernel](U32 dst, U32 src, U8 cov) -> U32 {
1845         U8x4 cov_splat = skvx::shuffle<0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3>(cov);
1846         return sk_bit_cast<U32>(kernel(sk_bit_cast<U8x4>(dst),
1847                                        sk_bit_cast<U8x4>(src),
1848                                        cov_splat));
1849     };
1850     while (n >= 4) {
1851         apply(U32::Load(dst), U32::Load(src), U8::Load(cov)).store(dst);
1852         dst += 4;
1853         src += 4;
1854         cov += 4;
1855         n   -= 4;
1856     }
1857     while (n --> 0) {
1858         *dst = apply(U32{*dst}, U32{*src}, U8{*cov})[0];
1859         dst++;
1860         src++;
1861         cov++;
1862     }
1863 }
1864 
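// blend_row_A8: coverage-modulated src-over. The kernel scales src by the A8
// coverage, then composites it over dst using the scaled source's alpha:
//   r = s*c/255 + d*(255 - alpha(s*c/255))/255,  with approx_scale(x, y) ~= x*y/255.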
blend_row_A8(SkPMColor * dst,const void * mask,const SkPMColor * src,int n)1865 static void blend_row_A8(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) {
1866     auto cov = (const uint8_t*)mask;
1867     drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) {
1868         U8x4 s_aa  = skvx::approx_scale(s, c),
1869              alpha = skvx::shuffle<3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15>(s_aa);
1870         return s_aa + skvx::approx_scale(d, 255 - alpha);
1871     });
1872 }
1873 
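// blend_row_A8_opaque: with opaque shader output, src-over reduces to a plain
// lerp between src and dst by the A8 coverage:
//   r = (s*c + d*(255 - c)) / 255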
blend_row_A8_opaque(SkPMColor * dst,const void * mask,const SkPMColor * src,int n)1874 static void blend_row_A8_opaque(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) {
1875     auto cov = (const uint8_t*)mask;
1876     drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) {
1877         return skvx::div255( skvx::cast<uint16_t>(s) * skvx::cast<uint16_t>(  c  )
1878                            + skvx::cast<uint16_t>(d) * skvx::cast<uint16_t>(255-c));
1879     });
1880 }
1881 
blend_row_lcd16(SkPMColor * dst,const void * vmask,const SkPMColor * src,int n)1882 static void blend_row_lcd16(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
1883     auto src_alpha_blend = [](int s, int d, int sa, int m) {
1884         return d + SkAlphaMul(s - SkAlphaMul(sa, d), m);
1885     };
1886 
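    // src_alpha_blend() is per-channel src-over gated by coverage m:
    //   d + m*(s - sa*d), with SkAlphaMul() doing the (x * n) >> 8 scaling.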
1887     auto upscale_31_to_255 = [](int v) {
1888         return (v << 3) | (v >> 2);
1889     };
1890 
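    // upscale_31_to_255() replicates the top bits into the low bits,
    // e.g. 31 -> (31 << 3) | (31 >> 2) = 255 and 16 -> 132, so 5-bit coverage
    // spans the full 8-bit range.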
1891     auto mask = (const uint16_t*)vmask;
1892     for (int i = 0; i < n; ++i) {
1893         uint16_t m = mask[i];
1894         if (0 == m) {
1895             continue;
1896         }
1897 
1898         SkPMColor s = src[i];
1899         SkPMColor d = dst[i];
1900 
1901         int srcA = SkGetPackedA32(s);
1902         int srcR = SkGetPackedR32(s);
1903         int srcG = SkGetPackedG32(s);
1904         int srcB = SkGetPackedB32(s);
1905 
1906         srcA += srcA >> 7;
1907 
1908         // We're ignoring the least significant bit of the green coverage channel here.
1909         int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5);
1910         int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
1911         int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);
1912 
1913         // Scale up to 8-bit coverage to work with SkAlphaMul() in src_alpha_blend().
1914         maskR = upscale_31_to_255(maskR);
1915         maskG = upscale_31_to_255(maskG);
1916         maskB = upscale_31_to_255(maskB);
1917 
1918         // This LCD blit routine only works if the destination is opaque.
1919         dst[i] = SkPackARGB32(0xFF,
1920                               src_alpha_blend(srcR, SkGetPackedR32(d), srcA, maskR),
1921                               src_alpha_blend(srcG, SkGetPackedG32(d), srcA, maskG),
1922                               src_alpha_blend(srcB, SkGetPackedB32(d), srcA, maskB));
1923     }
1924 }
1925 
blend_row_LCD16_opaque(SkPMColor * dst,const void * vmask,const SkPMColor * src,int n)1926 static void blend_row_LCD16_opaque(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
1927     auto mask = (const uint16_t*)vmask;
1928 
1929     for (int i = 0; i < n; ++i) {
1930         uint16_t m = mask[i];
1931         if (0 == m) {
1932             continue;
1933         }
1934 
1935         SkPMColor s = src[i];
1936         SkPMColor d = dst[i];
1937 
1938         int srcR = SkGetPackedR32(s);
1939         int srcG = SkGetPackedG32(s);
1940         int srcB = SkGetPackedB32(s);
1941 
1942         // We're ignoring the least significant bit of the green coverage channel here.
1943         int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5);
1944         int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
1945         int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);
1946 
1947         // Now upscale them to 0..32, so we can use blend_32.
1948         maskR = upscale_31_to_32(maskR);
1949         maskG = upscale_31_to_32(maskG);
1950         maskB = upscale_31_to_32(maskB);
1951 
1952         // This LCD blit routine only works if the destination is opaque.
1953         dst[i] = SkPackARGB32(0xFF,
1954                               blend_32(srcR, SkGetPackedR32(d), maskR),
1955                               blend_32(srcG, SkGetPackedG32(d), maskG),
1956                               blend_32(srcB, SkGetPackedB32(d), maskB));
1957     }
1958 }
1959 
blitMask(const SkMask & mask,const SkIRect & clip)1960 void SkARGB32_Shader_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) {
1961     SkASSERT(mask.fBounds.contains(clip));
1962 
1963     void (*blend_row)(SkPMColor*, const void* mask, const SkPMColor*, int) = nullptr;
1964 
1965     bool opaque = (fShaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag);
1966 
1967     if (mask.fFormat == SkMask::kA8_Format && opaque) {
1968         blend_row = blend_row_A8_opaque;
1969     } else if (mask.fFormat == SkMask::kA8_Format) {
1970         blend_row = blend_row_A8;
1971     } else if (mask.fFormat == SkMask::kLCD16_Format && opaque) {
1972         blend_row = blend_row_LCD16_opaque;
1973     } else if (mask.fFormat == SkMask::kLCD16_Format) {
1974         blend_row = blend_row_lcd16;
1975     } else {
1976         this->INHERITED::blitMask(mask, clip);
1977         return;
1978     }
1979 
1980     const int x = clip.fLeft;
1981     const int width = clip.width();
1982     int y = clip.fTop;
1983     int height = clip.height();
1984 
1985     char* dstRow = (char*)fDevice.writable_addr32(x, y);
1986     const size_t dstRB = fDevice.rowBytes();
1987     const uint8_t* maskRow = (const uint8_t*)mask.getAddr(x, y);
1988     const size_t maskRB = mask.fRowBytes;
1989 
1990     SkPMColor* span = fBuffer;
1991     SkASSERT(blend_row);
1992     do {
1993         fShaderContext->shadeSpan(x, y, span, width);
1994         blend_row(reinterpret_cast<SkPMColor*>(dstRow), maskRow, span, width);
1995         dstRow += dstRB;
1996         maskRow += maskRB;
1997         y += 1;
1998     } while (--height > 0);
1999 }
2000 
blitV(int x,int y,int height,SkAlpha alpha)2001 void SkARGB32_Shader_Blitter::blitV(int x, int y, int height, SkAlpha alpha) {
2002     SkASSERT(x >= 0 && y >= 0 && y + height <= fDevice.height());
2003 
2004     uint32_t* device = fDevice.writable_addr32(x, y);
2005     size_t    deviceRB = fDevice.rowBytes();
2006 
2007     if (fShadeDirectlyIntoDevice) {
2008         if (255 == alpha) {
2009             do {
2010                 fShaderContext->shadeSpan(x, y, device, 1);
2011                 y += 1;
2012                 device = (uint32_t*)((char*)device + deviceRB);
2013             } while (--height > 0);
2014         } else {
2015             do {
2016                 SkPMColor c;
2017                 fShaderContext->shadeSpan(x, y, &c, 1);
2018                 *device = SkFourByteInterp(c, *device, alpha);
2019                 y += 1;
2020                 device = (uint32_t*)((char*)device + deviceRB);
2021             } while (--height > 0);
2022         }
2023     } else {
2024         SkPMColor* span = fBuffer;
2025         SkBlitRow::Proc32 proc = (255 == alpha) ? fProc32 : fProc32Blend;
2026         do {
2027             fShaderContext->shadeSpan(x, y, span, 1);
2028             proc(device, span, 1, alpha);
2029             y += 1;
2030             device = (uint32_t*)((char*)device + deviceRB);
2031         } while (--height > 0);
2032     }
2033 }
2034