1 /*
2 * Copyright 2006 The Android Open Source Project
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "include/core/SkColor.h"
9 #include "include/core/SkColorPriv.h"
10 #include "include/core/SkColorType.h"
11 #include "include/core/SkPaint.h"
12 #include "include/core/SkPixmap.h"
13 #include "include/core/SkRect.h"
14 #include "include/core/SkTypes.h"
15 #include "include/private/SkColorData.h"
16 #include "include/private/base/SkAlign.h"
17 #include "include/private/base/SkCPUTypes.h"
18 #include "include/private/base/SkDebug.h"
19 #include "include/private/base/SkMalloc.h"
20 #include "include/private/base/SkTo.h"
21 #include "src/base/SkUtils.h"
22 #include "src/base/SkVx.h"
23 #include "src/core/SkBlitMask.h"
24 #include "src/core/SkBlitRow.h"
25 #include "src/core/SkCoreBlitters.h"
26 #include "src/core/SkMask.h"
27 #include "src/core/SkMemset.h"
28 #include "src/shaders/SkShaderBase.h"
29
30 #include <algorithm>
31 #include <cstddef>
32 #include <cstdint>
33
upscale_31_to_32(int value)34 static inline int upscale_31_to_32(int value) {
35 SkASSERT((unsigned)value <= 31);
36 return value + (value >> 4);
37 }
38
blend_32(int src,int dst,int scale)39 static inline int blend_32(int src, int dst, int scale) {
40 SkASSERT((unsigned)src <= 0xFF);
41 SkASSERT((unsigned)dst <= 0xFF);
42 SkASSERT((unsigned)scale <= 32);
43 return dst + ((src - dst) * scale >> 5);
44 }
45
blend_lcd16(int srcA,int srcR,int srcG,int srcB,SkPMColor dst,uint16_t mask)46 static inline SkPMColor blend_lcd16(int srcA, int srcR, int srcG, int srcB,
47 SkPMColor dst, uint16_t mask) {
48 if (mask == 0) {
49 return dst;
50 }
51
52 /* We want all of these in 5bits, hence the shifts in case one of them
53 * (green) is 6bits.
54 */
55 int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
56 int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
57 int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
58
59 // Now upscale them to 0..32, so we can use blend32
60 maskR = upscale_31_to_32(maskR);
61 maskG = upscale_31_to_32(maskG);
62 maskB = upscale_31_to_32(maskB);
63
64 // srcA has been upscaled to 256 before passed into this function
65 maskR = maskR * srcA >> 8;
66 maskG = maskG * srcA >> 8;
67 maskB = maskB * srcA >> 8;
68
69 int dstA = SkGetPackedA32(dst);
70 int dstR = SkGetPackedR32(dst);
71 int dstG = SkGetPackedG32(dst);
72 int dstB = SkGetPackedB32(dst);
73
74 // Subtract 1 from srcA to bring it back to [0-255] to compare against dstA, alpha needs to
75 // use either the min or the max of the LCD coverages. See https:/skbug.com/40037823
76 int maskA = (srcA-1) < dstA ? std::min(maskR, std::min(maskG, maskB))
77 : std::max(maskR, std::max(maskG, maskB));
78
79 return SkPackARGB32(blend_32(0xFF, dstA, maskA),
80 blend_32(srcR, dstR, maskR),
81 blend_32(srcG, dstG, maskG),
82 blend_32(srcB, dstB, maskB));
83 }
84
blend_lcd16_opaque(int srcR,int srcG,int srcB,SkPMColor dst,uint16_t mask,SkPMColor opaqueDst)85 static inline SkPMColor blend_lcd16_opaque(int srcR, int srcG, int srcB,
86 SkPMColor dst, uint16_t mask,
87 SkPMColor opaqueDst) {
88 if (mask == 0) {
89 return dst;
90 }
91
92 if (0xFFFF == mask) {
93 return opaqueDst;
94 }
95
96 /* We want all of these in 5bits, hence the shifts in case one of them
97 * (green) is 6bits.
98 */
99 int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
100 int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
101 int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
102
103 // Now upscale them to 0..32, so we can use blend32
104 maskR = upscale_31_to_32(maskR);
105 maskG = upscale_31_to_32(maskG);
106 maskB = upscale_31_to_32(maskB);
107
108 int dstA = SkGetPackedA32(dst);
109 int dstR = SkGetPackedR32(dst);
110 int dstG = SkGetPackedG32(dst);
111 int dstB = SkGetPackedB32(dst);
112
113 // Opaque src alpha always uses the max of the LCD coverages.
114 int maskA = std::max(maskR, std::max(maskG, maskB));
115
116 // LCD blitting is only supported if the dst is known/required
117 // to be opaque
118 return SkPackARGB32(blend_32(0xFF, dstA, maskA),
119 blend_32(srcR, dstR, maskR),
120 blend_32(srcG, dstG, maskG),
121 blend_32(srcB, dstB, maskB));
122 }
123
124
125 // TODO: rewrite at least the SSE code here. It's miserable.
126
127 #if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
128 #include <emmintrin.h>
129
130 // The following (left) shifts cause the top 5 bits of the mask components to
131 // line up with the corresponding components in an SkPMColor.
132 // Note that the mask's RGB16 order may differ from the SkPMColor order.
133 #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
134 #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
135 #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
136
137 #if SK_R16x5_R32x5_SHIFT == 0
138 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (x)
139 #elif SK_R16x5_R32x5_SHIFT > 0
140 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_slli_epi32(x, SK_R16x5_R32x5_SHIFT))
141 #else
142 #define SkPackedR16x5ToUnmaskedR32x5_SSE2(x) (_mm_srli_epi32(x, -SK_R16x5_R32x5_SHIFT))
143 #endif
144
145 #if SK_G16x5_G32x5_SHIFT == 0
146 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (x)
147 #elif SK_G16x5_G32x5_SHIFT > 0
148 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_slli_epi32(x, SK_G16x5_G32x5_SHIFT))
149 #else
150 #define SkPackedG16x5ToUnmaskedG32x5_SSE2(x) (_mm_srli_epi32(x, -SK_G16x5_G32x5_SHIFT))
151 #endif
152
153 #if SK_B16x5_B32x5_SHIFT == 0
154 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (x)
155 #elif SK_B16x5_B32x5_SHIFT > 0
156 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_slli_epi32(x, SK_B16x5_B32x5_SHIFT))
157 #else
158 #define SkPackedB16x5ToUnmaskedB32x5_SSE2(x) (_mm_srli_epi32(x, -SK_B16x5_B32x5_SHIFT))
159 #endif
160
// SSE2 version of blend_lcd16(): blends four premultiplied dst pixels against
// a replicated src color using four 16-bit LCD coverage masks. The caller
// pre-expands src (ARGB interleaved with zeros, alpha forced to 0xFF), srcA
// (eight 16-bit copies of the [0, 256]-biased source alpha), and mask (four
// 16-bit masks zero-extended to 32 bits) — see blit_row_lcd16() below.
static __m128i blend_lcd16_sse2(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix. Components of one of the four pixels that
    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    // example is the blue channel of the second destination pixel. Memory
    // layout is shown for an ARGB byte order in a color value.

    // src and srcA store 8-bit values interleaved with zeros.
    // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));

    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));

    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));

    // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA.
    // The shifts move each coverage channel into the alpha byte position so
    // the byte-wise min/max compares like with like.
    __m128i aMin = _mm_min_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
                                _mm_min_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                             _mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));
    __m128i aMax = _mm_max_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
                                _mm_max_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                             _mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));
    // srcA has been biased to [0-256], so compare srcA against (dstA+1).
    // The compare yields an all-ones lane where srcA < dstA+1, i.e. srcA <= dstA.
    __m128i a = _mm_cmplt_epi32(srcA,
                                _mm_and_si128(
                                    _mm_add_epi32(dst, _mm_set1_epi32(1 << SK_A32_SHIFT)),
                                    _mm_set1_epi32(SK_A32_MASK)));
    // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
    a = _mm_or_si128(_mm_and_si128(a, aMin), _mm_andnot_si128(a, aMax));

    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    // 8-bit position
    // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
    //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
    mask = _mm_or_si128(_mm_or_si128(a, r), _mm_or_si128(g, b));

    // Interleave R,G,B into the lower byte of word.
    // i.e. split the sixteen 8-bit values from mask into two sets of eight
    // 16-bit values, padded by zero.
    __m128i maskLo, maskHi;
    // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

    // Upscale from 0..31 to 0..32
    // (allows to replace division by left-shift further down)
    // Left-shift each component by 4 and add the result back to that component,
    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

    // Multiply each component of maskLo and maskHi by srcA
    maskLo = _mm_mullo_epi16(maskLo, srcA);
    maskHi = _mm_mullo_epi16(maskHi, srcA);

    // Right shift mask components by 8 (divide by 256)
    maskLo = _mm_srli_epi16(maskLo, 8);
    maskHi = _mm_srli_epi16(maskHi, 8);

    // Interleave R,G,B into the lower byte of the word
    // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    // dstHi = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

    // mask = (src - dst) * mask
    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

    // mask = (src - dst) * mask >> 5  (arithmetic shift: the product can be negative)
    maskLo = _mm_srai_epi16(maskLo, 5);
    maskHi = _mm_srai_epi16(maskHi, 5);

    // Add two pixels into result.
    // result = dst + ((src - dst) * mask >> 5)
    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

    // Pack into 4 32bit dst pixels.
    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    // Merge into one SSE register with sixteen 8-bit values (four pixels),
    // clamping to 255 if necessary.
    return _mm_packus_epi16(resultLo, resultHi);
}
262
// SSE2 version of blend_lcd16_opaque(): blends four premultiplied dst pixels
// against a replicated opaque src color using four 16-bit LCD coverage masks.
// Identical to blend_lcd16_sse2() except there is no source alpha to fold in,
// and the alpha coverage is always the per-pixel max of R/G/B coverage.
static __m128i blend_lcd16_opaque_sse2(__m128i &src, __m128i &dst, __m128i &mask) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix. Components of one of the four pixels that
    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    // example is the blue channel of the second destination pixel. Memory
    // layout is shown for an ARGB byte order in a color value.

    // src and srcA store 8-bit values interleaved with zeros.
    // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    // mask stores 16-bit values (shown as high and low bytes) interleaved with
    // zeros
    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    __m128i r = _mm_and_si128(SkPackedR16x5ToUnmaskedR32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_R32_SHIFT));

    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    __m128i g = _mm_and_si128(SkPackedG16x5ToUnmaskedG32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_G32_SHIFT));

    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    __m128i b = _mm_and_si128(SkPackedB16x5ToUnmaskedB32x5_SSE2(mask),
                              _mm_set1_epi32(0x1F << SK_B32_SHIFT));

    // a = max(r, g, b) since opaque src alpha uses max of LCD coverages.
    // The shifts move each coverage channel into the alpha byte position.
    __m128i a = _mm_max_epu8(_mm_slli_epi32(r, SK_A32_SHIFT - SK_R32_SHIFT),
                             _mm_max_epu8(_mm_slli_epi32(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                          _mm_slli_epi32(b, SK_A32_SHIFT - SK_B32_SHIFT)));

    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    // 8-bit position
    // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
    //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
    mask = _mm_or_si128(_mm_or_si128(a, r), _mm_or_si128(g, b));

    // Interleave R,G,B into the lower byte of word.
    // i.e. split the sixteen 8-bit values from mask into two sets of eight
    // 16-bit values, padded by zero.
    __m128i maskLo, maskHi;
    // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
    maskLo = _mm_unpacklo_epi8(mask, _mm_setzero_si128());
    // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
    maskHi = _mm_unpackhi_epi8(mask, _mm_setzero_si128());

    // Upscale from 0..31 to 0..32
    // (allows to replace division by left-shift further down)
    // Left-shift each component by 4 and add the result back to that component,
    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    maskLo = _mm_add_epi16(maskLo, _mm_srli_epi16(maskLo, 4));
    maskHi = _mm_add_epi16(maskHi, _mm_srli_epi16(maskHi, 4));

    // Interleave R,G,B into the lower byte of the word
    // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
    __m128i dstLo = _mm_unpacklo_epi8(dst, _mm_setzero_si128());
    // dstHi = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
    __m128i dstHi = _mm_unpackhi_epi8(dst, _mm_setzero_si128());

    // mask = (src - dst) * mask
    maskLo = _mm_mullo_epi16(maskLo, _mm_sub_epi16(src, dstLo));
    maskHi = _mm_mullo_epi16(maskHi, _mm_sub_epi16(src, dstHi));

    // mask = (src - dst) * mask >> 5  (arithmetic shift: the product can be negative)
    maskLo = _mm_srai_epi16(maskLo, 5);
    maskHi = _mm_srai_epi16(maskHi, 5);

    // Add two pixels into result.
    // result = dst + ((src - dst) * mask >> 5)
    __m128i resultLo = _mm_add_epi16(dstLo, maskLo);
    __m128i resultHi = _mm_add_epi16(dstHi, maskHi);

    // Merge into one SSE register with sixteen 8-bit values (four pixels),
    // clamping to 255 if necessary.
    return _mm_packus_epi16(resultLo, resultHi);
}
342
// Blits a row of `width` pixels: blends the SkColor `src` into `dst` using the
// per-pixel 16-bit LCD coverage values in `mask`. Handles the general
// (possibly translucent) src case. The trailing SkPMColor parameter (the
// precomputed opaque dst) is unused on this path. Scalar code handles the
// unaligned head and the <4-pixel tail; the 16-byte-aligned middle is done
// four pixels at a time with SSE2.
void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
    if (width <= 0) {
        return;
    }

    int srcA = SkColorGetA(src);
    int srcR = SkColorGetR(src);
    int srcG = SkColorGetG(src);
    int srcB = SkColorGetB(src);

    // Bias alpha to [0, 256] so the SIMD/scalar blends can divide by 256 with a shift.
    srcA = SkAlpha255To256(srcA);

    if (width >= 4) {
        SkASSERT(SkIsAlign4((uintptr_t) dst));
        // Scalar prologue: advance dst to a 16-byte boundary (at most 3 pixels,
        // so width stays >= 1 here given width >= 4).
        while (!SkIsAlign16((uintptr_t) dst)) {
            *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
            mask++;
            dst++;
            width--;
        }

        __m128i *d = reinterpret_cast<__m128i*>(dst);
        // Set alpha to 0xFF and replicate source four times in SSE register.
        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
        // Interleave with zeros to get two sets of four 16-bit values.
        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
        // Set srcA_sse to contain eight copies of srcA, padded with zero.
        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        __m128i srcA_sse = _mm_set1_epi16(srcA);
        while (width >= 4) {
            // Load four destination pixels into dst_sse.
            __m128i dst_sse = _mm_load_si128(d);
            // Load four 16-bit masks into lower half of mask_sse.
            // mask does *not* actually need to be 16 byte aligned to use this command
            __m128i mask_sse = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(mask));

            // Check whether masks are equal to 0 and get the highest bit
            // of each byte of result, if masks are all zero, we will get
            // pack_cmp to 0xFFFF
            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
                                             _mm_setzero_si128()));

            // if mask pixels are not all zero, we will blend the dst pixels
            // (all-zero coverage leaves dst untouched, so the store is skipped)
            if (pack_cmp != 0xFFFF) {
                // Unpack 4 16bit mask pixels to
                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
                mask_sse = _mm_unpacklo_epi16(mask_sse,
                                              _mm_setzero_si128());

                // Process 4 32bit dst pixels
                __m128i result = blend_lcd16_sse2(src_sse, dst_sse, mask_sse, srcA_sse);
                _mm_store_si128(d, result);
            }

            d++;
            mask += 4;
            width -= 4;
        }

        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar epilogue for the remaining 0..3 pixels.
    while (width > 0) {
        *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
        mask++;
        dst++;
        width--;
    }
}
413
// Like blit_row_lcd16(), but specialized for an opaque src color. opaqueDst
// is the precomputed fully-covered result, used by the scalar fallback for
// 0xFFFF masks. Scalar head/tail around a 16-byte-aligned SSE2 middle loop.
void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
                           SkColor src, int width, SkPMColor opaqueDst) {
    if (width <= 0) {
        return;
    }

    int srcR = SkColorGetR(src);
    int srcG = SkColorGetG(src);
    int srcB = SkColorGetB(src);

    if (width >= 4) {
        SkASSERT(SkIsAlign4((uintptr_t) dst));
        // Scalar prologue: advance dst to a 16-byte boundary (at most 3 pixels).
        while (!SkIsAlign16((uintptr_t) dst)) {
            *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
            mask++;
            dst++;
            width--;
        }

        __m128i *d = reinterpret_cast<__m128i*>(dst);
        // Set alpha to 0xFF and replicate source four times in SSE register.
        __m128i src_sse = _mm_set1_epi32(SkPackARGB32(0xFF, srcR, srcG, srcB));
        // Interleave with zeros to get two sets of four 16-bit values.
        // src_sse=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        src_sse = _mm_unpacklo_epi8(src_sse, _mm_setzero_si128());
        while (width >= 4) {
            // Load four destination pixels into dst_sse.
            __m128i dst_sse = _mm_load_si128(d);
            // Load four 16-bit masks into lower half of mask_sse.
            // mask does *not* actually need to be 16 byte aligned to use this command
            __m128i mask_sse = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(mask));

            // Check whether masks are equal to 0 and get the highest bit
            // of each byte of result, if masks are all zero, we will get
            // pack_cmp to 0xFFFF
            int pack_cmp = _mm_movemask_epi8(_mm_cmpeq_epi16(mask_sse,
                                             _mm_setzero_si128()));

            // if mask pixels are not all zero, we will blend the dst pixels
            // (all-zero coverage leaves dst untouched, so the store is skipped)
            if (pack_cmp != 0xFFFF) {
                // Unpack 4 16bit mask pixels to
                // mask_sse = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
                mask_sse = _mm_unpacklo_epi16(mask_sse,
                                              _mm_setzero_si128());

                // Process 4 32bit dst pixels
                __m128i result = blend_lcd16_opaque_sse2(src_sse, dst_sse, mask_sse);
                _mm_store_si128(d, result);
            }

            d++;
            mask += 4;
            width -= 4;
        }

        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar epilogue for the remaining 0..3 pixels.
    while (width > 0) {
        *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
        mask++;
        dst++;
        width--;
    }
}
480
481 #elif defined(SK_ARM_HAS_NEON)
482 #include <arm_neon.h>
483
484 #define NEON_A (SK_A32_SHIFT / 8)
485 #define NEON_R (SK_R32_SHIFT / 8)
486 #define NEON_G (SK_G32_SHIFT / 8)
487 #define NEON_B (SK_B32_SHIFT / 8)
488
// NEON version of blend_32(): computes dst + ((src - dst) * scale >> 5) for
// eight 8-bit lanes, with each 16-bit scale lane expected in [0, 32].
static inline uint8x8_t blend_32_neon(uint8x8_t src, uint8x8_t dst, uint16x8_t scale) {
    int16x8_t src_wide, dst_wide;

    // Widen to signed 16 bits so (src - dst) can go negative.
    src_wide = vreinterpretq_s16_u16(vmovl_u8(src));
    dst_wide = vreinterpretq_s16_u16(vmovl_u8(dst));

    src_wide = (src_wide - dst_wide) * vreinterpretq_s16_u16(scale);

    // Arithmetic shift divides the scaled difference by 32.
    dst_wide += vshrq_n_s16(src_wide, 5);

    // Narrow back to 8 bits (non-saturating; assumes scale <= 32 keeps the
    // result within [0, 255]).
    return vmovn_u16(vreinterpretq_u16_s16(dst_wide));
}
501
// NEON row blit for an opaque src color: blends `color` into `dst` using the
// per-pixel 16-bit LCD coverage values in `src`, eight pixels per iteration,
// with a scalar loop for the leftovers. opaqueDst is only used by the scalar
// fallback (for 0xFFFF masks).
void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t src[],
                           SkColor color, int width,
                           SkPMColor opaqueDst) {
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    // Splat the src components; alpha is 0xFF since the color is opaque.
    uint8x8_t vcolA = vdup_n_u8(0xFF);
    uint8x8_t vcolR = vdup_n_u8(colR);
    uint8x8_t vcolG = vdup_n_u8(colG);
    uint8x8_t vcolB = vdup_n_u8(colB);

    while (width >= 8) {
        uint8x8x4_t vdst;
        uint16x8_t vmask;
        uint16x8_t vmaskR, vmaskG, vmaskB, vmaskA;

        // De-interleaving load: vdst.val[c] holds one channel of 8 pixels.
        vdst = vld4_u8((uint8_t*)dst);
        vmask = vld1q_u16(src);

        // Get all the color masks on 5 bits
        vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                             SK_B16_BITS + SK_R16_BITS + 1);
        vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

        // Upscale to 0..32
        vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
        vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
        vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
        // Opaque srcAlpha always uses the max of the 3 LCD coverage values
        vmaskA = vmaxq_u16(vmaskR, vmaxq_u16(vmaskG, vmaskB));

        vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);
        vdst.val[NEON_A] = blend_32_neon(vcolA, vdst.val[NEON_A], vmaskA);

        // Re-interleaving store of all four channels.
        vst4_u8((uint8_t*)dst, vdst);

        dst += 8;
        src += 8;
        width -= 8;
    }

    // Leftovers
    for (int i = 0; i < width; i++) {
        dst[i] = blend_lcd16_opaque(colR, colG, colB, dst[i], src[i], opaqueDst);
    }
}
552
// NEON row blit for the general (possibly translucent) src color: blends
// `color` into `dst` using the per-pixel 16-bit LCD coverage values in `src`,
// eight pixels per iteration, with a scalar loop for the leftovers. The
// trailing SkPMColor parameter (precomputed opaque dst) is unused here.
void blit_row_lcd16(SkPMColor dst[], const uint16_t src[],
                    SkColor color, int width, SkPMColor) {
    int colA = SkColorGetA(color);
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    // srcA in [0-255] to compare vs dstA
    uint16x8_t vcolACmp = vdupq_n_u16(colA);
    colA = SkAlpha255To256(colA);

    uint16x8_t vcolA = vdupq_n_u16(colA); // srcA in [0-256] to combine with coverage
    uint8x8_t vcolR = vdup_n_u8(colR);
    uint8x8_t vcolG = vdup_n_u8(colG);
    uint8x8_t vcolB = vdup_n_u8(colB);

    while (width >= 8) {
        uint8x8x4_t vdst;
        uint16x8_t vmask;
        uint16x8_t vmaskR, vmaskG, vmaskB, vmaskA;

        // De-interleaving load: vdst.val[c] holds one channel of 8 pixels.
        vdst = vld4_u8((uint8_t*)dst);
        vmask = vld1q_u16(src);

        // Get all the color masks on 5 bits
        vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                             SK_B16_BITS + SK_R16_BITS + 1);
        vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

        // Upscale to 0..32
        vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
        vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
        vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

        // Fold the [0-256]-biased source alpha into each coverage channel.
        vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
        vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
        vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);

        // Select either the min or the max of the RGB mask values, depending on if the src
        // alpha is less than the dst alpha.
        // NOTE(review): vcleq_u16 implements srcA <= dstA, whereas the scalar
        // blend_lcd16() above uses strict srcA-1 < dstA — confirm the intended
        // behavior when srcA == dstA.
        vmaskA = vbslq_u16(vcleq_u16(vcolACmp, vmovl_u8(vdst.val[NEON_A])), // srcA < dstA
                           vminq_u16(vmaskR, vminq_u16(vmaskG, vmaskB)),    // ? min(r,g,b)
                           vmaxq_u16(vmaskR, vmaxq_u16(vmaskG, vmaskB)));   // : max(r,g,b)

        vdst.val[NEON_R] = blend_32_neon(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = blend_32_neon(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = blend_32_neon(vcolB, vdst.val[NEON_B], vmaskB);
        // vmaskA already includes vcolA so blend against 0xFF
        vdst.val[NEON_A] = blend_32_neon(vdup_n_u8(0xFF), vdst.val[NEON_A], vmaskA);
        vst4_u8((uint8_t*)dst, vdst);

        dst += 8;
        src += 8;
        width -= 8;
    }

    // Leftovers
    for (int i = 0; i < width; i++) {
        dst[i] = blend_lcd16(colA, colR, colG, colB, dst[i], src[i]);
    }
}
614
615 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
616
617 // The following (left) shifts cause the top 5 bits of the mask components to
618 // line up with the corresponding components in an SkPMColor.
619 // Note that the mask's RGB16 order may differ from the SkPMColor order.
620 #define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
621 #define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
622 #define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)
623
624 #if SK_R16x5_R32x5_SHIFT == 0
625 #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (x)
626 #elif SK_R16x5_R32x5_SHIFT > 0
627 #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (__lasx_xvslli_w(x, SK_R16x5_R32x5_SHIFT))
628 #else
629 #define SkPackedR16x5ToUnmaskedR32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_R16x5_R32x5_SHIFT))
630 #endif
631
632 #if SK_G16x5_G32x5_SHIFT == 0
633 #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (x)
634 #elif SK_G16x5_G32x5_SHIFT > 0
635 #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (__lasx_xvslli_w(x, SK_G16x5_G32x5_SHIFT))
636 #else
637 #define SkPackedG16x5ToUnmaskedG32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_G16x5_G32x5_SHIFT))
638 #endif
639
640 #if SK_B16x5_B32x5_SHIFT == 0
641 #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (x)
642 #elif SK_B16x5_B32x5_SHIFT > 0
643 #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (__lasx_xvslli_w(x, SK_B16x5_B32x5_SHIFT))
644 #else
645 #define SkPackedB16x5ToUnmaskedB32x5_LASX(x) (__lasx_xvsrli_w(x, -SK_B16x5_B32x5_SHIFT))
646 #endif
647
// LASX (256-bit) version of blend_lcd16(): blends eight premultiplied dst
// pixels against a replicated src color using eight 16-bit LCD coverage
// masks. Mirrors blend_lcd16_sse2() above but twice as wide.
static __m256i blend_lcd16_lasx(__m256i &src, __m256i &dst, __m256i &mask, __m256i &srcA) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix. Components of one of the four pixels that
    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    // example is the blue channel of the second destination pixel. Memory
    // layout is shown for an ARGB byte order in a color value.

    // src and srcA store 8-bit values interleaved with zeros.
    // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
    //        0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
    //         m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
    //         m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)

    __m256i xv_zero = __lasx_xvldi(0);

    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0,
    //      0, m4R, 0, 0, 0, m5R, 0, 0, 0, m6R, 0, 0, 0, m7R, 0, 0)
    __m256i r = __lasx_xvand_v(SkPackedR16x5ToUnmaskedR32x5_LASX(mask),
                               __lasx_xvreplgr2vr_w(0x1F << SK_R32_SHIFT));

    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    //      0, 0, m4G, 0, 0, 0, m5G, 0, 0, 0, m6G, 0, 0, 0, m7G, 0)
    __m256i g = __lasx_xvand_v(SkPackedG16x5ToUnmaskedG32x5_LASX(mask),
                               __lasx_xvreplgr2vr_w(0x1F << SK_G32_SHIFT));

    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    //      0, 0, 0, m4B, 0, 0, 0, m5B, 0, 0, 0, m6B, 0, 0, 0, m7B)
    __m256i b = __lasx_xvand_v(SkPackedB16x5ToUnmaskedB32x5_LASX(mask),
                               __lasx_xvreplgr2vr_w(0x1F << SK_B32_SHIFT));

    // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA.
    // The shifts move each coverage channel into the alpha byte position so
    // the byte-wise min/max compares like with like.
    __m256i aMin = __lasx_xvmin_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                                  __lasx_xvmin_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                                 __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
    __m256i aMax = __lasx_xvmax_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                                  __lasx_xvmax_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                                 __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
    // srcA has been biased to [0-256], so compare srcA against (dstA+1).
    // NOTE(review): this relies on xvmskltz.w producing a usable per-lane
    // select mask from the sign of (srcA - (dstA+1)) — verify against the
    // LASX intrinsic spec that this is not the bit-collecting mask form.
    __m256i a = __lasx_xvmskltz_w(srcA -
                                  __lasx_xvand_v(
                                      __lasx_xvadd_w(dst,
                                                     __lasx_xvreplgr2vr_w(1 << SK_A32_SHIFT)),
                                      __lasx_xvreplgr2vr_w(SK_A32_MASK)));
    // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
    a = __lasx_xvor_v(__lasx_xvand_v(a, aMin), __lasx_xvandn_v(a, aMax));

    // Pack the 8 16bit mask pixels into 8 32bit pixels, (p0, p1, p2, p3)
    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    // 8-bit position
    // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
    //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B,
    //         m4A, m4R, m4G, m4B, m5A, m5R, m5G, m5B,
    //         m6A, m6R, m6G, m6B, m7A, m7R, m7G, m7B)
    mask = __lasx_xvor_v(__lasx_xvor_v(a, r), __lasx_xvor_v(g, b));

    // Interleave R,G,B into the lower byte of word.
    // i.e. split the thirty-two 8-bit values from mask into two sets of sixteen
    // 16-bit values, padded by zero.
    __m256i maskLo, maskHi;
    // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0,
    //           m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
    maskLo = __lasx_xvilvl_b(xv_zero, mask);
    // maskHi = (m4A, 0, m4R, 0, m4G, 0, m4B, 0, m5A, 0, m5R, 0, m5G, 0, m5B, 0,
    //           m6A, 0, m6R, 0, m6G, 0, m6B, 0, m7A, 0, m7R, 0, m7G, 0, m7B, 0)
    maskHi = __lasx_xvilvh_b(xv_zero, mask);

    // Upscale from 0..31 to 0..32
    // (allows to replace division by left-shift further down)
    // Left-shift each component by 4 and add the result back to that component,
    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    maskLo = __lasx_xvadd_h(maskLo, __lasx_xvsrli_h(maskLo, 4));
    maskHi = __lasx_xvadd_h(maskHi, __lasx_xvsrli_h(maskHi, 4));

    // Multiply each component of maskLo and maskHi by srcA
    maskLo = __lasx_xvmul_h(maskLo, srcA);
    maskHi = __lasx_xvmul_h(maskHi, srcA);

    // Right shift mask components by 8 (divide by 256)
    maskLo = __lasx_xvsrli_h(maskLo, 8);
    maskHi = __lasx_xvsrli_h(maskHi, 8);

    // Interleave R,G,B into the lower byte of the word
    // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
    //          d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
    __m256i dstLo = __lasx_xvilvl_b(xv_zero, dst);
    // dstHi = (d4A, 0, d4R, 0, d4G, 0, d4B, 0, d5A, 0, d5R, 0, d5G, 0, d5B, 0)
    //          d6A, 0, d6R, 0, d6G, 0, d6B, 0, d7A, 0, d7R, 0, d7G, 0, d7B, 0)
    __m256i dstHi = __lasx_xvilvh_b(xv_zero, dst);

    // mask = (src - dst) * mask
    maskLo = __lasx_xvmul_h(maskLo, __lasx_xvsub_h(src, dstLo));
    maskHi = __lasx_xvmul_h(maskHi, __lasx_xvsub_h(src, dstHi));

    // mask = (src - dst) * mask >> 5  (arithmetic shift: the product can be negative)
    maskLo = __lasx_xvsrai_h(maskLo, 5);
    maskHi = __lasx_xvsrai_h(maskHi, 5);

    // Add two pixels into result.
    // result = dst + ((src - dst) * mask >> 5)
    __m256i resultLo = __lasx_xvadd_h(dstLo, maskLo);
    __m256i resultHi = __lasx_xvadd_h(dstHi, maskHi);

    // Pack into 8 32bit dst pixels.
    // resultLo and resultHi contain sixteen 16-bit components (four pixels) each.
    // Merge into one LASX register with 32 8-bit values (eight pixels),
    // clamping to 255 if necessary: xvsat_hu saturates each unsigned 16-bit
    // lane to 8 bits, then xvpickev_b gathers the even (low) bytes.
    __m256i tmpl = __lasx_xvsat_hu(resultLo, 7);
    __m256i tmph = __lasx_xvsat_hu(resultHi, 7);
    return __lasx_xvpickev_b(tmph, tmpl);
}
768
// Blends eight 32-bit dst pixels against an opaque src color through eight
// 16-bit (565) LCD coverage masks, entirely in one LASX register. Since src
// is opaque, the per-pixel alpha coverage is taken as max(R,G,B) coverage.
static __m256i blend_lcd16_opaque_lasx(__m256i &src, __m256i &dst, __m256i &mask) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix. Components of one of the eight pixels that
    // are processed in parallel are marked with 0..7. "d1B", for
    // example is the blue channel of the second destination pixel. Memory
    // layout is shown for an ARGB byte order in a color value.

    // src stores 8-bit values interleaved with zeros.
    // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
    //        0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    // mask stores 16-bit values (shown as high and low bytes) interleaved with
    // zeros
    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
    //         m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
    //         m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)

    __m256i xv_zero = __lasx_xvldi(0);

    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0,
    //      0, m4R, 0, 0, 0, m5R, 0, 0, 0, m6R, 0, 0)
    __m256i r = __lasx_xvand_v(SkPackedR16x5ToUnmaskedR32x5_LASX(mask),
                               __lasx_xvreplgr2vr_w(0x1F << SK_R32_SHIFT));

    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0,
    //      0, 0, m4G, 0, 0, 0, m5G, 0, 0, 0, m6G, 0, 0, 0, m7G, 0)
    __m256i g = __lasx_xvand_v(SkPackedG16x5ToUnmaskedG32x5_LASX(mask),
                               __lasx_xvreplgr2vr_w(0x1F << SK_G32_SHIFT));

    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B,
    //      0, 0, 0, m4B, 0, 0, 0, m5B, 0, 0, 0, m6B, 0, 0, 0, m7B)
    __m256i b = __lasx_xvand_v(SkPackedB16x5ToUnmaskedB32x5_LASX(mask),
                               __lasx_xvreplgr2vr_w(0x1F << SK_B32_SHIFT));

    // a = max(r, g, b) since opaque src alpha uses max of LCD coverages
    __m256i a = __lasx_xvmax_b(__lasx_xvslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                               __lasx_xvmax_b(__lasx_xvslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                              __lasx_xvslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));

    // Pack the 8 16bit mask pixels into 8 32bit pixels, (p0, p1, p2, p3,
    // p4, p5, p6, p7)
    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    // 8-bit position
    // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
    //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B,
    //         m4A, m4R, m4G, m4B, m5A, m5R, m5G, m5B,
    //         m6A, m6R, m6G, m6B, m7A, m7R, m7G, m7B)
    mask = __lasx_xvor_v(__lasx_xvor_v(a, r), __lasx_xvor_v(g, b));

    // Interleave R,G,B into the lower byte of word.
    // i.e. split the 32 8-bit values from mask into two sets of sixteen
    // 16-bit values, padded by zero.
    __m256i maskLo, maskHi;
    // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0,
    //           m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
    maskLo = __lasx_xvilvl_b(xv_zero, mask);
    // maskHi = (m4A, 0, m4R, 0, m4G, 0, m4B, 0, m5A, 0, m5R, 0, m5G, 0, m5B, 0,
    //           m6A, 0, m6R, 0, m6G, 0, m6B, 0, m7A, 0, m7R, 0, m7G, 0, m7B, 0)
    maskHi = __lasx_xvilvh_b(xv_zero, mask);

    // Upscale from 0..31 to 0..32
    // (allows to replace division by left-shift further down)
    // Left-shift each component by 4 and add the result back to that component,
    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    maskLo = __lasx_xvadd_h(maskLo, __lasx_xvsrli_h(maskLo, 4));
    maskHi = __lasx_xvadd_h(maskHi, __lasx_xvsrli_h(maskHi, 4));

    // Interleave R,G,B into the lower byte of the word
    // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0,
    //          d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
    __m256i dstLo = __lasx_xvilvl_b(xv_zero, dst);
    // dstHi = (d4A, 0, d4R, 0, d4G, 0, d4B, 0, d5A, 0, d5R, 0, d5G, 0, d5B, 0,
    //          d6A, 0, d6R, 0, d6G, 0, d6B, 0, d7A, 0, d7R, 0, d7G, 0, d7B, 0)
    __m256i dstHi = __lasx_xvilvh_b(xv_zero, dst);

    // mask = (src - dst) * mask
    maskLo = __lasx_xvmul_h(maskLo, __lasx_xvsub_h(src, dstLo));
    maskHi = __lasx_xvmul_h(maskHi, __lasx_xvsub_h(src, dstHi));

    // mask = (src - dst) * mask >> 5
    maskLo = __lasx_xvsrai_h(maskLo, 5);
    maskHi = __lasx_xvsrai_h(maskHi, 5);

    // Add two pixels into result.
    // result = dst + ((src - dst) * mask >> 5)
    __m256i resultLo = __lasx_xvadd_h(dstLo, maskLo);
    __m256i resultHi = __lasx_xvadd_h(dstHi, maskHi);

    // Merge into one LASX register with 32 8-bit values (eight pixels),
    // clamping to 255 if necessary.
    __m256i tmpl = __lasx_xvsat_hu(resultLo, 7);
    __m256i tmph = __lasx_xvsat_hu(resultHi, 7);

    return __lasx_xvpickev_b(tmph, tmpl);
}
866
// Blends a row of `width` N32 dst pixels with color `src` through the 16-bit
// LCD coverage `mask`, eight pixels at a time with LASX. The scalar
// blend_lcd16() handles the unaligned head and the sub-8-pixel tail.
// The trailing SkPMColor parameter (pre-blended opaque dst) is unused here.
void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
    if (width <= 0) {
        return;
    }

    int srcA = SkColorGetA(src);
    int srcR = SkColorGetR(src);
    int srcG = SkColorGetG(src);
    int srcB = SkColorGetB(src);
    __m256i xv_zero = __lasx_xvldi(0);

    // Bias alpha from [0,255] to [0,256] so blending can shift instead of divide.
    srcA = SkAlpha255To256(srcA);
    if (width >= 8) {
        SkASSERT(SkIsAlign4((uintptr_t) dst));
        // Blend scalar pixels until dst is 16-byte aligned; at most 3
        // iterations starting from 4-byte alignment, so width stays >= 5.
        while (!SkIsAlign16((uintptr_t) dst)) {
            *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
            mask++;
            dst++;
            width--;
        }

        __m256i *d = reinterpret_cast<__m256i*>(dst);
        // Set alpha to 0xFF and replicate source eight times in LASX register.
        unsigned int skpackargb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
        __m256i src_lasx = __lasx_xvreplgr2vr_w(skpackargb32);
        // Interleave with zeros to get two sets of eight 16-bit values.
        // src_lasx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
        //           0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        src_lasx = __lasx_xvilvl_b(xv_zero, src_lasx);
        // Set srcA_lasx to contain sixteen copies of srcA as 16-bit lanes.
        __m256i srcA_lasx = __lasx_xvreplgr2vr_h(srcA);

        while (width >= 8) {
            // Load eight destination pixels into dst_lasx.
            __m256i dst_lasx = __lasx_xvld(d, 0);
            // Load eight 16-bit masks into lower half of mask_lasx.
            // NOTE(review): xvld reads 32 bytes but only 16 bytes of masks are
            // kept below — presumably reading past the last mask is tolerated
            // here; confirm the mask buffer allows the over-read.
            __m256i mask_lasx = __lasx_xvld(mask, 0);
            // Keep masks 0-3 in the low 64 bits of each 128-bit lane, zero the rest.
            mask_lasx = (__m256i){mask_lasx[0], 0, mask_lasx[1], 0};

            // __lasx_xbz_v returns 1 when every bit of the vector is zero.
            int pack_cmp = __lasx_xbz_v(mask_lasx);
            // if mask pixels are not all zero, we will blend the dst pixels
            if (pack_cmp != 1) {
                // Unpack 8 16bit mask pixels to
                // mask_lasx = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                //              m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
                //              m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
                //              m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)
                mask_lasx = __lasx_xvilvl_h(xv_zero, mask_lasx);

                // Process 8 32bit dst pixels
                __m256i result = blend_lcd16_lasx(src_lasx, dst_lasx, mask_lasx, srcA_lasx);
                __lasx_xvst(result, d, 0);
            }
            d++;
            mask += 8;
            width -= 8;
        }
        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar tail for the remaining (width % 8) pixels.
    while (width > 0) {
        *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
        mask++;
        dst++;
        width--;
    }
}
934
// Opaque-color variant of blit_row_lcd16: blends a row of N32 dst pixels with
// an opaque `src` through the 16-bit LCD coverage `mask`, eight pixels at a
// time with LASX. `opaqueDst` (the premultiplied src) is only used by the
// scalar blend_lcd16_opaque() head/tail path.
void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
                           SkColor src, int width, SkPMColor opaqueDst) {
    if (width <= 0) {
        return;
    }

    int srcR = SkColorGetR(src);
    int srcG = SkColorGetG(src);
    int srcB = SkColorGetB(src);
    __m256i xv_zero = __lasx_xvldi(0);

    if (width >= 8) {
        SkASSERT(SkIsAlign4((uintptr_t) dst));
        // Blend scalar pixels until dst is 16-byte aligned (at most 3 steps).
        while (!SkIsAlign16((uintptr_t) dst)) {
            *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
            mask++;
            dst++;
            width--;
        }

        __m256i *d = reinterpret_cast<__m256i*>(dst);
        // Set alpha to 0xFF and replicate source eight times in LASX register.
        unsigned int sk_pack_argb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
        __m256i src_lasx = __lasx_xvreplgr2vr_w(sk_pack_argb32);
        // Interleave with zeros to widen each channel to 16 bits.
        // src_lasx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0,
        //           0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        src_lasx = __lasx_xvilvl_b(xv_zero, src_lasx);

        while (width >= 8) {
            // Load eight destination pixels into dst_lasx.
            __m256i dst_lasx = __lasx_xvld(d, 0);
            // Load eight 16-bit masks into lower half of mask_lasx.
            // NOTE(review): xvld reads 32 bytes, only 16 are kept — confirm
            // the mask buffer tolerates the over-read.
            __m256i mask_lasx = __lasx_xvld(mask, 0);
            mask_lasx = (__m256i){mask_lasx[0], 0, mask_lasx[1], 0};

            // __lasx_xbz_v returns 1 when the whole vector is zero.
            int32_t pack_cmp = __lasx_xbz_v(mask_lasx);
            // if mask pixels are not all zero, we will blend the dst pixels
            if (pack_cmp != 1) {
                // Unpack 8 16bit mask pixels to
                // mask_lasx = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                //              m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0,
                //              m4RGBLo, m4RGBHi, 0, 0, m5RGBLo, m5RGBHi, 0, 0,
                //              m6RGBLo, m6RGBHi, 0, 0, m7RGBLo, m7RGBHi, 0, 0)
                mask_lasx = __lasx_xvilvl_h(xv_zero, mask_lasx);
                // Process 8 32bit dst pixels
                __m256i result = blend_lcd16_opaque_lasx(src_lasx, dst_lasx, mask_lasx);
                __lasx_xvst(result, d, 0);
            }
            d++;
            mask += 8;
            width -= 8;
        }

        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar tail for the remaining (width % 8) pixels.
    while (width > 0) {
        *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
        mask++;
        dst++;
        width--;
    }
}
999
#elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX

// The following (left) shifts cause the top 5 bits of the mask components to
// line up with the corresponding components in an SkPMColor.
// Note that the mask's RGB16 order may differ from the SkPMColor order.
#define SK_R16x5_R32x5_SHIFT (SK_R32_SHIFT - SK_R16_SHIFT - SK_R16_BITS + 5)
#define SK_G16x5_G32x5_SHIFT (SK_G32_SHIFT - SK_G16_SHIFT - SK_G16_BITS + 5)
#define SK_B16x5_B32x5_SHIFT (SK_B32_SHIFT - SK_B16_SHIFT - SK_B16_BITS + 5)

// Each helper below moves the 5 significant bits of one 565 mask channel into
// the byte position of the matching SkPMColor channel. A negative shift amount
// selects the right-shift form, since vector shift intrinsics require a
// non-negative immediate.
#if SK_R16x5_R32x5_SHIFT == 0
    #define SkPackedR16x5ToUnmaskedR32x5_LSX(x) (x)
#elif SK_R16x5_R32x5_SHIFT > 0
    #define SkPackedR16x5ToUnmaskedR32x5_LSX(x) (__lsx_vslli_w(x, SK_R16x5_R32x5_SHIFT))
#else
    #define SkPackedR16x5ToUnmaskedR32x5_LSX(x) (__lsx_vsrli_w(x, -SK_R16x5_R32x5_SHIFT))
#endif

#if SK_G16x5_G32x5_SHIFT == 0
    #define SkPackedG16x5ToUnmaskedG32x5_LSX(x) (x)
#elif SK_G16x5_G32x5_SHIFT > 0
    #define SkPackedG16x5ToUnmaskedG32x5_LSX(x) (__lsx_vslli_w(x, SK_G16x5_G32x5_SHIFT))
#else
    #define SkPackedG16x5ToUnmaskedG32x5_LSX(x) (__lsx_vsrli_w(x, -SK_G16x5_G32x5_SHIFT))
#endif

#if SK_B16x5_B32x5_SHIFT == 0
    #define SkPackedB16x5ToUnmaskedB32x5_LSX(x) (x)
#elif SK_B16x5_B32x5_SHIFT > 0
    #define SkPackedB16x5ToUnmaskedB32x5_LSX(x) (__lsx_vslli_w(x, SK_B16x5_B32x5_SHIFT))
#else
    #define SkPackedB16x5ToUnmaskedB32x5_LSX(x) (__lsx_vsrli_w(x, -SK_B16x5_B32x5_SHIFT))
#endif
1032
// Blends four 32-bit dst pixels against src (with alpha srcA, biased to
// [0,256]) through four 16-bit (565) LCD coverage masks, in LSX registers.
static __m128i blend_lcd16_lsx(__m128i &src, __m128i &dst, __m128i &mask, __m128i &srcA) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix. Components of one of the four pixels that
    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    // example is the blue channel of the second destination pixel. Memory
    // layout is shown for an ARGB byte order in a color value.

    // src and srcA store 8-bit values interleaved with zeros.
    // src  = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    // srcA = (srcA, 0, srcA, 0, srcA, 0, srcA, 0,
    //         srcA, 0, srcA, 0, srcA, 0, srcA, 0)
    // mask stores 16-bit values (compressed three channels) interleaved with zeros.
    // Lo and Hi denote the low and high bytes of a 16-bit value, respectively.
    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

    __m128i v_zero = __lsx_vldi(0);

    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    __m128i r = __lsx_vand_v(SkPackedR16x5ToUnmaskedR32x5_LSX(mask),
                             __lsx_vreplgr2vr_w(0x1F << SK_R32_SHIFT));

    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    __m128i g = __lsx_vand_v(SkPackedG16x5ToUnmaskedG32x5_LSX(mask),
                             __lsx_vreplgr2vr_w(0x1F << SK_G32_SHIFT));

    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    __m128i b = __lsx_vand_v(SkPackedB16x5ToUnmaskedB32x5_LSX(mask),
                             __lsx_vreplgr2vr_w(0x1F << SK_B32_SHIFT));

    // a needs to be either the min or the max of the LCD coverages, depending on srcA < dstA
    __m128i aMin = __lsx_vmin_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                                __lsx_vmin_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                             __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
    __m128i aMax = __lsx_vmax_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                                __lsx_vmax_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                             __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));
    // srcA has been biased to [0-256], so compare srcA against (dstA+1)
    // NOTE(review): __lsx_vmskltz_w is documented to pack per-lane sign bits
    // into the low element rather than produce a full-width per-lane mask —
    // confirm this select really yields the intended lane-wise result.
    __m128i a = __lsx_vmskltz_w(srcA -
                                __lsx_vand_v(
                                    __lsx_vadd_w(dst,
                                                 __lsx_vreplgr2vr_w(1 << SK_A32_SHIFT)),
                                    __lsx_vreplgr2vr_w(SK_A32_MASK)));
    // a = if_then_else(a, aMin, aMax) == (aMin & a) | (aMax & ~a)
    a = __lsx_vor_v(__lsx_vand_v(a, aMin), __lsx_vandn_v(a, aMax));

    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    // 8-bit position
    // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
    //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
    mask = __lsx_vor_v(__lsx_vor_v(a, r), __lsx_vor_v(g, b));

    // Interleave R,G,B into the lower byte of word.
    // i.e. split the sixteen 8-bit values from mask into two sets of eight
    // 16-bit values, padded by zero.
    __m128i maskLo, maskHi;
    // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
    maskLo = __lsx_vilvl_b(v_zero, mask);
    // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
    maskHi = __lsx_vilvh_b(v_zero, mask);

    // Upscale from 0..31 to 0..32
    // (allows to replace division by left-shift further down)
    // Left-shift each component by 4 and add the result back to that component,
    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    maskLo = __lsx_vadd_h(maskLo, __lsx_vsrli_h(maskLo, 4));
    maskHi = __lsx_vadd_h(maskHi, __lsx_vsrli_h(maskHi, 4));

    // Multiply each component of maskLo and maskHi by srcA
    maskLo = __lsx_vmul_h(maskLo, srcA);
    maskHi = __lsx_vmul_h(maskHi, srcA);

    // Shift mask components right by 8 (divide by 256)
    maskLo = __lsx_vsrli_h(maskLo, 8);
    maskHi = __lsx_vsrli_h(maskHi, 8);

    // Interleave R,G,B into the lower byte of the word
    // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
    __m128i dstLo = __lsx_vilvl_b(v_zero, dst);
    // dstHi = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
    __m128i dstHi = __lsx_vilvh_b(v_zero, dst);

    // mask = (src - dst) * mask
    maskLo = __lsx_vmul_h(maskLo, __lsx_vsub_h(src, dstLo));
    maskHi = __lsx_vmul_h(maskHi, __lsx_vsub_h(src, dstHi));

    // mask = (src - dst) * mask >> 5
    maskLo = __lsx_vsrai_h(maskLo, 5);
    maskHi = __lsx_vsrai_h(maskHi, 5);

    // Add two pixels into result.
    // result = dst + ((src - dst) * mask >> 5)
    __m128i resultLo = __lsx_vadd_h(dstLo, maskLo);
    __m128i resultHi = __lsx_vadd_h(dstHi, maskHi);

    // Pack into 4 32bit dst pixels.
    // resultLo and resultHi contain eight 16-bit components (two pixels) each.
    // Merge into one LSX register with sixteen 8-bit values (four pixels),
    // clamping to 255 if necessary.
    __m128i tmpl = __lsx_vsat_hu(resultLo, 7);
    __m128i tmph = __lsx_vsat_hu(resultHi, 7);
    return __lsx_vpickev_b(tmph, tmpl);
}
1139
// Blends four 32-bit dst pixels against an opaque src color through four
// 16-bit (565) LCD coverage masks, in LSX registers. Since src is opaque,
// the per-pixel alpha coverage is taken as max(R,G,B) coverage.
static __m128i blend_lcd16_opaque_lsx(__m128i &src, __m128i &dst, __m128i &mask) {
    // In the following comments, the components of src, dst and mask are
    // abbreviated as (s)rc, (d)st, and (m)ask. Color components are marked
    // by an R, G, B, or A suffix. Components of one of the four pixels that
    // are processed in parallel are marked with 0, 1, 2, and 3. "d1B", for
    // example is the blue channel of the second destination pixel. Memory
    // layout is shown for an ARGB byte order in a color value.

    // src stores 8-bit values interleaved with zeros.
    // src = (0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
    // mask stores 16-bit values (shown as high and low bytes) interleaved with
    // zeros
    // mask = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
    //         m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)

    __m128i v_zero = __lsx_vldi(0);

    // Get the R,G,B of each 16bit mask pixel, we want all of them in 5 bits.
    // r = (0, m0R, 0, 0, 0, m1R, 0, 0, 0, m2R, 0, 0, 0, m3R, 0, 0)
    __m128i r = __lsx_vand_v(SkPackedR16x5ToUnmaskedR32x5_LSX(mask),
                             __lsx_vreplgr2vr_w(0x1F << SK_R32_SHIFT));

    // g = (0, 0, m0G, 0, 0, 0, m1G, 0, 0, 0, m2G, 0, 0, 0, m3G, 0)
    __m128i g = __lsx_vand_v(SkPackedG16x5ToUnmaskedG32x5_LSX(mask),
                             __lsx_vreplgr2vr_w(0x1F << SK_G32_SHIFT));

    // b = (0, 0, 0, m0B, 0, 0, 0, m1B, 0, 0, 0, m2B, 0, 0, 0, m3B)
    __m128i b = __lsx_vand_v(SkPackedB16x5ToUnmaskedB32x5_LSX(mask),
                             __lsx_vreplgr2vr_w(0x1F << SK_B32_SHIFT));

    // a = max(r, g, b) since opaque src alpha uses max of LCD coverages
    __m128i a = __lsx_vmax_b(__lsx_vslli_w(r, SK_A32_SHIFT - SK_R32_SHIFT),
                             __lsx_vmax_b(__lsx_vslli_w(g, SK_A32_SHIFT - SK_G32_SHIFT),
                                          __lsx_vslli_w(b, SK_A32_SHIFT - SK_B32_SHIFT)));

    // Pack the 4 16bit mask pixels into 4 32bit pixels, (p0, p1, p2, p3)
    // Each component (m0R, m0G, etc.) is then a 5-bit value aligned to an
    // 8-bit position
    // mask = (m0A, m0R, m0G, m0B, m1A, m1R, m1G, m1B,
    //         m2A, m2R, m2G, m2B, m3A, m3R, m3G, m3B)
    mask = __lsx_vor_v(__lsx_vor_v(a, r), __lsx_vor_v(g, b));

    // Interleave R,G,B into the lower byte of word.
    // i.e. split the sixteen 8-bit values from mask into two sets of eight
    // 16-bit values, padded by zero.
    __m128i maskLo, maskHi;
    // maskLo = (m0A, 0, m0R, 0, m0G, 0, m0B, 0, m1A, 0, m1R, 0, m1G, 0, m1B, 0)
    maskLo = __lsx_vilvl_b(v_zero, mask);
    // maskHi = (m2A, 0, m2R, 0, m2G, 0, m2B, 0, m3A, 0, m3R, 0, m3G, 0, m3B, 0)
    maskHi = __lsx_vilvh_b(v_zero, mask);

    // Upscale from 0..31 to 0..32
    // (allows to replace division by left-shift further down)
    // Left-shift each component by 4 and add the result back to that component,
    // mapping numbers in the range 0..15 to 0..15, and 16..31 to 17..32
    maskLo = __lsx_vadd_h(maskLo, __lsx_vsrli_h(maskLo, 4));
    maskHi = __lsx_vadd_h(maskHi, __lsx_vsrli_h(maskHi, 4));

    // Interleave R,G,B into the lower byte of the word
    // dstLo = (d0A, 0, d0R, 0, d0G, 0, d0B, 0, d1A, 0, d1R, 0, d1G, 0, d1B, 0)
    __m128i dstLo = __lsx_vilvl_b(v_zero, dst);
    // dstHi = (d2A, 0, d2R, 0, d2G, 0, d2B, 0, d3A, 0, d3R, 0, d3G, 0, d3B, 0)
    __m128i dstHi = __lsx_vilvh_b(v_zero, dst);

    // mask = (src - dst) * mask
    maskLo = __lsx_vmul_h(maskLo, __lsx_vsub_h(src, dstLo));
    maskHi = __lsx_vmul_h(maskHi, __lsx_vsub_h(src, dstHi));

    // mask = (src - dst) * mask >> 5
    maskLo = __lsx_vsrai_h(maskLo, 5);
    maskHi = __lsx_vsrai_h(maskHi, 5);

    // Add two pixels into result.
    // result = dst + ((src - dst) * mask >> 5)
    __m128i resultLo = __lsx_vadd_h(dstLo, maskLo);
    __m128i resultHi = __lsx_vadd_h(dstHi, maskHi);

    // Merge into one LSX register with sixteen 8-bit values (four pixels),
    // clamping to 255 if necessary.
    __m128i tmpl = __lsx_vsat_hu(resultLo, 7);
    __m128i tmph = __lsx_vsat_hu(resultHi, 7);
    return __lsx_vpickev_b(tmph, tmpl);
}
1223
// Blends a row of `width` N32 dst pixels with color `src` through the 16-bit
// LCD coverage `mask`, four pixels at a time with LSX. The scalar
// blend_lcd16() handles the unaligned head and the sub-4-pixel tail.
// The trailing SkPMColor parameter (pre-blended opaque dst) is unused here.
void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[], SkColor src, int width, SkPMColor) {
    if (width <= 0) {
        return;
    }

    int srcA = SkColorGetA(src);
    int srcR = SkColorGetR(src);
    int srcG = SkColorGetG(src);
    int srcB = SkColorGetB(src);
    __m128i v_zero = __lsx_vldi(0);

    // Bias alpha from [0,255] to [0,256] so blending can shift instead of divide.
    srcA = SkAlpha255To256(srcA);
    if (width >= 4) {
        SkASSERT(SkIsAlign4((uintptr_t) dst));
        // Blend scalar pixels until dst is 16-byte aligned (at most 3 steps).
        while (!SkIsAlign16((uintptr_t) dst)) {
            *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
            mask++;
            dst++;
            width--;
        }

        __m128i *d = reinterpret_cast<__m128i*>(dst);
        // Set alpha to 0xFF and replicate source four times in LSX register.
        unsigned int skpackargb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
        __m128i src_lsx = __lsx_vreplgr2vr_w(skpackargb32);
        // Interleave with zeros to get two sets of four 16-bit values.
        // src_lsx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        src_lsx = __lsx_vilvl_b(v_zero, src_lsx);
        // Set srcA_lsx to contain eight copies of srcA as 16-bit lanes.
        __m128i srcA_lsx = __lsx_vreplgr2vr_h(srcA);

        while (width >= 4) {
            // Load four destination pixels into dst_lsx.
            __m128i dst_lsx = __lsx_vld(d, 0);
            // Load four 16-bit masks (8 bytes) into lower half of mask_lsx.
            __m128i mask_lsx = __lsx_vldrepl_d((void *)mask, 0);
            mask_lsx = __lsx_vilvl_d(v_zero, mask_lsx);

            // __lsx_bz_v returns 1 when the whole vector is zero.
            int pack_cmp = __lsx_bz_v(mask_lsx);
            // if mask pixels are not all zero, we will blend the dst pixels
            if (pack_cmp != 1) {
                // Unpack 4 16bit mask pixels to
                // mask_lsx = (m0RGBLo, m0RGBHi, 0, 0, m1RGBLo, m1RGBHi, 0, 0,
                //             m2RGBLo, m2RGBHi, 0, 0, m3RGBLo, m3RGBHi, 0, 0)
                mask_lsx = __lsx_vilvl_h(v_zero, mask_lsx);

                // Process 4 32bit dst pixels
                __m128i result = blend_lcd16_lsx(src_lsx, dst_lsx, mask_lsx, srcA_lsx);
                __lsx_vst(result, d, 0);
            }

            d++;
            mask += 4;
            width -= 4;
        }

        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar tail for the remaining (width % 4) pixels.
    while (width > 0) {
        *dst = blend_lcd16(srcA, srcR, srcG, srcB, *dst, *mask);
        mask++;
        dst++;
        width--;
    }
}
1290
// Opaque-color variant of blit_row_lcd16: blends a row of N32 dst pixels with
// an opaque `src` through the 16-bit LCD coverage `mask`, four pixels at a
// time with LSX. `opaqueDst` (the premultiplied src) is only used by the
// scalar blend_lcd16_opaque() head/tail path.
void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
                           SkColor src, int width, SkPMColor opaqueDst) {
    if (width <= 0) {
        return;
    }

    int srcR = SkColorGetR(src);
    int srcG = SkColorGetG(src);
    int srcB = SkColorGetB(src);
    __m128i v_zero = __lsx_vldi(0);

    if (width >= 4) {
        SkASSERT(SkIsAlign4((uintptr_t) dst));
        // Blend scalar pixels until dst is 16-byte aligned (at most 3 steps).
        while (!SkIsAlign16((uintptr_t) dst)) {
            *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
            mask++;
            dst++;
            width--;
        }

        __m128i *d = reinterpret_cast<__m128i*>(dst);
        // Set alpha to 0xFF and replicate source four times in LSX register.
        unsigned int sk_pack_argb32 = SkPackARGB32(0xFF, srcR, srcG, srcB);
        __m128i src_lsx = __lsx_vreplgr2vr_w(sk_pack_argb32);
        // Interleave with zeros to widen each channel to 16 bits.
        // src_lsx=(0xFF, 0, sR, 0, sG, 0, sB, 0, 0xFF, 0, sR, 0, sG, 0, sB, 0)
        src_lsx = __lsx_vilvl_b(v_zero, src_lsx);

        while (width >= 4) {
            // Load four destination pixels into dst_lsx.
            __m128i dst_lsx = __lsx_vld(d, 0);
            // Load four 16-bit masks (8 bytes) into lower half of mask_lsx.
            __m128i mask_lsx = __lsx_vldrepl_d((void *)(mask), 0);
            mask_lsx = __lsx_vilvl_d(v_zero, mask_lsx);

            // __lsx_bz_v returns 1 when the whole vector is zero.
            int pack_cmp = __lsx_bz_v(mask_lsx);
            // if mask pixels are not all zero, we will blend the dst pixels
            if (pack_cmp != 1) {
                // Unpack 4 16bit mask pixels, widening each to 32 bits.
                mask_lsx = __lsx_vilvl_h(v_zero, mask_lsx);

                // Process 4 32bit dst pixels
                __m128i result = blend_lcd16_opaque_lsx(src_lsx, dst_lsx, mask_lsx);
                __lsx_vst(result, d, 0);
            }
            d++;
            mask += 4;
            width -= 4;
        }

        dst = reinterpret_cast<SkPMColor*>(d);
    }

    // Scalar tail for the remaining (width % 4) pixels.
    while (width > 0) {
        *dst = blend_lcd16_opaque(srcR, srcG, srcB, *dst, *mask, opaqueDst);
        mask++;
        dst++;
        width--;
    }
}
1351
1352 #else
1353
blit_row_lcd16(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor)1354 static inline void blit_row_lcd16(SkPMColor dst[], const uint16_t mask[],
1355 SkColor src, int width, SkPMColor) {
1356 int srcA = SkColorGetA(src);
1357 int srcR = SkColorGetR(src);
1358 int srcG = SkColorGetG(src);
1359 int srcB = SkColorGetB(src);
1360
1361 srcA = SkAlpha255To256(srcA);
1362
1363 for (int i = 0; i < width; i++) {
1364 dst[i] = blend_lcd16(srcA, srcR, srcG, srcB, dst[i], mask[i]);
1365 }
1366 }
1367
blit_row_lcd16_opaque(SkPMColor dst[],const uint16_t mask[],SkColor src,int width,SkPMColor opaqueDst)1368 static inline void blit_row_lcd16_opaque(SkPMColor dst[], const uint16_t mask[],
1369 SkColor src, int width,
1370 SkPMColor opaqueDst) {
1371 int srcR = SkColorGetR(src);
1372 int srcG = SkColorGetG(src);
1373 int srcB = SkColorGetB(src);
1374
1375 for (int i = 0; i < width; i++) {
1376 dst[i] = blend_lcd16_opaque(srcR, srcG, srcB, dst[i], mask[i], opaqueDst);
1377 }
1378 }
1379
1380 #endif
1381
blit_color(const SkPixmap & device,const SkMask & mask,const SkIRect & clip,SkColor color)1382 static bool blit_color(const SkPixmap& device,
1383 const SkMask& mask,
1384 const SkIRect& clip,
1385 SkColor color) {
1386 int x = clip.fLeft,
1387 y = clip.fTop;
1388
1389 if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kA8_Format) {
1390 SkOpts::blit_mask_d32_a8(device.writable_addr32(x,y), device.rowBytes(),
1391 (const SkAlpha*)mask.getAddr(x,y), mask.fRowBytes,
1392 color, clip.width(), clip.height());
1393 return true;
1394 }
1395
1396 if (device.colorType() == kN32_SkColorType && mask.fFormat == SkMask::kLCD16_Format) {
1397 auto dstRow = device.writable_addr32(x,y);
1398 auto maskRow = (const uint16_t*)mask.getAddr(x,y);
1399
1400 auto blit_row = blit_row_lcd16;
1401 SkPMColor opaqueDst = 0; // ignored unless opaque
1402
1403 if (0xff == SkColorGetA(color)) {
1404 blit_row = blit_row_lcd16_opaque;
1405 opaqueDst = SkPreMultiplyColor(color);
1406 }
1407
1408 for (int height = clip.height(); height --> 0; ) {
1409 blit_row(dstRow, maskRow, color, clip.width(), opaqueDst);
1410
1411 dstRow = (SkPMColor*) (( char*) dstRow + device.rowBytes());
1412 maskRow = (const uint16_t*)((const char*)maskRow + mask.fRowBytes);
1413 }
1414 return true;
1415 }
1416
1417 return false;
1418 }
1419
1420 ///////////////////////////////////////////////////////////////////////////////
1421
SkARGB32_Blit32(const SkPixmap & device,const SkMask & mask,const SkIRect & clip,SkPMColor srcColor)1422 static void SkARGB32_Blit32(const SkPixmap& device, const SkMask& mask,
1423 const SkIRect& clip, SkPMColor srcColor) {
1424 U8CPU alpha = SkGetPackedA32(srcColor);
1425 unsigned flags = SkBlitRow::kSrcPixelAlpha_Flag32;
1426 if (alpha != 255) {
1427 flags |= SkBlitRow::kGlobalAlpha_Flag32;
1428 }
1429 SkBlitRow::Proc32 proc = SkBlitRow::Factory32(flags);
1430
1431 int x = clip.fLeft;
1432 int y = clip.fTop;
1433 int width = clip.width();
1434 int height = clip.height();
1435
1436 SkPMColor* dstRow = device.writable_addr32(x, y);
1437 const SkPMColor* srcRow = reinterpret_cast<const SkPMColor*>(mask.getAddr8(x, y));
1438
1439 do {
1440 proc(dstRow, srcRow, width, alpha);
1441 dstRow = (SkPMColor*)((char*)dstRow + device.rowBytes());
1442 srcRow = (const SkPMColor*)((const char*)srcRow + mask.fRowBytes);
1443 } while (--height != 0);
1444 }
1445
1446 //////////////////////////////////////////////////////////////////////////////////////
1447
SkARGB32_Blitter(const SkPixmap & device,const SkPaint & paint)1448 SkARGB32_Blitter::SkARGB32_Blitter(const SkPixmap& device, const SkPaint& paint)
1449 : INHERITED(device) {
1450 SkColor color = paint.getColor();
1451 fColor = color;
1452
1453 fSrcA = SkColorGetA(color);
1454 unsigned scale = SkAlpha255To256(fSrcA);
1455 fSrcR = SkAlphaMul(SkColorGetR(color), scale);
1456 fSrcG = SkAlphaMul(SkColorGetG(color), scale);
1457 fSrcB = SkAlphaMul(SkColorGetB(color), scale);
1458
1459 fPMColor = SkPackARGB32(fSrcA, fSrcR, fSrcG, fSrcB);
1460 }
1461
1462 #if defined _WIN32 // disable warning : local variable used without having been initialized
1463 #pragma warning ( push )
1464 #pragma warning ( disable : 4701 )
1465 #endif
1466
blitH(int x,int y,int width)1467 void SkARGB32_Blitter::blitH(int x, int y, int width) {
1468 SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());
1469
1470 uint32_t* device = fDevice.writable_addr32(x, y);
1471 SkBlitRow::Color32(device, width, fPMColor);
1472 }
1473
blitAntiH(int x,int y,const SkAlpha antialias[],const int16_t runs[])1474 void SkARGB32_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
1475 const int16_t runs[]) {
1476 if (fSrcA == 0) {
1477 return;
1478 }
1479
1480 uint32_t color = fPMColor;
1481 uint32_t* device = fDevice.writable_addr32(x, y);
1482 unsigned opaqueMask = fSrcA; // if fSrcA is 0xFF, then we will catch the fast opaque case
1483
1484 for (;;) {
1485 int count = runs[0];
1486 SkASSERT(count >= 0);
1487 if (count <= 0) {
1488 return;
1489 }
1490 unsigned aa = antialias[0];
1491 if (aa) {
1492 if ((opaqueMask & aa) == 255) {
1493 SkOpts::memset32(device, color, count);
1494 } else {
1495 uint32_t sc = SkAlphaMulQ(color, SkAlpha255To256(aa));
1496 SkBlitRow::Color32(device, count, sc);
1497 }
1498 }
1499 runs += count;
1500 antialias += count;
1501 device += count;
1502 }
1503 }
1504
blitAntiH2(int x,int y,U8CPU a0,U8CPU a1)1505 void SkARGB32_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
1506 uint32_t* device = fDevice.writable_addr32(x, y);
1507 SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
1508
1509 device[0] = SkBlendARGB32(fPMColor, device[0], a0);
1510 device[1] = SkBlendARGB32(fPMColor, device[1], a1);
1511 }
1512
blitAntiV2(int x,int y,U8CPU a0,U8CPU a1)1513 void SkARGB32_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
1514 uint32_t* device = fDevice.writable_addr32(x, y);
1515 SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
1516
1517 device[0] = SkBlendARGB32(fPMColor, device[0], a0);
1518 device = (uint32_t*)((char*)device + fDevice.rowBytes());
1519 device[0] = SkBlendARGB32(fPMColor, device[0], a1);
1520 }
1521
1522 //////////////////////////////////////////////////////////////////////////////////////
1523
// Writes `color` into each of the 8 dst pixels whose bit is set in the 1-bit
// coverage `mask` (MSB = dst[0]). Used by the BW-mask template below.
#define solid_8_pixels(mask, dst, color)    \
    do {                                    \
        if (mask & 0x80) dst[0] = color;    \
        if (mask & 0x40) dst[1] = color;    \
        if (mask & 0x20) dst[2] = color;    \
        if (mask & 0x10) dst[3] = color;    \
        if (mask & 0x08) dst[4] = color;    \
        if (mask & 0x04) dst[5] = color;    \
        if (mask & 0x02) dst[6] = color;    \
        if (mask & 0x01) dst[7] = color;    \
    } while (0)

// Instantiate the 1-bit-mask blit template as SkARGB32_BlitBW, writing the
// solid color wherever the mask bit is set.
#define SK_BLITBWMASK_NAME                  SkARGB32_BlitBW
#define SK_BLITBWMASK_ARGS                  , SkPMColor color
#define SK_BLITBWMASK_BLIT8(mask, dst)      solid_8_pixels(mask, dst, color)
#define SK_BLITBWMASK_GETADDR               writable_addr32
#define SK_BLITBWMASK_DEVTYPE               uint32_t
#include "src/core/SkBlitBWMaskTemplate.h"
1542
// Src-over 8 consecutive dst pixels selected by the bits of an 8-bit mask:
// for each set bit, dst[i] = sc + SkAlphaMulQ(dst[i], dst_scale), where sc is
// the premultiplied source color and dst_scale its 0..256 inverse-alpha.
#define blend_8_pixels(mask, dst, sc, dst_scale)                            \
    do {                                                                    \
        if (mask & 0x80) { dst[0] = sc + SkAlphaMulQ(dst[0], dst_scale); }  \
        if (mask & 0x40) { dst[1] = sc + SkAlphaMulQ(dst[1], dst_scale); }  \
        if (mask & 0x20) { dst[2] = sc + SkAlphaMulQ(dst[2], dst_scale); }  \
        if (mask & 0x10) { dst[3] = sc + SkAlphaMulQ(dst[3], dst_scale); }  \
        if (mask & 0x08) { dst[4] = sc + SkAlphaMulQ(dst[4], dst_scale); }  \
        if (mask & 0x04) { dst[5] = sc + SkAlphaMulQ(dst[5], dst_scale); }  \
        if (mask & 0x02) { dst[6] = sc + SkAlphaMulQ(dst[6], dst_scale); }  \
        if (mask & 0x01) { dst[7] = sc + SkAlphaMulQ(dst[7], dst_scale); }  \
    } while (0)

// Instantiate SkARGB32_BlendBW(device, mask, clip, sc, dst_scale) from the
// shared BW-mask template: src-over blend for a translucent source color.
#define SK_BLITBWMASK_NAME                  SkARGB32_BlendBW
#define SK_BLITBWMASK_ARGS                  , uint32_t sc, unsigned dst_scale
#define SK_BLITBWMASK_BLIT8(mask, dst)      blend_8_pixels(mask, dst, sc, dst_scale)
#define SK_BLITBWMASK_GETADDR               writable_addr32
#define SK_BLITBWMASK_DEVTYPE               uint32_t
#include "src/core/SkBlitBWMaskTemplate.h"
1561
blitMask(const SkMask & mask,const SkIRect & clip)1562 void SkARGB32_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) {
1563 SkASSERT(mask.fBounds.contains(clip));
1564 SkASSERT(fSrcA != 0xFF);
1565
1566 if (fSrcA == 0) {
1567 return;
1568 }
1569
1570 if (blit_color(fDevice, mask, clip, fColor)) {
1571 return;
1572 }
1573
1574 switch (mask.fFormat) {
1575 case SkMask::kBW_Format:
1576 SkARGB32_BlendBW(fDevice, mask, clip, fPMColor, SkAlpha255To256(255 - fSrcA));
1577 break;
1578 case SkMask::kARGB32_Format:
1579 SkARGB32_Blit32(fDevice, mask, clip, fPMColor);
1580 break;
1581 default:
1582 SK_ABORT("Mask format not handled.");
1583 }
1584 }
1585
blitMask(const SkMask & mask,const SkIRect & clip)1586 void SkARGB32_Opaque_Blitter::blitMask(const SkMask& mask,
1587 const SkIRect& clip) {
1588 SkASSERT(mask.fBounds.contains(clip));
1589
1590 if (blit_color(fDevice, mask, clip, fColor)) {
1591 return;
1592 }
1593
1594 switch (mask.fFormat) {
1595 case SkMask::kBW_Format:
1596 SkARGB32_BlitBW(fDevice, mask, clip, fPMColor);
1597 break;
1598 case SkMask::kARGB32_Format:
1599 SkARGB32_Blit32(fDevice, mask, clip, fPMColor);
1600 break;
1601 default:
1602 SK_ABORT("Mask format not handled.");
1603 }
1604 }
1605
blitAntiH2(int x,int y,U8CPU a0,U8CPU a1)1606 void SkARGB32_Opaque_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
1607 uint32_t* device = fDevice.writable_addr32(x, y);
1608 SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
1609
1610 device[0] = SkFastFourByteInterp(fPMColor, device[0], a0);
1611 device[1] = SkFastFourByteInterp(fPMColor, device[1], a1);
1612 }
1613
blitAntiV2(int x,int y,U8CPU a0,U8CPU a1)1614 void SkARGB32_Opaque_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
1615 uint32_t* device = fDevice.writable_addr32(x, y);
1616 SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
1617
1618 device[0] = SkFastFourByteInterp(fPMColor, device[0], a0);
1619 device = (uint32_t*)((char*)device + fDevice.rowBytes());
1620 device[0] = SkFastFourByteInterp(fPMColor, device[0], a1);
1621 }
1622
1623 ///////////////////////////////////////////////////////////////////////////////
1624
blitV(int x,int y,int height,SkAlpha alpha)1625 void SkARGB32_Blitter::blitV(int x, int y, int height, SkAlpha alpha) {
1626 if (alpha == 0 || fSrcA == 0) {
1627 return;
1628 }
1629
1630 uint32_t* device = fDevice.writable_addr32(x, y);
1631 uint32_t color = fPMColor;
1632
1633 if (alpha != 255) {
1634 color = SkAlphaMulQ(color, SkAlpha255To256(alpha));
1635 }
1636
1637 unsigned dst_scale = SkAlpha255To256(255 - SkGetPackedA32(color));
1638 size_t rowBytes = fDevice.rowBytes();
1639 while (--height >= 0) {
1640 device[0] = color + SkAlphaMulQ(device[0], dst_scale);
1641 device = (uint32_t*)((char*)device + rowBytes);
1642 }
1643 }
1644
blitRect(int x,int y,int width,int height)1645 void SkARGB32_Blitter::blitRect(int x, int y, int width, int height) {
1646 SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width() && y + height <= fDevice.height());
1647
1648 if (fSrcA == 0) {
1649 return;
1650 }
1651
1652 uint32_t* device = fDevice.writable_addr32(x, y);
1653 uint32_t color = fPMColor;
1654 size_t rowBytes = fDevice.rowBytes();
1655
1656 if (SkGetPackedA32(fPMColor) == 0xFF) {
1657 SkOpts::rect_memset32(device, color, width, rowBytes, height);
1658 } else {
1659 while (height --> 0) {
1660 SkBlitRow::Color32(device, width, color);
1661 device = (uint32_t*)((char*)device + rowBytes);
1662 }
1663 }
1664 }
1665
1666 #if defined _WIN32
1667 #pragma warning ( pop )
1668 #endif
1669
1670 ///////////////////////////////////////////////////////////////////////
1671
blitAntiH(int x,int y,const SkAlpha antialias[],const int16_t runs[])1672 void SkARGB32_Black_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
1673 const int16_t runs[]) {
1674 uint32_t* device = fDevice.writable_addr32(x, y);
1675 SkPMColor black = (SkPMColor)(SK_A32_MASK << SK_A32_SHIFT);
1676
1677 for (;;) {
1678 int count = runs[0];
1679 SkASSERT(count >= 0);
1680 if (count <= 0) {
1681 return;
1682 }
1683 unsigned aa = antialias[0];
1684 if (aa) {
1685 if (aa == 255) {
1686 SkOpts::memset32(device, black, count);
1687 } else {
1688 SkPMColor src = aa << SK_A32_SHIFT;
1689 unsigned dst_scale = 256 - aa;
1690 int n = count;
1691 do {
1692 --n;
1693 device[n] = src + SkAlphaMulQ(device[n], dst_scale);
1694 } while (n > 0);
1695 }
1696 }
1697 runs += count;
1698 antialias += count;
1699 device += count;
1700 }
1701 }
1702
blitAntiH2(int x,int y,U8CPU a0,U8CPU a1)1703 void SkARGB32_Black_Blitter::blitAntiH2(int x, int y, U8CPU a0, U8CPU a1) {
1704 uint32_t* device = fDevice.writable_addr32(x, y);
1705 SkDEBUGCODE((void)fDevice.writable_addr32(x + 1, y);)
1706
1707 device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0);
1708 device[1] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[1], 256 - a1);
1709 }
1710
blitAntiV2(int x,int y,U8CPU a0,U8CPU a1)1711 void SkARGB32_Black_Blitter::blitAntiV2(int x, int y, U8CPU a0, U8CPU a1) {
1712 uint32_t* device = fDevice.writable_addr32(x, y);
1713 SkDEBUGCODE((void)fDevice.writable_addr32(x, y + 1);)
1714
1715 device[0] = (a0 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a0);
1716 device = (uint32_t*)((char*)device + fDevice.rowBytes());
1717 device[0] = (a1 << SK_A32_SHIFT) + SkAlphaMulQ(device[0], 256 - a1);
1718 }
1719
1720 ///////////////////////////////////////////////////////////////////////////////
1721
SkARGB32_Shader_Blitter::SkARGB32_Shader_Blitter(const SkPixmap& device,
        const SkPaint& paint, SkShaderBase::Context* shaderContext)
    : INHERITED(device, paint, shaderContext)
{
    // Scratch span for shader output: one SkPMColor per device column.
    fBuffer = (SkPMColor*)sk_malloc_throw(device.width() * (sizeof(SkPMColor)));

    // This blitter only implements src-over compositing.
    SkASSERT(paint.isSrcOver());

    int flags = 0;
    if (!(shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) {
        // Shader may emit non-opaque pixels, so the row proc must honor src alpha.
        flags |= SkBlitRow::kSrcPixelAlpha_Flag32;
    }
    // we call this on the output from the shader
    fProc32 = SkBlitRow::Factory32(flags);
    // we call this on the output from the shader + alpha from the aa buffer
    fProc32Blend = SkBlitRow::Factory32(flags | SkBlitRow::kGlobalAlpha_Flag32);

    // Opaque shader output can be written straight into the device,
    // skipping the intermediate fBuffer pass.
    fShadeDirectlyIntoDevice =
            SkToBool(shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag);
}
1742
SkARGB32_Shader_Blitter::~SkARGB32_Shader_Blitter() {
    // fBuffer was allocated with sk_malloc_throw in the constructor.
    sk_free(fBuffer);
}
1746
blitH(int x,int y,int width)1747 void SkARGB32_Shader_Blitter::blitH(int x, int y, int width) {
1748 SkASSERT(x >= 0 && y >= 0 && x + width <= fDevice.width());
1749
1750 uint32_t* device = fDevice.writable_addr32(x, y);
1751
1752 if (fShadeDirectlyIntoDevice) {
1753 fShaderContext->shadeSpan(x, y, device, width);
1754 } else {
1755 SkPMColor* span = fBuffer;
1756 fShaderContext->shadeSpan(x, y, span, width);
1757 fProc32(device, span, width, 255);
1758 }
1759 }
1760
blitRect(int x,int y,int width,int height)1761 void SkARGB32_Shader_Blitter::blitRect(int x, int y, int width, int height) {
1762 SkASSERT(x >= 0 && y >= 0 &&
1763 x + width <= fDevice.width() && y + height <= fDevice.height());
1764
1765 uint32_t* device = fDevice.writable_addr32(x, y);
1766 size_t deviceRB = fDevice.rowBytes();
1767 auto* shaderContext = fShaderContext;
1768 SkPMColor* span = fBuffer;
1769
1770 if (fShadeDirectlyIntoDevice) {
1771 do {
1772 shaderContext->shadeSpan(x, y, device, width);
1773 y += 1;
1774 device = (uint32_t*)((char*)device + deviceRB);
1775 } while (--height > 0);
1776 } else {
1777 SkBlitRow::Proc32 proc = fProc32;
1778 do {
1779 shaderContext->shadeSpan(x, y, span, width);
1780 proc(device, span, width, 255);
1781 y += 1;
1782 device = (uint32_t*)((char*)device + deviceRB);
1783 } while (--height > 0);
1784 }
1785 }
1786
blitAntiH(int x,int y,const SkAlpha antialias[],const int16_t runs[])1787 void SkARGB32_Shader_Blitter::blitAntiH(int x, int y, const SkAlpha antialias[],
1788 const int16_t runs[]) {
1789 SkPMColor* span = fBuffer;
1790 uint32_t* device = fDevice.writable_addr32(x, y);
1791 auto* shaderContext = fShaderContext;
1792
1793 if (fShadeDirectlyIntoDevice || (shaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag)) {
1794 for (;;) {
1795 int count = *runs;
1796 if (count <= 0) {
1797 break;
1798 }
1799 int aa = *antialias;
1800 if (aa) {
1801 if (aa == 255) {
1802 // cool, have the shader draw right into the device
1803 shaderContext->shadeSpan(x, y, device, count);
1804 } else {
1805 shaderContext->shadeSpan(x, y, span, count);
1806 fProc32Blend(device, span, count, aa);
1807 }
1808 }
1809 device += count;
1810 runs += count;
1811 antialias += count;
1812 x += count;
1813 }
1814 } else {
1815 for (;;) {
1816 int count = *runs;
1817 if (count <= 0) {
1818 break;
1819 }
1820 int aa = *antialias;
1821 if (aa) {
1822 shaderContext->shadeSpan(x, y, span, count);
1823 if (aa == 255) {
1824 fProc32(device, span, count, 255);
1825 } else {
1826 fProc32Blend(device, span, count, aa);
1827 }
1828 }
1829 device += count;
1830 runs += count;
1831 antialias += count;
1832 x += count;
1833 }
1834 }
1835 }
1836
// SIMD lane types for the mask-blending helpers below: U32 holds four 32-bit
// pixels, U8x4 the same four pixels viewed as 16 bytes, and U8 four 8-bit
// coverage values (one per pixel).
using U32  = skvx::Vec< 4, uint32_t>;
using U8x4 = skvx::Vec<16, uint8_t>;
using U8   = skvx::Vec< 4, uint8_t>;
1840
drive(SkPMColor * dst,const SkPMColor * src,const uint8_t * cov,int n,U8x4 (* kernel)(U8x4,U8x4,U8x4))1841 static void drive(SkPMColor* dst, const SkPMColor* src, const uint8_t* cov, int n,
1842 U8x4 (*kernel)(U8x4,U8x4,U8x4)) {
1843
1844 auto apply = [kernel](U32 dst, U32 src, U8 cov) -> U32 {
1845 U8x4 cov_splat = skvx::shuffle<0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3>(cov);
1846 return sk_bit_cast<U32>(kernel(sk_bit_cast<U8x4>(dst),
1847 sk_bit_cast<U8x4>(src),
1848 cov_splat));
1849 };
1850 while (n >= 4) {
1851 apply(U32::Load(dst), U32::Load(src), U8::Load(cov)).store(dst);
1852 dst += 4;
1853 src += 4;
1854 cov += 4;
1855 n -= 4;
1856 }
1857 while (n --> 0) {
1858 *dst = apply(U32{*dst}, U32{*src}, U8{*cov})[0];
1859 dst++;
1860 src++;
1861 cov++;
1862 }
1863 }
1864
// Src-over a shaded span onto dst through an A8 coverage mask, for shader
// output that may be non-opaque: src is first scaled by coverage, then
// composited using the scaled src's own alpha.
static void blend_row_A8(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) {
    auto cov = (const uint8_t*)mask;
    drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) {
        // s_aa = src * coverage; alpha = s_aa's alpha byte splatted per channel.
        U8x4 s_aa = skvx::approx_scale(s, c),
             alpha = skvx::shuffle<3,3,3,3, 7,7,7,7, 11,11,11,11, 15,15,15,15>(s_aa);
        return s_aa + skvx::approx_scale(d, 255 - alpha);
    });
}
1873
// A8 mask blend for opaque shader output: a per-channel lerp between src and
// dst by coverage, dst' = (src*c + dst*(255-c)) / 255.
static void blend_row_A8_opaque(SkPMColor* dst, const void* mask, const SkPMColor* src, int n) {
    auto cov = (const uint8_t*)mask;
    drive(dst, src, cov, n, [](U8x4 d, U8x4 s, U8x4 c) {
        // Widen to 16 bits so the byte products can't overflow before div255.
        return skvx::div255( skvx::cast<uint16_t>(s) * skvx::cast<uint16_t>( c )
                           + skvx::cast<uint16_t>(d) * skvx::cast<uint16_t>(255-c));
    });
}
1881
// LCD (subpixel) blend of a shaded span onto dst for possibly non-opaque
// shader output. The 565 mask provides independent R/G/B coverage; each
// channel is blended separately. Requires an opaque destination.
static void blend_row_lcd16(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
    // dst' = d + m*(s - sa*d): src-over of one channel with coverage m.
    auto src_alpha_blend = [](int s, int d, int sa, int m) {
        return d + SkAlphaMul(s - SkAlphaMul(sa, d), m);
    };

    // Map 0..31 onto the full 0..255 range by replicating the top bits.
    auto upscale_31_to_255 = [](int v) {
        return (v << 3) | (v >> 2);
    };

    auto mask = (const uint16_t*)vmask;
    for (int i = 0; i < n; ++i) {
        uint16_t m = mask[i];
        if (0 == m) {
            continue;   // zero coverage: leave dst untouched
        }

        SkPMColor s = src[i];
        SkPMColor d = dst[i];

        int srcA = SkGetPackedA32(s);
        int srcR = SkGetPackedR32(s);
        int srcG = SkGetPackedG32(s);
        int srcB = SkGetPackedB32(s);

        // Round alpha up from 0..255 to 0..256 (255 maps to 256).
        srcA += srcA >> 7;

        // We're ignoring the least significant bit of the green coverage channel here.
        int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5);
        int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
        int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);

        // Scale up to 8-bit coverage to work with SkAlphaMul() in src_alpha_blend().
        maskR = upscale_31_to_255(maskR);
        maskG = upscale_31_to_255(maskG);
        maskB = upscale_31_to_255(maskB);

        // This LCD blit routine only works if the destination is opaque.
        dst[i] = SkPackARGB32(0xFF,
                              src_alpha_blend(srcR, SkGetPackedR32(d), srcA, maskR),
                              src_alpha_blend(srcG, SkGetPackedG32(d), srcA, maskG),
                              src_alpha_blend(srcB, SkGetPackedB32(d), srcA, maskB));
    }
}
1925
blend_row_LCD16_opaque(SkPMColor * dst,const void * vmask,const SkPMColor * src,int n)1926 static void blend_row_LCD16_opaque(SkPMColor* dst, const void* vmask, const SkPMColor* src, int n) {
1927 auto mask = (const uint16_t*)vmask;
1928
1929 for (int i = 0; i < n; ++i) {
1930 uint16_t m = mask[i];
1931 if (0 == m) {
1932 continue;
1933 }
1934
1935 SkPMColor s = src[i];
1936 SkPMColor d = dst[i];
1937
1938 int srcR = SkGetPackedR32(s);
1939 int srcG = SkGetPackedG32(s);
1940 int srcB = SkGetPackedB32(s);
1941
1942 // We're ignoring the least significant bit of the green coverage channel here.
1943 int maskR = SkGetPackedR16(m) >> (SK_R16_BITS - 5);
1944 int maskG = SkGetPackedG16(m) >> (SK_G16_BITS - 5);
1945 int maskB = SkGetPackedB16(m) >> (SK_B16_BITS - 5);
1946
1947 // Now upscale them to 0..32, so we can use blend_32.
1948 maskR = upscale_31_to_32(maskR);
1949 maskG = upscale_31_to_32(maskG);
1950 maskB = upscale_31_to_32(maskB);
1951
1952 // This LCD blit routine only works if the destination is opaque.
1953 dst[i] = SkPackARGB32(0xFF,
1954 blend_32(srcR, SkGetPackedR32(d), maskR),
1955 blend_32(srcG, SkGetPackedG32(d), maskG),
1956 blend_32(srcB, SkGetPackedB32(d), maskB));
1957 }
1958 }
1959
blitMask(const SkMask & mask,const SkIRect & clip)1960 void SkARGB32_Shader_Blitter::blitMask(const SkMask& mask, const SkIRect& clip) {
1961 SkASSERT(mask.fBounds.contains(clip));
1962
1963 void (*blend_row)(SkPMColor*, const void* mask, const SkPMColor*, int) = nullptr;
1964
1965 bool opaque = (fShaderContext->getFlags() & SkShaderBase::kOpaqueAlpha_Flag);
1966
1967 if (mask.fFormat == SkMask::kA8_Format && opaque) {
1968 blend_row = blend_row_A8_opaque;
1969 } else if (mask.fFormat == SkMask::kA8_Format) {
1970 blend_row = blend_row_A8;
1971 } else if (mask.fFormat == SkMask::kLCD16_Format && opaque) {
1972 blend_row = blend_row_LCD16_opaque;
1973 } else if (mask.fFormat == SkMask::kLCD16_Format) {
1974 blend_row = blend_row_lcd16;
1975 } else {
1976 this->INHERITED::blitMask(mask, clip);
1977 return;
1978 }
1979
1980 const int x = clip.fLeft;
1981 const int width = clip.width();
1982 int y = clip.fTop;
1983 int height = clip.height();
1984
1985 char* dstRow = (char*)fDevice.writable_addr32(x, y);
1986 const size_t dstRB = fDevice.rowBytes();
1987 const uint8_t* maskRow = (const uint8_t*)mask.getAddr(x, y);
1988 const size_t maskRB = mask.fRowBytes;
1989
1990 SkPMColor* span = fBuffer;
1991 SkASSERT(blend_row);
1992 do {
1993 fShaderContext->shadeSpan(x, y, span, width);
1994 blend_row(reinterpret_cast<SkPMColor*>(dstRow), maskRow, span, width);
1995 dstRow += dstRB;
1996 maskRow += maskRB;
1997 y += 1;
1998 } while (--height > 0);
1999 }
2000
blitV(int x,int y,int height,SkAlpha alpha)2001 void SkARGB32_Shader_Blitter::blitV(int x, int y, int height, SkAlpha alpha) {
2002 SkASSERT(x >= 0 && y >= 0 && y + height <= fDevice.height());
2003
2004 uint32_t* device = fDevice.writable_addr32(x, y);
2005 size_t deviceRB = fDevice.rowBytes();
2006
2007 if (fShadeDirectlyIntoDevice) {
2008 if (255 == alpha) {
2009 do {
2010 fShaderContext->shadeSpan(x, y, device, 1);
2011 y += 1;
2012 device = (uint32_t*)((char*)device + deviceRB);
2013 } while (--height > 0);
2014 } else {
2015 do {
2016 SkPMColor c;
2017 fShaderContext->shadeSpan(x, y, &c, 1);
2018 *device = SkFourByteInterp(c, *device, alpha);
2019 y += 1;
2020 device = (uint32_t*)((char*)device + deviceRB);
2021 } while (--height > 0);
2022 }
2023 } else {
2024 SkPMColor* span = fBuffer;
2025 SkBlitRow::Proc32 proc = (255 == alpha) ? fProc32 : fProc32Blend;
2026 do {
2027 fShaderContext->shadeSpan(x, y, span, 1);
2028 proc(device, span, 1, alpha);
2029 y += 1;
2030 device = (uint32_t*)((char*)device + deviceRB);
2031 } while (--height > 0);
2032 }
2033 }
2034