1 /*
2 * Copyright 2018 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #ifndef SkRasterPipeline_opts_DEFINED
9 #define SkRasterPipeline_opts_DEFINED
10
11 #include "include/core/SkTypes.h"
12 #include "include/private/base/SkMalloc.h"
13 #include "include/private/base/SkSpan_impl.h"
14 #include "include/private/base/SkTemplates.h"
15 #include "modules/skcms/skcms.h"
16 #include "src/base/SkUtils.h" // unaligned_{load,store}
17 #include "src/core/SkRasterPipeline.h"
18 #include "src/core/SkRasterPipelineContextUtils.h"
19 #include "src/shaders/SkPerlinNoiseShaderType.h"
20 #include "src/sksl/tracing/SkSLTraceHook.h"
21
22 #include <cstdint>
23 #include <type_traits>
24
25 // Every function in this file should be marked static and inline using SI.
26 #if defined(__clang__) || defined(__GNUC__)
27 #define SI __attribute__((always_inline)) static inline
28 #else
29 #define SI static inline
30 #endif
31
32 #if defined(__clang__)
33 #define SK_UNROLL _Pragma("unroll")
34 #else
35 #define SK_UNROLL
36 #endif
37
38 #if defined(__clang__)
39 template <int N, typename T> using Vec = T __attribute__((ext_vector_type(N)));
40 #elif defined(__GNUC__)
41 // Unfortunately, GCC does not allow us to omit the struct. This will not compile:
42 // template <int N, typename T> using Vec = T __attribute__((vector_size(N*sizeof(T))));
43 template <int N, typename T> struct VecHelper {
44 typedef T __attribute__((vector_size(N * sizeof(T)))) V;
45 };
46 template <int N, typename T> using Vec = typename VecHelper<N, T>::V;
47 #endif
48
49 template <typename Dst, typename Src>
widen_cast(const Src & src)50 SI Dst widen_cast(const Src& src) {
51 static_assert(sizeof(Dst) > sizeof(Src));
52 static_assert(std::is_trivially_copyable<Dst>::value);
53 static_assert(std::is_trivially_copyable<Src>::value);
54 Dst dst;
55 memcpy(&dst, &src, sizeof(Src));
56 return dst;
57 }
58
59 struct Ctx {
60 SkRasterPipelineStage* fStage;
61
62 template <typename T>
63 operator T*() {
64 return (T*)fStage->ctx;
65 }
66 };
67
68 using NoCtx = const void*;
69
70 #if defined(SKRP_CPU_SCALAR) || defined(SKRP_CPU_NEON) || defined(SKRP_CPU_HSW) || \
71 defined(SKRP_CPU_SKX) || defined(SKRP_CPU_AVX) || defined(SKRP_CPU_SSE41) || \
72 defined(SKRP_CPU_SSE2)
73 // Honor the existing setting
74 #elif !defined(__clang__) && !defined(__GNUC__)
75 #define SKRP_CPU_SCALAR
76 #elif defined(SK_ARM_HAS_NEON)
77 #define SKRP_CPU_NEON
78 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
79 #define SKRP_CPU_SKX
80 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
81 #define SKRP_CPU_HSW
82 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
83 #define SKRP_CPU_AVX
84 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE41
85 #define SKRP_CPU_SSE41
86 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE2
87 #define SKRP_CPU_SSE2
88 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LASX
89 #define SKRP_CPU_LASX
90 #elif SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
91 #define SKRP_CPU_LSX
92 #else
93 #define SKRP_CPU_SCALAR
94 #endif
95
96 #if defined(SKRP_CPU_SCALAR)
97 #include <math.h>
98 #elif defined(SKRP_CPU_NEON)
99 #include <arm_neon.h>
100 #elif defined(SKRP_CPU_LASX)
101 #include <lasxintrin.h>
102 #include <lsxintrin.h>
103 #elif defined(SKRP_CPU_LSX)
104 #include <lsxintrin.h>
105 #else
106 #include <immintrin.h>
107 #endif
108
109 // Notes:
110 // * rcp_fast and rcp_precise both produce a reciprocal, but rcp_fast is an estimate with at least
111 // 12 bits of precision while rcp_precise should be accurate for float size. For ARM rcp_precise
112 // requires 2 Newton-Raphson refinement steps because its estimate has 8 bit precision, and for
113 // Intel this requires one additional step because its estimate has 12 bit precision.
114 //
115 // * Don't call rcp_approx or rsqrt_approx directly; only use rcp_fast and rsqrt.
116
117 namespace SK_OPTS_NS {
118 #if defined(SKRP_CPU_SCALAR)
119 // This path should lead to portable scalar code.
120 using F = float ;
121 using I32 = int32_t;
122 using U64 = uint64_t;
123 using U32 = uint32_t;
124 using U16 = uint16_t;
125 using U8 = uint8_t ;
126
min(F a,F b)127 SI F min(F a, F b) { return fminf(a,b); }
min(I32 a,I32 b)128 SI I32 min(I32 a, I32 b) { return a < b ? a : b; }
min(U32 a,U32 b)129 SI U32 min(U32 a, U32 b) { return a < b ? a : b; }
max(F a,F b)130 SI F max(F a, F b) { return fmaxf(a,b); }
max(I32 a,I32 b)131 SI I32 max(I32 a, I32 b) { return a > b ? a : b; }
max(U32 a,U32 b)132 SI U32 max(U32 a, U32 b) { return a > b ? a : b; }
133
mad(F f,F m,F a)134 SI F mad(F f, F m, F a) { return a+f*m; }
nmad(F f,F m,F a)135 SI F nmad(F f, F m, F a) { return a-f*m; }
abs_(F v)136 SI F abs_ (F v) { return fabsf(v); }
abs_(I32 v)137 SI I32 abs_ (I32 v) { return v < 0 ? -v : v; }
floor_(F v)138 SI F floor_(F v) { return floorf(v); }
ceil_(F v)139 SI F ceil_(F v) { return ceilf(v); }
rcp_approx(F v)140 SI F rcp_approx(F v) { return 1.0f / v; } // use rcp_fast instead
rsqrt_approx(F v)141 SI F rsqrt_approx(F v) { return 1.0f / sqrtf(v); }
sqrt_(F v)142 SI F sqrt_ (F v) { return sqrtf(v); }
rcp_precise(F v)143 SI F rcp_precise (F v) { return 1.0f / v; }
144
iround(F v)145 SI I32 iround(F v) { return (I32)(v + 0.5f); }
round(F v)146 SI U32 round(F v) { return (U32)(v + 0.5f); }
round(F v,F scale)147 SI U32 round(F v, F scale) { return (U32)(v*scale + 0.5f); }
pack(U32 v)148 SI U16 pack(U32 v) { return (U16)v; }
pack(U16 v)149 SI U8 pack(U16 v) { return (U8)v; }
150
if_then_else(I32 c,F t,F e)151 SI F if_then_else(I32 c, F t, F e) { return c ? t : e; }
if_then_else(I32 c,I32 t,I32 e)152 SI I32 if_then_else(I32 c, I32 t, I32 e) { return c ? t : e; }
153
any(I32 c)154 SI bool any(I32 c) { return c != 0; }
all(I32 c)155 SI bool all(I32 c) { return c != 0; }
156
157 template <typename T>
gather(const T * p,U32 ix)158 SI T gather(const T* p, U32 ix) { return p[ix]; }
159
scatter_masked(I32 src,int * dst,U32 ix,I32 mask)160 SI void scatter_masked(I32 src, int* dst, U32 ix, I32 mask) {
161 dst[ix] = mask ? src : dst[ix];
162 }
163
load2(const uint16_t * ptr,U16 * r,U16 * g)164 SI void load2(const uint16_t* ptr, U16* r, U16* g) {
165 *r = ptr[0];
166 *g = ptr[1];
167 }
store2(uint16_t * ptr,U16 r,U16 g)168 SI void store2(uint16_t* ptr, U16 r, U16 g) {
169 ptr[0] = r;
170 ptr[1] = g;
171 }
load4(const uint16_t * ptr,U16 * r,U16 * g,U16 * b,U16 * a)172 SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
173 *r = ptr[0];
174 *g = ptr[1];
175 *b = ptr[2];
176 *a = ptr[3];
177 }
store4(uint16_t * ptr,U16 r,U16 g,U16 b,U16 a)178 SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
179 ptr[0] = r;
180 ptr[1] = g;
181 ptr[2] = b;
182 ptr[3] = a;
183 }
184
load4(const float * ptr,F * r,F * g,F * b,F * a)185 SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
186 *r = ptr[0];
187 *g = ptr[1];
188 *b = ptr[2];
189 *a = ptr[3];
190 }
store4(float * ptr,F r,F g,F b,F a)191 SI void store4(float* ptr, F r, F g, F b, F a) {
192 ptr[0] = r;
193 ptr[1] = g;
194 ptr[2] = b;
195 ptr[3] = a;
196 }
197
198 #elif defined(SKRP_CPU_NEON)
199 template <typename T> using V = Vec<4, T>;
200 using F = V<float >;
201 using I32 = V< int32_t>;
202 using U64 = V<uint64_t>;
203 using U32 = V<uint32_t>;
204 using U16 = V<uint16_t>;
205 using U8 = V<uint8_t >;
206
207 // We polyfill a few routines that Clang doesn't build into ext_vector_types.
208 SI F min(F a, F b) { return vminq_f32(a,b); }
209 SI I32 min(I32 a, I32 b) { return vminq_s32(a,b); }
210 SI U32 min(U32 a, U32 b) { return vminq_u32(a,b); }
211 SI F max(F a, F b) { return vmaxq_f32(a,b); }
212 SI I32 max(I32 a, I32 b) { return vmaxq_s32(a,b); }
213 SI U32 max(U32 a, U32 b) { return vmaxq_u32(a,b); }
214
215 SI F abs_ (F v) { return vabsq_f32(v); }
216 SI I32 abs_ (I32 v) { return vabsq_s32(v); }
217 SI F rcp_approx(F v) { auto e = vrecpeq_f32(v); return vrecpsq_f32 (v,e ) * e; }
218 SI F rcp_precise(F v) { auto e = rcp_approx(v); return vrecpsq_f32 (v,e ) * e; }
219 SI F rsqrt_approx(F v) { auto e = vrsqrteq_f32(v); return vrsqrtsq_f32(v,e*e) * e; }
220
221 SI U16 pack(U32 v) { return __builtin_convertvector(v, U16); }
222 SI U8 pack(U16 v) { return __builtin_convertvector(v, U8); }
223
224 SI F if_then_else(I32 c, F t, F e) { return vbslq_f32((U32)c,t,e); }
225 SI I32 if_then_else(I32 c, I32 t, I32 e) { return vbslq_s32((U32)c,t,e); }
226
227 #if defined(SK_CPU_ARM64)
228 SI bool any(I32 c) { return vmaxvq_u32((U32)c) != 0; }
229 SI bool all(I32 c) { return vminvq_u32((U32)c) != 0; }
230
231 SI F mad(F f, F m, F a) { return vfmaq_f32(a,f,m); }
232 SI F nmad(F f, F m, F a) { return vfmsq_f32(a,f,m); }
233 SI F floor_(F v) { return vrndmq_f32(v); }
234 SI F ceil_(F v) { return vrndpq_f32(v); }
235 SI F sqrt_(F v) { return vsqrtq_f32(v); }
236 SI I32 iround(F v) { return vcvtnq_s32_f32(v); }
237 SI U32 round(F v) { return vcvtnq_u32_f32(v); }
238 SI U32 round(F v, F scale) { return vcvtnq_u32_f32(v*scale); }
239 #else
240 SI bool any(I32 c) { return c[0] | c[1] | c[2] | c[3]; }
241 SI bool all(I32 c) { return c[0] & c[1] & c[2] & c[3]; }
242
243 SI F mad(F f, F m, F a) { return vmlaq_f32(a,f,m); }
244 SI F nmad(F f, F m, F a) { return vmlsq_f32(a,f,m); }
245
246 SI F floor_(F v) {
247 F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
248 return roundtrip - if_then_else(roundtrip > v, F() + 1, F());
249 }
250
251 SI F ceil_(F v) {
252 F roundtrip = vcvtq_f32_s32(vcvtq_s32_f32(v));
253 return roundtrip + if_then_else(roundtrip < v, F() + 1, F());
254 }
255
256 SI F sqrt_(F v) {
257 auto e = vrsqrteq_f32(v); // Estimate and two refinement steps for e = rsqrt(v).
258 e *= vrsqrtsq_f32(v,e*e);
259 e *= vrsqrtsq_f32(v,e*e);
260 return v*e; // sqrt(v) == v*rsqrt(v).
261 }
262
263 SI I32 iround(F v) {
264 return vcvtq_s32_f32(v + 0.5f);
265 }
266
267 SI U32 round(F v) {
268 return vcvtq_u32_f32(v + 0.5f);
269 }
270
271 SI U32 round(F v, F scale) {
272 return vcvtq_u32_f32(mad(v, scale, F() + 0.5f));
273 }
274 #endif
275
276 template <typename T>
277 SI V<T> gather(const T* p, U32 ix) {
278 return V<T>{p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
279 }
280 SI void scatter_masked(I32 src, int* dst, U32 ix, I32 mask) {
281 I32 before = gather(dst, ix);
282 I32 after = if_then_else(mask, src, before);
283 dst[ix[0]] = after[0];
284 dst[ix[1]] = after[1];
285 dst[ix[2]] = after[2];
286 dst[ix[3]] = after[3];
287 }
288 SI void load2(const uint16_t* ptr, U16* r, U16* g) {
289 uint16x4x2_t rg = vld2_u16(ptr);
290 *r = rg.val[0];
291 *g = rg.val[1];
292 }
293 SI void store2(uint16_t* ptr, U16 r, U16 g) {
294 vst2_u16(ptr, (uint16x4x2_t{{r,g}}));
295 }
296 SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
297 uint16x4x4_t rgba = vld4_u16(ptr);
298 *r = rgba.val[0];
299 *g = rgba.val[1];
300 *b = rgba.val[2];
301 *a = rgba.val[3];
302 }
303
304 SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
305 vst4_u16(ptr, (uint16x4x4_t{{r,g,b,a}}));
306 }
307 SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
308 float32x4x4_t rgba = vld4q_f32(ptr);
309 *r = rgba.val[0];
310 *g = rgba.val[1];
311 *b = rgba.val[2];
312 *a = rgba.val[3];
313 }
314 SI void store4(float* ptr, F r, F g, F b, F a) {
315 vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
316 }
317
318 #elif defined(SKRP_CPU_SKX)
319 template <typename T> using V = Vec<16, T>;
320 using F = V<float >;
321 using I32 = V< int32_t>;
322 using U64 = V<uint64_t>;
323 using U32 = V<uint32_t>;
324 using U16 = V<uint16_t>;
325 using U8 = V<uint8_t >;
326
327 SI F mad(F f, F m, F a) { return _mm512_fmadd_ps(f, m, a); }
328 SI F nmad(F f, F m, F a) { return _mm512_fnmadd_ps(f, m, a); }
329 SI F min(F a, F b) { return _mm512_min_ps(a,b); }
330 SI I32 min(I32 a, I32 b) { return (I32)_mm512_min_epi32((__m512i)a,(__m512i)b); }
331 SI U32 min(U32 a, U32 b) { return (U32)_mm512_min_epu32((__m512i)a,(__m512i)b); }
332 SI F max(F a, F b) { return _mm512_max_ps(a,b); }
333 SI I32 max(I32 a, I32 b) { return (I32)_mm512_max_epi32((__m512i)a,(__m512i)b); }
334 SI U32 max(U32 a, U32 b) { return (U32)_mm512_max_epu32((__m512i)a,(__m512i)b); }
335 SI F abs_ (F v) { return _mm512_and_ps(v, _mm512_sub_ps(_mm512_setzero(), v)); }
336 SI I32 abs_ (I32 v) { return (I32)_mm512_abs_epi32((__m512i)v); }
337 SI F floor_(F v) { return _mm512_floor_ps(v); }
338 SI F ceil_(F v) { return _mm512_ceil_ps(v); }
339 SI F rcp_approx(F v) { return _mm512_rcp14_ps (v); }
340 SI F rsqrt_approx (F v) { return _mm512_rsqrt14_ps(v); }
341 SI F sqrt_ (F v) { return _mm512_sqrt_ps (v); }
342 SI F rcp_precise (F v) {
343 F e = rcp_approx(v);
344 return _mm512_fnmadd_ps(v, e, _mm512_set1_ps(2.0f)) * e;
345 }
346 SI I32 iround(F v) { return (I32)_mm512_cvtps_epi32(v); }
347 SI U32 round(F v) { return (U32)_mm512_cvtps_epi32(v); }
348 SI U32 round(F v, F scale) { return (U32)_mm512_cvtps_epi32(v*scale); }
349 SI U16 pack(U32 v) {
350 __m256i rst = _mm256_packus_epi32(_mm512_castsi512_si256((__m512i)v),
351 _mm512_extracti64x4_epi64((__m512i)v, 1));
352 return (U16)_mm256_permutex_epi64(rst, 216);
353 }
354 SI U8 pack(U16 v) {
355 __m256i rst = _mm256_packus_epi16((__m256i)v, (__m256i)v);
356 return (U8)_mm256_castsi256_si128(_mm256_permute4x64_epi64(rst, 8));
357 }
358 SI F if_then_else(I32 c, F t, F e) {
359 __m512i mask = _mm512_set1_epi32(0x80000000);
360 __m512i aa = _mm512_and_si512((__m512i)c, mask);
361 return _mm512_mask_blend_ps(_mm512_test_epi32_mask(aa, aa),e,t);
362 }
363 SI I32 if_then_else(I32 c, I32 t, I32 e) {
364 __m512i mask = _mm512_set1_epi32(0x80000000);
365 __m512i aa = _mm512_and_si512((__m512i)c, mask);
366 return (I32)_mm512_mask_blend_epi32(_mm512_test_epi32_mask(aa, aa),(__m512i)e,(__m512i)t);
367 }
368 SI bool any(I32 c) {
369 __mmask16 mask32 = _mm512_test_epi32_mask((__m512i)c, (__m512i)c);
370 return mask32 != 0;
371 }
372 SI bool all(I32 c) {
373 __mmask16 mask32 = _mm512_test_epi32_mask((__m512i)c, (__m512i)c);
374 return mask32 == 0xffff;
375 }
376 template <typename T>
377 SI V<T> gather(const T* p, U32 ix) {
378 return V<T>{ p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
379 p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
380 p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
381 p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
382 }
383 SI F gather(const float* p, U32 ix) { return _mm512_i32gather_ps((__m512i)ix, p, 4); }
384 SI U32 gather(const uint32_t* p, U32 ix) {
385 return (U32)_mm512_i32gather_epi32((__m512i)ix, p, 4); }
386 SI U64 gather(const uint64_t* p, U32 ix) {
387 __m512i parts[] = {
388 _mm512_i32gather_epi64(_mm512_castsi512_si256((__m512i)ix), p, 8),
389 _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)ix, 1), p, 8),
390 };
391 return sk_bit_cast<U64>(parts);
392 }
393 template <typename V, typename S>
394 SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
395 V before = gather(dst, ix);
396 V after = if_then_else(mask, src, before);
397 dst[ix[0]] = after[0];
398 dst[ix[1]] = after[1];
399 dst[ix[2]] = after[2];
400 dst[ix[3]] = after[3];
401 dst[ix[4]] = after[4];
402 dst[ix[5]] = after[5];
403 dst[ix[6]] = after[6];
404 dst[ix[7]] = after[7];
405 dst[ix[8]] = after[8];
406 dst[ix[9]] = after[9];
407 dst[ix[10]] = after[10];
408 dst[ix[11]] = after[11];
409 dst[ix[12]] = after[12];
410 dst[ix[13]] = after[13];
411 dst[ix[14]] = after[14];
412 dst[ix[15]] = after[15];
413 }
414
415 SI void load2(const uint16_t* ptr, U16* r, U16* g) {
416 __m256i _01234567 = _mm256_loadu_si256(((const __m256i*)ptr) + 0);
417 __m256i _89abcdef = _mm256_loadu_si256(((const __m256i*)ptr) + 1);
418
419 *r = (U16)_mm256_permute4x64_epi64(_mm256_packs_epi32(_mm256_srai_epi32(_mm256_slli_epi32
420 (_01234567, 16), 16), _mm256_srai_epi32(_mm256_slli_epi32(_89abcdef, 16), 16)), 216);
421 *g = (U16)_mm256_permute4x64_epi64(_mm256_packs_epi32(_mm256_srai_epi32(_01234567, 16),
422 _mm256_srai_epi32(_89abcdef, 16)), 216);
423 }
424 SI void store2(uint16_t* ptr, U16 r, U16 g) {
425 __m256i _01234567 = _mm256_unpacklo_epi16((__m256i)r, (__m256i)g);
426 __m256i _89abcdef = _mm256_unpackhi_epi16((__m256i)r, (__m256i)g);
427 __m512i combinedVector = _mm512_inserti64x4(_mm512_castsi256_si512(_01234567),
428 _89abcdef, 1);
429 __m512i aa = _mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), combinedVector);
430 _01234567 = _mm512_castsi512_si256(aa);
431 _89abcdef = _mm512_extracti64x4_epi64(aa, 1);
432
433 _mm256_storeu_si256((__m256i*)ptr + 0, _01234567);
434 _mm256_storeu_si256((__m256i*)ptr + 1, _89abcdef);
435 }
436
437 SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
438 __m256i _0123 = _mm256_loadu_si256((const __m256i*)ptr),
439 _4567 = _mm256_loadu_si256(((const __m256i*)ptr) + 1),
440 _89ab = _mm256_loadu_si256(((const __m256i*)ptr) + 2),
441 _cdef = _mm256_loadu_si256(((const __m256i*)ptr) + 3);
442
443 auto a0 = _mm256_unpacklo_epi16(_0123, _4567),
444 a1 = _mm256_unpackhi_epi16(_0123, _4567),
445 b0 = _mm256_unpacklo_epi16(a0, a1),
446 b1 = _mm256_unpackhi_epi16(a0, a1),
447 a2 = _mm256_unpacklo_epi16(_89ab, _cdef),
448 a3 = _mm256_unpackhi_epi16(_89ab, _cdef),
449 b2 = _mm256_unpacklo_epi16(a2, a3),
450 b3 = _mm256_unpackhi_epi16(a2, a3),
451 rr = _mm256_unpacklo_epi64(b0, b2),
452 gg = _mm256_unpackhi_epi64(b0, b2),
453 bb = _mm256_unpacklo_epi64(b1, b3),
454 aa = _mm256_unpackhi_epi64(b1, b3);
455
456 *r = (U16)_mm256_permutexvar_epi32(_mm256_setr_epi32(0,4,1,5,2,6,3,7), rr);
457 *g = (U16)_mm256_permutexvar_epi32(_mm256_setr_epi32(0,4,1,5,2,6,3,7), gg);
458 *b = (U16)_mm256_permutexvar_epi32(_mm256_setr_epi32(0,4,1,5,2,6,3,7), bb);
459 *a = (U16)_mm256_permutexvar_epi32(_mm256_setr_epi32(0,4,1,5,2,6,3,7), aa);
460 }
461 SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
462 auto rg012389ab = _mm256_unpacklo_epi16((__m256i)r, (__m256i)g),
463 rg4567cdef = _mm256_unpackhi_epi16((__m256i)r, (__m256i)g),
464 ba012389ab = _mm256_unpacklo_epi16((__m256i)b, (__m256i)a),
465 ba4567cdef = _mm256_unpackhi_epi16((__m256i)b, (__m256i)a);
466
467 auto _0189 = _mm256_unpacklo_epi32(rg012389ab, ba012389ab),
468 _23ab = _mm256_unpackhi_epi32(rg012389ab, ba012389ab),
469 _45cd = _mm256_unpacklo_epi32(rg4567cdef, ba4567cdef),
470 _67ef = _mm256_unpackhi_epi32(rg4567cdef, ba4567cdef);
471
472 auto _ab23 = _mm256_permutex_epi64(_23ab, 78),
473 _0123 = _mm256_blend_epi32(_0189, _ab23, 0xf0),
474 _89ab = _mm256_permutex_epi64(_mm256_blend_epi32(_0189, _ab23, 0x0f), 78),
475 _ef67 = _mm256_permutex_epi64(_67ef, 78),
476 _4567 = _mm256_blend_epi32(_45cd, _ef67, 0xf0),
477 _cdef = _mm256_permutex_epi64(_mm256_blend_epi32(_45cd, _ef67, 0x0f), 78);
478
479 _mm256_storeu_si256((__m256i*)ptr, _0123);
480 _mm256_storeu_si256((__m256i*)ptr + 1, _4567);
481 _mm256_storeu_si256((__m256i*)ptr + 2, _89ab);
482 _mm256_storeu_si256((__m256i*)ptr + 3, _cdef);
483 }
484
485 SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
486 F _048c, _159d, _26ae, _37bf;
487
488 _048c = _mm512_castps128_ps512(_mm_loadu_ps(ptr) );
489 _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+16), 1);
490 _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+32), 2);
491 _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+48), 3);
492 _159d = _mm512_castps128_ps512(_mm_loadu_ps(ptr+4) );
493 _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+20), 1);
494 _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+36), 2);
495 _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+52), 3);
496 _26ae = _mm512_castps128_ps512(_mm_loadu_ps(ptr+8) );
497 _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+24), 1);
498 _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+40), 2);
499 _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+56), 3);
500 _37bf = _mm512_castps128_ps512(_mm_loadu_ps(ptr+12) );
501 _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+28), 1);
502 _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+44), 2);
503 _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+60), 3);
504
505 F rg02468acf = _mm512_unpacklo_ps(_048c, _26ae),
506 ba02468acf = _mm512_unpackhi_ps(_048c, _26ae),
507 rg13579bde = _mm512_unpacklo_ps(_159d, _37bf),
508 ba13579bde = _mm512_unpackhi_ps(_159d, _37bf);
509
510 *r = (F)_mm512_unpacklo_ps(rg02468acf, rg13579bde);
511 *g = (F)_mm512_unpackhi_ps(rg02468acf, rg13579bde);
512 *b = (F)_mm512_unpacklo_ps(ba02468acf, ba13579bde);
513 *a = (F)_mm512_unpackhi_ps(ba02468acf, ba13579bde);
514 }
515
516 SI void store4(float* ptr, F r, F g, F b, F a) {
517 F rg014589cd = _mm512_unpacklo_ps(r, g),
518 rg2367abef = _mm512_unpackhi_ps(r, g),
519 ba014589cd = _mm512_unpacklo_ps(b, a),
520 ba2367abef = _mm512_unpackhi_ps(b, a);
521
522 F _048c = (F)_mm512_unpacklo_pd((__m512d)rg014589cd, (__m512d)ba014589cd),
523 _26ae = (F)_mm512_unpacklo_pd((__m512d)rg2367abef, (__m512d)ba2367abef),
524 _159d = (F)_mm512_unpackhi_pd((__m512d)rg014589cd, (__m512d)ba014589cd),
525 _37bf = (F)_mm512_unpackhi_pd((__m512d)rg2367abef, (__m512d)ba2367abef);
526
527 F _ae26 = (F)_mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), (__m512d)_26ae),
528 _bf37 = (F)_mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), (__m512d)_37bf),
529 _8c04 = (F)_mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), (__m512d)_048c),
530 _9d15 = (F)_mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), (__m512d)_159d);
531
532 __m512i index = _mm512_setr_epi32(4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11);
533 F _0426 = (F)_mm512_permutex2var_pd((__m512d)_048c, _mm512_setr_epi64(0,1,2,3,12,13,14,15),
534 (__m512d)_ae26),
535 _1537 = (F)_mm512_permutex2var_pd((__m512d)_159d, _mm512_setr_epi64(0,1,2,3,12,13,14,15),
536 (__m512d)_bf37),
537 _5173 = _mm512_permutexvar_ps(index, _1537),
538 _0123 = (F)_mm512_permutex2var_pd((__m512d)_0426, _mm512_setr_epi64(0,1,10,11,4,5,14,15),
539 (__m512d)_5173);
540
541 F _5476 = (F)_mm512_permutex2var_pd((__m512d)_5173, _mm512_setr_epi64(0,1,10,11,4,5,14,15),
542 (__m512d)_0426),
543 _4567 = _mm512_permutexvar_ps(index, _5476),
544 _8cae = (F)_mm512_permutex2var_pd((__m512d)_8c04, _mm512_setr_epi64(0,1,2,3,12,13,14,15),
545 (__m512d)_26ae),
546 _9dbf = (F)_mm512_permutex2var_pd((__m512d)_9d15, _mm512_setr_epi64(0,1,2,3,12,13,14,15),
547 (__m512d)_37bf),
548 _d9fb = _mm512_permutexvar_ps(index, _9dbf),
549 _89ab = (F)_mm512_permutex2var_pd((__m512d)_8cae, _mm512_setr_epi64(0,1,10,11,4,5,14,15),
550 (__m512d)_d9fb),
551 _dcfe = (F)_mm512_permutex2var_pd((__m512d)_d9fb, _mm512_setr_epi64(0,1,10,11,4,5,14,15),
552 (__m512d)_8cae),
553 _cdef = _mm512_permutexvar_ps(index, _dcfe);
554
555 _mm512_storeu_ps(ptr+0, _0123);
556 _mm512_storeu_ps(ptr+16, _4567);
557 _mm512_storeu_ps(ptr+32, _89ab);
558 _mm512_storeu_ps(ptr+48, _cdef);
559 }
560
561 #elif defined(SKRP_CPU_HSW)
562 // These are __m256 and __m256i, but friendlier and strongly-typed.
563 template <typename T> using V = Vec<8, T>;
564 using F = V<float >;
565 using I32 = V< int32_t>;
566 using U64 = V<uint64_t>;
567 using U32 = V<uint32_t>;
568 using U16 = V<uint16_t>;
569 using U8 = V<uint8_t >;
570
571 SI F mad(F f, F m, F a) { return _mm256_fmadd_ps(f, m, a); }
572 SI F nmad(F f, F m, F a) { return _mm256_fnmadd_ps(f, m, a); }
573
574 SI F min(F a, F b) { return _mm256_min_ps(a,b); }
575 SI I32 min(I32 a, I32 b) { return (I32)_mm256_min_epi32((__m256i)a,(__m256i)b); }
576 SI U32 min(U32 a, U32 b) { return (U32)_mm256_min_epu32((__m256i)a,(__m256i)b); }
577 SI F max(F a, F b) { return _mm256_max_ps(a,b); }
578 SI I32 max(I32 a, I32 b) { return (I32)_mm256_max_epi32((__m256i)a,(__m256i)b); }
579 SI U32 max(U32 a, U32 b) { return (U32)_mm256_max_epu32((__m256i)a,(__m256i)b); }
580
581 SI F abs_ (F v) { return _mm256_and_ps(v, 0-v); }
582 SI I32 abs_ (I32 v) { return (I32)_mm256_abs_epi32((__m256i)v); }
583 SI F floor_(F v) { return _mm256_floor_ps(v); }
584 SI F ceil_(F v) { return _mm256_ceil_ps(v); }
585 SI F rcp_approx(F v) { return _mm256_rcp_ps (v); } // use rcp_fast instead
586 SI F rsqrt_approx(F v) { return _mm256_rsqrt_ps(v); }
587 SI F sqrt_ (F v) { return _mm256_sqrt_ps (v); }
588 SI F rcp_precise (F v) {
589 F e = rcp_approx(v);
590 return _mm256_fnmadd_ps(v, e, _mm256_set1_ps(2.0f)) * e;
591 }
592
593 SI I32 iround(F v) { return (I32)_mm256_cvtps_epi32(v); }
594 SI U32 round(F v) { return (U32)_mm256_cvtps_epi32(v); }
595 SI U32 round(F v, F scale) { return (U32)_mm256_cvtps_epi32(v*scale); }
596 SI U16 pack(U32 v) {
597 return (U16)_mm_packus_epi32(_mm256_extractf128_si256((__m256i)v, 0),
598 _mm256_extractf128_si256((__m256i)v, 1));
599 }
600 SI U8 pack(U16 v) {
601 auto r = _mm_packus_epi16((__m128i)v,(__m128i)v);
602 return sk_unaligned_load<U8>(&r);
603 }
604
605 SI F if_then_else(I32 c, F t, F e) { return _mm256_blendv_ps(e, t, (__m256)c); }
606 SI I32 if_then_else(I32 c, I32 t, I32 e) {
607 return (I32)_mm256_blendv_ps((__m256)e, (__m256)t, (__m256)c);
608 }
609
610 // NOTE: This version of 'all' only works with mask values (true == all bits set)
611 SI bool any(I32 c) { return !_mm256_testz_si256((__m256i)c, _mm256_set1_epi32(-1)); }
612 SI bool all(I32 c) { return _mm256_testc_si256((__m256i)c, _mm256_set1_epi32(-1)); }
613
614 template <typename T>
615 SI V<T> gather(const T* p, U32 ix) {
616 return V<T>{ p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
617 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
618 }
619 SI F gather(const float* p, U32 ix) { return _mm256_i32gather_ps(p, (__m256i)ix, 4); }
620 SI U32 gather(const uint32_t* p, U32 ix) {
621 return (U32)_mm256_i32gather_epi32((const int*)p, (__m256i)ix, 4);
622 }
623 SI U64 gather(const uint64_t* p, U32 ix) {
624 __m256i parts[] = {
625 _mm256_i32gather_epi64(
626 (const long long int*)p, _mm256_extracti128_si256((__m256i)ix, 0), 8),
627 _mm256_i32gather_epi64(
628 (const long long int*)p, _mm256_extracti128_si256((__m256i)ix, 1), 8),
629 };
630 return sk_bit_cast<U64>(parts);
631 }
632 SI void scatter_masked(I32 src, int* dst, U32 ix, I32 mask) {
633 I32 before = gather(dst, ix);
634 I32 after = if_then_else(mask, src, before);
635 dst[ix[0]] = after[0];
636 dst[ix[1]] = after[1];
637 dst[ix[2]] = after[2];
638 dst[ix[3]] = after[3];
639 dst[ix[4]] = after[4];
640 dst[ix[5]] = after[5];
641 dst[ix[6]] = after[6];
642 dst[ix[7]] = after[7];
643 }
644
645 SI void load2(const uint16_t* ptr, U16* r, U16* g) {
646 __m128i _0123 = _mm_loadu_si128(((const __m128i*)ptr) + 0),
647 _4567 = _mm_loadu_si128(((const __m128i*)ptr) + 1);
648 *r = (U16)_mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(_0123, 16), 16),
649 _mm_srai_epi32(_mm_slli_epi32(_4567, 16), 16));
650 *g = (U16)_mm_packs_epi32(_mm_srai_epi32(_0123, 16),
651 _mm_srai_epi32(_4567, 16));
652 }
653 SI void store2(uint16_t* ptr, U16 r, U16 g) {
654 auto _0123 = _mm_unpacklo_epi16((__m128i)r, (__m128i)g),
655 _4567 = _mm_unpackhi_epi16((__m128i)r, (__m128i)g);
656 _mm_storeu_si128((__m128i*)ptr + 0, _0123);
657 _mm_storeu_si128((__m128i*)ptr + 1, _4567);
658 }
659
660 SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
661 __m128i _01 = _mm_loadu_si128(((const __m128i*)ptr) + 0),
662 _23 = _mm_loadu_si128(((const __m128i*)ptr) + 1),
663 _45 = _mm_loadu_si128(((const __m128i*)ptr) + 2),
664 _67 = _mm_loadu_si128(((const __m128i*)ptr) + 3);
665
666 auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
667 _13 = _mm_unpackhi_epi16(_01, _23), // r1 r3 g1 g3 b1 b3 a1 a3
668 _46 = _mm_unpacklo_epi16(_45, _67),
669 _57 = _mm_unpackhi_epi16(_45, _67);
670
671 auto rg0123 = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
672 ba0123 = _mm_unpackhi_epi16(_02, _13), // b0 b1 b2 b3 a0 a1 a2 a3
673 rg4567 = _mm_unpacklo_epi16(_46, _57),
674 ba4567 = _mm_unpackhi_epi16(_46, _57);
675
676 *r = (U16)_mm_unpacklo_epi64(rg0123, rg4567);
677 *g = (U16)_mm_unpackhi_epi64(rg0123, rg4567);
678 *b = (U16)_mm_unpacklo_epi64(ba0123, ba4567);
679 *a = (U16)_mm_unpackhi_epi64(ba0123, ba4567);
680 }
681 SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
682 auto rg0123 = _mm_unpacklo_epi16((__m128i)r, (__m128i)g), // r0 g0 r1 g1 r2 g2 r3 g3
683 rg4567 = _mm_unpackhi_epi16((__m128i)r, (__m128i)g), // r4 g4 r5 g5 r6 g6 r7 g7
684 ba0123 = _mm_unpacklo_epi16((__m128i)b, (__m128i)a),
685 ba4567 = _mm_unpackhi_epi16((__m128i)b, (__m128i)a);
686
687 auto _01 = _mm_unpacklo_epi32(rg0123, ba0123),
688 _23 = _mm_unpackhi_epi32(rg0123, ba0123),
689 _45 = _mm_unpacklo_epi32(rg4567, ba4567),
690 _67 = _mm_unpackhi_epi32(rg4567, ba4567);
691
692 _mm_storeu_si128((__m128i*)ptr + 0, _01);
693 _mm_storeu_si128((__m128i*)ptr + 1, _23);
694 _mm_storeu_si128((__m128i*)ptr + 2, _45);
695 _mm_storeu_si128((__m128i*)ptr + 3, _67);
696 }
697
698 SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
699 F _04 = _mm256_castps128_ps256(_mm_loadu_ps(ptr+ 0)),
700 _15 = _mm256_castps128_ps256(_mm_loadu_ps(ptr+ 4)),
701 _26 = _mm256_castps128_ps256(_mm_loadu_ps(ptr+ 8)),
702 _37 = _mm256_castps128_ps256(_mm_loadu_ps(ptr+12));
703 _04 = _mm256_insertf128_ps(_04, _mm_loadu_ps(ptr+16), 1);
704 _15 = _mm256_insertf128_ps(_15, _mm_loadu_ps(ptr+20), 1);
705 _26 = _mm256_insertf128_ps(_26, _mm_loadu_ps(ptr+24), 1);
706 _37 = _mm256_insertf128_ps(_37, _mm_loadu_ps(ptr+28), 1);
707
708 F rg0145 = _mm256_unpacklo_ps(_04,_15), // r0 r1 g0 g1 | r4 r5 g4 g5
709 ba0145 = _mm256_unpackhi_ps(_04,_15),
710 rg2367 = _mm256_unpacklo_ps(_26,_37),
711 ba2367 = _mm256_unpackhi_ps(_26,_37);
712
713 *r = (F)_mm256_unpacklo_pd((__m256d)rg0145, (__m256d)rg2367);
714 *g = (F)_mm256_unpackhi_pd((__m256d)rg0145, (__m256d)rg2367);
715 *b = (F)_mm256_unpacklo_pd((__m256d)ba0145, (__m256d)ba2367);
716 *a = (F)_mm256_unpackhi_pd((__m256d)ba0145, (__m256d)ba2367);
717 }
718 SI void store4(float* ptr, F r, F g, F b, F a) {
719 F rg0145 = _mm256_unpacklo_ps(r, g), // r0 g0 r1 g1 | r4 g4 r5 g5
720 rg2367 = _mm256_unpackhi_ps(r, g), // r2 ... | r6 ...
721 ba0145 = _mm256_unpacklo_ps(b, a), // b0 a0 b1 a1 | b4 a4 b5 a5
722 ba2367 = _mm256_unpackhi_ps(b, a); // b2 ... | b6 ...
723
724 F _04 = (F)_mm256_unpacklo_pd((__m256d)rg0145, (__m256d)ba0145),// r0 g0 b0 a0 | r4 g4 b4 a4
725 _15 = (F)_mm256_unpackhi_pd((__m256d)rg0145, (__m256d)ba0145),// r1 ... | r5 ...
726 _26 = (F)_mm256_unpacklo_pd((__m256d)rg2367, (__m256d)ba2367),// r2 ... | r6 ...
727 _37 = (F)_mm256_unpackhi_pd((__m256d)rg2367, (__m256d)ba2367);// r3 ... | r7 ...
728
729 F _01 = _mm256_permute2f128_ps(_04, _15, 32), // 32 == 0010 0000 == lo, lo
730 _23 = _mm256_permute2f128_ps(_26, _37, 32),
731 _45 = _mm256_permute2f128_ps(_04, _15, 49), // 49 == 0011 0001 == hi, hi
732 _67 = _mm256_permute2f128_ps(_26, _37, 49);
733 _mm256_storeu_ps(ptr+ 0, _01);
734 _mm256_storeu_ps(ptr+ 8, _23);
735 _mm256_storeu_ps(ptr+16, _45);
736 _mm256_storeu_ps(ptr+24, _67);
737 }
738
739 #elif defined(SKRP_CPU_SSE2) || defined(SKRP_CPU_SSE41) || defined(SKRP_CPU_AVX)
740 template <typename T> using V = Vec<4, T>;
741 using F = V<float >;
742 using I32 = V< int32_t>;
743 using U64 = V<uint64_t>;
744 using U32 = V<uint32_t>;
745 using U16 = V<uint16_t>;
746 using U8 = V<uint8_t >;
747
748 SI F if_then_else(I32 c, F t, F e) {
749 return _mm_or_ps(_mm_and_ps((__m128)c, t), _mm_andnot_ps((__m128)c, e));
750 }
751 SI I32 if_then_else(I32 c, I32 t, I32 e) {
752 return (I32)_mm_or_ps(_mm_and_ps((__m128)c, (__m128)t),
753 _mm_andnot_ps((__m128)c, (__m128)e));
754 }
755
756 SI F min(F a, F b) { return _mm_min_ps(a,b); }
757 SI F max(F a, F b) { return _mm_max_ps(a,b); }
758 #if defined(SKRP_CPU_SSE41) || defined(SKRP_CPU_AVX)
759 SI I32 min(I32 a, I32 b) { return (I32)_mm_min_epi32((__m128i)a,(__m128i)b); }
760 SI U32 min(U32 a, U32 b) { return (U32)_mm_min_epu32((__m128i)a,(__m128i)b); }
761 SI I32 max(I32 a, I32 b) { return (I32)_mm_max_epi32((__m128i)a,(__m128i)b); }
762 SI U32 max(U32 a, U32 b) { return (U32)_mm_max_epu32((__m128i)a,(__m128i)b); }
763 #else
764 SI I32 min(I32 a, I32 b) { return if_then_else(a < b, a, b); }
765 SI I32 max(I32 a, I32 b) { return if_then_else(a > b, a, b); }
766 SI U32 min(U32 a, U32 b) {
767 return sk_bit_cast<U32>(if_then_else(a < b, sk_bit_cast<I32>(a), sk_bit_cast<I32>(b)));
768 }
769 SI U32 max(U32 a, U32 b) {
770 return sk_bit_cast<U32>(if_then_else(a > b, sk_bit_cast<I32>(a), sk_bit_cast<I32>(b)));
771 }
772 #endif
773
774 SI F mad(F f, F m, F a) { return a+f*m; }
775 SI F nmad(F f, F m, F a) { return a-f*m; }
776 SI F abs_(F v) { return _mm_and_ps(v, 0-v); }
777 #if defined(SKRP_CPU_SSE41) || defined(SKRP_CPU_AVX)
778 SI I32 abs_(I32 v) { return (I32)_mm_abs_epi32((__m128i)v); }
779 #else
780 SI I32 abs_(I32 v) { return max(v, -v); }
781 #endif
782 SI F rcp_approx(F v) { return _mm_rcp_ps (v); } // use rcp_fast instead
783 SI F rcp_precise (F v) { F e = rcp_approx(v); return e * (2.0f - v * e); }
784 SI F rsqrt_approx(F v) { return _mm_rsqrt_ps(v); }
785 SI F sqrt_(F v) { return _mm_sqrt_ps (v); }
786
787 SI I32 iround(F v) { return (I32)_mm_cvtps_epi32(v); }
788 SI U32 round(F v) { return (U32)_mm_cvtps_epi32(v); }
789 SI U32 round(F v, F scale) { return (U32)_mm_cvtps_epi32(v*scale); }
790
791 SI U16 pack(U32 v) {
792 #if defined(SKRP_CPU_SSE41) || defined(SKRP_CPU_AVX)
793 auto p = _mm_packus_epi32((__m128i)v,(__m128i)v);
794 #else
795 // Sign extend so that _mm_packs_epi32() does the pack we want.
796 auto p = _mm_srai_epi32(_mm_slli_epi32((__m128i)v, 16), 16);
797 p = _mm_packs_epi32(p,p);
798 #endif
799 return sk_unaligned_load<U16>(&p); // We have two copies. Return (the lower) one.
800 }
801 SI U8 pack(U16 v) {
802 auto r = widen_cast<__m128i>(v);
803 r = _mm_packus_epi16(r,r);
804 return sk_unaligned_load<U8>(&r);
805 }
806
807 // NOTE: This only checks the top bit of each lane, and is incorrect with non-mask values.
808 SI bool any(I32 c) { return _mm_movemask_ps(sk_bit_cast<F>(c)) != 0b0000; }
809 SI bool all(I32 c) { return _mm_movemask_ps(sk_bit_cast<F>(c)) == 0b1111; }
810
811 SI F floor_(F v) {
812 #if defined(SKRP_CPU_SSE41) || defined(SKRP_CPU_AVX)
813 return _mm_floor_ps(v);
814 #else
815 F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
816 return roundtrip - if_then_else(roundtrip > v, F() + 1, F() + 0);
817 #endif
818 }
819
820 SI F ceil_(F v) {
821 #if defined(SKRP_CPU_SSE41) || defined(SKRP_CPU_AVX)
822 return _mm_ceil_ps(v);
823 #else
824 F roundtrip = _mm_cvtepi32_ps(_mm_cvttps_epi32(v));
825 return roundtrip + if_then_else(roundtrip < v, F() + 1, F() + 0);
826 #endif
827 }
828
829 template <typename T>
830 SI V<T> gather(const T* p, U32 ix) {
831 return V<T>{p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
832 }
833 SI void scatter_masked(I32 src, int* dst, U32 ix, I32 mask) {
834 I32 before = gather(dst, ix);
835 I32 after = if_then_else(mask, src, before);
836 dst[ix[0]] = after[0];
837 dst[ix[1]] = after[1];
838 dst[ix[2]] = after[2];
839 dst[ix[3]] = after[3];
840 }
841 SI void load2(const uint16_t* ptr, U16* r, U16* g) {
842 __m128i _01 = _mm_loadu_si128(((const __m128i*)ptr) + 0); // r0 g0 r1 g1 r2 g2 r3 g3
843 auto rg01_23 = _mm_shufflelo_epi16(_01, 0xD8); // r0 r1 g0 g1 r2 g2 r3 g3
844 auto rg = _mm_shufflehi_epi16(rg01_23, 0xD8); // r0 r1 g0 g1 r2 r3 g2 g3
845
846 auto R = _mm_shuffle_epi32(rg, 0x88); // r0 r1 r2 r3 r0 r1 r2 r3
847 auto G = _mm_shuffle_epi32(rg, 0xDD); // g0 g1 g2 g3 g0 g1 g2 g3
848 *r = sk_unaligned_load<U16>(&R);
849 *g = sk_unaligned_load<U16>(&G);
850 }
851 SI void store2(uint16_t* ptr, U16 r, U16 g) {
852 __m128i rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g));
853 _mm_storeu_si128((__m128i*)ptr + 0, rg);
854 }
855
856 SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
857 __m128i _01 = _mm_loadu_si128(((const __m128i*)ptr) + 0), // r0 g0 b0 a0 r1 g1 b1 a1
858 _23 = _mm_loadu_si128(((const __m128i*)ptr) + 1); // r2 g2 b2 a2 r3 g3 b3 a3
859
860 auto _02 = _mm_unpacklo_epi16(_01, _23), // r0 r2 g0 g2 b0 b2 a0 a2
861 _13 = _mm_unpackhi_epi16(_01, _23); // r1 r3 g1 g3 b1 b3 a1 a3
862
863 auto rg = _mm_unpacklo_epi16(_02, _13), // r0 r1 r2 r3 g0 g1 g2 g3
864 ba = _mm_unpackhi_epi16(_02, _13); // b0 b1 b2 b3 a0 a1 a2 a3
865
866 *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0);
867 *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4);
868 *b = sk_unaligned_load<U16>((uint16_t*)&ba + 0);
869 *a = sk_unaligned_load<U16>((uint16_t*)&ba + 4);
870 }
871
872 SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
873 auto rg = _mm_unpacklo_epi16(widen_cast<__m128i>(r), widen_cast<__m128i>(g)),
874 ba = _mm_unpacklo_epi16(widen_cast<__m128i>(b), widen_cast<__m128i>(a));
875
876 _mm_storeu_si128((__m128i*)ptr + 0, _mm_unpacklo_epi32(rg, ba));
877 _mm_storeu_si128((__m128i*)ptr + 1, _mm_unpackhi_epi32(rg, ba));
878 }
879
880 SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
881 F _0 = _mm_loadu_ps(ptr + 0),
882 _1 = _mm_loadu_ps(ptr + 4),
883 _2 = _mm_loadu_ps(ptr + 8),
884 _3 = _mm_loadu_ps(ptr +12);
885 _MM_TRANSPOSE4_PS(_0,_1,_2,_3);
886 *r = _0;
887 *g = _1;
888 *b = _2;
889 *a = _3;
890 }
891
892 SI void store4(float* ptr, F r, F g, F b, F a) {
893 _MM_TRANSPOSE4_PS(r,g,b,a);
894 _mm_storeu_ps(ptr + 0, r);
895 _mm_storeu_ps(ptr + 4, g);
896 _mm_storeu_ps(ptr + 8, b);
897 _mm_storeu_ps(ptr +12, a);
898 }
899
900 #elif defined(SKRP_CPU_LASX)
901 // These are __m256 and __m256i, but friendlier and strongly-typed.
902 template <typename T> using V = Vec<8, T>;
903 using F = V<float >;
904 using I32 = V<int32_t>;
905 using U64 = V<uint64_t>;
906 using U32 = V<uint32_t>;
907 using U16 = V<uint16_t>;
908 using U8 = V<uint8_t >;
909
910 SI __m128i emulate_lasx_d_xr2vr_l(__m256i a) {
911 v4i64 tmp = a;
912 v2i64 al = {tmp[0], tmp[1]};
913 return (__m128i)al;
914 }
915
916 SI __m128i emulate_lasx_d_xr2vr_h(__m256i a) {
917 v4i64 tmp = a;
918 v2i64 ah = {tmp[2], tmp[3]};
919 return (__m128i)ah;
920 }
921
922 SI F if_then_else(I32 c, F t, F e) {
923 return sk_bit_cast<Vec<8,float>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(e),
924 sk_bit_cast<__m256i>(t),
925 sk_bit_cast<__m256i>(c)));
926 }
927
928 SI I32 if_then_else(I32 c, I32 t, I32 e) {
929 return sk_bit_cast<Vec<8,int32_t>>(__lasx_xvbitsel_v(sk_bit_cast<__m256i>(e),
930 sk_bit_cast<__m256i>(t),
931 sk_bit_cast<__m256i>(c)));
932 }
933
934 SI F min(F a, F b) { return __lasx_xvfmin_s(a,b); }
935 SI F max(F a, F b) { return __lasx_xvfmax_s(a,b); }
936 SI I32 min(I32 a, I32 b) { return __lasx_xvmin_w(a,b); }
937 SI U32 min(U32 a, U32 b) { return __lasx_xvmin_wu(a,b); }
938 SI I32 max(I32 a, I32 b) { return __lasx_xvmax_w(a,b); }
939 SI U32 max(U32 a, U32 b) { return __lasx_xvmax_wu(a,b); }
940
941 SI F mad(F f, F m, F a) { return __lasx_xvfmadd_s(f, m, a); }
942 SI F nmad(F f, F m, F a) { return __lasx_xvfmadd_s(-f, m, a); }
943 SI F abs_ (F v) { return (F)__lasx_xvand_v((I32)v, (I32)(0-v)); }
944 SI I32 abs_(I32 v) { return max(v, -v); }
945 SI F rcp_approx(F v) { return __lasx_xvfrecip_s(v); }
946 SI F rcp_precise (F v) { F e = rcp_approx(v); return e * nmad(v, e, F() + 2.0f); }
947 SI F rsqrt_approx (F v) { return __lasx_xvfrsqrt_s(v); }
948 SI F sqrt_(F v) { return __lasx_xvfsqrt_s(v); }
949
950 SI U32 iround(F v) {
951 F t = F() + 0.5f;
952 return __lasx_xvftintrz_w_s(v + t);
953 }
954
955 SI U32 round(F v) {
956 F t = F() + 0.5f;
957 return __lasx_xvftintrz_w_s(v + t);
958 }
959
960 SI U32 round(F v, F scale) {
961 F t = F() + 0.5f;
962 return __lasx_xvftintrz_w_s(mad(v, scale, t));
963 }
964
965 SI U16 pack(U32 v) {
966 return __lsx_vpickev_h(__lsx_vsat_wu(emulate_lasx_d_xr2vr_h(v), 15),
967 __lsx_vsat_wu(emulate_lasx_d_xr2vr_l(v), 15));
968 }
969
970 SI U8 pack(U16 v) {
971 __m128i tmp = __lsx_vsat_hu(v, 7);
972 auto r = __lsx_vpickev_b(tmp, tmp);
973 return sk_unaligned_load<U8>(&r);
974 }
975
976 SI bool any(I32 c){
977 v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0), c));
978 return (retv[0] | retv[4]) != 0b0000;
979 }
980
981 SI bool all(I32 c){
982 v8i32 retv = (v8i32)__lasx_xvmskltz_w(__lasx_xvslt_wu(__lasx_xvldi(0), c));
983 return (retv[0] & retv[4]) == 0b1111;
984 }
985
986 SI F floor_(F v) {
987 return __lasx_xvfrintrm_s(v);
988 }
989
990 SI F ceil_(F v) {
991 return __lasx_xvfrintrp_s(v);
992 }
993
994 template <typename T>
995 SI V<T> gather(const T* p, U32 ix) {
996 return V<T>{ p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
997 p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]], };
998 }
999
1000 template <typename V, typename S>
1001 SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
1002 V before = gather(dst, ix);
1003 V after = if_then_else(mask, src, before);
1004 dst[ix[0]] = after[0];
1005 dst[ix[1]] = after[1];
1006 dst[ix[2]] = after[2];
1007 dst[ix[3]] = after[3];
1008 dst[ix[4]] = after[4];
1009 dst[ix[5]] = after[5];
1010 dst[ix[6]] = after[6];
1011 dst[ix[7]] = after[7];
1012 }
1013
1014 SI void load2(const uint16_t* ptr, U16* r, U16* g) {
1015 U16 _0123 = __lsx_vld(ptr, 0),
1016 _4567 = __lsx_vld(ptr, 16);
1017 *r = __lsx_vpickev_h(__lsx_vsat_w(__lsx_vsrai_w(__lsx_vslli_w(_4567, 16), 16), 15),
1018 __lsx_vsat_w(__lsx_vsrai_w(__lsx_vslli_w(_0123, 16), 16), 15));
1019 *g = __lsx_vpickev_h(__lsx_vsat_w(__lsx_vsrai_w(_4567, 16), 15),
1020 __lsx_vsat_w(__lsx_vsrai_w(_0123, 16), 15));
1021 }
1022 SI void store2(uint16_t* ptr, U16 r, U16 g) {
1023 auto _0123 = __lsx_vilvl_h(g, r),
1024 _4567 = __lsx_vilvh_h(g, r);
1025 __lsx_vst(_0123, ptr, 0);
1026 __lsx_vst(_4567, ptr, 16);
1027 }
1028
1029 SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
1030 __m128i _01 = __lsx_vld(ptr, 0),
1031 _23 = __lsx_vld(ptr, 16),
1032 _45 = __lsx_vld(ptr, 32),
1033 _67 = __lsx_vld(ptr, 48);
1034
1035 auto _02 = __lsx_vilvl_h(_23, _01), // r0 r2 g0 g2 b0 b2 a0 a2
1036 _13 = __lsx_vilvh_h(_23, _01), // r1 r3 g1 g3 b1 b3 a1 a3
1037 _46 = __lsx_vilvl_h(_67, _45),
1038 _57 = __lsx_vilvh_h(_67, _45);
1039
1040 auto rg0123 = __lsx_vilvl_h(_13, _02), // r0 r1 r2 r3 g0 g1 g2 g3
1041 ba0123 = __lsx_vilvh_h(_13, _02), // b0 b1 b2 b3 a0 a1 a2 a3
1042 rg4567 = __lsx_vilvl_h(_57, _46),
1043 ba4567 = __lsx_vilvh_h(_57, _46);
1044
1045 *r = __lsx_vilvl_d(rg4567, rg0123);
1046 *g = __lsx_vilvh_d(rg4567, rg0123);
1047 *b = __lsx_vilvl_d(ba4567, ba0123);
1048 *a = __lsx_vilvh_d(ba4567, ba0123);
1049 }
1050
1051 SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
1052 auto rg0123 = __lsx_vilvl_h(g, r), // r0 g0 r1 g1 r2 g2 r3 g3
1053 rg4567 = __lsx_vilvh_h(g, r), // r4 g4 r5 g5 r6 g6 r7 g7
1054 ba0123 = __lsx_vilvl_h(a, b),
1055 ba4567 = __lsx_vilvh_h(a, b);
1056
1057 auto _01 =__lsx_vilvl_w(ba0123, rg0123),
1058 _23 =__lsx_vilvh_w(ba0123, rg0123),
1059 _45 =__lsx_vilvl_w(ba4567, rg4567),
1060 _67 =__lsx_vilvh_w(ba4567, rg4567);
1061
1062 __lsx_vst(_01, ptr, 0);
1063 __lsx_vst(_23, ptr, 16);
1064 __lsx_vst(_45, ptr, 32);
1065 __lsx_vst(_67, ptr, 48);
1066 }
1067
1068 SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
1069 F _04 = (F)__lasx_xvpermi_q(__lasx_xvld(ptr, 0), __lasx_xvld(ptr, 64), 0x02);
1070 F _15 = (F)__lasx_xvpermi_q(__lasx_xvld(ptr, 16), __lasx_xvld(ptr, 80), 0x02);
1071 F _26 = (F)__lasx_xvpermi_q(__lasx_xvld(ptr, 32), __lasx_xvld(ptr, 96), 0x02);
1072 F _37 = (F)__lasx_xvpermi_q(__lasx_xvld(ptr, 48), __lasx_xvld(ptr, 112), 0x02);
1073
1074 F rg0145 = (F)__lasx_xvilvl_w((__m256i)_15, (__m256i)_04), // r0 r1 g0 g1 | r4 r5 g4 g5
1075 ba0145 = (F)__lasx_xvilvh_w((__m256i)_15, (__m256i)_04),
1076 rg2367 = (F)__lasx_xvilvl_w((__m256i)_37, (__m256i)_26),
1077 ba2367 = (F)__lasx_xvilvh_w((__m256i)_37, (__m256i)_26);
1078
1079 *r = (F)__lasx_xvilvl_d((__m256i)rg2367, (__m256i)rg0145);
1080 *g = (F)__lasx_xvilvh_d((__m256i)rg2367, (__m256i)rg0145);
1081 *b = (F)__lasx_xvilvl_d((__m256i)ba2367, (__m256i)ba0145);
1082 *a = (F)__lasx_xvilvh_d((__m256i)ba2367, (__m256i)ba0145);
1083 }
1084 SI void store4(float* ptr, F r, F g, F b, F a) {
1085 F rg0145 = (F)__lasx_xvilvl_w((__m256i)g, (__m256i)r), // r0 g0 r1 g1 | r4 g4 r5 g5
1086 rg2367 = (F)__lasx_xvilvh_w((__m256i)g, (__m256i)r), // r2 ... | r6 ...
1087 ba0145 = (F)__lasx_xvilvl_w((__m256i)a, (__m256i)b), // b0 a0 b1 a1 | b4 a4 b5 a5
1088 ba2367 = (F)__lasx_xvilvh_w((__m256i)a, (__m256i)b); // b2 ... | b6 ...
1089
1090 F _04 = (F)__lasx_xvilvl_d((__m256i)ba0145, (__m256i)rg0145), // r0 g0 b0 a0 | r4 g4 b4 a4
1091 _15 = (F)__lasx_xvilvh_d((__m256i)ba0145, (__m256i)rg0145), // r1 ... | r5 ...
1092 _26 = (F)__lasx_xvilvl_d((__m256i)ba2367, (__m256i)rg2367), // r2 ... | r6 ...
1093 _37 = (F)__lasx_xvilvh_d((__m256i)ba2367, (__m256i)rg2367); // r3 ... | r7 ...
1094
1095 F _01 = (F)__lasx_xvpermi_q((__m256i)_04, (__m256i)_15, 0x02),
1096 _23 = (F)__lasx_xvpermi_q((__m256i)_26, (__m256i)_37, 0x02),
1097 _45 = (F)__lasx_xvpermi_q((__m256i)_04, (__m256i)_15, 0x13),
1098 _67 = (F)__lasx_xvpermi_q((__m256i)_26, (__m256i)_37, 0x13);
1099 __lasx_xvst(_01, ptr, 0);
1100 __lasx_xvst(_23, ptr, 32);
1101 __lasx_xvst(_45, ptr, 64);
1102 __lasx_xvst(_67, ptr, 96);
1103 }
1104
1105 #elif defined(SKRP_CPU_LSX)
1106 template <typename T> using V = Vec<4, T>;
1107 using F = V<float >;
1108 using I32 = V<int32_t >;
1109 using U64 = V<uint64_t>;
1110 using U32 = V<uint32_t>;
1111 using U16 = V<uint16_t>;
1112 using U8 = V<uint8_t >;
1113
1114 #define _LSX_TRANSPOSE4_S(row0, row1, row2, row3) \
1115 do { \
1116 __m128 __t0 = (__m128)__lsx_vilvl_w ((__m128i)row1, (__m128i)row0); \
1117 __m128 __t1 = (__m128)__lsx_vilvl_w ((__m128i)row3, (__m128i)row2); \
1118 __m128 __t2 = (__m128)__lsx_vilvh_w ((__m128i)row1, (__m128i)row0); \
1119 __m128 __t3 = (__m128)__lsx_vilvh_w ((__m128i)row3, (__m128i)row2); \
1120 (row0) = (__m128)__lsx_vilvl_d ((__m128i)__t1, (__m128i)__t0); \
1121 (row1) = (__m128)__lsx_vilvh_d ((__m128i)__t1, (__m128i)__t0); \
1122 (row2) = (__m128)__lsx_vilvl_d ((__m128i)__t3, (__m128i)__t2); \
1123 (row3) = (__m128)__lsx_vilvh_d ((__m128i)__t3, (__m128i)__t2); \
1124 } while (0)
1125
1126 SI F if_then_else(I32 c, F t, F e) {
1127 return sk_bit_cast<Vec<4,float>>(__lsx_vbitsel_v(sk_bit_cast<__m128i>(e),
1128 sk_bit_cast<__m128i>(t),
1129 sk_bit_cast<__m128i>(c)));
1130 }
1131
1132 SI I32 if_then_else(I32 c, I32 t, I32 e) {
1133 return sk_bit_cast<Vec<4,int32_t>>(__lsx_vbitsel_v(sk_bit_cast<__m128i>(e),
1134 sk_bit_cast<__m128i>(t),
1135 sk_bit_cast<__m128i>(c)));
1136 }
1137
1138 SI F min(F a, F b) { return __lsx_vfmin_s(a,b); }
1139 SI F max(F a, F b) { return __lsx_vfmax_s(a,b); }
1140 SI I32 min(I32 a, I32 b) { return __lsx_vmin_w(a,b); }
1141 SI U32 min(U32 a, U32 b) { return __lsx_vmin_wu(a,b); }
1142 SI I32 max(I32 a, I32 b) { return __lsx_vmax_w(a,b); }
1143 SI U32 max(U32 a, U32 b) { return __lsx_vmax_wu(a,b); }
1144
1145 SI F mad(F f, F m, F a) { return __lsx_vfmadd_s(f, m, a); }
1146 SI F nmad(F f, F m, F a) { return __lsx_vfmadd_s(-f, m, a); }
1147 SI F abs_(F v) { return (F)__lsx_vand_v((I32)v, (I32)(0-v)); }
1148 SI I32 abs_(I32 v) { return max(v, -v); }
1149 SI F rcp_approx (F v) { return __lsx_vfrecip_s(v); }
1150 SI F rcp_precise (F v) { F e = rcp_approx(v); return e * nmad(v, e, F() + 2.0f); }
1151 SI F rsqrt_approx (F v) { return __lsx_vfrsqrt_s(v); }
1152 SI F sqrt_(F v) { return __lsx_vfsqrt_s (v); }
1153
1154 SI U32 iround(F v) {
1155 F t = F() + 0.5f;
1156 return __lsx_vftintrz_w_s(v + t); }
1157
1158 SI U32 round(F v) {
1159 F t = F() + 0.5f;
1160 return __lsx_vftintrz_w_s(v + t); }
1161
1162 SI U32 round(F v, F scale) {
1163 F t = F() + 0.5f;
1164 return __lsx_vftintrz_w_s(mad(v, scale, t)); }
1165
1166 SI U16 pack(U32 v) {
1167 __m128i tmp = __lsx_vsat_wu(v, 15);
1168 auto p = __lsx_vpickev_h(tmp, tmp);
1169 return sk_unaligned_load<U16>(&p); // We have two copies. Return (the lower) one.
1170 }
1171
1172 SI U8 pack(U16 v) {
1173 auto r = widen_cast<__m128i>(v);
1174 __m128i tmp = __lsx_vsat_hu(r, 7);
1175 r = __lsx_vpickev_b(tmp, tmp);
1176 return sk_unaligned_load<U8>(&r);
1177 }
1178
1179 SI bool any(I32 c){
1180 v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0), c));
1181 return retv[0] != 0b0000;
1182 }
1183
1184 SI bool all(I32 c){
1185 v4i32 retv = (v4i32)__lsx_vmskltz_w(__lsx_vslt_wu(__lsx_vldi(0), c));
1186 return retv[0] == 0b1111;
1187 }
1188
1189 SI F floor_(F v) {
1190 return __lsx_vfrintrm_s(v);
1191 }
1192
1193 SI F ceil_(F v) {
1194 return __lsx_vfrintrp_s(v);
1195 }
1196
1197 template <typename T>
1198 SI V<T> gather(const T* p, U32 ix) {
1199 return V<T>{p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]]};
1200 }
1201 // Using 'int*' prevents data from passing through floating-point registers.
1202 SI F gather(const int* p, int ix0, int ix1, int ix2, int ix3) {
1203 F ret = {0.0};
1204 ret = (F)__lsx_vinsgr2vr_w(ret, p[ix0], 0);
1205 ret = (F)__lsx_vinsgr2vr_w(ret, p[ix1], 1);
1206 ret = (F)__lsx_vinsgr2vr_w(ret, p[ix2], 2);
1207 ret = (F)__lsx_vinsgr2vr_w(ret, p[ix3], 3);
1208 return ret;
1209 }
1210
1211 template <typename V, typename S>
1212 SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
1213 V before = gather(dst, ix);
1214 V after = if_then_else(mask, src, before);
1215 dst[ix[0]] = after[0];
1216 dst[ix[1]] = after[1];
1217 dst[ix[2]] = after[2];
1218 dst[ix[3]] = after[3];
1219 }
1220
1221 SI void load2(const uint16_t* ptr, U16* r, U16* g) {
1222 __m128i _01 = __lsx_vld(ptr, 0); // r0 g0 r1 g1 r2 g2 r3 g3
1223 auto rg = __lsx_vshuf4i_h(_01, 0xD8); // r0 r1 g0 g1 r2 r3 g2 g3
1224
1225 auto R = __lsx_vshuf4i_w(rg, 0x88); // r0 r1 r2 r3 r0 r1 r2 r3
1226 auto G = __lsx_vshuf4i_w(rg, 0xDD); // g0 g1 g2 g3 g0 g1 g2 g3
1227 *r = sk_unaligned_load<U16>(&R);
1228 *g = sk_unaligned_load<U16>(&G);
1229 }
1230
1231 SI void store2(uint16_t* ptr, U16 r, U16 g) {
1232 U32 rg = __lsx_vilvl_h(widen_cast<__m128i>(g), widen_cast<__m128i>(r));
1233 __lsx_vst(rg, ptr, 0);
1234 }
1235
1236 SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
1237 __m128i _01 = __lsx_vld(ptr, 0), // r0 g0 b0 a0 r1 g1 b1 a1
1238 _23 = __lsx_vld(ptr, 16); // r2 g2 b2 a2 r3 g3 b3 a3
1239
1240 auto _02 = __lsx_vilvl_h(_23, _01), // r0 r2 g0 g2 b0 b2 a0 a2
1241 _13 = __lsx_vilvh_h(_23, _01); // r1 r3 g1 g3 b1 b3 a1 a3
1242
1243 auto rg = __lsx_vilvl_h(_13, _02), // r0 r1 r2 r3 g0 g1 g2 g3
1244 ba = __lsx_vilvh_h(_13, _02); // b0 b1 b2 b3 a0 a1 a2 a3
1245
1246 *r = sk_unaligned_load<U16>((uint16_t*)&rg + 0);
1247 *g = sk_unaligned_load<U16>((uint16_t*)&rg + 4);
1248 *b = sk_unaligned_load<U16>((uint16_t*)&ba + 0);
1249 *a = sk_unaligned_load<U16>((uint16_t*)&ba + 4);
1250 }
1251
1252 SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
1253 auto rg = __lsx_vilvl_h(widen_cast<__m128i>(g), widen_cast<__m128i>(r)),
1254 ba = __lsx_vilvl_h(widen_cast<__m128i>(a), widen_cast<__m128i>(b));
1255
1256 __lsx_vst(__lsx_vilvl_w(ba, rg), ptr, 0);
1257 __lsx_vst(__lsx_vilvh_w(ba, rg), ptr, 16);
1258 }
1259
1260 SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
1261 F _0 = (F)__lsx_vld(ptr, 0),
1262 _1 = (F)__lsx_vld(ptr, 16),
1263 _2 = (F)__lsx_vld(ptr, 32),
1264 _3 = (F)__lsx_vld(ptr, 48);
1265 _LSX_TRANSPOSE4_S(_0,_1,_2,_3);
1266 *r = _0;
1267 *g = _1;
1268 *b = _2;
1269 *a = _3;
1270 }
1271
1272 SI void store4(float* ptr, F r, F g, F b, F a) {
1273 _LSX_TRANSPOSE4_S(r,g,b,a);
1274 __lsx_vst(r, ptr, 0);
1275 __lsx_vst(g, ptr, 16);
1276 __lsx_vst(b, ptr, 32);
1277 __lsx_vst(a, ptr, 48);
1278 }
1279
1280 #endif
1281
1282 // Helpers to do scalar -> vector promotion on GCC (clang does this automatically)
1283 // We need to subtract (not add) zero to keep float conversion zero-cost. See:
1284 // https://stackoverflow.com/q/48255293
1285 //
1286 // The GCC implementation should be usable everywhere, but Mac clang (only) complains that the
1287 // expressions make these functions not constexpr.
1288 //
1289 // Further: We can't use the subtract-zero version in scalar mode. There, the subtraction will
1290 // really happen (at least at low optimization levels), which can alter the bit pattern of NaNs.
1291 // Because F_() is used when copying uniforms (even integer uniforms), this can corrupt values.
1292 // The vector subtraction of zero doesn't appear to ever alter NaN bit patterns.
1293 #if defined(__clang__) || defined(SKRP_CPU_SCALAR)
F_(float x)1294 SI constexpr F F_(float x) { return x; }
I32_(int32_t x)1295 SI constexpr I32 I32_(int32_t x) { return x; }
U32_(uint32_t x)1296 SI constexpr U32 U32_(uint32_t x) { return x; }
1297 #else
F_(float x)1298 SI constexpr F F_(float x) { return x - F(); }
I32_(int32_t x)1299 SI constexpr I32 I32_(int32_t x) { return x + I32(); }
U32_(uint32_t x)1300 SI constexpr U32 U32_(uint32_t x) { return x + U32(); }
1301 #endif
1302
1303 // Extremely helpful literals:
1304 static constexpr F F0 = F_(0.0f),
1305 F1 = F_(1.0f);
1306
1307 #if !defined(SKRP_CPU_SCALAR)
min(F a,float b)1308 SI F min(F a, float b) { return min(a, F_(b)); }
min(float a,F b)1309 SI F min(float a, F b) { return min(F_(a), b); }
max(F a,float b)1310 SI F max(F a, float b) { return max(a, F_(b)); }
max(float a,F b)1311 SI F max(float a, F b) { return max(F_(a), b); }
1312
mad(F f,F m,float a)1313 SI F mad(F f, F m, float a) { return mad(f, m, F_(a)); }
mad(F f,float m,F a)1314 SI F mad(F f, float m, F a) { return mad(f, F_(m), a); }
mad(F f,float m,float a)1315 SI F mad(F f, float m, float a) { return mad(f, F_(m), F_(a)); }
mad(float f,F m,F a)1316 SI F mad(float f, F m, F a) { return mad(F_(f), m, a); }
mad(float f,F m,float a)1317 SI F mad(float f, F m, float a) { return mad(F_(f), m, F_(a)); }
mad(float f,float m,F a)1318 SI F mad(float f, float m, F a) { return mad(F_(f), F_(m), a); }
1319
nmad(F f,F m,float a)1320 SI F nmad(F f, F m, float a) { return nmad(f, m, F_(a)); }
nmad(F f,float m,F a)1321 SI F nmad(F f, float m, F a) { return nmad(f, F_(m), a); }
nmad(F f,float m,float a)1322 SI F nmad(F f, float m, float a) { return nmad(f, F_(m), F_(a)); }
nmad(float f,F m,F a)1323 SI F nmad(float f, F m, F a) { return nmad(F_(f), m, a); }
nmad(float f,F m,float a)1324 SI F nmad(float f, F m, float a) { return nmad(F_(f), m, F_(a)); }
nmad(float f,float m,F a)1325 SI F nmad(float f, float m, F a) { return nmad(F_(f), F_(m), a); }
1326 #endif
1327
1328 // We need to be a careful with casts.
1329 // (F)x means cast x to float in the portable path, but bit_cast x to float in the others.
1330 // These named casts and bit_cast() are always what they seem to be.
1331 #if defined(SKRP_CPU_SCALAR)
cast(U32 v)1332 SI F cast (U32 v) { return (F)v; }
cast64(U64 v)1333 SI F cast64(U64 v) { return (F)v; }
trunc_(F v)1334 SI U32 trunc_(F v) { return (U32)v; }
expand(U16 v)1335 SI U32 expand(U16 v) { return (U32)v; }
expand(U8 v)1336 SI U32 expand(U8 v) { return (U32)v; }
1337 #else
cast(U32 v)1338 SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
cast64(U64 v)1339 SI F cast64(U64 v) { return __builtin_convertvector( v, F); }
trunc_(F v)1340 SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); }
expand(U16 v)1341 SI U32 expand(U16 v) { return __builtin_convertvector( v, U32); }
expand(U8 v)1342 SI U32 expand(U8 v) { return __builtin_convertvector( v, U32); }
1343 #endif
1344
1345 #if !defined(SKRP_CPU_SCALAR)
if_then_else(I32 c,F t,float e)1346 SI F if_then_else(I32 c, F t, float e) { return if_then_else(c, t , F_(e)); }
if_then_else(I32 c,float t,F e)1347 SI F if_then_else(I32 c, float t, F e) { return if_then_else(c, F_(t), e ); }
if_then_else(I32 c,float t,float e)1348 SI F if_then_else(I32 c, float t, float e) { return if_then_else(c, F_(t), F_(e)); }
1349 #endif
1350
fract(F v)1351 SI F fract(F v) { return v - floor_(v); }
1352
1353 // See http://www.machinedlearnings.com/2011/06/fast-approximate-logarithm-exponential.html
approx_log2(F x)1354 SI F approx_log2(F x) {
1355 // e - 127 is a fair approximation of log2(x) in its own right...
1356 F e = cast(sk_bit_cast<U32>(x)) * (1.0f / (1<<23));
1357
1358 // ... but using the mantissa to refine its error is _much_ better.
1359 F m = sk_bit_cast<F>((sk_bit_cast<U32>(x) & 0x007fffff) | 0x3f000000);
1360
1361 return nmad(m, 1.498030302f, e - 124.225514990f) - 1.725879990f / (0.3520887068f + m);
1362 }
1363
approx_log(F x)1364 SI F approx_log(F x) {
1365 const float ln2 = 0.69314718f;
1366 return ln2 * approx_log2(x);
1367 }
1368
approx_pow2(F x)1369 SI F approx_pow2(F x) {
1370 constexpr float kInfinityBits = 0x7f800000;
1371
1372 F f = fract(x);
1373 F approx = nmad(f, 1.490129070f, x + 121.274057500f);
1374 approx += 27.728023300f / (4.84252568f - f);
1375 approx *= 1.0f * (1<<23);
1376 approx = min(max(approx, F0), F_(kInfinityBits)); // guard against underflow/overflow
1377
1378 return sk_bit_cast<F>(round(approx));
1379 }
1380
approx_exp(F x)1381 SI F approx_exp(F x) {
1382 const float log2_e = 1.4426950408889634074f;
1383 return approx_pow2(log2_e * x);
1384 }
1385
approx_powf(F x,F y)1386 SI F approx_powf(F x, F y) {
1387 return if_then_else((x == 0)|(x == 1), x
1388 , approx_pow2(approx_log2(x) * y));
1389 }
1390 #if !defined(SKRP_CPU_SCALAR)
approx_powf(F x,float y)1391 SI F approx_powf(F x, float y) { return approx_powf(x, F_(y)); }
1392 #endif
1393
SI F from_half(U16 h) {
#if defined(SKRP_CPU_NEON) && defined(SK_CPU_ARM64)
    return vcvt_f32_f16((float16x4_t)h);

#elif defined(SKRP_CPU_SKX)
    return _mm512_cvtph_ps((__m256i)h);

#elif defined(SKRP_CPU_HSW)
    return _mm256_cvtph_ps((__m128i)h);

#else
    // Remember, a half is 1-5-10 (sign-exponent-mantissa) with 15 exponent bias.
    U32 sem = expand(h),
        s   = sem & 0x8000,
        em  = sem ^ s;

    // Convert to 1-8-23 float with 127 bias, flushing denorm halfs (including zero) to zero.
    auto denorm = (I32)em < 0x0400;  // I32 comparison is often quicker, and always safe here.
    return if_then_else(denorm, F0
                              , sk_bit_cast<F>( (s<<16) + (em<<13) + ((127-15)<<23) ));
#endif
}
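
// A worked example of the branchless path above (illustrative bits, not extra pipeline code):
// the half 0x3C00 is +1.0, so s = 0 and em = 0x3C00, which is not denormal (>= 0x0400); then
// (em<<13) = 0x07800000, ((127-15)<<23) = 0x38000000, and their sum 0x3F800000 is exactly 1.0f.
// Shifting em left by 13 moves the 5-bit exponent and 10-bit mantissa into float position, and
// adding (127-15)<<23 re-biases the exponent from 15 to 127.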

SI U16 to_half(F f) {
#if defined(SKRP_CPU_NEON) && defined(SK_CPU_ARM64)
    return (U16)vcvt_f16_f32(f);

#elif defined(SKRP_CPU_SKX)
    return (U16)_mm512_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);

#elif defined(SKRP_CPU_HSW)
    return (U16)_mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);

#else
    // Remember, a float is 1-8-23 (sign-exponent-mantissa) with 127 exponent bias.
    U32 sem = sk_bit_cast<U32>(f),
        s   = sem & 0x80000000,
        em  = sem ^ s;

    // Convert to 1-5-10 half with 15 bias, flushing denorm halfs (including zero) to zero.
    auto denorm = (I32)em < 0x38800000;  // I32 comparison is often quicker, and always safe here.
    return pack((U32)if_then_else(denorm, I32_(0)
                                        , (I32)((s>>16) + (em>>13) - ((127-15)<<10))));
#endif
}

static void patch_memory_contexts(SkSpan<SkRasterPipeline_MemoryCtxPatch> memoryCtxPatches,
                                  size_t dx, size_t dy, size_t tail) {
    for (SkRasterPipeline_MemoryCtxPatch& patch : memoryCtxPatches) {
        SkRasterPipeline_MemoryCtx* ctx = patch.info.context;

        const ptrdiff_t offset = patch.info.bytesPerPixel * (dy * ctx->stride + dx);
        if (patch.info.load) {
            void* ctxData = SkTAddOffset<void>(ctx->pixels, offset);
            memcpy(patch.scratch, ctxData, patch.info.bytesPerPixel * tail);
        }

        SkASSERT(patch.backup == nullptr);
        void* scratchFakeBase = SkTAddOffset<void>(patch.scratch, -offset);
        patch.backup = ctx->pixels;
        ctx->pixels = scratchFakeBase;
    }
}

static void restore_memory_contexts(SkSpan<SkRasterPipeline_MemoryCtxPatch> memoryCtxPatches,
                                    size_t dx, size_t dy, size_t tail) {
    for (SkRasterPipeline_MemoryCtxPatch& patch : memoryCtxPatches) {
        SkRasterPipeline_MemoryCtx* ctx = patch.info.context;

        SkASSERT(patch.backup != nullptr);
        ctx->pixels = patch.backup;
        patch.backup = nullptr;

        const ptrdiff_t offset = patch.info.bytesPerPixel * (dy * ctx->stride + dx);
        if (patch.info.store) {
            void* ctxData = SkTAddOffset<void>(ctx->pixels, offset);
            memcpy(ctxData, patch.scratch, patch.info.bytesPerPixel * tail);
        }
    }
}
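
// A note on the pointer arithmetic above, with illustrative numbers (not taken from any real run):
// tail stages still address pixels as pixels + bytesPerPixel*(dy*stride + dx). With bytesPerPixel
// = 4, stride = 100, dx = 5, dy = 2 that offset is 4*(2*100 + 5) = 820 bytes, so pointing
// ctx->pixels at scratch - 820 makes the very same arithmetic land on the scratch buffer.
// patch_memory_contexts() installs that fake base (first copying the real pixels into scratch when
// the context is loaded from), and restore_memory_contexts() puts the real base back (copying
// scratch back out to the real pixels when the context is stored to).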

#if defined(SKRP_CPU_SCALAR) || defined(SKRP_CPU_SSE2)
    // In scalar and SSE2 mode, we always use precise math so we can have more predictable results.
    // Chrome will use the SSE2 implementation when --disable-skia-runtime-opts is set. (b/40042946)
    SI F rcp_fast(F v) { return rcp_precise(v); }
    SI F rsqrt(F v)    { return rcp_precise(sqrt_(v)); }
#else
    SI F rcp_fast(F v) { return rcp_approx(v); }
    SI F rsqrt(F v)    { return rsqrt_approx(v); }
#endif

// Our fundamental vector depth is our pixel stride.
static constexpr size_t N = sizeof(F) / sizeof(float);

// We're finally going to get to what a Stage function looks like!

// Any custom ABI to use for all (non-externally-facing) stage functions?
// Also decide here whether to use narrow (compromise) or wide (ideal) stages.
#if defined(SK_CPU_ARM32) && defined(SKRP_CPU_NEON)
    // This lets us pass vectors more efficiently on 32-bit ARM.
    // We can still only pass 16 floats, so best as 4x {r,g,b,a}.
    #define ABI __attribute__((pcs("aapcs-vfp")))
    #define SKRP_NARROW_STAGES 1
#elif defined(_MSC_VER)
    // Even if not vectorized, this lets us pass {r,g,b,a} as registers,
    // instead of {b,a} on the stack.  Narrow stages work best for __vectorcall.
    #define ABI __vectorcall
    #define SKRP_NARROW_STAGES 1
#elif defined(__x86_64__) || defined(SK_CPU_ARM64) || defined(SK_CPU_LOONGARCH)
    // These platforms are ideal for wider stages, and their default ABI is ideal.
    #define ABI
    #define SKRP_NARROW_STAGES 0
#else
    // 32-bit or unknown... shunt them down the narrow path.
    // Odds are these have few registers and are better off there.
    #define ABI
    #define SKRP_NARROW_STAGES 1
#endif

#if SKRP_NARROW_STAGES
    struct Params {
        size_t dx, dy;
        std::byte* base;
        F dr,dg,db,da;
    };
    using Stage = void(ABI*)(Params*, SkRasterPipelineStage* program, F r, F g, F b, F a);
#else
    using Stage = void(ABI*)(SkRasterPipelineStage* program, size_t dx, size_t dy,
                             std::byte* base, F,F,F,F, F,F,F,F);
#endif

static void start_pipeline(size_t dx, size_t dy,
                           size_t xlimit, size_t ylimit,
                           SkRasterPipelineStage* program,
                           SkSpan<SkRasterPipeline_MemoryCtxPatch> memoryCtxPatches,
                           uint8_t* tailPointer) {
    uint8_t unreferencedTail;
    if (!tailPointer) {
        tailPointer = &unreferencedTail;
    }
    auto start = (Stage)program->fn;
    const size_t x0 = dx;
    std::byte* const base = nullptr;
    for (; dy < ylimit; dy++) {
    #if SKRP_NARROW_STAGES
        Params params = { x0,dy,base, F0,F0,F0,F0 };
        while (params.dx + N <= xlimit) {
            start(&params,program, F0,F0,F0,F0);
            params.dx += N;
        }
        if (size_t tail = xlimit - params.dx) {
            *tailPointer = tail;
            patch_memory_contexts(memoryCtxPatches, params.dx, dy, tail);
            start(&params,program, F0,F0,F0,F0);
            restore_memory_contexts(memoryCtxPatches, params.dx, dy, tail);
            *tailPointer = 0xFF;
        }
    #else
        dx = x0;
        while (dx + N <= xlimit) {
            start(program,dx,dy,base, F0,F0,F0,F0, F0,F0,F0,F0);
            dx += N;
        }
        if (size_t tail = xlimit - dx) {
            *tailPointer = tail;
            patch_memory_contexts(memoryCtxPatches, dx, dy, tail);
            start(program,dx,dy,base, F0,F0,F0,F0, F0,F0,F0,F0);
            restore_memory_contexts(memoryCtxPatches, dx, dy, tail);
            *tailPointer = 0xFF;
        }
    #endif
    }
}
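
// Loop bookkeeping, with illustrative numbers: if N == 8 and a row runs from dx = 0 to xlimit = 10,
// the while-loop runs the pipeline once for the full batch of pixels 0..7, leaving tail = 2. For
// that final partial batch *tailPointer is set to 2 and the MemoryCtx patches above redirect loads
// and stores into scratch space so nothing reads or writes past the end of the row; afterwards
// *tailPointer is reset to 0xFF until the next tail run.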

#if SK_HAS_MUSTTAIL
    #define SKRP_MUSTTAIL [[clang::musttail]]
#else
    #define SKRP_MUSTTAIL
#endif

#if SKRP_NARROW_STAGES
    #define DECLARE_STAGE(name, ARG, STAGE_RET, INC, OFFSET, MUSTTAIL)                    \
        SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, std::byte*& base,                \
                              F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);        \
        static void ABI name(Params* params, SkRasterPipelineStage* program,              \
                             F r, F g, F b, F a) {                                        \
            OFFSET name##_k(Ctx{program}, params->dx,params->dy,params->base,             \
                            r,g,b,a, params->dr, params->dg, params->db, params->da);     \
            INC;                                                                           \
            auto fn = (Stage)program->fn;                                                  \
            MUSTTAIL return fn(params, program, r,g,b,a);                                  \
        }                                                                                  \
        SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, std::byte*& base,                 \
                              F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#else
    #define DECLARE_STAGE(name, ARG, STAGE_RET, INC, OFFSET, MUSTTAIL)                     \
        SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, std::byte*& base,                  \
                              F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);          \
        static void ABI name(SkRasterPipelineStage* program, size_t dx, size_t dy,          \
                             std::byte* base, F r, F g, F b, F a, F dr, F dg, F db, F da) { \
            OFFSET name##_k(Ctx{program}, dx,dy,base, r,g,b,a, dr,dg,db,da);                 \
            INC;                                                                             \
            auto fn = (Stage)program->fn;                                                    \
            MUSTTAIL return fn(program, dx,dy,base, r,g,b,a, dr,dg,db,da);                   \
        }                                                                                    \
        SI STAGE_RET name##_k(ARG, size_t dx, size_t dy, std::byte*& base,                   \
                              F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
#endif

// A typical stage returns void, always increments the program counter by 1, and lets the optimizer
// decide whether or not tail-calling is appropriate.
#define STAGE(name, arg) \
    DECLARE_STAGE(name, arg, void, ++program, /*no offset*/, /*no musttail*/)

// A tail stage returns void, always increments the program counter by 1, and uses tail-calling.
// Tail-calling is necessary in SkSL-generated programs, which can be thousands of ops long, and
// could overflow the stack (particularly in debug).
#define STAGE_TAIL(name, arg) \
    DECLARE_STAGE(name, arg, void, ++program, /*no offset*/, SKRP_MUSTTAIL)

// A branch stage returns an integer, which is added directly to the program counter, and tailcalls.
#define STAGE_BRANCH(name, arg) \
    DECLARE_STAGE(name, arg, int, /*no increment*/, program +=, SKRP_MUSTTAIL)
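
// To make the macros above concrete, STAGE(premul, NoCtx) expands roughly like this in the wide
// configuration (an illustration of the shape, not the literal preprocessor output):
//
//     SI void premul_k(NoCtx, size_t dx, size_t dy, std::byte*& base,
//                      F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da);
//     static void ABI premul(SkRasterPipelineStage* program, size_t dx, size_t dy,
//                            std::byte* base, F r, F g, F b, F a, F dr, F dg, F db, F da) {
//         premul_k(Ctx{program}, dx,dy,base, r,g,b,a, dr,dg,db,da);
//         ++program;
//         auto fn = (Stage)program->fn;
//         return fn(program, dx,dy,base, r,g,b,a, dr,dg,db,da);
//     }
//     SI void premul_k(NoCtx, size_t dx, size_t dy, std::byte*& base,
//                      F& r, F& g, F& b, F& a, F& dr, F& dg, F& db, F& da)
//     { /* ...the braces written after STAGE(premul, NoCtx)... */ }
//
// so the body written after the STAGE(...) macro becomes the _k kernel, and the generated
// trampoline invokes it and then chains to the next stage in the program.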

// just_return() is a simple no-op stage that only exists to end the chain,
// returning back up to start_pipeline(), and from there to the caller.
#if SKRP_NARROW_STAGES
    static void ABI just_return(Params*, SkRasterPipelineStage*, F,F,F,F) {}
#else
    static void ABI just_return(SkRasterPipelineStage*, size_t,size_t, std::byte*,
                                F,F,F,F, F,F,F,F) {}
#endif

// Note that in release builds, most stages consume no stack (thanks to tail call optimization).
// However, certain builds (especially with non-clang compilers) may fail to optimize tail
// calls, resulting in actual stack frames being generated.
//
// stack_checkpoint() and stack_rewind() are special stages that can be used to manage stack growth.
// If a pipeline contains a stack_checkpoint, followed by any number of stack_rewind (at any point),
// the C++ stack will be reset to the state it was in when the stack_checkpoint was initially hit.
//
// All instances of stack_rewind (as well as the one instance of stack_checkpoint near the start of
// a pipeline) share a single context (of type SkRasterPipeline_RewindCtx). That context holds the
// full state of the mutable registers that are normally passed to the next stage in the program.
//
// stack_rewind is the only stage other than just_return that actually returns (rather than jumping
// to the next stage in the program). Before it does so, it stashes all of the registers in the
// context. This includes the updated `program` pointer. Unlike stages that tail call exactly once,
// stack_checkpoint calls the next stage in the program repeatedly, as long as the `program` in the
// context is overwritten (i.e., as long as a stack_rewind was the reason the pipeline returned,
// rather than a just_return).
//
// Normally, just_return is the only stage that returns, and no other stage does anything after a
// subsequent (called) stage returns, so the stack just unwinds all the way to start_pipeline.
// With stack_checkpoint on the stack, any stack_rewind stages will return all the way up to the
// stack_checkpoint. That grabs the values that would have been passed to the next stage (from the
// context) and continues the linear execution of stages, having first reclaimed all of the stack
// frames pushed since the checkpoint.
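
// A sketch of how that plays out (hypothetical program layout, for illustration only):
//     stack_checkpoint, seed_shader, ...many SkSL ops..., stack_rewind, ...more ops..., just_return
// runs normally until stack_rewind fires. stack_rewind stores r,g,b,a / dr,dg,db,da plus the next
// program pointer into the shared RewindCtx and returns, unwinding every frame back into
// stack_checkpoint's loop, which reloads those registers from the context and resumes at the
// stored op with a fresh stack.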
1652 #if SKRP_NARROW_STAGES
stack_checkpoint(Params * params,SkRasterPipelineStage * program,F r,F g,F b,F a)1653 static void ABI stack_checkpoint(Params* params, SkRasterPipelineStage* program,
1654 F r, F g, F b, F a) {
1655 SkRasterPipeline_RewindCtx* ctx = Ctx{program};
1656 while (program) {
1657 auto next = (Stage)(++program)->fn;
1658
1659 ctx->stage = nullptr;
1660 next(params, program, r, g, b, a);
1661 program = ctx->stage;
1662
1663 if (program) {
1664 r = sk_unaligned_load<F>(ctx->r );
1665 g = sk_unaligned_load<F>(ctx->g );
1666 b = sk_unaligned_load<F>(ctx->b );
1667 a = sk_unaligned_load<F>(ctx->a );
1668 params->dr = sk_unaligned_load<F>(ctx->dr);
1669 params->dg = sk_unaligned_load<F>(ctx->dg);
1670 params->db = sk_unaligned_load<F>(ctx->db);
1671 params->da = sk_unaligned_load<F>(ctx->da);
1672 params->base = ctx->base;
1673 }
1674 }
1675 }
stack_rewind(Params * params,SkRasterPipelineStage * program,F r,F g,F b,F a)1676 static void ABI stack_rewind(Params* params, SkRasterPipelineStage* program,
1677 F r, F g, F b, F a) {
1678 SkRasterPipeline_RewindCtx* ctx = Ctx{program};
1679 sk_unaligned_store(ctx->r , r );
1680 sk_unaligned_store(ctx->g , g );
1681 sk_unaligned_store(ctx->b , b );
1682 sk_unaligned_store(ctx->a , a );
1683 sk_unaligned_store(ctx->dr, params->dr);
1684 sk_unaligned_store(ctx->dg, params->dg);
1685 sk_unaligned_store(ctx->db, params->db);
1686 sk_unaligned_store(ctx->da, params->da);
1687 ctx->base = params->base;
1688 ctx->stage = program;
1689 }
1690 #else
stack_checkpoint(SkRasterPipelineStage * program,size_t dx,size_t dy,std::byte * base,F r,F g,F b,F a,F dr,F dg,F db,F da)1691 static void ABI stack_checkpoint(SkRasterPipelineStage* program,
1692 size_t dx, size_t dy, std::byte* base,
1693 F r, F g, F b, F a, F dr, F dg, F db, F da) {
1694 SkRasterPipeline_RewindCtx* ctx = Ctx{program};
1695 while (program) {
1696 auto next = (Stage)(++program)->fn;
1697
1698 ctx->stage = nullptr;
1699 next(program, dx, dy, base, r, g, b, a, dr, dg, db, da);
1700 program = ctx->stage;
1701
1702 if (program) {
1703 r = sk_unaligned_load<F>(ctx->r );
1704 g = sk_unaligned_load<F>(ctx->g );
1705 b = sk_unaligned_load<F>(ctx->b );
1706 a = sk_unaligned_load<F>(ctx->a );
1707 dr = sk_unaligned_load<F>(ctx->dr);
1708 dg = sk_unaligned_load<F>(ctx->dg);
1709 db = sk_unaligned_load<F>(ctx->db);
1710 da = sk_unaligned_load<F>(ctx->da);
1711 base = ctx->base;
1712 }
1713 }
1714 }
stack_rewind(SkRasterPipelineStage * program,size_t dx,size_t dy,std::byte * base,F r,F g,F b,F a,F dr,F dg,F db,F da)1715 static void ABI stack_rewind(SkRasterPipelineStage* program,
1716 size_t dx, size_t dy, std::byte* base,
1717 F r, F g, F b, F a, F dr, F dg, F db, F da) {
1718 SkRasterPipeline_RewindCtx* ctx = Ctx{program};
1719 sk_unaligned_store(ctx->r , r );
1720 sk_unaligned_store(ctx->g , g );
1721 sk_unaligned_store(ctx->b , b );
1722 sk_unaligned_store(ctx->a , a );
1723 sk_unaligned_store(ctx->dr, dr);
1724 sk_unaligned_store(ctx->dg, dg);
1725 sk_unaligned_store(ctx->db, db);
1726 sk_unaligned_store(ctx->da, da);
1727 ctx->base = base;
1728 ctx->stage = program;
1729 }
1730 #endif
1731
1732
1733 // We could start defining normal Stages now. But first, some helper functions.
1734
1735 template <typename V, typename T>
load(const T * src)1736 SI V load(const T* src) {
1737 return sk_unaligned_load<V>(src);
1738 }
1739
1740 template <typename V, typename T>
store(T * dst,V v)1741 SI void store(T* dst, V v) {
1742 sk_unaligned_store(dst, v);
1743 }
1744
from_byte(U8 b)1745 SI F from_byte(U8 b) {
1746 return cast(expand(b)) * (1/255.0f);
1747 }
from_short(U16 s)1748 SI F from_short(U16 s) {
1749 return cast(expand(s)) * (1/65535.0f);
1750 }
from_565(U16 _565,F * r,F * g,F * b)1751 SI void from_565(U16 _565, F* r, F* g, F* b) {
1752 U32 wide = expand(_565);
1753 *r = cast(wide & (31<<11)) * (1.0f / (31<<11));
1754 *g = cast(wide & (63<< 5)) * (1.0f / (63<< 5));
1755 *b = cast(wide & (31<< 0)) * (1.0f / (31<< 0));
1756 }
from_4444(U16 _4444,F * r,F * g,F * b,F * a)1757 SI void from_4444(U16 _4444, F* r, F* g, F* b, F* a) {
1758 U32 wide = expand(_4444);
1759 *r = cast(wide & (15<<12)) * (1.0f / (15<<12));
1760 *g = cast(wide & (15<< 8)) * (1.0f / (15<< 8));
1761 *b = cast(wide & (15<< 4)) * (1.0f / (15<< 4));
1762 *a = cast(wide & (15<< 0)) * (1.0f / (15<< 0));
1763 }
from_8888(U32 _8888,F * r,F * g,F * b,F * a)1764 SI void from_8888(U32 _8888, F* r, F* g, F* b, F* a) {
1765 *r = cast((_8888 ) & 0xff) * (1/255.0f);
1766 *g = cast((_8888 >> 8) & 0xff) * (1/255.0f);
1767 *b = cast((_8888 >> 16) & 0xff) * (1/255.0f);
1768 *a = cast((_8888 >> 24) ) * (1/255.0f);
1769 }
from_88(U16 _88,F * r,F * g)1770 SI void from_88(U16 _88, F* r, F* g) {
1771 U32 wide = expand(_88);
1772 *r = cast((wide ) & 0xff) * (1/255.0f);
1773 *g = cast((wide >> 8) & 0xff) * (1/255.0f);
1774 }
from_1010102(U32 rgba,F * r,F * g,F * b,F * a)1775 SI void from_1010102(U32 rgba, F* r, F* g, F* b, F* a) {
1776 *r = cast((rgba ) & 0x3ff) * (1/1023.0f);
1777 *g = cast((rgba >> 10) & 0x3ff) * (1/1023.0f);
1778 *b = cast((rgba >> 20) & 0x3ff) * (1/1023.0f);
1779 *a = cast((rgba >> 30) ) * (1/ 3.0f);
1780 }
from_1010102_xr(U32 rgba,F * r,F * g,F * b,F * a)1781 SI void from_1010102_xr(U32 rgba, F* r, F* g, F* b, F* a) {
1782 static constexpr float min = -0.752941f;
1783 static constexpr float max = 1.25098f;
1784 static constexpr float range = max - min;
1785 *r = cast((rgba ) & 0x3ff) * (1/1023.0f) * range + min;
1786 *g = cast((rgba >> 10) & 0x3ff) * (1/1023.0f) * range + min;
1787 *b = cast((rgba >> 20) & 0x3ff) * (1/1023.0f) * range + min;
1788 *a = cast((rgba >> 30) ) * (1/ 3.0f);
1789 }
from_10101010_xr(U64 _10x6,F * r,F * g,F * b,F * a)1790 SI void from_10101010_xr(U64 _10x6, F* r, F* g, F* b, F* a) {
1791 *r = (cast64((_10x6 >> 6) & 0x3ff) - 384.f) / 510.f;
1792 *g = (cast64((_10x6 >> 22) & 0x3ff) - 384.f) / 510.f;
1793 *b = (cast64((_10x6 >> 38) & 0x3ff) - 384.f) / 510.f;
1794 *a = (cast64((_10x6 >> 54) & 0x3ff) - 384.f) / 510.f;
1795 }
from_10x6(U64 _10x6,F * r,F * g,F * b,F * a)1796 SI void from_10x6(U64 _10x6, F* r, F* g, F* b, F* a) {
1797 *r = cast64((_10x6 >> 6) & 0x3ff) * (1/1023.0f);
1798 *g = cast64((_10x6 >> 22) & 0x3ff) * (1/1023.0f);
1799 *b = cast64((_10x6 >> 38) & 0x3ff) * (1/1023.0f);
1800 *a = cast64((_10x6 >> 54) & 0x3ff) * (1/1023.0f);
1801 }
from_1616(U32 _1616,F * r,F * g)1802 SI void from_1616(U32 _1616, F* r, F* g) {
1803 *r = cast((_1616 ) & 0xffff) * (1/65535.0f);
1804 *g = cast((_1616 >> 16) & 0xffff) * (1/65535.0f);
1805 }
from_16161616(U64 _16161616,F * r,F * g,F * b,F * a)1806 SI void from_16161616(U64 _16161616, F* r, F* g, F* b, F* a) {
1807 *r = cast64((_16161616 ) & 0xffff) * (1/65535.0f);
1808 *g = cast64((_16161616 >> 16) & 0xffff) * (1/65535.0f);
1809 *b = cast64((_16161616 >> 32) & 0xffff) * (1/65535.0f);
1810 *a = cast64((_16161616 >> 48) & 0xffff) * (1/65535.0f);
1811 }
1812
// Used by load_ and store_ stages to get to the right (dx,dy) starting point of contiguous memory.
template <typename T>
SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) {
    return (T*)ctx->pixels + dy*ctx->stride + dx;
}

// clamp v to [0,limit).
SI F clamp(F v, F limit) {
    F inclusive = sk_bit_cast<F>( sk_bit_cast<U32>(limit) - 1 );  // Exclusive -> inclusive.
    return min(max(0.0f, v), inclusive);
}

// clamp to (0,limit).
SI F clamp_ex(F v, float limit) {
    const F inclusiveZ = F_(std::numeric_limits<float>::min()),
            inclusiveL = sk_bit_cast<F>( sk_bit_cast<U32>(F_(limit)) - 1 );
    return min(max(inclusiveZ, v), inclusiveL);
}
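
// The bit trick above leans on IEEE-754 ordering: for positive finite floats, subtracting one from
// the bit pattern yields the next smaller float. As a worked example, limit = 8.0f has bits
// 0x41000000, and 0x40FFFFFF is 7.9999995f, the largest float strictly less than 8, so the
// exclusive upper bound becomes an inclusive one without any extra comparison.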
1831
// Polynomial approximation of degree 5 for sin(x * 2 * pi) in the range [-1/4, 1/4]
// Adapted from https://github.com/google/swiftshader/blob/master/docs/Sin-Cos-Optimization.pdf
SI F sin5q_(F x) {
    // A * x + B * x^3 + C * x^5
    // Exact at x = 0, 1/12, 1/6, 1/4, and their negatives,
    // which correspond to x * 2 * pi = 0, pi/6, pi/3, pi/2
    constexpr float A =  6.28230858f;
    constexpr float B = -41.1693687f;
    constexpr float C =  74.4388885f;
    F x2 = x * x;
    return x * mad(mad(x2, C, B), x2, A);
}
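
// Spot-checking the polynomial (a worked example, not pipeline code): at x = 1/4, i.e. an argument
// of pi/2, x^2 = 1/16 and
//     0.25 * ((74.4388885/16 - 41.1693687)/16 + 6.28230858) ~= 1.0,
// matching sin(pi/2); the coefficients were picked so the anchor points listed above come out
// essentially exact in float arithmetic.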

SI F sin_(F x) {
    constexpr float one_over_pi2 = 1 / (2 * SK_FloatPI);
    x = mad(x, -one_over_pi2, 0.25f);
    x = 0.25f - abs_(x - floor_(x + 0.5f));
    return sin5q_(x);
}

SI F cos_(F x) {
    constexpr float one_over_pi2 = 1 / (2 * SK_FloatPI);
    x *= one_over_pi2;
    x = 0.25f - abs_(x - floor_(x + 0.5f));
    return sin5q_(x);
}
1858
/* "GENERATING ACCURATE VALUES FOR THE TANGENT FUNCTION"
   https://mae.ufl.edu/~uhk/ACCURATE-TANGENT.pdf

   approx = x + (1/3)x^3 + (2/15)x^5 + (17/315)x^7 + (62/2835)x^9

   Some simplifications:
   1. tan(x) is periodic, -PI/2 < x < PI/2
   2. tan(x) is odd, so tan(-x) = -tan(x)
   3. Our polynomial approximation is best near zero, so we use the following identity
                      tan(x) + tan(y)
      tan(x + y) = -------------------
                    1 - tan(x)*tan(y)
      tan(PI/4) = 1

      So for x > PI/8, we do the following refactor:
      x' = x - PI/4

               1 + tan(x')
      tan(x) = -----------
               1 - tan(x')
*/
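
// Worked example of that refactor (illustrative numbers): for x = pi/3, x' = pi/3 - pi/4 = pi/12
// ~= 0.2618, well inside the range where the odd polynomial is accurate. The polynomial gives
// tan(pi/12) ~= 0.26795, and (1 + 0.26795) / (1 - 0.26795) ~= 1.7320, which is tan(pi/3) = sqrt(3)
// to about five digits.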
SI F tan_(F x) {
    constexpr float Pi = SK_FloatPI;
    // periodic between -pi/2 ... pi/2
    // shift to 0...Pi, scale 1/Pi to get into 0...1, then fract, scale-up, shift-back
    x = mad(fract(mad(x, 1/Pi, 0.5f)), Pi, -Pi/2);

    I32 neg = (x < 0.0f);
    x = if_then_else(neg, -x, x);

    // minimize total error by shifting if x > pi/8
    I32 use_quotient = (x > (Pi/8));
    x = if_then_else(use_quotient, x - (Pi/4), x);

    // 9th order poly = 4th order(x^2) * x
    const float c4 = 62 / 2835.0f;
    const float c3 = 17 / 315.0f;
    const float c2 = 2 / 15.0f;
    const float c1 = 1 / 3.0f;
    const float c0 = 1.0f;
    F x2 = x * x;
    x *= mad(x2, mad(x2, mad(x2, mad(x2, c4, c3), c2), c1), c0);
    x = if_then_else(use_quotient, (1+x)/(1-x), x);
    x = if_then_else(neg, -x, x);
    return x;
}
1905
1906 /* Use 4th order polynomial approximation from https://arachnoid.com/polysolve/
1907 with 129 values of x,atan(x) for x:[0...1]
1908 This only works for 0 <= x <= 1
1909 */
approx_atan_unit(F x)1910 SI F approx_atan_unit(F x) {
1911 // y = 0.14130025741326729 x⁴
1912 // - 0.34312835980675116 x³
1913 // - 0.016172900528248768 x²
1914 // + 1.00376969762003850 x
1915 // - 0.00014758242182738969
1916 const float c4 = 0.14130025741326729f;
1917 const float c3 = -0.34312835980675116f;
1918 const float c2 = -0.016172900528248768f;
1919 const float c1 = 1.0037696976200385f;
1920 const float c0 = -0.00014758242182738969f;
1921 return mad(x, mad(x, mad(x, mad(x, c4, c3), c2), c1), c0);
1922 }
1923
1924 // Use identity atan(x) = pi/2 - atan(1/x) for x > 1
atan_(F x)1925 SI F atan_(F x) {
1926 I32 neg = (x < 0.0f);
1927 x = if_then_else(neg, -x, x);
1928 I32 flip = (x > 1.0f);
1929 x = if_then_else(flip, 1/x, x);
1930 x = approx_atan_unit(x);
1931 x = if_then_else(flip, SK_FloatPI/2 - x, x);
1932 x = if_then_else(neg, -x, x);
1933 return x;
1934 }
1935
1936 // Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun:
1937 // https://books.google.com/books/content?id=ZboM5tOFWtsC&pg=PA81&img=1&zoom=3&hl=en&bul=1&sig=ACfU3U2M75tG_iGVOS92eQspr14LTq02Nw&ci=0%2C15%2C999%2C1279&edge=0
1938 // http://screen/8YGJxUGFQ49bVX6
asin_(F x)1939 SI F asin_(F x) {
1940 I32 neg = (x < 0.0f);
1941 x = if_then_else(neg, -x, x);
1942 const float c3 = -0.0187293f;
1943 const float c2 = 0.0742610f;
1944 const float c1 = -0.2121144f;
1945 const float c0 = 1.5707288f;
1946 F poly = mad(x, mad(x, mad(x, c3, c2), c1), c0);
1947 x = nmad(sqrt_(1 - x), poly, SK_FloatPI/2);
1948 x = if_then_else(neg, -x, x);
1949 return x;
1950 }
1951
acos_(F x)1952 SI F acos_(F x) {
1953 return SK_FloatPI/2 - asin_(x);
1954 }
1955
1956 /* Use identity atan(x) = pi/2 - atan(1/x) for x > 1
1957 By swapping y,x to ensure the ratio is <= 1, we can safely call atan_unit()
1958 which avoids a 2nd divide instruction if we had instead called atan().
1959 */
atan2_(F y0,F x0)1960 SI F atan2_(F y0, F x0) {
1961 I32 flip = (abs_(y0) > abs_(x0));
1962 F y = if_then_else(flip, x0, y0);
1963 F x = if_then_else(flip, y0, x0);
1964 F arg = y/x;
1965
1966 I32 neg = (arg < 0.0f);
1967 arg = if_then_else(neg, -arg, arg);
1968
1969 F r = approx_atan_unit(arg);
1970 r = if_then_else(flip, SK_FloatPI/2 - r, r);
1971 r = if_then_else(neg, -r, r);
1972
1973 // handle quadrant distinctions
1974 r = if_then_else((y0 >= 0) & (x0 < 0), r + SK_FloatPI, r);
1975 r = if_then_else((y0 < 0) & (x0 <= 0), r - SK_FloatPI, r);
1976 // Note: we don't try to handle 0,0 or infinities
1977 return r;
1978 }
1979
1980 // Used by gather_ stages to calculate the base pointer and a vector of indices to load.
1981 template <typename T>
ix_and_ptr(T ** ptr,const SkRasterPipeline_GatherCtx * ctx,F x,F y)1982 SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
1983 // We use exclusive clamp so that our min value is > 0 because ULP subtraction using U32 would
1984 // produce a NaN if applied to +0.f.
1985 x = clamp_ex(x, ctx->width );
1986 y = clamp_ex(y, ctx->height);
1987 x = sk_bit_cast<F>(sk_bit_cast<U32>(x) - (uint32_t)ctx->roundDownAtInteger);
1988 y = sk_bit_cast<F>(sk_bit_cast<U32>(y) - (uint32_t)ctx->roundDownAtInteger);
1989 *ptr = (const T*)ctx->pixels;
1990 return trunc_(y)*ctx->stride + trunc_(x);
1991 }
1992
// We often have a nominally [0,1] float value we need to scale and convert to an integer,
// whether for a table lookup or to pack back down into bytes for storage.
//
// In practice, especially when dealing with interesting color spaces, that notionally
// [0,1] float may be out of [0,1] range.  Unorms cannot represent that, so we must clamp.
//
// You can adjust the expected input to [0,bias] by tweaking that parameter.
SI U32 to_unorm(F v, float scale, float bias = 1.0f) {
    // Any time we use round() we probably want to use to_unorm().
    return round(min(max(0.0f, v), bias), F_(scale));
}
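
// Worked example (illustrative values): to_unorm(v, 255) with v = 0.5f clamps 0.5 into [0,1] and
// rounds 0.5 * 255 = 127.5 to the nearest integer, 128 under the usual round-to-nearest-even; an
// out-of-gamut v such as 1.3f or -0.2f clamps to 255 or 0 first, which is what keeps overshoot
// from wrapping when the result is packed back into bytes.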
2004
cond_to_mask(I32 cond)2005 SI I32 cond_to_mask(I32 cond) {
2006 #if defined(SKRP_CPU_SCALAR)
2007 // In scalar mode, conditions are bools (0 or 1), but we want to store and operate on masks
2008 // (eg, using bitwise operations to select values).
2009 return if_then_else(cond, I32(~0), I32(0));
2010 #else
2011 // In SIMD mode, our various instruction sets already represent conditions as masks.
2012 return cond;
2013 #endif
2014 }
2015
2016 #if defined(SKRP_CPU_SCALAR)
2017 // In scalar mode, `data` only contains a single lane.
select_lane(uint32_t data,int)2018 SI uint32_t select_lane(uint32_t data, int /*lane*/) { return data; }
select_lane(int32_t data,int)2019 SI int32_t select_lane( int32_t data, int /*lane*/) { return data; }
2020 #else
2021 // In SIMD mode, `data` contains a vector of lanes.
select_lane(U32 data,int lane)2022 SI uint32_t select_lane(U32 data, int lane) { return data[lane]; }
select_lane(I32 data,int lane)2023 SI int32_t select_lane(I32 data, int lane) { return data[lane]; }
2024 #endif
2025
2026 // Now finally, normal Stages!
2027
STAGE(seed_shader,NoCtx)2028 STAGE(seed_shader, NoCtx) {
2029 static constexpr float iota[] = {
2030 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
2031 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
2032 };
2033 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride_highp);
2034
2035 // It's important for speed to explicitly cast(dx) and cast(dy),
2036 // which has the effect of splatting them to vectors before converting to floats.
2037 // On Intel this breaks a data dependency on previous loop iterations' registers.
2038 r = cast(U32_(dx)) + sk_unaligned_load<F>(iota);
2039 g = cast(U32_(dy)) + 0.5f;
2040 b = F1; // This is w=1 for matrix multiplies by the device coords.
2041 a = F0;
2042 }
2043
STAGE(dither, const float* rate) {
    // Get [(dx,dy), (dx+1,dy), (dx+2,dy), ...] loaded up in integer vectors.
    uint32_t iota[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
    static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride_highp);

    U32 X = U32_(dx) + sk_unaligned_load<U32>(iota),
        Y = U32_(dy);

    // We're doing 8x8 ordered dithering, see https://en.wikipedia.org/wiki/Ordered_dithering.
    // In this case n=8 and we're using the matrix that looks like 1/64 x [ 0 48 12 60 ... ].

    // We only need X and X^Y from here on, so it's easier to just think of that as "Y".
    Y ^= X;

    // We'll mix the bottom 3 bits of each of X and Y to make 6 bits,
    // for 2^6 == 64 == 8x8 matrix values. If X=abc and Y=def, we make fcebda.
    U32 M = (Y & 1) << 5 | (X & 1) << 4
          | (Y & 2) << 2 | (X & 2) << 1
          | (Y & 4) >> 1 | (X & 4) >> 2;

    // Scale that dither to [0,1), then (-0.5,+0.5), here using 63/128 = 0.4921875 as 0.5-epsilon.
    // We want to make sure our dither is less than 0.5 in either direction to keep exact values
    // like 0 and 1 unchanged after rounding.
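    // Worked example of the bit mix (illustrative lane values): X = 0b011 and Y = X^Y = 0b101
    // interleave as fcebda = 0b110110 = 54, so the line below gives 54 * (2/128) - 63/128 = 45/128
    // ~= +0.35; across all M in 0..63 the dither spans [-63/128, +63/128], safely inside (-0.5, 0.5).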
    F dither = mad(cast(M), 2/128.0f, -63/128.0f);

    r = mad(dither, *rate, r);
    g = mad(dither, *rate, g);
    b = mad(dither, *rate, b);

    r = max(0.0f, min(r, a));
    g = max(0.0f, min(g, a));
    b = max(0.0f, min(b, a));
}
2077
2078 // load 4 floats from memory, and splat them into r,g,b,a
STAGE(uniform_color,const SkRasterPipeline_UniformColorCtx * c)2079 STAGE(uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
2080 r = F_(c->r);
2081 g = F_(c->g);
2082 b = F_(c->b);
2083 a = F_(c->a);
2084 }
STAGE(unbounded_uniform_color,const SkRasterPipeline_UniformColorCtx * c)2085 STAGE(unbounded_uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
2086 r = F_(c->r);
2087 g = F_(c->g);
2088 b = F_(c->b);
2089 a = F_(c->a);
2090 }
2091 // load 4 floats from memory, and splat them into dr,dg,db,da
STAGE(uniform_color_dst,const SkRasterPipeline_UniformColorCtx * c)2092 STAGE(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) {
2093 dr = F_(c->r);
2094 dg = F_(c->g);
2095 db = F_(c->b);
2096 da = F_(c->a);
2097 }
2098
2099 // splats opaque-black into r,g,b,a
STAGE(black_color,NoCtx)2100 STAGE(black_color, NoCtx) {
2101 r = g = b = F0;
2102 a = F1;
2103 }
2104
STAGE(white_color,NoCtx)2105 STAGE(white_color, NoCtx) {
2106 r = g = b = a = F1;
2107 }
2108
2109 // load registers r,g,b,a from context (mirrors store_src)
STAGE(load_src,const float * ptr)2110 STAGE(load_src, const float* ptr) {
2111 r = sk_unaligned_load<F>(ptr + 0*N);
2112 g = sk_unaligned_load<F>(ptr + 1*N);
2113 b = sk_unaligned_load<F>(ptr + 2*N);
2114 a = sk_unaligned_load<F>(ptr + 3*N);
2115 }
2116
2117 // store registers r,g,b,a into context (mirrors load_src)
STAGE(store_src,float * ptr)2118 STAGE(store_src, float* ptr) {
2119 sk_unaligned_store(ptr + 0*N, r);
2120 sk_unaligned_store(ptr + 1*N, g);
2121 sk_unaligned_store(ptr + 2*N, b);
2122 sk_unaligned_store(ptr + 3*N, a);
2123 }
2124 // store registers r,g into context
STAGE(store_src_rg,float * ptr)2125 STAGE(store_src_rg, float* ptr) {
2126 sk_unaligned_store(ptr + 0*N, r);
2127 sk_unaligned_store(ptr + 1*N, g);
2128 }
2129 // load registers r,g from context
STAGE(load_src_rg,float * ptr)2130 STAGE(load_src_rg, float* ptr) {
2131 r = sk_unaligned_load<F>(ptr + 0*N);
2132 g = sk_unaligned_load<F>(ptr + 1*N);
2133 }
2134 // store register a into context
STAGE(store_src_a,float * ptr)2135 STAGE(store_src_a, float* ptr) {
2136 sk_unaligned_store(ptr, a);
2137 }
2138
2139 // load registers dr,dg,db,da from context (mirrors store_dst)
STAGE(load_dst,const float * ptr)2140 STAGE(load_dst, const float* ptr) {
2141 dr = sk_unaligned_load<F>(ptr + 0*N);
2142 dg = sk_unaligned_load<F>(ptr + 1*N);
2143 db = sk_unaligned_load<F>(ptr + 2*N);
2144 da = sk_unaligned_load<F>(ptr + 3*N);
2145 }
2146
2147 // store registers dr,dg,db,da into context (mirrors load_dst)
STAGE(store_dst,float * ptr)2148 STAGE(store_dst, float* ptr) {
2149 sk_unaligned_store(ptr + 0*N, dr);
2150 sk_unaligned_store(ptr + 1*N, dg);
2151 sk_unaligned_store(ptr + 2*N, db);
2152 sk_unaligned_store(ptr + 3*N, da);
2153 }
2154
// Most blend modes apply the same logic to each channel.
#define BLEND_MODE(name)                       \
    SI F name##_channel(F s, F d, F sa, F da); \
    STAGE(name, NoCtx) {                       \
        r = name##_channel(r,dr,a,da);         \
        g = name##_channel(g,dg,a,da);         \
        b = name##_channel(b,db,a,da);         \
        a = name##_channel(a,da,a,da);         \
    }                                          \
    SI F name##_channel(F s, F d, F sa, F da)
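
// Reading the macro through one of its users below (an illustration; srcover is defined shortly):
// BLEND_MODE(srcover) { return mad(d, inv(sa), s); } declares srcover_channel(), defines a srcover
// STAGE that applies it to r/g/b/a against dr/dg/db/da, and the braces following the macro become
// srcover_channel's body, i.e. per channel s + d*(1 - sa).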
2165
inv(F x)2166 SI F inv(F x) { return 1.0f - x; }
two(F x)2167 SI F two(F x) { return x + x; }
2168
BLEND_MODE(clear)2169 BLEND_MODE(clear) { return F0; }
BLEND_MODE(srcatop)2170 BLEND_MODE(srcatop) { return mad(s, da, d*inv(sa)); }
BLEND_MODE(dstatop)2171 BLEND_MODE(dstatop) { return mad(d, sa, s*inv(da)); }
BLEND_MODE(srcin)2172 BLEND_MODE(srcin) { return s * da; }
BLEND_MODE(dstin)2173 BLEND_MODE(dstin) { return d * sa; }
BLEND_MODE(srcout)2174 BLEND_MODE(srcout) { return s * inv(da); }
BLEND_MODE(dstout)2175 BLEND_MODE(dstout) { return d * inv(sa); }
BLEND_MODE(srcover)2176 BLEND_MODE(srcover) { return mad(d, inv(sa), s); }
BLEND_MODE(dstover)2177 BLEND_MODE(dstover) { return mad(s, inv(da), d); }
2178
BLEND_MODE(modulate)2179 BLEND_MODE(modulate) { return s*d; }
BLEND_MODE(multiply)2180 BLEND_MODE(multiply) { return mad(s, d, mad(s, inv(da), d*inv(sa))); }
BLEND_MODE(plus_)2181 BLEND_MODE(plus_) { return min(s + d, 1.0f); } // We can clamp to either 1 or sa.
BLEND_MODE(screen)2182 BLEND_MODE(screen) { return nmad(s, d, s + d); }
BLEND_MODE(xor_)2183 BLEND_MODE(xor_) { return mad(s, inv(da), d*inv(sa)); }
2184 #undef BLEND_MODE
2185
2186 // Most other blend modes apply the same logic to colors, and srcover to alpha.
2187 #define BLEND_MODE(name) \
2188 SI F name##_channel(F s, F d, F sa, F da); \
2189 STAGE(name, NoCtx) { \
2190 r = name##_channel(r,dr,a,da); \
2191 g = name##_channel(g,dg,a,da); \
2192 b = name##_channel(b,db,a,da); \
2193 a = mad(da, inv(a), a); \
2194 } \
2195 SI F name##_channel(F s, F d, F sa, F da)
2196
BLEND_MODE(darken)2197 BLEND_MODE(darken) { return s + d - max(s*da, d*sa) ; }
BLEND_MODE(lighten)2198 BLEND_MODE(lighten) { return s + d - min(s*da, d*sa) ; }
BLEND_MODE(difference)2199 BLEND_MODE(difference) { return s + d - two(min(s*da, d*sa)); }
BLEND_MODE(exclusion)2200 BLEND_MODE(exclusion) { return s + d - two(s*d); }
2201
BLEND_MODE(colorburn)2202 BLEND_MODE(colorburn) {
2203 return if_then_else(d == da, d + s*inv(da),
2204 if_then_else(s == 0, /* s + */ d*inv(sa),
2205 sa*(da - min(da, (da-d)*sa*rcp_fast(s))) + s*inv(da) + d*inv(sa)));
2206 }
BLEND_MODE(colordodge)2207 BLEND_MODE(colordodge) {
2208 return if_then_else(d == 0, /* d + */ s*inv(da),
2209 if_then_else(s == sa, s + d*inv(sa),
2210 sa*min(da, (d*sa)*rcp_fast(sa - s)) + s*inv(da) + d*inv(sa)));
2211 }
BLEND_MODE(hardlight)2212 BLEND_MODE(hardlight) {
2213 return s*inv(da) + d*inv(sa)
2214 + if_then_else(two(s) <= sa, two(s*d), sa*da - two((da-d)*(sa-s)));
2215 }
BLEND_MODE(overlay)2216 BLEND_MODE(overlay) {
2217 return s*inv(da) + d*inv(sa)
2218 + if_then_else(two(d) <= da, two(s*d), sa*da - two((da-d)*(sa-s)));
2219 }
2220
BLEND_MODE(softlight)2221 BLEND_MODE(softlight) {
2222 F m = if_then_else(da > 0, d / da, 0.0f),
2223 s2 = two(s),
2224 m4 = two(two(m));
2225
2226 // The logic forks three ways:
2227 // 1. dark src?
2228 // 2. light src, dark dst?
2229 // 3. light src, light dst?
2230 F darkSrc = d*(sa + (s2 - sa)*(1.0f - m)), // Used in case 1.
2231 darkDst = (m4*m4 + m4)*(m - 1.0f) + 7.0f*m, // Used in case 2.
2232 liteDst = sqrt_(m) - m,
2233 liteSrc = d*sa + da*(s2 - sa) * if_then_else(two(two(d)) <= da, darkDst, liteDst); // 2 or 3?
2234 return s*inv(da) + d*inv(sa) + if_then_else(s2 <= sa, darkSrc, liteSrc); // 1 or (2 or 3)?
2235 }
2236 #undef BLEND_MODE
2237
// We're basing our implementation of non-separable blend modes on
//   https://www.w3.org/TR/compositing-1/#blendingnonseparable
// and
//   https://www.khronos.org/registry/OpenGL/specs/es/3.2/es_spec_3.2.pdf
// They're equivalent, but ES' math has been better simplified.
//
// Anything extra we add beyond that is to make the math work with premul inputs.
2245
sat(F r,F g,F b)2246 SI F sat(F r, F g, F b) { return max(r, max(g,b)) - min(r, min(g,b)); }
lum(F r,F g,F b)2247 SI F lum(F r, F g, F b) { return mad(r, 0.30f, mad(g, 0.59f, b*0.11f)); }
2248
set_sat(F * r,F * g,F * b,F s)2249 SI void set_sat(F* r, F* g, F* b, F s) {
2250 F mn = min(*r, min(*g,*b)),
2251 mx = max(*r, max(*g,*b)),
2252 sat = mx - mn;
2253
2254 // Map min channel to 0, max channel to s, and scale the middle proportionally.
2255 s = if_then_else(sat == 0.0f, 0.0f, s * rcp_fast(sat));
2256 *r = (*r - mn) * s;
2257 *g = (*g - mn) * s;
2258 *b = (*b - mn) * s;
2259 }
set_lum(F * r,F * g,F * b,F l)2260 SI void set_lum(F* r, F* g, F* b, F l) {
2261 F diff = l - lum(*r, *g, *b);
2262 *r += diff;
2263 *g += diff;
2264 *b += diff;
2265 }
clip_channel(F c,F l,I32 clip_low,I32 clip_high,F mn_scale,F mx_scale)2266 SI F clip_channel(F c, F l, I32 clip_low, I32 clip_high, F mn_scale, F mx_scale) {
2267 c = if_then_else(clip_low, mad(mn_scale, c - l, l), c);
2268 c = if_then_else(clip_high, mad(mx_scale, c - l, l), c);
2269 c = max(c, 0.0f); // Sometimes without this we may dip just a little negative.
2270 return c;
2271 }
clip_color(F * r,F * g,F * b,F a)2272 SI void clip_color(F* r, F* g, F* b, F a) {
2273 F mn = min(*r, min(*g, *b)),
2274 mx = max(*r, max(*g, *b)),
2275 l = lum(*r, *g, *b),
2276 mn_scale = ( l) * rcp_fast(l - mn),
2277 mx_scale = (a - l) * rcp_fast(mx - l);
2278 I32 clip_low = cond_to_mask(mn < 0 && l != mn),
2279 clip_high = cond_to_mask(mx > a && l != mx);
2280
2281 *r = clip_channel(*r, l, clip_low, clip_high, mn_scale, mx_scale);
2282 *g = clip_channel(*g, l, clip_low, clip_high, mn_scale, mx_scale);
2283 *b = clip_channel(*b, l, clip_low, clip_high, mn_scale, mx_scale);
2284 }
2285
STAGE(hue,NoCtx)2286 STAGE(hue, NoCtx) {
2287 F R = r*a,
2288 G = g*a,
2289 B = b*a;
2290
2291 set_sat(&R, &G, &B, sat(dr,dg,db)*a);
2292 set_lum(&R, &G, &B, lum(dr,dg,db)*a);
2293 clip_color(&R,&G,&B, a*da);
2294
2295 r = mad(r, inv(da), mad(dr, inv(a), R));
2296 g = mad(g, inv(da), mad(dg, inv(a), G));
2297 b = mad(b, inv(da), mad(db, inv(a), B));
2298 a = a + nmad(a, da, da);
2299 }
STAGE(saturation,NoCtx)2300 STAGE(saturation, NoCtx) {
2301 F R = dr*a,
2302 G = dg*a,
2303 B = db*a;
2304
2305 set_sat(&R, &G, &B, sat( r, g, b)*da);
2306 set_lum(&R, &G, &B, lum(dr,dg,db)* a); // (This is not redundant.)
2307 clip_color(&R,&G,&B, a*da);
2308
2309 r = mad(r, inv(da), mad(dr, inv(a), R));
2310 g = mad(g, inv(da), mad(dg, inv(a), G));
2311 b = mad(b, inv(da), mad(db, inv(a), B));
2312 a = a + nmad(a, da, da);
2313 }
STAGE(color,NoCtx)2314 STAGE(color, NoCtx) {
2315 F R = r*da,
2316 G = g*da,
2317 B = b*da;
2318
2319 set_lum(&R, &G, &B, lum(dr,dg,db)*a);
2320 clip_color(&R,&G,&B, a*da);
2321
2322 r = mad(r, inv(da), mad(dr, inv(a), R));
2323 g = mad(g, inv(da), mad(dg, inv(a), G));
2324 b = mad(b, inv(da), mad(db, inv(a), B));
2325 a = a + nmad(a, da, da);
2326 }
STAGE(luminosity,NoCtx)2327 STAGE(luminosity, NoCtx) {
2328 F R = dr*a,
2329 G = dg*a,
2330 B = db*a;
2331
2332 set_lum(&R, &G, &B, lum(r,g,b)*da);
2333 clip_color(&R,&G,&B, a*da);
2334
2335 r = mad(r, inv(da), mad(dr, inv(a), R));
2336 g = mad(g, inv(da), mad(dg, inv(a), G));
2337 b = mad(b, inv(da), mad(db, inv(a), B));
2338 a = a + nmad(a, da, da);
2339 }
2340
STAGE(srcover_rgba_8888,const SkRasterPipeline_MemoryCtx * ctx)2341 STAGE(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) {
2342 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2343
2344 U32 dst = load<U32>(ptr);
2345 dr = cast((dst ) & 0xff);
2346 dg = cast((dst >> 8) & 0xff);
2347 db = cast((dst >> 16) & 0xff);
2348 da = cast((dst >> 24) );
2349 // {dr,dg,db,da} are in [0,255]
2350 // { r, g, b, a} are in [0, 1] (but may be out of gamut)
2351
2352 r = mad(dr, inv(a), r*255.0f);
2353 g = mad(dg, inv(a), g*255.0f);
2354 b = mad(db, inv(a), b*255.0f);
2355 a = mad(da, inv(a), a*255.0f);
2356 // { r, g, b, a} are now in [0,255] (but may be out of gamut)
2357
2358 // to_unorm() clamps back to gamut. Scaling by 1 since we're already 255-biased.
2359 dst = to_unorm(r, 1, 255)
2360 | to_unorm(g, 1, 255) << 8
2361 | to_unorm(b, 1, 255) << 16
2362 | to_unorm(a, 1, 255) << 24;
2363 store(ptr, dst);
2364 }
2365
clamp_01_(F v)2366 SI F clamp_01_(F v) { return min(max(0.0f, v), 1.0f); }
2367
STAGE(clamp_01,NoCtx)2368 STAGE(clamp_01, NoCtx) {
2369 r = clamp_01_(r);
2370 g = clamp_01_(g);
2371 b = clamp_01_(b);
2372 a = clamp_01_(a);
2373 }
2374
STAGE(clamp_a_01,NoCtx)2375 STAGE(clamp_a_01, NoCtx) {
2376 a = clamp_01_(a);
2377 }
2378
STAGE(clamp_gamut,NoCtx)2379 STAGE(clamp_gamut, NoCtx) {
2380 a = min(max(a, 0.0f), 1.0f);
2381 r = min(max(r, 0.0f), a);
2382 g = min(max(g, 0.0f), a);
2383 b = min(max(b, 0.0f), a);
2384 }
2385
STAGE(set_rgb,const float * rgb)2386 STAGE(set_rgb, const float* rgb) {
2387 r = F_(rgb[0]);
2388 g = F_(rgb[1]);
2389 b = F_(rgb[2]);
2390 }
2391
STAGE(unbounded_set_rgb,const float * rgb)2392 STAGE(unbounded_set_rgb, const float* rgb) {
2393 r = F_(rgb[0]);
2394 g = F_(rgb[1]);
2395 b = F_(rgb[2]);
2396 }
2397
STAGE(swap_rb,NoCtx)2398 STAGE(swap_rb, NoCtx) {
2399 auto tmp = r;
2400 r = b;
2401 b = tmp;
2402 }
STAGE(swap_rb_dst,NoCtx)2403 STAGE(swap_rb_dst, NoCtx) {
2404 auto tmp = dr;
2405 dr = db;
2406 db = tmp;
2407 }
2408
STAGE(move_src_dst,NoCtx)2409 STAGE(move_src_dst, NoCtx) {
2410 dr = r;
2411 dg = g;
2412 db = b;
2413 da = a;
2414 }
STAGE(move_dst_src,NoCtx)2415 STAGE(move_dst_src, NoCtx) {
2416 r = dr;
2417 g = dg;
2418 b = db;
2419 a = da;
2420 }
STAGE(swap_src_dst,NoCtx)2421 STAGE(swap_src_dst, NoCtx) {
2422 std::swap(r, dr);
2423 std::swap(g, dg);
2424 std::swap(b, db);
2425 std::swap(a, da);
2426 }
2427
STAGE(premul,NoCtx)2428 STAGE(premul, NoCtx) {
2429 r = r * a;
2430 g = g * a;
2431 b = b * a;
2432 }
STAGE(premul_dst,NoCtx)2433 STAGE(premul_dst, NoCtx) {
2434 dr = dr * da;
2435 dg = dg * da;
2436 db = db * da;
2437 }
STAGE(unpremul, NoCtx) {
    float inf = sk_bit_cast<float>(0x7f800000);
    auto scale = if_then_else(1.0f/a < inf, 1.0f/a, 0.0f);
    r *= scale;
    g *= scale;
    b *= scale;
}
STAGE(unpremul_polar, NoCtx) {
    float inf = sk_bit_cast<float>(0x7f800000);
    auto scale = if_then_else(1.0f/a < inf, 1.0f/a, 0.0f);
    g *= scale;
    b *= scale;
}
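
// A note on the guard used by both stages above: when a == 0 the reciprocal 1.0f/a is +inf, the
// comparison 1.0f/a < inf fails, and scale falls back to 0 instead of poisoning r/g/b with inf or
// NaN. For an ordinary alpha (say a = 0.5f, so 1/a = 2.0f) the reciprocal passes the check and
// unpremultiplication proceeds normally.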
2451
STAGE(force_opaque,NoCtx)2452 STAGE(force_opaque , NoCtx) { a = F1; }
STAGE(force_opaque_dst,NoCtx)2453 STAGE(force_opaque_dst, NoCtx) { da = F1; }
2454
STAGE(rgb_to_hsl,NoCtx)2455 STAGE(rgb_to_hsl, NoCtx) {
2456 F mx = max(r, max(g,b)),
2457 mn = min(r, min(g,b)),
2458 d = mx - mn,
2459 d_rcp = 1.0f / d;
2460
2461 F h = (1/6.0f) *
2462 if_then_else(mx == mn, 0.0f,
2463 if_then_else(mx == r, (g-b)*d_rcp + if_then_else(g < b, 6.0f, 0.0f),
2464 if_then_else(mx == g, (b-r)*d_rcp + 2.0f,
2465 (r-g)*d_rcp + 4.0f)));
2466
2467 F l = (mx + mn) * 0.5f;
2468 F s = if_then_else(mx == mn, 0.0f,
2469 d / if_then_else(l > 0.5f, 2.0f-mx-mn, mx+mn));
2470
2471 r = h;
2472 g = s;
2473 b = l;
2474 }
STAGE(hsl_to_rgb,NoCtx)2475 STAGE(hsl_to_rgb, NoCtx) {
2476 // See GrRGBToHSLFilterEffect.fp
2477
2478 F h = r,
2479 s = g,
2480 l = b,
2481 c = (1.0f - abs_(2.0f * l - 1)) * s;
2482
2483 auto hue_to_rgb = [&](F hue) {
2484 F q = clamp_01_(abs_(fract(hue) * 6.0f - 3.0f) - 1.0f);
2485 return (q - 0.5f) * c + l;
2486 };
2487
2488 r = hue_to_rgb(h + 0.0f/3.0f);
2489 g = hue_to_rgb(h + 2.0f/3.0f);
2490 b = hue_to_rgb(h + 1.0f/3.0f);
2491 }
2492
2493 // Color conversion functions used in gradient interpolation, based on
2494 // https://www.w3.org/TR/css-color-4/#color-conversion-code
STAGE(css_lab_to_xyz,NoCtx)2495 STAGE(css_lab_to_xyz, NoCtx) {
2496 constexpr float k = 24389 / 27.0f;
2497 constexpr float e = 216 / 24389.0f;
2498
2499 F f[3];
2500 f[1] = (r + 16) * (1 / 116.0f);
2501 f[0] = (g * (1 / 500.0f)) + f[1];
2502 f[2] = f[1] - (b * (1 / 200.0f));
2503
2504 F f_cubed[3] = { f[0]*f[0]*f[0], f[1]*f[1]*f[1], f[2]*f[2]*f[2] };
2505
2506 F xyz[3] = {
2507 if_then_else(f_cubed[0] > e, f_cubed[0], (116 * f[0] - 16) * (1 / k)),
2508 if_then_else(r > k * e, f_cubed[1], r * (1 / k)),
2509 if_then_else(f_cubed[2] > e, f_cubed[2], (116 * f[2] - 16) * (1 / k))
2510 };
2511
2512 constexpr float D50[3] = { 0.3457f / 0.3585f, 1.0f, (1.0f - 0.3457f - 0.3585f) / 0.3585f };
2513 r = xyz[0]*D50[0];
2514 g = xyz[1]*D50[1];
2515 b = xyz[2]*D50[2];
2516 }
2517
STAGE(css_oklab_to_linear_srgb,NoCtx)2518 STAGE(css_oklab_to_linear_srgb, NoCtx) {
2519 F l_ = r + 0.3963377774f * g + 0.2158037573f * b,
2520 m_ = r - 0.1055613458f * g - 0.0638541728f * b,
2521 s_ = r - 0.0894841775f * g - 1.2914855480f * b;
2522
2523 F l = l_*l_*l_,
2524 m = m_*m_*m_,
2525 s = s_*s_*s_;
2526
2527 r = +4.0767416621f * l - 3.3077115913f * m + 0.2309699292f * s;
2528 g = -1.2684380046f * l + 2.6097574011f * m - 0.3413193965f * s;
2529 b = -0.0041960863f * l - 0.7034186147f * m + 1.7076147010f * s;
2530 }
2531
STAGE(css_oklab_gamut_map_to_linear_srgb,NoCtx)2532 STAGE(css_oklab_gamut_map_to_linear_srgb, NoCtx) {
2533 // TODO(https://crbug.com/1508329): Add support for gamut mapping.
2534 // Return a greyscale value, so that accidental use is obvious.
2535 F l_ = r,
2536 m_ = r,
2537 s_ = r;
2538
2539 F l = l_*l_*l_,
2540 m = m_*m_*m_,
2541 s = s_*s_*s_;
2542
2543 r = +4.0767416621f * l - 3.3077115913f * m + 0.2309699292f * s;
2544 g = -1.2684380046f * l + 2.6097574011f * m - 0.3413193965f * s;
2545 b = -0.0041960863f * l - 0.7034186147f * m + 1.7076147010f * s;
2546 }
2547
2548 // Skia stores all polar colors with hue in the first component, so this "LCH -> Lab" transform
2549 // actually takes "HCL". This is also used to do the same polar transform for OkHCL to OkLAB.
2550 // See similar comments & logic in SkGradientBaseShader.cpp.
STAGE(css_hcl_to_lab,NoCtx)2551 STAGE(css_hcl_to_lab, NoCtx) {
2552 F H = r,
2553 C = g,
2554 L = b;
2555
2556 F hueRadians = H * (SK_FloatPI / 180);
2557
2558 r = L;
2559 g = C * cos_(hueRadians);
2560 b = C * sin_(hueRadians);
2561 }
2562
mod_(F x,float y)2563 SI F mod_(F x, float y) {
2564 return nmad(y, floor_(x * (1 / y)), x);
2565 }
2566
2567 struct RGB { F r, g, b; };
2568
css_hsl_to_srgb_(F h,F s,F l)2569 SI RGB css_hsl_to_srgb_(F h, F s, F l) {
2570 h = mod_(h, 360);
2571
2572 s *= 0.01f;
2573 l *= 0.01f;
2574
2575 F k[3] = {
2576 mod_(0 + h * (1 / 30.0f), 12),
2577 mod_(8 + h * (1 / 30.0f), 12),
2578 mod_(4 + h * (1 / 30.0f), 12)
2579 };
2580 F a = s * min(l, 1 - l);
2581 return {
2582 l - a * max(-1.0f, min(min(k[0] - 3.0f, 9.0f - k[0]), 1.0f)),
2583 l - a * max(-1.0f, min(min(k[1] - 3.0f, 9.0f - k[1]), 1.0f)),
2584 l - a * max(-1.0f, min(min(k[2] - 3.0f, 9.0f - k[2]), 1.0f))
2585 };
2586 }
2587
STAGE(css_hsl_to_srgb,NoCtx)2588 STAGE(css_hsl_to_srgb, NoCtx) {
2589 RGB rgb = css_hsl_to_srgb_(r, g, b);
2590 r = rgb.r;
2591 g = rgb.g;
2592 b = rgb.b;
2593 }
2594
STAGE(css_hwb_to_srgb,NoCtx)2595 STAGE(css_hwb_to_srgb, NoCtx) {
2596 g *= 0.01f;
2597 b *= 0.01f;
2598
2599 F gray = g / (g + b);
2600
2601 RGB rgb = css_hsl_to_srgb_(r, F_(100.0f), F_(50.0f));
2602 rgb.r = rgb.r * (1 - g - b) + g;
2603 rgb.g = rgb.g * (1 - g - b) + g;
2604 rgb.b = rgb.b * (1 - g - b) + g;
2605
2606 auto isGray = (g + b) >= 1;
2607
2608 r = if_then_else(isGray, gray, rgb.r);
2609 g = if_then_else(isGray, gray, rgb.g);
2610 b = if_then_else(isGray, gray, rgb.b);
2611 }
2612
2613 // Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
alpha_coverage_from_rgb_coverage(F a,F da,F cr,F cg,F cb)2614 SI F alpha_coverage_from_rgb_coverage(F a, F da, F cr, F cg, F cb) {
2615 return if_then_else(a < da, min(cr, min(cg,cb))
2616 , max(cr, max(cg,cb)));
2617 }
2618
STAGE(scale_1_float,const float * c)2619 STAGE(scale_1_float, const float* c) {
2620 r = r * *c;
2621 g = g * *c;
2622 b = b * *c;
2623 a = a * *c;
2624 }
STAGE(scale_u8,const SkRasterPipeline_MemoryCtx * ctx)2625 STAGE(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) {
2626 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
2627
2628 auto scales = load<U8>(ptr);
2629 auto c = from_byte(scales);
2630
2631 r = r * c;
2632 g = g * c;
2633 b = b * c;
2634 a = a * c;
2635 }
STAGE(scale_565,const SkRasterPipeline_MemoryCtx * ctx)2636 STAGE(scale_565, const SkRasterPipeline_MemoryCtx* ctx) {
2637 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2638
2639 F cr,cg,cb;
2640 from_565(load<U16>(ptr), &cr, &cg, &cb);
2641
2642 F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
2643
2644 r = r * cr;
2645 g = g * cg;
2646 b = b * cb;
2647 a = a * ca;
2648 }
2649
lerp(F from,F to,F t)2650 SI F lerp(F from, F to, F t) {
2651 return mad(to-from, t, from);
2652 }
2653
STAGE(lerp_1_float,const float * c)2654 STAGE(lerp_1_float, const float* c) {
2655 r = lerp(dr, r, F_(*c));
2656 g = lerp(dg, g, F_(*c));
2657 b = lerp(db, b, F_(*c));
2658 a = lerp(da, a, F_(*c));
2659 }
STAGE(scale_native,const float scales[])2660 STAGE(scale_native, const float scales[]) {
2661 auto c = sk_unaligned_load<F>(scales);
2662 r = r * c;
2663 g = g * c;
2664 b = b * c;
2665 a = a * c;
2666 }
STAGE(lerp_native,const float scales[])2667 STAGE(lerp_native, const float scales[]) {
2668 auto c = sk_unaligned_load<F>(scales);
2669 r = lerp(dr, r, c);
2670 g = lerp(dg, g, c);
2671 b = lerp(db, b, c);
2672 a = lerp(da, a, c);
2673 }
STAGE(lerp_u8,const SkRasterPipeline_MemoryCtx * ctx)2674 STAGE(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) {
2675 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
2676
2677 auto scales = load<U8>(ptr);
2678 auto c = from_byte(scales);
2679
2680 r = lerp(dr, r, c);
2681 g = lerp(dg, g, c);
2682 b = lerp(db, b, c);
2683 a = lerp(da, a, c);
2684 }
STAGE(lerp_565,const SkRasterPipeline_MemoryCtx * ctx)2685 STAGE(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) {
2686 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2687
2688 F cr,cg,cb;
2689 from_565(load<U16>(ptr), &cr, &cg, &cb);
2690
2691 F ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
2692
2693 r = lerp(dr, r, cr);
2694 g = lerp(dg, g, cg);
2695 b = lerp(db, b, cb);
2696 a = lerp(da, a, ca);
2697 }
2698
STAGE(emboss,const SkRasterPipeline_EmbossCtx * ctx)2699 STAGE(emboss, const SkRasterPipeline_EmbossCtx* ctx) {
2700 auto mptr = ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy),
2701 aptr = ptr_at_xy<const uint8_t>(&ctx->add, dx,dy);
2702
2703 F mul = from_byte(load<U8>(mptr)),
2704 add = from_byte(load<U8>(aptr));
2705
2706 r = mad(r, mul, add);
2707 g = mad(g, mul, add);
2708 b = mad(b, mul, add);
2709 }
2710
STAGE(byte_tables,const SkRasterPipeline_TablesCtx * tables)2711 STAGE(byte_tables, const SkRasterPipeline_TablesCtx* tables) {
2712 r = from_byte(gather(tables->r, to_unorm(r, 255)));
2713 g = from_byte(gather(tables->g, to_unorm(g, 255)));
2714 b = from_byte(gather(tables->b, to_unorm(b, 255)));
2715 a = from_byte(gather(tables->a, to_unorm(a, 255)));
2716 }
2717
strip_sign(F x,U32 * sign)2718 SI F strip_sign(F x, U32* sign) {
2719 U32 bits = sk_bit_cast<U32>(x);
2720 *sign = bits & 0x80000000;
2721 return sk_bit_cast<F>(bits ^ *sign);
2722 }
2723
apply_sign(F x,U32 sign)2724 SI F apply_sign(F x, U32 sign) {
2725 return sk_bit_cast<F>(sign | sk_bit_cast<U32>(x));
2726 }
2727
STAGE(parametric,const skcms_TransferFunction * ctx)2728 STAGE(parametric, const skcms_TransferFunction* ctx) {
2729 auto fn = [&](F v) {
2730 U32 sign;
2731 v = strip_sign(v, &sign);
2732
2733 F r = if_then_else(v <= ctx->d, mad(ctx->c, v, ctx->f)
2734 , approx_powf(mad(ctx->a, v, ctx->b), ctx->g) + ctx->e);
2735 return apply_sign(r, sign);
2736 };
2737 r = fn(r);
2738 g = fn(g);
2739 b = fn(b);
2740 }
2741
STAGE(gamma_,const float * G)2742 STAGE(gamma_, const float* G) {
2743 auto fn = [&](F v) {
2744 U32 sign;
2745 v = strip_sign(v, &sign);
2746 return apply_sign(approx_powf(v, *G), sign);
2747 };
2748 r = fn(r);
2749 g = fn(g);
2750 b = fn(b);
2751 }
2752
STAGE(PQish,const skcms_TransferFunction * ctx)2753 STAGE(PQish, const skcms_TransferFunction* ctx) {
2754 auto fn = [&](F v) {
2755 U32 sign;
2756 v = strip_sign(v, &sign);
2757
2758 F r = approx_powf(max(mad(ctx->b, approx_powf(v, ctx->c), ctx->a), 0.0f)
2759 / (mad(ctx->e, approx_powf(v, ctx->c), ctx->d)),
2760 ctx->f);
2761
2762 return apply_sign(r, sign);
2763 };
2764 r = fn(r);
2765 g = fn(g);
2766 b = fn(b);
2767 }
2768
STAGE(HLGish,const skcms_TransferFunction * ctx)2769 STAGE(HLGish, const skcms_TransferFunction* ctx) {
2770 auto fn = [&](F v) {
2771 U32 sign;
2772 v = strip_sign(v, &sign);
2773
2774 const float R = ctx->a, G = ctx->b,
2775 a = ctx->c, b = ctx->d, c = ctx->e,
2776 K = ctx->f + 1.0f;
2777
2778 F r = if_then_else(v*R <= 1, approx_powf(v*R, G)
2779 , approx_exp((v-c)*a) + b);
2780
2781 return K * apply_sign(r, sign);
2782 };
2783 r = fn(r);
2784 g = fn(g);
2785 b = fn(b);
2786 }
2787
STAGE(HLGinvish,const skcms_TransferFunction * ctx)2788 STAGE(HLGinvish, const skcms_TransferFunction* ctx) {
2789 auto fn = [&](F v) {
2790 U32 sign;
2791 v = strip_sign(v, &sign);
2792
2793 const float R = ctx->a, G = ctx->b,
2794 a = ctx->c, b = ctx->d, c = ctx->e,
2795 K = ctx->f + 1.0f;
2796
2797 v /= K;
2798 F r = if_then_else(v <= 1, R * approx_powf(v, G)
2799 , a * approx_log(v - b) + c);
2800
2801 return apply_sign(r, sign);
2802 };
2803 r = fn(r);
2804 g = fn(g);
2805 b = fn(b);
2806 }
2807
STAGE(load_a8,const SkRasterPipeline_MemoryCtx * ctx)2808 STAGE(load_a8, const SkRasterPipeline_MemoryCtx* ctx) {
2809 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
2810
2811 r = g = b = F0;
2812 a = from_byte(load<U8>(ptr));
2813 }
STAGE(load_a8_dst,const SkRasterPipeline_MemoryCtx * ctx)2814 STAGE(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2815 auto ptr = ptr_at_xy<const uint8_t>(ctx, dx,dy);
2816
2817 dr = dg = db = F0;
2818 da = from_byte(load<U8>(ptr));
2819 }
STAGE(gather_a8,const SkRasterPipeline_GatherCtx * ctx)2820 STAGE(gather_a8, const SkRasterPipeline_GatherCtx* ctx) {
2821 const uint8_t* ptr;
2822 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2823 r = g = b = F0;
2824 a = from_byte(gather(ptr, ix));
2825 }
STAGE(store_a8,const SkRasterPipeline_MemoryCtx * ctx)2826 STAGE(store_a8, const SkRasterPipeline_MemoryCtx* ctx) {
2827 auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy);
2828
2829 U8 packed = pack(pack(to_unorm(a, 255)));
2830 store(ptr, packed);
2831 }
STAGE(store_r8,const SkRasterPipeline_MemoryCtx * ctx)2832 STAGE(store_r8, const SkRasterPipeline_MemoryCtx* ctx) {
2833 auto ptr = ptr_at_xy<uint8_t>(ctx, dx,dy);
2834
2835 U8 packed = pack(pack(to_unorm(r, 255)));
2836 store(ptr, packed);
2837 }
2838
STAGE(load_565,const SkRasterPipeline_MemoryCtx * ctx)2839 STAGE(load_565, const SkRasterPipeline_MemoryCtx* ctx) {
2840 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2841
2842 from_565(load<U16>(ptr), &r,&g,&b);
2843 a = F1;
2844 }
STAGE(load_565_dst,const SkRasterPipeline_MemoryCtx * ctx)2845 STAGE(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2846 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2847
2848 from_565(load<U16>(ptr), &dr,&dg,&db);
2849 da = F1;
2850 }
STAGE(gather_565,const SkRasterPipeline_GatherCtx * ctx)2851 STAGE(gather_565, const SkRasterPipeline_GatherCtx* ctx) {
2852 const uint16_t* ptr;
2853 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2854 from_565(gather(ptr, ix), &r,&g,&b);
2855 a = F1;
2856 }
STAGE(store_565,const SkRasterPipeline_MemoryCtx * ctx)2857 STAGE(store_565, const SkRasterPipeline_MemoryCtx* ctx) {
2858 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2859
2860 U16 px = pack( to_unorm(r, 31) << 11
2861 | to_unorm(g, 63) << 5
2862 | to_unorm(b, 31) );
2863 store(ptr, px);
2864 }
2865
STAGE(load_4444,const SkRasterPipeline_MemoryCtx * ctx)2866 STAGE(load_4444, const SkRasterPipeline_MemoryCtx* ctx) {
2867 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2868 from_4444(load<U16>(ptr), &r,&g,&b,&a);
2869 }
STAGE(load_4444_dst,const SkRasterPipeline_MemoryCtx * ctx)2870 STAGE(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2871 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2872 from_4444(load<U16>(ptr), &dr,&dg,&db,&da);
2873 }
STAGE(gather_4444,const SkRasterPipeline_GatherCtx * ctx)2874 STAGE(gather_4444, const SkRasterPipeline_GatherCtx* ctx) {
2875 const uint16_t* ptr;
2876 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2877 from_4444(gather(ptr, ix), &r,&g,&b,&a);
2878 }
STAGE(store_4444,const SkRasterPipeline_MemoryCtx * ctx)2879 STAGE(store_4444, const SkRasterPipeline_MemoryCtx* ctx) {
2880 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2881 U16 px = pack( to_unorm(r, 15) << 12
2882 | to_unorm(g, 15) << 8
2883 | to_unorm(b, 15) << 4
2884 | to_unorm(a, 15) );
2885 store(ptr, px);
2886 }
2887
STAGE(load_8888,const SkRasterPipeline_MemoryCtx * ctx)2888 STAGE(load_8888, const SkRasterPipeline_MemoryCtx* ctx) {
2889 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2890 from_8888(load<U32>(ptr), &r,&g,&b,&a);
2891 }
STAGE(load_8888_dst,const SkRasterPipeline_MemoryCtx * ctx)2892 STAGE(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2893 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
2894 from_8888(load<U32>(ptr), &dr,&dg,&db,&da);
2895 }
STAGE(gather_8888,const SkRasterPipeline_GatherCtx * ctx)2896 STAGE(gather_8888, const SkRasterPipeline_GatherCtx* ctx) {
2897 const uint32_t* ptr;
2898 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
2899 from_8888(gather(ptr, ix), &r,&g,&b,&a);
2900 }
STAGE(store_8888,const SkRasterPipeline_MemoryCtx * ctx)2901 STAGE(store_8888, const SkRasterPipeline_MemoryCtx* ctx) {
2902 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2903
2904 U32 px = to_unorm(r, 255)
2905 | to_unorm(g, 255) << 8
2906 | to_unorm(b, 255) << 16
2907 | to_unorm(a, 255) << 24;
2908 store(ptr, px);
2909 }
2910
STAGE(load_rg88,const SkRasterPipeline_MemoryCtx * ctx)2911 STAGE(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
2912 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2913 from_88(load<U16>(ptr), &r, &g);
2914 b = F0;
2915 a = F1;
2916 }
STAGE(load_rg88_dst,const SkRasterPipeline_MemoryCtx * ctx)2917 STAGE(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2918 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2919 from_88(load<U16>(ptr), &dr, &dg);
2920 db = F0;
2921 da = F1;
2922 }
STAGE(gather_rg88,const SkRasterPipeline_GatherCtx * ctx)2923 STAGE(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) {
2924 const uint16_t* ptr;
2925 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2926 from_88(gather(ptr, ix), &r, &g);
2927 b = F0;
2928 a = F1;
2929 }
STAGE(store_rg88,const SkRasterPipeline_MemoryCtx * ctx)2930 STAGE(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
2931 auto ptr = ptr_at_xy<uint16_t>(ctx, dx, dy);
2932 U16 px = pack( to_unorm(r, 255) | to_unorm(g, 255) << 8 );
2933 store(ptr, px);
2934 }
2935
STAGE(load_a16,const SkRasterPipeline_MemoryCtx * ctx)2936 STAGE(load_a16, const SkRasterPipeline_MemoryCtx* ctx) {
2937 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
2938 r = g = b = F0;
2939 a = from_short(load<U16>(ptr));
2940 }
STAGE(load_a16_dst,const SkRasterPipeline_MemoryCtx * ctx)2941 STAGE(load_a16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2942 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
2943 dr = dg = db = F0;
2944 da = from_short(load<U16>(ptr));
2945 }
STAGE(gather_a16,const SkRasterPipeline_GatherCtx * ctx)2946 STAGE(gather_a16, const SkRasterPipeline_GatherCtx* ctx) {
2947 const uint16_t* ptr;
2948 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2949 r = g = b = F0;
2950 a = from_short(gather(ptr, ix));
2951 }
STAGE(store_a16,const SkRasterPipeline_MemoryCtx * ctx)2952 STAGE(store_a16, const SkRasterPipeline_MemoryCtx* ctx) {
2953 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
2954
2955 U16 px = pack(to_unorm(a, 65535));
2956 store(ptr, px);
2957 }
2958
STAGE(load_rg1616,const SkRasterPipeline_MemoryCtx * ctx)2959 STAGE(load_rg1616, const SkRasterPipeline_MemoryCtx* ctx) {
2960 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2961 b = F0;
2962 a = F1;
2963 from_1616(load<U32>(ptr), &r,&g);
2964 }
STAGE(load_rg1616_dst,const SkRasterPipeline_MemoryCtx * ctx)2965 STAGE(load_rg1616_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2966 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
2967 from_1616(load<U32>(ptr), &dr, &dg);
2968 db = F0;
2969 da = F1;
2970 }
STAGE(gather_rg1616,const SkRasterPipeline_GatherCtx * ctx)2971 STAGE(gather_rg1616, const SkRasterPipeline_GatherCtx* ctx) {
2972 const uint32_t* ptr;
2973 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2974 from_1616(gather(ptr, ix), &r, &g);
2975 b = F0;
2976 a = F1;
2977 }
STAGE(store_rg1616,const SkRasterPipeline_MemoryCtx * ctx)2978 STAGE(store_rg1616, const SkRasterPipeline_MemoryCtx* ctx) {
2979 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
2980
2981 U32 px = to_unorm(r, 65535)
2982 | to_unorm(g, 65535) << 16;
2983 store(ptr, px);
2984 }
2985
STAGE(load_16161616,const SkRasterPipeline_MemoryCtx * ctx)2986 STAGE(load_16161616, const SkRasterPipeline_MemoryCtx* ctx) {
2987 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
2988 from_16161616(load<U64>(ptr), &r,&g, &b, &a);
2989 }
STAGE(load_16161616_dst,const SkRasterPipeline_MemoryCtx * ctx)2990 STAGE(load_16161616_dst, const SkRasterPipeline_MemoryCtx* ctx) {
2991 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
2992 from_16161616(load<U64>(ptr), &dr, &dg, &db, &da);
2993 }
STAGE(gather_16161616,const SkRasterPipeline_GatherCtx * ctx)2994 STAGE(gather_16161616, const SkRasterPipeline_GatherCtx* ctx) {
2995 const uint64_t* ptr;
2996 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
2997 from_16161616(gather(ptr, ix), &r, &g, &b, &a);
2998 }
STAGE(store_16161616,const SkRasterPipeline_MemoryCtx * ctx)2999 STAGE(store_16161616, const SkRasterPipeline_MemoryCtx* ctx) {
3000 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy);
3001
3002 U16 R = pack(to_unorm(r, 65535)),
3003 G = pack(to_unorm(g, 65535)),
3004 B = pack(to_unorm(b, 65535)),
3005 A = pack(to_unorm(a, 65535));
3006
3007 store4(ptr, R,G,B,A);
3008 }
3009
STAGE(load_10x6,const SkRasterPipeline_MemoryCtx * ctx)3010 STAGE(load_10x6, const SkRasterPipeline_MemoryCtx* ctx) {
3011 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
3012 from_10x6(load<U64>(ptr), &r,&g, &b, &a);
3013 }
STAGE(load_10x6_dst,const SkRasterPipeline_MemoryCtx * ctx)3014 STAGE(load_10x6_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3015 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
3016 from_10x6(load<U64>(ptr), &dr, &dg, &db, &da);
3017 }
STAGE(gather_10x6,const SkRasterPipeline_GatherCtx * ctx)3018 STAGE(gather_10x6, const SkRasterPipeline_GatherCtx* ctx) {
3019 const uint64_t* ptr;
3020 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
3021 from_10x6(gather(ptr, ix), &r, &g, &b, &a);
3022 }
STAGE(store_10x6,const SkRasterPipeline_MemoryCtx * ctx)3023 STAGE(store_10x6, const SkRasterPipeline_MemoryCtx* ctx) {
3024 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy);
3025
3026 U16 R = pack(to_unorm(r, 1023)) << 6,
3027 G = pack(to_unorm(g, 1023)) << 6,
3028 B = pack(to_unorm(b, 1023)) << 6,
3029 A = pack(to_unorm(a, 1023)) << 6;
3030
3031 store4(ptr, R,G,B,A);
3032 }
3033
3034
STAGE(load_1010102,const SkRasterPipeline_MemoryCtx * ctx)3035 STAGE(load_1010102, const SkRasterPipeline_MemoryCtx* ctx) {
3036 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
3037 from_1010102(load<U32>(ptr), &r,&g,&b,&a);
3038 }
STAGE(load_1010102_dst,const SkRasterPipeline_MemoryCtx * ctx)3039 STAGE(load_1010102_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3040 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
3041 from_1010102(load<U32>(ptr), &dr,&dg,&db,&da);
3042 }
STAGE(load_1010102_xr,const SkRasterPipeline_MemoryCtx * ctx)3043 STAGE(load_1010102_xr, const SkRasterPipeline_MemoryCtx* ctx) {
3044 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
3045 from_1010102_xr(load<U32>(ptr), &r,&g,&b,&a);
3046 }
STAGE(load_1010102_xr_dst,const SkRasterPipeline_MemoryCtx * ctx)3047 STAGE(load_1010102_xr_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3048 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx,dy);
3049 from_1010102_xr(load<U32>(ptr), &dr,&dg,&db,&da);
3050 }
STAGE(gather_1010102,const SkRasterPipeline_GatherCtx * ctx)3051 STAGE(gather_1010102, const SkRasterPipeline_GatherCtx* ctx) {
3052 const uint32_t* ptr;
3053 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
3054 from_1010102(gather(ptr, ix), &r,&g,&b,&a);
3055 }
STAGE(gather_1010102_xr,const SkRasterPipeline_GatherCtx * ctx)3056 STAGE(gather_1010102_xr, const SkRasterPipeline_GatherCtx* ctx) {
3057 const uint32_t* ptr;
3058 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
3059 from_1010102_xr(gather(ptr, ix), &r,&g,&b,&a);
3060 }
STAGE(gather_10101010_xr,const SkRasterPipeline_GatherCtx * ctx)3061 STAGE(gather_10101010_xr, const SkRasterPipeline_GatherCtx* ctx) {
3062 const uint64_t* ptr;
3063 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
3064 from_10101010_xr(gather(ptr, ix), &r, &g, &b, &a);
3065 }
STAGE(load_10101010_xr,const SkRasterPipeline_MemoryCtx * ctx)3066 STAGE(load_10101010_xr, const SkRasterPipeline_MemoryCtx* ctx) {
3067 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
3068 from_10101010_xr(load<U64>(ptr), &r,&g, &b, &a);
3069 }
STAGE(load_10101010_xr_dst,const SkRasterPipeline_MemoryCtx * ctx)3070 STAGE(load_10101010_xr_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3071 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx, dy);
3072 from_10101010_xr(load<U64>(ptr), &dr, &dg, &db, &da);
3073 }
STAGE(store_10101010_xr,const SkRasterPipeline_MemoryCtx * ctx)3074 STAGE(store_10101010_xr, const SkRasterPipeline_MemoryCtx* ctx) {
3075 static constexpr float min = -0.752941f;
3076 static constexpr float max = 1.25098f;
3077 static constexpr float range = max - min;
3078 auto ptr = ptr_at_xy<uint16_t>(ctx, 4*dx,4*dy);
3079
3080 U16 R = pack(to_unorm((r - min) / range, 1023)) << 6,
3081 G = pack(to_unorm((g - min) / range, 1023)) << 6,
3082 B = pack(to_unorm((b - min) / range, 1023)) << 6,
3083 A = pack(to_unorm((a - min) / range, 1023)) << 6;
3084
3085 store4(ptr, R,G,B,A);
3086 }
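// The min/max constants above remap the extended-range (XR) pixel domain [-0.752941, 1.25098]
// onto [0, 1] before the usual 10-bit quantization: min encodes as code 0, max as code 1023,
// and 0.0 lands near code 384, since (0 - min)/range * 1023 ~= 384. store_1010102_xr below
// applies the same remapping to its 10-bit channels.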
STAGE(store_1010102,const SkRasterPipeline_MemoryCtx * ctx)3087 STAGE(store_1010102, const SkRasterPipeline_MemoryCtx* ctx) {
3088 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
3089
3090 U32 px = to_unorm(r, 1023)
3091 | to_unorm(g, 1023) << 10
3092 | to_unorm(b, 1023) << 20
3093 | to_unorm(a, 3) << 30;
3094 store(ptr, px);
3095 }
STAGE(store_1010102_xr,const SkRasterPipeline_MemoryCtx * ctx)3096 STAGE(store_1010102_xr, const SkRasterPipeline_MemoryCtx* ctx) {
3097 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
3098 static constexpr float min = -0.752941f;
3099 static constexpr float max = 1.25098f;
3100 static constexpr float range = max - min;
3101 U32 px = to_unorm((r - min) / range, 1023)
3102 | to_unorm((g - min) / range, 1023) << 10
3103 | to_unorm((b - min) / range, 1023) << 20
3104 | to_unorm(a, 3) << 30;
3105 store(ptr, px);
3106 }
3107
STAGE(load_f16,const SkRasterPipeline_MemoryCtx * ctx)3108 STAGE(load_f16, const SkRasterPipeline_MemoryCtx* ctx) {
3109 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
3110
3111 U16 R,G,B,A;
3112 load4((const uint16_t*)ptr, &R,&G,&B,&A);
3113 r = from_half(R);
3114 g = from_half(G);
3115 b = from_half(B);
3116 a = from_half(A);
3117 }
STAGE(load_f16_dst,const SkRasterPipeline_MemoryCtx * ctx)3118 STAGE(load_f16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3119 auto ptr = ptr_at_xy<const uint64_t>(ctx, dx,dy);
3120
3121 U16 R,G,B,A;
3122 load4((const uint16_t*)ptr, &R,&G,&B,&A);
3123 dr = from_half(R);
3124 dg = from_half(G);
3125 db = from_half(B);
3126 da = from_half(A);
3127 }
STAGE(gather_f16,const SkRasterPipeline_GatherCtx * ctx)3128 STAGE(gather_f16, const SkRasterPipeline_GatherCtx* ctx) {
3129 const uint64_t* ptr;
3130 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
3131 auto px = gather(ptr, ix);
3132
3133 U16 R,G,B,A;
3134 load4((const uint16_t*)&px, &R,&G,&B,&A);
3135 r = from_half(R);
3136 g = from_half(G);
3137 b = from_half(B);
3138 a = from_half(A);
3139 }
STAGE(store_f16,const SkRasterPipeline_MemoryCtx * ctx)3140 STAGE(store_f16, const SkRasterPipeline_MemoryCtx* ctx) {
3141 auto ptr = ptr_at_xy<uint64_t>(ctx, dx,dy);
3142 store4((uint16_t*)ptr, to_half(r)
3143 , to_half(g)
3144 , to_half(b)
3145 , to_half(a));
3146 }
3147
STAGE(load_af16,const SkRasterPipeline_MemoryCtx * ctx)3148 STAGE(load_af16, const SkRasterPipeline_MemoryCtx* ctx) {
3149 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx,dy);
3150
3151 U16 A = load<U16>((const uint16_t*)ptr);
3152 r = F0;
3153 g = F0;
3154 b = F0;
3155 a = from_half(A);
3156 }
STAGE(load_af16_dst,const SkRasterPipeline_MemoryCtx * ctx)3157 STAGE(load_af16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3158 auto ptr = ptr_at_xy<const uint16_t>(ctx, dx, dy);
3159
3160 U16 A = load<U16>((const uint16_t*)ptr);
3161 dr = dg = db = F0;
3162 da = from_half(A);
3163 }
STAGE(gather_af16,const SkRasterPipeline_GatherCtx * ctx)3164 STAGE(gather_af16, const SkRasterPipeline_GatherCtx* ctx) {
3165 const uint16_t* ptr;
3166 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
3167 r = g = b = F0;
3168 a = from_half(gather(ptr, ix));
3169 }
STAGE(store_af16,const SkRasterPipeline_MemoryCtx * ctx)3170 STAGE(store_af16, const SkRasterPipeline_MemoryCtx* ctx) {
3171 auto ptr = ptr_at_xy<uint16_t>(ctx, dx,dy);
3172 store(ptr, to_half(a));
3173 }
3174
STAGE(load_rgf16,const SkRasterPipeline_MemoryCtx * ctx)3175 STAGE(load_rgf16, const SkRasterPipeline_MemoryCtx* ctx) {
3176 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
3177
3178 U16 R,G;
3179 load2((const uint16_t*)ptr, &R, &G);
3180 r = from_half(R);
3181 g = from_half(G);
3182 b = F0;
3183 a = F1;
3184 }
STAGE(load_rgf16_dst,const SkRasterPipeline_MemoryCtx * ctx)3185 STAGE(load_rgf16_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3186 auto ptr = ptr_at_xy<const uint32_t>(ctx, dx, dy);
3187
3188 U16 R,G;
3189 load2((const uint16_t*)ptr, &R, &G);
3190 dr = from_half(R);
3191 dg = from_half(G);
3192 db = F0;
3193 da = F1;
3194 }
STAGE(gather_rgf16,const SkRasterPipeline_GatherCtx * ctx)3195 STAGE(gather_rgf16, const SkRasterPipeline_GatherCtx* ctx) {
3196 const uint32_t* ptr;
3197 U32 ix = ix_and_ptr(&ptr, ctx, r, g);
3198 auto px = gather(ptr, ix);
3199
3200 U16 R,G;
3201 load2((const uint16_t*)&px, &R, &G);
3202 r = from_half(R);
3203 g = from_half(G);
3204 b = F0;
3205 a = F1;
3206 }
STAGE(store_rgf16,const SkRasterPipeline_MemoryCtx * ctx)3207 STAGE(store_rgf16, const SkRasterPipeline_MemoryCtx* ctx) {
3208 auto ptr = ptr_at_xy<uint32_t>(ctx, dx, dy);
3209 store2((uint16_t*)ptr, to_half(r)
3210 , to_half(g));
3211 }
3212
STAGE(load_f32,const SkRasterPipeline_MemoryCtx * ctx)3213 STAGE(load_f32, const SkRasterPipeline_MemoryCtx* ctx) {
3214 auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy);
3215 load4(ptr, &r,&g,&b,&a);
3216 }
STAGE(load_f32_dst,const SkRasterPipeline_MemoryCtx * ctx)3217 STAGE(load_f32_dst, const SkRasterPipeline_MemoryCtx* ctx) {
3218 auto ptr = ptr_at_xy<const float>(ctx, 4*dx,4*dy);
3219 load4(ptr, &dr,&dg,&db,&da);
3220 }
STAGE(gather_f32,const SkRasterPipeline_GatherCtx * ctx)3221 STAGE(gather_f32, const SkRasterPipeline_GatherCtx* ctx) {
3222 const float* ptr;
3223 U32 ix = ix_and_ptr(&ptr, ctx, r,g);
3224 r = gather(ptr, 4*ix + 0);
3225 g = gather(ptr, 4*ix + 1);
3226 b = gather(ptr, 4*ix + 2);
3227 a = gather(ptr, 4*ix + 3);
3228 }
STAGE(store_f32,const SkRasterPipeline_MemoryCtx * ctx)3229 STAGE(store_f32, const SkRasterPipeline_MemoryCtx* ctx) {
3230 auto ptr = ptr_at_xy<float>(ctx, 4*dx,4*dy);
3231 store4(ptr, r,g,b,a);
3232 }
3233
exclusive_repeat(F v,const SkRasterPipeline_TileCtx * ctx)3234 SI F exclusive_repeat(F v, const SkRasterPipeline_TileCtx* ctx) {
3235 return v - floor_(v*ctx->invScale)*ctx->scale;
3236 }
exclusive_mirror(F v,const SkRasterPipeline_TileCtx * ctx)3237 SI F exclusive_mirror(F v, const SkRasterPipeline_TileCtx* ctx) {
3238 auto limit = ctx->scale;
3239 auto invLimit = ctx->invScale;
3240
3241 // This is "repeat" over the range 0..2*limit
3242 auto u = v - floor_(v*invLimit*0.5f)*2*limit;
3243 // s will be 0 when moving forward (e.g. [0, limit)) and 1 when moving backward (e.g.
3244 // [limit, 2*limit)).
3245 auto s = floor_(u*invLimit);
3246 // This is the mirror result.
3247 auto m = u - 2*s*(u - limit);
3248 // Apply a bias to m if moving backwards so that we snap consistently at exact integer coords in
3249 // the logical infinite image. This is tested by mirror_tile GM. Note that all values
3250 // that have a non-zero bias applied are > 0.
3251 auto biasInUlps = trunc_(s);
3252 return sk_bit_cast<F>(sk_bit_cast<U32>(m) + ctx->mirrorBiasDir*biasInUlps);
3253 }
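// A worked example of exclusive_mirror, ignoring the ULP bias: with limit = 4 and v = 5,
//     u = 5 - floor(5/8)*8 = 5      (repeat over [0, 2*limit))
//     s = floor(5/4)       = 1      (moving backward)
//     m = 5 - 2*1*(5 - 4)  = 3
// so 5 reflects to 3; likewise 6 -> 2, 7 -> 1, and 8 wraps back around to 0.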
3254 // Tile x or y to [0,limit) == [0,limit - 1 ulp] (think, sampling from images).
3255 // The gather stages will hard clamp the output of these stages to [0,limit)...
3256 // we just need to do the basic repeat or mirroring.
STAGE(repeat_x,const SkRasterPipeline_TileCtx * ctx)3257 STAGE(repeat_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_repeat(r, ctx); }
STAGE(repeat_y,const SkRasterPipeline_TileCtx * ctx)3258 STAGE(repeat_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_repeat(g, ctx); }
STAGE(mirror_x,const SkRasterPipeline_TileCtx * ctx)3259 STAGE(mirror_x, const SkRasterPipeline_TileCtx* ctx) { r = exclusive_mirror(r, ctx); }
STAGE(mirror_y,const SkRasterPipeline_TileCtx * ctx)3260 STAGE(mirror_y, const SkRasterPipeline_TileCtx* ctx) { g = exclusive_mirror(g, ctx); }
3261
STAGE(clamp_x_1,NoCtx)3262 STAGE( clamp_x_1, NoCtx) { r = clamp_01_(r); }
STAGE(repeat_x_1,NoCtx)3263 STAGE(repeat_x_1, NoCtx) { r = clamp_01_(r - floor_(r)); }
STAGE(mirror_x_1,NoCtx)3264 STAGE(mirror_x_1, NoCtx) { r = clamp_01_(abs_( (r-1.0f) - two(floor_((r-1.0f)*0.5f)) - 1.0f )); }
3265
STAGE(clamp_x_and_y,const SkRasterPipeline_CoordClampCtx * ctx)3266 STAGE(clamp_x_and_y, const SkRasterPipeline_CoordClampCtx* ctx) {
3267 r = min(ctx->max_x, max(ctx->min_x, r));
3268 g = min(ctx->max_y, max(ctx->min_y, g));
3269 }
3270
3271 // Decal stores a 32bit mask after checking the coordinate (x and/or y) against its domain:
3272 // mask == 0x00000000 if the coordinate(s) are out of bounds
3273 // mask == 0xFFFFFFFF if the coordinate(s) are in bounds
3274 // After the gather stage, the r,g,b,a values are AND'd with this mask, setting them to 0
3275 // if either of the coordinates was out of bounds.
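// For example, with limit_x = 8 a lane at r = 9.0 fails (0 < r) & (r < w), so decal_x stores
// 0x00000000 for that lane; after the gather, check_decal_mask ANDs r,g,b,a with the mask and
// that lane becomes transparent black.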
3276
STAGE(decal_x,SkRasterPipeline_DecalTileCtx * ctx)3277 STAGE(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
3278 auto w = ctx->limit_x;
3279 auto e = ctx->inclusiveEdge_x;
3280 auto cond = ((0 < r) & (r < w)) | (r == e);
3281 sk_unaligned_store(ctx->mask, cond_to_mask(cond));
3282 }
STAGE(decal_y,SkRasterPipeline_DecalTileCtx * ctx)3283 STAGE(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
3284 auto h = ctx->limit_y;
3285 auto e = ctx->inclusiveEdge_y;
3286 auto cond = ((0 < g) & (g < h)) | (g == e);
3287 sk_unaligned_store(ctx->mask, cond_to_mask(cond));
3288 }
STAGE(decal_x_and_y,SkRasterPipeline_DecalTileCtx * ctx)3289 STAGE(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
3290 auto w = ctx->limit_x;
3291 auto h = ctx->limit_y;
3292 auto ex = ctx->inclusiveEdge_x;
3293 auto ey = ctx->inclusiveEdge_y;
3294 auto cond = (((0 < r) & (r < w)) | (r == ex))
3295 & (((0 < g) & (g < h)) | (g == ey));
3296 sk_unaligned_store(ctx->mask, cond_to_mask(cond));
3297 }
STAGE(check_decal_mask,SkRasterPipeline_DecalTileCtx * ctx)3298 STAGE(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
3299 auto mask = sk_unaligned_load<U32>(ctx->mask);
3300 r = sk_bit_cast<F>(sk_bit_cast<U32>(r) & mask);
3301 g = sk_bit_cast<F>(sk_bit_cast<U32>(g) & mask);
3302 b = sk_bit_cast<F>(sk_bit_cast<U32>(b) & mask);
3303 a = sk_bit_cast<F>(sk_bit_cast<U32>(a) & mask);
3304 }
3305
STAGE(alpha_to_gray,NoCtx)3306 STAGE(alpha_to_gray, NoCtx) {
3307 r = g = b = a;
3308 a = F1;
3309 }
STAGE(alpha_to_gray_dst,NoCtx)3310 STAGE(alpha_to_gray_dst, NoCtx) {
3311 dr = dg = db = da;
3312 da = F1;
3313 }
STAGE(alpha_to_red,NoCtx)3314 STAGE(alpha_to_red, NoCtx) {
3315 r = a;
3316 a = F1;
3317 }
STAGE(alpha_to_red_dst,NoCtx)3318 STAGE(alpha_to_red_dst, NoCtx) {
3319 dr = da;
3320 da = F1;
3321 }
3322
STAGE(bt709_luminance_or_luma_to_alpha,NoCtx)3323 STAGE(bt709_luminance_or_luma_to_alpha, NoCtx) {
3324 a = r*0.2126f + g*0.7152f + b*0.0722f;
3325 r = g = b = F0;
3326 }
STAGE(bt709_luminance_or_luma_to_rgb,NoCtx)3327 STAGE(bt709_luminance_or_luma_to_rgb, NoCtx) {
3328 r = g = b = r*0.2126f + g*0.7152f + b*0.0722f;
3329 }
3330
STAGE(matrix_translate,const float * m)3331 STAGE(matrix_translate, const float* m) {
3332 r += m[0];
3333 g += m[1];
3334 }
STAGE(matrix_scale_translate,const float * m)3335 STAGE(matrix_scale_translate, const float* m) {
3336 r = mad(r,m[0], m[2]);
3337 g = mad(g,m[1], m[3]);
3338 }
STAGE(matrix_2x3,const float * m)3339 STAGE(matrix_2x3, const float* m) {
3340 auto R = mad(r,m[0], mad(g,m[1], m[2])),
3341 G = mad(r,m[3], mad(g,m[4], m[5]));
3342 r = R;
3343 g = G;
3344 }
STAGE(matrix_3x3,const float * m)3345 STAGE(matrix_3x3, const float* m) {
3346 auto R = mad(r,m[0], mad(g,m[3], b*m[6])),
3347 G = mad(r,m[1], mad(g,m[4], b*m[7])),
3348 B = mad(r,m[2], mad(g,m[5], b*m[8]));
3349 r = R;
3350 g = G;
3351 b = B;
3352 }
STAGE(matrix_3x4,const float * m)3353 STAGE(matrix_3x4, const float* m) {
3354 auto R = mad(r,m[0], mad(g,m[3], mad(b,m[6], m[ 9]))),
3355 G = mad(r,m[1], mad(g,m[4], mad(b,m[7], m[10]))),
3356 B = mad(r,m[2], mad(g,m[5], mad(b,m[8], m[11])));
3357 r = R;
3358 g = G;
3359 b = B;
3360 }
STAGE(matrix_4x5,const float * m)3361 STAGE(matrix_4x5, const float* m) {
3362 auto R = mad(r,m[ 0], mad(g,m[ 1], mad(b,m[ 2], mad(a,m[ 3], m[ 4])))),
3363 G = mad(r,m[ 5], mad(g,m[ 6], mad(b,m[ 7], mad(a,m[ 8], m[ 9])))),
3364 B = mad(r,m[10], mad(g,m[11], mad(b,m[12], mad(a,m[13], m[14])))),
3365 A = mad(r,m[15], mad(g,m[16], mad(b,m[17], mad(a,m[18], m[19]))));
3366 r = R;
3367 g = G;
3368 b = B;
3369 a = A;
3370 }
STAGE(matrix_4x3,const float * m)3371 STAGE(matrix_4x3, const float* m) {
3372 auto X = r,
3373 Y = g;
3374
3375 r = mad(X, m[0], mad(Y, m[4], m[ 8]));
3376 g = mad(X, m[1], mad(Y, m[5], m[ 9]));
3377 b = mad(X, m[2], mad(Y, m[6], m[10]));
3378 a = mad(X, m[3], mad(Y, m[7], m[11]));
3379 }
STAGE(matrix_perspective,const float * m)3380 STAGE(matrix_perspective, const float* m) {
3381 // N.B. Unlike the other matrix_ stages, this matrix is row-major.
3382 auto R = mad(r,m[0], mad(g,m[1], m[2])),
3383 G = mad(r,m[3], mad(g,m[4], m[5])),
3384 Z = mad(r,m[6], mad(g,m[7], m[8]));
3385 r = R * rcp_precise(Z);
3386 g = G * rcp_precise(Z);
3387 }
3388
gradient_lookup(const SkRasterPipeline_GradientCtx * c,U32 idx,F t,F * r,F * g,F * b,F * a)3389 SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t,
3390 F* r, F* g, F* b, F* a) {
3391 F fr, br, fg, bg, fb, bb, fa, ba;
3392 #if defined(SKRP_CPU_HSW)
3393 if (c->stopCount <= 8) {
3394 fr = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), (__m256i)idx);
3395 br = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), (__m256i)idx);
3396 fg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), (__m256i)idx);
3397 bg = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), (__m256i)idx);
3398 fb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), (__m256i)idx);
3399 bb = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), (__m256i)idx);
3400 fa = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), (__m256i)idx);
3401 ba = _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), (__m256i)idx);
3402 } else
3403 #elif defined(SKRP_CPU_LASX)
3404 if (c->stopCount <= 8) {
3405 fr = (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[0], 0), idx);
3406 br = (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[0], 0), idx);
3407 fg = (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[1], 0), idx);
3408 bg = (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[1], 0), idx);
3409 fb = (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[2], 0), idx);
3410 bb = (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[2], 0), idx);
3411 fa = (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[3], 0), idx);
3412 ba = (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[3], 0), idx);
3413 } else
3414 #elif defined(SKRP_CPU_LSX)
3415 if (c->stopCount <= 4) {
3416 __m128i zero = __lsx_vldi(0);
3417 fr = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->fs[0], 0));
3418 br = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->bs[0], 0));
3419 fg = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->fs[1], 0));
3420 bg = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->bs[1], 0));
3421 fb = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->fs[2], 0));
3422 bb = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->bs[2], 0));
3423 fa = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->fs[3], 0));
3424 ba = (__m128)__lsx_vshuf_w(idx, zero, __lsx_vld(c->bs[3], 0));
3425 } else
3426 #endif
3427 {
3428 #if defined(SKRP_CPU_LSX)
3429 // Extracting the lane indices of idx up front like this reduces the number of
3429 // vpickve2gr instructions that get emitted.
3430 int i0 = __lsx_vpickve2gr_w(idx, 0);
3431 int i1 = __lsx_vpickve2gr_w(idx, 1);
3432 int i2 = __lsx_vpickve2gr_w(idx, 2);
3433 int i3 = __lsx_vpickve2gr_w(idx, 3);
3434 fr = gather((int *)c->fs[0], i0, i1, i2, i3);
3435 br = gather((int *)c->bs[0], i0, i1, i2, i3);
3436 fg = gather((int *)c->fs[1], i0, i1, i2, i3);
3437 bg = gather((int *)c->bs[1], i0, i1, i2, i3);
3438 fb = gather((int *)c->fs[2], i0, i1, i2, i3);
3439 bb = gather((int *)c->bs[2], i0, i1, i2, i3);
3440 fa = gather((int *)c->fs[3], i0, i1, i2, i3);
3441 ba = gather((int *)c->bs[3], i0, i1, i2, i3);
3442 #else
3443 fr = gather(c->fs[0], idx);
3444 br = gather(c->bs[0], idx);
3445 fg = gather(c->fs[1], idx);
3446 bg = gather(c->bs[1], idx);
3447 fb = gather(c->fs[2], idx);
3448 bb = gather(c->bs[2], idx);
3449 fa = gather(c->fs[3], idx);
3450 ba = gather(c->bs[3], idx);
3451 #endif
3452 }
3453
3454 *r = mad(t, fr, br);
3455 *g = mad(t, fg, bg);
3456 *b = mad(t, fb, bb);
3457 *a = mad(t, fa, ba);
3458 }
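// Each stop interval is stored as a per-channel scale (fs) and bias (bs), precomputed so that
// the line f*t + b passes through the interval's two stop colors; the lookup above therefore
// reduces to a single fused multiply-add per channel.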
3459
STAGE(evenly_spaced_gradient,const SkRasterPipeline_GradientCtx * c)3460 STAGE(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) {
3461 auto t = r;
3462 auto idx = trunc_(t * static_cast<float>(c->stopCount-1));
3463 gradient_lookup(c, idx, t, &r, &g, &b, &a);
3464 }
3465
STAGE(gradient,const SkRasterPipeline_GradientCtx * c)3466 STAGE(gradient, const SkRasterPipeline_GradientCtx* c) {
3467 auto t = r;
3468 U32 idx = U32_(0);
3469
3470 // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
3471 for (size_t i = 1; i < c->stopCount; i++) {
3472 idx += (U32)if_then_else(t >= c->ts[i], I32_(1), I32_(0));
3473 }
3474
3475 gradient_lookup(c, idx, t, &r, &g, &b, &a);
3476 }
3477
STAGE(evenly_spaced_2_stop_gradient,const SkRasterPipeline_EvenlySpaced2StopGradientCtx * c)3478 STAGE(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) {
3479 auto t = r;
3480 r = mad(t, c->f[0], c->b[0]);
3481 g = mad(t, c->f[1], c->b[1]);
3482 b = mad(t, c->f[2], c->b[2]);
3483 a = mad(t, c->f[3], c->b[3]);
3484 }
3485
STAGE(xy_to_unit_angle,NoCtx)3486 STAGE(xy_to_unit_angle, NoCtx) {
3487 F X = r,
3488 Y = g;
3489 F xabs = abs_(X),
3490 yabs = abs_(Y);
3491
3492 F slope = min(xabs, yabs)/max(xabs, yabs);
3493 F s = slope * slope;
3494
3495 // Use a 7th degree polynomial to approximate atan.
3496 // This was generated using sollya.gforge.inria.fr.
3497 // A float optimized polynomial was generated using the following command.
3498 // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
3499 F phi = slope
3500 * (0.15912117063999176025390625f + s
3501 * (-5.185396969318389892578125e-2f + s
3502 * (2.476101927459239959716796875e-2f + s
3503 * (-7.0547382347285747528076171875e-3f))));
3504
3505 phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
3506 phi = if_then_else(X < 0.0f , 1.0f/2.0f - phi, phi);
3507 phi = if_then_else(Y < 0.0f , 1.0f - phi , phi);
3508 phi = if_then_else(phi != phi , 0.0f , phi); // Check for NaN.
3509 r = phi;
3510 }
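// A quick sanity check of the polynomial above: for (X,Y) = (1,1), slope = 1 and the
// polynomial evaluates to roughly 0.159121 - 0.051854 + 0.024761 - 0.007055 ~= 0.125,
// i.e. 45 degrees expressed as a fraction of a full turn (1/8), before the octant fix-ups.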
3511
STAGE(xy_to_radius,NoCtx)3512 STAGE(xy_to_radius, NoCtx) {
3513 F X2 = r * r,
3514 Y2 = g * g;
3515 r = sqrt_(X2 + Y2);
3516 }
3517
3518 // Please see https://skia.org/dev/design/conical for how our 2pt conical shader works.
3519
STAGE(negate_x,NoCtx)3520 STAGE(negate_x, NoCtx) { r = -r; }
3521
STAGE(xy_to_2pt_conical_strip,const SkRasterPipeline_2PtConicalCtx * ctx)3522 STAGE(xy_to_2pt_conical_strip, const SkRasterPipeline_2PtConicalCtx* ctx) {
3523 F x = r, y = g, &t = r;
3524 t = x + sqrt_(ctx->fP0 - y*y); // ctx->fP0 = r0 * r0
3525 }
3526
STAGE(xy_to_2pt_conical_focal_on_circle,NoCtx)3527 STAGE(xy_to_2pt_conical_focal_on_circle, NoCtx) {
3528 F x = r, y = g, &t = r;
3529 t = x + y*y / x; // (x^2 + y^2) / x
3530 }
3531
STAGE(xy_to_2pt_conical_well_behaved,const SkRasterPipeline_2PtConicalCtx * ctx)3532 STAGE(xy_to_2pt_conical_well_behaved, const SkRasterPipeline_2PtConicalCtx* ctx) {
3533 F x = r, y = g, &t = r;
3534 t = sqrt_(x*x + y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
3535 }
3536
STAGE(xy_to_2pt_conical_greater,const SkRasterPipeline_2PtConicalCtx * ctx)3537 STAGE(xy_to_2pt_conical_greater, const SkRasterPipeline_2PtConicalCtx* ctx) {
3538 F x = r, y = g, &t = r;
3539 t = sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
3540 }
3541
STAGE(xy_to_2pt_conical_smaller,const SkRasterPipeline_2PtConicalCtx * ctx)3542 STAGE(xy_to_2pt_conical_smaller, const SkRasterPipeline_2PtConicalCtx* ctx) {
3543 F x = r, y = g, &t = r;
3544 t = -sqrt_(x*x - y*y) - x * ctx->fP0; // ctx->fP0 = 1/r1
3545 }
3546
STAGE(alter_2pt_conical_compensate_focal,const SkRasterPipeline_2PtConicalCtx * ctx)3547 STAGE(alter_2pt_conical_compensate_focal, const SkRasterPipeline_2PtConicalCtx* ctx) {
3548 F& t = r;
3549 t = t + ctx->fP1; // ctx->fP1 = f
3550 }
3551
STAGE(alter_2pt_conical_unswap,NoCtx)3552 STAGE(alter_2pt_conical_unswap, NoCtx) {
3553 F& t = r;
3554 t = 1 - t;
3555 }
3556
STAGE(mask_2pt_conical_nan,SkRasterPipeline_2PtConicalCtx * c)3557 STAGE(mask_2pt_conical_nan, SkRasterPipeline_2PtConicalCtx* c) {
3558 F& t = r;
3559 auto is_degenerate = (t != t); // NaN
3560 t = if_then_else(is_degenerate, F0, t);
3561 sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate));
3562 }
3563
STAGE(mask_2pt_conical_degenerates,SkRasterPipeline_2PtConicalCtx * c)3564 STAGE(mask_2pt_conical_degenerates, SkRasterPipeline_2PtConicalCtx* c) {
3565 F& t = r;
3566 auto is_degenerate = (t <= 0) | (t != t);
3567 t = if_then_else(is_degenerate, F0, t);
3568 sk_unaligned_store(&c->fMask, cond_to_mask(!is_degenerate));
3569 }
3570
STAGE(apply_vector_mask,const uint32_t * ctx)3571 STAGE(apply_vector_mask, const uint32_t* ctx) {
3572 const U32 mask = sk_unaligned_load<U32>(ctx);
3573 r = sk_bit_cast<F>(sk_bit_cast<U32>(r) & mask);
3574 g = sk_bit_cast<F>(sk_bit_cast<U32>(g) & mask);
3575 b = sk_bit_cast<F>(sk_bit_cast<U32>(b) & mask);
3576 a = sk_bit_cast<F>(sk_bit_cast<U32>(a) & mask);
3577 }
3578
save_xy(F * r,F * g,SkRasterPipeline_SamplerCtx * c)3579 SI void save_xy(F* r, F* g, SkRasterPipeline_SamplerCtx* c) {
3580 // Whether bilinear or bicubic, all sample points are at the same fractional offset (fx,fy).
3581 // They're either the 4 corners of a logical 1x1 pixel or the 16 corners of a 3x3 grid
3582 // surrounding (x,y) at (0.5,0.5) off-center.
3583 F fx = fract(*r + 0.5f),
3584 fy = fract(*g + 0.5f);
3585
3586 // Samplers will need to load x and fx, or y and fy.
3587 sk_unaligned_store(c->x, *r);
3588 sk_unaligned_store(c->y, *g);
3589 sk_unaligned_store(c->fx, fx);
3590 sk_unaligned_store(c->fy, fy);
3591 }
3592
STAGE(accumulate,const SkRasterPipeline_SamplerCtx * c)3593 STAGE(accumulate, const SkRasterPipeline_SamplerCtx* c) {
3594 // Bilinear and bicubic filters are both separable, so we produce independent contributions
3595 // from x and y, multiplying them together here to get each pixel's total scale factor.
3596 auto scale = sk_unaligned_load<F>(c->scalex)
3597 * sk_unaligned_load<F>(c->scaley);
3598 dr = mad(scale, r, dr);
3599 dg = mad(scale, g, dg);
3600 db = mad(scale, b, db);
3601 da = mad(scale, a, da);
3602 }
3603
3604 // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
3605 // are combined in direct proportion to their area overlapping that logical query pixel.
3606 // At positive offsets, the x-axis contribution to that rectangle is fx, or (1-fx) at negative x.
3607 // The y-axis is symmetric.
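// A worked example: if the sample point sits at x = 2.3, then fx = fract(2.3 + 0.5) = 0.8,
// so the pixel centered at 2.5 (kScale = +1) gets x-weight 0.8 and the pixel centered at 1.5
// (kScale = -1) gets 1 - 0.8 = 0.2; with fy = 0.25, the (+x,-y) tap's total scale factor in
// `accumulate` is 0.8 * 0.75 = 0.6.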
3608
3609 template <int kScale>
bilinear_x(SkRasterPipeline_SamplerCtx * ctx,F * x)3610 SI void bilinear_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
3611 *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f);
3612 F fx = sk_unaligned_load<F>(ctx->fx);
3613
3614 F scalex;
3615 if (kScale == -1) { scalex = 1.0f - fx; }
3616 if (kScale == +1) { scalex = fx; }
3617 sk_unaligned_store(ctx->scalex, scalex);
3618 }
3619 template <int kScale>
bilinear_y(SkRasterPipeline_SamplerCtx * ctx,F * y)3620 SI void bilinear_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
3621 *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f);
3622 F fy = sk_unaligned_load<F>(ctx->fy);
3623
3624 F scaley;
3625 if (kScale == -1) { scaley = 1.0f - fy; }
3626 if (kScale == +1) { scaley = fy; }
3627 sk_unaligned_store(ctx->scaley, scaley);
3628 }
3629
STAGE(bilinear_setup,SkRasterPipeline_SamplerCtx * ctx)3630 STAGE(bilinear_setup, SkRasterPipeline_SamplerCtx* ctx) {
3631 save_xy(&r, &g, ctx);
3632 // Init for accumulate
3633 dr = dg = db = da = F0;
3634 }
3635
STAGE(bilinear_nx,SkRasterPipeline_SamplerCtx * ctx)3636 STAGE(bilinear_nx, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<-1>(ctx, &r); }
STAGE(bilinear_px,SkRasterPipeline_SamplerCtx * ctx)3637 STAGE(bilinear_px, SkRasterPipeline_SamplerCtx* ctx) { bilinear_x<+1>(ctx, &r); }
STAGE(bilinear_ny,SkRasterPipeline_SamplerCtx * ctx)3638 STAGE(bilinear_ny, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<-1>(ctx, &g); }
STAGE(bilinear_py,SkRasterPipeline_SamplerCtx * ctx)3639 STAGE(bilinear_py, SkRasterPipeline_SamplerCtx* ctx) { bilinear_y<+1>(ctx, &g); }
3640
3641
3642 // In bicubic interpolation, the 16 pixels at +/- 0.5 and +/- 1.5 offsets from the sample
3643 // pixel center are combined with a non-uniform cubic filter, with higher values near the center.
3644 //
3645 // This helper computes the total weight along one axis (our bicubic filter is separable), given one
3646 // column of the sampling matrix, and a fractional pixel offset. See SkCubicResampler for details.
3647
bicubic_wts(F t,float A,float B,float C,float D)3648 SI F bicubic_wts(F t, float A, float B, float C, float D) {
3649 return mad(t, mad(t, mad(t, D, C), B), A);
3650 }
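// bicubic_wts is Horner's rule: expanding the nested mad() calls gives
//     A + t*(B + t*(C + t*D)) = A + B*t + C*t^2 + D*t^3,
// evaluated once per axis for each of the four columns of the 4x4 weight matrix in
// bicubic_setup below.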
3651
3652 template <int kScale>
bicubic_x(SkRasterPipeline_SamplerCtx * ctx,F * x)3653 SI void bicubic_x(SkRasterPipeline_SamplerCtx* ctx, F* x) {
3654 *x = sk_unaligned_load<F>(ctx->x) + (kScale * 0.5f);
3655
3656 F scalex;
3657 if (kScale == -3) { scalex = sk_unaligned_load<F>(ctx->wx[0]); }
3658 if (kScale == -1) { scalex = sk_unaligned_load<F>(ctx->wx[1]); }
3659 if (kScale == +1) { scalex = sk_unaligned_load<F>(ctx->wx[2]); }
3660 if (kScale == +3) { scalex = sk_unaligned_load<F>(ctx->wx[3]); }
3661 sk_unaligned_store(ctx->scalex, scalex);
3662 }
3663 template <int kScale>
bicubic_y(SkRasterPipeline_SamplerCtx * ctx,F * y)3664 SI void bicubic_y(SkRasterPipeline_SamplerCtx* ctx, F* y) {
3665 *y = sk_unaligned_load<F>(ctx->y) + (kScale * 0.5f);
3666
3667 F scaley;
3668 if (kScale == -3) { scaley = sk_unaligned_load<F>(ctx->wy[0]); }
3669 if (kScale == -1) { scaley = sk_unaligned_load<F>(ctx->wy[1]); }
3670 if (kScale == +1) { scaley = sk_unaligned_load<F>(ctx->wy[2]); }
3671 if (kScale == +3) { scaley = sk_unaligned_load<F>(ctx->wy[3]); }
3672 sk_unaligned_store(ctx->scaley, scaley);
3673 }
3674
STAGE(bicubic_setup,SkRasterPipeline_SamplerCtx * ctx)3675 STAGE(bicubic_setup, SkRasterPipeline_SamplerCtx* ctx) {
3676 save_xy(&r, &g, ctx);
3677
3678 const float* w = ctx->weights;
3679
3680 F fx = sk_unaligned_load<F>(ctx->fx);
3681 sk_unaligned_store(ctx->wx[0], bicubic_wts(fx, w[0], w[4], w[ 8], w[12]));
3682 sk_unaligned_store(ctx->wx[1], bicubic_wts(fx, w[1], w[5], w[ 9], w[13]));
3683 sk_unaligned_store(ctx->wx[2], bicubic_wts(fx, w[2], w[6], w[10], w[14]));
3684 sk_unaligned_store(ctx->wx[3], bicubic_wts(fx, w[3], w[7], w[11], w[15]));
3685
3686 F fy = sk_unaligned_load<F>(ctx->fy);
3687 sk_unaligned_store(ctx->wy[0], bicubic_wts(fy, w[0], w[4], w[ 8], w[12]));
3688 sk_unaligned_store(ctx->wy[1], bicubic_wts(fy, w[1], w[5], w[ 9], w[13]));
3689 sk_unaligned_store(ctx->wy[2], bicubic_wts(fy, w[2], w[6], w[10], w[14]));
3690 sk_unaligned_store(ctx->wy[3], bicubic_wts(fy, w[3], w[7], w[11], w[15]));
3691
3692 // Init for accumulate
3693 dr = dg = db = da = F0;
3694 }
3695
STAGE(bicubic_n3x,SkRasterPipeline_SamplerCtx * ctx)3696 STAGE(bicubic_n3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-3>(ctx, &r); }
STAGE(bicubic_n1x,SkRasterPipeline_SamplerCtx * ctx)3697 STAGE(bicubic_n1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<-1>(ctx, &r); }
STAGE(bicubic_p1x,SkRasterPipeline_SamplerCtx * ctx)3698 STAGE(bicubic_p1x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+1>(ctx, &r); }
STAGE(bicubic_p3x,SkRasterPipeline_SamplerCtx * ctx)3699 STAGE(bicubic_p3x, SkRasterPipeline_SamplerCtx* ctx) { bicubic_x<+3>(ctx, &r); }
3700
STAGE(bicubic_n3y,SkRasterPipeline_SamplerCtx * ctx)3701 STAGE(bicubic_n3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-3>(ctx, &g); }
STAGE(bicubic_n1y,SkRasterPipeline_SamplerCtx * ctx)3702 STAGE(bicubic_n1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<-1>(ctx, &g); }
STAGE(bicubic_p1y,SkRasterPipeline_SamplerCtx * ctx)3703 STAGE(bicubic_p1y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+1>(ctx, &g); }
STAGE(bicubic_p3y,SkRasterPipeline_SamplerCtx * ctx)3704 STAGE(bicubic_p3y, SkRasterPipeline_SamplerCtx* ctx) { bicubic_y<+3>(ctx, &g); }
3705
compute_perlin_vector(U32 sample,F x,F y)3706 SI F compute_perlin_vector(U32 sample, F x, F y) {
3707 // We're relying on the packing of uint16s within a uint32, which will vary based on endianness.
3708 #ifdef SK_CPU_BENDIAN
3709 U32 sampleLo = sample >> 16;
3710 U32 sampleHi = sample & 0xFFFF;
3711 #else
3712 U32 sampleLo = sample & 0xFFFF;
3713 U32 sampleHi = sample >> 16;
3714 #endif
3715
3716 // Convert 32-bit sample value into two floats in the [-1..1] range.
3717 F vecX = mad(cast(sampleLo), 2.0f / 65535.0f, -1.0f);
3718 F vecY = mad(cast(sampleHi), 2.0f / 65535.0f, -1.0f);
3719
3720 // Return the dot of the sample and the passed-in vector.
3721 return mad(vecX, x,
3722 vecY * y);
3723 }
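// A worked example (little-endian): for sample = 0x0000FFFF, sampleLo = 0xFFFF and
// sampleHi = 0x0000, so vecX = 2*65535/65535 - 1 = +1 and vecY = -1, and the function
// returns the dot product (+1)*x + (-1)*y = x - y.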
3724
STAGE(perlin_noise,SkRasterPipeline_PerlinNoiseCtx * ctx)3725 STAGE(perlin_noise, SkRasterPipeline_PerlinNoiseCtx* ctx) {
3726 F noiseVecX = (r + 0.5) * ctx->baseFrequencyX;
3727 F noiseVecY = (g + 0.5) * ctx->baseFrequencyY;
3728 r = g = b = a = F0;
3729 F stitchDataX = F_(ctx->stitchDataInX);
3730 F stitchDataY = F_(ctx->stitchDataInY);
3731 F ratio = F1;
3732
3733 for (int octave = 0; octave < ctx->numOctaves; ++octave) {
3734 // Calculate noise coordinates. (Roughly $noise_helper in Graphite)
3735 F floorValX = floor_(noiseVecX);
3736 F floorValY = floor_(noiseVecY);
3737 F ceilValX = floorValX + 1.0f;
3738 F ceilValY = floorValY + 1.0f;
3739 F fractValX = noiseVecX - floorValX;
3740 F fractValY = noiseVecY - floorValY;
3741
3742 if (ctx->stitching) {
3743 // If we are stitching, wrap the coordinates to the stitch position.
3744 floorValX -= sk_bit_cast<F>(cond_to_mask(floorValX >= stitchDataX) &
3745 sk_bit_cast<I32>(stitchDataX));
3746 floorValY -= sk_bit_cast<F>(cond_to_mask(floorValY >= stitchDataY) &
3747 sk_bit_cast<I32>(stitchDataY));
3748 ceilValX -= sk_bit_cast<F>(cond_to_mask(ceilValX >= stitchDataX) &
3749 sk_bit_cast<I32>(stitchDataX));
3750 ceilValY -= sk_bit_cast<F>(cond_to_mask(ceilValY >= stitchDataY) &
3751 sk_bit_cast<I32>(stitchDataY));
3752 }
3753
3754 U32 latticeLookup = (U32)(iround(floorValX)) & 0xFF;
3755 F latticeIdxX = cast(expand(gather(ctx->latticeSelector, latticeLookup)));
3756 latticeLookup = (U32)(iround(ceilValX)) & 0xFF;
3757 F latticeIdxY = cast(expand(gather(ctx->latticeSelector, latticeLookup)));
3758
3759 U32 b00 = (U32)(iround(latticeIdxX + floorValY)) & 0xFF;
3760 U32 b10 = (U32)(iround(latticeIdxY + floorValY)) & 0xFF;
3761 U32 b01 = (U32)(iround(latticeIdxX + ceilValY)) & 0xFF;
3762 U32 b11 = (U32)(iround(latticeIdxY + ceilValY)) & 0xFF;
3763
3764 // Calculate noise colors. (Roughly $noise_function in Graphite)
3765 // Apply Hermite interpolation to the fractional value.
3766 F smoothX = fractValX * fractValX * (3.0f - 2.0f * fractValX);
3767 F smoothY = fractValY * fractValY * (3.0f - 2.0f * fractValY);
3768
3769 F color[4];
3770 const uint32_t* channelNoiseData = reinterpret_cast<const uint32_t*>(ctx->noiseData);
3771 for (int channel = 0; channel < 4; ++channel) {
3772 U32 sample00 = gather(channelNoiseData, b00);
3773 U32 sample10 = gather(channelNoiseData, b10);
3774 U32 sample01 = gather(channelNoiseData, b01);
3775 U32 sample11 = gather(channelNoiseData, b11);
3776 channelNoiseData += 256;
3777
3778 F u = compute_perlin_vector(sample00, fractValX, fractValY);
3779 F v = compute_perlin_vector(sample10, fractValX - 1.0f, fractValY);
3780 F A = lerp(u, v, smoothX);
3781
3782 u = compute_perlin_vector(sample01, fractValX, fractValY - 1.0f);
3783 v = compute_perlin_vector(sample11, fractValX - 1.0f, fractValY - 1.0f);
3784 F B = lerp(u, v, smoothX);
3785
3786 color[channel] = lerp(A, B, smoothY);
3787 }
3788
3789 if (ctx->noiseType != SkPerlinNoiseShaderType::kFractalNoise) {
3790 // For kTurbulence the result is: abs(noise[-1,1])
3791 color[0] = abs_(color[0]);
3792 color[1] = abs_(color[1]);
3793 color[2] = abs_(color[2]);
3794 color[3] = abs_(color[3]);
3795 }
3796
3797 r = mad(color[0], ratio, r);
3798 g = mad(color[1], ratio, g);
3799 b = mad(color[2], ratio, b);
3800 a = mad(color[3], ratio, a);
3801
3802 // Scale inputs for the next round.
3803 noiseVecX *= 2.0f;
3804 noiseVecY *= 2.0f;
3805 stitchDataX *= 2.0f;
3806 stitchDataY *= 2.0f;
3807 ratio *= 0.5f;
3808 }
3809
3810 if (ctx->noiseType == SkPerlinNoiseShaderType::kFractalNoise) {
3811 // For kFractalNoise the result is: noise[-1,1] * 0.5 + 0.5
3812 r = mad(r, 0.5f, 0.5f);
3813 g = mad(g, 0.5f, 0.5f);
3814 b = mad(b, 0.5f, 0.5f);
3815 a = mad(a, 0.5f, 0.5f);
3816 }
3817
3818 r = clamp_01_(r) * a;
3819 g = clamp_01_(g) * a;
3820 b = clamp_01_(b) * a;
3821 a = clamp_01_(a);
3822 }
3823
STAGE(mipmap_linear_init,SkRasterPipeline_MipmapCtx * ctx)3824 STAGE(mipmap_linear_init, SkRasterPipeline_MipmapCtx* ctx) {
3825 sk_unaligned_store(ctx->x, r);
3826 sk_unaligned_store(ctx->y, g);
3827 }
3828
STAGE(mipmap_linear_update,SkRasterPipeline_MipmapCtx * ctx)3829 STAGE(mipmap_linear_update, SkRasterPipeline_MipmapCtx* ctx) {
3830 sk_unaligned_store(ctx->r, r);
3831 sk_unaligned_store(ctx->g, g);
3832 sk_unaligned_store(ctx->b, b);
3833 sk_unaligned_store(ctx->a, a);
3834
3835 r = sk_unaligned_load<F>(ctx->x) * ctx->scaleX;
3836 g = sk_unaligned_load<F>(ctx->y) * ctx->scaleY;
3837 }
3838
STAGE(mipmap_linear_finish,SkRasterPipeline_MipmapCtx * ctx)3839 STAGE(mipmap_linear_finish, SkRasterPipeline_MipmapCtx* ctx) {
3840 r = lerp(sk_unaligned_load<F>(ctx->r), r, F_(ctx->lowerWeight));
3841 g = lerp(sk_unaligned_load<F>(ctx->g), g, F_(ctx->lowerWeight));
3842 b = lerp(sk_unaligned_load<F>(ctx->b), b, F_(ctx->lowerWeight));
3843 a = lerp(sk_unaligned_load<F>(ctx->a), a, F_(ctx->lowerWeight));
3844 }
3845
STAGE(callback,SkRasterPipeline_CallbackCtx * c)3846 STAGE(callback, SkRasterPipeline_CallbackCtx* c) {
3847 store4(c->rgba, r,g,b,a);
3848 c->fn(c, N);
3849 load4(c->read_from, &r,&g,&b,&a);
3850 }
3851
STAGE_TAIL(set_base_pointer,std::byte * p)3852 STAGE_TAIL(set_base_pointer, std::byte* p) {
3853 base = p;
3854 }
3855
3856 // All control flow stages used by SkSL maintain some state in the common registers:
3857 // r: condition mask
3858 // g: loop mask
3859 // b: return mask
3860 // a: execution mask (intersection of all three masks)
3861 // After updating r/g/b, you must invoke update_execution_mask().
3862 #define execution_mask() sk_bit_cast<I32>(a)
3863 #define update_execution_mask() a = sk_bit_cast<F>(sk_bit_cast<I32>(r) & \
3864 sk_bit_cast<I32>(g) & \
3865 sk_bit_cast<I32>(b))
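// As a small example with N = 4 lanes: if an SkSL `if` is true only in lanes 0 and 2, the
// condition mask becomes r = [~0, 0, ~0, 0]; update_execution_mask() then ANDs r, g, and b
// into a, and the masked stages below (e.g. copy_slot_masked) use execution_mask() so that
// only lanes 0 and 2 observe the stores made inside the `if` block.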
3866
STAGE_TAIL(init_lane_masks,SkRasterPipeline_InitLaneMasksCtx * ctx)3867 STAGE_TAIL(init_lane_masks, SkRasterPipeline_InitLaneMasksCtx* ctx) {
3868 uint32_t iota[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
3869 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride_highp);
3870
3871 I32 mask = cond_to_mask(sk_unaligned_load<U32>(iota) < *ctx->tail);
3872 r = g = b = a = sk_bit_cast<F>(mask);
3873 }
3874
STAGE_TAIL(store_device_xy01,F * dst)3875 STAGE_TAIL(store_device_xy01, F* dst) {
3876 // This is very similar to `seed_shader + store_src`, but b/a are backwards.
3877 // (sk_FragCoord actually puts w=1 in the w slot.)
3878 static constexpr float iota[] = {
3879 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
3880 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
3881 };
3882 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride_highp);
3883
3884 dst[0] = cast(U32_(dx)) + sk_unaligned_load<F>(iota);
3885 dst[1] = cast(U32_(dy)) + 0.5f;
3886 dst[2] = F0;
3887 dst[3] = F1;
3888 }
3889
STAGE_TAIL(exchange_src,F * rgba)3890 STAGE_TAIL(exchange_src, F* rgba) {
3891 // Swaps r,g,b,a registers with the values at `rgba`.
3892 F temp[4] = {r, g, b, a};
3893 r = rgba[0];
3894 rgba[0] = temp[0];
3895 g = rgba[1];
3896 rgba[1] = temp[1];
3897 b = rgba[2];
3898 rgba[2] = temp[2];
3899 a = rgba[3];
3900 rgba[3] = temp[3];
3901 }
3902
STAGE_TAIL(load_condition_mask,F * ctx)3903 STAGE_TAIL(load_condition_mask, F* ctx) {
3904 r = sk_unaligned_load<F>(ctx);
3905 update_execution_mask();
3906 }
3907
STAGE_TAIL(store_condition_mask,F * ctx)3908 STAGE_TAIL(store_condition_mask, F* ctx) {
3909 sk_unaligned_store(ctx, r);
3910 }
3911
STAGE_TAIL(merge_condition_mask,I32 * ptr)3912 STAGE_TAIL(merge_condition_mask, I32* ptr) {
3913 // Set the condition-mask to the intersection of two adjacent masks at the pointer.
3914 r = sk_bit_cast<F>(ptr[0] & ptr[1]);
3915 update_execution_mask();
3916 }
3917
STAGE_TAIL(merge_inv_condition_mask,I32 * ptr)3918 STAGE_TAIL(merge_inv_condition_mask, I32* ptr) {
3919 // Set the condition-mask to the intersection of the first mask and the inverse of the second.
3920 r = sk_bit_cast<F>(ptr[0] & ~ptr[1]);
3921 update_execution_mask();
3922 }
3923
STAGE_TAIL(load_loop_mask,F * ctx)3924 STAGE_TAIL(load_loop_mask, F* ctx) {
3925 g = sk_unaligned_load<F>(ctx);
3926 update_execution_mask();
3927 }
3928
STAGE_TAIL(store_loop_mask,F * ctx)3929 STAGE_TAIL(store_loop_mask, F* ctx) {
3930 sk_unaligned_store(ctx, g);
3931 }
3932
STAGE_TAIL(mask_off_loop_mask,NoCtx)3933 STAGE_TAIL(mask_off_loop_mask, NoCtx) {
3934 // We encountered a break statement. If a lane was active, it should be masked off now, and stay
3935 // masked-off until the termination of the loop.
3936 g = sk_bit_cast<F>(sk_bit_cast<I32>(g) & ~execution_mask());
3937 update_execution_mask();
3938 }
3939
STAGE_TAIL(reenable_loop_mask,I32 * ptr)3940 STAGE_TAIL(reenable_loop_mask, I32* ptr) {
3941 // Set the loop-mask to the union of the current loop-mask with the mask at the pointer.
3942 g = sk_bit_cast<F>(sk_bit_cast<I32>(g) | ptr[0]);
3943 update_execution_mask();
3944 }
3945
STAGE_TAIL(merge_loop_mask,I32 * ptr)3946 STAGE_TAIL(merge_loop_mask, I32* ptr) {
3947 // Set the loop-mask to the intersection of the current loop-mask with the mask at the pointer.
3948 // (Note: this behavior subtly differs from merge_condition_mask!)
3949 g = sk_bit_cast<F>(sk_bit_cast<I32>(g) & ptr[0]);
3950 update_execution_mask();
3951 }
3952
STAGE_TAIL(continue_op,I32 * continueMask)3953 STAGE_TAIL(continue_op, I32* continueMask) {
3954 // Set any currently-executing lanes in the continue-mask to true.
3955 *continueMask |= execution_mask();
3956
3957 // Disable any currently-executing lanes from the loop mask. (Just like `mask_off_loop_mask`.)
3958 g = sk_bit_cast<F>(sk_bit_cast<I32>(g) & ~execution_mask());
3959 update_execution_mask();
3960 }
3961
STAGE_TAIL(case_op,SkRasterPipeline_CaseOpCtx * packed)3962 STAGE_TAIL(case_op, SkRasterPipeline_CaseOpCtx* packed) {
3963 auto ctx = SkRPCtxUtils::Unpack(packed);
3964
3965 // Check each lane to see if the case value matches the expectation.
3966 I32* actualValue = (I32*)(base + ctx.offset);
3967 I32 caseMatches = cond_to_mask(*actualValue == ctx.expectedValue);
3968
3969 // In lanes where we found a match, enable the loop mask...
3970 g = sk_bit_cast<F>(sk_bit_cast<I32>(g) | caseMatches);
3971 update_execution_mask();
3972
3973 // ... and clear the default-case mask.
3974 I32* defaultMask = actualValue + 1;
3975 *defaultMask &= ~caseMatches;
3976 }
3977
STAGE_TAIL(load_return_mask,F * ctx)3978 STAGE_TAIL(load_return_mask, F* ctx) {
3979 b = sk_unaligned_load<F>(ctx);
3980 update_execution_mask();
3981 }
3982
STAGE_TAIL(store_return_mask,F * ctx)3983 STAGE_TAIL(store_return_mask, F* ctx) {
3984 sk_unaligned_store(ctx, b);
3985 }
3986
STAGE_TAIL(mask_off_return_mask,NoCtx)3987 STAGE_TAIL(mask_off_return_mask, NoCtx) {
3988 // We encountered a return statement. If a lane was active, it should be masked off now, and
3989 // stay masked-off until the end of the function.
3990 b = sk_bit_cast<F>(sk_bit_cast<I32>(b) & ~execution_mask());
3991 update_execution_mask();
3992 }
3993
STAGE_BRANCH(branch_if_all_lanes_active,SkRasterPipeline_BranchIfAllLanesActiveCtx * ctx)3994 STAGE_BRANCH(branch_if_all_lanes_active, SkRasterPipeline_BranchIfAllLanesActiveCtx* ctx) {
3995 uint32_t iota[] = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
3996 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride_highp);
3997
3998 I32 tailLanes = cond_to_mask(*ctx->tail <= sk_unaligned_load<U32>(iota));
3999 return all(execution_mask() | tailLanes) ? ctx->offset : 1;
4000 }
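// The iota/tail comparison above force-enables the lanes past the batch's tail, so a partial
// batch can still take the "all lanes active" branch: e.g. with 8 lanes and *ctx->tail = 5,
// lanes 5..7 always count as active and only lanes 0..4 need to actually be executing.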
4001
STAGE_BRANCH(branch_if_any_lanes_active,SkRasterPipeline_BranchCtx * ctx)4002 STAGE_BRANCH(branch_if_any_lanes_active, SkRasterPipeline_BranchCtx* ctx) {
4003 return any(execution_mask()) ? ctx->offset : 1;
4004 }
4005
STAGE_BRANCH(branch_if_no_lanes_active,SkRasterPipeline_BranchCtx * ctx)4006 STAGE_BRANCH(branch_if_no_lanes_active, SkRasterPipeline_BranchCtx* ctx) {
4007 return any(execution_mask()) ? 1 : ctx->offset;
4008 }
4009
STAGE_BRANCH(jump,SkRasterPipeline_BranchCtx * ctx)4010 STAGE_BRANCH(jump, SkRasterPipeline_BranchCtx* ctx) {
4011 return ctx->offset;
4012 }
4013
STAGE_BRANCH(branch_if_no_active_lanes_eq,SkRasterPipeline_BranchIfEqualCtx * ctx)4014 STAGE_BRANCH(branch_if_no_active_lanes_eq, SkRasterPipeline_BranchIfEqualCtx* ctx) {
4015 // Compare each lane against the expected value...
4016 I32 match = cond_to_mask(*(const I32*)ctx->ptr == ctx->value);
4017 // ... but mask off lanes that aren't executing.
4018 match &= execution_mask();
4019 // If any lanes matched, don't take the branch.
4020 return any(match) ? 1 : ctx->offset;
4021 }
4022
STAGE_TAIL(trace_line,SkRasterPipeline_TraceLineCtx * ctx)4023 STAGE_TAIL(trace_line, SkRasterPipeline_TraceLineCtx* ctx) {
4024 const I32* traceMask = (const I32*)ctx->traceMask;
4025 if (any(execution_mask() & *traceMask)) {
4026 ctx->traceHook->line(ctx->lineNumber);
4027 }
4028 }
4029
STAGE_TAIL(trace_enter,SkRasterPipeline_TraceFuncCtx * ctx)4030 STAGE_TAIL(trace_enter, SkRasterPipeline_TraceFuncCtx* ctx) {
4031 const I32* traceMask = (const I32*)ctx->traceMask;
4032 if (any(execution_mask() & *traceMask)) {
4033 ctx->traceHook->enter(ctx->funcIdx);
4034 }
4035 }
4036
STAGE_TAIL(trace_exit,SkRasterPipeline_TraceFuncCtx * ctx)4037 STAGE_TAIL(trace_exit, SkRasterPipeline_TraceFuncCtx* ctx) {
4038 const I32* traceMask = (const I32*)ctx->traceMask;
4039 if (any(execution_mask() & *traceMask)) {
4040 ctx->traceHook->exit(ctx->funcIdx);
4041 }
4042 }
4043
STAGE_TAIL(trace_scope,SkRasterPipeline_TraceScopeCtx * ctx)4044 STAGE_TAIL(trace_scope, SkRasterPipeline_TraceScopeCtx* ctx) {
4045 // Note that trace_scope intentionally does not incorporate the execution mask. Otherwise, the
4046 // scopes would become unbalanced if the execution mask changed in the middle of a block. The
4047 // caller is responsible for providing a combined trace- and execution-mask.
4048 const I32* traceMask = (const I32*)ctx->traceMask;
4049 if (any(*traceMask)) {
4050 ctx->traceHook->scope(ctx->delta);
4051 }
4052 }
4053
STAGE_TAIL(trace_var,SkRasterPipeline_TraceVarCtx * ctx)4054 STAGE_TAIL(trace_var, SkRasterPipeline_TraceVarCtx* ctx) {
4055 const I32* traceMask = (const I32*)ctx->traceMask;
4056 I32 mask = execution_mask() & *traceMask;
4057 if (any(mask)) {
4058 for (size_t lane = 0; lane < N; ++lane) {
4059 if (select_lane(mask, lane)) {
4060 const I32* data = (const I32*)ctx->data;
4061 int slotIdx = ctx->slotIdx, numSlots = ctx->numSlots;
4062 if (ctx->indirectOffset) {
4063 // If this was an indirect store, apply the indirect-offset to the data pointer.
4064 uint32_t indirectOffset = select_lane(*(const U32*)ctx->indirectOffset, lane);
4065 indirectOffset = std::min<uint32_t>(indirectOffset, ctx->indirectLimit);
4066 data += indirectOffset;
4067 slotIdx += indirectOffset;
4068 }
4069 while (numSlots--) {
4070 ctx->traceHook->var(slotIdx, select_lane(*data, lane));
4071 ++slotIdx;
4072 ++data;
4073 }
4074 break;
4075 }
4076 }
4077 }
4078 }
4079
STAGE_TAIL(copy_uniform,SkRasterPipeline_UniformCtx * ctx)4080 STAGE_TAIL(copy_uniform, SkRasterPipeline_UniformCtx* ctx) {
4081 const int* src = ctx->src;
4082 I32* dst = (I32*)ctx->dst;
4083 dst[0] = I32_(src[0]);
4084 }
STAGE_TAIL(copy_2_uniforms,SkRasterPipeline_UniformCtx * ctx)4085 STAGE_TAIL(copy_2_uniforms, SkRasterPipeline_UniformCtx* ctx) {
4086 const int* src = ctx->src;
4087 I32* dst = (I32*)ctx->dst;
4088 dst[0] = I32_(src[0]);
4089 dst[1] = I32_(src[1]);
4090 }
STAGE_TAIL(copy_3_uniforms,SkRasterPipeline_UniformCtx * ctx)4091 STAGE_TAIL(copy_3_uniforms, SkRasterPipeline_UniformCtx* ctx) {
4092 const int* src = ctx->src;
4093 I32* dst = (I32*)ctx->dst;
4094 dst[0] = I32_(src[0]);
4095 dst[1] = I32_(src[1]);
4096 dst[2] = I32_(src[2]);
4097 }
STAGE_TAIL(copy_4_uniforms,SkRasterPipeline_UniformCtx * ctx)4098 STAGE_TAIL(copy_4_uniforms, SkRasterPipeline_UniformCtx* ctx) {
4099 const int* src = ctx->src;
4100 I32* dst = (I32*)ctx->dst;
4101 dst[0] = I32_(src[0]);
4102 dst[1] = I32_(src[1]);
4103 dst[2] = I32_(src[2]);
4104 dst[3] = I32_(src[3]);
4105 }
4106
STAGE_TAIL(copy_constant,SkRasterPipeline_ConstantCtx * packed)4107 STAGE_TAIL(copy_constant, SkRasterPipeline_ConstantCtx* packed) {
4108 auto ctx = SkRPCtxUtils::Unpack(packed);
4109 I32* dst = (I32*)(base + ctx.dst);
4110 I32 value = I32_(ctx.value);
4111 dst[0] = value;
4112 }
STAGE_TAIL(splat_2_constants,SkRasterPipeline_ConstantCtx * packed)4113 STAGE_TAIL(splat_2_constants, SkRasterPipeline_ConstantCtx* packed) {
4114 auto ctx = SkRPCtxUtils::Unpack(packed);
4115 I32* dst = (I32*)(base + ctx.dst);
4116 I32 value = I32_(ctx.value);
4117 dst[0] = dst[1] = value;
4118 }
STAGE_TAIL(splat_3_constants,SkRasterPipeline_ConstantCtx * packed)4119 STAGE_TAIL(splat_3_constants, SkRasterPipeline_ConstantCtx* packed) {
4120 auto ctx = SkRPCtxUtils::Unpack(packed);
4121 I32* dst = (I32*)(base + ctx.dst);
4122 I32 value = I32_(ctx.value);
4123 dst[0] = dst[1] = dst[2] = value;
4124 }
STAGE_TAIL(splat_4_constants,SkRasterPipeline_ConstantCtx * packed)4125 STAGE_TAIL(splat_4_constants, SkRasterPipeline_ConstantCtx* packed) {
4126 auto ctx = SkRPCtxUtils::Unpack(packed);
4127 I32* dst = (I32*)(base + ctx.dst);
4128 I32 value = I32_(ctx.value);
4129 dst[0] = dst[1] = dst[2] = dst[3] = value;
4130 }
4131
4132 template <int NumSlots>
copy_n_slots_unmasked_fn(SkRasterPipeline_BinaryOpCtx * packed,std::byte * base)4133 SI void copy_n_slots_unmasked_fn(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base) {
4134 auto ctx = SkRPCtxUtils::Unpack(packed);
4135 F* dst = (F*)(base + ctx.dst);
4136 F* src = (F*)(base + ctx.src);
4137 memcpy(dst, src, sizeof(F) * NumSlots);
4138 }
4139
STAGE_TAIL(copy_slot_unmasked,SkRasterPipeline_BinaryOpCtx * packed)4140 STAGE_TAIL(copy_slot_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
4141 copy_n_slots_unmasked_fn<1>(packed, base);
4142 }
STAGE_TAIL(copy_2_slots_unmasked,SkRasterPipeline_BinaryOpCtx * packed)4143 STAGE_TAIL(copy_2_slots_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
4144 copy_n_slots_unmasked_fn<2>(packed, base);
4145 }
STAGE_TAIL(copy_3_slots_unmasked,SkRasterPipeline_BinaryOpCtx * packed)4146 STAGE_TAIL(copy_3_slots_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
4147 copy_n_slots_unmasked_fn<3>(packed, base);
4148 }
STAGE_TAIL(copy_4_slots_unmasked,SkRasterPipeline_BinaryOpCtx * packed)4149 STAGE_TAIL(copy_4_slots_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
4150 copy_n_slots_unmasked_fn<4>(packed, base);
4151 }
4152
4153 template <int NumSlots>
copy_n_immutable_unmasked_fn(SkRasterPipeline_BinaryOpCtx * packed,std::byte * base)4154 SI void copy_n_immutable_unmasked_fn(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base) {
4155 auto ctx = SkRPCtxUtils::Unpack(packed);
4156
4157 // Load the scalar values.
4158 float* src = (float*)(base + ctx.src);
4159 float values[NumSlots];
4160 SK_UNROLL for (int index = 0; index < NumSlots; ++index) {
4161 values[index] = src[index];
4162 }
4163 // Broadcast the scalars into the destination.
4164 F* dst = (F*)(base + ctx.dst);
4165 SK_UNROLL for (int index = 0; index < NumSlots; ++index) {
4166 dst[index] = F_(values[index]);
4167 }
4168 }
4169
4170 STAGE_TAIL(copy_immutable_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
4171 copy_n_immutable_unmasked_fn<1>(packed, base);
4172 }
4173 STAGE_TAIL(copy_2_immutables_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
4174 copy_n_immutable_unmasked_fn<2>(packed, base);
4175 }
4176 STAGE_TAIL(copy_3_immutables_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
4177 copy_n_immutable_unmasked_fn<3>(packed, base);
4178 }
4179 STAGE_TAIL(copy_4_immutables_unmasked, SkRasterPipeline_BinaryOpCtx* packed) {
4180 copy_n_immutable_unmasked_fn<4>(packed, base);
4181 }
4182
4183 template <int NumSlots>
4184 SI void copy_n_slots_masked_fn(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base, I32 mask) {
4185 auto ctx = SkRPCtxUtils::Unpack(packed);
4186 I32* dst = (I32*)(base + ctx.dst);
4187 I32* src = (I32*)(base + ctx.src);
4188 SK_UNROLL for (int count = 0; count < NumSlots; ++count) {
4189 *dst = if_then_else(mask, *src, *dst);
4190 dst += 1;
4191 src += 1;
4192 }
4193 }
4194
4195 STAGE_TAIL(copy_slot_masked, SkRasterPipeline_BinaryOpCtx* packed) {
4196 copy_n_slots_masked_fn<1>(packed, base, execution_mask());
4197 }
4198 STAGE_TAIL(copy_2_slots_masked, SkRasterPipeline_BinaryOpCtx* packed) {
4199 copy_n_slots_masked_fn<2>(packed, base, execution_mask());
4200 }
4201 STAGE_TAIL(copy_3_slots_masked, SkRasterPipeline_BinaryOpCtx* packed) {
4202 copy_n_slots_masked_fn<3>(packed, base, execution_mask());
4203 }
4204 STAGE_TAIL(copy_4_slots_masked, SkRasterPipeline_BinaryOpCtx* packed) {
4205 copy_n_slots_masked_fn<4>(packed, base, execution_mask());
4206 }
4207
4208 template <int LoopCount, typename OffsetType>
4209 SI void shuffle_fn(std::byte* ptr, OffsetType* offsets, int numSlots) {
4210 F scratch[16];
4211 SK_UNROLL for (int count = 0; count < LoopCount; ++count) {
4212 scratch[count] = *(F*)(ptr + offsets[count]);
4213 }
4214 // Surprisingly, this switch generates significantly better code than a memcpy (on x86-64) when
4215 // the number of slots is unknown at compile time, and generates roughly identical code when the
4216 // number of slots is hardcoded. Using a switch allows `scratch` to live in ymm0-ymm15 instead
4217 // of being written out to the stack and then read back in. Also, the intrinsic memcpy assumes
4218 // that `numSlots` could be arbitrarily large, and so it emits more code than we need.
4219 F* dst = (F*)ptr;
4220 switch (numSlots) {
4221 case 16: dst[15] = scratch[15]; [[fallthrough]];
4222 case 15: dst[14] = scratch[14]; [[fallthrough]];
4223 case 14: dst[13] = scratch[13]; [[fallthrough]];
4224 case 13: dst[12] = scratch[12]; [[fallthrough]];
4225 case 12: dst[11] = scratch[11]; [[fallthrough]];
4226 case 11: dst[10] = scratch[10]; [[fallthrough]];
4227 case 10: dst[ 9] = scratch[ 9]; [[fallthrough]];
4228 case 9: dst[ 8] = scratch[ 8]; [[fallthrough]];
4229 case 8: dst[ 7] = scratch[ 7]; [[fallthrough]];
4230 case 7: dst[ 6] = scratch[ 6]; [[fallthrough]];
4231 case 6: dst[ 5] = scratch[ 5]; [[fallthrough]];
4232 case 5: dst[ 4] = scratch[ 4]; [[fallthrough]];
4233 case 4: dst[ 3] = scratch[ 3]; [[fallthrough]];
4234 case 3: dst[ 2] = scratch[ 2]; [[fallthrough]];
4235 case 2: dst[ 1] = scratch[ 1]; [[fallthrough]];
4236 case 1: dst[ 0] = scratch[ 0];
4237 }
4238 }
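// For illustration (hypothetical offsets): reversing two slots would pass byte offsets of
// {sizeof(F), 0}; `scratch` captures the old slot contents first, so an in-place shuffle
// like this doesn't clobber a source slot before it has been read.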
4239
4240 template <int N>
4241 SI void small_swizzle_fn(SkRasterPipeline_SwizzleCtx* packed, std::byte* base) {
4242 auto ctx = SkRPCtxUtils::Unpack(packed);
4243 shuffle_fn<N>(base + ctx.dst, ctx.offsets, N);
4244 }
4245
4246 STAGE_TAIL(swizzle_1, SkRasterPipeline_SwizzleCtx* packed) {
4247 small_swizzle_fn<1>(packed, base);
4248 }
4249 STAGE_TAIL(swizzle_2, SkRasterPipeline_SwizzleCtx* packed) {
4250 small_swizzle_fn<2>(packed, base);
4251 }
4252 STAGE_TAIL(swizzle_3, SkRasterPipeline_SwizzleCtx* packed) {
4253 small_swizzle_fn<3>(packed, base);
4254 }
4255 STAGE_TAIL(swizzle_4, SkRasterPipeline_SwizzleCtx* packed) {
4256 small_swizzle_fn<4>(packed, base);
4257 }
4258 STAGE_TAIL(shuffle, SkRasterPipeline_ShuffleCtx* ctx) {
4259 shuffle_fn<16>((std::byte*)ctx->ptr, ctx->offsets, ctx->count);
4260 }
4261
4262 template <int NumSlots>
4263 SI void swizzle_copy_masked_fn(I32* dst, const I32* src, uint16_t* offsets, I32 mask) {
4264 std::byte* dstB = (std::byte*)dst;
4265 SK_UNROLL for (int count = 0; count < NumSlots; ++count) {
4266 I32* dstS = (I32*)(dstB + *offsets);
4267 *dstS = if_then_else(mask, *src, *dstS);
4268 offsets += 1;
4269 src += 1;
4270 }
4271 }
4272
4273 STAGE_TAIL(swizzle_copy_slot_masked, SkRasterPipeline_SwizzleCopyCtx* ctx) {
4274 swizzle_copy_masked_fn<1>((I32*)ctx->dst, (const I32*)ctx->src, ctx->offsets, execution_mask());
4275 }
4276 STAGE_TAIL(swizzle_copy_2_slots_masked, SkRasterPipeline_SwizzleCopyCtx* ctx) {
4277 swizzle_copy_masked_fn<2>((I32*)ctx->dst, (const I32*)ctx->src, ctx->offsets, execution_mask());
4278 }
4279 STAGE_TAIL(swizzle_copy_3_slots_masked, SkRasterPipeline_SwizzleCopyCtx* ctx) {
4280 swizzle_copy_masked_fn<3>((I32*)ctx->dst, (const I32*)ctx->src, ctx->offsets, execution_mask());
4281 }
4282 STAGE_TAIL(swizzle_copy_4_slots_masked, SkRasterPipeline_SwizzleCopyCtx* ctx) {
4283 swizzle_copy_masked_fn<4>((I32*)ctx->dst, (const I32*)ctx->src, ctx->offsets, execution_mask());
4284 }
4285
4286 STAGE_TAIL(copy_from_indirect_unmasked, SkRasterPipeline_CopyIndirectCtx* ctx) {
4287 // Clamp the indirect offsets to stay within the limit.
4288 U32 offsets = *(const U32*)ctx->indirectOffset;
4289 offsets = min(offsets, U32_(ctx->indirectLimit));
4290
4291 // Scale up the offsets to account for the N lanes per value.
4292 offsets *= N;
4293
4294 // Adjust the offsets forward so that they fetch from the correct lane.
4295 static constexpr uint32_t iota[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4296 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride_highp);
4297 offsets += sk_unaligned_load<U32>(iota);
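// For illustration: with N == 8 lanes, an indirect offset of 2 skips forward two slots
// (2*8 ints) in the value area, and adding iota lets each lane fetch its own element
// within the selected slot.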
4298
4299 // Use gather to perform indirect lookups; write the results into `dst`.
4300 const int* src = ctx->src;
4301 I32* dst = (I32*)ctx->dst;
4302 I32* end = dst + ctx->slots;
4303 do {
4304 *dst = gather(src, offsets);
4305 dst += 1;
4306 src += N;
4307 } while (dst != end);
4308 }
4309
4310 STAGE_TAIL(copy_from_indirect_uniform_unmasked, SkRasterPipeline_CopyIndirectCtx* ctx) {
4311 // Clamp the indirect offsets to stay within the limit.
4312 U32 offsets = *(const U32*)ctx->indirectOffset;
4313 offsets = min(offsets, U32_(ctx->indirectLimit));
4314
4315 // Use gather to perform indirect lookups; write the results into `dst`.
4316 const int* src = ctx->src;
4317 I32* dst = (I32*)ctx->dst;
4318 I32* end = dst + ctx->slots;
4319 do {
4320 *dst = gather(src, offsets);
4321 dst += 1;
4322 src += 1;
4323 } while (dst != end);
4324 }
4325
4326 STAGE_TAIL(copy_to_indirect_masked, SkRasterPipeline_CopyIndirectCtx* ctx) {
4327 // Clamp the indirect offsets to stay within the limit.
4328 U32 offsets = *(const U32*)ctx->indirectOffset;
4329 offsets = min(offsets, U32_(ctx->indirectLimit));
4330
4331 // Scale up the offsets to account for the N lanes per value.
4332 offsets *= N;
4333
4334 // Adjust the offsets forward so that they store into the correct lane.
4335 static constexpr uint32_t iota[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4336 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride_highp);
4337 offsets += sk_unaligned_load<U32>(iota);
4338
4339 // Perform indirect, masked writes into `dst`.
4340 const I32* src = (const I32*)ctx->src;
4341 const I32* end = src + ctx->slots;
4342 int* dst = ctx->dst;
4343 I32 mask = execution_mask();
4344 do {
4345 scatter_masked(*src, dst, offsets, mask);
4346 dst += N;
4347 src += 1;
4348 } while (src != end);
4349 }
4350
4351 STAGE_TAIL(swizzle_copy_to_indirect_masked, SkRasterPipeline_SwizzleCopyIndirectCtx* ctx) {
4352 // Clamp the indirect offsets to stay within the limit.
4353 U32 offsets = *(const U32*)ctx->indirectOffset;
4354 offsets = min(offsets, U32_(ctx->indirectLimit));
4355
4356 // Scale up the offsets to account for the N lanes per value.
4357 offsets *= N;
4358
4359 // Adjust the offsets forward so that they store into the correct lane.
4360 static constexpr uint32_t iota[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
4361 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride_highp);
4362 offsets += sk_unaligned_load<U32>(iota);
4363
4364 // Perform indirect, masked, swizzled writes into `dst`.
4365 const I32* src = (const I32*)ctx->src;
4366 const I32* end = src + ctx->slots;
4367 std::byte* dstB = (std::byte*)ctx->dst;
4368 const uint16_t* swizzle = ctx->offsets;
4369 I32 mask = execution_mask();
4370 do {
4371 int* dst = (int*)(dstB + *swizzle);
4372 scatter_masked(*src, dst, offsets, mask);
4373 swizzle += 1;
4374 src += 1;
4375 } while (src != end);
4376 }
4377
4378 // Unary operations take a single input, and overwrite it with their output.
4379 // As with binary and ternary operations, we provide variations of 1-4 slots; unlike them, we don't
4380 // provide an arbitrary-width "n-slot" variation. The Builder can chain together longer sequences manually.
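// For example, abs_2_ints (declared below) applies abs_fn to two adjacent slots in place.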
4381 template <typename T, void (*ApplyFn)(T*)>
4382 SI void apply_adjacent_unary(T* dst, T* end) {
4383 do {
4384 ApplyFn(dst);
4385 dst += 1;
4386 } while (dst != end);
4387 }
4388
4389 #if defined(SKRP_CPU_SCALAR)
4390 template <typename T>
4391 SI void cast_to_float_from_fn(T* dst) {
4392 *dst = sk_bit_cast<T>((F)*dst);
4393 }
4394 SI void cast_to_int_from_fn(F* dst) {
4395 *dst = sk_bit_cast<F>((I32)*dst);
4396 }
4397 SI void cast_to_uint_from_fn(F* dst) {
4398 *dst = sk_bit_cast<F>((U32)*dst);
4399 }
4400 #else
4401 template <typename T>
4402 SI void cast_to_float_from_fn(T* dst) {
4403 *dst = sk_bit_cast<T>(__builtin_convertvector(*dst, F));
4404 }
4405 SI void cast_to_int_from_fn(F* dst) {
4406 *dst = sk_bit_cast<F>(__builtin_convertvector(*dst, I32));
4407 }
4408 SI void cast_to_uint_from_fn(F* dst) {
4409 *dst = sk_bit_cast<F>(__builtin_convertvector(*dst, U32));
4410 }
4411 #endif
4412
4413 SI void abs_fn(I32* dst) {
4414 *dst = abs_(*dst);
4415 }
4416
4417 SI void floor_fn(F* dst) {
4418 *dst = floor_(*dst);
4419 }
4420
4421 SI void ceil_fn(F* dst) {
4422 *dst = ceil_(*dst);
4423 }
4424
4425 SI void invsqrt_fn(F* dst) {
4426 *dst = rsqrt(*dst);
4427 }
4428
4429 #define DECLARE_UNARY_FLOAT(name) \
4430 STAGE_TAIL(name##_float, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 1); } \
4431 STAGE_TAIL(name##_2_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 2); } \
4432 STAGE_TAIL(name##_3_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 3); } \
4433 STAGE_TAIL(name##_4_floats, F* dst) { apply_adjacent_unary<F, &name##_fn>(dst, dst + 4); }
4434
4435 #define DECLARE_UNARY_INT(name) \
4436 STAGE_TAIL(name##_int, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 1); } \
4437 STAGE_TAIL(name##_2_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 2); } \
4438 STAGE_TAIL(name##_3_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 3); } \
4439 STAGE_TAIL(name##_4_ints, I32* dst) { apply_adjacent_unary<I32, &name##_fn>(dst, dst + 4); }
4440
4441 #define DECLARE_UNARY_UINT(name) \
4442 STAGE_TAIL(name##_uint, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 1); } \
4443 STAGE_TAIL(name##_2_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 2); } \
4444 STAGE_TAIL(name##_3_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 3); } \
4445 STAGE_TAIL(name##_4_uints, U32* dst) { apply_adjacent_unary<U32, &name##_fn>(dst, dst + 4); }
4446
4447 DECLARE_UNARY_INT(cast_to_float_from) DECLARE_UNARY_UINT(cast_to_float_from)
4448 DECLARE_UNARY_FLOAT(cast_to_int_from)
4449 DECLARE_UNARY_FLOAT(cast_to_uint_from)
4450 DECLARE_UNARY_FLOAT(floor)
4451 DECLARE_UNARY_FLOAT(ceil)
4452 DECLARE_UNARY_FLOAT(invsqrt)
4453 DECLARE_UNARY_INT(abs)
4454
4455 #undef DECLARE_UNARY_FLOAT
4456 #undef DECLARE_UNARY_INT
4457 #undef DECLARE_UNARY_UINT
4458
4459 // For complex unary ops, we only provide a 1-slot version to reduce code bloat.
4460 STAGE_TAIL(sin_float, F* dst) { *dst = sin_(*dst); }
4461 STAGE_TAIL(cos_float, F* dst) { *dst = cos_(*dst); }
4462 STAGE_TAIL(tan_float, F* dst) { *dst = tan_(*dst); }
4463 STAGE_TAIL(asin_float, F* dst) { *dst = asin_(*dst); }
4464 STAGE_TAIL(acos_float, F* dst) { *dst = acos_(*dst); }
4465 STAGE_TAIL(atan_float, F* dst) { *dst = atan_(*dst); }
4466 STAGE_TAIL(sqrt_float, F* dst) { *dst = sqrt_(*dst); }
4467 STAGE_TAIL(exp_float, F* dst) { *dst = approx_exp(*dst); }
4468 STAGE_TAIL(exp2_float, F* dst) { *dst = approx_pow2(*dst); }
4469 STAGE_TAIL(log_float, F* dst) { *dst = approx_log(*dst); }
4470 STAGE_TAIL(log2_float, F* dst) { *dst = approx_log2(*dst); }
4471
4472 STAGE_TAIL(inverse_mat2, F* dst) {
4473 F a00 = dst[0], a01 = dst[1],
4474 a10 = dst[2], a11 = dst[3];
4475 F det = nmad(a01, a10, a00 * a11),
4476 invdet = rcp_precise(det);
4477 dst[0] = invdet * a11;
4478 dst[1] = -invdet * a01;
4479 dst[2] = -invdet * a10;
4480 dst[3] = invdet * a00;
4481 }
4482
4483 STAGE_TAIL(inverse_mat3, F* dst) {
4484 F a00 = dst[0], a01 = dst[1], a02 = dst[2],
4485 a10 = dst[3], a11 = dst[4], a12 = dst[5],
4486 a20 = dst[6], a21 = dst[7], a22 = dst[8];
4487 F b01 = nmad(a12, a21, a22 * a11),
4488 b11 = nmad(a22, a10, a12 * a20),
4489 b21 = nmad(a11, a20, a21 * a10);
4490 F det = mad(a00, b01, mad(a01, b11, a02 * b21)),
4491 invdet = rcp_precise(det);
4492 dst[0] = invdet * b01;
4493 dst[1] = invdet * nmad(a22, a01, a02 * a21);
4494 dst[2] = invdet * nmad(a02, a11, a12 * a01);
4495 dst[3] = invdet * b11;
4496 dst[4] = invdet * nmad(a02, a20, a22 * a00);
4497 dst[5] = invdet * nmad(a12, a00, a02 * a10);
4498 dst[6] = invdet * b21;
4499 dst[7] = invdet * nmad(a21, a00, a01 * a20);
4500 dst[8] = invdet * nmad(a01, a10, a11 * a00);
4501 }
4502
4503 STAGE_TAIL(inverse_mat4, F* dst) {
4504 F a00 = dst[0], a01 = dst[1], a02 = dst[2], a03 = dst[3],
4505 a10 = dst[4], a11 = dst[5], a12 = dst[6], a13 = dst[7],
4506 a20 = dst[8], a21 = dst[9], a22 = dst[10], a23 = dst[11],
4507 a30 = dst[12], a31 = dst[13], a32 = dst[14], a33 = dst[15];
4508 F b00 = nmad(a01, a10, a00 * a11),
4509 b01 = nmad(a02, a10, a00 * a12),
4510 b02 = nmad(a03, a10, a00 * a13),
4511 b03 = nmad(a02, a11, a01 * a12),
4512 b04 = nmad(a03, a11, a01 * a13),
4513 b05 = nmad(a03, a12, a02 * a13),
4514 b06 = nmad(a21, a30, a20 * a31),
4515 b07 = nmad(a22, a30, a20 * a32),
4516 b08 = nmad(a23, a30, a20 * a33),
4517 b09 = nmad(a22, a31, a21 * a32),
4518 b10 = nmad(a23, a31, a21 * a33),
4519 b11 = nmad(a23, a32, a22 * a33),
4520 det = mad(b00, b11, b05 * b06) + mad(b02, b09, b03 * b08) - mad(b01, b10, b04 * b07),
4521 invdet = rcp_precise(det);
4522 b00 *= invdet;
4523 b01 *= invdet;
4524 b02 *= invdet;
4525 b03 *= invdet;
4526 b04 *= invdet;
4527 b05 *= invdet;
4528 b06 *= invdet;
4529 b07 *= invdet;
4530 b08 *= invdet;
4531 b09 *= invdet;
4532 b10 *= invdet;
4533 b11 *= invdet;
4534 dst[0] = mad(a13, b09, nmad(a12, b10, a11*b11));
4535 dst[1] = nmad(a03, b09, nmad(a01, b11, a02*b10));
4536 dst[2] = mad(a33, b03, nmad(a32, b04, a31*b05));
4537 dst[3] = nmad(a23, b03, nmad(a21, b05, a22*b04));
4538 dst[4] = nmad(a13, b07, nmad(a10, b11, a12*b08));
4539 dst[5] = mad(a03, b07, nmad(a02, b08, a00*b11));
4540 dst[6] = nmad(a33, b01, nmad(a30, b05, a32*b02));
4541 dst[7] = mad(a23, b01, nmad(a22, b02, a20*b05));
4542 dst[8] = mad(a13, b06, nmad(a11, b08, a10*b10));
4543 dst[9] = nmad(a03, b06, nmad(a00, b10, a01*b08));
4544 dst[10] = mad(a33, b00, nmad(a31, b02, a30*b04));
4545 dst[11] = nmad(a23, b00, nmad(a20, b04, a21*b02));
4546 dst[12] = nmad(a12, b06, nmad(a10, b09, a11*b07));
4547 dst[13] = mad(a02, b06, nmad(a01, b07, a00*b09));
4548 dst[14] = nmad(a32, b00, nmad(a30, b03, a31*b01));
4549 dst[15] = mad(a22, b00, nmad(a21, b01, a20*b03));
4550 }
4551
4552 // Binary operations take two adjacent inputs, and write their output in the first position.
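// For example, add_2_floats sees four adjacent slots [d0 d1 s0 s1] and leaves them as
// [d0+s0 d1+s1 s0 s1].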
4553 template <typename T, void (*ApplyFn)(T*, T*)>
4554 SI void apply_adjacent_binary(T* dst, T* src) {
4555 T* end = src;
4556 do {
4557 ApplyFn(dst, src);
4558 dst += 1;
4559 src += 1;
4560 } while (dst != end);
4561 }
4562
4563 template <typename T, void (*ApplyFn)(T*, T*)>
4564 SI void apply_adjacent_binary_packed(SkRasterPipeline_BinaryOpCtx* packed, std::byte* base) {
4565 auto ctx = SkRPCtxUtils::Unpack(packed);
4566 std::byte* dst = base + ctx.dst;
4567 std::byte* src = base + ctx.src;
4568 apply_adjacent_binary<T, ApplyFn>((T*)dst, (T*)src);
4569 }
4570
4571 template <int N, typename V, typename S, void (*ApplyFn)(V*, V*)>
4572 SI void apply_binary_immediate(SkRasterPipeline_ConstantCtx* packed, std::byte* base) {
4573 auto ctx = SkRPCtxUtils::Unpack(packed);
4574 V* dst = (V*)(base + ctx.dst); // get a pointer to the destination
4575 S scalar = sk_bit_cast<S>(ctx.value); // bit-pun the constant value as desired
4576 V src = scalar - V(); // broadcast the constant value into a vector
4577 SK_UNROLL for (int index = 0; index < N; ++index) {
4578 ApplyFn(dst, &src); // perform the operation
4579 dst += 1;
4580 }
4581 }
4582
4583 template <typename T>
4584 SI void add_fn(T* dst, T* src) {
4585 *dst += *src;
4586 }
4587
4588 template <typename T>
4589 SI void sub_fn(T* dst, T* src) {
4590 *dst -= *src;
4591 }
4592
4593 template <typename T>
4594 SI void mul_fn(T* dst, T* src) {
4595 *dst *= *src;
4596 }
4597
4598 template <typename T>
4599 SI void div_fn(T* dst, T* src) {
4600 T divisor = *src;
4601 if constexpr (!std::is_same_v<T, F>) {
4602 // We will crash if we integer-divide against zero. Convert 0 to ~0 to avoid this.
4603 divisor |= (T)cond_to_mask(divisor == 0);
4604 }
4605 *dst /= divisor;
4606 }
4607
4608 SI void bitwise_and_fn(I32* dst, I32* src) {
4609 *dst &= *src;
4610 }
4611
4612 SI void bitwise_or_fn(I32* dst, I32* src) {
4613 *dst |= *src;
4614 }
4615
4616 SI void bitwise_xor_fn(I32* dst, I32* src) {
4617 *dst ^= *src;
4618 }
4619
4620 template <typename T>
4621 SI void max_fn(T* dst, T* src) {
4622 *dst = max(*dst, *src);
4623 }
4624
4625 template <typename T>
4626 SI void min_fn(T* dst, T* src) {
4627 *dst = min(*dst, *src);
4628 }
4629
4630 template <typename T>
4631 SI void cmplt_fn(T* dst, T* src) {
4632 static_assert(sizeof(T) == sizeof(I32));
4633 I32 result = cond_to_mask(*dst < *src);
4634 memcpy(dst, &result, sizeof(I32));
4635 }
4636
4637 template <typename T>
4638 SI void cmple_fn(T* dst, T* src) {
4639 static_assert(sizeof(T) == sizeof(I32));
4640 I32 result = cond_to_mask(*dst <= *src);
4641 memcpy(dst, &result, sizeof(I32));
4642 }
4643
4644 template <typename T>
4645 SI void cmpeq_fn(T* dst, T* src) {
4646 static_assert(sizeof(T) == sizeof(I32));
4647 I32 result = cond_to_mask(*dst == *src);
4648 memcpy(dst, &result, sizeof(I32));
4649 }
4650
4651 template <typename T>
4652 SI void cmpne_fn(T* dst, T* src) {
4653 static_assert(sizeof(T) == sizeof(I32));
4654 I32 result = cond_to_mask(*dst != *src);
4655 memcpy(dst, &result, sizeof(I32));
4656 }
4657
4658 SI void atan2_fn(F* dst, F* src) {
4659 *dst = atan2_(*dst, *src);
4660 }
4661
4662 SI void pow_fn(F* dst, F* src) {
4663 *dst = approx_powf(*dst, *src);
4664 }
4665
4666 SI void mod_fn(F* dst, F* src) {
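// Matches GLSL mod(): dst - src * floor(dst / src), since nmad(f, m, a) computes a - f*m.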
4667 *dst = nmad(*src, floor_(*dst / *src), *dst);
4668 }
4669
4670 #define DECLARE_N_WAY_BINARY_FLOAT(name) \
4671 STAGE_TAIL(name##_n_floats, SkRasterPipeline_BinaryOpCtx* packed) { \
4672 apply_adjacent_binary_packed<F, &name##_fn>(packed, base); \
4673 }
4674
4675 #define DECLARE_BINARY_FLOAT(name) \
4676 STAGE_TAIL(name##_float, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 1); } \
4677 STAGE_TAIL(name##_2_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 2); } \
4678 STAGE_TAIL(name##_3_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 3); } \
4679 STAGE_TAIL(name##_4_floats, F* dst) { apply_adjacent_binary<F, &name##_fn>(dst, dst + 4); } \
4680 DECLARE_N_WAY_BINARY_FLOAT(name)
4681
4682 #define DECLARE_N_WAY_BINARY_INT(name) \
4683 STAGE_TAIL(name##_n_ints, SkRasterPipeline_BinaryOpCtx* packed) { \
4684 apply_adjacent_binary_packed<I32, &name##_fn>(packed, base); \
4685 }
4686
4687 #define DECLARE_BINARY_INT(name) \
4688 STAGE_TAIL(name##_int, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 1); } \
4689 STAGE_TAIL(name##_2_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 2); } \
4690 STAGE_TAIL(name##_3_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 3); } \
4691 STAGE_TAIL(name##_4_ints, I32* dst) { apply_adjacent_binary<I32, &name##_fn>(dst, dst + 4); } \
4692 DECLARE_N_WAY_BINARY_INT(name)
4693
4694 #define DECLARE_N_WAY_BINARY_UINT(name) \
4695 STAGE_TAIL(name##_n_uints, SkRasterPipeline_BinaryOpCtx* packed) { \
4696 apply_adjacent_binary_packed<U32, &name##_fn>(packed, base); \
4697 }
4698
4699 #define DECLARE_BINARY_UINT(name) \
4700 STAGE_TAIL(name##_uint, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 1); } \
4701 STAGE_TAIL(name##_2_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 2); } \
4702 STAGE_TAIL(name##_3_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 3); } \
4703 STAGE_TAIL(name##_4_uints, U32* dst) { apply_adjacent_binary<U32, &name##_fn>(dst, dst + 4); } \
4704 DECLARE_N_WAY_BINARY_UINT(name)
4705
4706 // Many ops reuse the int stages when performing uint arithmetic, since they're equivalent on a
4707 // two's-complement machine. (Even multiplication is equivalent in the lower 32 bits.)
4708 DECLARE_BINARY_FLOAT(add) DECLARE_BINARY_INT(add)
4709 DECLARE_BINARY_FLOAT(sub) DECLARE_BINARY_INT(sub)
4710 DECLARE_BINARY_FLOAT(mul) DECLARE_BINARY_INT(mul)
4711 DECLARE_BINARY_FLOAT(div) DECLARE_BINARY_INT(div) DECLARE_BINARY_UINT(div)
4712 DECLARE_BINARY_INT(bitwise_and)
4713 DECLARE_BINARY_INT(bitwise_or)
4714 DECLARE_BINARY_INT(bitwise_xor)
4715 DECLARE_BINARY_FLOAT(mod)
4716 DECLARE_BINARY_FLOAT(min) DECLARE_BINARY_INT(min) DECLARE_BINARY_UINT(min)
4717 DECLARE_BINARY_FLOAT(max) DECLARE_BINARY_INT(max) DECLARE_BINARY_UINT(max)
4718 DECLARE_BINARY_FLOAT(cmplt) DECLARE_BINARY_INT(cmplt) DECLARE_BINARY_UINT(cmplt)
4719 DECLARE_BINARY_FLOAT(cmple) DECLARE_BINARY_INT(cmple) DECLARE_BINARY_UINT(cmple)
4720 DECLARE_BINARY_FLOAT(cmpeq) DECLARE_BINARY_INT(cmpeq)
4721 DECLARE_BINARY_FLOAT(cmpne) DECLARE_BINARY_INT(cmpne)
4722
4723 // Sufficiently complex ops only provide an N-way version, to avoid code bloat from the dedicated
4724 // 1-4 slot versions.
4725 DECLARE_N_WAY_BINARY_FLOAT(atan2)
4726 DECLARE_N_WAY_BINARY_FLOAT(pow)
4727
4728 // Some ops have an optimized version when the right-side is an immediate value.
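// For example, add_imm_float adds a single float constant (broadcast across every lane) to one
// slot in place, without needing a second slot to hold the constant.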
4729 #define DECLARE_IMM_BINARY_FLOAT(name) \
4730 STAGE_TAIL(name##_imm_float, SkRasterPipeline_ConstantCtx* packed) { \
4731 apply_binary_immediate<1, F, float, &name##_fn>(packed, base); \
4732 }
4733 #define DECLARE_IMM_BINARY_INT(name) \
4734 STAGE_TAIL(name##_imm_int, SkRasterPipeline_ConstantCtx* packed) { \
4735 apply_binary_immediate<1, I32, int32_t, &name##_fn>(packed, base); \
4736 }
4737 #define DECLARE_MULTI_IMM_BINARY_INT(name) \
4738 STAGE_TAIL(name##_imm_int, SkRasterPipeline_ConstantCtx* packed) { \
4739 apply_binary_immediate<1, I32, int32_t, &name##_fn>(packed, base); \
4740 } \
4741 STAGE_TAIL(name##_imm_2_ints, SkRasterPipeline_ConstantCtx* packed) { \
4742 apply_binary_immediate<2, I32, int32_t, &name##_fn>(packed, base); \
4743 } \
4744 STAGE_TAIL(name##_imm_3_ints, SkRasterPipeline_ConstantCtx* packed) { \
4745 apply_binary_immediate<3, I32, int32_t, &name##_fn>(packed, base); \
4746 } \
4747 STAGE_TAIL(name##_imm_4_ints, SkRasterPipeline_ConstantCtx* packed) { \
4748 apply_binary_immediate<4, I32, int32_t, &name##_fn>(packed, base); \
4749 }
4750 #define DECLARE_IMM_BINARY_UINT(name) \
4751 STAGE_TAIL(name##_imm_uint, SkRasterPipeline_ConstantCtx* packed) { \
4752 apply_binary_immediate<1, U32, uint32_t, &name##_fn>(packed, base); \
4753 }
4754
4755 DECLARE_IMM_BINARY_FLOAT(add) DECLARE_IMM_BINARY_INT(add)
4756 DECLARE_IMM_BINARY_FLOAT(mul) DECLARE_IMM_BINARY_INT(mul)
4757 DECLARE_MULTI_IMM_BINARY_INT(bitwise_and)
4758 DECLARE_IMM_BINARY_FLOAT(max)
4759 DECLARE_IMM_BINARY_FLOAT(min)
4760 DECLARE_IMM_BINARY_INT(bitwise_xor)
4761 DECLARE_IMM_BINARY_FLOAT(cmplt) DECLARE_IMM_BINARY_INT(cmplt) DECLARE_IMM_BINARY_UINT(cmplt)
4762 DECLARE_IMM_BINARY_FLOAT(cmple) DECLARE_IMM_BINARY_INT(cmple) DECLARE_IMM_BINARY_UINT(cmple)
4763 DECLARE_IMM_BINARY_FLOAT(cmpeq) DECLARE_IMM_BINARY_INT(cmpeq)
4764 DECLARE_IMM_BINARY_FLOAT(cmpne) DECLARE_IMM_BINARY_INT(cmpne)
4765
4766 #undef DECLARE_MULTI_IMM_BINARY_INT
4767 #undef DECLARE_IMM_BINARY_FLOAT
4768 #undef DECLARE_IMM_BINARY_INT
4769 #undef DECLARE_IMM_BINARY_UINT
4770 #undef DECLARE_BINARY_FLOAT
4771 #undef DECLARE_BINARY_INT
4772 #undef DECLARE_BINARY_UINT
4773 #undef DECLARE_N_WAY_BINARY_FLOAT
4774 #undef DECLARE_N_WAY_BINARY_INT
4775 #undef DECLARE_N_WAY_BINARY_UINT
4776
4777 // Dots can be represented with multiply and add ops, but they are so foundational that it's worth
4778 // having dedicated ops.
4779 STAGE_TAIL(dot_2_floats, F* dst) {
4780 dst[0] = mad(dst[0], dst[2],
4781 dst[1] * dst[3]);
4782 }
4783
4784 STAGE_TAIL(dot_3_floats, F* dst) {
4785 dst[0] = mad(dst[0], dst[3],
4786 mad(dst[1], dst[4],
4787 dst[2] * dst[5]));
4788 }
4789
4790 STAGE_TAIL(dot_4_floats, F* dst) {
4791 dst[0] = mad(dst[0], dst[4],
4792 mad(dst[1], dst[5],
4793 mad(dst[2], dst[6],
4794 dst[3] * dst[7])));
4795 }
4796
4797 // MxM, VxM and MxV multiplication all use matrix_multiply. Vectors are treated like a matrix with a
4798 // single column or row.
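// For example, in a mat2*mat2 multiply (matrix_multiply_2), the slots are laid out as
// [result: 4 slots][left: 4 slots][right: 4 slots], all column-major; result element
// (column c, row r) is the dot product of left row r with right column c.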
4799 template <int N>
4800 SI void matrix_multiply(SkRasterPipeline_MatrixMultiplyCtx* packed, std::byte* base) {
4801 auto ctx = SkRPCtxUtils::Unpack(packed);
4802
4803 int outColumns = ctx.rightColumns,
4804 outRows = ctx.leftRows;
4805
4806 SkASSERT(outColumns >= 1);
4807 SkASSERT(outRows >= 1);
4808 SkASSERT(outColumns <= 4);
4809 SkASSERT(outRows <= 4);
4810
4811 SkASSERT(ctx.leftColumns == ctx.rightRows);
4812 SkASSERT(N == ctx.leftColumns); // N should match the result width
4813
4814 #if !defined(SKRP_CPU_SCALAR)
4815 // This prevents Clang from generating early-out checks for zero-sized matrices.
4816 SK_ASSUME(outColumns >= 1);
4817 SK_ASSUME(outRows >= 1);
4818 SK_ASSUME(outColumns <= 4);
4819 SK_ASSUME(outRows <= 4);
4820 #endif
4821
4822 // Get pointers to the adjacent left- and right-matrices.
4823 F* resultMtx = (F*)(base + ctx.dst);
4824 F* leftMtx = &resultMtx[ctx.rightColumns * ctx.leftRows];
4825 F* rightMtx = &leftMtx[N * ctx.leftRows];
4826
4827 // Emit each matrix element.
4828 for (int c = 0; c < outColumns; ++c) {
4829 for (int r = 0; r < outRows; ++r) {
4830 // Dot a vector from leftMtx[*][r] with rightMtx[c][*].
4831 F* leftRow = &leftMtx [r];
4832 F* rightColumn = &rightMtx[c * N];
4833
4834 F element = *leftRow * *rightColumn;
4835 for (int idx = 1; idx < N; ++idx) {
4836 leftRow += outRows;
4837 rightColumn += 1;
4838 element = mad(*leftRow, *rightColumn, element);
4839 }
4840
4841 *resultMtx++ = element;
4842 }
4843 }
4844 }
4845
4846 STAGE_TAIL(matrix_multiply_2, SkRasterPipeline_MatrixMultiplyCtx* packed) {
4847 matrix_multiply<2>(packed, base);
4848 }
4849
4850 STAGE_TAIL(matrix_multiply_3, SkRasterPipeline_MatrixMultiplyCtx* packed) {
4851 matrix_multiply<3>(packed, base);
4852 }
4853
4854 STAGE_TAIL(matrix_multiply_4, SkRasterPipeline_MatrixMultiplyCtx* packed) {
4855 matrix_multiply<4>(packed, base);
4856 }
4857
4858 // Refract always operates on 4-wide incident and normal vectors; for narrower inputs, the code
4859 // generator fills in the input columns with zero, and discards the extra output columns.
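// Reference formula, per the linked refract() documentation:
//   k = 1 - eta^2 * (1 - dot(N, I)^2)
//   R = (k < 0) ? 0 : eta*I - (eta*dot(N, I) + sqrt(k)) * N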
4860 STAGE_TAIL(refract_4_floats, F* dst) {
4861 // Algorithm adapted from https://registry.khronos.org/OpenGL-Refpages/gl4/html/refract.xhtml
4862 F *incident = dst + 0;
4863 F *normal = dst + 4;
4864 F eta = dst[8];
4865
4866 F dotNI = mad(normal[0], incident[0],
4867 mad(normal[1], incident[1],
4868 mad(normal[2], incident[2],
4869 normal[3] * incident[3])));
4870
4871 F k = 1.0 - eta * eta * (1.0 - dotNI * dotNI);
4872 F sqrt_k = sqrt_(k);
4873
4874 for (int idx = 0; idx < 4; ++idx) {
4875 dst[idx] = if_then_else(k >= 0,
4876 eta * incident[idx] - (eta * dotNI + sqrt_k) * normal[idx],
4877 0.0);
4878 }
4879 }
4880
4881 // Ternary operations work like binary ops (see immediately above) but take two source inputs.
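// For example, mix_2_floats sees six adjacent slots [a0 a1 x0 x1 y0 y1] and overwrites a0,a1
// with lerp(x,y,a).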
4882 template <typename T, void (*ApplyFn)(T*, T*, T*)>
4883 SI void apply_adjacent_ternary(T* dst, T* src0, T* src1) {
4884 int count = src0 - dst;
4885 #if !defined(SKRP_CPU_SCALAR)
4886 SK_ASSUME(count >= 1);
4887 #endif
4888
4889 for (int index = 0; index < count; ++index) {
4890 ApplyFn(dst, src0, src1);
4891 dst += 1;
4892 src0 += 1;
4893 src1 += 1;
4894 }
4895 }
4896
4897 template <typename T, void (*ApplyFn)(T*, T*, T*)>
4898 SI void apply_adjacent_ternary_packed(SkRasterPipeline_TernaryOpCtx* packed, std::byte* base) {
4899 auto ctx = SkRPCtxUtils::Unpack(packed);
4900 std::byte* dst = base + ctx.dst;
4901 std::byte* src0 = dst + ctx.delta;
4902 std::byte* src1 = src0 + ctx.delta;
4903 apply_adjacent_ternary<T, ApplyFn>((T*)dst, (T*)src0, (T*)src1);
4904 }
4905
4906 SI void mix_fn(F* a, F* x, F* y) {
4907 // We reorder the arguments here to match lerp's GLSL-style order (interpolation point last).
4908 *a = lerp(*x, *y, *a);
4909 }
4910
4911 SI void mix_fn(I32* a, I32* x, I32* y) {
4912 // We reorder the arguments here to match if_then_else's expected order (y before x).
4913 *a = if_then_else(*a, *y, *x);
4914 }
4915
4916 SI void smoothstep_fn(F* edge0, F* edge1, F* x) {
4917 F t = clamp_01_((*x - *edge0) / (*edge1 - *edge0));
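// Hermite interpolation of the clamped parameter: 3t^2 - 2t^3.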
4918 *edge0 = t * t * (3.0 - 2.0 * t);
4919 }
4920
4921 #define DECLARE_N_WAY_TERNARY_FLOAT(name) \
4922 STAGE_TAIL(name##_n_floats, SkRasterPipeline_TernaryOpCtx* packed) { \
4923 apply_adjacent_ternary_packed<F, &name##_fn>(packed, base); \
4924 }
4925
4926 #define DECLARE_TERNARY_FLOAT(name) \
4927 STAGE_TAIL(name##_float, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+1, p+2); } \
4928 STAGE_TAIL(name##_2_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+2, p+4); } \
4929 STAGE_TAIL(name##_3_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+3, p+6); } \
4930 STAGE_TAIL(name##_4_floats, F* p) { apply_adjacent_ternary<F, &name##_fn>(p, p+4, p+8); } \
4931 DECLARE_N_WAY_TERNARY_FLOAT(name)
4932
4933 #define DECLARE_TERNARY_INT(name) \
4934 STAGE_TAIL(name##_int, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+1, p+2); } \
4935 STAGE_TAIL(name##_2_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+2, p+4); } \
4936 STAGE_TAIL(name##_3_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+3, p+6); } \
4937 STAGE_TAIL(name##_4_ints, I32* p) { apply_adjacent_ternary<I32, &name##_fn>(p, p+4, p+8); } \
4938 STAGE_TAIL(name##_n_ints, SkRasterPipeline_TernaryOpCtx* packed) { \
4939 apply_adjacent_ternary_packed<I32, &name##_fn>(packed, base); \
4940 }
4941
4942 DECLARE_N_WAY_TERNARY_FLOAT(smoothstep)
4943 DECLARE_TERNARY_FLOAT(mix)
4944 DECLARE_TERNARY_INT(mix)
4945
4946 #undef DECLARE_N_WAY_TERNARY_FLOAT
4947 #undef DECLARE_TERNARY_FLOAT
4948 #undef DECLARE_TERNARY_INT
4949
4950 STAGE(gauss_a_to_rgba, NoCtx) {
4951 // x = 1 - x;
4952 // exp(-x * x * 4) - 0.018f;
4953 // ... now approximate with quartic
4954 //
4955 const float c4 = -2.26661229133605957031f;
4956 const float c3 = 2.89795351028442382812f;
4957 const float c2 = 0.21345567703247070312f;
4958 const float c1 = 0.15489584207534790039f;
4959 const float c0 = 0.00030726194381713867f;
4960 a = mad(a, mad(a, mad(a, mad(a, c4, c3), c2), c1), c0);
4961 r = a;
4962 g = a;
4963 b = a;
4964 }
4965
4966 // A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
4967 STAGE(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
4968 // (cx,cy) are the center of our sample.
4969 F cx = r,
4970 cy = g;
4971
4972 // All sample points are at the same fractional offset (fx,fy).
4973 // They're the 4 corners of a logical 1x1 pixel surrounding (x,y) at (0.5,0.5) offsets.
4974 F fx = fract(cx + 0.5f),
4975 fy = fract(cy + 0.5f);
4976
4977 // We'll accumulate the color of all four samples into {r,g,b,a} directly.
4978 r = g = b = a = F0;
4979
4980 for (float py = -0.5f; py <= +0.5f; py += 1.0f)
4981 for (float px = -0.5f; px <= +0.5f; px += 1.0f) {
4982 // (x,y) are the coordinates of this sample point.
4983 F x = cx + px,
4984 y = cy + py;
4985
4986 // ix_and_ptr() will clamp to the image's bounds for us.
4987 const uint32_t* ptr;
4988 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
4989
4990 F sr,sg,sb,sa;
4991 from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa);
4992
4993 // In bilinear interpolation, the 4 pixels at +/- 0.5 offsets from the sample pixel center
4994 // are combined in direct proportion to their area overlapping that logical query pixel.
4995 // At positive offsets, the x-axis contribution to that rectangle is fx,
4996 // or (1-fx) at negative x. Same deal for y.
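// For instance (illustrative fractions): with fx = 0.25 and fy = 0.75, the four samples get
// areas 0.1875, 0.0625, 0.5625, and 0.1875, which sum to 1.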
4997 F sx = (px > 0) ? fx : 1.0f - fx,
4998 sy = (py > 0) ? fy : 1.0f - fy,
4999 area = sx * sy;
5000
5001 r += sr * area;
5002 g += sg * area;
5003 b += sb * area;
5004 a += sa * area;
5005 }
5006 }
5007
5008 // A specialized fused image shader for clamp-x, clamp-y, non-sRGB sampling.
5009 STAGE(bicubic_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
5010 // (cx,cy) are the center of our sample.
5011 F cx = r,
5012 cy = g;
5013
5014 // All sample points share the same fractional offset (fx,fy) relative to their pixel centers.
5015 // They form a 4x4 grid of pixels surrounding (cx,cy).
5016 F fx = fract(cx + 0.5f),
5017 fy = fract(cy + 0.5f);
5018
5019 // We'll accumulate the color of all sixteen samples into {r,g,b,a} directly.
5020 r = g = b = a = F0;
5021
5022 const float* w = ctx->weights;
5023 const F scaley[4] = {bicubic_wts(fy, w[0], w[4], w[ 8], w[12]),
5024 bicubic_wts(fy, w[1], w[5], w[ 9], w[13]),
5025 bicubic_wts(fy, w[2], w[6], w[10], w[14]),
5026 bicubic_wts(fy, w[3], w[7], w[11], w[15])};
5027 const F scalex[4] = {bicubic_wts(fx, w[0], w[4], w[ 8], w[12]),
5028 bicubic_wts(fx, w[1], w[5], w[ 9], w[13]),
5029 bicubic_wts(fx, w[2], w[6], w[10], w[14]),
5030 bicubic_wts(fx, w[3], w[7], w[11], w[15])};
5031
5032 F sample_y = cy - 1.5f;
5033 for (int yy = 0; yy <= 3; ++yy) {
5034 F sample_x = cx - 1.5f;
5035 for (int xx = 0; xx <= 3; ++xx) {
5036 F scale = scalex[xx] * scaley[yy];
5037
5038 // ix_and_ptr() will clamp to the image's bounds for us.
5039 const uint32_t* ptr;
5040 U32 ix = ix_and_ptr(&ptr, ctx, sample_x, sample_y);
5041
5042 F sr,sg,sb,sa;
5043 from_8888(gather(ptr, ix), &sr,&sg,&sb,&sa);
5044
5045 r = mad(scale, sr, r);
5046 g = mad(scale, sg, g);
5047 b = mad(scale, sb, b);
5048 a = mad(scale, sa, a);
5049
5050 sample_x += 1;
5051 }
5052 sample_y += 1;
5053 }
5054 }
5055
5056 // ~~~~~~ skgpu::Swizzle stage ~~~~~~ //
5057
5058 STAGE(swizzle, void* ctx) {
5059 auto ir = r, ig = g, ib = b, ia = a;
5060 F* o[] = {&r, &g, &b, &a};
5061 char swiz[4];
5062 memcpy(swiz, &ctx, sizeof(swiz));
5063
5064 for (int i = 0; i < 4; ++i) {
5065 switch (swiz[i]) {
5066 case 'r': *o[i] = ir; break;
5067 case 'g': *o[i] = ig; break;
5068 case 'b': *o[i] = ib; break;
5069 case 'a': *o[i] = ia; break;
5070 case '0': *o[i] = F0; break;
5071 case '1': *o[i] = F1; break;
5072 default: break;
5073 }
5074 }
5075 }
5076
5077 namespace lowp {
5078 #if defined(SKRP_CPU_SCALAR) || defined(SK_ENABLE_OPTIMIZE_SIZE) || \
5079 defined(SK_BUILD_FOR_GOOGLE3) || defined(SK_DISABLE_LOWP_RASTER_PIPELINE)
5080 // We don't bother generating the lowp stages if we are:
5081 // - ... in scalar mode (MSVC, old clang, etc...)
5082 // - ... trying to save code size
5083 // - ... building for Google3. (No justification for this, but changing it would be painful).
5084 // - ... explicitly disabling it. This is currently just used by Flutter.
5085 //
5086 // Having nullptr for every stage will cause SkRasterPipeline to always use the highp stages.
5087 #define M(st) static void (*st)(void) = nullptr;
5088 SK_RASTER_PIPELINE_OPS_LOWP(M)
5089 #undef M
5090 static void (*just_return)(void) = nullptr;
5091
5092 static void start_pipeline(size_t,size_t,size_t,size_t, SkRasterPipelineStage*,
5093 SkSpan<SkRasterPipeline_MemoryCtxPatch>,
5094 uint8_t* tailPointer) {}
5095
5096 #else // We are compiling vector code with Clang... let's make some lowp stages!
5097
5098 #if defined(SKRP_CPU_SKX) || defined(SKRP_CPU_HSW) || defined(SKRP_CPU_LASX)
5099 template <typename T> using V = Vec<16, T>;
5100 #else
5101 template <typename T> using V = Vec<8, T>;
5102 #endif
5103
5104 using U8 = V<uint8_t >;
5105 using U16 = V<uint16_t>;
5106 using I16 = V< int16_t>;
5107 using I32 = V< int32_t>;
5108 using U32 = V<uint32_t>;
5109 using I64 = V< int64_t>;
5110 using U64 = V<uint64_t>;
5111 using F = V<float >;
5112
5113 static constexpr size_t N = sizeof(U16) / sizeof(uint16_t);
5114
5115 // Promotion helpers (for GCC)
5116 #if defined(__clang__)
5117 SI constexpr U16 U16_(uint16_t x) { return x; }
5118 SI constexpr I32 I32_( int32_t x) { return x; }
5119 SI constexpr U32 U32_(uint32_t x) { return x; }
5120 SI constexpr F F_ (float x) { return x; }
5121 #else
5122 SI constexpr U16 U16_(uint16_t x) { return x + U16(); }
5123 SI constexpr I32 I32_( int32_t x) { return x + I32(); }
5124 SI constexpr U32 U32_(uint32_t x) { return x + U32(); }
5125 SI constexpr F F_ (float x) { return x - F (); }
5126 #endif
5127
5128 static constexpr U16 U16_0 = U16_(0),
5129 U16_255 = U16_(255);
5130
5131 // Once again, some platforms benefit from a restricted Stage calling convention,
5132 // but others can pass tons and tons of registers and we're happy to exploit that.
5133 // It's exactly the same decision and implementation strategy as the F stages above.
5134 #if SKRP_NARROW_STAGES
5135 struct Params {
5136 size_t dx, dy;
5137 U16 dr,dg,db,da;
5138 };
5139 using Stage = void (ABI*)(Params*, SkRasterPipelineStage* program, U16 r, U16 g, U16 b, U16 a);
5140 #else
5141 using Stage = void (ABI*)(SkRasterPipelineStage* program,
5142 size_t dx, size_t dy,
5143 U16 r, U16 g, U16 b, U16 a,
5144 U16 dr, U16 dg, U16 db, U16 da);
5145 #endif
5146
5147 static void start_pipeline(size_t x0, size_t y0,
5148 size_t xlimit, size_t ylimit,
5149 SkRasterPipelineStage* program,
5150 SkSpan<SkRasterPipeline_MemoryCtxPatch> memoryCtxPatches,
5151 uint8_t* tailPointer) {
5152 uint8_t unreferencedTail;
5153 if (!tailPointer) {
5154 tailPointer = &unreferencedTail;
5155 }
5156 auto start = (Stage)program->fn;
5157 for (size_t dy = y0; dy < ylimit; dy++) {
5158 #if SKRP_NARROW_STAGES
5159 Params params = { x0,dy, U16_0,U16_0,U16_0,U16_0 };
5160 for (; params.dx + N <= xlimit; params.dx += N) {
5161 start(&params, program, U16_0,U16_0,U16_0,U16_0);
5162 }
5163 if (size_t tail = xlimit - params.dx) {
5164 *tailPointer = tail;
5165 patch_memory_contexts(memoryCtxPatches, params.dx, dy, tail);
5166 start(&params, program, U16_0,U16_0,U16_0,U16_0);
5167 restore_memory_contexts(memoryCtxPatches, params.dx, dy, tail);
5168 *tailPointer = 0xFF;
5169 }
5170 #else
5171 size_t dx = x0;
5172 for (; dx + N <= xlimit; dx += N) {
5173 start(program, dx,dy, U16_0,U16_0,U16_0,U16_0, U16_0,U16_0,U16_0,U16_0);
5174 }
5175 if (size_t tail = xlimit - dx) {
5176 *tailPointer = tail;
5177 patch_memory_contexts(memoryCtxPatches, dx, dy, tail);
5178 start(program, dx,dy, U16_0,U16_0,U16_0,U16_0, U16_0,U16_0,U16_0,U16_0);
5179 restore_memory_contexts(memoryCtxPatches, dx, dy, tail);
5180 *tailPointer = 0xFF;
5181 }
5182 #endif
5183 }
5184 }
5185
5186 #if SKRP_NARROW_STAGES
5187 static void ABI just_return(Params*, SkRasterPipelineStage*, U16,U16,U16,U16) {}
5188 #else
5189 static void ABI just_return(SkRasterPipelineStage*, size_t,size_t,
5190 U16,U16,U16,U16, U16,U16,U16,U16) {}
5191 #endif
5192
5193 // All stages use the same function call ABI to chain into each other, but there are three types:
5194 // GG: geometry in, geometry out -- think, a matrix
5195 // GP: geometry in, pixels out. -- think, a memory gather
5196 // PP: pixels in, pixels out. -- think, a blend mode
5197 //
5198 // (Some stages ignore their inputs or produce no logical output. That's perfectly fine.)
5199 //
5200 // These three STAGE_ macros let you define each type of stage,
5201 // and will have (x,y) geometry and/or (r,g,b,a, dr,dg,db,da) pixel arguments as appropriate.
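// For example, matrix_translate below is a GG stage (it only transforms x,y), while
// uniform_color is a PP stage (it only writes pixel channels).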
5202
5203 #if SKRP_NARROW_STAGES
5204 #define STAGE_GG(name, ARG) \
5205 SI void name##_k(ARG, size_t dx, size_t dy, F& x, F& y); \
5206 static void ABI name(Params* params, SkRasterPipelineStage* program, \
5207 U16 r, U16 g, U16 b, U16 a) { \
5208 auto x = join<F>(r,g), \
5209 y = join<F>(b,a); \
5210 name##_k(Ctx{program}, params->dx,params->dy, x,y); \
5211 split(x, &r,&g); \
5212 split(y, &b,&a); \
5213 auto fn = (Stage)(++program)->fn; \
5214 fn(params, program, r,g,b,a); \
5215 } \
5216 SI void name##_k(ARG, size_t dx, size_t dy, F& x, F& y)
5217
5218 #define STAGE_GP(name, ARG) \
5219 SI void name##_k(ARG, size_t dx, size_t dy, F x, F y, \
5220 U16& r, U16& g, U16& b, U16& a, \
5221 U16& dr, U16& dg, U16& db, U16& da); \
5222 static void ABI name(Params* params, SkRasterPipelineStage* program, \
5223 U16 r, U16 g, U16 b, U16 a) { \
5224 auto x = join<F>(r,g), \
5225 y = join<F>(b,a); \
5226 name##_k(Ctx{program}, params->dx,params->dy, x,y, r,g,b,a, \
5227 params->dr,params->dg,params->db,params->da); \
5228 auto fn = (Stage)(++program)->fn; \
5229 fn(params, program, r,g,b,a); \
5230 } \
5231 SI void name##_k(ARG, size_t dx, size_t dy, F x, F y, \
5232 U16& r, U16& g, U16& b, U16& a, \
5233 U16& dr, U16& dg, U16& db, U16& da)
5234
5235 #define STAGE_PP(name, ARG) \
5236 SI void name##_k(ARG, size_t dx, size_t dy, \
5237 U16& r, U16& g, U16& b, U16& a, \
5238 U16& dr, U16& dg, U16& db, U16& da); \
5239 static void ABI name(Params* params, SkRasterPipelineStage* program, \
5240 U16 r, U16 g, U16 b, U16 a) { \
5241 name##_k(Ctx{program}, params->dx,params->dy, r,g,b,a, \
5242 params->dr,params->dg,params->db,params->da); \
5243 auto fn = (Stage)(++program)->fn; \
5244 fn(params, program, r,g,b,a); \
5245 } \
5246 SI void name##_k(ARG, size_t dx, size_t dy, \
5247 U16& r, U16& g, U16& b, U16& a, \
5248 U16& dr, U16& dg, U16& db, U16& da)
5249 #else
5250 #define STAGE_GG(name, ARG) \
5251 SI void name##_k(ARG, size_t dx, size_t dy, F& x, F& y); \
5252 static void ABI name(SkRasterPipelineStage* program, \
5253 size_t dx, size_t dy, \
5254 U16 r, U16 g, U16 b, U16 a, \
5255 U16 dr, U16 dg, U16 db, U16 da) { \
5256 auto x = join<F>(r,g), \
5257 y = join<F>(b,a); \
5258 name##_k(Ctx{program}, dx,dy, x,y); \
5259 split(x, &r,&g); \
5260 split(y, &b,&a); \
5261 auto fn = (Stage)(++program)->fn; \
5262 fn(program, dx,dy, r,g,b,a, dr,dg,db,da); \
5263 } \
5264 SI void name##_k(ARG, size_t dx, size_t dy, F& x, F& y)
5265
5266 #define STAGE_GP(name, ARG) \
5267 SI void name##_k(ARG, size_t dx, size_t dy, F x, F y, \
5268 U16& r, U16& g, U16& b, U16& a, \
5269 U16& dr, U16& dg, U16& db, U16& da); \
5270 static void ABI name(SkRasterPipelineStage* program, \
5271 size_t dx, size_t dy, \
5272 U16 r, U16 g, U16 b, U16 a, \
5273 U16 dr, U16 dg, U16 db, U16 da) { \
5274 auto x = join<F>(r,g), \
5275 y = join<F>(b,a); \
5276 name##_k(Ctx{program}, dx,dy, x,y, r,g,b,a, dr,dg,db,da); \
5277 auto fn = (Stage)(++program)->fn; \
5278 fn(program, dx,dy, r,g,b,a, dr,dg,db,da); \
5279 } \
5280 SI void name##_k(ARG, size_t dx, size_t dy, F x, F y, \
5281 U16& r, U16& g, U16& b, U16& a, \
5282 U16& dr, U16& dg, U16& db, U16& da)
5283
5284 #define STAGE_PP(name, ARG) \
5285 SI void name##_k(ARG, size_t dx, size_t dy, \
5286 U16& r, U16& g, U16& b, U16& a, \
5287 U16& dr, U16& dg, U16& db, U16& da); \
5288 static void ABI name(SkRasterPipelineStage* program, \
5289 size_t dx, size_t dy, \
5290 U16 r, U16 g, U16 b, U16 a, \
5291 U16 dr, U16 dg, U16 db, U16 da) { \
5292 name##_k(Ctx{program}, dx,dy, r,g,b,a, dr,dg,db,da); \
5293 auto fn = (Stage)(++program)->fn; \
5294 fn(program, dx,dy, r,g,b,a, dr,dg,db,da); \
5295 } \
5296 SI void name##_k(ARG, size_t dx, size_t dy, \
5297 U16& r, U16& g, U16& b, U16& a, \
5298 U16& dr, U16& dg, U16& db, U16& da)
5299 #endif
5300
5301 // ~~~~~~ Commonly used helper functions ~~~~~~ //
5302
5303 /**
5304 * Helpers to do properly rounded division (by 255). The ideal answer we want to compute is slow,
5305 * thanks to a division by a non-power of two:
5306 * [1] (v + 127) / 255
5307 *
5308 * There is a two-step process that computes the correct answer for all inputs:
5309 * [2] (v + 128 + ((v + 128) >> 8)) >> 8
5310 *
5311 * There is also a single iteration approximation, but it's wrong (+-1) ~25% of the time:
5312 * [3] (v + 255) >> 8;
5313 *
5314 * We offer two different implementations here, depending on the requirements of the calling stage.
5315 */
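// Worked example: for v = 127, formulas [1] and [2] both give 0, while the approximation [3]
// gives (127 + 255) >> 8 = 1.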
5316
5317 /**
5318 * div255 favors speed over accuracy. It uses formula [2] on NEON (where we can compute it as fast
5319 * as [3]), and uses [3] elsewhere.
5320 */
5321 SI U16 div255(U16 v) {
5322 #if defined(SKRP_CPU_NEON)
5323 // With NEON we can compute [2] just as fast as [3], so let's be correct.
5324 // First we compute v + ((v+128)>>8), then one more round of (...+128)>>8 to finish up:
5325 return vrshrq_n_u16(vrsraq_n_u16(v, v, 8), 8);
5326 #else
5327 // Otherwise, use [3], which is never wrong by more than 1:
5328 return (v+255)/256;
5329 #endif
5330 }
5331
5332 /**
5333 * div255_accurate guarantees the right answer on all platforms, at the expense of performance.
5334 */
5335 SI U16 div255_accurate(U16 v) {
5336 #if defined(SKRP_CPU_NEON)
5337 // Our NEON implementation of div255 is already correct for all inputs:
5338 return div255(v);
5339 #else
5340 // This is [2] (the same formulation as NEON), but written without the benefit of intrinsics:
5341 v += 128;
5342 return (v+(v/256))/256;
5343 #endif
5344 }
5345
5346 SI U16 inv(U16 v) { return 255-v; }
5347
5348 SI U16 if_then_else(I16 c, U16 t, U16 e) {
5349 return (t & sk_bit_cast<U16>(c)) | (e & sk_bit_cast<U16>(~c));
5350 }
5351 SI U32 if_then_else(I32 c, U32 t, U32 e) {
5352 return (t & sk_bit_cast<U32>(c)) | (e & sk_bit_cast<U32>(~c));
5353 }
5354
5355 SI U16 max(U16 x, U16 y) { return if_then_else(x < y, y, x); }
5356 SI U16 min(U16 x, U16 y) { return if_then_else(x < y, x, y); }
5357
5358 SI U16 max(U16 a, uint16_t b) { return max( a , U16_(b)); }
5359 SI U16 max(uint16_t a, U16 b) { return max(U16_(a), b ); }
5360 SI U16 min(U16 a, uint16_t b) { return min( a , U16_(b)); }
5361 SI U16 min(uint16_t a, U16 b) { return min(U16_(a), b ); }
5362
5363 SI U16 from_float(float f) { return U16_(f * 255.0f + 0.5f); }
5364
5365 SI U16 lerp(U16 from, U16 to, U16 t) { return div255( from*inv(t) + to*t ); }
5366
5367 template <typename D, typename S>
5368 SI D cast(S src) {
5369 return __builtin_convertvector(src, D);
5370 }
5371
5372 template <typename D, typename S>
5373 SI void split(S v, D* lo, D* hi) {
5374 static_assert(2*sizeof(D) == sizeof(S), "");
5375 memcpy(lo, (const char*)&v + 0*sizeof(D), sizeof(D));
5376 memcpy(hi, (const char*)&v + 1*sizeof(D), sizeof(D));
5377 }
5378 template <typename D, typename S>
5379 SI D join(S lo, S hi) {
5380 static_assert(sizeof(D) == 2*sizeof(S), "");
5381 D v;
5382 memcpy((char*)&v + 0*sizeof(S), &lo, sizeof(S));
5383 memcpy((char*)&v + 1*sizeof(S), &hi, sizeof(S));
5384 return v;
5385 }
5386
5387 SI F if_then_else(I32 c, F t, F e) {
5388 return sk_bit_cast<F>( (sk_bit_cast<I32>(t) & c) | (sk_bit_cast<I32>(e) & ~c) );
5389 }
5390 SI F if_then_else(I32 c, F t, float e) { return if_then_else(c, t , F_(e)); }
5391 SI F if_then_else(I32 c, float t, F e) { return if_then_else(c, F_(t), e ); }
5392
5393 SI F max(F x, F y) { return if_then_else(x < y, y, x); }
5394 SI F min(F x, F y) { return if_then_else(x < y, x, y); }
5395
5396 SI F max(F a, float b) { return max( a , F_(b)); }
5397 SI F max(float a, F b) { return max(F_(a), b ); }
5398 SI F min(F a, float b) { return min( a , F_(b)); }
5399 SI F min(float a, F b) { return min(F_(a), b ); }
5400
5401 SI I32 if_then_else(I32 c, I32 t, I32 e) {
5402 return (t & c) | (e & ~c);
5403 }
5404 SI I32 max(I32 x, I32 y) { return if_then_else(x < y, y, x); }
5405 SI I32 min(I32 x, I32 y) { return if_then_else(x < y, x, y); }
5406
5407 SI I32 max(I32 a, int32_t b) { return max( a , I32_(b)); }
5408 SI I32 max(int32_t a, I32 b) { return max(I32_(a), b ); }
5409 SI I32 min(I32 a, int32_t b) { return min( a , I32_(b)); }
5410 SI I32 min(int32_t a, I32 b) { return min(I32_(a), b ); }
5411
5412 SI F mad(F f, F m, F a) { return a+f*m; }
5413 SI F mad(F f, F m, float a) { return mad( f , m , F_(a)); }
5414 SI F mad(F f, float m, F a) { return mad( f , F_(m), a ); }
5415 SI F mad(F f, float m, float a) { return mad( f , F_(m), F_(a)); }
5416 SI F mad(float f, F m, F a) { return mad(F_(f), m , a ); }
5417 SI F mad(float f, F m, float a) { return mad(F_(f), m , F_(a)); }
5418 SI F mad(float f, float m, F a) { return mad(F_(f), F_(m), a ); }
5419
5420 SI F nmad(F f, F m, F a) { return a-f*m; }
5421 SI F nmad(F f, F m, float a) { return nmad( f , m , F_(a)); }
5422 SI F nmad(F f, float m, F a) { return nmad( f , F_(m), a ); }
5423 SI F nmad(F f, float m, float a) { return nmad( f , F_(m), F_(a)); }
5424 SI F nmad(float f, F m, F a) { return nmad(F_(f), m , a ); }
5425 SI F nmad(float f, F m, float a) { return nmad(F_(f), m , F_(a)); }
5426 SI F nmad(float f, float m, F a) { return nmad(F_(f), F_(m), a ); }
5427
5428 SI U32 trunc_(F x) { return (U32)cast<I32>(x); }
5429
5430 // Use approximate instructions and one Newton-Raphson step to calculate 1/x.
5431 SI F rcp_precise(F x) {
5432 #if defined(SKRP_CPU_SKX)
5433 F e = _mm512_rcp14_ps(x);
5434 return _mm512_fnmadd_ps(x, e, _mm512_set1_ps(2.0f)) * e;
5435 #elif defined(SKRP_CPU_HSW)
5436 __m256 lo,hi;
5437 split(x, &lo,&hi);
5438 return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
5439 #elif defined(SKRP_CPU_SSE2) || defined(SKRP_CPU_SSE41) || defined(SKRP_CPU_AVX)
5440 __m128 lo,hi;
5441 split(x, &lo,&hi);
5442 return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
5443 #elif defined(SKRP_CPU_NEON)
5444 float32x4_t lo,hi;
5445 split(x, &lo,&hi);
5446 return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
5447 #elif defined(SKRP_CPU_LASX)
5448 __m256 lo,hi;
5449 split(x, &lo,&hi);
5450 return join<F>(__lasx_xvfrecip_s(lo), __lasx_xvfrecip_s(hi));
5451 #elif defined(SKRP_CPU_LSX)
5452 __m128 lo,hi;
5453 split(x, &lo,&hi);
5454 return join<F>(__lsx_vfrecip_s(lo), __lsx_vfrecip_s(hi));
5455 #else
5456 return 1.0f / x;
5457 #endif
5458 }
5459 SI F sqrt_(F x) {
5460 #if defined(SKRP_CPU_SKX)
5461 return _mm512_sqrt_ps(x);
5462 #elif defined(SKRP_CPU_HSW)
5463 __m256 lo,hi;
5464 split(x, &lo,&hi);
5465 return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi));
5466 #elif defined(SKRP_CPU_SSE2) || defined(SKRP_CPU_SSE41) || defined(SKRP_CPU_AVX)
5467 __m128 lo,hi;
5468 split(x, &lo,&hi);
5469 return join<F>(_mm_sqrt_ps(lo), _mm_sqrt_ps(hi));
5470 #elif defined(SK_CPU_ARM64)
5471 float32x4_t lo,hi;
5472 split(x, &lo,&hi);
5473 return join<F>(vsqrtq_f32(lo), vsqrtq_f32(hi));
5474 #elif defined(SKRP_CPU_NEON)
5475 auto sqrt = [](float32x4_t v) {
5476 auto est = vrsqrteq_f32(v); // Estimate and two refinement steps for est = rsqrt(v).
5477 est *= vrsqrtsq_f32(v,est*est);
5478 est *= vrsqrtsq_f32(v,est*est);
5479 return v*est; // sqrt(v) == v*rsqrt(v).
5480 };
5481 float32x4_t lo,hi;
5482 split(x, &lo,&hi);
5483 return join<F>(sqrt(lo), sqrt(hi));
5484 #elif defined(SKRP_CPU_LASX)
5485 __m256 lo,hi;
5486 split(x, &lo,&hi);
5487 return join<F>(__lasx_xvfsqrt_s(lo), __lasx_xvfsqrt_s(hi));
5488 #elif defined(SKRP_CPU_LSX)
5489 __m128 lo,hi;
5490 split(x, &lo,&hi);
5491 return join<F>(__lsx_vfsqrt_s(lo), __lsx_vfsqrt_s(hi));
5492 #else
5493 return F{
5494 sqrtf(x[0]), sqrtf(x[1]), sqrtf(x[2]), sqrtf(x[3]),
5495 sqrtf(x[4]), sqrtf(x[5]), sqrtf(x[6]), sqrtf(x[7]),
5496 };
5497 #endif
5498 }
5499
5500 SI F floor_(F x) {
5501 #if defined(SK_CPU_ARM64)
5502 float32x4_t lo,hi;
5503 split(x, &lo,&hi);
5504 return join<F>(vrndmq_f32(lo), vrndmq_f32(hi));
5505 #elif defined(SKRP_CPU_SKX)
5506 return _mm512_floor_ps(x);
5507 #elif defined(SKRP_CPU_HSW)
5508 __m256 lo,hi;
5509 split(x, &lo,&hi);
5510 return join<F>(_mm256_floor_ps(lo), _mm256_floor_ps(hi));
5511 #elif defined(SKRP_CPU_SSE41) || defined(SKRP_CPU_AVX)
5512 __m128 lo,hi;
5513 split(x, &lo,&hi);
5514 return join<F>(_mm_floor_ps(lo), _mm_floor_ps(hi));
5515 #elif defined(SKRP_CPU_LASX)
5516 __m256 lo,hi;
5517 split(x, &lo,&hi);
5518 return join<F>(__lasx_xvfrintrm_s(lo), __lasx_xvfrintrm_s(hi));
5519 #elif defined(SKRP_CPU_LSX)
5520 __m128 lo,hi;
5521 split(x, &lo,&hi);
5522 return join<F>(__lsx_vfrintrm_s(lo), __lsx_vfrintrm_s(hi));
5523 #else
5524 F roundtrip = cast<F>(cast<I32>(x));
5525 return roundtrip - if_then_else(roundtrip > x, F_(1), F_(0));
5526 #endif
5527 }
5528
5529 // scaled_mult interprets a and b as Q15 fixed-point numbers on [-1, 1). Functionally
5530 // this multiply is:
5531 // (2 * a * b + (1 << 15)) >> 16
5532 // The result is a number on [-1, 1).
5533 // Note: on NEON this is a saturating multiply, while the other implementations are not.
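// Worked example: a = b = 16384 (0.5 in Q15) yields (2*16384*16384 + (1 << 15)) >> 16 = 8192,
// i.e. 0.25 in Q15.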
5534 SI I16 scaled_mult(I16 a, I16 b) {
5535 #if defined(SKRP_CPU_SKX)
5536 return (I16)_mm256_mulhrs_epi16((__m256i)a, (__m256i)b);
5537 #elif defined(SKRP_CPU_HSW)
5538 return (I16)_mm256_mulhrs_epi16((__m256i)a, (__m256i)b);
5539 #elif defined(SKRP_CPU_SSE41) || defined(SKRP_CPU_AVX)
5540 return (I16)_mm_mulhrs_epi16((__m128i)a, (__m128i)b);
5541 #elif defined(SK_CPU_ARM64)
5542 return vqrdmulhq_s16(a, b);
5543 #elif defined(SKRP_CPU_NEON)
5544 return vqrdmulhq_s16(a, b);
5545 #elif defined(SKRP_CPU_LASX)
5546 I16 res = __lasx_xvmuh_h(a, b);
5547 return __lasx_xvslli_h(res, 1);
5548 #elif defined(SKRP_CPU_LSX)
5549 I16 res = __lsx_vmuh_h(a, b);
5550 return __lsx_vslli_h(res, 1);
5551 #else
5552 const I32 roundingTerm = I32_(1 << 14);
5553 return cast<I16>((cast<I32>(a) * cast<I32>(b) + roundingTerm) >> 15);
5554 #endif
5555 }
5556
5557 // This sum is to support lerp where the result will always be a positive number. In general,
5558 // a sum like this would require an additional bit, but because we know the range of the result
5559 // we know that the extra bit will always be zero.
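// For example, a = -100 (I16) and b = 200 (U16): bit-casting a to U16 gives 65436, and
// 65436 + 200 wraps around to 100 == -100 + 200, so the unsigned wraparound reproduces the
// signed sum whenever that sum lies on [0, UINT16_MAX].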
5560 SI U16 constrained_add(I16 a, U16 b) {
5561 #if defined(SK_DEBUG)
5562 for (size_t i = 0; i < N; i++) {
5563 // Ensure that a + b is on the interval [0, UINT16_MAX]
5564 int ia = a[i],
5565 ib = b[i];
5566 // Use 65535 here because fuchsia's compiler evaluates UINT16_MAX - ib, which is
5567 //     65535U - ib, as a uint32_t instead of an int32_t. This was forcing ia to be
5568 //     interpreted as a uint32_t.
5569 SkASSERT(-ib <= ia && ia <= 65535 - ib);
5570 }
5571 #endif
5572 return b + sk_bit_cast<U16>(a);
5573 }
5574
5575 SI F fract(F x) { return x - floor_(x); }
5576 SI F abs_(F x) { return sk_bit_cast<F>( sk_bit_cast<I32>(x) & 0x7fffffff ); }
5577
5578 // ~~~~~~ Basic / misc. stages ~~~~~~ //
5579
5580 STAGE_GG(seed_shader, NoCtx) {
5581 #if defined(SKRP_CPU_LSX)
5582 __m128 val1 = {0.5f, 1.5f, 2.5f, 3.5f};
5583 __m128 val2 = {4.5f, 5.5f, 6.5f, 7.5f};
5584 __m128 val3 = {0.5f, 0.5f, 0.5f, 0.5f};
5585
5586 __m128i v_d = __lsx_vreplgr2vr_w(dx);
5587
5588 __m128 f_d = __lsx_vffint_s_w(v_d);
5589 val1 = __lsx_vfadd_s(val1, f_d);
5590 val2 = __lsx_vfadd_s(val2, f_d);
5591 x = join<F>(val1, val2);
5592
5593 v_d = __lsx_vreplgr2vr_w(dy);
5594 f_d = __lsx_vffint_s_w(v_d);
5595 val3 = __lsx_vfadd_s(val3, f_d);
5596 y = join<F>(val3, val3);
5597 #else
5598 static constexpr float iota[] = {
5599 0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f,
5600 8.5f, 9.5f,10.5f,11.5f,12.5f,13.5f,14.5f,15.5f,
5601 };
5602 static_assert(std::size(iota) >= SkRasterPipeline_kMaxStride);
5603
5604 x = cast<F>(I32_(dx)) + sk_unaligned_load<F>(iota);
5605 y = cast<F>(I32_(dy)) + 0.5f;
5606 #endif
5607 }
5608
5609 STAGE_GG(matrix_translate, const float* m) {
5610 x += m[0];
5611 y += m[1];
5612 }
5613 STAGE_GG(matrix_scale_translate, const float* m) {
5614 x = mad(x,m[0], m[2]);
5615 y = mad(y,m[1], m[3]);
5616 }
5617 STAGE_GG(matrix_2x3, const float* m) {
5618 auto X = mad(x,m[0], mad(y,m[1], m[2])),
5619 Y = mad(x,m[3], mad(y,m[4], m[5]));
5620 x = X;
5621 y = Y;
5622 }
5623 STAGE_GG(matrix_perspective, const float* m) {
5624 // N.B. Unlike the other matrix_ stages, this matrix is row-major.
5625 auto X = mad(x,m[0], mad(y,m[1], m[2])),
5626 Y = mad(x,m[3], mad(y,m[4], m[5])),
5627 Z = mad(x,m[6], mad(y,m[7], m[8]));
5628 x = X * rcp_precise(Z);
5629 y = Y * rcp_precise(Z);
5630 }
5631
5632 STAGE_PP(uniform_color, const SkRasterPipeline_UniformColorCtx* c) {
5633 r = U16_(c->rgba[0]);
5634 g = U16_(c->rgba[1]);
5635 b = U16_(c->rgba[2]);
5636 a = U16_(c->rgba[3]);
5637 }
5638 STAGE_PP(uniform_color_dst, const SkRasterPipeline_UniformColorCtx* c) {
5639 dr = U16_(c->rgba[0]);
5640 dg = U16_(c->rgba[1]);
5641 db = U16_(c->rgba[2]);
5642 da = U16_(c->rgba[3]);
5643 }
5644 STAGE_PP(black_color, NoCtx) { r = g = b = U16_0; a = U16_255; }
5645 STAGE_PP(white_color, NoCtx) { r = g = b = U16_255; a = U16_255; }
5646
5647 STAGE_PP(set_rgb, const float rgb[3]) {
5648 r = from_float(rgb[0]);
5649 g = from_float(rgb[1]);
5650 b = from_float(rgb[2]);
5651 }
5652
5653 // No need to clamp against 0 here (values are unsigned)
5654 STAGE_PP(clamp_01, NoCtx) {
5655 r = min(r, 255);
5656 g = min(g, 255);
5657 b = min(b, 255);
5658 a = min(a, 255);
5659 }
5660
5661 STAGE_PP(clamp_a_01, NoCtx) {
5662 a = min(a, 255);
5663 }
5664
5665 STAGE_PP(clamp_gamut, NoCtx) {
5666 a = min(a, 255);
5667 r = min(r, a);
5668 g = min(g, a);
5669 b = min(b, a);
5670 }
5671
5672 STAGE_PP(premul, NoCtx) {
5673 r = div255_accurate(r * a);
5674 g = div255_accurate(g * a);
5675 b = div255_accurate(b * a);
5676 }
5677 STAGE_PP(premul_dst, NoCtx) {
5678 dr = div255_accurate(dr * da);
5679 dg = div255_accurate(dg * da);
5680 db = div255_accurate(db * da);
5681 }
5682
5683 STAGE_PP(force_opaque , NoCtx) { a = U16_255; }
5684 STAGE_PP(force_opaque_dst, NoCtx) { da = U16_255; }
5685
5686 STAGE_PP(swap_rb, NoCtx) {
5687 auto tmp = r;
5688 r = b;
5689 b = tmp;
5690 }
5691 STAGE_PP(swap_rb_dst, NoCtx) {
5692 auto tmp = dr;
5693 dr = db;
5694 db = tmp;
5695 }
5696
5697 STAGE_PP(move_src_dst, NoCtx) {
5698 dr = r;
5699 dg = g;
5700 db = b;
5701 da = a;
5702 }
5703
5704 STAGE_PP(move_dst_src, NoCtx) {
5705 r = dr;
5706 g = dg;
5707 b = db;
5708 a = da;
5709 }
5710
5711 STAGE_PP(swap_src_dst, NoCtx) {
5712 std::swap(r, dr);
5713 std::swap(g, dg);
5714 std::swap(b, db);
5715 std::swap(a, da);
5716 }
5717
5718 // ~~~~~~ Blend modes ~~~~~~ //
5719
5720 // The same logic applied to all 4 channels.
5721 #define BLEND_MODE(name) \
5722 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
5723 STAGE_PP(name, NoCtx) { \
5724 r = name##_channel(r,dr,a,da); \
5725 g = name##_channel(g,dg,a,da); \
5726 b = name##_channel(b,db,a,da); \
5727 a = name##_channel(a,da,a,da); \
5728 } \
5729 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
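// For example, BLEND_MODE(srcin) forward-declares srcin_channel(), defines STAGE_PP(srcin)
// to apply it to each of r/g/b/a against dr/dg/db/da (passing a and da as the alphas), and
// ends with the signature so the brace-enclosed body that follows becomes srcin_channel().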
5730
5731 #if defined(SK_USE_INACCURATE_DIV255_IN_BLEND)
5732 BLEND_MODE(clear) { return U16_0; }
5733 BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); }
5734 BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); }
5735 BLEND_MODE(srcin) { return div255( s*da ); }
5736 BLEND_MODE(dstin) { return div255( d*sa ); }
5737 BLEND_MODE(srcout) { return div255( s*inv(da) ); }
5738 BLEND_MODE(dstout) { return div255( d*inv(sa) ); }
5739 BLEND_MODE(srcover) { return s + div255( d*inv(sa) ); }
5740 BLEND_MODE(dstover) { return d + div255( s*inv(da) ); }
5741 BLEND_MODE(modulate) { return div255( s*d ); }
5742 BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
5743 BLEND_MODE(plus_) { return min(s+d, 255); }
5744 BLEND_MODE(screen) { return s + d - div255( s*d ); }
5745 BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); }
5746 #else
5747 BLEND_MODE(clear) { return U16_0; }
5748 BLEND_MODE(srcatop) { return div255( s*da + d*inv(sa) ); }
5749 BLEND_MODE(dstatop) { return div255( d*sa + s*inv(da) ); }
5750 BLEND_MODE(srcin) { return div255_accurate( s*da ); }
5751 BLEND_MODE(dstin) { return div255_accurate( d*sa ); }
5752 BLEND_MODE(srcout) { return div255_accurate( s*inv(da) ); }
5753 BLEND_MODE(dstout) { return div255_accurate( d*inv(sa) ); }
5754 BLEND_MODE(srcover) { return s + div255_accurate( d*inv(sa) ); }
5755 BLEND_MODE(dstover) { return d + div255_accurate( s*inv(da) ); }
5756 BLEND_MODE(modulate) { return div255_accurate( s*d ); }
5757 BLEND_MODE(multiply) { return div255( s*inv(da) + d*inv(sa) + s*d ); }
5758 BLEND_MODE(plus_) { return min(s+d, 255); }
5759 BLEND_MODE(screen) { return s + d - div255_accurate( s*d ); }
5760 BLEND_MODE(xor_) { return div255( s*inv(da) + d*inv(sa) ); }
5761 #endif
5762 #undef BLEND_MODE
5763
5764 // The same logic applied to color, and srcover for alpha.
5765 #define BLEND_MODE(name) \
5766 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da); \
5767 STAGE_PP(name, NoCtx) { \
5768 r = name##_channel(r,dr,a,da); \
5769 g = name##_channel(g,dg,a,da); \
5770 b = name##_channel(b,db,a,da); \
5771 a = a + div255( da*inv(a) ); \
5772 } \
5773 SI U16 name##_channel(U16 s, U16 d, U16 sa, U16 da)
5774
5775 BLEND_MODE(darken) { return s + d - div255( max(s*da, d*sa) ); }
5776 BLEND_MODE(lighten) { return s + d - div255( min(s*da, d*sa) ); }
5777 BLEND_MODE(difference) { return s + d - 2*div255( min(s*da, d*sa) ); }
5778 BLEND_MODE(exclusion) { return s + d - 2*div255( s*d ); }
5779
5780 BLEND_MODE(hardlight) {
5781 return div255( s*inv(da) + d*inv(sa) +
5782 if_then_else(2*s <= sa, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
5783 }
5784 BLEND_MODE(overlay) {
5785 return div255( s*inv(da) + d*inv(sa) +
5786 if_then_else(2*d <= da, 2*s*d, sa*da - 2*(sa-s)*(da-d)) );
5787 }
5788 #undef BLEND_MODE
5789
5790 // ~~~~~~ Helpers for interacting with memory ~~~~~~ //
5791
5792 template <typename T>
5793 SI T* ptr_at_xy(const SkRasterPipeline_MemoryCtx* ctx, size_t dx, size_t dy) {
5794 return (T*)ctx->pixels + dy*ctx->stride + dx;
5795 }
5796
5797 template <typename T>
5798 SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, F x, F y) {
5799 // Exclusive -> inclusive.
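// Subtracting 1 from the bit pattern of a positive float steps down to the next
// representable value, so w and h sit just below width and height; after the clamp,
// trunc_(x) and trunc_(y) stay within [0, width-1] and [0, height-1].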
5800 const F w = F_(sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->width ) - 1)),
5801 h = F_(sk_bit_cast<float>( sk_bit_cast<uint32_t>(ctx->height) - 1));
5802
5803 const F z = F_(std::numeric_limits<float>::min());
5804
5805 x = min(max(z, x), w);
5806 y = min(max(z, y), h);
5807
5808 x = sk_bit_cast<F>(sk_bit_cast<U32>(x) - (uint32_t)ctx->roundDownAtInteger);
5809 y = sk_bit_cast<F>(sk_bit_cast<U32>(y) - (uint32_t)ctx->roundDownAtInteger);
5810
5811 *ptr = (const T*)ctx->pixels;
5812 return trunc_(y)*ctx->stride + trunc_(x);
5813 }
5814
5815 template <typename T>
5816 SI U32 ix_and_ptr(T** ptr, const SkRasterPipeline_GatherCtx* ctx, I32 x, I32 y) {
5817 // This flag doesn't make sense when the coords are integers.
5818 SkASSERT(ctx->roundDownAtInteger == 0);
5819 // Exclusive -> inclusive.
5820 const I32 w = I32_( ctx->width - 1),
5821 h = I32_(ctx->height - 1);
5822
5823 U32 ax = cast<U32>(min(max(0, x), w)),
5824 ay = cast<U32>(min(max(0, y), h));
5825
5826 *ptr = (const T*)ctx->pixels;
5827 return ay * ctx->stride + ax;
5828 }
5829
5830 template <typename V, typename T>
5831 SI V load(const T* ptr) {
5832 V v;
5833 memcpy(&v, ptr, sizeof(v));
5834 return v;
5835 }
5836 template <typename V, typename T>
5837 SI void store(T* ptr, V v) {
5838 memcpy(ptr, &v, sizeof(v));
5839 }
5840
5841 #if defined(SKRP_CPU_SKX)
5842 template <typename V, typename T>
5843 SI V gather(const T* ptr, U32 ix) {
5844 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
5845 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
5846 ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
5847 ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
5848 }
5849
5850 template<>
5851 F gather(const float* ptr, U32 ix) {
5852 return _mm512_i32gather_ps((__m512i)ix, ptr, 4);
5853 }
5854
5855 template<>
5856 U32 gather(const uint32_t* ptr, U32 ix) {
5857 return (U32)_mm512_i32gather_epi32((__m512i)ix, ptr, 4);
5858 }
5859
5860 #elif defined(SKRP_CPU_HSW)
5861 template <typename V, typename T>
5862 SI V gather(const T* ptr, U32 ix) {
5863 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
5864 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
5865 ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
5866 ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
5867 }
5868
5869 template<>
5870 F gather(const float* ptr, U32 ix) {
5871 __m256i lo, hi;
5872 split(ix, &lo, &hi);
5873
5874 return join<F>(_mm256_i32gather_ps(ptr, lo, 4),
5875 _mm256_i32gather_ps(ptr, hi, 4));
5876 }
5877
5878 template<>
5879 U32 gather(const uint32_t* ptr, U32 ix) {
5880 __m256i lo, hi;
5881 split(ix, &lo, &hi);
5882
5883 return join<U32>(_mm256_i32gather_epi32((const int*)ptr, lo, 4),
5884 _mm256_i32gather_epi32((const int*)ptr, hi, 4));
5885 }
5886 #elif defined(SKRP_CPU_LASX)
5887 template <typename V, typename T>
5888 SI V gather(const T* ptr, U32 ix) {
5889 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
5890 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
5891 ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
5892 ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
5893 }
5894 #else
5895 template <typename V, typename T>
5896 SI V gather(const T* ptr, U32 ix) {
5897 return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
5898 ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]], };
5899 }
5900 #endif
5901
5902
5903 // ~~~~~~ 32-bit memory loads and stores ~~~~~~ //
5904
5905 SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
5906 #if defined(SKRP_CPU_SKX)
5907 rgba = (U32)_mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), (__m512i)rgba);
5908 auto cast_U16 = [](U32 v) -> U16 {
5909 return (U16)_mm256_packus_epi32(_mm512_castsi512_si256((__m512i)v),
5910 _mm512_extracti64x4_epi64((__m512i)v, 1));
5911 };
5912 #elif defined(SKRP_CPU_HSW)
5913 // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
5914 __m256i _01,_23;
5915 split(rgba, &_01, &_23);
5916 __m256i _02 = _mm256_permute2x128_si256(_01,_23, 0x20),
5917 _13 = _mm256_permute2x128_si256(_01,_23, 0x31);
5918 rgba = join<U32>(_02, _13);
5919
5920 auto cast_U16 = [](U32 v) -> U16 {
5921 __m256i _02,_13;
5922 split(v, &_02,&_13);
5923 return (U16)_mm256_packus_epi32(_02,_13);
5924 };
5925 #elif defined(SKRP_CPU_LASX)
5926 __m256i _01, _23;
5927 split(rgba, &_01, &_23);
5928 __m256i _02 = __lasx_xvpermi_q(_01, _23, 0x02),
5929 _13 = __lasx_xvpermi_q(_01, _23, 0x13);
5930 rgba = join<U32>(_02, _13);
5931
5932 auto cast_U16 = [](U32 v) -> U16 {
5933 __m256i _02,_13;
5934 split(v, &_02,&_13);
5935 __m256i tmp0 = __lasx_xvsat_wu(_02, 15);
5936 __m256i tmp1 = __lasx_xvsat_wu(_13, 15);
5937 return __lasx_xvpickev_h(tmp1, tmp0);
5938 };
5939 #elif defined(SKRP_CPU_LSX)
5940 __m128i _01, _23, rg, ba;
5941 split(rgba, &_01, &_23);
5942 rg = __lsx_vpickev_h(_23, _01);
5943 ba = __lsx_vpickod_h(_23, _01);
5944
5945 __m128i mask_00ff = __lsx_vreplgr2vr_h(0xff);
5946
5947 *r = __lsx_vand_v(rg, mask_00ff);
5948 *g = __lsx_vsrli_h(rg, 8);
5949 *b = __lsx_vand_v(ba, mask_00ff);
5950 *a = __lsx_vsrli_h(ba, 8);
5951 #else
5952 auto cast_U16 = [](U32 v) -> U16 {
5953 return cast<U16>(v);
5954 };
5955 #endif
5956 #if !defined(SKRP_CPU_LSX)
5957 *r = cast_U16(rgba & 65535) & 255;
5958 *g = cast_U16(rgba & 65535) >> 8;
5959 *b = cast_U16(rgba >> 16) & 255;
5960 *a = cast_U16(rgba >> 16) >> 8;
5961 #endif
5962 }
5963
5964 SI void load_8888_(const uint32_t* ptr, U16* r, U16* g, U16* b, U16* a) {
5965 #if 1 && defined(SKRP_CPU_NEON)
5966 uint8x8x4_t rgba = vld4_u8((const uint8_t*)(ptr));
5967 *r = cast<U16>(rgba.val[0]);
5968 *g = cast<U16>(rgba.val[1]);
5969 *b = cast<U16>(rgba.val[2]);
5970 *a = cast<U16>(rgba.val[3]);
5971 #else
5972 from_8888(load<U32>(ptr), r,g,b,a);
5973 #endif
5974 }
5975 SI void store_8888_(uint32_t* ptr, U16 r, U16 g, U16 b, U16 a) {
5976 #if defined(SKRP_CPU_LSX)
5977 __m128i mask = __lsx_vreplgr2vr_h(255);
5978 r = __lsx_vmin_hu(r, mask);
5979 g = __lsx_vmin_hu(g, mask);
5980 b = __lsx_vmin_hu(b, mask);
5981 a = __lsx_vmin_hu(a, mask);
5982
5983 g = __lsx_vslli_h(g, 8);
5984 r = r | g;
5985 a = __lsx_vslli_h(a, 8);
5986 a = a | b;
5987
5988 __m128i r_lo = __lsx_vsllwil_wu_hu(r, 0);
5989 __m128i r_hi = __lsx_vexth_wu_hu(r);
5990 __m128i a_lo = __lsx_vsllwil_wu_hu(a, 0);
5991 __m128i a_hi = __lsx_vexth_wu_hu(a);
5992
5993 a_lo = __lsx_vslli_w(a_lo, 16);
5994 a_hi = __lsx_vslli_w(a_hi, 16);
5995
5996 r = r_lo | a_lo;
5997 a = r_hi | a_hi;
5998 store(ptr, join<U32>(r, a));
5999 #else
6000 r = min(r, 255);
6001 g = min(g, 255);
6002 b = min(b, 255);
6003 a = min(a, 255);
6004
6005 #if 1 && defined(SKRP_CPU_NEON)
6006 uint8x8x4_t rgba = {{
6007 cast<U8>(r),
6008 cast<U8>(g),
6009 cast<U8>(b),
6010 cast<U8>(a),
6011 }};
6012 vst4_u8((uint8_t*)(ptr), rgba);
6013 #else
6014 store(ptr, cast<U32>(r | (g<<8)) << 0
6015 | cast<U32>(b | (a<<8)) << 16);
6016 #endif
6017 #endif
6018 }
6019
6020 STAGE_PP(load_8888, const SkRasterPipeline_MemoryCtx* ctx) {
6021 load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), &r,&g,&b,&a);
6022 }
6023 STAGE_PP(load_8888_dst, const SkRasterPipeline_MemoryCtx* ctx) {
6024 load_8888_(ptr_at_xy<const uint32_t>(ctx, dx,dy), &dr,&dg,&db,&da);
6025 }
6026 STAGE_PP(store_8888, const SkRasterPipeline_MemoryCtx* ctx) {
6027 store_8888_(ptr_at_xy<uint32_t>(ctx, dx,dy), r,g,b,a);
6028 }
6029 STAGE_GP(gather_8888, const SkRasterPipeline_GatherCtx* ctx) {
6030 const uint32_t* ptr;
6031 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
6032 from_8888(gather<U32>(ptr, ix), &r, &g, &b, &a);
6033 }
6034
6035 // ~~~~~~ 16-bit memory loads and stores ~~~~~~ //
6036
6037 SI void from_565(U16 rgb, U16* r, U16* g, U16* b) {
6038 // Format for 565 buffers: 15|rrrrr gggggg bbbbb|0
6039 U16 R = (rgb >> 11) & 31,
6040 G = (rgb >> 5) & 63,
6041 B = (rgb >> 0) & 31;
6042
6043 // These bit replications are the same as multiplying by 255/31 or 255/63 to scale to 8-bit.
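// e.g. R = 31 -> (31 << 3) | (31 >> 2) = 248 | 7 = 255, and G = 63 -> (63 << 2) | (63 >> 4) = 255.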
6044 *r = (R << 3) | (R >> 2);
6045 *g = (G << 2) | (G >> 4);
6046 *b = (B << 3) | (B >> 2);
6047 }
6048 SI void load_565_(const uint16_t* ptr, U16* r, U16* g, U16* b) {
6049 from_565(load<U16>(ptr), r,g,b);
6050 }
6051 SI void store_565_(uint16_t* ptr, U16 r, U16 g, U16 b) {
6052 r = min(r, 255);
6053 g = min(g, 255);
6054 b = min(b, 255);
6055
6056 // Round from [0,255] to [0,31] or [0,63], as if x * (31/255.0f) + 0.5f.
6057 // (Don't feel like you need to find some fundamental truth in these...
6058 // they were brute-force searched.)
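// Sanity check: r = 255 gives (255*9 + 36)/74 = 31 and g = 255 gives (255*21 + 42)/85 = 63,
// while r = 128 gives 16, matching 128 * (31/255.0f) + 0.5f truncated.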
6059 U16 R = (r * 9 + 36) / 74, // 9/74 ≈ 31/255, plus 36/74, about half.
6060 G = (g * 21 + 42) / 85, // 21/85 = 63/255 exactly.
6061 B = (b * 9 + 36) / 74;
6062 // Pack them back into 15|rrrrr gggggg bbbbb|0.
6063 store(ptr, R << 11
6064 | G << 5
6065 | B << 0);
6066 }
6067
6068 STAGE_PP(load_565, const SkRasterPipeline_MemoryCtx* ctx) {
6069 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &r,&g,&b);
6070 a = U16_255;
6071 }
6072 STAGE_PP(load_565_dst, const SkRasterPipeline_MemoryCtx* ctx) {
6073 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &dr,&dg,&db);
6074 da = U16_255;
6075 }
6076 STAGE_PP(store_565, const SkRasterPipeline_MemoryCtx* ctx) {
6077 store_565_(ptr_at_xy<uint16_t>(ctx, dx,dy), r,g,b);
6078 }
6079 STAGE_GP(gather_565, const SkRasterPipeline_GatherCtx* ctx) {
6080 const uint16_t* ptr;
6081 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
6082 from_565(gather<U16>(ptr, ix), &r, &g, &b);
6083 a = U16_255;
6084 }
6085
6086 SI void from_4444(U16 rgba, U16* r, U16* g, U16* b, U16* a) {
6087 // Format for 4444 buffers: 15|rrrr gggg bbbb aaaa|0.
6088 U16 R = (rgba >> 12) & 15,
6089 G = (rgba >> 8) & 15,
6090 B = (rgba >> 4) & 15,
6091 A = (rgba >> 0) & 15;
6092
6093 // Scale [0,15] to [0,255].
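// e.g. A = 15 -> (15 << 4) | 15 = 255, and A = 8 -> 136, the same as rounding 8 * (255/15.0f).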
6094 *r = (R << 4) | R;
6095 *g = (G << 4) | G;
6096 *b = (B << 4) | B;
6097 *a = (A << 4) | A;
6098 }
6099 SI void load_4444_(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
6100 from_4444(load<U16>(ptr), r,g,b,a);
6101 }
6102 SI void store_4444_(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
6103 r = min(r, 255);
6104 g = min(g, 255);
6105 b = min(b, 255);
6106 a = min(a, 255);
6107
6108 // Round from [0,255] to [0,15], producing the same value as (x*(15/255.0f) + 0.5f).
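// e.g. r = 255 -> (255 + 8)/17 = 15, and r = 128 -> (128 + 8)/17 = 8 = trunc(128*(15/255.0f) + 0.5f).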
6109 U16 R = (r + 8) / 17,
6110 G = (g + 8) / 17,
6111 B = (b + 8) / 17,
6112 A = (a + 8) / 17;
6113 // Pack them back into 15|rrrr gggg bbbb aaaa|0.
6114 store(ptr, R << 12
6115 | G << 8
6116 | B << 4
6117 | A << 0);
6118 }
6119
6120 STAGE_PP(load_4444, const SkRasterPipeline_MemoryCtx* ctx) {
6121 load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &r,&g,&b,&a);
6122 }
6123 STAGE_PP(load_4444_dst, const SkRasterPipeline_MemoryCtx* ctx) {
6124 load_4444_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &dr,&dg,&db,&da);
6125 }
6126 STAGE_PP(store_4444, const SkRasterPipeline_MemoryCtx* ctx) {
6127 store_4444_(ptr_at_xy<uint16_t>(ctx, dx,dy), r,g,b,a);
6128 }
6129 STAGE_GP(gather_4444, const SkRasterPipeline_GatherCtx* ctx) {
6130 const uint16_t* ptr;
6131 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
6132 from_4444(gather<U16>(ptr, ix), &r,&g,&b,&a);
6133 }
6134
6135 SI void from_88(U16 rg, U16* r, U16* g) {
6136 *r = (rg & 0xFF);
6137 *g = (rg >> 8);
6138 }
6139
6140 SI void load_88_(const uint16_t* ptr, U16* r, U16* g) {
6141 #if 1 && defined(SKRP_CPU_NEON)
6142 uint8x8x2_t rg = vld2_u8((const uint8_t*)(ptr));
6143 *r = cast<U16>(rg.val[0]);
6144 *g = cast<U16>(rg.val[1]);
6145 #else
6146 from_88(load<U16>(ptr), r,g);
6147 #endif
6148 }
6149
6150 SI void store_88_(uint16_t* ptr, U16 r, U16 g) {
6151 r = min(r, 255);
6152 g = min(g, 255);
6153
6154 #if 1 && defined(SKRP_CPU_NEON)
6155 uint8x8x2_t rg = {{
6156 cast<U8>(r),
6157 cast<U8>(g),
6158 }};
6159 vst2_u8((uint8_t*)(ptr), rg);
6160 #else
6161 store(ptr, cast<U16>(r | (g<<8)) << 0);
6162 #endif
6163 }
6164
6165 STAGE_PP(load_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
6166 load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), &r, &g);
6167 b = U16_0;
6168 a = U16_255;
6169 }
6170 STAGE_PP(load_rg88_dst, const SkRasterPipeline_MemoryCtx* ctx) {
6171 load_88_(ptr_at_xy<const uint16_t>(ctx, dx, dy), &dr, &dg);
6172 db = U16_0;
6173 da = U16_255;
6174 }
6175 STAGE_PP(store_rg88, const SkRasterPipeline_MemoryCtx* ctx) {
6176 store_88_(ptr_at_xy<uint16_t>(ctx, dx, dy), r, g);
6177 }
6178 STAGE_GP(gather_rg88, const SkRasterPipeline_GatherCtx* ctx) {
6179 const uint16_t* ptr;
6180 U32 ix = ix_and_ptr(&ptr, ctx, x, y);
6181 from_88(gather<U16>(ptr, ix), &r, &g);
6182 b = U16_0;
6183 a = U16_255;
6184 }
6185
6186 // ~~~~~~ 8-bit memory loads and stores ~~~~~~ //
6187
6188 SI U16 load_8(const uint8_t* ptr) {
6189 return cast<U16>(load<U8>(ptr));
6190 }
6191 SI void store_8(uint8_t* ptr, U16 v) {
6192 v = min(v, 255);
6193 store(ptr, cast<U8>(v));
6194 }
6195
6196 STAGE_PP(load_a8, const SkRasterPipeline_MemoryCtx* ctx) {
6197 r = g = b = U16_0;
6198 a = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy));
6199 }
6200 STAGE_PP(load_a8_dst, const SkRasterPipeline_MemoryCtx* ctx) {
6201 dr = dg = db = U16_0;
6202 da = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy));
6203 }
6204 STAGE_PP(store_a8, const SkRasterPipeline_MemoryCtx* ctx) {
6205 store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), a);
6206 }
6207 STAGE_GP(gather_a8, const SkRasterPipeline_GatherCtx* ctx) {
6208 const uint8_t* ptr;
6209 U32 ix = ix_and_ptr(&ptr, ctx, x,y);
6210 r = g = b = U16_0;
6211 a = cast<U16>(gather<U8>(ptr, ix));
6212 }
6213 STAGE_PP(store_r8, const SkRasterPipeline_MemoryCtx* ctx) {
6214 store_8(ptr_at_xy<uint8_t>(ctx, dx,dy), r);
6215 }
6216
6217 STAGE_PP(alpha_to_gray, NoCtx) {
6218 r = g = b = a;
6219 a = U16_255;
6220 }
6221 STAGE_PP(alpha_to_gray_dst, NoCtx) {
6222 dr = dg = db = da;
6223 da = U16_255;
6224 }
6225 STAGE_PP(alpha_to_red, NoCtx) {
6226 r = a;
6227 a = U16_255;
6228 }
6229 STAGE_PP(alpha_to_red_dst, NoCtx) {
6230 dr = da;
6231 da = U16_255;
6232 }
6233
6234 STAGE_PP(bt709_luminance_or_luma_to_alpha, NoCtx) {
6235 a = (r*54 + g*183 + b*19)/256; // 0.2126, 0.7152, 0.0722 with 256 denominator.
6236 r = g = b = U16_0;
6237 }
6238 STAGE_PP(bt709_luminance_or_luma_to_rgb, NoCtx) {
6239 r = g = b =(r*54 + g*183 + b*19)/256; // 0.2126, 0.7152, 0.0722 with 256 denominator.
6240 }
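// In both stages above, the integer weights sum to exactly 256, so r = g = b = 255 maps to 255.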
6241
6242 // ~~~~~~ Saving and restoring the src and dst registers ~~~~~~ //
6243
6244 STAGE_PP(load_src, const uint16_t* ptr) {
6245 r = sk_unaligned_load<U16>(ptr + 0*N);
6246 g = sk_unaligned_load<U16>(ptr + 1*N);
6247 b = sk_unaligned_load<U16>(ptr + 2*N);
6248 a = sk_unaligned_load<U16>(ptr + 3*N);
6249 }
6250 STAGE_PP(store_src, uint16_t* ptr) {
6251 sk_unaligned_store(ptr + 0*N, r);
6252 sk_unaligned_store(ptr + 1*N, g);
6253 sk_unaligned_store(ptr + 2*N, b);
6254 sk_unaligned_store(ptr + 3*N, a);
6255 }
6256 STAGE_PP(store_src_a, uint16_t* ptr) {
6257 sk_unaligned_store(ptr, a);
6258 }
6259 STAGE_PP(load_dst, const uint16_t* ptr) {
6260 dr = sk_unaligned_load<U16>(ptr + 0*N);
6261 dg = sk_unaligned_load<U16>(ptr + 1*N);
6262 db = sk_unaligned_load<U16>(ptr + 2*N);
6263 da = sk_unaligned_load<U16>(ptr + 3*N);
6264 }
6265 STAGE_PP(store_dst, uint16_t* ptr) {
6266 sk_unaligned_store(ptr + 0*N, dr);
6267 sk_unaligned_store(ptr + 1*N, dg);
6268 sk_unaligned_store(ptr + 2*N, db);
6269 sk_unaligned_store(ptr + 3*N, da);
6270 }
6271
6272 // ~~~~~~ Coverage scales / lerps ~~~~~~ //
6273
6274 STAGE_PP(scale_1_float, const float* f) {
6275 U16 c = from_float(*f);
6276 r = div255( r * c );
6277 g = div255( g * c );
6278 b = div255( b * c );
6279 a = div255( a * c );
6280 }
6281 STAGE_PP(lerp_1_float, const float* f) {
6282 U16 c = from_float(*f);
6283 r = lerp(dr, r, c);
6284 g = lerp(dg, g, c);
6285 b = lerp(db, b, c);
6286 a = lerp(da, a, c);
6287 }
6288 STAGE_PP(scale_native, const uint16_t scales[]) {
6289 auto c = sk_unaligned_load<U16>(scales);
6290 r = div255( r * c );
6291 g = div255( g * c );
6292 b = div255( b * c );
6293 a = div255( a * c );
6294 }
6295
6296 STAGE_PP(lerp_native, const uint16_t scales[]) {
6297 auto c = sk_unaligned_load<U16>(scales);
6298 r = lerp(dr, r, c);
6299 g = lerp(dg, g, c);
6300 b = lerp(db, b, c);
6301 a = lerp(da, a, c);
6302 }
6303
6304 STAGE_PP(scale_u8, const SkRasterPipeline_MemoryCtx* ctx) {
6305 U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy));
6306 r = div255( r * c );
6307 g = div255( g * c );
6308 b = div255( b * c );
6309 a = div255( a * c );
6310 }
6311 STAGE_PP(lerp_u8, const SkRasterPipeline_MemoryCtx* ctx) {
6312 U16 c = load_8(ptr_at_xy<const uint8_t>(ctx, dx,dy));
6313 r = lerp(dr, r, c);
6314 g = lerp(dg, g, c);
6315 b = lerp(db, b, c);
6316 a = lerp(da, a, c);
6317 }
6318
6319 // Derive alpha's coverage from rgb coverage and the values of src and dst alpha.
6320 SI U16 alpha_coverage_from_rgb_coverage(U16 a, U16 da, U16 cr, U16 cg, U16 cb) {
6321 return if_then_else(a < da, min(cr, min(cg,cb))
6322 , max(cr, max(cg,cb)));
6323 }
6324 STAGE_PP(scale_565, const SkRasterPipeline_MemoryCtx* ctx) {
6325 U16 cr,cg,cb;
6326 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &cr,&cg,&cb);
6327 U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
6328
6329 r = div255( r * cr );
6330 g = div255( g * cg );
6331 b = div255( b * cb );
6332 a = div255( a * ca );
6333 }
6334 STAGE_PP(lerp_565, const SkRasterPipeline_MemoryCtx* ctx) {
6335 U16 cr,cg,cb;
6336 load_565_(ptr_at_xy<const uint16_t>(ctx, dx,dy), &cr,&cg,&cb);
6337 U16 ca = alpha_coverage_from_rgb_coverage(a,da, cr,cg,cb);
6338
6339 r = lerp(dr, r, cr);
6340 g = lerp(dg, g, cg);
6341 b = lerp(db, b, cb);
6342 a = lerp(da, a, ca);
6343 }
6344
6345 STAGE_PP(emboss, const SkRasterPipeline_EmbossCtx* ctx) {
6346 U16 mul = load_8(ptr_at_xy<const uint8_t>(&ctx->mul, dx,dy)),
6347 add = load_8(ptr_at_xy<const uint8_t>(&ctx->add, dx,dy));
6348
6349 r = min(div255(r*mul) + add, a);
6350 g = min(div255(g*mul) + add, a);
6351 b = min(div255(b*mul) + add, a);
6352 }
6353
6354
6355 // ~~~~~~ Gradient stages ~~~~~~ //
6356
6357 // Clamp x to [0,1], both sides inclusive (think, gradients).
6358 // Even repeat and mirror funnel through a clamp to handle bad inputs like +Inf, NaN.
6359 SI F clamp_01_(F v) { return min(max(0, v), 1); }
6360
6361 STAGE_GG(clamp_x_1 , NoCtx) { x = clamp_01_(x); }
6362 STAGE_GG(repeat_x_1, NoCtx) { x = clamp_01_(x - floor_(x)); }
6363 STAGE_GG(mirror_x_1, NoCtx) {
6364 auto two = [](F x){ return x+x; };
6365 x = clamp_01_(abs_( (x-1.0f) - two(floor_((x-1.0f)*0.5f)) - 1.0f ));
6366 }
6367
6368 SI I16 cond_to_mask_16(I32 cond) { return cast<I16>(cond); }
6369
6370 STAGE_GG(decal_x, SkRasterPipeline_DecalTileCtx* ctx) {
6371 auto w = ctx->limit_x;
6372 sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w)));
6373 }
6374 STAGE_GG(decal_y, SkRasterPipeline_DecalTileCtx* ctx) {
6375 auto h = ctx->limit_y;
6376 sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= y) & (y < h)));
6377 }
6378 STAGE_GG(decal_x_and_y, SkRasterPipeline_DecalTileCtx* ctx) {
6379 auto w = ctx->limit_x;
6380 auto h = ctx->limit_y;
6381 sk_unaligned_store(ctx->mask, cond_to_mask_16((0 <= x) & (x < w) & (0 <= y) & (y < h)));
6382 }
6383 STAGE_GG(clamp_x_and_y, SkRasterPipeline_CoordClampCtx* ctx) {
6384 x = min(ctx->max_x, max(ctx->min_x, x));
6385 y = min(ctx->max_y, max(ctx->min_y, y));
6386 }
6387 STAGE_PP(check_decal_mask, SkRasterPipeline_DecalTileCtx* ctx) {
6388 auto mask = sk_unaligned_load<U16>(ctx->mask);
6389 r = r & mask;
6390 g = g & mask;
6391 b = b & mask;
6392 a = a & mask;
6393 }
6394
6395 SI void round_F_to_U16(F R, F G, F B, F A, U16* r, U16* g, U16* b, U16* a) {
6396 auto round_color = [](F x) { return cast<U16>(x * 255.0f + 0.5f); };
6397
6398 *r = round_color(min(max(0, R), 1));
6399 *g = round_color(min(max(0, G), 1));
6400 *b = round_color(min(max(0, B), 1));
6401 *a = round_color(A); // we assume alpha is already in [0,1].
6402 }
6403
6404 SI void gradient_lookup(const SkRasterPipeline_GradientCtx* c, U32 idx, F t,
6405 U16* r, U16* g, U16* b, U16* a) {
6406
6407 F fr, fg, fb, fa, br, bg, bb, ba;
6408 #if defined(SKRP_CPU_HSW)
6409 if (c->stopCount <=8) {
6410 __m256i lo, hi;
6411 split(idx, &lo, &hi);
6412
6413 fr = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), lo),
6414 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[0]), hi));
6415 br = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), lo),
6416 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[0]), hi));
6417 fg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), lo),
6418 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[1]), hi));
6419 bg = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), lo),
6420 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[1]), hi));
6421 fb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), lo),
6422 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[2]), hi));
6423 bb = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), lo),
6424 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[2]), hi));
6425 fa = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), lo),
6426 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->fs[3]), hi));
6427 ba = join<F>(_mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), lo),
6428 _mm256_permutevar8x32_ps(_mm256_loadu_ps(c->bs[3]), hi));
6429 } else
6430 #elif defined(SKRP_CPU_LASX)
6431 if (c->stopCount <= 8) {
6432 __m256i lo, hi;
6433 split(idx, &lo, &hi);
6434
6435 fr = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[0], 0), lo),
6436 (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[0], 0), hi));
6437 br = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[0], 0), lo),
6438 (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[0], 0), hi));
6439 fg = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[1], 0), lo),
6440 (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[1], 0), hi));
6441 bg = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[1], 0), lo),
6442 (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[1], 0), hi));
6443 fb = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[2], 0), lo),
6444 (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[2], 0), hi));
6445 bb = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[2], 0), lo),
6446 (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[2], 0), hi));
6447 fa = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[3], 0), lo),
6448 (__m256)__lasx_xvperm_w(__lasx_xvld(c->fs[3], 0), hi));
6449 ba = join<F>((__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[3], 0), lo),
6450 (__m256)__lasx_xvperm_w(__lasx_xvld(c->bs[3], 0), hi));
6451 } else
6452 #elif defined(SKRP_CPU_LSX)
6453 if (c->stopCount <= 4) {
6454 __m128i lo, hi;
6455 split(idx, &lo, &hi);
6456 __m128i zero = __lsx_vldi(0);
6457 fr = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->fs[0], 0)),
6458 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->fs[0], 0)));
6459 br = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->bs[0], 0)),
6460 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->bs[0], 0)));
6461 fg = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->fs[1], 0)),
6462 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->fs[1], 0)));
6463 bg = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->bs[1], 0)),
6464 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->bs[1], 0)));
6465 fb = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->fs[2], 0)),
6466 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->fs[2], 0)));
6467 bb = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->bs[2], 0)),
6468 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->bs[2], 0)));
6469 fa = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->fs[3], 0)),
6470 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->fs[3], 0)));
6471 ba = join<F>((__m128)__lsx_vshuf_w(lo, zero, __lsx_vld(c->bs[3], 0)),
6472 (__m128)__lsx_vshuf_w(hi, zero, __lsx_vld(c->bs[3], 0)));
6473 } else
6474 #endif
6475 {
6476 fr = gather<F>(c->fs[0], idx);
6477 fg = gather<F>(c->fs[1], idx);
6478 fb = gather<F>(c->fs[2], idx);
6479 fa = gather<F>(c->fs[3], idx);
6480 br = gather<F>(c->bs[0], idx);
6481 bg = gather<F>(c->bs[1], idx);
6482 bb = gather<F>(c->bs[2], idx);
6483 ba = gather<F>(c->bs[3], idx);
6484 }
6485 round_F_to_U16(mad(t, fr, br),
6486 mad(t, fg, bg),
6487 mad(t, fb, bb),
6488 mad(t, fa, ba),
6489 r,g,b,a);
6490 }
6491
6492 STAGE_GP(gradient, const SkRasterPipeline_GradientCtx* c) {
6493 auto t = x;
6494 U32 idx = U32_(0);
6495
6496 // N.B. The loop starts at 1 because idx 0 is the color to use before the first stop.
6497 for (size_t i = 1; i < c->stopCount; i++) {
6498 idx += if_then_else(t >= c->ts[i], U32_(1), U32_(0));
6499 }
6500
6501 gradient_lookup(c, idx, t, &r, &g, &b, &a);
6502 }
6503
6504 STAGE_GP(evenly_spaced_gradient, const SkRasterPipeline_GradientCtx* c) {
6505 auto t = x;
6506 auto idx = trunc_(t * static_cast<float>(c->stopCount-1));
6507 gradient_lookup(c, idx, t, &r, &g, &b, &a);
6508 }
6509
6510 STAGE_GP(evenly_spaced_2_stop_gradient, const SkRasterPipeline_EvenlySpaced2StopGradientCtx* c) {
6511 auto t = x;
6512 round_F_to_U16(mad(t, c->f[0], c->b[0]),
6513 mad(t, c->f[1], c->b[1]),
6514 mad(t, c->f[2], c->b[2]),
6515 mad(t, c->f[3], c->b[3]),
6516 &r,&g,&b,&a);
6517 }
6518
6519 STAGE_GP(bilerp_clamp_8888, const SkRasterPipeline_GatherCtx* ctx) {
6520     // Quantize the sample point and transform it into lerp coordinates, converting them to
6521     // 16.16 fixed-point numbers.
6522 #if defined(SKRP_CPU_LSX)
6523 __m128 _01, _23, _45, _67;
6524 v4f32 v_tmp1 = {0.5f, 0.5f, 0.5f, 0.5f};
6525 v4f32 v_tmp2 = {65536.0f, 65536.0f, 65536.0f, 65536.0f};
6526 split(x, &_01,&_23);
6527 split(y, &_45,&_67);
6528 __m128 val1 = __lsx_vfmadd_s((__m128)v_tmp2, _01, (__m128)v_tmp1);
6529 __m128 val2 = __lsx_vfmadd_s((__m128)v_tmp2, _23, (__m128)v_tmp1);
6530 __m128 val3 = __lsx_vfmadd_s((__m128)v_tmp2, _45, (__m128)v_tmp1);
6531 __m128 val4 = __lsx_vfmadd_s((__m128)v_tmp2, _67, (__m128)v_tmp1);
6532 I32 qx = cast<I32>((join<F>(__lsx_vfrintrm_s(val1), __lsx_vfrintrm_s(val2)))) - 32768,
6533 qy = cast<I32>((join<F>(__lsx_vfrintrm_s(val3), __lsx_vfrintrm_s(val4)))) - 32768;
6534 #else
6535 I32 qx = cast<I32>(floor_(65536.0f * x + 0.5f)) - 32768,
6536 qy = cast<I32>(floor_(65536.0f * y + 0.5f)) - 32768;
6537 #endif
6538
6539 // Calculate screen coordinates sx & sy by flooring qx and qy.
6540 I32 sx = qx >> 16,
6541 sy = qy >> 16;
6542
6543     // We are going to perform a change of parameters for qx on [0, 1) to tx on [-1, 1).
6544     // This will put tx in Q15 format for use with scaled_mult.
6545     // Calculate tx and ty on the interval [-1, 1). Given that {qx} and {qy} are on the
6546     // interval [0, 1), where {v} is fract(v), we can transform to tx in the following
6547     // manner (ty follows the same math):
6548     //    tx = 2 * {qx} - 1, so
6549     //    {qx} = (tx + 1) / 2.
6550     // Calculate 2 * {qx} - 1 and 2 * {qy} - 1, where the {} operation and the factor of 2
6551     // are handled by the truncating cast, the - 1 is handled by the ^ 0x8000, and the
6552     // division by 2 is deferred to lerpX and lerpY in order to use the full 16-bit resolution.
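// For example, {qx} = 0.25 has low 16 bits 0x4000; 0x4000 ^ 0x8000 = 0xC000, which read as a
// signed Q15 value is -0.5 = 2 * 0.25 - 1, exactly the tx described above.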
6553 #if defined(SKRP_CPU_LSX)
6554 __m128i qx_lo, qx_hi, qy_lo, qy_hi;
6555 split(qx, &qx_lo, &qx_hi);
6556 split(qy, &qy_lo, &qy_hi);
6557 __m128i temp = __lsx_vreplgr2vr_w(0x8000);
6558 qx_lo = __lsx_vxor_v(qx_lo, temp);
6559 qx_hi = __lsx_vxor_v(qx_hi, temp);
6560 qy_lo = __lsx_vxor_v(qy_lo, temp);
6561 qy_hi = __lsx_vxor_v(qy_hi, temp);
6562
6563 I16 tx = __lsx_vpickev_h(qx_hi, qx_lo);
6564 I16 ty = __lsx_vpickev_h(qy_hi, qy_lo);
6565 #else
6566 I16 tx = cast<I16>(qx ^ 0x8000),
6567 ty = cast<I16>(qy ^ 0x8000);
6568 #endif
6569
6570     // Substituting the expression for {qx} in terms of tx from above into the lerp equation, where v is
6571 // the lerped value:
6572 // v = {qx}*(R - L) + L,
6573 // v = 1/2*(tx + 1)*(R - L) + L
6574 // 2 * v = (tx + 1)*(R - L) + 2*L
6575 // = tx*R - tx*L + R - L + 2*L
6576 // = tx*(R - L) + (R + L).
6577 // Since R and L are on [0, 255] we need them on the interval [0, 1/2] to get them into form
6578     // for scaled_mult. If L and R were in 16.16 format, this would be done by dividing by 2^9. In
6579 // code, we can multiply by 2^7 to get the value directly.
6580 // 2 * v = tx*(R - L) + (R + L)
6581 // 2^-9 * 2 * v = tx*(R - L)*2^-9 + (R + L)*2^-9
6582 // 2^-8 * v = 2^-9 * (tx*(R - L) + (R + L))
6583 // v = 1/2 * (tx*(R - L) + (R + L))
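// Worked example: L = 100, R = 200, tx = 0 (i.e. {qx} = 0.5): width = 100 << 7 = 12800,
// middle = 300 << 7 = 38400, scaled_mult(0, 12800) = 0, and (0 + 38400 + 1) >> 1 = 19200,
// which is the expected midpoint 150 still scaled by 2^7, ready for lerpY below.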
6584 auto lerpX = [&](U16 left, U16 right) -> U16 {
6585 I16 width = (I16)(right - left) << 7;
6586 U16 middle = (right + left) << 7;
6587         // The constrained_add is the most subtle part of lerp. The first term is on the interval
6588         // [-1, 1), and the second term is on the interval [0, 1); both terms are too high by a
6589         // factor of 2, which will be handled below. (Both R and L are on [0, 1/2), but the sum
6590         // R + L is on the interval [0, 1).) Generally, a sum like this could overflow, but
6591         // because we know the sum produces an output on the interval [0, 1) we know that the
6592         // extra bit that would be needed will always be 0. So we need to be careful to treat
6593         // this sum as an unsigned positive number in the divide by 2 below. Add +1 for
6594         // rounding.
6595 U16 v2 = constrained_add(scaled_mult(tx, width), middle) + 1;
6596 // Divide by 2 to calculate v and at the same time bring the intermediate value onto the
6597 // interval [0, 1/2] to set up for the lerpY.
6598 return v2 >> 1;
6599 };
6600
6601 const uint32_t* ptr;
6602 U32 ix = ix_and_ptr(&ptr, ctx, sx, sy);
6603 U16 leftR, leftG, leftB, leftA;
6604 from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
6605
6606 ix = ix_and_ptr(&ptr, ctx, sx+1, sy);
6607 U16 rightR, rightG, rightB, rightA;
6608 from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
6609
6610 U16 topR = lerpX(leftR, rightR),
6611 topG = lerpX(leftG, rightG),
6612 topB = lerpX(leftB, rightB),
6613 topA = lerpX(leftA, rightA);
6614
6615 ix = ix_and_ptr(&ptr, ctx, sx, sy+1);
6616 from_8888(gather<U32>(ptr, ix), &leftR,&leftG,&leftB,&leftA);
6617
6618 ix = ix_and_ptr(&ptr, ctx, sx+1, sy+1);
6619 from_8888(gather<U32>(ptr, ix), &rightR,&rightG,&rightB,&rightA);
6620
6621 U16 bottomR = lerpX(leftR, rightR),
6622 bottomG = lerpX(leftG, rightG),
6623 bottomB = lerpX(leftB, rightB),
6624 bottomA = lerpX(leftA, rightA);
6625
6626 // lerpY plays the same mathematical tricks as lerpX, but the final divide is by 256 resulting
6627 // in a value on [0, 255].
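// Continuing the example above, top = bottom = 19200 (an x-lerp result of 150): width is 0, so
// for any ty the blend is (0 + 38400 + 0x80) >> 8 = 150, the final 8-bit channel value.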
6628 auto lerpY = [&](U16 top, U16 bottom) -> U16 {
6629 I16 width = (I16)bottom - (I16)top;
6630 U16 middle = bottom + top;
6631 // Add + 0x80 for rounding.
6632 U16 blend = constrained_add(scaled_mult(ty, width), middle) + 0x80;
6633
6634 return blend >> 8;
6635 };
6636
6637 r = lerpY(topR, bottomR);
6638 g = lerpY(topG, bottomG);
6639 b = lerpY(topB, bottomB);
6640 a = lerpY(topA, bottomA);
6641 }
6642
6643 STAGE_GG(xy_to_unit_angle, NoCtx) {
6644 F xabs = abs_(x),
6645 yabs = abs_(y);
6646
6647 F slope = min(xabs, yabs)/max(xabs, yabs);
6648 F s = slope * slope;
6649
6650 // Use a 7th degree polynomial to approximate atan.
6651 // This was generated using sollya.gforge.inria.fr.
6652 // A float optimized polynomial was generated using the following command.
6653 // P1 = fpminimax((1/(2*Pi))*atan(x),[|1,3,5,7|],[|24...|],[2^(-40),1],relative);
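// Sanity check: at slope = 1 the polynomial evaluates to ~0.12497, i.e. about 1/8 of a turn
// (45 degrees), matching atan(1)/(2*pi).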
6654 F phi = slope
6655 * (0.15912117063999176025390625f + s
6656 * (-5.185396969318389892578125e-2f + s
6657 * (2.476101927459239959716796875e-2f + s
6658 * (-7.0547382347285747528076171875e-3f))));
6659
6660 phi = if_then_else(xabs < yabs, 1.0f/4.0f - phi, phi);
6661 phi = if_then_else(x < 0.0f , 1.0f/2.0f - phi, phi);
6662 phi = if_then_else(y < 0.0f , 1.0f - phi , phi);
6663 phi = if_then_else(phi != phi , 0 , phi); // Check for NaN.
6664 x = phi;
6665 }
6666 STAGE_GG(xy_to_radius, NoCtx) {
6667 x = sqrt_(x*x + y*y);
6668 }
6669
6670 // ~~~~~~ Compound stages ~~~~~~ //
6671
6672 STAGE_PP(srcover_rgba_8888, const SkRasterPipeline_MemoryCtx* ctx) {
6673 auto ptr = ptr_at_xy<uint32_t>(ctx, dx,dy);
6674
6675 load_8888_(ptr, &dr,&dg,&db,&da);
6676 r = r + div255( dr*inv(a) );
6677 g = g + div255( dg*inv(a) );
6678 b = b + div255( db*inv(a) );
6679 a = a + div255( da*inv(a) );
6680 store_8888_(ptr, r,g,b,a);
6681 }
6682
6683 // ~~~~~~ skgpu::Swizzle stage ~~~~~~ //
6684
6685 STAGE_PP(swizzle, void* ctx) {
6686 auto ir = r, ig = g, ib = b, ia = a;
6687 U16* o[] = {&r, &g, &b, &a};
6688 char swiz[4];
6689 memcpy(swiz, &ctx, sizeof(swiz));
6690
6691 for (int i = 0; i < 4; ++i) {
6692 switch (swiz[i]) {
6693 case 'r': *o[i] = ir; break;
6694 case 'g': *o[i] = ig; break;
6695 case 'b': *o[i] = ib; break;
6696 case 'a': *o[i] = ia; break;
6697 case '0': *o[i] = U16_0; break;
6698 case '1': *o[i] = U16_255; break;
6699 default: break;
6700 }
6701 }
6702 }
6703
6704 #endif//defined(SKRP_CPU_SCALAR) controlling whether we build lowp stages
6705 } // namespace lowp
6706
6707 /* This gives us SK_OPTS::lowp::N if lowp::N has been set, or SK_OPTS::N if it hasn't. */
6708 namespace lowp { static constexpr size_t lowp_N = N; }
6709
6710 /** Allow outside code to access the Raster Pipeline pixel stride. */
6711 constexpr size_t raster_pipeline_lowp_stride() { return lowp::lowp_N; }
6712 constexpr size_t raster_pipeline_highp_stride() { return N; }
6713
6714 } // namespace SK_OPTS_NS
6715
6716 #undef SI
6717
6718 #endif//SkRasterPipeline_opts_DEFINED
6719