/*
 * Copyright 2018 Google Inc.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

// Intentionally NO #pragma once... included multiple times.

// This file is included from skcms.cc in a namespace with some pre-defines:
//    - N:    SIMD width of all vectors; 1, 4, 8 or 16 (preprocessor define)
//    - V<T>: a template to create a vector of N T's.

using F   = V<float>;
using I32 = V<int32_t>;
using U64 = V<uint64_t>;
using U32 = V<uint32_t>;
using U16 = V<uint16_t>;
using U8  = V<uint8_t>;

#if defined(__GNUC__) && !defined(__clang__)
    // GCC is kind of weird, not allowing vector = scalar directly.
    static constexpr F F0 = F() + 0.0f,
                       F1 = F() + 1.0f,
                       FInfBits = F() + 0x7f800000; // equals 2139095040, the bit pattern of +Inf
#else
    static constexpr F F0 = 0.0f,
                       F1 = 1.0f,
                       FInfBits = 0x7f800000; // equals 2139095040, the bit pattern of +Inf
#endif

// Instead of checking __AVX__ below, we'll check USING_AVX.
// This lets skcms.cc set USING_AVX to force us in even if the compiler's not set that way.
// Same deal for __F16C__ and __AVX2__ ~~~> USING_AVX_F16C, USING_AVX2.

#if !defined(USING_AVX)      && N == 8 && defined(__AVX__)
    #define  USING_AVX
#endif
#if !defined(USING_AVX_F16C) && defined(USING_AVX) && defined(__F16C__)
    #define  USING_AVX_F16C
#endif
#if !defined(USING_AVX2)     && defined(USING_AVX) && defined(__AVX2__)
    #define  USING_AVX2
#endif
#if !defined(USING_AVX512F)  && N == 16 && defined(__AVX512F__) && defined(__AVX512DQ__)
    #define  USING_AVX512F
#endif

// Similar to the AVX+ features, we define USING_NEON and USING_NEON_F16C.
// This is more for organizational clarity... skcms.cc doesn't force these.
#if N > 1 && defined(__ARM_NEON)
    #define USING_NEON

    // We have to use two different mechanisms to enable the f16 conversion intrinsics:
    #if defined(__clang__)
        // Clang's arm_neon.h guards them with the FP hardware bit:
        #if __ARM_FP & 2
            #define USING_NEON_F16C
        #endif
    #elif defined(__GNUC__)
        // GCC's arm_neon.h guards them with the FP16 format macros (IEEE and ALTERNATIVE).
        // We don't actually want the alternative format - we're reading/writing IEEE f16 values.
        #if defined(__ARM_FP16_FORMAT_IEEE)
            #define USING_NEON_F16C
        #endif
    #endif
#endif

// These -Wvector-conversion warnings seem to trigger in very bogus situations,
// like vst3q_f32() expecting a 16x char rather than a 4x float vector.  :/
#if defined(USING_NEON) && defined(__clang__)
    #pragma clang diagnostic ignored "-Wvector-conversion"
#endif

75 // GCC & Clang (but not clang-cl) warn returning U64 on x86 is larger than a register.
76 // You'd see warnings like, "using AVX even though AVX is not enabled".
77 // We stifle these warnings; our helpers that return U64 are always inlined.
#if defined(__SSE__) && defined(__GNUC__)
    #if !defined(__has_warning)
        #pragma GCC diagnostic ignored "-Wpsabi"
    #elif __has_warning("-Wpsabi")
        #pragma GCC diagnostic ignored "-Wpsabi"
    #endif
#endif

// We tag most helper functions as SI, to enforce good code generation
// but also work around what we think is a bug in GCC: when targeting 32-bit
// x86, GCC tends to pass U16 (4x uint16_t vector) function arguments in the
// MMX mm0 register, which seems to mess with unrelated code that later uses
// x87 FP instructions (MMX's mm0 is an alias for x87's st0 register).
#if defined(__clang__) || defined(__GNUC__)
    #define SI static inline __attribute__((always_inline))
#else
    #define SI static inline
#endif

template <typename T, typename P>
SI T load(const P* ptr) {
    T val;
    memcpy(&val, ptr, sizeof(val));
    return val;
}
template <typename T, typename P>
SI void store(P* ptr, const T& val) {
    memcpy(ptr, &val, sizeof(val));
}

// (T)v is a cast when N == 1 and a bit-pun when N>1,
// so we use cast<T>(v) to actually cast or bit_pun<T>(v) to bit-pun.
template <typename D, typename S>
SI D cast(const S& v) {
#if N == 1
    return (D)v;
#elif defined(__clang__)
    return __builtin_convertvector(v, D);
#else
    D d;
    for (int i = 0; i < N; i++) {
        d[i] = v[i];
    }
    return d;
#endif
}

template <typename D, typename S>
SI D bit_pun(const S& v) {
    static_assert(sizeof(D) == sizeof(v), "");
    return load<D>(&v);
}

// When we convert from float to fixed point, it's very common to want to round,
// and for some reason compilers generate better code when converting to int32_t.
// To serve both those ends, we use this function to_fixed() instead of direct cast().
SI U32 to_fixed(F f) {  return (U32)cast<I32>(f + 0.5f); }

// Sometimes we do something crazy on one branch of a conditional,
// like divide by zero or convert a huge float to an integer,
// but then harmlessly select the other side.  That trips up N==1
// sanitizer builds, so we make if_then_else() a macro to avoid
// evaluating the unused side.

#if N == 1
    #define if_then_else(cond, t, e) ((cond) ? (t) : (e))
#else
    template <typename C, typename T>
    SI T if_then_else(C cond, T t, T e) {
        return bit_pun<T>( ( cond & bit_pun<C>(t)) |
                           (~cond & bit_pun<C>(e)) );
    }
#endif

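// In the vector branch of if_then_else() above, `cond` comes from a vector comparison,
// so each lane is all-1s (true) or all-0s (false); the AND/OR blend then picks t or e
// per lane.  E.g. with N == 4, comparing {1,2,3,4} < {3,3,3,3} yields lanes {-1,-1,0,0},
// which selects t in the first two lanes and e in the last two.
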
SI F F_from_Half(U16 half) {
#if defined(USING_NEON_F16C)
    return vcvt_f32_f16((float16x4_t)half);
#elif defined(USING_AVX512F)
    return (F)_mm512_cvtph_ps((__m256i)half);
#elif defined(USING_AVX_F16C)
    typedef int16_t __attribute__((vector_size(16))) I16;
    return __builtin_ia32_vcvtph2ps256((I16)half);
#else
    U32 wide = cast<U32>(half);
    // A half is 1-5-10 sign-exponent-mantissa, with 15 exponent bias.
    U32 s  = wide & 0x8000,
        em = wide ^ s;

    // Constructing the float is easy if the half is not denormalized.
    F norm = bit_pun<F>( (s<<16) + (em<<13) + ((127-15)<<23) );

    // Simply flush all denorm half floats to zero.
    return if_then_else(em < 0x0400, F0, norm);
#endif
}

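// Worked example for the portable path of F_from_Half(): the half 0x3c00 (1.0) has
// s == 0 and em == 0x3c00, so (em<<13) + ((127-15)<<23) == 0x07800000 + 0x38000000
// == 0x3f800000, which is exactly 1.0f.  The denorm check doesn't fire (0x3c00 >= 0x0400).
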
#if defined(__clang__)
    // The -((127-15)<<10) underflows that side of the math when
    // we pass a denorm half float.  It's harmless... we'll take the 0 side anyway.
    __attribute__((no_sanitize("unsigned-integer-overflow")))
#endif
SI U16 Half_from_F(F f) {
#if defined(USING_NEON_F16C)
    return (U16)vcvt_f16_f32(f);
#elif defined(USING_AVX512F)
    return (U16)_mm512_cvtps_ph((__m512 )f, _MM_FROUND_CUR_DIRECTION );
#elif defined(USING_AVX_F16C)
    return (U16)__builtin_ia32_vcvtps2ph256(f, 0x04/*_MM_FROUND_CUR_DIRECTION*/);
#else
    // A float is 1-8-23 sign-exponent-mantissa, with 127 exponent bias.
    U32 sem = bit_pun<U32>(f),
        s   = sem & 0x80000000,
         em = sem ^ s;

    // For simplicity we flush denorm half floats (including all denorm floats) to zero.
    return cast<U16>(if_then_else(em < 0x38800000, (U32)F0
                                                 , (s>>16) + (em>>13) - ((127-15)<<10)));
#endif
}

// Swap high and low bytes of 16-bit lanes, converting between big-endian and little-endian.
#if defined(USING_NEON)
    SI U16 swap_endian_16(U16 v) {
        return (U16)vrev16_u8((uint8x8_t) v);
    }
#endif

SI U64 swap_endian_16x4(const U64& rgba) {
    return (rgba & 0x00ff00ff00ff00ff) << 8
         | (rgba & 0xff00ff00ff00ff00) >> 8;
}

#if defined(USING_NEON)
    SI F min_(F x, F y) { return (F)vminq_f32((float32x4_t)x, (float32x4_t)y); }
    SI F max_(F x, F y) { return (F)vmaxq_f32((float32x4_t)x, (float32x4_t)y); }
#elif defined(__loongarch_sx)
    SI F min_(F x, F y) { return (F)__lsx_vfmin_s(x, y); }
    SI F max_(F x, F y) { return (F)__lsx_vfmax_s(x, y); }
#else
    SI F min_(F x, F y) { return if_then_else(x > y, y, x); }
    SI F max_(F x, F y) { return if_then_else(x < y, y, x); }
#endif

SI F floor_(F x) {
#if N == 1
    return floorf_(x);
#elif defined(__aarch64__)
    return vrndmq_f32(x);
#elif defined(USING_AVX512F)
    // Clang's _mm512_floor_ps() passes its mask as -1, not (__mmask16)-1,
    // and integer sanitizer catches that this implicit cast changes the
    // value from -1 to 65535.  We'll cast manually to work around it.
    // Read this as `return _mm512_floor_ps(x)`.
    return _mm512_mask_floor_ps(x, (__mmask16)-1, x);
#elif defined(USING_AVX)
    return __builtin_ia32_roundps256(x, 0x01/*_MM_FROUND_FLOOR*/);
#elif defined(__SSE4_1__)
    return _mm_floor_ps(x);
#elif defined(__loongarch_sx)
    return __lsx_vfrintrm_s((__m128)x);
#else
    // Round trip through integers with a truncating cast.
    F roundtrip = cast<F>(cast<I32>(x));
    // If x is negative, truncating gives the ceiling instead of the floor.
    return roundtrip - if_then_else(roundtrip > x, F1, F0);

    // This implementation fails for values of x that are outside
    // the range an integer can represent.  We expect most x to be small.
#endif
}

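// Example of floor_()'s portable fallback: for x == -1.5f the truncating cast gives
// roundtrip == -1.0f, which is greater than x, so we subtract 1 and return -2.0f.
// For x == 1.5f, roundtrip == 1.0f is not greater than x, so 1.0f is returned unchanged.
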
SI F approx_log2(F x) {
    // The first approximation of log2(x) is its exponent 'e', minus 127.
    I32 bits = bit_pun<I32>(x);

    F e = cast<F>(bits) * (1.0f / (1<<23));

    // If we use the mantissa too we can refine the error significantly.
    F m = bit_pun<F>( (bits & 0x007fffff) | 0x3f000000 );

    return e - 124.225514990f
             -   1.498030302f*m
             -   1.725879990f/(0.3520887068f + m);
}

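// Sanity check for approx_log2(): at x == 1.0f, bits == 0x3f800000 so e == 127.0, and
// m == 0.5 (the mantissa bits are zero, OR'd with 0x3f000000).  The expression then
// evaluates to roughly 127 - 124.2255 - 0.7490 - 1.7259/0.8521 ~= 0, matching log2(1).
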
SI F approx_log(F x) {
    const float ln2 = 0.69314718f;
    return ln2 * approx_log2(x);
}

SI F approx_exp2(F x) {
    F fract = x - floor_(x);

    F fbits = (1.0f * (1<<23)) * (x + 121.274057500f
                                    -   1.490129070f*fract
                                    +  27.728023300f/(4.84252568f - fract));
    I32 bits = cast<I32>(min_(max_(fbits, F0), FInfBits));

    return bit_pun<F>(bits);
}

SI F approx_pow(F x, float y) {
    return if_then_else((x == F0) | (x == F1), x
                                             , approx_exp2(approx_log2(x) * y));
}

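// Note on approx_pow(): x == 0 and x == 1 are returned as-is rather than run through
// exp2(log2(x)*y).  That avoids evaluating log2(0) (which is -inf) and keeps the common
// endpoints 0 and 1 exact instead of merely approximate.
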
SI F approx_exp(F x) {
    const float log2_e = 1.4426950408889634074f;
    return approx_exp2(log2_e * x);
}

SI F strip_sign(F x, U32* sign) {
    U32 bits = bit_pun<U32>(x);
    *sign = bits & 0x80000000;
    return bit_pun<F>(bits ^ *sign);
}

SI F apply_sign(F x, U32 sign) {
    return bit_pun<F>(sign | bit_pun<U32>(x));
}

// Return tf(x).
SI F apply_tf(const skcms_TransferFunction* tf, F x) {
    // Peel off the sign bit and set x = |x|.
    U32 sign;
    x = strip_sign(x, &sign);

    // The transfer function has a linear part up to d, exponential at d and after.
    F v = if_then_else(x < tf->d,            tf->c*x + tf->f
                                , approx_pow(tf->a*x + tf->b, tf->g) + tf->e);

    // Tack the sign bit back on.
    return apply_sign(v, sign);
}

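// For reference, a familiar instance of this 7-parameter form is sRGB, roughly
// g=2.4, a=1/1.055, b=0.055/1.055, c=1/12.92, d=0.04045, e=0, f=0:
// a linear segment below d, a scaled power curve at and above it.
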
// Return the gamma function (|x|^G with the original sign re-applied to x).
SI F apply_gamma(const skcms_TransferFunction* tf, F x) {
    U32 sign;
    x = strip_sign(x, &sign);
    return apply_sign(approx_pow(x, tf->g), sign);
}

SI F apply_pq(const skcms_TransferFunction* tf, F x) {
    U32 bits = bit_pun<U32>(x),
        sign = bits & 0x80000000;
    x = bit_pun<F>(bits ^ sign);

    F v = approx_pow(max_(tf->a + tf->b * approx_pow(x, tf->c), F0)
                       / (tf->d + tf->e * approx_pow(x, tf->c)),
                     tf->f);

    return bit_pun<F>(sign | bit_pun<U32>(v));
}

SI F apply_hlg(const skcms_TransferFunction* tf, F x) {
    const float R = tf->a, G = tf->b,
                a = tf->c, b = tf->d, c = tf->e,
                K = tf->f + 1;
    U32 bits = bit_pun<U32>(x),
        sign = bits & 0x80000000;
    x = bit_pun<F>(bits ^ sign);

    F v = if_then_else(x*R <= 1, approx_pow(x*R, G)
                               , approx_exp((x-c)*a) + b);

    return K*bit_pun<F>(sign | bit_pun<U32>(v));
}

SI F apply_hlginv(const skcms_TransferFunction* tf, F x) {
    const float R = tf->a, G = tf->b,
                a = tf->c, b = tf->d, c = tf->e,
                K = tf->f + 1;
    U32 bits = bit_pun<U32>(x),
        sign = bits & 0x80000000;
    x = bit_pun<F>(bits ^ sign);
    x /= K;

    F v = if_then_else(x <= 1, R * approx_pow(x, G)
                             , a * approx_log(x - b) + c);

    return bit_pun<F>(sign | bit_pun<U32>(v));
}


// Strided loads and stores of N values, starting from p.
template <typename T, typename P>
SI T load_3(const P* p) {
#if N == 1
    return (T)p[0];
#elif N == 4
    return T{p[ 0],p[ 3],p[ 6],p[ 9]};
#elif N == 8
    return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21]};
#elif N == 16
    return T{p[ 0],p[ 3],p[ 6],p[ 9], p[12],p[15],p[18],p[21],
             p[24],p[27],p[30],p[33], p[36],p[39],p[42],p[45]};
#endif
}

template <typename T, typename P>
SI T load_4(const P* p) {
#if N == 1
    return (T)p[0];
#elif N == 4
    return T{p[ 0],p[ 4],p[ 8],p[12]};
#elif N == 8
    return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28]};
#elif N == 16
    return T{p[ 0],p[ 4],p[ 8],p[12], p[16],p[20],p[24],p[28],
             p[32],p[36],p[40],p[44], p[48],p[52],p[56],p[60]};
#endif
}

template <typename T, typename P>
SI void store_3(P* p, const T& v) {
#if N == 1
    p[0] = v;
#elif N == 4
    p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
#elif N == 8
    p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
    p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
#elif N == 16
    p[ 0] = v[ 0]; p[ 3] = v[ 1]; p[ 6] = v[ 2]; p[ 9] = v[ 3];
    p[12] = v[ 4]; p[15] = v[ 5]; p[18] = v[ 6]; p[21] = v[ 7];
    p[24] = v[ 8]; p[27] = v[ 9]; p[30] = v[10]; p[33] = v[11];
    p[36] = v[12]; p[39] = v[13]; p[42] = v[14]; p[45] = v[15];
#endif
}

template <typename T, typename P>
SI void store_4(P* p, const T& v) {
#if N == 1
    p[0] = v;
#elif N == 4
    p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
#elif N == 8
    p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
    p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
#elif N == 16
    p[ 0] = v[ 0]; p[ 4] = v[ 1]; p[ 8] = v[ 2]; p[12] = v[ 3];
    p[16] = v[ 4]; p[20] = v[ 5]; p[24] = v[ 6]; p[28] = v[ 7];
    p[32] = v[ 8]; p[36] = v[ 9]; p[40] = v[10]; p[44] = v[11];
    p[48] = v[12]; p[52] = v[13]; p[56] = v[14]; p[60] = v[15];
#endif
}


SI U8 gather_8(const uint8_t* p, I32 ix) {
#if N == 1
    U8 v = p[ix];
#elif N == 4
    U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]] };
#elif N == 8
    U8 v = { p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
             p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]] };
#elif N == 16
    U8 v = { p[ix[ 0]], p[ix[ 1]], p[ix[ 2]], p[ix[ 3]],
             p[ix[ 4]], p[ix[ 5]], p[ix[ 6]], p[ix[ 7]],
             p[ix[ 8]], p[ix[ 9]], p[ix[10]], p[ix[11]],
             p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]] };
#endif
    return v;
}

SI U16 gather_16(const uint8_t* p, I32 ix) {
    // Load the i'th 16-bit value from p.
    auto load_16 = [p](int i) {
        return load<uint16_t>(p + 2*i);
    };
#if N == 1
    U16 v = load_16(ix);
#elif N == 4
    U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]) };
#elif N == 8
    U16 v = { load_16(ix[0]), load_16(ix[1]), load_16(ix[2]), load_16(ix[3]),
              load_16(ix[4]), load_16(ix[5]), load_16(ix[6]), load_16(ix[7]) };
#elif N == 16
    U16 v = { load_16(ix[ 0]), load_16(ix[ 1]), load_16(ix[ 2]), load_16(ix[ 3]),
              load_16(ix[ 4]), load_16(ix[ 5]), load_16(ix[ 6]), load_16(ix[ 7]),
              load_16(ix[ 8]), load_16(ix[ 9]), load_16(ix[10]), load_16(ix[11]),
              load_16(ix[12]), load_16(ix[13]), load_16(ix[14]), load_16(ix[15]) };
#endif
    return v;
}

SI U32 gather_32(const uint8_t* p, I32 ix) {
    // Load the i'th 32-bit value from p.
    auto load_32 = [p](int i) {
        return load<uint32_t>(p + 4*i);
    };
#if N == 1
    U32 v = load_32(ix);
#elif N == 4
    U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]) };
#elif N == 8
    U32 v = { load_32(ix[0]), load_32(ix[1]), load_32(ix[2]), load_32(ix[3]),
              load_32(ix[4]), load_32(ix[5]), load_32(ix[6]), load_32(ix[7]) };
#elif N == 16
    U32 v = { load_32(ix[ 0]), load_32(ix[ 1]), load_32(ix[ 2]), load_32(ix[ 3]),
              load_32(ix[ 4]), load_32(ix[ 5]), load_32(ix[ 6]), load_32(ix[ 7]),
              load_32(ix[ 8]), load_32(ix[ 9]), load_32(ix[10]), load_32(ix[11]),
              load_32(ix[12]), load_32(ix[13]), load_32(ix[14]), load_32(ix[15]) };
#endif
    // TODO: AVX2 and AVX-512 gathers (c.f. gather_24).
    return v;
}

SI U32 gather_24(const uint8_t* p, I32 ix) {
    // First, back up a byte.  Any place we're gathering from has a safe junk byte to read
    // in front of it, either a previous table value, or some tag metadata.
    p -= 1;

    // Load the i'th 24-bit value from p, and 1 extra byte.
    auto load_24_32 = [p](int i) {
        return load<uint32_t>(p + 3*i);
    };

    // Now load multiples of 4 bytes (a junk byte, then r,g,b).
#if N == 1
    U32 v = load_24_32(ix);
#elif N == 4
    U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]) };
#elif N == 8 && !defined(USING_AVX2)
    U32 v = { load_24_32(ix[0]), load_24_32(ix[1]), load_24_32(ix[2]), load_24_32(ix[3]),
              load_24_32(ix[4]), load_24_32(ix[5]), load_24_32(ix[6]), load_24_32(ix[7]) };
#elif N == 8
    (void)load_24_32;
    // The gather instruction here doesn't need any particular alignment,
    // but the intrinsic takes a const int*.
    const int* p4 = bit_pun<const int*>(p);
    I32 zero = { 0, 0, 0, 0,  0, 0, 0, 0},
        mask = {-1,-1,-1,-1, -1,-1,-1,-1};
    #if defined(__clang__)
        U32 v = (U32)__builtin_ia32_gatherd_d256(zero, p4, 3*ix, mask, 1);
    #elif defined(__GNUC__)
        U32 v = (U32)__builtin_ia32_gathersiv8si(zero, p4, 3*ix, mask, 1);
    #endif
#elif N == 16
    (void)load_24_32;
    // The intrinsic is supposed to take const void* now, but it takes const int*, just like AVX2.
    // And AVX-512 swapped the order of arguments.  :/
    const int* p4 = bit_pun<const int*>(p);
    U32 v = (U32)_mm512_i32gather_epi32((__m512i)(3*ix), p4, 1);
#endif

    // Shift off the junk byte, leaving r,g,b in low 24 bits (and zero in the top 8).
    return v >> 8;
}

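// Note on gather_24() above: on a little-endian machine each 4-byte load lands as
// (junk | r<<8 | g<<16 | b<<24), so `v >> 8` leaves r in the low byte, matching how
// sample_clut_8() below unpacks (rgb >> 0) & 0xff as red.
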
#if !defined(__arm__)
    SI void gather_48(const uint8_t* p, I32 ix, U64* v) {
        // As in gather_24(), with everything doubled.
        p -= 2;

        // Load the i'th 48-bit value from p, and 2 extra bytes.
        auto load_48_64 = [p](int i) {
            return load<uint64_t>(p + 6*i);
        };

    #if N == 1
        *v = load_48_64(ix);
    #elif N == 4
        *v = U64{
            load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
        };
    #elif N == 8 && !defined(USING_AVX2)
        *v = U64{
            load_48_64(ix[0]), load_48_64(ix[1]), load_48_64(ix[2]), load_48_64(ix[3]),
            load_48_64(ix[4]), load_48_64(ix[5]), load_48_64(ix[6]), load_48_64(ix[7]),
        };
    #elif N == 8
        (void)load_48_64;
        typedef int32_t   __attribute__((vector_size(16))) Half_I32;
        typedef long long __attribute__((vector_size(32))) Half_I64;

        // The gather instruction here doesn't need any particular alignment,
        // but the intrinsic takes a const long long*.
        const long long int* p8 = bit_pun<const long long int*>(p);

        Half_I64 zero = { 0, 0, 0, 0},
                 mask = {-1,-1,-1,-1};

        ix *= 6;
        Half_I32 ix_lo = { ix[0], ix[1], ix[2], ix[3] },
                 ix_hi = { ix[4], ix[5], ix[6], ix[7] };

        #if defined(__clang__)
            Half_I64 lo = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_lo, mask, 1),
                     hi = (Half_I64)__builtin_ia32_gatherd_q256(zero, p8, ix_hi, mask, 1);
        #elif defined(__GNUC__)
            Half_I64 lo = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_lo, mask, 1),
                     hi = (Half_I64)__builtin_ia32_gathersiv4di(zero, p8, ix_hi, mask, 1);
        #endif
        store((char*)v +  0, lo);
        store((char*)v + 32, hi);
    #elif N == 16
        (void)load_48_64;
        const long long int* p8 = bit_pun<const long long int*>(p);
        __m512i lo = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 0), p8, 1),
                hi = _mm512_i32gather_epi64(_mm512_extracti32x8_epi32((__m512i)(6*ix), 1), p8, 1);
        store((char*)v +  0, lo);
        store((char*)v + 64, hi);
    #endif

        *v >>= 16;
    }
#endif

SI F F_from_U8(U8 v) {
    return cast<F>(v) * (1/255.0f);
}

SI F F_from_U16_BE(U16 v) {
    // All 16-bit ICC values are big-endian, so we byte swap before converting to float.
    // MSVC catches the "loss" of data here in the portable path, so we also make sure to mask.
    U16 lo = (v >> 8),
        hi = (v << 8) & 0xffff;
    return cast<F>(lo|hi) * (1/65535.0f);
}

SI U16 U16_from_F(F v) {
    // 65535 == inf in FP16, so promote to FP32 before converting.
    return cast<U16>(cast<V<float>>(v) * 65535 + 0.5f);
}

SI F minus_1_ulp(F v) {
    return bit_pun<F>( bit_pun<U32>(v) - 1 );
}

SI F table(const skcms_Curve* curve, F v) {
    // Clamp the input to [0,1], then scale to a table index.
    F ix = max_(F0, min_(v, F1)) * (float)(curve->table_entries - 1);

    // We'll look up (equal or adjacent) entries at lo and hi, then lerp by t between the two.
    I32 lo = cast<I32>(            ix      ),
        hi = cast<I32>(minus_1_ulp(ix+1.0f));
    F t = ix - cast<F>(lo);  // i.e. the fractional part of ix.

    // TODO: can we load l and h simultaneously?  Each entry in 'h' is either
    // the same as in 'l' or adjacent.  We have a rough idea that it'd always be safe
    // to read adjacent entries and perhaps underflow the table by a byte or two
    // (it'd be junk, but always safe to read).  Not sure how to lerp yet.
    F l,h;
    if (curve->table_8) {
        l = F_from_U8(gather_8(curve->table_8, lo));
        h = F_from_U8(gather_8(curve->table_8, hi));
    } else {
        l = F_from_U16_BE(gather_16(curve->table_16, lo));
        h = F_from_U16_BE(gather_16(curve->table_16, hi));
    }
    return l + (h-l)*t;
}

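// Note on table() above: minus_1_ulp(ix+1.0f) keeps `hi` in bounds.  When ix lands
// exactly on an integer k (including the last entry, table_entries-1), ix+1.0f is
// exactly k+1; backing off one ulp before truncating gives hi == k == lo, and t == 0
// makes the lerp a no-op.  So hi is always lo or lo+1 and never reads past the table.
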
SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b) {
    U32 rgb = gather_24(grid_8, ix);

    *r = cast<F>((rgb >>  0) & 0xff) * (1/255.0f);
    *g = cast<F>((rgb >>  8) & 0xff) * (1/255.0f);
    *b = cast<F>((rgb >> 16) & 0xff) * (1/255.0f);
}

SI void sample_clut_8(const uint8_t* grid_8, I32 ix, F* r, F* g, F* b, F* a) {
    // TODO: don't forget to optimize gather_32().
    U32 rgba = gather_32(grid_8, ix);

    *r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
    *g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
    *b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
    *a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
}

SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b) {
#if defined(__arm__) || defined(__loongarch_sx)
    // This is up to 2x faster on 32-bit ARM than the #else-case fast path.
    *r = F_from_U16_BE(gather_16(grid_16, 3*ix+0));
    *g = F_from_U16_BE(gather_16(grid_16, 3*ix+1));
    *b = F_from_U16_BE(gather_16(grid_16, 3*ix+2));
#else
    // This strategy is much faster for 64-bit builds, and fine for 32-bit x86 too.
    U64 rgb;
    gather_48(grid_16, ix, &rgb);
    rgb = swap_endian_16x4(rgb);

    *r = cast<F>((rgb >>  0) & 0xffff) * (1/65535.0f);
    *g = cast<F>((rgb >> 16) & 0xffff) * (1/65535.0f);
    *b = cast<F>((rgb >> 32) & 0xffff) * (1/65535.0f);
#endif
}

SI void sample_clut_16(const uint8_t* grid_16, I32 ix, F* r, F* g, F* b, F* a) {
    // TODO: gather_64()-based fast path?
    *r = F_from_U16_BE(gather_16(grid_16, 4*ix+0));
    *g = F_from_U16_BE(gather_16(grid_16, 4*ix+1));
    *b = F_from_U16_BE(gather_16(grid_16, 4*ix+2));
    *a = F_from_U16_BE(gather_16(grid_16, 4*ix+3));
}

static void clut(uint32_t input_channels, uint32_t output_channels,
                 const uint8_t grid_points[4], const uint8_t* grid_8, const uint8_t* grid_16,
                 F* r, F* g, F* b, F* a) {

    const int dim = (int)input_channels;
    assert (0 < dim && dim <= 4);
    assert (output_channels == 3 ||
            output_channels == 4);

    // For each of these arrays, think foo[2*dim], but we use foo[8] since we know dim <= 4.
    I32 index [8];  // Index contribution by dimension, first low from 0, then high from 4.
    F   weight[8];  // Weight for each contribution, again first low, then high.

    // O(dim) work first: calculate index,weight from r,g,b,a.
    const F inputs[] = { *r,*g,*b,*a };
    for (int i = dim-1, stride = 1; i >= 0; i--) {
        // x is where we logically want to sample the grid in the i-th dimension.
        F x = inputs[i] * (float)(grid_points[i] - 1);

        // But we can't index at floats.  lo and hi are the two integer grid points surrounding x.
        I32 lo = cast<I32>(            x      ),   // i.e. trunc(x) == floor(x) here.
            hi = cast<I32>(minus_1_ulp(x+1.0f));
        // Notice how we fold in the accumulated stride across previous dimensions here.
        index[i+0] = lo * stride;
        index[i+4] = hi * stride;
        stride *= grid_points[i];

        // We'll interpolate between those two integer grid points by t.
        F t = x - cast<F>(lo);  // i.e. fract(x)
        weight[i+0] = 1-t;
        weight[i+4] = t;
    }

    *r = *g = *b = F0;
    if (output_channels == 4) {
        *a = F0;
    }

    // We'll sample 2^dim == 1<<dim table entries per pixel,
    // in all combinations of low and high in each dimension.
    for (int combo = 0; combo < (1<<dim); combo++) {  // This loop can be done in any order.

        // Each of these upcoming (combo&N)*K expressions here evaluates to 0 or 4,
        // where 0 selects the low index contribution and its weight 1-t,
        // or 4 the high index contribution and its weight t.

        // Since 0<dim≤4, we can always just start off with the 0-th channel,
        // then handle the others conditionally.
        I32 ix = index [0 + (combo&1)*4];
        F    w = weight[0 + (combo&1)*4];

        switch ((dim-1)&3) {  // This lets the compiler know there are no other cases to handle.
            case 3: ix += index [3 + (combo&8)/2];
                    w  *= weight[3 + (combo&8)/2];
                    SKCMS_FALLTHROUGH;
                    // fall through

            case 2: ix += index [2 + (combo&4)*1];
                    w  *= weight[2 + (combo&4)*1];
                    SKCMS_FALLTHROUGH;
                    // fall through

            case 1: ix += index [1 + (combo&2)*2];
                    w  *= weight[1 + (combo&2)*2];
        }

        F R,G,B,A=F0;
        if (output_channels == 3) {
            if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B); }
            else        { sample_clut_16(grid_16,ix, &R,&G,&B); }
        } else {
            if (grid_8) { sample_clut_8 (grid_8 ,ix, &R,&G,&B,&A); }
            else        { sample_clut_16(grid_16,ix, &R,&G,&B,&A); }
        }
        *r += w*R;
        *g += w*G;
        *b += w*B;
        *a += w*A;
    }
}

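// Note on clut() above: for input_channels == 3 this is ordinary trilinear interpolation.
// The 1<<dim combos visit the corners of the lattice cell surrounding the input; each
// corner's index is the sum of one low/high contribution per dimension (pre-multiplied by
// that dimension's stride), and its weight is the product of the matching (1-t) or t.
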
static void clut(const skcms_A2B* a2b, F* r, F* g, F* b, F a) {
    clut(a2b->input_channels, a2b->output_channels,
         a2b->grid_points, a2b->grid_8, a2b->grid_16,
         r,g,b,&a);
}
static void clut(const skcms_B2A* b2a, F* r, F* g, F* b, F* a) {
    clut(b2a->input_channels, b2a->output_channels,
         b2a->grid_points, b2a->grid_8, b2a->grid_16,
         r,g,b,a);
}

struct NoCtx {};

struct Ctx {
    const void* fArg;
    operator NoCtx()                    { return NoCtx{}; }
    template <typename T> operator T*() { return (const T*)fArg; }
};

#define STAGE_PARAMS(MAYBE_REF) SKCMS_MAYBE_UNUSED const char* src, \
                                SKCMS_MAYBE_UNUSED char* dst,       \
                                SKCMS_MAYBE_UNUSED F MAYBE_REF r,   \
                                SKCMS_MAYBE_UNUSED F MAYBE_REF g,   \
                                SKCMS_MAYBE_UNUSED F MAYBE_REF b,   \
                                SKCMS_MAYBE_UNUSED F MAYBE_REF a,   \
                                SKCMS_MAYBE_UNUSED int i

#if SKCMS_HAS_MUSTTAIL

    // Stages take a stage list, and each stage is responsible for tail-calling the next one.
    //
    // Unfortunately, we can't declare a StageFn as a function pointer which takes a pointer to
    // another StageFn; declaring this leads to a circular dependency. To avoid this, StageFn is
    // wrapped in a single-element `struct StageList` which we are able to forward-declare.
    struct StageList;
    using StageFn = void (*)(StageList stages, const void** ctx, STAGE_PARAMS());
    struct StageList {
        const StageFn* fn;
    };

    #define DECLARE_STAGE(name, arg, CALL_NEXT)                                 \
        SI void Exec_##name##_k(arg, STAGE_PARAMS(&));                          \
                                                                                \
        SI void Exec_##name(StageList list, const void** ctx, STAGE_PARAMS()) { \
            Exec_##name##_k(Ctx{*ctx}, src, dst, r, g, b, a, i);                \
            ++list.fn; ++ctx;                                                   \
            CALL_NEXT;                                                          \
        }                                                                       \
                                                                                \
        SI void Exec_##name##_k(arg, STAGE_PARAMS(&))

    #define STAGE(name, arg)                                                                \
        DECLARE_STAGE(name, arg, [[clang::musttail]] return (*list.fn)(list, ctx, src, dst, \
                                                                       r, g, b, a, i))

    #define FINAL_STAGE(name, arg) \
        DECLARE_STAGE(name, arg, /* Stop executing stages and return to the caller. */)

#else

    #define DECLARE_STAGE(name, arg)                            \
        SI void Exec_##name##_k(arg, STAGE_PARAMS(&));          \
                                                                \
        SI void Exec_##name(const void* ctx, STAGE_PARAMS(&)) { \
            Exec_##name##_k(Ctx{ctx}, src, dst, r, g, b, a, i); \
        }                                                       \
                                                                \
        SI void Exec_##name##_k(arg, STAGE_PARAMS(&))

    #define STAGE(name, arg)       DECLARE_STAGE(name, arg)
    #define FINAL_STAGE(name, arg) DECLARE_STAGE(name, arg)

#endif

STAGE(load_a8, NoCtx) {
    a = F_from_U8(load<U8>(src + 1*i));
}

STAGE(load_g8, NoCtx) {
    r = g = b = F_from_U8(load<U8>(src + 1*i));
}

STAGE(load_ga88, NoCtx) {
    U16 u16 = load<U16>(src + 2 * i);
    r = g = b = cast<F>((u16 >> 0) & 0xff) * (1 / 255.0f);
            a = cast<F>((u16 >> 8) & 0xff) * (1 / 255.0f);
}

STAGE(load_4444, NoCtx) {
    U16 abgr = load<U16>(src + 2*i);

    r = cast<F>((abgr >> 12) & 0xf) * (1/15.0f);
    g = cast<F>((abgr >>  8) & 0xf) * (1/15.0f);
    b = cast<F>((abgr >>  4) & 0xf) * (1/15.0f);
    a = cast<F>((abgr >>  0) & 0xf) * (1/15.0f);
}

STAGE(load_565, NoCtx) {
    U16 rgb = load<U16>(src + 2*i);

    r = cast<F>(rgb & (uint16_t)(31<< 0)) * (1.0f / (31<< 0));
    g = cast<F>(rgb & (uint16_t)(63<< 5)) * (1.0f / (63<< 5));
    b = cast<F>(rgb & (uint16_t)(31<<11)) * (1.0f / (31<<11));
}

STAGE(load_888, NoCtx) {
    const uint8_t* rgb = (const uint8_t*)(src + 3*i);
#if defined(USING_NEON)
    // There's no uint8x4x3_t or vld3 load for it, so we'll load each rgb pixel one at
    // a time.  Since we're doing that, we might as well load them into 16-bit lanes.
    // (We'd even load into 32-bit lanes, but that's not possible on ARMv7.)
    uint8x8x3_t v = {{ vdup_n_u8(0), vdup_n_u8(0), vdup_n_u8(0) }};
    v = vld3_lane_u8(rgb+0, v, 0);
    v = vld3_lane_u8(rgb+3, v, 2);
    v = vld3_lane_u8(rgb+6, v, 4);
    v = vld3_lane_u8(rgb+9, v, 6);

    // Now if we squint, those 3 uint8x8_t we constructed are really U16s, easy to
    // convert to F.  (Again, U32 would be even better here if we drop ARMv7 or split
    // ARMv7 and ARMv8 impls.)
    r = cast<F>((U16)v.val[0]) * (1/255.0f);
    g = cast<F>((U16)v.val[1]) * (1/255.0f);
    b = cast<F>((U16)v.val[2]) * (1/255.0f);
#else
    r = cast<F>(load_3<U32>(rgb+0) ) * (1/255.0f);
    g = cast<F>(load_3<U32>(rgb+1) ) * (1/255.0f);
    b = cast<F>(load_3<U32>(rgb+2) ) * (1/255.0f);
#endif
}

STAGE(load_8888, NoCtx) {
    U32 rgba = load<U32>(src + 4*i);

    r = cast<F>((rgba >>  0) & 0xff) * (1/255.0f);
    g = cast<F>((rgba >>  8) & 0xff) * (1/255.0f);
    b = cast<F>((rgba >> 16) & 0xff) * (1/255.0f);
    a = cast<F>((rgba >> 24) & 0xff) * (1/255.0f);
}

STAGE(load_1010102, NoCtx) {
    U32 rgba = load<U32>(src + 4*i);

    r = cast<F>((rgba >>  0) & 0x3ff) * (1/1023.0f);
    g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f);
    b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f);
    a = cast<F>((rgba >> 30) & 0x3  ) * (1/   3.0f);
}

STAGE(load_101010x_XR, NoCtx) {
    static constexpr float min = -0.752941f;
    static constexpr float max = 1.25098f;
    static constexpr float range = max - min;
    U32 rgba = load<U32>(src + 4*i);
    r = cast<F>((rgba >>  0) & 0x3ff) * (1/1023.0f) * range + min;
    g = cast<F>((rgba >> 10) & 0x3ff) * (1/1023.0f) * range + min;
    b = cast<F>((rgba >> 20) & 0x3ff) * (1/1023.0f) * range + min;
}

STAGE(load_10101010_XR, NoCtx) {
    static constexpr float min = -0.752941f;
    static constexpr float max = 1.25098f;
    static constexpr float range = max - min;
    U64 rgba = load<U64>(src + 8 * i);
    r = cast<F>((rgba >>  (0+6)) & 0x3ff) * (1/1023.0f) * range + min;
    g = cast<F>((rgba >> (16+6)) & 0x3ff) * (1/1023.0f) * range + min;
    b = cast<F>((rgba >> (32+6)) & 0x3ff) * (1/1023.0f) * range + min;
    a = cast<F>((rgba >> (48+6)) & 0x3ff) * (1/1023.0f) * range + min;
}

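// In load_10101010_XR above, each 10-bit channel occupies the top bits of its 16-bit
// field, which is why every shift carries an extra +6 before masking with 0x3ff.
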
STAGE(load_161616LE, NoCtx) {
    uintptr_t ptr = (uintptr_t)(src + 6*i);
    assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
    const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
#if defined(USING_NEON)
    uint16x4x3_t v = vld3_u16(rgb);
    r = cast<F>((U16)v.val[0]) * (1/65535.0f);
    g = cast<F>((U16)v.val[1]) * (1/65535.0f);
    b = cast<F>((U16)v.val[2]) * (1/65535.0f);
#else
    r = cast<F>(load_3<U32>(rgb+0)) * (1/65535.0f);
    g = cast<F>(load_3<U32>(rgb+1)) * (1/65535.0f);
    b = cast<F>(load_3<U32>(rgb+2)) * (1/65535.0f);
#endif
}

STAGE(load_16161616LE, NoCtx) {
    uintptr_t ptr = (uintptr_t)(src + 8*i);
    assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
    const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
#if defined(USING_NEON)
    uint16x4x4_t v = vld4_u16(rgba);
    r = cast<F>((U16)v.val[0]) * (1/65535.0f);
    g = cast<F>((U16)v.val[1]) * (1/65535.0f);
    b = cast<F>((U16)v.val[2]) * (1/65535.0f);
    a = cast<F>((U16)v.val[3]) * (1/65535.0f);
#else
    U64 px = load<U64>(rgba);

    r = cast<F>((px >>  0) & 0xffff) * (1/65535.0f);
    g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
    b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
    a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
#endif
}

STAGE(load_161616BE, NoCtx) {
    uintptr_t ptr = (uintptr_t)(src + 6*i);
    assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
    const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
#if defined(USING_NEON)
    uint16x4x3_t v = vld3_u16(rgb);
    r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
    g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
    b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
#else
    U32 R = load_3<U32>(rgb+0),
        G = load_3<U32>(rgb+1),
        B = load_3<U32>(rgb+2);
    // R,G,B are big-endian 16-bit, so byte swap them before converting to float.
    r = cast<F>((R & 0x00ff)<<8 | (R & 0xff00)>>8) * (1/65535.0f);
    g = cast<F>((G & 0x00ff)<<8 | (G & 0xff00)>>8) * (1/65535.0f);
    b = cast<F>((B & 0x00ff)<<8 | (B & 0xff00)>>8) * (1/65535.0f);
#endif
}

STAGE(load_16161616BE, NoCtx) {
    uintptr_t ptr = (uintptr_t)(src + 8*i);
    assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
    const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
#if defined(USING_NEON)
    uint16x4x4_t v = vld4_u16(rgba);
    r = cast<F>(swap_endian_16((U16)v.val[0])) * (1/65535.0f);
    g = cast<F>(swap_endian_16((U16)v.val[1])) * (1/65535.0f);
    b = cast<F>(swap_endian_16((U16)v.val[2])) * (1/65535.0f);
    a = cast<F>(swap_endian_16((U16)v.val[3])) * (1/65535.0f);
#else
    U64 px = swap_endian_16x4(load<U64>(rgba));

    r = cast<F>((px >>  0) & 0xffff) * (1/65535.0f);
    g = cast<F>((px >> 16) & 0xffff) * (1/65535.0f);
    b = cast<F>((px >> 32) & 0xffff) * (1/65535.0f);
    a = cast<F>((px >> 48) & 0xffff) * (1/65535.0f);
#endif
}

STAGE(load_hhh, NoCtx) {
    uintptr_t ptr = (uintptr_t)(src + 6*i);
    assert( (ptr & 1) == 0 );                   // src must be 2-byte aligned for this
    const uint16_t* rgb = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
#if defined(USING_NEON)
    uint16x4x3_t v = vld3_u16(rgb);
    U16 R = (U16)v.val[0],
        G = (U16)v.val[1],
        B = (U16)v.val[2];
#else
    U16 R = load_3<U16>(rgb+0),
        G = load_3<U16>(rgb+1),
        B = load_3<U16>(rgb+2);
#endif
    r = F_from_Half(R);
    g = F_from_Half(G);
    b = F_from_Half(B);
}

STAGE(load_hhhh, NoCtx) {
    uintptr_t ptr = (uintptr_t)(src + 8*i);
    assert( (ptr & 1) == 0 );                    // src must be 2-byte aligned for this
    const uint16_t* rgba = (const uint16_t*)ptr; // cast to const uint16_t* to be safe.
#if defined(USING_NEON)
    uint16x4x4_t v = vld4_u16(rgba);
    U16 R = (U16)v.val[0],
        G = (U16)v.val[1],
        B = (U16)v.val[2],
        A = (U16)v.val[3];
#else
    U64 px = load<U64>(rgba);
    U16 R = cast<U16>((px >>  0) & 0xffff),
        G = cast<U16>((px >> 16) & 0xffff),
        B = cast<U16>((px >> 32) & 0xffff),
        A = cast<U16>((px >> 48) & 0xffff);
#endif
    r = F_from_Half(R);
    g = F_from_Half(G);
    b = F_from_Half(B);
    a = F_from_Half(A);
}

STAGE(load_fff, NoCtx) {
    uintptr_t ptr = (uintptr_t)(src + 12*i);
    assert( (ptr & 3) == 0 );                   // src must be 4-byte aligned for this
    const float* rgb = (const float*)ptr;       // cast to const float* to be safe.
#if defined(USING_NEON)
    float32x4x3_t v = vld3q_f32(rgb);
    r = (F)v.val[0];
    g = (F)v.val[1];
    b = (F)v.val[2];
#else
    r = load_3<F>(rgb+0);
    g = load_3<F>(rgb+1);
    b = load_3<F>(rgb+2);
#endif
}

STAGE(load_ffff, NoCtx) {
    uintptr_t ptr = (uintptr_t)(src + 16*i);
    assert( (ptr & 3) == 0 );                   // src must be 4-byte aligned for this
    const float* rgba = (const float*)ptr;      // cast to const float* to be safe.
#if defined(USING_NEON)
    float32x4x4_t v = vld4q_f32(rgba);
    r = (F)v.val[0];
    g = (F)v.val[1];
    b = (F)v.val[2];
    a = (F)v.val[3];
#else
    r = load_4<F>(rgba+0);
    g = load_4<F>(rgba+1);
    b = load_4<F>(rgba+2);
    a = load_4<F>(rgba+3);
#endif
}

STAGE(swap_rb, NoCtx) {
    F t = r;
    r = b;
    b = t;
}

STAGE(clamp, NoCtx) {
    r = max_(F0, min_(r, F1));
    g = max_(F0, min_(g, F1));
    b = max_(F0, min_(b, F1));
    a = max_(F0, min_(a, F1));
}

STAGE(invert, NoCtx) {
    r = F1 - r;
    g = F1 - g;
    b = F1 - b;
    a = F1 - a;
}

STAGE(force_opaque, NoCtx) {
    a = F1;
}

STAGE(premul, NoCtx) {
    r *= a;
    g *= a;
    b *= a;
}

STAGE(unpremul, NoCtx) {
    F scale = if_then_else(F1 / a < INFINITY_, F1 / a, F0);
    r *= scale;
    g *= scale;
    b *= scale;
}

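// Note on unpremul above: when a == 0, F1/a is +inf, the comparison against INFINITY_
// fails, and scale falls back to 0, so fully transparent pixels unpremultiply to
// (0,0,0) instead of inf or NaN.
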
STAGE(matrix_3x3, const skcms_Matrix3x3* matrix) {
    const float* m = &matrix->vals[0][0];

    F R = m[0]*r + m[1]*g + m[2]*b,
      G = m[3]*r + m[4]*g + m[5]*b,
      B = m[6]*r + m[7]*g + m[8]*b;

    r = R;
    g = G;
    b = B;
}

STAGE(matrix_3x4, const skcms_Matrix3x4* matrix) {
    const float* m = &matrix->vals[0][0];

    F R = m[0]*r + m[1]*g + m[ 2]*b + m[ 3],
      G = m[4]*r + m[5]*g + m[ 6]*b + m[ 7],
      B = m[8]*r + m[9]*g + m[10]*b + m[11];

    r = R;
    g = G;
    b = B;
}

STAGE(lab_to_xyz, NoCtx) {
    // The L*a*b values are in r,g,b, but normalized to [0,1].  Reconstruct them:
    F L = r * 100.0f,
      A = g * 255.0f - 128.0f,
      B = b * 255.0f - 128.0f;

    // Convert to CIE XYZ.
    F Y = (L + 16.0f) * (1/116.0f),
      X = Y + A*(1/500.0f),
      Z = Y - B*(1/200.0f);

    X = if_then_else(X*X*X > 0.008856f, X*X*X, (X - (16/116.0f)) * (1/7.787f));
    Y = if_then_else(Y*Y*Y > 0.008856f, Y*Y*Y, (Y - (16/116.0f)) * (1/7.787f));
    Z = if_then_else(Z*Z*Z > 0.008856f, Z*Z*Z, (Z - (16/116.0f)) * (1/7.787f));

    // Adjust to XYZD50 illuminant, and stuff back into r,g,b for the next op.
    r = X * 0.9642f;
    g = Y          ;
    b = Z * 0.8249f;
}

// As above, in reverse.
STAGE(xyz_to_lab, NoCtx) {
    F X = r * (1/0.9642f),
      Y = g,
      Z = b * (1/0.8249f);

    X = if_then_else(X > 0.008856f, approx_pow(X, 1/3.0f), X*7.787f + (16/116.0f));
    Y = if_then_else(Y > 0.008856f, approx_pow(Y, 1/3.0f), Y*7.787f + (16/116.0f));
    Z = if_then_else(Z > 0.008856f, approx_pow(Z, 1/3.0f), Z*7.787f + (16/116.0f));

    F L = Y*116.0f - 16.0f,
      A = (X-Y)*500.0f,
      B = (Y-Z)*200.0f;

    r = L * (1/100.f);
    g = (A + 128.0f) * (1/255.0f);
    b = (B + 128.0f) * (1/255.0f);
}

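// The constants in lab_to_xyz/xyz_to_lab above are the usual CIELAB ones:
// 0.008856 ~= (6/29)^3, 7.787 ~= (29/6)^2/3, and 16/116 == 4/29; 0.9642 and 0.8249 are
// the X and Z of the D50 white point used by the ICC profile connection space.
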
STAGE(gamma_r,const skcms_TransferFunction * tf)1180 STAGE(gamma_r, const skcms_TransferFunction* tf) { r = apply_gamma(tf, r); }
STAGE(gamma_g,const skcms_TransferFunction * tf)1181 STAGE(gamma_g, const skcms_TransferFunction* tf) { g = apply_gamma(tf, g); }
STAGE(gamma_b,const skcms_TransferFunction * tf)1182 STAGE(gamma_b, const skcms_TransferFunction* tf) { b = apply_gamma(tf, b); }
STAGE(gamma_a,const skcms_TransferFunction * tf)1183 STAGE(gamma_a, const skcms_TransferFunction* tf) { a = apply_gamma(tf, a); }
1184 
STAGE(gamma_rgb,const skcms_TransferFunction * tf)1185 STAGE(gamma_rgb, const skcms_TransferFunction* tf) {
1186     r = apply_gamma(tf, r);
1187     g = apply_gamma(tf, g);
1188     b = apply_gamma(tf, b);
1189 }
1190 
STAGE(tf_r,const skcms_TransferFunction * tf)1191 STAGE(tf_r, const skcms_TransferFunction* tf) { r = apply_tf(tf, r); }
STAGE(tf_g,const skcms_TransferFunction * tf)1192 STAGE(tf_g, const skcms_TransferFunction* tf) { g = apply_tf(tf, g); }
STAGE(tf_b,const skcms_TransferFunction * tf)1193 STAGE(tf_b, const skcms_TransferFunction* tf) { b = apply_tf(tf, b); }
STAGE(tf_a,const skcms_TransferFunction * tf)1194 STAGE(tf_a, const skcms_TransferFunction* tf) { a = apply_tf(tf, a); }
1195 
STAGE(tf_rgb,const skcms_TransferFunction * tf)1196 STAGE(tf_rgb, const skcms_TransferFunction* tf) {
1197     r = apply_tf(tf, r);
1198     g = apply_tf(tf, g);
1199     b = apply_tf(tf, b);
1200 }
1201 
STAGE(pq_r,const skcms_TransferFunction * tf)1202 STAGE(pq_r, const skcms_TransferFunction* tf) { r = apply_pq(tf, r); }
STAGE(pq_g,const skcms_TransferFunction * tf)1203 STAGE(pq_g, const skcms_TransferFunction* tf) { g = apply_pq(tf, g); }
STAGE(pq_b,const skcms_TransferFunction * tf)1204 STAGE(pq_b, const skcms_TransferFunction* tf) { b = apply_pq(tf, b); }
STAGE(pq_a,const skcms_TransferFunction * tf)1205 STAGE(pq_a, const skcms_TransferFunction* tf) { a = apply_pq(tf, a); }
1206 
STAGE(pq_rgb,const skcms_TransferFunction * tf)1207 STAGE(pq_rgb, const skcms_TransferFunction* tf) {
1208     r = apply_pq(tf, r);
1209     g = apply_pq(tf, g);
1210     b = apply_pq(tf, b);
1211 }
1212 
STAGE(hlg_r,const skcms_TransferFunction * tf)1213 STAGE(hlg_r, const skcms_TransferFunction* tf) { r = apply_hlg(tf, r); }
STAGE(hlg_g,const skcms_TransferFunction * tf)1214 STAGE(hlg_g, const skcms_TransferFunction* tf) { g = apply_hlg(tf, g); }
STAGE(hlg_b,const skcms_TransferFunction * tf)1215 STAGE(hlg_b, const skcms_TransferFunction* tf) { b = apply_hlg(tf, b); }
STAGE(hlg_a,const skcms_TransferFunction * tf)1216 STAGE(hlg_a, const skcms_TransferFunction* tf) { a = apply_hlg(tf, a); }
1217 
STAGE(hlg_rgb,const skcms_TransferFunction * tf)1218 STAGE(hlg_rgb, const skcms_TransferFunction* tf) {
1219     r = apply_hlg(tf, r);
1220     g = apply_hlg(tf, g);
1221     b = apply_hlg(tf, b);
1222 }
1223 
STAGE(hlginv_r,const skcms_TransferFunction * tf)1224 STAGE(hlginv_r, const skcms_TransferFunction* tf) { r = apply_hlginv(tf, r); }
STAGE(hlginv_g,const skcms_TransferFunction * tf)1225 STAGE(hlginv_g, const skcms_TransferFunction* tf) { g = apply_hlginv(tf, g); }
STAGE(hlginv_b,const skcms_TransferFunction * tf)1226 STAGE(hlginv_b, const skcms_TransferFunction* tf) { b = apply_hlginv(tf, b); }
STAGE(hlginv_a,const skcms_TransferFunction * tf)1227 STAGE(hlginv_a, const skcms_TransferFunction* tf) { a = apply_hlginv(tf, a); }
1228 
STAGE(hlginv_rgb,const skcms_TransferFunction * tf)1229 STAGE(hlginv_rgb, const skcms_TransferFunction* tf) {
1230     r = apply_hlginv(tf, r);
1231     g = apply_hlginv(tf, g);
1232     b = apply_hlginv(tf, b);
1233 }
1234 
STAGE(table_r,const skcms_Curve * curve)1235 STAGE(table_r, const skcms_Curve* curve) { r = table(curve, r); }
STAGE(table_g,const skcms_Curve * curve)1236 STAGE(table_g, const skcms_Curve* curve) { g = table(curve, g); }
STAGE(table_b,const skcms_Curve * curve)1237 STAGE(table_b, const skcms_Curve* curve) { b = table(curve, b); }
STAGE(table_a,const skcms_Curve * curve)1238 STAGE(table_a, const skcms_Curve* curve) { a = table(curve, a); }
1239 
STAGE(clut_A2B,const skcms_A2B * a2b)1240 STAGE(clut_A2B, const skcms_A2B* a2b) {
1241     clut(a2b, &r,&g,&b,a);
1242 
1243     if (a2b->input_channels == 4) {
1244         // CMYK is opaque.
1245         a = F1;
1246     }
1247 }
1248 
STAGE(clut_B2A,const skcms_B2A * b2a)1249 STAGE(clut_B2A, const skcms_B2A* b2a) {
1250     clut(b2a, &r,&g,&b,&a);
1251 }
1252 
1253 // From here on down, the store_ ops are all "final stages," terminating processing of this group.
1254 
FINAL_STAGE(store_a8,NoCtx)1255 FINAL_STAGE(store_a8, NoCtx) {
1256     store(dst + 1*i, cast<U8>(to_fixed(a * 255)));
1257 }
1258 
FINAL_STAGE(store_g8,NoCtx)1259 FINAL_STAGE(store_g8, NoCtx) {
1260     // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
1261     store(dst + 1*i, cast<U8>(to_fixed(g * 255)));
1262 }
1263 
FINAL_STAGE(store_ga88,NoCtx)1264 FINAL_STAGE(store_ga88, NoCtx) {
1265     // g should be holding luminance (Y) (r,g,b ~~~> X,Y,Z)
1266     store<U16>(dst + 2*i, cast<U16>(to_fixed(g * 255) << 0 )
1267                         | cast<U16>(to_fixed(a * 255) << 8 ));
1268 }
1269 
FINAL_STAGE(store_4444,NoCtx)1270 FINAL_STAGE(store_4444, NoCtx) {
1271     store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 15) << 12)
1272                         | cast<U16>(to_fixed(g * 15) <<  8)
1273                         | cast<U16>(to_fixed(b * 15) <<  4)
1274                         | cast<U16>(to_fixed(a * 15) <<  0));
1275 }
1276 
FINAL_STAGE(store_565,NoCtx)1277 FINAL_STAGE(store_565, NoCtx) {
1278     store<U16>(dst + 2*i, cast<U16>(to_fixed(r * 31) <<  0 )
1279                         | cast<U16>(to_fixed(g * 63) <<  5 )
1280                         | cast<U16>(to_fixed(b * 31) << 11 ));
1281 }
1282 
FINAL_STAGE(store_888,NoCtx)1283 FINAL_STAGE(store_888, NoCtx) {
1284     uint8_t* rgb = (uint8_t*)dst + 3*i;
1285 #if defined(USING_NEON)
1286     // Same deal as load_888 but in reverse... we'll store using uint8x8x3_t, but
1287     // get there via U16 to save some instructions converting to float.  And just
1288     // like load_888, we'd prefer to go via U32 but for ARMv7 support.
1289     U16 R = cast<U16>(to_fixed(r * 255)),
1290         G = cast<U16>(to_fixed(g * 255)),
1291         B = cast<U16>(to_fixed(b * 255));
1292 
1293     uint8x8x3_t v = {{ (uint8x8_t)R, (uint8x8_t)G, (uint8x8_t)B }};
1294     vst3_lane_u8(rgb+0, v, 0);
1295     vst3_lane_u8(rgb+3, v, 2);
1296     vst3_lane_u8(rgb+6, v, 4);
1297     vst3_lane_u8(rgb+9, v, 6);
1298 #else
1299     store_3(rgb+0, cast<U8>(to_fixed(r * 255)) );
1300     store_3(rgb+1, cast<U8>(to_fixed(g * 255)) );
1301     store_3(rgb+2, cast<U8>(to_fixed(b * 255)) );
1302 #endif
1303 }
1304 
FINAL_STAGE(store_8888,NoCtx)1305 FINAL_STAGE(store_8888, NoCtx) {
1306     store(dst + 4*i, cast<U32>(to_fixed(r * 255)) <<  0
1307                    | cast<U32>(to_fixed(g * 255)) <<  8
1308                    | cast<U32>(to_fixed(b * 255)) << 16
1309                    | cast<U32>(to_fixed(a * 255)) << 24);
1310 }
1311 
FINAL_STAGE(store_101010x_XR,NoCtx)1312 FINAL_STAGE(store_101010x_XR, NoCtx) {
1313     static constexpr float min = -0.752941f;
1314     static constexpr float max = 1.25098f;
1315     static constexpr float range = max - min;
1316     store(dst + 4*i, cast<U32>(to_fixed(((r - min) / range) * 1023)) <<  0
1317                    | cast<U32>(to_fixed(((g - min) / range) * 1023)) << 10
1318                    | cast<U32>(to_fixed(((b - min) / range) * 1023)) << 20);
1319 }
1320 
FINAL_STAGE(store_1010102,NoCtx)1321 FINAL_STAGE(store_1010102, NoCtx) {
1322     store(dst + 4*i, cast<U32>(to_fixed(r * 1023)) <<  0
1323                    | cast<U32>(to_fixed(g * 1023)) << 10
1324                    | cast<U32>(to_fixed(b * 1023)) << 20
1325                    | cast<U32>(to_fixed(a *    3)) << 30);
1326 }
1327 
FINAL_STAGE(store_161616LE,NoCtx)1328 FINAL_STAGE(store_161616LE, NoCtx) {
1329     uintptr_t ptr = (uintptr_t)(dst + 6*i);
1330     assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned
1331     uint16_t* rgb = (uint16_t*)ptr;          // for this cast to uint16_t* to be safe.
1332 #if defined(USING_NEON)
1333     uint16x4x3_t v = {{
1334         (uint16x4_t)U16_from_F(r),
1335         (uint16x4_t)U16_from_F(g),
1336         (uint16x4_t)U16_from_F(b),
1337     }};
1338     vst3_u16(rgb, v);
1339 #else
1340     store_3(rgb+0, U16_from_F(r));
1341     store_3(rgb+1, U16_from_F(g));
1342     store_3(rgb+2, U16_from_F(b));
1343 #endif
1344 
1345 }
1346 
1347 FINAL_STAGE(store_16161616LE, NoCtx) {
1348     uintptr_t ptr = (uintptr_t)(dst + 8*i);
1349     assert( (ptr & 1) == 0 );               // The dst pointer must be 2-byte aligned
1350     uint16_t* rgba = (uint16_t*)ptr;        // for this cast to uint16_t* to be safe.
1351 #if defined(USING_NEON)
1352     uint16x4x4_t v = {{
1353         (uint16x4_t)U16_from_F(r),
1354         (uint16x4_t)U16_from_F(g),
1355         (uint16x4_t)U16_from_F(b),
1356         (uint16x4_t)U16_from_F(a),
1357     }};
1358     vst4_u16(rgba, v);
1359 #else
1360     U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
1361            | cast<U64>(to_fixed(g * 65535)) << 16
1362            | cast<U64>(to_fixed(b * 65535)) << 32
1363            | cast<U64>(to_fixed(a * 65535)) << 48;
1364     store(rgba, px);
1365 #endif
1366 }
1367 
1368 FINAL_STAGE(store_161616BE, NoCtx) {
1369     uintptr_t ptr = (uintptr_t)(dst + 6*i);
1370     assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned
1371     uint16_t* rgb = (uint16_t*)ptr;          // for this cast to uint16_t* to be safe.
1372 #if defined(USING_NEON)
1373     uint16x4x3_t v = {{
1374         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
1375         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
1376         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
1377     }};
1378     vst3_u16(rgb, v);
1379 #else
1380     U32 R = to_fixed(r * 65535),
1381         G = to_fixed(g * 65535),
1382         B = to_fixed(b * 65535);
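    // Swap the two bytes of each 16-bit value so the stored channels come out big-endian.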
1383     store_3(rgb+0, cast<U16>((R & 0x00ff) << 8 | (R & 0xff00) >> 8) );
1384     store_3(rgb+1, cast<U16>((G & 0x00ff) << 8 | (G & 0xff00) >> 8) );
1385     store_3(rgb+2, cast<U16>((B & 0x00ff) << 8 | (B & 0xff00) >> 8) );
1386 #endif
1387 
1388 }
1389 
1390 FINAL_STAGE(store_16161616BE, NoCtx) {
1391     uintptr_t ptr = (uintptr_t)(dst + 8*i);
1392     assert( (ptr & 1) == 0 );               // The dst pointer must be 2-byte aligned
1393     uint16_t* rgba = (uint16_t*)ptr;        // for this cast to uint16_t* to be safe.
1394 #if defined(USING_NEON)
1395     uint16x4x4_t v = {{
1396         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(r))),
1397         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(g))),
1398         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(b))),
1399         (uint16x4_t)swap_endian_16(cast<U16>(U16_from_F(a))),
1400     }};
1401     vst4_u16(rgba, v);
1402 #else
1403     U64 px = cast<U64>(to_fixed(r * 65535)) <<  0
1404            | cast<U64>(to_fixed(g * 65535)) << 16
1405            | cast<U64>(to_fixed(b * 65535)) << 32
1406            | cast<U64>(to_fixed(a * 65535)) << 48;
1407     store(rgba, swap_endian_16x4(px));
1408 #endif
1409 }
1410 
1411 FINAL_STAGE(store_hhh, NoCtx) {
1412     uintptr_t ptr = (uintptr_t)(dst + 6*i);
1413     assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned
1414     uint16_t* rgb = (uint16_t*)ptr;          // for this cast to uint16_t* to be safe.
1415 
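    // Half_from_F() (defined earlier in this file) converts each f32 lane to an IEEE f16 bit
    // pattern held in a U16, using hardware f16 conversions where available.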
1416     U16 R = Half_from_F(r),
1417         G = Half_from_F(g),
1418         B = Half_from_F(b);
1419 #if defined(USING_NEON)
1420     uint16x4x3_t v = {{
1421         (uint16x4_t)R,
1422         (uint16x4_t)G,
1423         (uint16x4_t)B,
1424     }};
1425     vst3_u16(rgb, v);
1426 #else
1427     store_3(rgb+0, R);
1428     store_3(rgb+1, G);
1429     store_3(rgb+2, B);
1430 #endif
1431 }
1432 
1433 FINAL_STAGE(store_hhhh, NoCtx) {
1434     uintptr_t ptr = (uintptr_t)(dst + 8*i);
1435     assert( (ptr & 1) == 0 );                // The dst pointer must be 2-byte aligned
1436     uint16_t* rgba = (uint16_t*)ptr;         // for this cast to uint16_t* to be safe.
1437 
1438     U16 R = Half_from_F(r),
1439         G = Half_from_F(g),
1440         B = Half_from_F(b),
1441         A = Half_from_F(a);
1442 #if defined(USING_NEON)
1443     uint16x4x4_t v = {{
1444         (uint16x4_t)R,
1445         (uint16x4_t)G,
1446         (uint16x4_t)B,
1447         (uint16x4_t)A,
1448     }};
1449     vst4_u16(rgba, v);
1450 #else
1451     store(rgba, cast<U64>(R) <<  0
1452               | cast<U64>(G) << 16
1453               | cast<U64>(B) << 32
1454               | cast<U64>(A) << 48);
1455 #endif
1456 }
1457 
1458 FINAL_STAGE(store_fff, NoCtx) {
1459     uintptr_t ptr = (uintptr_t)(dst + 12*i);
1460     assert( (ptr & 3) == 0 );                // The dst pointer must be 4-byte aligned
1461     float* rgb = (float*)ptr;                // for this cast to float* to be safe.
1462 #if defined(USING_NEON)
1463     float32x4x3_t v = {{
1464         (float32x4_t)r,
1465         (float32x4_t)g,
1466         (float32x4_t)b,
1467     }};
1468     vst3q_f32(rgb, v);
1469 #else
1470     store_3(rgb+0, r);
1471     store_3(rgb+1, g);
1472     store_3(rgb+2, b);
1473 #endif
1474 }
1475 
1476 FINAL_STAGE(store_ffff, NoCtx) {
1477     uintptr_t ptr = (uintptr_t)(dst + 16*i);
1478     assert( (ptr & 3) == 0 );                // The dst pointer must be 4-byte aligned
1479     float* rgba = (float*)ptr;               // for this cast to float* to be safe.
1480 #if defined(USING_NEON)
1481     float32x4x4_t v = {{
1482         (float32x4_t)r,
1483         (float32x4_t)g,
1484         (float32x4_t)b,
1485         (float32x4_t)a,
1486     }};
1487     vst4q_f32(rgba, v);
1488 #else
1489     store_4(rgba+0, r);
1490     store_4(rgba+1, g);
1491     store_4(rgba+2, b);
1492     store_4(rgba+3, a);
1493 #endif
1494 }
1495 
1496 #if SKCMS_HAS_MUSTTAIL
1497 
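    // With musttail available, each stage tail-calls the next through the stages pointer,
    // so the r,g,b,a values can stay in registers from stage to stage.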
1498     SI void exec_stages(StageFn* stages, const void** contexts, const char* src, char* dst, int i) {
1499         (*stages)({stages}, contexts, src, dst, F0, F0, F0, F1, i);
1500     }
1501 
1502 #else
1503 
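    // Without musttail, run a simple switch-based interpreter: work ops break back into the
    // loop, and store ops (always the final op) return to end the program.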
1504     static void exec_stages(const Op* ops, const void** contexts,
1505                             const char* src, char* dst, int i) {
1506         F r = F0, g = F0, b = F0, a = F1;
1507         while (true) {
1508             switch (*ops++) {
1509 #define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); break;
1510                 SKCMS_WORK_OPS(M)
1511 #undef M
1512 #define M(name) case Op::name: Exec_##name(*contexts++, src, dst, r, g, b, a, i); return;
1513                 SKCMS_STORE_OPS(M)
1514 #undef M
1515             }
1516         }
1517     }
1518 
1519 #endif
1520 
1521 // NOLINTNEXTLINE(misc-definitions-in-headers)
1522 void run_program(const Op* program, const void** contexts, SKCMS_MAYBE_UNUSED ptrdiff_t programSize,
1523                  const char* src, char* dst, int n,
1524                  const size_t src_bpp, const size_t dst_bpp) {
1525 #if SKCMS_HAS_MUSTTAIL
1526     // Convert the program into an array of tailcall stages.
1527     StageFn stages[32];
1528     assert(programSize <= ARRAY_COUNT(stages));
1529 
1530     static constexpr StageFn kStageFns[] = {
1531 #define M(name) &Exec_##name,
1532         SKCMS_WORK_OPS(M)
1533         SKCMS_STORE_OPS(M)
1534 #undef M
1535     };
1536 
1537     for (ptrdiff_t index = 0; index < programSize; ++index) {
1538         stages[index] = kStageFns[(int)program[index]];
1539     }
1540 #else
1541     // Use the op array as-is.
1542     const Op* stages = program;
1543 #endif
1544 
1545     int i = 0;
1546     while (n >= N) {
1547         exec_stages(stages, contexts, src, dst, i);
1548         i += N;
1549         n -= N;
1550     }
1551     if (n > 0) {
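        // Handle any leftover pixels (fewer than N) through a stack buffer sized for the
        // widest format used here: 4 channels x 4 bytes x N pixels.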
1552         char tmp[4*4*N] = {0};
1553 
1554         memcpy(tmp, (const char*)src + (size_t)i*src_bpp, (size_t)n*src_bpp);
1555         exec_stages(stages, contexts, tmp, tmp, 0);
1556         memcpy((char*)dst + (size_t)i*dst_bpp, tmp, (size_t)n*dst_bpp);
1557     }
1558 }
1559