xref: /aosp_15_r20/external/skia/src/sksl/codegen/SkSLRasterPipelineBuilder.cpp (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
1 /*
2  * Copyright 2022 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "src/sksl/codegen/SkSLRasterPipelineBuilder.h"
9 #include <cstdint>
10 #include <optional>
11 
12 #include "include/core/SkStream.h"
13 #include "include/private/base/SkMalloc.h"
14 #include "include/private/base/SkTFitsIn.h"
15 #include "include/private/base/SkTo.h"
16 #include "src/base/SkArenaAlloc.h"
17 #include "src/base/SkSafeMath.h"
18 #include "src/core/SkOpts.h"
19 #include "src/core/SkRasterPipelineContextUtils.h"
20 #include "src/core/SkRasterPipelineOpContexts.h"
21 #include "src/core/SkRasterPipelineOpList.h"
22 #include "src/core/SkTHash.h"
23 #include "src/sksl/SkSLPosition.h"
24 #include "src/sksl/SkSLString.h"
25 #include "src/sksl/tracing/SkSLDebugTracePriv.h"
26 #include "src/sksl/tracing/SkSLTraceHook.h"
27 
28 #if !defined(SKSL_STANDALONE)
29 #include "src/core/SkRasterPipeline.h"
30 #endif
31 
32 #include <algorithm>
33 #include <cmath>
34 #include <cstddef>
35 #include <cstring>
36 #include <iterator>
37 #include <string>
38 #include <string_view>
39 #include <tuple>
40 #include <utility>
41 #include <vector>
42 
43 using namespace skia_private;
44 
45 namespace SkSL::RP {
46 
// Case-label macros: each expands to a list of `case` labels covering one family of related
// BuilderOps, so the switch statements below can match an entire op family at once. Note that
// the first label in each macro deliberately omits the `case` keyword — the call sites are
// written as `case ALL_..._CASES:`, which supplies it.
#define ALL_SINGLE_SLOT_UNARY_OP_CASES  \
         BuilderOp::acos_float:         \
    case BuilderOp::asin_float:         \
    case BuilderOp::atan_float:         \
    case BuilderOp::cos_float:          \
    case BuilderOp::exp_float:          \
    case BuilderOp::exp2_float:         \
    case BuilderOp::log_float:          \
    case BuilderOp::log2_float:         \
    case BuilderOp::sin_float:          \
    case BuilderOp::sqrt_float:         \
    case BuilderOp::tan_float

#define ALL_MULTI_SLOT_UNARY_OP_CASES        \
         BuilderOp::abs_int:                 \
    case BuilderOp::cast_to_float_from_int:  \
    case BuilderOp::cast_to_float_from_uint: \
    case BuilderOp::cast_to_int_from_float:  \
    case BuilderOp::cast_to_uint_from_float: \
    case BuilderOp::ceil_float:              \
    case BuilderOp::floor_float:             \
    case BuilderOp::invsqrt_float

#define ALL_N_WAY_BINARY_OP_CASES   \
         BuilderOp::atan2_n_floats: \
    case BuilderOp::pow_n_floats

#define ALL_MULTI_SLOT_BINARY_OP_CASES  \
         BuilderOp::add_n_floats:       \
    case BuilderOp::add_n_ints:         \
    case BuilderOp::sub_n_floats:       \
    case BuilderOp::sub_n_ints:         \
    case BuilderOp::mul_n_floats:       \
    case BuilderOp::mul_n_ints:         \
    case BuilderOp::div_n_floats:       \
    case BuilderOp::div_n_ints:         \
    case BuilderOp::div_n_uints:        \
    case BuilderOp::bitwise_and_n_ints: \
    case BuilderOp::bitwise_or_n_ints:  \
    case BuilderOp::bitwise_xor_n_ints: \
    case BuilderOp::mod_n_floats:       \
    case BuilderOp::min_n_floats:       \
    case BuilderOp::min_n_ints:         \
    case BuilderOp::min_n_uints:        \
    case BuilderOp::max_n_floats:       \
    case BuilderOp::max_n_ints:         \
    case BuilderOp::max_n_uints:        \
    case BuilderOp::cmple_n_floats:     \
    case BuilderOp::cmple_n_ints:       \
    case BuilderOp::cmple_n_uints:      \
    case BuilderOp::cmplt_n_floats:     \
    case BuilderOp::cmplt_n_ints:       \
    case BuilderOp::cmplt_n_uints:      \
    case BuilderOp::cmpeq_n_floats:     \
    case BuilderOp::cmpeq_n_ints:       \
    case BuilderOp::cmpne_n_floats:     \
    case BuilderOp::cmpne_n_ints

// Binary ops that take one operand as an immediate value baked into the instruction.
#define ALL_IMMEDIATE_BINARY_OP_CASES    \
         BuilderOp::add_imm_float:       \
    case BuilderOp::add_imm_int:         \
    case BuilderOp::mul_imm_float:       \
    case BuilderOp::mul_imm_int:         \
    case BuilderOp::bitwise_and_imm_int: \
    case BuilderOp::bitwise_xor_imm_int: \
    case BuilderOp::min_imm_float:       \
    case BuilderOp::max_imm_float:       \
    case BuilderOp::cmple_imm_float:     \
    case BuilderOp::cmple_imm_int:       \
    case BuilderOp::cmple_imm_uint:      \
    case BuilderOp::cmplt_imm_float:     \
    case BuilderOp::cmplt_imm_int:       \
    case BuilderOp::cmplt_imm_uint:      \
    case BuilderOp::cmpeq_imm_float:     \
    case BuilderOp::cmpeq_imm_int:       \
    case BuilderOp::cmpne_imm_float:     \
    case BuilderOp::cmpne_imm_int

// The subset of immediate binary ops that can operate on more than one slot at a time.
#define ALL_IMMEDIATE_MULTI_SLOT_BINARY_OP_CASES \
         BuilderOp::bitwise_and_imm_int

#define ALL_N_WAY_TERNARY_OP_CASES       \
         BuilderOp::smoothstep_n_floats

#define ALL_MULTI_SLOT_TERNARY_OP_CASES \
         BuilderOp::mix_n_floats:       \
    case BuilderOp::mix_n_ints
134 
is_immediate_op(BuilderOp op)135 static bool is_immediate_op(BuilderOp op) {
136     switch (op) {
137         case ALL_IMMEDIATE_BINARY_OP_CASES: return true;
138         default:                            return false;
139     }
140 }
141 
is_multi_slot_immediate_op(BuilderOp op)142 static bool is_multi_slot_immediate_op(BuilderOp op) {
143     switch (op) {
144         case ALL_IMMEDIATE_MULTI_SLOT_BINARY_OP_CASES: return true;
145         default:                                       return false;
146     }
147 }
148 
// Attempts to convert an n-way binary op into its immediate-mode equivalent, given that its
// right-hand operand is the constant `*constantValue`. On success, returns the immediate op
// (possibly rewriting `*constantValue` — e.g. negating it to express subtraction as addition);
// otherwise returns `op` unchanged.
static BuilderOp convert_n_way_op_to_immediate(BuilderOp op, int slots, int32_t* constantValue) {
    // We rely on the exact ordering of SkRP ops here; the immediate-mode op must always come
    // directly before the n-way op. (If we have more than one, the increasing-slot variations
    // continue backwards from there.)
    BuilderOp immOp = (BuilderOp)((int)op - 1);

    // Some immediate ops support multiple slots.
    if (is_multi_slot_immediate_op(immOp)) {
        return immOp;
    }

    // Most immediate ops only directly support a single slot. However, it's still faster to execute
    // `add_imm_int, add_imm_int` instead of `splat_2_ints, add_2_ints`, so we allow those
    // conversions as well.
    if (slots <= 2) {
        if (is_immediate_op(immOp)) {
            return immOp;
        }

        // We also allow for immediate-mode subtraction, by adding a negative value.
        switch (op) {
            case BuilderOp::sub_n_ints:
                *constantValue *= -1;
                return BuilderOp::add_imm_int;

            case BuilderOp::sub_n_floats: {
                // This negates the floating-point value by inverting its sign bit.
                *constantValue ^= 0x80000000;
                return BuilderOp::add_imm_float;
            }
            default:
                break;
        }
    }

    // We don't have an immediate-mode version of this op.
    return op;
}
187 
appendInstruction(BuilderOp op,SlotList slots,int immA,int immB,int immC,int immD)188 void Builder::appendInstruction(BuilderOp op, SlotList slots,
189                                 int immA, int immB, int immC, int immD) {
190     fInstructions.push_back({op, slots.fSlotA, slots.fSlotB,
191                              immA, immB, immC, immD, fCurrentStackID});
192 }
193 
lastInstruction(int fromBack)194 Instruction* Builder::lastInstruction(int fromBack) {
195     if (fInstructions.size() <= fromBack) {
196         return nullptr;
197     }
198     Instruction* inst = &fInstructions.fromBack(fromBack);
199     if (inst->fStackID != fCurrentStackID) {
200         return nullptr;
201     }
202     return inst;
203 }
204 
lastInstructionOnAnyStack(int fromBack)205 Instruction* Builder::lastInstructionOnAnyStack(int fromBack) {
206     if (fInstructions.size() <= fromBack) {
207         return nullptr;
208     }
209     return &fInstructions.fromBack(fromBack);
210 }
211 
unary_op(BuilderOp op,int32_t slots)212 void Builder::unary_op(BuilderOp op, int32_t slots) {
213     switch (op) {
214         case ALL_SINGLE_SLOT_UNARY_OP_CASES:
215         case ALL_MULTI_SLOT_UNARY_OP_CASES:
216             this->appendInstruction(op, {}, slots);
217             break;
218 
219         default:
220             SkDEBUGFAIL("not a unary op");
221             break;
222     }
223 }
224 
binary_op(BuilderOp op,int32_t slots)225 void Builder::binary_op(BuilderOp op, int32_t slots) {
226     if (Instruction* lastInstruction = this->lastInstruction()) {
227         // If we just pushed or splatted a constant onto the stack...
228         if (lastInstruction->fOp == BuilderOp::push_constant &&
229             lastInstruction->fImmA >= slots) {
230             // ... and this op has an immediate-mode equivalent...
231             int32_t constantValue = lastInstruction->fImmB;
232             BuilderOp immOp = convert_n_way_op_to_immediate(op, slots, &constantValue);
233             if (immOp != op) {
234                 // ... discard the constants from the stack, and use an immediate-mode op.
235                 this->discard_stack(slots);
236                 this->appendInstruction(immOp, {}, slots, constantValue);
237                 return;
238             }
239         }
240     }
241 
242     switch (op) {
243         case ALL_N_WAY_BINARY_OP_CASES:
244         case ALL_MULTI_SLOT_BINARY_OP_CASES:
245             this->appendInstruction(op, {}, slots);
246             break;
247 
248         default:
249             SkDEBUGFAIL("not a binary op");
250             break;
251     }
252 }
253 
ternary_op(BuilderOp op,int32_t slots)254 void Builder::ternary_op(BuilderOp op, int32_t slots) {
255     switch (op) {
256         case ALL_N_WAY_TERNARY_OP_CASES:
257         case ALL_MULTI_SLOT_TERNARY_OP_CASES:
258             this->appendInstruction(op, {}, slots);
259             break;
260 
261         default:
262             SkDEBUGFAIL("not a ternary op");
263             break;
264     }
265 }
266 
dot_floats(int32_t slots)267 void Builder::dot_floats(int32_t slots) {
268     switch (slots) {
269         case 1: this->appendInstruction(BuilderOp::mul_n_floats, {}, slots); break;
270         case 2: this->appendInstruction(BuilderOp::dot_2_floats, {}, slots); break;
271         case 3: this->appendInstruction(BuilderOp::dot_3_floats, {}, slots); break;
272         case 4: this->appendInstruction(BuilderOp::dot_4_floats, {}, slots); break;
273 
274         default:
275             SkDEBUGFAIL("invalid number of slots");
276             break;
277     }
278 }
279 
// Emits a `refract_4_floats` op; its operand layout on the stack is defined by the Raster
// Pipeline op itself (see SkRasterPipelineOpList).
void Builder::refract_floats() {
    this->appendInstruction(BuilderOp::refract_4_floats, {});
}
283 
inverse_matrix(int32_t n)284 void Builder::inverse_matrix(int32_t n) {
285     switch (n) {
286         case 2:  this->appendInstruction(BuilderOp::inverse_mat2, {}, 4);  break;
287         case 3:  this->appendInstruction(BuilderOp::inverse_mat3, {}, 9);  break;
288         case 4:  this->appendInstruction(BuilderOp::inverse_mat4, {}, 16); break;
289         default: SkUNREACHABLE;
290     }
291 }
292 
pad_stack(int32_t count)293 void Builder::pad_stack(int32_t count) {
294     if (count > 0) {
295         this->appendInstruction(BuilderOp::pad_stack, {}, count);
296     }
297 }
298 
// Peephole: looks for the trailing three-instruction pattern
//     push_slots/push_immutable, immediate-op, copy_stack_to_slots_unmasked
// where the pop writes back to the very slots that were pushed. When found, the push is shrunk,
// the pop is removed, and the immediate op is retargeted to operate on the value slots directly.
// Returns true if the rewrite was performed.
bool Builder::simplifyImmediateUnmaskedOp() {
    if (fInstructions.size() < 3) {
        return false;
    }

    // If we detect a pattern of 'push, immediate-op, unmasked pop', then we can
    // convert it into an immediate-op directly onto the value slots and take the
    // stack entirely out of the equation.
    Instruction* popInstruction  = this->lastInstruction(/*fromBack=*/0);
    Instruction* immInstruction  = this->lastInstruction(/*fromBack=*/1);
    Instruction* pushInstruction = this->lastInstruction(/*fromBack=*/2);

    // If the last instruction is an unmasked pop...
    if (popInstruction && immInstruction && pushInstruction &&
        popInstruction->fOp == BuilderOp::copy_stack_to_slots_unmasked) {
        // ... and the prior instruction was an immediate-mode op, with the same number of slots...
        if (is_immediate_op(immInstruction->fOp) &&
            immInstruction->fImmA == popInstruction->fImmA) {
            // ... and we support multiple-slot immediates (if this op calls for it)...
            if (immInstruction->fImmA == 1 || is_multi_slot_immediate_op(immInstruction->fOp)) {
                // ... and the prior instruction was `push_slots` or `push_immutable` of at least
                // that many slots...
                if ((pushInstruction->fOp == BuilderOp::push_slots ||
                     pushInstruction->fOp == BuilderOp::push_immutable) &&
                    pushInstruction->fImmA >= popInstruction->fImmA) {
                    // ... onto the same slot range...
                    Slot immSlot = popInstruction->fSlotA + popInstruction->fImmA;
                    Slot pushSlot = pushInstruction->fSlotA + pushInstruction->fImmA;
                    if (immSlot == pushSlot) {
                        // ... we can shrink the push, eliminate the pop, and perform the immediate
                        // op in-place instead.
                        pushInstruction->fImmA -= immInstruction->fImmA;
                        immInstruction->fSlotA = immSlot - immInstruction->fImmA;
                        fInstructions.pop_back();
                        return true;
                    }
                }
            }
        }
    }

    return false;
}
342 
// Removes `count` slots from the top of the stack identified by `stackID`. Before emitting a
// discard op, this walks backwards through trailing instructions on the same stack and cancels
// out pushes directly, shrinking or deleting instructions wherever possible.
void Builder::discard_stack(int32_t count, int stackID) {
    // If we pushed something onto the stack and then immediately discarded part of it, we can
    // shrink or eliminate the push.
    while (count > 0) {
        Instruction* lastInstruction = this->lastInstructionOnAnyStack();
        if (!lastInstruction || lastInstruction->fStackID != stackID) {
            break;
        }

        switch (lastInstruction->fOp) {
            case BuilderOp::discard_stack:
                // Our last op was actually a separate discard_stack; combine the discards.
                lastInstruction->fImmA += count;
                return;

            case BuilderOp::push_clone:
            case BuilderOp::push_clone_from_stack:
            case BuilderOp::push_clone_indirect_from_stack:
            case BuilderOp::push_constant:
            case BuilderOp::push_immutable:
            case BuilderOp::push_immutable_indirect:
            case BuilderOp::push_slots:
            case BuilderOp::push_slots_indirect:
            case BuilderOp::push_uniform:
            case BuilderOp::push_uniform_indirect:
            case BuilderOp::pad_stack: {
                // Our last op was a multi-slot push; these cancel out. Eliminate the op if its
                // count reached zero.
                int cancelOut = std::min(count, lastInstruction->fImmA);
                count                  -= cancelOut;
                lastInstruction->fImmA -= cancelOut;
                if (lastInstruction->fImmA == 0) {
                    fInstructions.pop_back();
                }
                continue;
            }
            case BuilderOp::push_condition_mask:
            case BuilderOp::push_loop_mask:
            case BuilderOp::push_return_mask:
                // Our last op was a single-slot push; cancel out one discard and eliminate the op.
                --count;
                fInstructions.pop_back();
                continue;

            case BuilderOp::copy_stack_to_slots_unmasked: {
                // Look for a pattern of `push, immediate-ops, pop` and simplify it down to an
                // immediate-op directly to the value slot.
                if (count == 1) {
                    if (this->simplifyImmediateUnmaskedOp()) {
                        return;
                    }
                }

                // A `copy_stack_to_slots_unmasked` op, followed immediately by a `discard_stack`
                // op with an equal number of slots, is interpreted as an unmasked stack pop.
                // We can simplify pops in a variety of ways. First, temporarily get rid of
                // `copy_stack_to_slots_unmasked`.
                if (count == lastInstruction->fImmA) {
                    SlotRange dst{lastInstruction->fSlotA, lastInstruction->fImmA};
                    fInstructions.pop_back();

                    // See if we can write this pop in a simpler way.
                    this->simplifyPopSlotsUnmasked(&dst);

                    // If simplification consumed the entire range, we're done!
                    if (dst.count == 0) {
                        return;
                    }

                    // Simplification did not consume the entire range. We are still responsible for
                    // copying-back and discarding any remaining slots.
                    this->copy_stack_to_slots_unmasked(dst);
                    count = dst.count;
                }
                break;
            }
            default:
                break;
        }

        // This instruction wasn't a push.
        break;
    }

    if (count > 0) {
        this->appendInstruction(BuilderOp::discard_stack, {}, count);
    }
}
431 
label(int labelID)432 void Builder::label(int labelID) {
433     SkASSERT(labelID >= 0 && labelID < fNumLabels);
434 
435     // If the previous instruction was a branch to this label, it's a no-op; jumping to the very
436     // next instruction is effectively meaningless.
437     while (const Instruction* lastInstruction = this->lastInstructionOnAnyStack()) {
438         switch (lastInstruction->fOp) {
439             case BuilderOp::jump:
440             case BuilderOp::branch_if_all_lanes_active:
441             case BuilderOp::branch_if_any_lanes_active:
442             case BuilderOp::branch_if_no_lanes_active:
443             case BuilderOp::branch_if_no_active_lanes_on_stack_top_equal:
444                 if (lastInstruction->fImmA == labelID) {
445                     fInstructions.pop_back();
446                     continue;
447                 }
448                 break;
449 
450             default:
451                 break;
452         }
453         break;
454     }
455     this->appendInstruction(BuilderOp::label, {}, labelID);
456 }
457 
jump(int labelID)458 void Builder::jump(int labelID) {
459     SkASSERT(labelID >= 0 && labelID < fNumLabels);
460     if (const Instruction* lastInstruction = this->lastInstructionOnAnyStack()) {
461         if (lastInstruction->fOp == BuilderOp::jump) {
462             // The previous instruction was also `jump`, so this branch could never possibly occur.
463             return;
464         }
465     }
466     this->appendInstruction(BuilderOp::jump, {}, labelID);
467 }
468 
branch_if_any_lanes_active(int labelID)469 void Builder::branch_if_any_lanes_active(int labelID) {
470     if (!this->executionMaskWritesAreEnabled()) {
471         this->jump(labelID);
472         return;
473     }
474 
475     SkASSERT(labelID >= 0 && labelID < fNumLabels);
476     if (const Instruction* lastInstruction = this->lastInstructionOnAnyStack()) {
477         if (lastInstruction->fOp == BuilderOp::branch_if_any_lanes_active ||
478             lastInstruction->fOp == BuilderOp::jump) {
479             // The previous instruction was `jump` or `branch_if_any_lanes_active`, so this branch
480             // could never possibly occur.
481             return;
482         }
483     }
484     this->appendInstruction(BuilderOp::branch_if_any_lanes_active, {}, labelID);
485 }
486 
branch_if_all_lanes_active(int labelID)487 void Builder::branch_if_all_lanes_active(int labelID) {
488     if (!this->executionMaskWritesAreEnabled()) {
489         this->jump(labelID);
490         return;
491     }
492 
493     SkASSERT(labelID >= 0 && labelID < fNumLabels);
494     if (const Instruction* lastInstruction = this->lastInstructionOnAnyStack()) {
495         if (lastInstruction->fOp == BuilderOp::branch_if_all_lanes_active ||
496             lastInstruction->fOp == BuilderOp::jump) {
497             // The previous instruction was `jump` or `branch_if_all_lanes_active`, so this branch
498             // could never possibly occur.
499             return;
500         }
501     }
502     this->appendInstruction(BuilderOp::branch_if_all_lanes_active, {}, labelID);
503 }
504 
branch_if_no_lanes_active(int labelID)505 void Builder::branch_if_no_lanes_active(int labelID) {
506     if (!this->executionMaskWritesAreEnabled()) {
507         return;
508     }
509 
510     SkASSERT(labelID >= 0 && labelID < fNumLabels);
511     if (const Instruction* lastInstruction = this->lastInstructionOnAnyStack()) {
512         if (lastInstruction->fOp == BuilderOp::branch_if_no_lanes_active ||
513             lastInstruction->fOp == BuilderOp::jump) {
514             // The previous instruction was `jump` or `branch_if_no_lanes_active`, so this branch
515             // could never possibly occur.
516             return;
517         }
518     }
519     this->appendInstruction(BuilderOp::branch_if_no_lanes_active, {}, labelID);
520 }
521 
branch_if_no_active_lanes_on_stack_top_equal(int value,int labelID)522 void Builder::branch_if_no_active_lanes_on_stack_top_equal(int value, int labelID) {
523     SkASSERT(labelID >= 0 && labelID < fNumLabels);
524     if (const Instruction* lastInstruction = this->lastInstructionOnAnyStack()) {
525         if (lastInstruction->fOp == BuilderOp::jump ||
526             (lastInstruction->fOp == BuilderOp::branch_if_no_active_lanes_on_stack_top_equal &&
527              lastInstruction->fImmB == value)) {
528             // The previous instruction was `jump` or `branch_if_no_active_lanes_on_stack_top_equal`
529             // (checking against the same value), so this branch could never possibly occur.
530             return;
531         }
532     }
533     this->appendInstruction(BuilderOp::branch_if_no_active_lanes_on_stack_top_equal,
534                             {}, labelID, value);
535 }
536 
// Pushes the `src` slot range onto the current stack via `op` (push_slots or push_immutable).
// Applies two peepholes: (1) extends an immediately-preceding push of the same kind when the
// ranges are contiguous; (2) removes a `copy_stack_to_slots*, discard_stack, push_slots`
// round-trip that re-reads the slots just written.
void Builder::push_slots_or_immutable(SlotRange src, BuilderOp op) {
    SkASSERT(src.count >= 0);
    if (Instruction* lastInstruction = this->lastInstruction()) {
        // If the previous instruction was pushing slots contiguous to this range, we can collapse
        // the two pushes into one larger push.
        if (lastInstruction->fOp == op &&
            lastInstruction->fSlotA + lastInstruction->fImmA == src.index) {
            lastInstruction->fImmA += src.count;
            src.count = 0;
        }
    }

    if (src.count > 0) {
        this->appendInstruction(op, {src.index}, src.count);
    }

    // Look for a sequence of "copy stack to X, discard stack, copy X to stack". This is a common
    // pattern when multiple operations in a row affect the same variable. When we see this, we can
    // eliminate both the discard and the push.
    if (fInstructions.size() >= 3) {
        const Instruction* pushInst        = this->lastInstruction(/*fromBack=*/0);
        const Instruction* discardInst     = this->lastInstruction(/*fromBack=*/1);
        const Instruction* copyToSlotsInst = this->lastInstruction(/*fromBack=*/2);

        if (pushInst && discardInst && copyToSlotsInst && pushInst->fOp == BuilderOp::push_slots) {
            int pushIndex = pushInst->fSlotA;
            int pushCount = pushInst->fImmA;

            // Look for a `discard_stack` matching our push count.
            if (discardInst->fOp == BuilderOp::discard_stack && discardInst->fImmA == pushCount) {
                // Look for a `copy_stack_to_slots` matching our push.
                if ((copyToSlotsInst->fOp == BuilderOp::copy_stack_to_slots ||
                     copyToSlotsInst->fOp == BuilderOp::copy_stack_to_slots_unmasked) &&
                    copyToSlotsInst->fSlotA == pushIndex && copyToSlotsInst->fImmA == pushCount) {
                    // We found a matching sequence. Remove the discard and push.
                    fInstructions.pop_back();
                    fInstructions.pop_back();
                    return;
                }
            }
        }
    }
}
580 
push_slots_or_immutable_indirect(SlotRange fixedRange,int dynamicStackID,SlotRange limitRange,BuilderOp op)581 void Builder::push_slots_or_immutable_indirect(SlotRange fixedRange,
582                                                int dynamicStackID,
583                                                SlotRange limitRange,
584                                                BuilderOp op) {
585     // SlotA: fixed-range start
586     // SlotB: limit-range end
587     // immA: number of slots
588     // immB: dynamic stack ID
589     this->appendInstruction(op,
590                             {fixedRange.index, limitRange.index + limitRange.count},
591                             fixedRange.count,
592                             dynamicStackID);
593 }
594 
push_uniform(SlotRange src)595 void Builder::push_uniform(SlotRange src) {
596     SkASSERT(src.count >= 0);
597     if (Instruction* lastInstruction = this->lastInstruction()) {
598         // If the previous instruction was pushing uniforms contiguous to this range, we can
599         // collapse the two pushes into one larger push.
600         if (lastInstruction->fOp == BuilderOp::push_uniform &&
601             lastInstruction->fSlotA + lastInstruction->fImmA == src.index) {
602             lastInstruction->fImmA += src.count;
603             return;
604         }
605     }
606 
607     if (src.count > 0) {
608         this->appendInstruction(BuilderOp::push_uniform, {src.index}, src.count);
609     }
610 }
611 
push_uniform_indirect(SlotRange fixedRange,int dynamicStackID,SlotRange limitRange)612 void Builder::push_uniform_indirect(SlotRange fixedRange,
613                                     int dynamicStackID,
614                                     SlotRange limitRange) {
615     // SlotA: fixed-range start
616     // SlotB: limit-range end
617     // immA: number of slots
618     // immB: dynamic stack ID
619     this->appendInstruction(BuilderOp::push_uniform_indirect,
620                             {fixedRange.index, limitRange.index + limitRange.count},
621                             fixedRange.count,
622                             dynamicStackID);
623 }
624 
trace_var_indirect(int traceMaskStackID,SlotRange fixedRange,int dynamicStackID,SlotRange limitRange)625 void Builder::trace_var_indirect(int traceMaskStackID,
626                                  SlotRange fixedRange,
627                                  int dynamicStackID,
628                                  SlotRange limitRange) {
629     // SlotA: fixed-range start
630     // SlotB: limit-range end
631     // immA: trace-mask stack ID
632     // immB: number of slots
633     // immC: dynamic stack ID
634     this->appendInstruction(BuilderOp::trace_var_indirect,
635                             {fixedRange.index, limitRange.index + limitRange.count},
636                             traceMaskStackID,
637                             fixedRange.count,
638                             dynamicStackID);
639 }
640 
push_constant_i(int32_t val,int count)641 void Builder::push_constant_i(int32_t val, int count) {
642     SkASSERT(count >= 0);
643     if (count > 0) {
644         if (Instruction* lastInstruction = this->lastInstruction()) {
645             // If the previous op is pushing the same value, we can just push more of them.
646             if (lastInstruction->fOp == BuilderOp::push_constant && lastInstruction->fImmB == val) {
647                 lastInstruction->fImmA += count;
648                 return;
649             }
650         }
651         this->appendInstruction(BuilderOp::push_constant, {}, count, val);
652     }
653 }
654 
// Pushes `count` additional copies of the current stack-top value. Prefers widening a preceding
// push_constant; otherwise splats via swizzles (which can grow one slot into up to four) and
// clones the four-slot group to cover the remainder with few instructions.
void Builder::push_duplicates(int count) {
    if (Instruction* lastInstruction = this->lastInstruction()) {
        // If the previous op is pushing a constant, we can just push more of them.
        if (lastInstruction->fOp == BuilderOp::push_constant) {
            lastInstruction->fImmA += count;
            return;
        }
    }
    SkASSERT(count >= 0);
    if (count >= 3) {
        // Use a swizzle to splat the input into a 4-slot value.
        // (This net-adds three slots, so `count` drops by 3.)
        this->swizzle(/*consumedSlots=*/1, {0, 0, 0, 0});
        count -= 3;
    }
    for (; count >= 4; count -= 4) {
        // Clone the splatted value four slots at a time.
        this->push_clone(/*numSlots=*/4);
    }
    // Use a swizzle or clone to handle the trailing items.
    switch (count) {
        case 3:  this->swizzle(/*consumedSlots=*/1, {0, 0, 0, 0}); break;
        case 2:  this->swizzle(/*consumedSlots=*/1, {0, 0, 0});    break;
        case 1:  this->push_clone(/*numSlots=*/1);                 break;
        default: break;
    }
}
681 
push_clone(int numSlots,int offsetFromStackTop)682 void Builder::push_clone(int numSlots, int offsetFromStackTop) {
683     // If we are cloning the stack top...
684     if (numSlots == 1 && offsetFromStackTop == 0) {
685         // ... and the previous op is pushing a constant...
686         if (Instruction* lastInstruction = this->lastInstruction()) {
687             if (lastInstruction->fOp == BuilderOp::push_constant) {
688                 // ... we can just push more of them.
689                 lastInstruction->fImmA += 1;
690                 return;
691             }
692         }
693     }
694     this->appendInstruction(BuilderOp::push_clone, {}, numSlots, numSlots + offsetFromStackTop);
695 }
696 
push_clone_from_stack(SlotRange range,int otherStackID,int offsetFromStackTop)697 void Builder::push_clone_from_stack(SlotRange range, int otherStackID, int offsetFromStackTop) {
698     // immA: number of slots
699     // immB: other stack ID
700     // immC: offset from stack top
701     offsetFromStackTop -= range.index;
702 
703     if (Instruction* lastInstruction = this->lastInstruction()) {
704         // If the previous op is also pushing a clone...
705         if (lastInstruction->fOp == BuilderOp::push_clone_from_stack &&
706             // ... from the same stack...
707             lastInstruction->fImmB == otherStackID &&
708             // ... and this clone starts at the same place that the last clone ends...
709             lastInstruction->fImmC - lastInstruction->fImmA == offsetFromStackTop) {
710             // ... just extend the existing clone-op.
711             lastInstruction->fImmA += range.count;
712             return;
713         }
714     }
715 
716     this->appendInstruction(BuilderOp::push_clone_from_stack, {},
717                             range.count, otherStackID, offsetFromStackTop);
718 }
719 
push_clone_indirect_from_stack(SlotRange fixedOffset,int dynamicStackID,int otherStackID,int offsetFromStackTop)720 void Builder::push_clone_indirect_from_stack(SlotRange fixedOffset,
721                                              int dynamicStackID,
722                                              int otherStackID,
723                                              int offsetFromStackTop) {
724     // immA: number of slots
725     // immB: other stack ID
726     // immC: offset from stack top
727     // immD: dynamic stack ID
728     offsetFromStackTop -= fixedOffset.index;
729 
730     this->appendInstruction(BuilderOp::push_clone_indirect_from_stack, {},
731                             fixedOffset.count, otherStackID, offsetFromStackTop, dynamicStackID);
732 }
733 
pop_slots(SlotRange dst)734 void Builder::pop_slots(SlotRange dst) {
735     if (!this->executionMaskWritesAreEnabled()) {
736         this->pop_slots_unmasked(dst);
737         return;
738     }
739 
740     this->copy_stack_to_slots(dst);
741     this->discard_stack(dst.count);
742 }
743 
// Peephole helper for pop_slots_unmasked. Starting at the back of `dst`, peels off
// slots whose values were produced by the most recent push instruction, rewriting
// that push as a direct copy into the destination and shrinking (or deleting) the
// push op. Recurses so an entire run of compatible pushes can be folded away.
// Whatever remains in `dst` afterward must still be handled with a normal
// copy-from-stack by the caller.
void Builder::simplifyPopSlotsUnmasked(SlotRange* dst) {
    if (!dst->count) {
        // There's nothing left to simplify.
        return;
    }
    Instruction* lastInstruction = this->lastInstruction();
    if (!lastInstruction) {
        // There's nothing left to simplify.
        return;
    }
    BuilderOp lastOp = lastInstruction->fOp;

    // If the last instruction is pushing a constant, we can simplify it by copying the constant
    // directly into the destination slot.
    if (lastOp == BuilderOp::push_constant) {
        // Get the last slot. (immB holds the constant's bit pattern; immA is the push's width.)
        int32_t value = lastInstruction->fImmB;
        lastInstruction->fImmA--;
        if (lastInstruction->fImmA == 0) {
            // The push op is now empty; remove it entirely.
            fInstructions.pop_back();
        }

        // Consume one destination slot.
        dst->count--;
        Slot destinationSlot = dst->index + dst->count;

        // Continue simplifying if possible.
        this->simplifyPopSlotsUnmasked(dst);

        // Write the constant directly to the destination slot. (This happens after the
        // recursive call, so the direct copies are emitted in ascending-slot order.)
        this->copy_constant(destinationSlot, value);
        return;
    }

    // If the last instruction is pushing a uniform, we can simplify it by copying the uniform
    // directly into the destination slot.
    if (lastOp == BuilderOp::push_uniform) {
        // Get the last slot.
        Slot sourceSlot = lastInstruction->fSlotA + lastInstruction->fImmA - 1;
        lastInstruction->fImmA--;
        if (lastInstruction->fImmA == 0) {
            // The push op is now empty; remove it entirely.
            fInstructions.pop_back();
        }

        // Consume one destination slot.
        dst->count--;
        Slot destinationSlot = dst->index + dst->count;

        // Continue simplifying if possible.
        this->simplifyPopSlotsUnmasked(dst);

        // Write the uniform directly to the destination slot.
        this->copy_uniform_to_slots_unmasked({destinationSlot, 1}, {sourceSlot, 1});
        return;
    }

    // If the last instruction is pushing a slot or immutable, we can just copy that slot.
    if (lastOp == BuilderOp::push_slots || lastOp == BuilderOp::push_immutable) {
        // Get the last slot.
        Slot sourceSlot = lastInstruction->fSlotA + lastInstruction->fImmA - 1;
        lastInstruction->fImmA--;
        if (lastInstruction->fImmA == 0) {
            // The push op is now empty; remove it entirely.
            fInstructions.pop_back();
        }

        // Consume one destination slot.
        dst->count--;
        Slot destinationSlot = dst->index + dst->count;

        // Try once more.
        this->simplifyPopSlotsUnmasked(dst);

        // Copy the slot directly.
        if (lastOp == BuilderOp::push_slots) {
            if (destinationSlot != sourceSlot) {
                this->copy_slots_unmasked({destinationSlot, 1}, {sourceSlot, 1});
            } else {
                // Copying from a value-slot into the same value-slot is a no-op.
            }
        } else {
            // Copy from immutable data directly to the destination slot.
            this->copy_immutable_unmasked({destinationSlot, 1}, {sourceSlot, 1});
        }
        return;
    }
}
830 
pop_slots_unmasked(SlotRange dst)831 void Builder::pop_slots_unmasked(SlotRange dst) {
832     SkASSERT(dst.count >= 0);
833     this->copy_stack_to_slots_unmasked(dst);
834     this->discard_stack(dst.count);
835 }
836 
exchange_src()837 void Builder::exchange_src() {
838     if (Instruction* lastInstruction = this->lastInstruction()) {
839         // If the previous op is also an exchange-src...
840         if (lastInstruction->fOp == BuilderOp::exchange_src) {
841             // ... both ops can be eliminated. A double-swap is a no-op.
842             fInstructions.pop_back();
843             return;
844         }
845     }
846 
847     this->appendInstruction(BuilderOp::exchange_src, {});
848 }
849 
pop_src_rgba()850 void Builder::pop_src_rgba() {
851     if (Instruction* lastInstruction = this->lastInstruction()) {
852         // If the previous op is exchanging src.rgba with the stack...
853         if (lastInstruction->fOp == BuilderOp::exchange_src) {
854             // ... both ops can be eliminated. It's just sliding the color back and forth.
855             fInstructions.pop_back();
856             this->discard_stack(4);
857             return;
858         }
859     }
860 
861     this->appendInstruction(BuilderOp::pop_src_rgba, {});
862 }
863 
// Copies the `dst.count` slots located `offsetFromStackTop` below the stack top
// into `dst`, honoring the execution mask.
void Builder::copy_stack_to_slots(SlotRange dst, int offsetFromStackTop) {
    // If the execution mask is known to be all-true, then we can ignore the write mask.
    if (!this->executionMaskWritesAreEnabled()) {
        this->copy_stack_to_slots_unmasked(dst, offsetFromStackTop);
        return;
    }

    // If the last instruction copied the previous stack slots, just extend it.
    if (Instruction* lastInstruction = this->lastInstruction()) {
        // If the last op is copy-stack-to-slots...
        if (lastInstruction->fOp == BuilderOp::copy_stack_to_slots &&
            // and this op's destination is immediately after the last copy-slots-op's destination
            lastInstruction->fSlotA + lastInstruction->fImmA == dst.index &&
            // and this op's source is immediately after the last copy-slots-op's source
            // (immB - immA locates the slot just past the previous copy's source range)
            lastInstruction->fImmB - lastInstruction->fImmA == offsetFromStackTop) {
            // then we can just extend the copy!
            lastInstruction->fImmA += dst.count;
            return;
        }
    }

    // SlotA: destination start; immA: number of slots; immB: offset from stack top.
    this->appendInstruction(BuilderOp::copy_stack_to_slots, {dst.index},
                            dst.count, offsetFromStackTop);
}
888 
copy_stack_to_slots_indirect(SlotRange fixedRange,int dynamicStackID,SlotRange limitRange)889 void Builder::copy_stack_to_slots_indirect(SlotRange fixedRange,
890                                            int dynamicStackID,
891                                            SlotRange limitRange) {
892     // SlotA: fixed-range start
893     // SlotB: limit-range end
894     // immA: number of slots
895     // immB: dynamic stack ID
896     this->appendInstruction(BuilderOp::copy_stack_to_slots_indirect,
897                             {fixedRange.index, limitRange.index + limitRange.count},
898                             fixedRange.count,
899                             dynamicStackID);
900 }
901 
slot_ranges_overlap(SlotRange x,SlotRange y)902 static bool slot_ranges_overlap(SlotRange x, SlotRange y) {
903     return x.index < y.index + y.count &&
904            y.index < x.index + x.count;
905 }
906 
copy_constant(Slot slot,int constantValue)907 void Builder::copy_constant(Slot slot, int constantValue) {
908     // If the last instruction copied the same constant, just extend it.
909     if (Instruction* lastInstr = this->lastInstruction()) {
910         // If the last op is copy-constant...
911         if (lastInstr->fOp == BuilderOp::copy_constant &&
912             // ... and has the same value...
913             lastInstr->fImmB == constantValue &&
914             // ... and the slot is immediately after the last copy-constant's destination...
915             lastInstr->fSlotA + lastInstr->fImmA == slot) {
916             // ... then we can extend the copy!
917             lastInstr->fImmA += 1;
918             return;
919         }
920     }
921 
922     this->appendInstruction(BuilderOp::copy_constant, {slot}, 1, constantValue);
923 }
924 
// Copies `src` into `dst`, ignoring the execution mask.
void Builder::copy_slots_unmasked(SlotRange dst, SlotRange src) {
    // If the last instruction copied adjacent slots, just extend it.
    if (Instruction* lastInstr = this->lastInstruction()) {
        // If the last op is a match...
        if (lastInstr->fOp == BuilderOp::copy_slot_unmasked &&
            // and this op's destination is immediately after the last copy-slots-op's destination
            lastInstr->fSlotA + lastInstr->fImmA == dst.index &&
            // and this op's source is immediately after the last copy-slots-op's source
            lastInstr->fSlotB + lastInstr->fImmA == src.index &&
            // and the source/dest ranges will not overlap — note that the check is applied to
            // the ranges as they would exist AFTER the merge (old length plus dst.count)
            !slot_ranges_overlap({lastInstr->fSlotB, lastInstr->fImmA + dst.count},
                                 {lastInstr->fSlotA, lastInstr->fImmA + dst.count})) {
            // then we can just extend the copy!
            lastInstr->fImmA += dst.count;
            return;
        }
    }

    SkASSERT(dst.count == src.count);
    // SlotA: destination start; SlotB: source start; immA: number of slots.
    this->appendInstruction(BuilderOp::copy_slot_unmasked, {dst.index, src.index}, dst.count);
}
946 
copy_immutable_unmasked(SlotRange dst,SlotRange src)947 void Builder::copy_immutable_unmasked(SlotRange dst, SlotRange src) {
948     // If the last instruction copied adjacent immutable data, just extend it.
949     if (Instruction* lastInstr = this->lastInstruction()) {
950         // If the last op is a match...
951         if (lastInstr->fOp == BuilderOp::copy_immutable_unmasked &&
952             // and this op's destination is immediately after the last copy-slots-op's destination
953             lastInstr->fSlotA + lastInstr->fImmA == dst.index &&
954             // and this op's source is immediately after the last copy-slots-op's source
955             lastInstr->fSlotB + lastInstr->fImmA == src.index) {
956             // then we can just extend the copy!
957             lastInstr->fImmA += dst.count;
958             return;
959         }
960     }
961 
962     SkASSERT(dst.count == src.count);
963     this->appendInstruction(BuilderOp::copy_immutable_unmasked, {dst.index, src.index}, dst.count);
964 }
965 
// Copies a range of uniforms into value slots, ignoring the execution mask.
// Note the operand order in the emitted instruction: SlotA is the uniform
// source and SlotB is the destination (reversed relative to the other copy ops).
void Builder::copy_uniform_to_slots_unmasked(SlotRange dst, SlotRange src) {
    // If the last instruction copied adjacent uniforms, just extend it.
    if (Instruction* lastInstr = this->lastInstruction()) {
        // If the last op is copy-uniform-to-slots-unmasked...
        if (lastInstr->fOp == BuilderOp::copy_uniform_to_slots_unmasked &&
            // and this op's destination is immediately after the last op's destination
            lastInstr->fSlotB + lastInstr->fImmA == dst.index &&
            // and this op's source is immediately after the last op's source
            lastInstr->fSlotA + lastInstr->fImmA == src.index) {
            // then we can just extend the copy!
            lastInstr->fImmA += dst.count;
            return;
        }
    }

    SkASSERT(dst.count == src.count);
    // SlotA: source (uniform) start; SlotB: destination start; immA: number of slots.
    this->appendInstruction(BuilderOp::copy_uniform_to_slots_unmasked, {src.index, dst.index},
                            dst.count);
}
985 
// Copies the `dst.count` slots located `offsetFromStackTop` below the stack top
// into `dst`, ignoring the execution mask.
void Builder::copy_stack_to_slots_unmasked(SlotRange dst, int offsetFromStackTop) {
    // If the last instruction copied the previous stack slots, just extend it.
    if (Instruction* lastInstr = this->lastInstruction()) {
        // If the last op is copy-stack-to-slots-unmasked...
        if (lastInstr->fOp == BuilderOp::copy_stack_to_slots_unmasked &&
            // and this op's destination is immediately after the last copy-slots-op's destination
            lastInstr->fSlotA + lastInstr->fImmA == dst.index &&
            // and this op's source is immediately after the last copy-slots-op's source
            // (immB - immA locates the slot just past the previous copy's source range)
            lastInstr->fImmB - lastInstr->fImmA == offsetFromStackTop) {
            // then we can just extend the copy!
            lastInstr->fImmA += dst.count;
            return;
        }
    }

    // SlotA: destination start; immA: number of slots; immB: offset from stack top.
    this->appendInstruction(BuilderOp::copy_stack_to_slots_unmasked, {dst.index},
                            dst.count, offsetFromStackTop);
}
1004 
pop_return_mask()1005 void Builder::pop_return_mask() {
1006     SkASSERT(this->executionMaskWritesAreEnabled());
1007 
1008     // This instruction is going to overwrite the return mask. If the previous instruction was
1009     // masking off the return mask, that's wasted work and it can be eliminated.
1010     if (Instruction* lastInstruction = this->lastInstructionOnAnyStack()) {
1011         if (lastInstruction->fOp == BuilderOp::mask_off_return_mask) {
1012             fInstructions.pop_back();
1013         }
1014     }
1015 
1016     this->appendInstruction(BuilderOp::pop_return_mask, {});
1017 }
1018 
push_condition_mask()1019 void Builder::push_condition_mask() {
1020     SkASSERT(this->executionMaskWritesAreEnabled());
1021 
1022     // If the previous instruction is popping the condition mask, we can restore it onto the stack
1023     // "for free" instead of copying it.
1024     if (Instruction* lastInstruction = this->lastInstruction()) {
1025         if (lastInstruction->fOp == BuilderOp::pop_condition_mask) {
1026             this->pad_stack(1);
1027             return;
1028         }
1029     }
1030     this->appendInstruction(BuilderOp::push_condition_mask, {});
1031 }
1032 
merge_condition_mask()1033 void Builder::merge_condition_mask() {
1034     SkASSERT(this->executionMaskWritesAreEnabled());
1035 
1036     // This instruction is going to overwrite the condition mask. If the previous instruction was
1037     // loading the condition mask, that's wasted work and it can be eliminated.
1038     if (Instruction* lastInstruction = this->lastInstructionOnAnyStack()) {
1039         if (lastInstruction->fOp == BuilderOp::pop_condition_mask) {
1040             int stackID = lastInstruction->fStackID;
1041             fInstructions.pop_back();
1042             this->discard_stack(/*count=*/1, stackID);
1043         }
1044     }
1045 
1046     this->appendInstruction(BuilderOp::merge_condition_mask, {});
1047 }
1048 
// Fills `dst` with zeros, ignoring the execution mask. Emitted as a
// copy_constant of 0 so it can merge with an adjacent zero-fill on either side.
void Builder::zero_slots_unmasked(SlotRange dst) {
    if (Instruction* lastInstruction = this->lastInstruction()) {
        if (lastInstruction->fOp == BuilderOp::copy_constant && lastInstruction->fImmB == 0) {
            if (lastInstruction->fSlotA + lastInstruction->fImmA == dst.index) {
                // The previous instruction was zeroing the range immediately before this range.
                // Combine the ranges.
                lastInstruction->fImmA += dst.count;
                return;
            }

            if (lastInstruction->fSlotA == dst.index + dst.count) {
                // The previous instruction was zeroing the range immediately after this range.
                // Combine the ranges by sliding the start back to cover this range as well.
                lastInstruction->fSlotA = dst.index;
                lastInstruction->fImmA += dst.count;
                return;
            }
        }
    }

    // SlotA: destination start; immA: number of slots; immB: constant value (zero).
    this->appendInstruction(BuilderOp::copy_constant, {dst.index}, dst.count, 0);
}
1071 
pack_nybbles(SkSpan<const int8_t> components)1072 static int pack_nybbles(SkSpan<const int8_t> components) {
1073     // Pack up to 8 elements into nybbles, in reverse order.
1074     int packed = 0;
1075     for (auto iter = components.rbegin(); iter != components.rend(); ++iter) {
1076         SkASSERT(*iter >= 0 && *iter <= 0xF);
1077         packed <<= 4;
1078         packed |= *iter;
1079     }
1080     return packed;
1081 }
1082 
1083 template <typename T>
unpack_nybbles_to_offsets(uint32_t components,SkSpan<T> offsets)1084 static void unpack_nybbles_to_offsets(uint32_t components, SkSpan<T> offsets) {
1085     // Unpack component nybbles into byte-offsets pointing at stack slots.
1086     for (size_t index = 0; index < offsets.size(); ++index) {
1087         offsets[index] = (components & 0xF) * SkOpts::raster_pipeline_highp_stride * sizeof(float);
1088         components >>= 4;
1089     }
1090 }
1091 
// Returns the largest value found in the low `numComponents` nybbles of `components`.
static int max_packed_nybble(uint32_t components, size_t numComponents) {
    int largest = 0;
    while (numComponents--) {
        largest = std::max<int>(largest, components & 0xF);
        components >>= 4;
    }
    return largest;
}
1100 
swizzle_copy_stack_to_slots(SlotRange dst,SkSpan<const int8_t> components,int offsetFromStackTop)1101 void Builder::swizzle_copy_stack_to_slots(SlotRange dst,
1102                                           SkSpan<const int8_t> components,
1103                                           int offsetFromStackTop) {
1104     // When the execution-mask writes-enabled flag is off, we could squeeze out a little bit of
1105     // extra speed here by implementing and using an unmasked version of this op.
1106 
1107     // SlotA: fixed-range start
1108     // immA: number of swizzle components
1109     // immB: swizzle components
1110     // immC: offset from stack top
1111     this->appendInstruction(BuilderOp::swizzle_copy_stack_to_slots, {dst.index},
1112                             (int)components.size(),
1113                             pack_nybbles(components),
1114                             offsetFromStackTop);
1115 }
1116 
// Writes a swizzled copy of stack slots into a dynamically-indexed slot range,
// honoring the execution mask.
void Builder::swizzle_copy_stack_to_slots_indirect(SlotRange fixedRange,
                                                   int dynamicStackID,
                                                   SlotRange limitRange,
                                                   SkSpan<const int8_t> components,
                                                   int offsetFromStackTop) {
    // When the execution-mask writes-enabled flag is off, we could squeeze out a little bit of
    // extra speed here by implementing and using an unmasked version of this op.

    // SlotA: fixed-range start
    // SlotB: limit-range end
    // immA: number of swizzle components
    // immB: swizzle components, packed as nybbles
    // immC: offset from stack top
    // immD: dynamic stack ID
    this->appendInstruction(BuilderOp::swizzle_copy_stack_to_slots_indirect,
                            {fixedRange.index, limitRange.index + limitRange.count},
                            (int)components.size(),
                            pack_nybbles(components),
                            offsetFromStackTop,
                            dynamicStackID);
}
1138 
// Consumes `consumedSlots` elements on the stack and generates `components.size()`
// elements; each component is an offset into the consumed range. Chooses the
// cheapest encoding: leading components that keep the bottom slot in place are
// trimmed away, small swizzles use swizzle_1..swizzle_4, and anything larger
// falls back to the general `shuffle` op.
void Builder::swizzle(int consumedSlots, SkSpan<const int8_t> components) {
    SkASSERT(consumedSlots >= 0);

    // We only allow up to 16 elements, and they can only reach 0-15 slots, due to nybble packing.
    int numElements = components.size();
    SkASSERT(numElements <= 16);
    SkASSERT(std::all_of(components.begin(), components.end(), [](int8_t e){ return e >= 0; }));
    SkASSERT(std::all_of(components.begin(), components.end(), [](int8_t e){ return e <= 0xF; }));

    // Make a local copy of the element array.
    int8_t elements[16] = {};
    std::copy(components.begin(), components.end(), std::begin(elements));

    while (numElements > 0) {
        // If the first element of the swizzle keeps slot 0 in place...
        if (elements[0] != 0) {
            break;
        }
        // ...and slot 0 isn't used anywhere else in the swizzle...
        if (std::any_of(&elements[1], &elements[numElements], [](int8_t e) { return e == 0; })) {
            break;
        }
        // We can omit the first slot from the swizzle entirely, leaving its value on the stack.
        // Slide everything forward by one slot, and reduce the element index by one.
        for (int index = 1; index < numElements; ++index) {
            elements[index - 1] = elements[index] - 1;
        }
        elements[numElements - 1] = 0;
        --consumedSlots;
        --numElements;
    }

    // A completely empty swizzle is a discard.
    if (numElements == 0) {
        this->discard_stack(consumedSlots);
        return;
    }

    if (consumedSlots <= 4 && numElements <= 4) {
        // We can fit everything into a little swizzle.
        int op = (int)BuilderOp::swizzle_1 + numElements - 1;
        this->appendInstruction((BuilderOp)op, {}, consumedSlots,
                                pack_nybbles(SkSpan(elements, numElements)));
        return;
    }

    // This is a big swizzle. We use the `shuffle` op to handle these. immA counts the consumed
    // slots. immB counts the generated slots. immC and immD hold packed-nybble shuffle values.
    this->appendInstruction(BuilderOp::shuffle, {},
                            consumedSlots, numElements,
                            pack_nybbles(SkSpan(&elements[0], 8)),
                            pack_nybbles(SkSpan(&elements[8], 8)));
}
1193 
transpose(int columns,int rows)1194 void Builder::transpose(int columns, int rows) {
1195     // Transposes a matrix of size CxR on the stack (into a matrix of size RxC).
1196     int8_t elements[16] = {};
1197     size_t index = 0;
1198     for (int r = 0; r < rows; ++r) {
1199         for (int c = 0; c < columns; ++c) {
1200             elements[index++] = (c * rows) + r;
1201         }
1202     }
1203     this->swizzle(/*consumedSlots=*/columns * rows, SkSpan(elements, index));
1204 }
1205 
diagonal_matrix(int columns,int rows)1206 void Builder::diagonal_matrix(int columns, int rows) {
1207     // Generates a CxR diagonal matrix from the top two scalars on the stack.
1208     int8_t elements[16] = {};
1209     size_t index = 0;
1210     for (int c = 0; c < columns; ++c) {
1211         for (int r = 0; r < rows; ++r) {
1212             elements[index++] = (c == r) ? 1 : 0;
1213         }
1214     }
1215     this->swizzle(/*consumedSlots=*/2, SkSpan(elements, index));
1216 }
1217 
// Resizes the CxR matrix on the stack top to C'xR'. Elements that exist in both
// shapes are carried over; new elements become 1 on the diagonal and 0 elsewhere.
void Builder::matrix_resize(int origColumns, int origRows, int newColumns, int newRows) {
    int8_t elements[16] = {};
    size_t index = 0;

    size_t consumedSlots = origColumns * origRows;
    // Stack offsets of a synthesized literal 0.0 and 1.0; zero means "not pushed
    // yet". (Offset 0 always refers to the original matrix's first element, so
    // it is free to serve as the sentinel.)
    size_t zeroOffset = 0, oneOffset = 0;

    for (int c = 0; c < newColumns; ++c) {
        for (int r = 0; r < newRows; ++r) {
            if (c < origColumns && r < origRows) {
                // Push an element from the original matrix.
                elements[index++] = (c * origRows) + r;
            } else {
                // This element is outside the original matrix; push 1 or 0.
                if (c == r) {
                    // We need to synthesize a literal 1.
                    if (oneOffset == 0) {
                        this->push_constant_f(1.0f);
                        oneOffset = consumedSlots++;
                    }
                    elements[index++] = oneOffset;
                } else {
                    // We need to synthesize a literal 0.
                    if (zeroOffset == 0) {
                        this->push_constant_f(0.0f);
                        zeroOffset = consumedSlots++;
                    }
                    elements[index++] = zeroOffset;
                }
            }
        }
    }
    // The swizzle consumes the original matrix plus any synthesized constants,
    // and emits the resized matrix.
    this->swizzle(consumedSlots, SkSpan(elements, index));
}
1253 
matrix_multiply(int leftColumns,int leftRows,int rightColumns,int rightRows)1254 void Builder::matrix_multiply(int leftColumns, int leftRows, int rightColumns, int rightRows) {
1255     BuilderOp op;
1256     switch (leftColumns) {
1257         case 2:  op = BuilderOp::matrix_multiply_2; break;
1258         case 3:  op = BuilderOp::matrix_multiply_3; break;
1259         case 4:  op = BuilderOp::matrix_multiply_4; break;
1260         default: SkDEBUGFAIL("unsupported matrix dimensions"); return;
1261     }
1262 
1263     this->appendInstruction(op, {}, leftColumns, leftRows, rightColumns, rightRows);
1264 }
1265 
// Completes the build, moving the accumulated instruction list into a freshly
// allocated Program; the Builder's instruction list is consumed by this call.
// `debugTrace` may be null and is forwarded to the Program.
std::unique_ptr<Program> Builder::finish(int numValueSlots,
                                         int numUniformSlots,
                                         int numImmutableSlots,
                                         DebugTracePriv* debugTrace) {
    // Verify that calls to enableExecutionMaskWrites and disableExecutionMaskWrites are balanced.
    SkASSERT(fExecutionMaskWritesEnabled == 0);

    return std::make_unique<Program>(std::move(fInstructions), numValueSlots, numUniformSlots,
                                     numImmutableSlots, fNumLabels, debugTrace);
}
1276 
// Returns one instruction's net effect on its temp stack's depth, in slots:
// positive values push, negative values pop, and zero leaves the depth alone.
// Used by Program::tempStackMaxDepths to compute each stack's high-water mark.
static int stack_usage(const Instruction& inst) {
    switch (inst.fOp) {
        case BuilderOp::push_condition_mask:
        case BuilderOp::push_loop_mask:
        case BuilderOp::push_return_mask:
            return 1;

        case BuilderOp::push_src_rgba:
        case BuilderOp::push_dst_rgba:
        case BuilderOp::push_device_xy01:
            return 4;

        // For each of these ops, immA holds the number of slots pushed.
        case BuilderOp::push_immutable:
        case BuilderOp::push_immutable_indirect:
        case BuilderOp::push_constant:
        case BuilderOp::push_slots:
        case BuilderOp::push_slots_indirect:
        case BuilderOp::push_uniform:
        case BuilderOp::push_uniform_indirect:
        case BuilderOp::push_clone:
        case BuilderOp::push_clone_from_stack:
        case BuilderOp::push_clone_indirect_from_stack:
        case BuilderOp::pad_stack:
            return inst.fImmA;

        case BuilderOp::pop_condition_mask:
        case BuilderOp::pop_loop_mask:
        case BuilderOp::pop_and_reenable_loop_mask:
        case BuilderOp::pop_return_mask:
            return -1;

        case BuilderOp::pop_src_rgba:
        case BuilderOp::pop_dst_rgba:
            return -4;

        // These ops consume immA slots beyond the slots where their result lands.
        case ALL_N_WAY_BINARY_OP_CASES:
        case ALL_MULTI_SLOT_BINARY_OP_CASES:
        case BuilderOp::discard_stack:
        case BuilderOp::select:
            return -inst.fImmA;

        // Ternary ops consume two extra operands' worth of slots (immA each).
        case ALL_N_WAY_TERNARY_OP_CASES:
        case ALL_MULTI_SLOT_TERNARY_OP_CASES:
            return 2 * -inst.fImmA;

        case BuilderOp::swizzle_1:
            return 1 - inst.fImmA;  // consumes immA slots and emits a scalar
        case BuilderOp::swizzle_2:
            return 2 - inst.fImmA;  // consumes immA slots and emits a 2-slot vector
        case BuilderOp::swizzle_3:
            return 3 - inst.fImmA;  // consumes immA slots and emits a 3-slot vector
        case BuilderOp::swizzle_4:
            return 4 - inst.fImmA;  // consumes immA slots and emits a 4-slot vector

        case BuilderOp::dot_2_floats:
            return -3;  // consumes two 2-slot vectors and emits one scalar
        case BuilderOp::dot_3_floats:
            return -5;  // consumes two 3-slot vectors and emits one scalar
        case BuilderOp::dot_4_floats:
            return -7;  // consumes two 4-slot vectors and emits one scalar

        case BuilderOp::refract_4_floats:
            return -5;  // consumes nine slots (N + I + eta) and emits a 4-slot vector (R)

        case BuilderOp::matrix_multiply_2:
        case BuilderOp::matrix_multiply_3:
        case BuilderOp::matrix_multiply_4:
            // consumes the left- and right-matrices; emits result over existing padding slots
            return -(inst.fImmA * inst.fImmB + inst.fImmC * inst.fImmD);

        case BuilderOp::shuffle: {
            int consumed = inst.fImmA;
            int generated = inst.fImmB;
            return generated - consumed;
        }
        // Unary and immediate-mode ops rewrite their operands in place; no depth change.
        case ALL_SINGLE_SLOT_UNARY_OP_CASES:
        case ALL_MULTI_SLOT_UNARY_OP_CASES:
        case ALL_IMMEDIATE_BINARY_OP_CASES:
        default:
            return 0;
    }
}
1359 
// Simulates the program's effect on every temp stack and returns each stack's
// maximum depth, so the Program can reserve enough slots for all of them.
Program::StackDepths Program::tempStackMaxDepths() const {
    // Count the number of separate temp stacks that the program uses.
    // (Stack IDs are treated as dense: the count is one past the largest ID seen.)
    int numStacks = 1;
    for (const Instruction& inst : fInstructions) {
        numStacks = std::max(numStacks, inst.fStackID + 1);
    }

    // Walk the program and calculate how deep each stack can potentially get.
    StackDepths largest, current;
    largest.push_back_n(numStacks, 0);
    current.push_back_n(numStacks, 0);

    for (const Instruction& inst : fInstructions) {
        int stackID = inst.fStackID;
        current[stackID] += stack_usage(inst);
        largest[stackID] = std::max(current[stackID], largest[stackID]);
        // If we assert here, the generated program has popped off the top of the stack.
        SkASSERTF(current[stackID] >= 0, "unbalanced temp stack push/pop on stack %d", stackID);
    }

    // Ensure that when the program is complete, our stacks are fully balanced.
    for (int stackID = 0; stackID < numStacks; ++stackID) {
        // If we assert here, the generated program has pushed more data than it has popped.
        SkASSERTF(current[stackID] == 0, "unbalanced temp stack push/pop on stack %d", stackID);
    }

    return largest;
}
1388 
// Takes ownership of the finished instruction list and precomputes the sizing
// information the program needs before it can run. `debugTrace` may be null.
Program::Program(TArray<Instruction> instrs,
                 int numValueSlots,
                 int numUniformSlots,
                 int numImmutableSlots,
                 int numLabels,
                 DebugTracePriv* debugTrace)
        : fInstructions(std::move(instrs))
        , fNumValueSlots(numValueSlots)
        , fNumUniformSlots(numUniformSlots)
        , fNumImmutableSlots(numImmutableSlots)
        , fNumLabels(numLabels)
        , fDebugTrace(debugTrace) {
    // Compute every temp stack's high-water mark up front.
    fTempStackMaxDepths = this->tempStackMaxDepths();

    // The temp-stack slab must be large enough to hold every stack at its deepest.
    fNumTempStackSlots = 0;
    for (const int depth : fTempStackMaxDepths) {
        fNumTempStackSlots += depth;
    }

    // When a debug trace was supplied, install a trace hook wired to its trace info.
    if (fDebugTrace) {
        fTraceHook = SkSL::Tracer::Make(&fDebugTrace->fTraceInfo);
    }
}
1412 
// Defaulted out-of-line; member types are complete in this translation unit.
Program::~Program() = default;
1414 
// Returns true if every value in `immutablePtr[0]` through `immutablePtr[numSlots - 1]` shares a
// single bit pattern; such data can be emitted as a splat instead of a copy.
static bool immutable_data_is_splattable(int32_t* immutablePtr, int numSlots) {
    return std::all_of(immutablePtr, immutablePtr + numSlots,
                       [=](int32_t value) { return value == immutablePtr[0]; });
}
1425 
// Appends stages that copy `numSlots` slots from `src` to `dst`, splitting large copies into
// chunks of at most four slots (the widest specialized copy op). When `basePtr` is non-null, the
// source is scalar immutable data, and a run of bit-identical values is emitted as a cheaper
// `copy_constant` splat instead of a copy.
void Program::appendCopy(TArray<Stage>* pipeline,
                         SkArenaAlloc* alloc,
                         std::byte* basePtr,  // only used for immutable-value copies
                         ProgramOp baseStage,
                         SkRPOffset dst, int dstStride,
                         SkRPOffset src, int srcStride,
                         int numSlots) const {
    SkASSERT(numSlots >= 0);
    while (numSlots > 4) {
        // If we are appending a large copy, split it up into groups of four at a time.
        this->appendCopy(pipeline, alloc, basePtr,
                         baseStage,
                         dst, dstStride,
                         src, srcStride,
                         /*numSlots=*/4);
        dst += 4 * dstStride * sizeof(float);
        src += 4 * srcStride * sizeof(float);
        numSlots -= 4;
    }

    SkASSERT(numSlots <= 4);

    if (numSlots > 0) {
        // If we are copying immutable data, it might be representable by a splat; this is
        // preferable, since splats are a tiny bit faster than regular copies.
        if (basePtr) {
            SkASSERT(srcStride == 1);
            int32_t* immutablePtr = reinterpret_cast<int32_t*>(basePtr + src);
            if (immutable_data_is_splattable(immutablePtr, numSlots)) {
                // `copy_constant` through `copy_constant + 3` cover 1-4 slots, respectively.
                auto stage = (ProgramOp)((int)ProgramOp::copy_constant + numSlots - 1);
                SkRasterPipeline_ConstantCtx ctx;
                ctx.dst = dst;
                ctx.value = *immutablePtr;
                pipeline->push_back({stage, SkRPCtxUtils::Pack(ctx, alloc)});
                return;
            }
        }

        // We can't use a splat, so emit the requested copy op.
        // `baseStage` through `baseStage + 3` cover 1-4 slots, respectively.
        auto stage = (ProgramOp)((int)baseStage + numSlots - 1);
        SkRasterPipeline_BinaryOpCtx ctx;
        ctx.dst = dst;
        ctx.src = src;
        pipeline->push_back({stage, SkRPCtxUtils::Pack(ctx, alloc)});
    }
}
1472 
// Copies `numSlots` slots between two value-slot regions using `copy_slot_unmasked` ops; both
// source and destination are vector-width (full raster-pipeline stride).
void Program::appendCopySlotsUnmasked(TArray<Stage>* pipeline,
                                      SkArenaAlloc* alloc,
                                      SkRPOffset dst,
                                      SkRPOffset src,
                                      int numSlots) const {
    this->appendCopy(pipeline, alloc, /*basePtr=*/nullptr,
                     ProgramOp::copy_slot_unmasked,
                     dst, SkOpts::raster_pipeline_highp_stride,
                     src, SkOpts::raster_pipeline_highp_stride,
                     numSlots);
}
1484 
// Copies `numSlots` slots of scalar immutable data (srcStride of 1) into vector-width value
// slots. Passing `basePtr` enables appendCopy's splat optimization for bit-identical runs.
void Program::appendCopyImmutableUnmasked(TArray<Stage>* pipeline,
                                          SkArenaAlloc* alloc,
                                          std::byte* basePtr,
                                          SkRPOffset dst,
                                          SkRPOffset src,
                                          int numSlots) const {
    this->appendCopy(pipeline, alloc, basePtr,
                     ProgramOp::copy_immutable_unmasked,
                     dst, SkOpts::raster_pipeline_highp_stride,
                     src, 1,
                     numSlots);
}
1497 
// Copies `numSlots` slots between two value-slot regions using `copy_slot_masked` ops (the
// per-lane masked variant); both sides are vector-width.
void Program::appendCopySlotsMasked(TArray<Stage>* pipeline,
                                    SkArenaAlloc* alloc,
                                    SkRPOffset dst,
                                    SkRPOffset src,
                                    int numSlots) const {
    this->appendCopy(pipeline, alloc, /*basePtr=*/nullptr,
                     ProgramOp::copy_slot_masked,
                     dst, SkOpts::raster_pipeline_highp_stride,
                     src, SkOpts::raster_pipeline_highp_stride,
                     numSlots);
}
1509 
appendSingleSlotUnaryOp(TArray<Stage> * pipeline,ProgramOp stage,float * dst,int numSlots) const1510 void Program::appendSingleSlotUnaryOp(TArray<Stage>* pipeline, ProgramOp stage,
1511                                       float* dst, int numSlots) const {
1512     SkASSERT(numSlots >= 0);
1513     while (numSlots--) {
1514         pipeline->push_back({stage, dst});
1515         dst += SkOpts::raster_pipeline_highp_stride;
1516     }
1517 }
1518 
appendMultiSlotUnaryOp(TArray<Stage> * pipeline,ProgramOp baseStage,float * dst,int numSlots) const1519 void Program::appendMultiSlotUnaryOp(TArray<Stage>* pipeline, ProgramOp baseStage,
1520                                      float* dst, int numSlots) const {
1521     SkASSERT(numSlots >= 0);
1522     while (numSlots > 0) {
1523         int currentSlots = std::min(numSlots, 4);
1524         auto stage = (ProgramOp)((int)baseStage + currentSlots - 1);
1525         pipeline->push_back({stage, dst});
1526 
1527         dst += 4 * SkOpts::raster_pipeline_highp_stride;
1528         numSlots -= 4;
1529     }
1530 }
1531 
// Appends a binary op whose second operand is the compile-time constant `value`, applied to
// `numSlots` slots starting at `dst`. Multi-slot immediate ops process up to four slots per
// stage; other immediate ops process one slot per stage.
void Program::appendImmediateBinaryOp(TArray<Stage>* pipeline, SkArenaAlloc* alloc,
                                      ProgramOp baseStage,
                                      SkRPOffset dst, int32_t value, int numSlots) const {
    SkASSERT(is_immediate_op((BuilderOp)baseStage));
    int slotsPerStage = is_multi_slot_immediate_op((BuilderOp)baseStage) ? 4 : 1;

    SkRasterPipeline_ConstantCtx ctx;
    ctx.dst = dst;
    ctx.value = value;

    SkASSERT(numSlots >= 0);
    while (numSlots > 0) {
        int currentSlots = std::min(numSlots, slotsPerStage);
        // Note the subtraction: the wider specializations of an immediate op sit at *lower* enum
        // values than the single-slot `baseStage`.
        auto stage = (ProgramOp)((int)baseStage - (currentSlots - 1));
        pipeline->push_back({stage, SkRPCtxUtils::Pack(ctx, alloc)});

        // Advance the destination past the slots just covered.
        ctx.dst += slotsPerStage * SkOpts::raster_pipeline_highp_stride * sizeof(float);
        numSlots -= slotsPerStage;
    }
}
1552 
appendAdjacentNWayBinaryOp(TArray<Stage> * pipeline,SkArenaAlloc * alloc,ProgramOp stage,SkRPOffset dst,SkRPOffset src,int numSlots) const1553 void Program::appendAdjacentNWayBinaryOp(TArray<Stage>* pipeline, SkArenaAlloc* alloc,
1554                                          ProgramOp stage,
1555                                          SkRPOffset dst, SkRPOffset src, int numSlots) const {
1556     // The source and destination must be directly next to one another.
1557     SkASSERT(numSlots >= 0);
1558     SkASSERT((dst + SkOpts::raster_pipeline_highp_stride * numSlots * sizeof(float)) == src);
1559 
1560     if (numSlots > 0) {
1561         SkRasterPipeline_BinaryOpCtx ctx;
1562         ctx.dst = dst;
1563         ctx.src = src;
1564         pipeline->push_back({stage, SkRPCtxUtils::Pack(ctx, alloc)});
1565     }
1566 }
1567 
appendAdjacentMultiSlotBinaryOp(TArray<Stage> * pipeline,SkArenaAlloc * alloc,ProgramOp baseStage,std::byte * basePtr,SkRPOffset dst,SkRPOffset src,int numSlots) const1568 void Program::appendAdjacentMultiSlotBinaryOp(TArray<Stage>* pipeline, SkArenaAlloc* alloc,
1569                                               ProgramOp baseStage, std::byte* basePtr,
1570                                               SkRPOffset dst, SkRPOffset src, int numSlots) const {
1571     // The source and destination must be directly next to one another.
1572     SkASSERT(numSlots >= 0);
1573     SkASSERT((dst + SkOpts::raster_pipeline_highp_stride * numSlots * sizeof(float)) == src);
1574 
1575     if (numSlots > 4) {
1576         this->appendAdjacentNWayBinaryOp(pipeline, alloc, baseStage, dst, src, numSlots);
1577         return;
1578     }
1579     if (numSlots > 0) {
1580         auto specializedStage = (ProgramOp)((int)baseStage + numSlots);
1581         pipeline->push_back({specializedStage, basePtr + dst});
1582     }
1583 }
1584 
// Appends a single N-way ternary `stage` covering three consecutive, equal-length spans (dst,
// src0, src1). The context records `dst` plus the delta to `src0`; `src1` only participates in
// the adjacency asserts, and `basePtr` is unused here.
void Program::appendAdjacentNWayTernaryOp(TArray<Stage>* pipeline, SkArenaAlloc* alloc,
                                          ProgramOp stage, std::byte* basePtr, SkRPOffset dst,
                                          SkRPOffset src0, SkRPOffset src1, int numSlots) const {
    // The float pointers must all be immediately adjacent to each other.
    SkASSERT(numSlots >= 0);
    SkASSERT((dst  + SkOpts::raster_pipeline_highp_stride * numSlots * sizeof(float)) == src0);
    SkASSERT((src0 + SkOpts::raster_pipeline_highp_stride * numSlots * sizeof(float)) == src1);

    if (numSlots > 0) {
        SkRasterPipeline_TernaryOpCtx ctx;
        ctx.dst = dst;
        ctx.delta = src0 - dst;
        pipeline->push_back({stage, SkRPCtxUtils::Pack(ctx, alloc)});
    }
}
1600 
appendAdjacentMultiSlotTernaryOp(TArray<Stage> * pipeline,SkArenaAlloc * alloc,ProgramOp baseStage,std::byte * basePtr,SkRPOffset dst,SkRPOffset src0,SkRPOffset src1,int numSlots) const1601 void Program::appendAdjacentMultiSlotTernaryOp(TArray<Stage>* pipeline, SkArenaAlloc* alloc,
1602                                                ProgramOp baseStage, std::byte* basePtr,
1603                                                SkRPOffset dst, SkRPOffset src0, SkRPOffset src1,
1604                                                int numSlots) const {
1605     // The float pointers must all be immediately adjacent to each other.
1606     SkASSERT(numSlots >= 0);
1607     SkASSERT((dst  + SkOpts::raster_pipeline_highp_stride * numSlots * sizeof(float)) == src0);
1608     SkASSERT((src0 + SkOpts::raster_pipeline_highp_stride * numSlots * sizeof(float)) == src1);
1609 
1610     if (numSlots > 4) {
1611         this->appendAdjacentNWayTernaryOp(pipeline, alloc, baseStage, basePtr,
1612                                           dst, src0, src1, numSlots);
1613         return;
1614     }
1615     if (numSlots > 0) {
1616         auto specializedStage = (ProgramOp)((int)baseStage + numSlots);
1617         pipeline->push_back({specializedStage, basePtr + dst});
1618     }
1619 }
1620 
// Appends a stack rewind only on builds where pipeline stages cannot tail-call (standalone SkSL,
// or no musttail support); on tail-calling builds this is a no-op.
void Program::appendStackRewindForNonTailcallers(TArray<Stage>* pipeline) const {
#if defined(SKSL_STANDALONE) || !SK_HAS_MUSTTAIL
    // When SK_HAS_MUSTTAIL is not enabled, stack rewinds are critical because the stack may
    // grow after every single SkSL stage.
    this->appendStackRewind(pipeline);
#endif
}
1628 
// Unconditionally appends a stack-rewind stage; it needs no context.
void Program::appendStackRewind(TArray<Stage>* pipeline) const {
    pipeline->push_back({ProgramOp::stack_rewind, nullptr});
}
1632 
// Appends an instruction that, at run time, hands control to the shader child at `childIdx`.
void Builder::invoke_shader(int childIdx) {
    this->appendInstruction(BuilderOp::invoke_shader, {}, childIdx);
}
1636 
// Appends an instruction that, at run time, hands control to the color-filter child at `childIdx`.
void Builder::invoke_color_filter(int childIdx) {
    this->appendInstruction(BuilderOp::invoke_color_filter, {}, childIdx);
}
1640 
// Appends an instruction that, at run time, hands control to the blender child at `childIdx`.
void Builder::invoke_blender(int childIdx) {
    this->appendInstruction(BuilderOp::invoke_blender, {}, childIdx);
}
1644 
// Appends an instruction that converts the stack-top color to linear sRGB (dispatched through the
// caller's colorspace-transform callback at append time).
void Builder::invoke_to_linear_srgb() {
    // The intrinsics accept a three-component value; add a fourth padding element (which will be
    // ignored) since our RP ops deal in RGBA colors.
    this->pad_stack(1);
    this->appendInstruction(BuilderOp::invoke_to_linear_srgb, {});
    // Remove the padding slot so the net stack depth is unchanged.
    this->discard_stack(1);
}
1652 
// Appends an instruction that converts the stack-top color from linear sRGB (dispatched through
// the caller's colorspace-transform callback at append time).
void Builder::invoke_from_linear_srgb() {
    // The intrinsics accept a three-component value; add a fourth padding element (which will be
    // ignored) since our RP ops deal in RGBA colors.
    this->pad_stack(1);
    this->appendInstruction(BuilderOp::invoke_from_linear_srgb, {});
    // Remove the padding slot so the net stack depth is unchanged.
    this->discard_stack(1);
}
1660 
// Stuffs an integer directly into a context pointer; consumers bit-cast it back to an integer
// rather than dereferencing it (see the label/invoke handling in appendStages).
static void* context_bit_pun(intptr_t val) {
    return sk_bit_cast<void*>(val);
}
1664 
// Allocates one zero-filled slab holding all value, temp-stack, and immutable slots. Value and
// stack slots are vector-width (N floats per slot); immutable slots are scalar. Returns nullopt
// if the total size overflows or does not fit in an int.
std::optional<Program::SlotData> Program::allocateSlotData(SkArenaAlloc* alloc) const {
    // Allocate a contiguous slab of slot data for immutables, values, and stack entries.
    const int N = SkOpts::raster_pipeline_highp_stride;
    const int scalarWidth = 1 * sizeof(float);
    const int vectorWidth = N * sizeof(float);
    SkSafeMath safe;
    size_t allocSize = safe.add(safe.mul(vectorWidth, safe.add(fNumValueSlots, fNumTempStackSlots)),
                                safe.mul(scalarWidth, fNumImmutableSlots));
    if (!safe || !SkTFitsIn<int>(allocSize)) {
        // The slot data would be too large (arithmetic overflowed, or exceeds int range).
        return std::nullopt;
    }
    float* slotPtr = static_cast<float*>(alloc->makeBytesAlignedTo(allocSize, vectorWidth));
    sk_bzero(slotPtr, allocSize);

    // Store the temp stack immediately after the values, and immutable data after the stack.
    SlotData s;
    s.values    = SkSpan<float>{slotPtr,        N * fNumValueSlots};
    s.stack     = SkSpan<float>{s.values.end(), N * fNumTempStackSlots};
    s.immutable = SkSpan<float>{s.stack.end(),  1 * fNumImmutableSlots};
    return s;
}
1686 
// Assembles the program into an actual SkRasterPipeline. Returns false on standalone builds,
// when slot allocation fails, or when an invoke op lacks the required callback.
bool Program::appendStages(SkRasterPipeline* pipeline,
                           SkArenaAlloc* alloc,
                           RP::Callbacks* callbacks,
                           SkSpan<const float> uniforms) const {
#if defined(SKSL_STANDALONE)
    return false;
#else
    // Convert our Instruction list to an array of ProgramOps.
    TArray<Stage> stages;
    std::optional<SlotData> slotData = this->allocateSlotData(alloc);
    if (!slotData) {
        return false;
    }
    this->makeStages(&stages, alloc, uniforms, *slotData);

    // Allocate buffers for branch targets and labels; these are needed to convert labels into
    // actual offsets into the pipeline and fix up branches.
    TArray<SkRasterPipeline_BranchCtx*> branchContexts;
    branchContexts.reserve_exact(fNumLabels);
    TArray<int> labelOffsets;
    labelOffsets.push_back_n(fNumLabels, -1);
    TArray<int> branchGoesToLabel;
    branchGoesToLabel.reserve_exact(fNumLabels);

    auto resetBasePointer = [&]() {
        // Whenever we hand off control to another shader, we have to assume that it might overwrite
        // the base pointer (if it uses SkSL, it will!), so we reset it on return.
        pipeline->append(SkRasterPipelineOp::set_base_pointer, (*slotData).values.data());
    };

    // Establish the base pointer before the first SkSL stage runs.
    resetBasePointer();

    for (const Stage& stage : stages) {
        switch (stage.op) {
            case ProgramOp::stack_rewind:
                pipeline->appendStackRewind();
                break;

            case ProgramOp::invoke_shader:
                if (!callbacks || !callbacks->appendShader(sk_bit_cast<intptr_t>(stage.ctx))) {
                    return false;
                }
                resetBasePointer();
                break;

            case ProgramOp::invoke_color_filter:
                if (!callbacks || !callbacks->appendColorFilter(sk_bit_cast<intptr_t>(stage.ctx))) {
                    return false;
                }
                resetBasePointer();
                break;

            case ProgramOp::invoke_blender:
                if (!callbacks || !callbacks->appendBlender(sk_bit_cast<intptr_t>(stage.ctx))) {
                    return false;
                }
                resetBasePointer();
                break;

            case ProgramOp::invoke_to_linear_srgb:
                if (!callbacks) {
                    return false;
                }
                callbacks->toLinearSrgb(stage.ctx);
                // A ColorSpaceXform shouldn't ever alter the base pointer, so we don't need to call
                // resetBasePointer here.
                break;

            case ProgramOp::invoke_from_linear_srgb:
                if (!callbacks) {
                    return false;
                }
                callbacks->fromLinearSrgb(stage.ctx);
                // A ColorSpaceXform shouldn't ever alter the base pointer, so we don't need to call
                // resetBasePointer here.
                break;

            case ProgramOp::label: {
                // Remember the absolute pipeline position of this label.
                int labelID = sk_bit_cast<intptr_t>(stage.ctx);
                SkASSERT(labelID >= 0 && labelID < fNumLabels);
                labelOffsets[labelID] = pipeline->getNumStages();
                break;
            }
            case ProgramOp::jump:
            case ProgramOp::branch_if_all_lanes_active:
            case ProgramOp::branch_if_any_lanes_active:
            case ProgramOp::branch_if_no_lanes_active:
            case ProgramOp::branch_if_no_active_lanes_eq: {
                // The branch context contains a valid label ID at this point.
                auto* branchCtx = static_cast<SkRasterPipeline_BranchCtx*>(stage.ctx);
                int labelID = branchCtx->offset;
                SkASSERT(labelID >= 0 && labelID < fNumLabels);

                // Replace the label ID in the branch context with the absolute pipeline position.
                // We will go back over the branch targets at the end and fix them up.
                branchCtx->offset = pipeline->getNumStages();

                SkASSERT(branchContexts.size() == branchGoesToLabel.size());
                branchContexts.push_back(branchCtx);
                branchGoesToLabel.push_back(labelID);
                // Branches still append a regular pipeline op below.
                [[fallthrough]];
            }
            default:
                // Append a regular op to the program.
                SkASSERT((int)stage.op < kNumRasterPipelineHighpOps);
                pipeline->append((SkRasterPipelineOp)stage.op, stage.ctx);
                break;
        }
    }

    // Now that we have assembled the program and know the pipeline positions of each label and
    // branch, fix up every branch target. Each branch offset becomes a relative stage delta.
    SkASSERT(branchContexts.size() == branchGoesToLabel.size());
    for (int index = 0; index < branchContexts.size(); ++index) {
        int branchFromIdx = branchContexts[index]->offset;
        int branchToIdx = labelOffsets[branchGoesToLabel[index]];
        branchContexts[index]->offset = branchToIdx - branchFromIdx;
    }

    return true;
#endif
}
1810 
makeStages(TArray<Stage> * pipeline,SkArenaAlloc * alloc,SkSpan<const float> uniforms,const SlotData & slots) const1811 void Program::makeStages(TArray<Stage>* pipeline,
1812                          SkArenaAlloc* alloc,
1813                          SkSpan<const float> uniforms,
1814                          const SlotData& slots) const {
1815     SkASSERT(fNumUniformSlots == SkToInt(uniforms.size()));
1816 
1817     const int N = SkOpts::raster_pipeline_highp_stride;
1818     int mostRecentRewind = 0;
1819 
1820     // Assemble a map holding the current stack-top for each temporary stack. Position each temp
1821     // stack immediately after the previous temp stack; temp stacks are never allowed to overlap.
1822     int pos = 0;
1823     TArray<float*> tempStackMap;
1824     tempStackMap.resize(fTempStackMaxDepths.size());
1825     for (int idx = 0; idx < fTempStackMaxDepths.size(); ++idx) {
1826         tempStackMap[idx] = slots.stack.begin() + (pos * N);
1827         pos += fTempStackMaxDepths[idx];
1828     }
1829 
1830     // Track labels that we have reached in processing.
1831     TArray<int> labelToInstructionIndex;
1832     labelToInstructionIndex.push_back_n(fNumLabels, -1);
1833 
1834     int mostRecentInvocationInstructionIdx = 0;
1835 
1836     auto EmitStackRewindForBackwardsBranch = [&](int labelID) {
1837         // If we have already encountered the label associated with this branch, this is a
1838         // backwards branch. Add a stack-rewind immediately before the branch to ensure that
1839         // long-running loops don't use an unbounded amount of stack space.
1840         int labelInstructionIdx = labelToInstructionIndex[labelID];
1841         if (labelInstructionIdx >= 0) {
1842             if (mostRecentInvocationInstructionIdx > labelInstructionIdx) {
1843                 // The backwards-branch range includes an external invocation to another shader,
1844                 // color filter, blender, or colorspace conversion. In this case, we always emit a
1845                 // stack rewind, since the non-tailcall stages may exist on the stack.
1846                 this->appendStackRewind(pipeline);
1847             } else {
1848                 // The backwards-branch range only includes SkSL ops. If tailcalling is supported,
1849                 // stack rewinding isn't needed. If the platform cannot tailcall, we need to rewind.
1850                 this->appendStackRewindForNonTailcallers(pipeline);
1851             }
1852             mostRecentRewind = pipeline->size();
1853         }
1854     };
1855 
1856     auto* const basePtr = (std::byte*)slots.values.data();
1857     auto OffsetFromBase = [&](const void* ptr) -> SkRPOffset {
1858         return (SkRPOffset)((const std::byte*)ptr - basePtr);
1859     };
1860 
1861     // Copy all immutable values into the immutable slots.
1862     for (const Instruction& inst : fInstructions) {
1863         if (inst.fOp == BuilderOp::store_immutable_value) {
1864             slots.immutable[inst.fSlotA] = sk_bit_cast<float>(inst.fImmA);
1865         }
1866     }
1867 
1868     // Write each BuilderOp to the pipeline array.
1869     pipeline->reserve_exact(pipeline->size() + fInstructions.size());
1870     for (int instructionIdx = 0; instructionIdx < fInstructions.size(); ++instructionIdx) {
1871         const Instruction& inst = fInstructions[instructionIdx];
1872 
1873         auto ImmutableA = [&]() { return &slots.immutable[1 * inst.fSlotA]; };
1874         auto ImmutableB = [&]() { return &slots.immutable[1 * inst.fSlotB]; };
1875         auto SlotA      = [&]() { return &slots.values[N * inst.fSlotA]; };
1876         auto SlotB      = [&]() { return &slots.values[N * inst.fSlotB]; };
1877         auto UniformA   = [&]() { return &uniforms[inst.fSlotA]; };
1878         auto AllocTraceContext = [&](auto* ctx) {
1879             // We pass `ctx` solely for its type; the value is unused.
1880             using ContextType = typename std::remove_reference<decltype(*ctx)>::type;
1881             ctx = alloc->make<ContextType>();
1882             ctx->traceMask = reinterpret_cast<int*>(tempStackMap[inst.fImmA] - N);
1883             ctx->traceHook = fTraceHook.get();
1884             return ctx;
1885         };
1886         float*& tempStackPtr = tempStackMap[inst.fStackID];
1887 
1888         switch (inst.fOp) {
1889             case BuilderOp::label: {
1890                 intptr_t labelID = inst.fImmA;
1891                 SkASSERT(labelID >= 0 && labelID < fNumLabels);
1892                 SkASSERT(labelToInstructionIndex[labelID] == -1);
1893                 labelToInstructionIndex[labelID] = instructionIdx;
1894                 pipeline->push_back({ProgramOp::label, context_bit_pun(labelID)});
1895                 break;
1896             }
1897             case BuilderOp::jump:
1898             case BuilderOp::branch_if_any_lanes_active:
1899             case BuilderOp::branch_if_no_lanes_active: {
1900                 SkASSERT(inst.fImmA >= 0 && inst.fImmA < fNumLabels);
1901                 EmitStackRewindForBackwardsBranch(inst.fImmA);
1902 
1903                 auto* ctx = alloc->make<SkRasterPipeline_BranchCtx>();
1904                 ctx->offset = inst.fImmA;
1905                 pipeline->push_back({(ProgramOp)inst.fOp, ctx});
1906                 break;
1907             }
1908             case BuilderOp::branch_if_all_lanes_active: {
1909                 SkASSERT(inst.fImmA >= 0 && inst.fImmA < fNumLabels);
1910                 EmitStackRewindForBackwardsBranch(inst.fImmA);
1911 
1912                 auto* ctx = alloc->make<SkRasterPipeline_BranchIfAllLanesActiveCtx>();
1913                 ctx->offset = inst.fImmA;
1914                 pipeline->push_back({ProgramOp::branch_if_all_lanes_active, ctx});
1915                 break;
1916             }
1917             case BuilderOp::branch_if_no_active_lanes_on_stack_top_equal: {
1918                 SkASSERT(inst.fImmA >= 0 && inst.fImmA < fNumLabels);
1919                 EmitStackRewindForBackwardsBranch(inst.fImmA);
1920 
1921                 auto* ctx = alloc->make<SkRasterPipeline_BranchIfEqualCtx>();
1922                 ctx->offset = inst.fImmA;
1923                 ctx->value = inst.fImmB;
1924                 ctx->ptr = reinterpret_cast<int*>(tempStackPtr - N);
1925                 pipeline->push_back({ProgramOp::branch_if_no_active_lanes_eq, ctx});
1926                 break;
1927             }
1928             case BuilderOp::init_lane_masks: {
1929                 auto* ctx = alloc->make<SkRasterPipeline_InitLaneMasksCtx>();
1930                 pipeline->push_back({ProgramOp::init_lane_masks, ctx});
1931                 break;
1932             }
1933             case BuilderOp::store_src_rg:
1934                 pipeline->push_back({ProgramOp::store_src_rg, SlotA()});
1935                 break;
1936 
1937             case BuilderOp::store_src:
1938                 pipeline->push_back({ProgramOp::store_src, SlotA()});
1939                 break;
1940 
1941             case BuilderOp::store_dst:
1942                 pipeline->push_back({ProgramOp::store_dst, SlotA()});
1943                 break;
1944 
1945             case BuilderOp::store_device_xy01:
1946                 pipeline->push_back({ProgramOp::store_device_xy01, SlotA()});
1947                 break;
1948 
1949             case BuilderOp::store_immutable_value:
1950                 // The immutable slots were populated in an earlier pass.
1951                 break;
1952 
1953             case BuilderOp::load_src:
1954                 pipeline->push_back({ProgramOp::load_src, SlotA()});
1955                 break;
1956 
1957             case BuilderOp::load_dst:
1958                 pipeline->push_back({ProgramOp::load_dst, SlotA()});
1959                 break;
1960 
1961             case ALL_SINGLE_SLOT_UNARY_OP_CASES: {
1962                 float* dst = tempStackPtr - (inst.fImmA * N);
1963                 this->appendSingleSlotUnaryOp(pipeline, (ProgramOp)inst.fOp, dst, inst.fImmA);
1964                 break;
1965             }
1966             case ALL_MULTI_SLOT_UNARY_OP_CASES: {
1967                 float* dst = tempStackPtr - (inst.fImmA * N);
1968                 this->appendMultiSlotUnaryOp(pipeline, (ProgramOp)inst.fOp, dst, inst.fImmA);
1969                 break;
1970             }
1971             case ALL_IMMEDIATE_BINARY_OP_CASES: {
1972                 float* dst = (inst.fSlotA == NA) ? tempStackPtr - (inst.fImmA * N)
1973                                                  : SlotA();
1974 
1975                 this->appendImmediateBinaryOp(pipeline, alloc, (ProgramOp)inst.fOp,
1976                                               OffsetFromBase(dst), inst.fImmB, inst.fImmA);
1977                 break;
1978             }
1979             case ALL_N_WAY_BINARY_OP_CASES: {
1980                 float* src = tempStackPtr - (inst.fImmA * N);
1981                 float* dst = tempStackPtr - (inst.fImmA * 2 * N);
1982                 this->appendAdjacentNWayBinaryOp(pipeline, alloc, (ProgramOp)inst.fOp,
1983                                                  OffsetFromBase(dst), OffsetFromBase(src),
1984                                                  inst.fImmA);
1985                 break;
1986             }
1987             case ALL_MULTI_SLOT_BINARY_OP_CASES: {
1988                 float* src = tempStackPtr - (inst.fImmA * N);
1989                 float* dst = tempStackPtr - (inst.fImmA * 2 * N);
1990                 this->appendAdjacentMultiSlotBinaryOp(pipeline, alloc, (ProgramOp)inst.fOp,
1991                                                       basePtr,
1992                                                       OffsetFromBase(dst),
1993                                                       OffsetFromBase(src),
1994                                                       inst.fImmA);
1995                 break;
1996             }
1997             case ALL_N_WAY_TERNARY_OP_CASES: {
1998                 float* src1 = tempStackPtr - (inst.fImmA * N);
1999                 float* src0 = tempStackPtr - (inst.fImmA * 2 * N);
2000                 float* dst  = tempStackPtr - (inst.fImmA * 3 * N);
2001                 this->appendAdjacentNWayTernaryOp(pipeline, alloc, (ProgramOp)inst.fOp, basePtr,
2002                                                   OffsetFromBase(dst),
2003                                                   OffsetFromBase(src0),
2004                                                   OffsetFromBase(src1),
2005                                                   inst.fImmA);
2006                 break;
2007             }
2008             case ALL_MULTI_SLOT_TERNARY_OP_CASES: {
2009                 float* src1 = tempStackPtr - (inst.fImmA * N);
2010                 float* src0 = tempStackPtr - (inst.fImmA * 2 * N);
2011                 float* dst  = tempStackPtr - (inst.fImmA * 3 * N);
2012                 this->appendAdjacentMultiSlotTernaryOp(pipeline, alloc,(ProgramOp)inst.fOp, basePtr,
2013                                                        OffsetFromBase(dst),
2014                                                        OffsetFromBase(src0),
2015                                                        OffsetFromBase(src1),
2016                                                        inst.fImmA);
2017                 break;
2018             }
2019             case BuilderOp::select: {
2020                 float* src = tempStackPtr - (inst.fImmA * N);
2021                 float* dst = tempStackPtr - (inst.fImmA * 2 * N);
2022                 this->appendCopySlotsMasked(pipeline, alloc,
2023                                             OffsetFromBase(dst),
2024                                             OffsetFromBase(src),
2025                                             inst.fImmA);
2026                 break;
2027             }
2028             case BuilderOp::copy_slot_masked:
2029                 this->appendCopySlotsMasked(pipeline, alloc,
2030                                             OffsetFromBase(SlotA()),
2031                                             OffsetFromBase(SlotB()),
2032                                             inst.fImmA);
2033                 break;
2034 
2035             case BuilderOp::copy_slot_unmasked:
2036                 this->appendCopySlotsUnmasked(pipeline, alloc,
2037                                               OffsetFromBase(SlotA()),
2038                                               OffsetFromBase(SlotB()),
2039                                               inst.fImmA);
2040                 break;
2041 
2042             case BuilderOp::copy_immutable_unmasked:
2043                 this->appendCopyImmutableUnmasked(pipeline, alloc, basePtr,
2044                                                   OffsetFromBase(SlotA()),
2045                                                   OffsetFromBase(ImmutableB()),
2046                                                   inst.fImmA);
2047                 break;
2048 
2049             case BuilderOp::refract_4_floats: {
2050                 float* dst = tempStackPtr - (9 * N);
2051                 pipeline->push_back({ProgramOp::refract_4_floats, dst});
2052                 break;
2053             }
2054             case BuilderOp::inverse_mat2:
2055             case BuilderOp::inverse_mat3:
2056             case BuilderOp::inverse_mat4: {
2057                 float* dst = tempStackPtr - (inst.fImmA * N);
2058                 pipeline->push_back({(ProgramOp)inst.fOp, dst});
2059                 break;
2060             }
2061             case BuilderOp::dot_2_floats:
2062             case BuilderOp::dot_3_floats:
2063             case BuilderOp::dot_4_floats: {
2064                 float* dst = tempStackPtr - (inst.fImmA * 2 * N);
2065                 pipeline->push_back({(ProgramOp)inst.fOp, dst});
2066                 break;
2067             }
2068             case BuilderOp::swizzle_1: {
2069                 // A single-component swizzle just copies a slot and shrinks the stack; we can
2070                 // slightly improve codegen by making that simplification here.
2071                 int offset = inst.fImmB;
2072                 SkASSERT(offset >= 0 && offset <= 15);
2073                 float* dst = tempStackPtr - (inst.fImmA * N);
2074                 float* src = dst + (offset * N);
2075                 if (src != dst) {
2076                     this->appendCopySlotsUnmasked(pipeline, alloc,
2077                                                   OffsetFromBase(dst),
2078                                                   OffsetFromBase(src),
2079                                                   /*numSlots=*/1);
2080                 }
2081                 break;
2082             }
2083             case BuilderOp::swizzle_2:
2084             case BuilderOp::swizzle_3:
2085             case BuilderOp::swizzle_4: {
2086                 SkRasterPipeline_SwizzleCtx ctx;
2087                 ctx.dst = OffsetFromBase(tempStackPtr - (N * inst.fImmA));
2088                 // Unpack component nybbles into byte-offsets pointing at stack slots.
2089                 unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx.offsets));
2090                 pipeline->push_back({(ProgramOp)inst.fOp, SkRPCtxUtils::Pack(ctx, alloc)});
2091                 break;
2092             }
2093             case BuilderOp::shuffle: {
2094                 int consumed = inst.fImmA;
2095                 int generated = inst.fImmB;
2096 
2097                 auto* ctx = alloc->make<SkRasterPipeline_ShuffleCtx>();
2098                 ctx->ptr = reinterpret_cast<int32_t*>(tempStackPtr) - (N * consumed);
2099                 ctx->count = generated;
2100                 // Unpack immB and immC from nybble form into the offset array.
2101                 unpack_nybbles_to_offsets(inst.fImmC, SkSpan(&ctx->offsets[0], 8));
2102                 unpack_nybbles_to_offsets(inst.fImmD, SkSpan(&ctx->offsets[8], 8));
2103                 pipeline->push_back({ProgramOp::shuffle, ctx});
2104                 break;
2105             }
2106             case BuilderOp::matrix_multiply_2:
2107             case BuilderOp::matrix_multiply_3:
2108             case BuilderOp::matrix_multiply_4: {
2109                 int consumed = (inst.fImmB * inst.fImmC) +  // result
2110                                (inst.fImmA * inst.fImmB) +  // left-matrix
2111                                (inst.fImmC * inst.fImmD);   // right-matrix
2112 
2113                 SkRasterPipeline_MatrixMultiplyCtx ctx;
2114                 ctx.dst = OffsetFromBase(tempStackPtr - (N * consumed));
2115                 ctx.leftColumns  = inst.fImmA;
2116                 ctx.leftRows     = inst.fImmB;
2117                 ctx.rightColumns = inst.fImmC;
2118                 ctx.rightRows    = inst.fImmD;
2119                 pipeline->push_back({(ProgramOp)inst.fOp, SkRPCtxUtils::Pack(ctx, alloc)});
2120                 break;
2121             }
2122             case BuilderOp::exchange_src: {
2123                 float* dst = tempStackPtr - (4 * N);
2124                 pipeline->push_back({ProgramOp::exchange_src, dst});
2125                 break;
2126             }
2127             case BuilderOp::push_src_rgba: {
2128                 float* dst = tempStackPtr;
2129                 pipeline->push_back({ProgramOp::store_src, dst});
2130                 break;
2131             }
2132             case BuilderOp::push_dst_rgba: {
2133                 float* dst = tempStackPtr;
2134                 pipeline->push_back({ProgramOp::store_dst, dst});
2135                 break;
2136             }
2137             case BuilderOp::push_device_xy01: {
2138                 float* dst = tempStackPtr;
2139                 pipeline->push_back({ProgramOp::store_device_xy01, dst});
2140                 break;
2141             }
2142             case BuilderOp::pop_src_rgba: {
2143                 float* src = tempStackPtr - (4 * N);
2144                 pipeline->push_back({ProgramOp::load_src, src});
2145                 break;
2146             }
2147             case BuilderOp::pop_dst_rgba: {
2148                 float* src = tempStackPtr - (4 * N);
2149                 pipeline->push_back({ProgramOp::load_dst, src});
2150                 break;
2151             }
2152             case BuilderOp::push_slots: {
2153                 float* dst = tempStackPtr;
2154                 this->appendCopySlotsUnmasked(pipeline, alloc,
2155                                               OffsetFromBase(dst),
2156                                               OffsetFromBase(SlotA()),
2157                                               inst.fImmA);
2158                 break;
2159             }
2160             case BuilderOp::push_immutable: {
2161                 float* dst = tempStackPtr;
2162                 this->appendCopyImmutableUnmasked(pipeline, alloc, basePtr,
2163                                                   OffsetFromBase(dst),
2164                                                   OffsetFromBase(ImmutableA()),
2165                                                   inst.fImmA);
2166                 break;
2167             }
2168             case BuilderOp::copy_stack_to_slots_indirect:
2169             case BuilderOp::push_immutable_indirect:
2170             case BuilderOp::push_slots_indirect:
2171             case BuilderOp::push_uniform_indirect: {
2172                 // SlotA: fixed-range start
2173                 // SlotB: limit-range end
2174                 //  immA: number of slots to copy
2175                 //  immB: dynamic stack ID
2176                 ProgramOp op;
2177                 auto* ctx = alloc->make<SkRasterPipeline_CopyIndirectCtx>();
2178                 ctx->indirectOffset =
2179                         reinterpret_cast<const uint32_t*>(tempStackMap[inst.fImmB]) - (1 * N);
2180                 ctx->indirectLimit = inst.fSlotB - inst.fSlotA - inst.fImmA;
2181                 ctx->slots = inst.fImmA;
2182                 if (inst.fOp == BuilderOp::push_slots_indirect) {
2183                     op = ProgramOp::copy_from_indirect_unmasked;
2184                     ctx->src = reinterpret_cast<const int32_t*>(SlotA());
2185                     ctx->dst = reinterpret_cast<int32_t*>(tempStackPtr);
2186                 } else if (inst.fOp == BuilderOp::push_immutable_indirect) {
2187                     // We reuse the indirect-uniform op for indirect copies of immutable data.
2188                     op = ProgramOp::copy_from_indirect_uniform_unmasked;
2189                     ctx->src = reinterpret_cast<const int32_t*>(ImmutableA());
2190                     ctx->dst = reinterpret_cast<int32_t*>(tempStackPtr);
2191                 } else if (inst.fOp == BuilderOp::push_uniform_indirect) {
2192                     op = ProgramOp::copy_from_indirect_uniform_unmasked;
2193                     ctx->src = reinterpret_cast<const int32_t*>(UniformA());
2194                     ctx->dst = reinterpret_cast<int32_t*>(tempStackPtr);
2195                 } else {
2196                     op = ProgramOp::copy_to_indirect_masked;
2197                     ctx->src = reinterpret_cast<const int32_t*>(tempStackPtr) - (ctx->slots * N);
2198                     ctx->dst = reinterpret_cast<int32_t*>(SlotA());
2199                 }
2200                 pipeline->push_back({op, ctx});
2201                 break;
2202             }
2203             case BuilderOp::push_uniform:
2204             case BuilderOp::copy_uniform_to_slots_unmasked: {
2205                 const float* src = UniformA();
2206                 float* dst = (inst.fOp == BuilderOp::push_uniform) ? tempStackPtr : SlotB();
2207 
2208                 for (int remaining = inst.fImmA; remaining > 0; remaining -= 4) {
2209                     auto ctx = alloc->make<SkRasterPipeline_UniformCtx>();
2210                     ctx->dst = reinterpret_cast<int32_t*>(dst);
2211                     ctx->src = reinterpret_cast<const int32_t*>(src);
2212                     switch (remaining) {
2213                         case 1:  pipeline->push_back({ProgramOp::copy_uniform,    ctx}); break;
2214                         case 2:  pipeline->push_back({ProgramOp::copy_2_uniforms, ctx}); break;
2215                         case 3:  pipeline->push_back({ProgramOp::copy_3_uniforms, ctx}); break;
2216                         default: pipeline->push_back({ProgramOp::copy_4_uniforms, ctx}); break;
2217                     }
2218                     dst += 4 * N;
2219                     src += 4;
2220                 }
2221                 break;
2222             }
2223             case BuilderOp::push_condition_mask: {
2224                 float* dst = tempStackPtr;
2225                 pipeline->push_back({ProgramOp::store_condition_mask, dst});
2226                 break;
2227             }
2228             case BuilderOp::pop_condition_mask: {
2229                 float* src = tempStackPtr - (1 * N);
2230                 pipeline->push_back({ProgramOp::load_condition_mask, src});
2231                 break;
2232             }
2233             case BuilderOp::merge_condition_mask:
2234             case BuilderOp::merge_inv_condition_mask: {
2235                 float* ptr = tempStackPtr - (2 * N);
2236                 pipeline->push_back({(ProgramOp)inst.fOp, ptr});
2237                 break;
2238             }
2239             case BuilderOp::push_loop_mask: {
2240                 float* dst = tempStackPtr;
2241                 pipeline->push_back({ProgramOp::store_loop_mask, dst});
2242                 break;
2243             }
2244             case BuilderOp::pop_loop_mask: {
2245                 float* src = tempStackPtr - (1 * N);
2246                 pipeline->push_back({ProgramOp::load_loop_mask, src});
2247                 break;
2248             }
2249             case BuilderOp::pop_and_reenable_loop_mask: {
2250                 float* src = tempStackPtr - (1 * N);
2251                 pipeline->push_back({ProgramOp::reenable_loop_mask, src});
2252                 break;
2253             }
2254             case BuilderOp::reenable_loop_mask:
2255                 pipeline->push_back({ProgramOp::reenable_loop_mask, SlotA()});
2256                 break;
2257 
2258             case BuilderOp::mask_off_loop_mask:
2259                 pipeline->push_back({ProgramOp::mask_off_loop_mask, nullptr});
2260                 break;
2261 
2262             case BuilderOp::merge_loop_mask: {
2263                 float* src = tempStackPtr - (1 * N);
2264                 pipeline->push_back({ProgramOp::merge_loop_mask, src});
2265                 break;
2266             }
2267             case BuilderOp::push_return_mask: {
2268                 float* dst = tempStackPtr;
2269                 pipeline->push_back({ProgramOp::store_return_mask, dst});
2270                 break;
2271             }
2272             case BuilderOp::pop_return_mask: {
2273                 float* src = tempStackPtr - (1 * N);
2274                 pipeline->push_back({ProgramOp::load_return_mask, src});
2275                 break;
2276             }
2277             case BuilderOp::mask_off_return_mask:
2278                 pipeline->push_back({ProgramOp::mask_off_return_mask, nullptr});
2279                 break;
2280 
2281             case BuilderOp::copy_constant:
2282             case BuilderOp::push_constant: {
2283                 float* dst = (inst.fOp == BuilderOp::copy_constant) ? SlotA() : tempStackPtr;
2284                 // Splat constant values onto the stack.
2285                 for (int remaining = inst.fImmA; remaining > 0; remaining -= 4) {
2286                     SkRasterPipeline_ConstantCtx ctx;
2287                     ctx.dst = OffsetFromBase(dst);
2288                     ctx.value = inst.fImmB;
2289                     void* ptr = SkRPCtxUtils::Pack(ctx, alloc);
2290                     switch (remaining) {
2291                         case 1:  pipeline->push_back({ProgramOp::copy_constant,     ptr}); break;
2292                         case 2:  pipeline->push_back({ProgramOp::splat_2_constants, ptr}); break;
2293                         case 3:  pipeline->push_back({ProgramOp::splat_3_constants, ptr}); break;
2294                         default: pipeline->push_back({ProgramOp::splat_4_constants, ptr}); break;
2295                     }
2296                     dst += 4 * N;
2297                 }
2298                 break;
2299             }
2300             case BuilderOp::copy_stack_to_slots: {
2301                 float* src = tempStackPtr - (inst.fImmB * N);
2302                 this->appendCopySlotsMasked(pipeline, alloc,
2303                                             OffsetFromBase(SlotA()),
2304                                             OffsetFromBase(src),
2305                                             inst.fImmA);
2306                 break;
2307             }
2308             case BuilderOp::copy_stack_to_slots_unmasked: {
2309                 float* src = tempStackPtr - (inst.fImmB * N);
2310                 this->appendCopySlotsUnmasked(pipeline, alloc,
2311                                               OffsetFromBase(SlotA()),
2312                                               OffsetFromBase(src),
2313                                               inst.fImmA);
2314                 break;
2315             }
2316             case BuilderOp::swizzle_copy_stack_to_slots: {
2317                 // SlotA: fixed-range start
2318                 // immA: number of swizzle components
2319                 // immB: swizzle components
2320                 // immC: offset from stack top
2321                 auto stage = (ProgramOp)((int)ProgramOp::swizzle_copy_slot_masked + inst.fImmA - 1);
2322                 auto* ctx = alloc->make<SkRasterPipeline_SwizzleCopyCtx>();
2323                 ctx->src = reinterpret_cast<const int32_t*>(tempStackPtr) - (inst.fImmC * N);
2324                 ctx->dst = reinterpret_cast<int32_t*>(SlotA());
2325                 unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx->offsets));
2326                 pipeline->push_back({stage, ctx});
2327                 break;
2328             }
2329             case BuilderOp::push_clone: {
2330                 float* src = tempStackPtr - (inst.fImmB * N);
2331                 float* dst = tempStackPtr;
2332                 this->appendCopySlotsUnmasked(pipeline, alloc,
2333                                               OffsetFromBase(dst),
2334                                               OffsetFromBase(src),
2335                                               inst.fImmA);
2336                 break;
2337             }
2338             case BuilderOp::push_clone_from_stack: {
2339                 // immA: number of slots
2340                 // immB: other stack ID
2341                 // immC: offset from stack top
2342                 float* sourceStackPtr = tempStackMap[inst.fImmB];
2343                 float* src = sourceStackPtr - (inst.fImmC * N);
2344                 float* dst = tempStackPtr;
2345                 this->appendCopySlotsUnmasked(pipeline, alloc,
2346                                               OffsetFromBase(dst),
2347                                               OffsetFromBase(src),
2348                                               inst.fImmA);
2349                 break;
2350             }
2351             case BuilderOp::push_clone_indirect_from_stack: {
2352                 // immA: number of slots
2353                 // immB: other stack ID
2354                 // immC: offset from stack top
2355                 // immD: dynamic stack ID
2356                 float* sourceStackPtr = tempStackMap[inst.fImmB];
2357 
2358                 auto* ctx = alloc->make<SkRasterPipeline_CopyIndirectCtx>();
2359                 ctx->dst = reinterpret_cast<int32_t*>(tempStackPtr);
2360                 ctx->src = reinterpret_cast<const int32_t*>(sourceStackPtr) - (inst.fImmC * N);
2361                 ctx->indirectOffset =
2362                         reinterpret_cast<const uint32_t*>(tempStackMap[inst.fImmD]) - (1 * N);
2363                 ctx->indirectLimit = inst.fImmC - inst.fImmA;
2364                 ctx->slots = inst.fImmA;
2365                 pipeline->push_back({ProgramOp::copy_from_indirect_unmasked, ctx});
2366                 break;
2367             }
2368             case BuilderOp::swizzle_copy_stack_to_slots_indirect: {
2369                 // SlotA: fixed-range start
2370                 // SlotB: limit-range end
2371                 // immA: number of swizzle components
2372                 // immB: swizzle components
2373                 // immC: offset from stack top
2374                 // immD: dynamic stack ID
2375                 auto* ctx = alloc->make<SkRasterPipeline_SwizzleCopyIndirectCtx>();
2376                 ctx->src = reinterpret_cast<const int32_t*>(tempStackPtr) - (inst.fImmC * N);
2377                 ctx->dst = reinterpret_cast<int32_t*>(SlotA());
2378                 ctx->indirectOffset =
2379                         reinterpret_cast<const uint32_t*>(tempStackMap[inst.fImmD]) - (1 * N);
2380                 ctx->indirectLimit =
2381                         inst.fSlotB - inst.fSlotA - (max_packed_nybble(inst.fImmB, inst.fImmA) + 1);
2382                 ctx->slots = inst.fImmA;
2383                 unpack_nybbles_to_offsets(inst.fImmB, SkSpan(ctx->offsets));
2384                 pipeline->push_back({ProgramOp::swizzle_copy_to_indirect_masked, ctx});
2385                 break;
2386             }
2387             case BuilderOp::case_op: {
2388                 SkRasterPipeline_CaseOpCtx ctx;
2389                 ctx.expectedValue = inst.fImmA;
2390                 ctx.offset = OffsetFromBase(tempStackPtr - (2 * N));
2391                 pipeline->push_back({ProgramOp::case_op, SkRPCtxUtils::Pack(ctx, alloc)});
2392                 break;
2393             }
2394             case BuilderOp::continue_op:
2395                 pipeline->push_back({ProgramOp::continue_op, tempStackMap[inst.fImmA] - (1 * N)});
2396                 break;
2397 
2398             case BuilderOp::pad_stack:
2399             case BuilderOp::discard_stack:
2400                 break;
2401 
2402             case BuilderOp::invoke_shader:
2403             case BuilderOp::invoke_color_filter:
2404             case BuilderOp::invoke_blender:
2405                 pipeline->push_back({(ProgramOp)inst.fOp, context_bit_pun(inst.fImmA)});
2406                 mostRecentInvocationInstructionIdx = instructionIdx;
2407                 break;
2408 
2409             case BuilderOp::invoke_to_linear_srgb:
2410             case BuilderOp::invoke_from_linear_srgb:
2411                 pipeline->push_back({(ProgramOp)inst.fOp, tempStackMap[inst.fImmA] - (4 * N)});
2412                 mostRecentInvocationInstructionIdx = instructionIdx;
2413                 break;
2414 
2415             case BuilderOp::trace_line: {
2416                 auto* ctx = AllocTraceContext((SkRasterPipeline_TraceLineCtx*)nullptr);
2417                 ctx->lineNumber = inst.fImmB;
2418                 pipeline->push_back({ProgramOp::trace_line, ctx});
2419                 break;
2420             }
2421             case BuilderOp::trace_scope: {
2422                 auto* ctx = AllocTraceContext((SkRasterPipeline_TraceScopeCtx*)nullptr);
2423                 ctx->delta = inst.fImmB;
2424                 pipeline->push_back({ProgramOp::trace_scope, ctx});
2425                 break;
2426             }
2427             case BuilderOp::trace_enter:
2428             case BuilderOp::trace_exit: {
2429                 auto* ctx = AllocTraceContext((SkRasterPipeline_TraceFuncCtx*)nullptr);
2430                 ctx->funcIdx = inst.fImmB;
2431                 pipeline->push_back({(ProgramOp)inst.fOp, ctx});
2432                 break;
2433             }
2434             case BuilderOp::trace_var:
2435             case BuilderOp::trace_var_indirect: {
2436                 // SlotA: fixed-range start
2437                 // SlotB: limit-range end
2438                 // immA: trace-mask stack ID
2439                 // immB: number of slots
2440                 // immC: dynamic stack ID
2441                 auto* ctx = AllocTraceContext((SkRasterPipeline_TraceVarCtx*)nullptr);
2442                 ctx->slotIdx = inst.fSlotA;
2443                 ctx->numSlots = inst.fImmB;
2444                 ctx->data = reinterpret_cast<int*>(SlotA());
2445                 if (inst.fOp == BuilderOp::trace_var_indirect) {
2446                     ctx->indirectOffset =
2447                             reinterpret_cast<const uint32_t*>(tempStackMap[inst.fImmC]) - (1 * N);
2448                     ctx->indirectLimit = inst.fSlotB - inst.fSlotA - inst.fImmB;
2449                 } else {
2450                     ctx->indirectOffset = nullptr;
2451                     ctx->indirectLimit = 0;
2452                 }
2453                 pipeline->push_back({ProgramOp::trace_var, ctx});
2454                 break;
2455             }
2456             default:
2457                 SkDEBUGFAILF("Raster Pipeline: unsupported instruction %d", (int)inst.fOp);
2458                 break;
2459         }
2460 
2461         int stackUsage = stack_usage(inst);
2462         if (stackUsage != 0) {
2463             tempStackPtr += stackUsage * N;
2464             SkASSERT(tempStackPtr >= slots.stack.begin());
2465             SkASSERT(tempStackPtr <= slots.stack.end());
2466         }
2467 
2468         // Periodically rewind the stack every 500 instructions. When SK_HAS_MUSTTAIL is set,
2469         // rewinds are not actually used; the appendStackRewind call becomes a no-op. On platforms
2470         // that don't support SK_HAS_MUSTTAIL, rewinding the stack periodically can prevent a
2471         // potential stack overflow when running a long program.
2472         int numPipelineStages = pipeline->size();
2473         if (numPipelineStages - mostRecentRewind > 500) {
2474             this->appendStackRewindForNonTailcallers(pipeline);
2475             mostRecentRewind = numPipelineStages;
2476         }
2477     }
2478 }
2479 
2480 class Program::Dumper {
2481 public:
    // Wraps the Program to be dumped. Stores only a reference, so `p` must
    // outlive this Dumper.
    Dumper(const Program& p) : fProgram(p) {}
2483 
2484     void dump(SkWStream* out, bool writeInstructionCount);
2485 
2486     // Finds the labels in the program, and keeps track of their offsets.
buildLabelToStageMap()2487     void buildLabelToStageMap() {
2488         for (int index = 0; index < fStages.size(); ++index) {
2489             if (fStages[index].op == ProgramOp::label) {
2490                 int labelID = sk_bit_cast<intptr_t>(fStages[index].ctx);
2491                 SkASSERT(!fLabelToStageMap.find(labelID));
2492                 fLabelToStageMap[labelID] = index;
2493             }
2494         }
2495     }
2496 
2497     // Assign unique names to each variable slot; our trace might have multiple variables with the
2498     // same name, which can make a dump hard to read. We disambiguate them with subscripts.
    void buildUniqueSlotNameList() {
        if (fProgram.fDebugTrace) {
            fSlotNameList.reserve_exact(fProgram.fDebugTrace->fSlotInfo.size());

            // The map consists of <variable name, <source position, unique name>>.
            THashMap<std::string_view, THashMap<int, std::string>> uniqueNameMap;

            for (const SlotDebugInfo& slotInfo : fProgram.fDebugTrace->fSlotInfo) {
                // Look up this variable by its name and source position.
                int pos = slotInfo.pos.valid() ? slotInfo.pos.startOffset() : 0;
                THashMap<int, std::string>& positionMap = uniqueNameMap[slotInfo.name];
                // NOTE: `uniqueName` aliases the entry inside `positionMap`; the
                // operator[] call above inserts an empty string on first sight.
                std::string& uniqueName = positionMap[pos];

                // Have we seen this variable name/position combination before?
                // (An empty entry means this is the first slot of a new variable.)
                if (uniqueName.empty()) {
                    // This is a unique name/position pair.
                    uniqueName = slotInfo.name;

                    // But if it's not a unique _name_, it deserves a subscript to disambiguate it.
                    // `positionMap` already contains this entry, so count()-1 gives the
                    // number of earlier variables that shared this name (0 for the first).
                    int subscript = positionMap.count() - 1;
                    if (subscript > 0) {
                        for (char digit : std::to_string(subscript)) {
                            // U+2080 through U+2089 (₀₁₂₃₄₅₆₇₈₉) in UTF8:
                            uniqueName.push_back((char)0xE2);
                            uniqueName.push_back((char)0x82);
                            uniqueName.push_back((char)(0x80 + digit - '0'));
                        }
                    }
                }

                // Slots of an already-seen variable reuse its cached unique name.
                fSlotNameList.push_back(uniqueName);
            }
        }
    }
2533 
2534     // Interprets the context value as a branch offset.
branchOffset(const SkRasterPipeline_BranchCtx * ctx,int index) const2535     std::string branchOffset(const SkRasterPipeline_BranchCtx* ctx, int index) const {
2536         // The context's offset field contains a label ID
2537         int labelID = ctx->offset;
2538         const int* targetIndex = fLabelToStageMap.find(labelID);
2539         SkASSERT(targetIndex);
2540         return SkSL::String::printf("%+d (label %d at #%d)", *targetIndex - index, labelID,
2541                                                              *targetIndex + 1);
2542     }
2543 
2544     // Prints a 32-bit immediate value of unknown type (int/float).
imm(float immFloat,bool showAsFloat=true) const2545     std::string imm(float immFloat, bool showAsFloat = true) const {
2546         // Special case exact zero as "0" for readability (vs `0x00000000 (0.0)`).
2547         if (sk_bit_cast<int32_t>(immFloat) == 0) {
2548             return "0";
2549         }
2550         // Start with `0x3F800000` as a baseline.
2551         uint32_t immUnsigned;
2552         memcpy(&immUnsigned, &immFloat, sizeof(uint32_t));
2553         auto text = SkSL::String::printf("0x%08X", immUnsigned);
2554 
2555         // Extend it to `0x3F800000 (1.0)` for finite floating point values.
2556         if (showAsFloat && std::isfinite(immFloat)) {
2557             text += " (";
2558             text += skstd::to_string(immFloat);
2559             text += ')';
2560         }
2561         return text;
2562     }
2563 
2564     // Interprets the context pointer as a 32-bit immediate value of unknown type (int/float).
immCtx(const void * ctx,bool showAsFloat=true) const2565     std::string immCtx(const void* ctx, bool showAsFloat = true) const {
2566         float f;
2567         memcpy(&f, &ctx, sizeof(float));
2568         return this->imm(f, showAsFloat);
2569     }
2570 
2571     // Prints `1` for single slots and `1..3` for ranges of slots.
asRange(int first,int count) const2572     std::string asRange(int first, int count) const {
2573         std::string text = std::to_string(first);
2574         if (count > 1) {
2575             text += ".." + std::to_string(first + count - 1);
2576         }
2577         return text;
2578     }
2579 
2580     // Generates a reasonable name for a range of slots or uniforms, e.g.:
2581     // `val`: slot range points at one variable, named val
2582     // `val(0..1)`: slot range points at the first and second slot of val (which has 3+ slots)
2583     // `foo, bar`: slot range fully covers two variables, named foo and bar
2584     // `foo(3), bar(0)`: slot range covers the fourth slot of foo and the first slot of bar
slotOrUniformName(SkSpan<const SlotDebugInfo> debugInfo,SkSpan<const std::string> names,SlotRange range) const2585     std::string slotOrUniformName(SkSpan<const SlotDebugInfo> debugInfo,
2586                                   SkSpan<const std::string> names,
2587                                   SlotRange range) const {
2588         SkASSERT(range.index >= 0 && (range.index + range.count) <= (int)debugInfo.size());
2589 
2590         std::string text;
2591         auto separator = SkSL::String::Separator();
2592         while (range.count > 0) {
2593             const SlotDebugInfo& slotInfo = debugInfo[range.index];
2594             text += separator();
2595             text += names.empty() ? slotInfo.name : names[range.index];
2596 
2597             // Figure out how many slots we can chomp in this iteration.
2598             int entireVariable = slotInfo.columns * slotInfo.rows;
2599             int slotsToChomp = std::min(range.count, entireVariable - slotInfo.componentIndex);
2600             // If we aren't consuming an entire variable, from first slot to last...
2601             if (slotsToChomp != entireVariable) {
2602                 // ... decorate it with a range suffix.
2603                 text += '(' + this->asRange(slotInfo.componentIndex, slotsToChomp) + ')';
2604             }
2605             range.index += slotsToChomp;
2606             range.count -= slotsToChomp;
2607         }
2608 
2609         return text;
2610     }
2611 
2612     // Generates a reasonable name for a range of slots.
slotName(SlotRange range) const2613     std::string slotName(SlotRange range) const {
2614         return this->slotOrUniformName(fProgram.fDebugTrace->fSlotInfo, fSlotNameList, range);
2615     }
2616 
2617     // Generates a reasonable name for a range of uniforms.
uniformName(SlotRange range) const2618     std::string uniformName(SlotRange range) const {
2619         return this->slotOrUniformName(fProgram.fDebugTrace->fUniformInfo, /*names=*/{}, range);
2620     }
2621 
2622     // Attempts to interpret the passed-in pointer as a uniform range.
uniformPtrCtx(const float * ptr,int numSlots) const2623     std::string uniformPtrCtx(const float* ptr, int numSlots) const {
2624         const float* end = ptr + numSlots;
2625         if (ptr >= fUniforms.begin() && end <= fUniforms.end()) {
2626             int uniformIdx = ptr - fUniforms.begin();
2627             if (fProgram.fDebugTrace) {
2628                 // Handle pointers to named uniform slots.
2629                 std::string name = this->uniformName({uniformIdx, numSlots});
2630                 if (!name.empty()) {
2631                     return name;
2632                 }
2633             }
2634             // Handle pointers to uniforms (when no debug info exists).
2635             return 'u' + this->asRange(uniformIdx, numSlots);
2636         }
2637         return {};
2638     }
2639 
2640     // Attempts to interpret the passed-in pointer as a value slot range.
valuePtrCtx(const float * ptr,int numSlots) const2641     std::string valuePtrCtx(const float* ptr, int numSlots) const {
2642         const float* end = ptr + (N * numSlots);
2643         if (ptr >= fSlots.values.begin() && end <= fSlots.values.end()) {
2644             int valueIdx = ptr - fSlots.values.begin();
2645             SkASSERT((valueIdx % N) == 0);
2646             valueIdx /= N;
2647             if (fProgram.fDebugTrace) {
2648                 // Handle pointers to named value slots.
2649                 std::string name = this->slotName({valueIdx, numSlots});
2650                 if (!name.empty()) {
2651                     return name;
2652                 }
2653             }
2654             // Handle pointers to value slots (when no debug info exists).
2655             return 'v' + this->asRange(valueIdx, numSlots);
2656         }
2657         return {};
2658     }
2659 
2660     // Attempts to interpret the passed-in pointer as a immutable slot range.
immutablePtrCtx(const float * ptr,int numSlots) const2661     std::string immutablePtrCtx(const float* ptr, int numSlots) const {
2662         const float* end = ptr + numSlots;
2663         if (ptr >= fSlots.immutable.begin() && end <= fSlots.immutable.end()) {
2664             int index = ptr - fSlots.immutable.begin();
2665             return 'i' + this->asRange(index, numSlots) + ' ' +
2666                    this->multiImmCtx(ptr, numSlots);
2667         }
2668         return {};
2669     }
2670 
2671     // Interprets the context value as a pointer to `count` immediate values.
multiImmCtx(const float * ptr,int count) const2672     std::string multiImmCtx(const float* ptr, int count) const {
2673         // If this is a uniform, print it by name.
2674         if (std::string text = this->uniformPtrCtx(ptr, count); !text.empty()) {
2675             return text;
2676         }
2677         // Emit a single bracketed immediate.
2678         if (count == 1) {
2679             return '[' + this->imm(*ptr) + ']';
2680         }
2681         // Emit a list like `[0x00000000 (0.0), 0x3F80000 (1.0)]`.
2682         std::string text = "[";
2683         auto separator = SkSL::String::Separator();
2684         while (count--) {
2685             text += separator();
2686             text += this->imm(*ptr++);
2687         }
2688         return text + ']';
2689     }
2690 
2691     // Interprets the context value as a generic pointer.
ptrCtx(const void * ctx,int numSlots) const2692     std::string ptrCtx(const void* ctx, int numSlots) const {
2693         const float *ctxAsSlot = static_cast<const float*>(ctx);
2694         // Check for uniform, value, and immutable pointers.
2695         if (std::string uniform = this->uniformPtrCtx(ctxAsSlot, numSlots); !uniform.empty()) {
2696             return uniform;
2697         }
2698         if (std::string value = this->valuePtrCtx(ctxAsSlot, numSlots); !value.empty()) {
2699             return value;
2700         }
2701         if (std::string value = this->immutablePtrCtx(ctxAsSlot, numSlots); !value.empty()) {
2702             return value;
2703         }
2704         // Handle pointers to temporary stack slots.
2705         if (ctxAsSlot >= fSlots.stack.begin() && ctxAsSlot < fSlots.stack.end()) {
2706             int stackIdx = ctxAsSlot - fSlots.stack.begin();
2707             SkASSERT((stackIdx % N) == 0);
2708             return '$' + this->asRange(stackIdx / N, numSlots);
2709         }
2710         // This pointer is out of our expected bounds; this generally isn't expected to happen.
2711         return "ExternalPtr(" + this->asRange(0, numSlots) + ")";
2712     }
2713 
2714     // Converts an SkRPOffset to a pointer into the value-slot range.
offsetToPtr(SkRPOffset offset) const2715     std::byte* offsetToPtr(SkRPOffset offset) const {
2716         return (std::byte*)fSlots.values.data() + offset;
2717     }
2718 
2719     // Interprets a slab offset as a slot range.
offsetCtx(SkRPOffset offset,int numSlots) const2720     std::string offsetCtx(SkRPOffset offset, int numSlots) const {
2721         return this->ptrCtx(this->offsetToPtr(offset), numSlots);
2722     }
2723 
2724     // Interprets the context value as a packed ConstantCtx structure.
constantCtx(const void * v,int slots,bool showAsFloat=true) const2725     std::tuple<std::string, std::string> constantCtx(const void* v,
2726                                                      int slots,
2727                                                      bool showAsFloat = true) const {
2728         auto ctx = SkRPCtxUtils::Unpack((const SkRasterPipeline_ConstantCtx*)v);
2729         return {this->offsetCtx(ctx.dst, slots),
2730                 this->imm(sk_bit_cast<float>(ctx.value), showAsFloat)};
2731     }
2732 
2733     // Interprets the context value as a BinaryOp structure for copy_n_slots (numSlots is dictated
2734     // by the op itself).
binaryOpCtx(const void * v,int numSlots) const2735     std::tuple<std::string, std::string> binaryOpCtx(const void* v, int numSlots) const {
2736         auto ctx = SkRPCtxUtils::Unpack((const SkRasterPipeline_BinaryOpCtx*)v);
2737         return {this->offsetCtx(ctx.dst, numSlots),
2738                 this->offsetCtx(ctx.src, numSlots)};
2739     }
2740 
2741     // Interprets the context value as a BinaryOp structure for copy_n_uniforms (numSlots is
2742     // dictated by the op itself).
copyUniformCtx(const void * v,int numSlots) const2743     std::tuple<std::string, std::string> copyUniformCtx(const void* v, int numSlots) const {
2744         const auto *ctx = static_cast<const SkRasterPipeline_UniformCtx*>(v);
2745         return {this->ptrCtx(ctx->dst, numSlots),
2746                 this->multiImmCtx(reinterpret_cast<const float*>(ctx->src), numSlots)};
2747     }
2748 
2749     // Interprets the context value as a pointer to two adjacent values.
adjacentPtrCtx(const void * ctx,int numSlots) const2750     std::tuple<std::string, std::string> adjacentPtrCtx(const void* ctx, int numSlots) const {
2751         const float *ctxAsSlot = static_cast<const float*>(ctx);
2752         return std::make_tuple(this->ptrCtx(ctxAsSlot, numSlots),
2753                                this->ptrCtx(ctxAsSlot + (N * numSlots), numSlots));
2754     }
2755 
2756     // Interprets a slab offset as two adjacent slot ranges.
adjacentOffsetCtx(SkRPOffset offset,int numSlots) const2757     std::tuple<std::string, std::string> adjacentOffsetCtx(SkRPOffset offset, int numSlots) const {
2758         return this->adjacentPtrCtx((std::byte*)fSlots.values.data() + offset, numSlots);
2759     }
2760 
2761     // Interprets the context value as a BinaryOp structure (numSlots is inferred from the distance
2762     // between pointers).
adjacentBinaryOpCtx(const void * v) const2763     std::tuple<std::string, std::string> adjacentBinaryOpCtx(const void* v) const {
2764         auto ctx = SkRPCtxUtils::Unpack((const SkRasterPipeline_BinaryOpCtx*)v);
2765         int numSlots = (ctx.src - ctx.dst) / (N * sizeof(float));
2766         return this->adjacentOffsetCtx(ctx.dst, numSlots);
2767     }
2768 
2769     // Interprets the context value as a pointer to three adjacent values.
adjacent3PtrCtx(const void * ctx,int numSlots) const2770     std::tuple<std::string, std::string, std::string> adjacent3PtrCtx(const void* ctx,
2771                                                                       int numSlots) const {
2772         const float *ctxAsSlot = static_cast<const float*>(ctx);
2773         return {this->ptrCtx(ctxAsSlot, numSlots),
2774                 this->ptrCtx(ctxAsSlot + (N * numSlots), numSlots),
2775                 this->ptrCtx(ctxAsSlot + (2 * N * numSlots), numSlots)};
2776     }
2777 
2778     // Interprets a slab offset as three adjacent slot ranges.
adjacent3OffsetCtx(SkRPOffset offset,int numSlots) const2779     std::tuple<std::string, std::string, std::string> adjacent3OffsetCtx(SkRPOffset offset,
2780                                                                          int numSlots) const {
2781         return this->adjacent3PtrCtx((std::byte*)fSlots.values.data() + offset, numSlots);
2782     }
2783 
2784     // Interprets the context value as a TernaryOp structure (numSlots is inferred from `delta`).
adjacentTernaryOpCtx(const void * v) const2785     std::tuple<std::string, std::string, std::string> adjacentTernaryOpCtx(const void* v) const {
2786         auto ctx = SkRPCtxUtils::Unpack((const SkRasterPipeline_TernaryOpCtx*)v);
2787         int numSlots = ctx.delta / (sizeof(float) * N);
2788         return this->adjacent3OffsetCtx(ctx.dst, numSlots);
2789     }
2790 
2791     // Stringizes a span of swizzle offsets to the textual equivalent (`xyzw`).
2792     template <typename T>
swizzleOffsetSpan(SkSpan<T> offsets) const2793     std::string swizzleOffsetSpan(SkSpan<T> offsets) const {
2794         std::string src;
2795         for (uint16_t offset : offsets) {
2796             if (offset == (0 * N * sizeof(float))) {
2797                 src.push_back('x');
2798             } else if (offset == (1 * N * sizeof(float))) {
2799                 src.push_back('y');
2800             } else if (offset == (2 * N * sizeof(float))) {
2801                 src.push_back('z');
2802             } else if (offset == (3 * N * sizeof(float))) {
2803                 src.push_back('w');
2804             } else {
2805                 src.push_back('?');
2806             }
2807         }
2808         return src;
2809     }
2810 
2811     // Determines the effective width of a swizzle op. When we decode a swizzle, we don't know the
2812     // slot width of the original value; that's not preserved in the instruction encoding. (e.g.,
2813     // myFloat4.y would be indistinguishable from myFloat2.y.) We do our best to make a readable
2814     // dump using the data we have.
2815     template <typename T>
swizzleWidth(SkSpan<T> offsets) const2816     size_t swizzleWidth(SkSpan<T> offsets) const {
2817         size_t highestComponent = *std::max_element(offsets.begin(), offsets.end()) /
2818                                   (N * sizeof(float));
2819         size_t swizzleWidth = offsets.size();
2820         return std::max(swizzleWidth, highestComponent + 1);
2821     }
2822 
2823     // Stringizes a swizzled pointer.
2824     template <typename T>
swizzlePtr(const void * ptr,SkSpan<T> offsets) const2825     std::string swizzlePtr(const void* ptr, SkSpan<T> offsets) const {
2826         return "(" + this->ptrCtx(ptr, this->swizzleWidth(SkSpan(offsets))) + ")." +
2827                this->swizzleOffsetSpan(SkSpan(offsets));
2828     }
2829 
2830     // Interprets the context value as a SwizzleCtx structure.
swizzleCtx(ProgramOp op,const void * v) const2831     std::tuple<std::string, std::string> swizzleCtx(ProgramOp op, const void* v) const {
2832         auto ctx = SkRPCtxUtils::Unpack((const SkRasterPipeline_SwizzleCtx*)v);
2833         int destSlots = (int)op - (int)BuilderOp::swizzle_1 + 1;
2834         return {this->offsetCtx(ctx.dst, destSlots),
2835                 this->swizzlePtr(this->offsetToPtr(ctx.dst), SkSpan(ctx.offsets, destSlots))};
2836     }
2837 
2838     // Interprets the context value as a SwizzleCopyCtx structure.
swizzleCopyCtx(ProgramOp op,const void * v) const2839     std::tuple<std::string, std::string> swizzleCopyCtx(ProgramOp op, const void* v) const {
2840         const auto* ctx = static_cast<const SkRasterPipeline_SwizzleCopyCtx*>(v);
2841         int destSlots = (int)op - (int)BuilderOp::swizzle_copy_slot_masked + 1;
2842 
2843         return {this->swizzlePtr(ctx->dst, SkSpan(ctx->offsets, destSlots)),
2844                 this->ptrCtx(ctx->src, destSlots)};
2845     }
2846 
2847     // Interprets the context value as a ShuffleCtx structure.
shuffleCtx(const void * v) const2848     std::tuple<std::string, std::string> shuffleCtx(const void* v) const {
2849         const auto* ctx = static_cast<const SkRasterPipeline_ShuffleCtx*>(v);
2850 
2851         std::string dst = this->ptrCtx(ctx->ptr, ctx->count);
2852         std::string src = "(" + dst + ")[";
2853         for (int index = 0; index < ctx->count; ++index) {
2854             if (ctx->offsets[index] % (N * sizeof(float))) {
2855                 src.push_back('?');
2856             } else {
2857                 src += std::to_string(ctx->offsets[index] / (N * sizeof(float)));
2858             }
2859             src.push_back(' ');
2860         }
2861         src.back() = ']';
2862         return std::make_tuple(dst, src);
2863     }
2864 
2865     // Interprets the context value as a packed MatrixMultiplyCtx structure.
matrixMultiply(const void * v) const2866     std::tuple<std::string, std::string, std::string> matrixMultiply(const void* v) const {
2867         auto ctx = SkRPCtxUtils::Unpack((const SkRasterPipeline_MatrixMultiplyCtx*)v);
2868         int leftMatrix = ctx.leftColumns * ctx.leftRows;
2869         int rightMatrix = ctx.rightColumns * ctx.rightRows;
2870         int resultMatrix = ctx.rightColumns * ctx.leftRows;
2871         SkRPOffset leftOffset = ctx.dst + (ctx.rightColumns * ctx.leftRows * sizeof(float) * N);
2872         SkRPOffset rightOffset = leftOffset + (ctx.leftColumns * ctx.leftRows * sizeof(float) * N);
2873         return {SkSL::String::printf("mat%dx%d(%s)",
2874                                      ctx.rightColumns,
2875                                      ctx.leftRows,
2876                                      this->offsetCtx(ctx.dst, resultMatrix).c_str()),
2877                 SkSL::String::printf("mat%dx%d(%s)",
2878                                      ctx.leftColumns,
2879                                      ctx.leftRows,
2880                                      this->offsetCtx(leftOffset, leftMatrix).c_str()),
2881                 SkSL::String::printf("mat%dx%d(%s)",
2882                                      ctx.rightColumns,
2883                                      ctx.rightRows,
2884                                      this->offsetCtx(rightOffset, rightMatrix).c_str())};
2885     }
2886 
private:
    // SIMD lane count of the raster pipeline (highp stride); slot data is strided by N floats,
    // so slot K starts at float index K*N within its slab.
    const int N = SkOpts::raster_pipeline_highp_stride;
    // The program being dumped (borrowed; must outlive this Dumper).
    const Program& fProgram;
    // Raster Pipeline stages generated from fProgram (filled in by dump()).
    TArray<Stage> fStages;
    // Uniquified slot names, indexed by slot (built by buildUniqueSlotNameList()).
    TArray<std::string> fSlotNameList;
    THashMap<int, int> fLabelToStageMap;  // <label ID, stage index>
    // Scratch slot storage (values/stack/immutable slabs) allocated by dump() so that context
    // pointers can be classified and pretty-printed.
    SlotData fSlots;
    // Scratch uniform storage allocated by dump(); used to recognize uniform pointers.
    SkSpan<float> fUniforms;
};
2896 
dump(SkWStream * out,bool writeInstructionCount)2897 void Program::Dumper::dump(SkWStream* out, bool writeInstructionCount) {
2898     using POp = ProgramOp;
2899 
2900     // Allocate memory for the slot and uniform data, even though the program won't ever be
2901     // executed. The program requires pointer ranges for managing its data, and ASAN will report
2902     // errors if those pointers are pointing at unallocated memory.
2903     SkArenaAlloc alloc(/*firstHeapAllocation=*/1000);
2904     fSlots = fProgram.allocateSlotData(&alloc).value();
2905     float* uniformPtr = alloc.makeArray<float>(fProgram.fNumUniformSlots);
2906     fUniforms = SkSpan(uniformPtr, fProgram.fNumUniformSlots);
2907 
2908     // Turn this program into an array of Raster Pipeline stages.
2909     fProgram.makeStages(&fStages, &alloc, fUniforms, fSlots);
2910 
2911     // Assemble lookup tables for program labels and slot names.
2912     this->buildLabelToStageMap();
2913     this->buildUniqueSlotNameList();
2914 
2915     // Emit the program's instruction count.
2916     if (writeInstructionCount) {
2917         int invocationCount = 0, instructionCount = 0;
2918         for (const Stage& stage : fStages) {
2919             switch (stage.op) {
2920                 case POp::label:
2921                     // consumes zero instructions
2922                     break;
2923 
2924                 case POp::invoke_shader:
2925                 case POp::invoke_color_filter:
2926                 case POp::invoke_blender:
2927                 case POp::invoke_to_linear_srgb:
2928                 case POp::invoke_from_linear_srgb:
2929                     ++invocationCount;
2930                     break;
2931 
2932                 default:
2933                     ++instructionCount;
2934                     break;
2935             }
2936         }
2937 
2938         out->writeText(std::to_string(instructionCount).c_str());
2939         out->writeText(" instructions");
2940         if (invocationCount > 0) {
2941             out->writeText(", ");
2942             out->writeText(std::to_string(invocationCount).c_str());
2943             out->writeText(" invocations");
2944         }
2945         out->writeText("\n\n");
2946     }
2947 
2948     // Emit all of the program's immutable data.
2949     const char* header = "[immutable slots]\n";
2950     const char* footer = "";
2951     for (const Instruction& inst : fProgram.fInstructions) {
2952         if (inst.fOp == BuilderOp::store_immutable_value) {
2953             out->writeText(header);
2954             out->writeText("i");
2955             out->writeText(std::to_string(inst.fSlotA).c_str());
2956             out->writeText(" = ");
2957             out->writeText(this->imm(sk_bit_cast<float>(inst.fImmA)).c_str());
2958             out->writeText("\n");
2959 
2960             header = "";
2961             footer = "\n";
2962         }
2963     }
2964     out->writeText(footer);
2965 
2966     // Emit the program's instruction list.
2967     for (int index = 0; index < fStages.size(); ++index) {
2968         const Stage& stage = fStages[index];
2969 
2970         std::string opArg1, opArg2, opArg3, opSwizzle;
2971         switch (stage.op) {
2972             case POp::label:
2973             case POp::invoke_shader:
2974             case POp::invoke_color_filter:
2975             case POp::invoke_blender:
2976                 opArg1 = this->immCtx(stage.ctx, /*showAsFloat=*/false);
2977                 break;
2978 
2979             case POp::case_op: {
2980                 auto ctx = SkRPCtxUtils::Unpack((const SkRasterPipeline_CaseOpCtx*)stage.ctx);
2981                 opArg1 = this->offsetCtx(ctx.offset, 1);
2982                 opArg2 = this->offsetCtx(ctx.offset + sizeof(int32_t) * N, 1);
2983                 opArg3 = this->imm(sk_bit_cast<float>(ctx.expectedValue), /*showAsFloat=*/false);
2984                 break;
2985             }
2986             case POp::swizzle_1:
2987             case POp::swizzle_2:
2988             case POp::swizzle_3:
2989             case POp::swizzle_4:
2990                 std::tie(opArg1, opArg2) = this->swizzleCtx(stage.op, stage.ctx);
2991                 break;
2992 
2993             case POp::swizzle_copy_slot_masked:
2994             case POp::swizzle_copy_2_slots_masked:
2995             case POp::swizzle_copy_3_slots_masked:
2996             case POp::swizzle_copy_4_slots_masked:
2997                 std::tie(opArg1, opArg2) = this->swizzleCopyCtx(stage.op, stage.ctx);
2998                 break;
2999 
3000             case POp::refract_4_floats:
3001                 std::tie(opArg1, opArg2) = this->adjacentPtrCtx(stage.ctx, 4);
3002                 opArg3 = this->ptrCtx((const float*)(stage.ctx) + (8 * N), 1);
3003                 break;
3004 
3005             case POp::dot_2_floats:
3006                 opArg1 = this->ptrCtx(stage.ctx, 1);
3007                 std::tie(opArg2, opArg3) = this->adjacentPtrCtx(stage.ctx, 2);
3008                 break;
3009 
3010             case POp::dot_3_floats:
3011                 opArg1 = this->ptrCtx(stage.ctx, 1);
3012                 std::tie(opArg2, opArg3) = this->adjacentPtrCtx(stage.ctx, 3);
3013                 break;
3014 
3015             case POp::dot_4_floats:
3016                 opArg1 = this->ptrCtx(stage.ctx, 1);
3017                 std::tie(opArg2, opArg3) = this->adjacentPtrCtx(stage.ctx, 4);
3018                 break;
3019 
3020             case POp::shuffle:
3021                 std::tie(opArg1, opArg2) = this->shuffleCtx(stage.ctx);
3022                 break;
3023 
3024             case POp::matrix_multiply_2:
3025             case POp::matrix_multiply_3:
3026             case POp::matrix_multiply_4:
3027                 std::tie(opArg1, opArg2, opArg3) = this->matrixMultiply(stage.ctx);
3028                 break;
3029 
3030             case POp::load_condition_mask:
3031             case POp::store_condition_mask:
3032             case POp::load_loop_mask:
3033             case POp::store_loop_mask:
3034             case POp::merge_loop_mask:
3035             case POp::reenable_loop_mask:
3036             case POp::load_return_mask:
3037             case POp::store_return_mask:
3038             case POp::continue_op:
3039             case POp::cast_to_float_from_int: case POp::cast_to_float_from_uint:
3040             case POp::cast_to_int_from_float: case POp::cast_to_uint_from_float:
3041             case POp::abs_int:
3042             case POp::acos_float:
3043             case POp::asin_float:
3044             case POp::atan_float:
3045             case POp::ceil_float:
3046             case POp::cos_float:
3047             case POp::exp_float:
3048             case POp::exp2_float:
3049             case POp::log_float:
3050             case POp::log2_float:
3051             case POp::floor_float:
3052             case POp::invsqrt_float:
3053             case POp::sin_float:
3054             case POp::sqrt_float:
3055             case POp::tan_float:
3056                 opArg1 = this->ptrCtx(stage.ctx, 1);
3057                 break;
3058 
3059             case POp::store_src_rg:
3060             case POp::cast_to_float_from_2_ints: case POp::cast_to_float_from_2_uints:
3061             case POp::cast_to_int_from_2_floats: case POp::cast_to_uint_from_2_floats:
3062             case POp::abs_2_ints:
3063             case POp::ceil_2_floats:
3064             case POp::floor_2_floats:
3065             case POp::invsqrt_2_floats:
3066                 opArg1 = this->ptrCtx(stage.ctx, 2);
3067                 break;
3068 
3069             case POp::cast_to_float_from_3_ints: case POp::cast_to_float_from_3_uints:
3070             case POp::cast_to_int_from_3_floats: case POp::cast_to_uint_from_3_floats:
3071             case POp::abs_3_ints:
3072             case POp::ceil_3_floats:
3073             case POp::floor_3_floats:
3074             case POp::invsqrt_3_floats:
3075                 opArg1 = this->ptrCtx(stage.ctx, 3);
3076                 break;
3077 
3078             case POp::load_src:
3079             case POp::load_dst:
3080             case POp::exchange_src:
3081             case POp::store_src:
3082             case POp::store_dst:
3083             case POp::store_device_xy01:
3084             case POp::invoke_to_linear_srgb:
3085             case POp::invoke_from_linear_srgb:
3086             case POp::cast_to_float_from_4_ints: case POp::cast_to_float_from_4_uints:
3087             case POp::cast_to_int_from_4_floats: case POp::cast_to_uint_from_4_floats:
3088             case POp::abs_4_ints:
3089             case POp::ceil_4_floats:
3090             case POp::floor_4_floats:
3091             case POp::invsqrt_4_floats:
3092             case POp::inverse_mat2:
3093                 opArg1 = this->ptrCtx(stage.ctx, 4);
3094                 break;
3095 
3096             case POp::inverse_mat3:
3097                 opArg1 = this->ptrCtx(stage.ctx, 9);
3098                 break;
3099 
3100             case POp::inverse_mat4:
3101                 opArg1 = this->ptrCtx(stage.ctx, 16);
3102                 break;
3103 
3104             case POp::copy_constant:
3105             case POp::add_imm_float:
3106             case POp::mul_imm_float:
3107             case POp::cmple_imm_float:
3108             case POp::cmplt_imm_float:
3109             case POp::cmpeq_imm_float:
3110             case POp::cmpne_imm_float:
3111             case POp::min_imm_float:
3112             case POp::max_imm_float:
3113                 std::tie(opArg1, opArg2) = this->constantCtx(stage.ctx, 1);
3114                 break;
3115 
3116             case POp::add_imm_int:
3117             case POp::mul_imm_int:
3118             case POp::bitwise_and_imm_int:
3119             case POp::bitwise_xor_imm_int:
3120             case POp::cmple_imm_int:
3121             case POp::cmple_imm_uint:
3122             case POp::cmplt_imm_int:
3123             case POp::cmplt_imm_uint:
3124             case POp::cmpeq_imm_int:
3125             case POp::cmpne_imm_int:
3126                 std::tie(opArg1, opArg2) = this->constantCtx(stage.ctx, 1, /*showAsFloat=*/false);
3127                 break;
3128 
3129             case POp::splat_2_constants:
3130             case POp::bitwise_and_imm_2_ints:
3131                 std::tie(opArg1, opArg2) = this->constantCtx(stage.ctx, 2);
3132                 break;
3133 
3134             case POp::splat_3_constants:
3135             case POp::bitwise_and_imm_3_ints:
3136                 std::tie(opArg1, opArg2) = this->constantCtx(stage.ctx, 3);
3137                 break;
3138 
3139             case POp::splat_4_constants:
3140             case POp::bitwise_and_imm_4_ints:
3141                 std::tie(opArg1, opArg2) = this->constantCtx(stage.ctx, 4);
3142                 break;
3143 
3144             case POp::copy_uniform:
3145                 std::tie(opArg1, opArg2) = this->copyUniformCtx(stage.ctx, 1);
3146                 break;
3147 
3148             case POp::copy_2_uniforms:
3149                 std::tie(opArg1, opArg2) = this->copyUniformCtx(stage.ctx, 2);
3150                 break;
3151 
3152             case POp::copy_3_uniforms:
3153                 std::tie(opArg1, opArg2) = this->copyUniformCtx(stage.ctx, 3);
3154                 break;
3155 
3156             case POp::copy_4_uniforms:
3157                 std::tie(opArg1, opArg2) = this->copyUniformCtx(stage.ctx, 4);
3158                 break;
3159 
3160             case POp::copy_slot_masked:
3161             case POp::copy_slot_unmasked:
3162             case POp::copy_immutable_unmasked:
3163                 std::tie(opArg1, opArg2) = this->binaryOpCtx(stage.ctx, 1);
3164                 break;
3165 
3166             case POp::copy_2_slots_masked:
3167             case POp::copy_2_slots_unmasked:
3168             case POp::copy_2_immutables_unmasked:
3169                 std::tie(opArg1, opArg2) = this->binaryOpCtx(stage.ctx, 2);
3170                 break;
3171 
3172             case POp::copy_3_slots_masked:
3173             case POp::copy_3_slots_unmasked:
3174             case POp::copy_3_immutables_unmasked:
3175                 std::tie(opArg1, opArg2) = this->binaryOpCtx(stage.ctx, 3);
3176                 break;
3177 
3178             case POp::copy_4_slots_masked:
3179             case POp::copy_4_slots_unmasked:
3180             case POp::copy_4_immutables_unmasked:
3181                 std::tie(opArg1, opArg2) = this->binaryOpCtx(stage.ctx, 4);
3182                 break;
3183 
3184             case POp::copy_from_indirect_uniform_unmasked:
3185             case POp::copy_from_indirect_unmasked:
3186             case POp::copy_to_indirect_masked: {
3187                 const auto* ctx = static_cast<SkRasterPipeline_CopyIndirectCtx*>(stage.ctx);
3188                 // We don't incorporate the indirect-limit in the output
3189                 opArg1 = this->ptrCtx(ctx->dst, ctx->slots);
3190                 opArg2 = this->ptrCtx(ctx->src, ctx->slots);
3191                 opArg3 = this->ptrCtx(ctx->indirectOffset, 1);
3192                 break;
3193             }
3194             case POp::swizzle_copy_to_indirect_masked: {
3195                 const auto* ctx = static_cast<SkRasterPipeline_SwizzleCopyIndirectCtx*>(stage.ctx);
3196                 opArg1 = this->ptrCtx(ctx->dst, this->swizzleWidth(SkSpan(ctx->offsets,
3197                                                                           ctx->slots)));
3198                 opArg2 = this->ptrCtx(ctx->src, ctx->slots);
3199                 opArg3 = this->ptrCtx(ctx->indirectOffset, 1);
3200                 opSwizzle = this->swizzleOffsetSpan(SkSpan(ctx->offsets, ctx->slots));
3201                 break;
3202             }
3203             case POp::merge_condition_mask:
3204             case POp::merge_inv_condition_mask:
3205             case POp::add_float:   case POp::add_int:
3206             case POp::sub_float:   case POp::sub_int:
3207             case POp::mul_float:   case POp::mul_int:
3208             case POp::div_float:   case POp::div_int:   case POp::div_uint:
3209                                    case POp::bitwise_and_int:
3210                                    case POp::bitwise_or_int:
3211                                    case POp::bitwise_xor_int:
3212             case POp::mod_float:
3213             case POp::min_float:   case POp::min_int:   case POp::min_uint:
3214             case POp::max_float:   case POp::max_int:   case POp::max_uint:
3215             case POp::cmplt_float: case POp::cmplt_int: case POp::cmplt_uint:
3216             case POp::cmple_float: case POp::cmple_int: case POp::cmple_uint:
3217             case POp::cmpeq_float: case POp::cmpeq_int:
3218             case POp::cmpne_float: case POp::cmpne_int:
3219                 std::tie(opArg1, opArg2) = this->adjacentPtrCtx(stage.ctx, 1);
3220                 break;
3221 
3222             case POp::mix_float:   case POp::mix_int:
3223                 std::tie(opArg1, opArg2, opArg3) = this->adjacent3PtrCtx(stage.ctx, 1);
3224                 break;
3225 
3226             case POp::add_2_floats:   case POp::add_2_ints:
3227             case POp::sub_2_floats:   case POp::sub_2_ints:
3228             case POp::mul_2_floats:   case POp::mul_2_ints:
3229             case POp::div_2_floats:   case POp::div_2_ints:   case POp::div_2_uints:
3230                                       case POp::bitwise_and_2_ints:
3231                                       case POp::bitwise_or_2_ints:
3232                                       case POp::bitwise_xor_2_ints:
3233             case POp::mod_2_floats:
3234             case POp::min_2_floats:   case POp::min_2_ints:   case POp::min_2_uints:
3235             case POp::max_2_floats:   case POp::max_2_ints:   case POp::max_2_uints:
3236             case POp::cmplt_2_floats: case POp::cmplt_2_ints: case POp::cmplt_2_uints:
3237             case POp::cmple_2_floats: case POp::cmple_2_ints: case POp::cmple_2_uints:
3238             case POp::cmpeq_2_floats: case POp::cmpeq_2_ints:
3239             case POp::cmpne_2_floats: case POp::cmpne_2_ints:
3240                 std::tie(opArg1, opArg2) = this->adjacentPtrCtx(stage.ctx, 2);
3241                 break;
3242 
3243             case POp::mix_2_floats:   case POp::mix_2_ints:
3244                 std::tie(opArg1, opArg2, opArg3) = this->adjacent3PtrCtx(stage.ctx, 2);
3245                 break;
3246 
3247             case POp::add_3_floats:   case POp::add_3_ints:
3248             case POp::sub_3_floats:   case POp::sub_3_ints:
3249             case POp::mul_3_floats:   case POp::mul_3_ints:
3250             case POp::div_3_floats:   case POp::div_3_ints:   case POp::div_3_uints:
3251                                       case POp::bitwise_and_3_ints:
3252                                       case POp::bitwise_or_3_ints:
3253                                       case POp::bitwise_xor_3_ints:
3254             case POp::mod_3_floats:
3255             case POp::min_3_floats:   case POp::min_3_ints:   case POp::min_3_uints:
3256             case POp::max_3_floats:   case POp::max_3_ints:   case POp::max_3_uints:
3257             case POp::cmplt_3_floats: case POp::cmplt_3_ints: case POp::cmplt_3_uints:
3258             case POp::cmple_3_floats: case POp::cmple_3_ints: case POp::cmple_3_uints:
3259             case POp::cmpeq_3_floats: case POp::cmpeq_3_ints:
3260             case POp::cmpne_3_floats: case POp::cmpne_3_ints:
3261                 std::tie(opArg1, opArg2) = this->adjacentPtrCtx(stage.ctx, 3);
3262                 break;
3263 
3264             case POp::mix_3_floats:   case POp::mix_3_ints:
3265                 std::tie(opArg1, opArg2, opArg3) = this->adjacent3PtrCtx(stage.ctx, 3);
3266                 break;
3267 
3268             case POp::add_4_floats:   case POp::add_4_ints:
3269             case POp::sub_4_floats:   case POp::sub_4_ints:
3270             case POp::mul_4_floats:   case POp::mul_4_ints:
3271             case POp::div_4_floats:   case POp::div_4_ints:   case POp::div_4_uints:
3272                                       case POp::bitwise_and_4_ints:
3273                                       case POp::bitwise_or_4_ints:
3274                                       case POp::bitwise_xor_4_ints:
3275             case POp::mod_4_floats:
3276             case POp::min_4_floats:   case POp::min_4_ints:   case POp::min_4_uints:
3277             case POp::max_4_floats:   case POp::max_4_ints:   case POp::max_4_uints:
3278             case POp::cmplt_4_floats: case POp::cmplt_4_ints: case POp::cmplt_4_uints:
3279             case POp::cmple_4_floats: case POp::cmple_4_ints: case POp::cmple_4_uints:
3280             case POp::cmpeq_4_floats: case POp::cmpeq_4_ints:
3281             case POp::cmpne_4_floats: case POp::cmpne_4_ints:
3282                 std::tie(opArg1, opArg2) = this->adjacentPtrCtx(stage.ctx, 4);
3283                 break;
3284 
3285             case POp::mix_4_floats:   case POp::mix_4_ints:
3286                 std::tie(opArg1, opArg2, opArg3) = this->adjacent3PtrCtx(stage.ctx, 4);
3287                 break;
3288 
3289             case POp::add_n_floats:   case POp::add_n_ints:
3290             case POp::sub_n_floats:   case POp::sub_n_ints:
3291             case POp::mul_n_floats:   case POp::mul_n_ints:
3292             case POp::div_n_floats:   case POp::div_n_ints:   case POp::div_n_uints:
3293                                       case POp::bitwise_and_n_ints:
3294                                       case POp::bitwise_or_n_ints:
3295                                       case POp::bitwise_xor_n_ints:
3296             case POp::mod_n_floats:
3297             case POp::min_n_floats:   case POp::min_n_ints:   case POp::min_n_uints:
3298             case POp::max_n_floats:   case POp::max_n_ints:   case POp::max_n_uints:
3299             case POp::cmplt_n_floats: case POp::cmplt_n_ints: case POp::cmplt_n_uints:
3300             case POp::cmple_n_floats: case POp::cmple_n_ints: case POp::cmple_n_uints:
3301             case POp::cmpeq_n_floats: case POp::cmpeq_n_ints:
3302             case POp::cmpne_n_floats: case POp::cmpne_n_ints:
3303             case POp::atan2_n_floats:
3304             case POp::pow_n_floats:
3305                 std::tie(opArg1, opArg2) = this->adjacentBinaryOpCtx(stage.ctx);
3306                 break;
3307 
3308             case POp::mix_n_floats:        case POp::mix_n_ints:
3309             case POp::smoothstep_n_floats:
3310                 std::tie(opArg1, opArg2, opArg3) = this->adjacentTernaryOpCtx(stage.ctx);
3311                 break;
3312 
3313             case POp::jump:
3314             case POp::branch_if_all_lanes_active:
3315             case POp::branch_if_any_lanes_active:
3316             case POp::branch_if_no_lanes_active:
3317                 opArg1 = this->branchOffset(static_cast<SkRasterPipeline_BranchCtx*>(stage.ctx),
3318                                             index);
3319                 break;
3320 
3321             case POp::branch_if_no_active_lanes_eq: {
3322                 const auto* ctx = static_cast<SkRasterPipeline_BranchIfEqualCtx*>(stage.ctx);
3323                 opArg1 = this->branchOffset(ctx, index);
3324                 opArg2 = this->ptrCtx(ctx->ptr, 1);
3325                 opArg3 = this->imm(sk_bit_cast<float>(ctx->value));
3326                 break;
3327             }
3328             case POp::trace_var: {
3329                 const auto* ctx = static_cast<SkRasterPipeline_TraceVarCtx*>(stage.ctx);
3330                 opArg1 = this->ptrCtx(ctx->traceMask, 1);
3331                 opArg2 = this->ptrCtx(ctx->data, ctx->numSlots);
3332                 if (ctx->indirectOffset != nullptr) {
3333                     opArg3 = " + " + this->ptrCtx(ctx->indirectOffset, 1);
3334                 }
3335                 break;
3336             }
3337             case POp::trace_line: {
3338                 const auto* ctx = static_cast<SkRasterPipeline_TraceLineCtx*>(stage.ctx);
3339                 opArg1 = this->ptrCtx(ctx->traceMask, 1);
3340                 opArg2 = std::to_string(ctx->lineNumber);
3341                 break;
3342             }
3343             case POp::trace_enter:
3344             case POp::trace_exit: {
3345                 const auto* ctx = static_cast<SkRasterPipeline_TraceFuncCtx*>(stage.ctx);
3346                 opArg1 = this->ptrCtx(ctx->traceMask, 1);
3347                 opArg2 = (fProgram.fDebugTrace &&
3348                           ctx->funcIdx >= 0 &&
3349                           ctx->funcIdx < (int)fProgram.fDebugTrace->fFuncInfo.size())
3350                                  ? fProgram.fDebugTrace->fFuncInfo[ctx->funcIdx].name
3351                                  : "???";
3352                 break;
3353             }
3354             case POp::trace_scope: {
3355                 const auto* ctx = static_cast<SkRasterPipeline_TraceScopeCtx*>(stage.ctx);
3356                 opArg1 = this->ptrCtx(ctx->traceMask, 1);
3357                 opArg2 = SkSL::String::printf("%+d", ctx->delta);
3358                 break;
3359             }
3360             default:
3361                 break;
3362         }
3363 
3364         std::string_view opName;
3365         switch (stage.op) {
3366         #define M(x) case POp::x: opName = #x; break;
3367             SK_RASTER_PIPELINE_OPS_ALL(M)
3368             SKRP_EXTENDED_OPS(M)
3369         #undef M
3370         }
3371 
3372         std::string opText;
3373         switch (stage.op) {
3374             case POp::trace_var:
3375                 opText = "TraceVar(" + opArg2 + opArg3 + ") when " + opArg1 + " is true";
3376                 break;
3377 
3378             case POp::trace_line:
3379                 opText = "TraceLine(" + opArg2 + ") when " + opArg1 + " is true";
3380                 break;
3381 
3382             case POp::trace_enter:
3383                 opText = "TraceEnter(" + opArg2 + ") when " + opArg1 + " is true";
3384                 break;
3385 
3386             case POp::trace_exit:
3387                 opText = "TraceExit(" + opArg2 + ") when " + opArg1 + " is true";
3388                 break;
3389 
3390             case POp::trace_scope:
3391                 opText = "TraceScope(" + opArg2 + ") when " + opArg1 + " is true";
3392                 break;
3393 
3394             case POp::init_lane_masks:
3395                 opText = "CondMask = LoopMask = RetMask = true";
3396                 break;
3397 
3398             case POp::load_condition_mask:
3399                 opText = "CondMask = " + opArg1;
3400                 break;
3401 
3402             case POp::store_condition_mask:
3403                 opText = opArg1 + " = CondMask";
3404                 break;
3405 
3406             case POp::merge_condition_mask:
3407                 opText = "CondMask = " + opArg1 + " & " + opArg2;
3408                 break;
3409 
3410             case POp::merge_inv_condition_mask:
3411                 opText = "CondMask = " + opArg1 + " & ~" + opArg2;
3412                 break;
3413 
3414             case POp::load_loop_mask:
3415                 opText = "LoopMask = " + opArg1;
3416                 break;
3417 
3418             case POp::store_loop_mask:
3419                 opText = opArg1 + " = LoopMask";
3420                 break;
3421 
3422             case POp::mask_off_loop_mask:
3423                 opText = "LoopMask &= ~(CondMask & LoopMask & RetMask)";
3424                 break;
3425 
3426             case POp::reenable_loop_mask:
3427                 opText = "LoopMask |= " + opArg1;
3428                 break;
3429 
3430             case POp::merge_loop_mask:
3431                 opText = "LoopMask &= " + opArg1;
3432                 break;
3433 
3434             case POp::load_return_mask:
3435                 opText = "RetMask = " + opArg1;
3436                 break;
3437 
3438             case POp::store_return_mask:
3439                 opText = opArg1 + " = RetMask";
3440                 break;
3441 
3442             case POp::mask_off_return_mask:
3443                 opText = "RetMask &= ~(CondMask & LoopMask & RetMask)";
3444                 break;
3445 
3446             case POp::store_src_rg:
3447                 opText = opArg1 + " = src.rg";
3448                 break;
3449 
3450             case POp::exchange_src:
3451                 opText = "swap(src.rgba, " + opArg1 + ")";
3452                 break;
3453 
3454             case POp::store_src:
3455                 opText = opArg1 + " = src.rgba";
3456                 break;
3457 
3458             case POp::store_dst:
3459                 opText = opArg1 + " = dst.rgba";
3460                 break;
3461 
3462             case POp::store_device_xy01:
3463                 opText = opArg1 + " = DeviceCoords.xy01";
3464                 break;
3465 
3466             case POp::load_src:
3467                 opText = "src.rgba = " + opArg1;
3468                 break;
3469 
3470             case POp::load_dst:
3471                 opText = "dst.rgba = " + opArg1;
3472                 break;
3473 
3474             case POp::bitwise_and_int:
3475             case POp::bitwise_and_2_ints:
3476             case POp::bitwise_and_3_ints:
3477             case POp::bitwise_and_4_ints:
3478             case POp::bitwise_and_n_ints:
3479             case POp::bitwise_and_imm_int:
3480             case POp::bitwise_and_imm_2_ints:
3481             case POp::bitwise_and_imm_3_ints:
3482             case POp::bitwise_and_imm_4_ints:
3483                 opText = opArg1 + " &= " + opArg2;
3484                 break;
3485 
3486             case POp::bitwise_or_int:
3487             case POp::bitwise_or_2_ints:
3488             case POp::bitwise_or_3_ints:
3489             case POp::bitwise_or_4_ints:
3490             case POp::bitwise_or_n_ints:
3491                 opText = opArg1 + " |= " + opArg2;
3492                 break;
3493 
3494             case POp::bitwise_xor_int:
3495             case POp::bitwise_xor_2_ints:
3496             case POp::bitwise_xor_3_ints:
3497             case POp::bitwise_xor_4_ints:
3498             case POp::bitwise_xor_n_ints:
3499             case POp::bitwise_xor_imm_int:
3500                 opText = opArg1 + " ^= " + opArg2;
3501                 break;
3502 
3503             case POp::cast_to_float_from_int:
3504             case POp::cast_to_float_from_2_ints:
3505             case POp::cast_to_float_from_3_ints:
3506             case POp::cast_to_float_from_4_ints:
3507                 opText = opArg1 + " = IntToFloat(" + opArg1 + ")";
3508                 break;
3509 
3510             case POp::cast_to_float_from_uint:
3511             case POp::cast_to_float_from_2_uints:
3512             case POp::cast_to_float_from_3_uints:
3513             case POp::cast_to_float_from_4_uints:
3514                 opText = opArg1 + " = UintToFloat(" + opArg1 + ")";
3515                 break;
3516 
3517             case POp::cast_to_int_from_float:
3518             case POp::cast_to_int_from_2_floats:
3519             case POp::cast_to_int_from_3_floats:
3520             case POp::cast_to_int_from_4_floats:
3521                 opText = opArg1 + " = FloatToInt(" + opArg1 + ")";
3522                 break;
3523 
3524             case POp::cast_to_uint_from_float:
3525             case POp::cast_to_uint_from_2_floats:
3526             case POp::cast_to_uint_from_3_floats:
3527             case POp::cast_to_uint_from_4_floats:
3528                 opText = opArg1 + " = FloatToUint(" + opArg1 + ")";
3529                 break;
3530 
3531             case POp::copy_slot_masked:            case POp::copy_2_slots_masked:
3532             case POp::copy_3_slots_masked:         case POp::copy_4_slots_masked:
3533             case POp::swizzle_copy_slot_masked:    case POp::swizzle_copy_2_slots_masked:
3534             case POp::swizzle_copy_3_slots_masked: case POp::swizzle_copy_4_slots_masked:
3535                 opText = opArg1 + " = Mask(" + opArg2 + ")";
3536                 break;
3537 
3538             case POp::copy_uniform:                case POp::copy_2_uniforms:
3539             case POp::copy_3_uniforms:             case POp::copy_4_uniforms:
3540             case POp::copy_slot_unmasked:          case POp::copy_2_slots_unmasked:
3541             case POp::copy_3_slots_unmasked:       case POp::copy_4_slots_unmasked:
3542             case POp::copy_immutable_unmasked:     case POp::copy_2_immutables_unmasked:
3543             case POp::copy_3_immutables_unmasked:  case POp::copy_4_immutables_unmasked:
3544             case POp::copy_constant:               case POp::splat_2_constants:
3545             case POp::splat_3_constants:           case POp::splat_4_constants:
3546             case POp::swizzle_1:                   case POp::swizzle_2:
3547             case POp::swizzle_3:                   case POp::swizzle_4:
3548             case POp::shuffle:
3549                 opText = opArg1 + " = " + opArg2;
3550                 break;
3551 
3552             case POp::copy_from_indirect_unmasked:
3553             case POp::copy_from_indirect_uniform_unmasked:
3554                 opText = opArg1 + " = Indirect(" + opArg2 + " + " + opArg3 + ")";
3555                 break;
3556 
3557             case POp::copy_to_indirect_masked:
3558                 opText = "Indirect(" + opArg1 + " + " + opArg3 + ") = Mask(" + opArg2 + ")";
3559                 break;
3560 
3561             case POp::swizzle_copy_to_indirect_masked:
3562                 opText = "Indirect(" + opArg1 + " + " + opArg3 + ")." + opSwizzle + " = Mask(" +
3563                          opArg2 + ")";
3564                 break;
3565 
3566             case POp::abs_int:
3567             case POp::abs_2_ints:
3568             case POp::abs_3_ints:
3569             case POp::abs_4_ints:
3570                 opText = opArg1 + " = abs(" + opArg1 + ")";
3571                 break;
3572 
3573             case POp::acos_float:
3574                 opText = opArg1 + " = acos(" + opArg1 + ")";
3575                 break;
3576 
3577             case POp::asin_float:
3578                 opText = opArg1 + " = asin(" + opArg1 + ")";
3579                 break;
3580 
3581             case POp::atan_float:
3582                 opText = opArg1 + " = atan(" + opArg1 + ")";
3583                 break;
3584 
3585             case POp::atan2_n_floats:
3586                 opText = opArg1 + " = atan2(" + opArg1 + ", " + opArg2 + ")";
3587                 break;
3588 
3589             case POp::ceil_float:
3590             case POp::ceil_2_floats:
3591             case POp::ceil_3_floats:
3592             case POp::ceil_4_floats:
3593                 opText = opArg1 + " = ceil(" + opArg1 + ")";
3594                 break;
3595 
3596             case POp::cos_float:
3597                 opText = opArg1 + " = cos(" + opArg1 + ")";
3598                 break;
3599 
3600             case POp::refract_4_floats:
3601                 opText = opArg1 + " = refract(" + opArg1 + ", " + opArg2 + ", " + opArg3 + ")";
3602                 break;
3603 
3604             case POp::dot_2_floats:
3605             case POp::dot_3_floats:
3606             case POp::dot_4_floats:
3607                 opText = opArg1 + " = dot(" + opArg2 + ", " + opArg3 + ")";
3608                 break;
3609 
3610             case POp::exp_float:
3611                 opText = opArg1 + " = exp(" + opArg1 + ")";
3612                 break;
3613 
3614             case POp::exp2_float:
3615                 opText = opArg1 + " = exp2(" + opArg1 + ")";
3616                 break;
3617 
3618             case POp::log_float:
3619                 opText = opArg1 + " = log(" + opArg1 + ")";
3620                 break;
3621 
3622             case POp::log2_float:
3623                 opText = opArg1 + " = log2(" + opArg1 + ")";
3624                 break;
3625 
3626             case POp::pow_n_floats:
3627                 opText = opArg1 + " = pow(" + opArg1 + ", " + opArg2 + ")";
3628                 break;
3629 
3630             case POp::sin_float:
3631                 opText = opArg1 + " = sin(" + opArg1 + ")";
3632                 break;
3633 
3634             case POp::sqrt_float:
3635                 opText = opArg1 + " = sqrt(" + opArg1 + ")";
3636                 break;
3637 
3638             case POp::tan_float:
3639                 opText = opArg1 + " = tan(" + opArg1 + ")";
3640                 break;
3641 
3642             case POp::floor_float:
3643             case POp::floor_2_floats:
3644             case POp::floor_3_floats:
3645             case POp::floor_4_floats:
3646                 opText = opArg1 + " = floor(" + opArg1 + ")";
3647                 break;
3648 
3649             case POp::invsqrt_float:
3650             case POp::invsqrt_2_floats:
3651             case POp::invsqrt_3_floats:
3652             case POp::invsqrt_4_floats:
3653                 opText = opArg1 + " = inversesqrt(" + opArg1 + ")";
3654                 break;
3655 
3656             case POp::inverse_mat2:
3657             case POp::inverse_mat3:
3658             case POp::inverse_mat4:
3659                 opText = opArg1 + " = inverse(" + opArg1 + ")";
3660                 break;
3661 
3662             case POp::add_float:     case POp::add_int:
3663             case POp::add_2_floats:  case POp::add_2_ints:
3664             case POp::add_3_floats:  case POp::add_3_ints:
3665             case POp::add_4_floats:  case POp::add_4_ints:
3666             case POp::add_n_floats:  case POp::add_n_ints:
3667             case POp::add_imm_float: case POp::add_imm_int:
3668                 opText = opArg1 + " += " + opArg2;
3669                 break;
3670 
3671             case POp::sub_float:    case POp::sub_int:
3672             case POp::sub_2_floats: case POp::sub_2_ints:
3673             case POp::sub_3_floats: case POp::sub_3_ints:
3674             case POp::sub_4_floats: case POp::sub_4_ints:
3675             case POp::sub_n_floats: case POp::sub_n_ints:
3676                 opText = opArg1 + " -= " + opArg2;
3677                 break;
3678 
3679             case POp::mul_float:     case POp::mul_int:
3680             case POp::mul_2_floats:  case POp::mul_2_ints:
3681             case POp::mul_3_floats:  case POp::mul_3_ints:
3682             case POp::mul_4_floats:  case POp::mul_4_ints:
3683             case POp::mul_n_floats:  case POp::mul_n_ints:
3684             case POp::mul_imm_float: case POp::mul_imm_int:
3685                 opText = opArg1 + " *= " + opArg2;
3686                 break;
3687 
3688             case POp::div_float:    case POp::div_int:    case POp::div_uint:
3689             case POp::div_2_floats: case POp::div_2_ints: case POp::div_2_uints:
3690             case POp::div_3_floats: case POp::div_3_ints: case POp::div_3_uints:
3691             case POp::div_4_floats: case POp::div_4_ints: case POp::div_4_uints:
3692             case POp::div_n_floats: case POp::div_n_ints: case POp::div_n_uints:
3693                 opText = opArg1 + " /= " + opArg2;
3694                 break;
3695 
3696             case POp::matrix_multiply_2:
3697             case POp::matrix_multiply_3:
3698             case POp::matrix_multiply_4:
3699                 opText = opArg1 + " = " + opArg2 + " * " + opArg3;
3700                 break;
3701 
3702             case POp::mod_float:
3703             case POp::mod_2_floats:
3704             case POp::mod_3_floats:
3705             case POp::mod_4_floats:
3706             case POp::mod_n_floats:
3707                 opText = opArg1 + " = mod(" + opArg1 + ", " + opArg2 + ")";
3708                 break;
3709 
3710             case POp::min_float:        case POp::min_int:          case POp::min_uint:
3711             case POp::min_2_floats:     case POp::min_2_ints:       case POp::min_2_uints:
3712             case POp::min_3_floats:     case POp::min_3_ints:       case POp::min_3_uints:
3713             case POp::min_4_floats:     case POp::min_4_ints:       case POp::min_4_uints:
3714             case POp::min_n_floats:     case POp::min_n_ints:       case POp::min_n_uints:
3715             case POp::min_imm_float:
3716                 opText = opArg1 + " = min(" + opArg1 + ", " + opArg2 + ")";
3717                 break;
3718 
3719             case POp::max_float:        case POp::max_int:          case POp::max_uint:
3720             case POp::max_2_floats:     case POp::max_2_ints:       case POp::max_2_uints:
3721             case POp::max_3_floats:     case POp::max_3_ints:       case POp::max_3_uints:
3722             case POp::max_4_floats:     case POp::max_4_ints:       case POp::max_4_uints:
3723             case POp::max_n_floats:     case POp::max_n_ints:       case POp::max_n_uints:
3724             case POp::max_imm_float:
3725                 opText = opArg1 + " = max(" + opArg1 + ", " + opArg2 + ")";
3726                 break;
3727 
3728             case POp::cmplt_float:     case POp::cmplt_int:     case POp::cmplt_uint:
3729             case POp::cmplt_2_floats:  case POp::cmplt_2_ints:  case POp::cmplt_2_uints:
3730             case POp::cmplt_3_floats:  case POp::cmplt_3_ints:  case POp::cmplt_3_uints:
3731             case POp::cmplt_4_floats:  case POp::cmplt_4_ints:  case POp::cmplt_4_uints:
3732             case POp::cmplt_n_floats:  case POp::cmplt_n_ints:  case POp::cmplt_n_uints:
3733             case POp::cmplt_imm_float: case POp::cmplt_imm_int: case POp::cmplt_imm_uint:
3734                 opText = opArg1 + " = lessThan(" + opArg1 + ", " + opArg2 + ")";
3735                 break;
3736 
3737             case POp::cmple_float:     case POp::cmple_int:     case POp::cmple_uint:
3738             case POp::cmple_2_floats:  case POp::cmple_2_ints:  case POp::cmple_2_uints:
3739             case POp::cmple_3_floats:  case POp::cmple_3_ints:  case POp::cmple_3_uints:
3740             case POp::cmple_4_floats:  case POp::cmple_4_ints:  case POp::cmple_4_uints:
3741             case POp::cmple_n_floats:  case POp::cmple_n_ints:  case POp::cmple_n_uints:
3742             case POp::cmple_imm_float: case POp::cmple_imm_int: case POp::cmple_imm_uint:
3743                 opText = opArg1 + " = lessThanEqual(" + opArg1 + ", " + opArg2 + ")";
3744                 break;
3745 
3746             case POp::cmpeq_float:     case POp::cmpeq_int:
3747             case POp::cmpeq_2_floats:  case POp::cmpeq_2_ints:
3748             case POp::cmpeq_3_floats:  case POp::cmpeq_3_ints:
3749             case POp::cmpeq_4_floats:  case POp::cmpeq_4_ints:
3750             case POp::cmpeq_n_floats:  case POp::cmpeq_n_ints:
3751             case POp::cmpeq_imm_float: case POp::cmpeq_imm_int:
3752                 opText = opArg1 + " = equal(" + opArg1 + ", " + opArg2 + ")";
3753                 break;
3754 
3755             case POp::cmpne_float:     case POp::cmpne_int:
3756             case POp::cmpne_2_floats:  case POp::cmpne_2_ints:
3757             case POp::cmpne_3_floats:  case POp::cmpne_3_ints:
3758             case POp::cmpne_4_floats:  case POp::cmpne_4_ints:
3759             case POp::cmpne_n_floats:  case POp::cmpne_n_ints:
3760             case POp::cmpne_imm_float: case POp::cmpne_imm_int:
3761                 opText = opArg1 + " = notEqual(" + opArg1 + ", " + opArg2 + ")";
3762                 break;
3763 
3764             case POp::mix_float:      case POp::mix_int:
3765             case POp::mix_2_floats:   case POp::mix_2_ints:
3766             case POp::mix_3_floats:   case POp::mix_3_ints:
3767             case POp::mix_4_floats:   case POp::mix_4_ints:
3768             case POp::mix_n_floats:   case POp::mix_n_ints:
3769                 opText = opArg1 + " = mix(" + opArg2 + ", " + opArg3 + ", " + opArg1 + ")";
3770                 break;
3771 
3772             case POp::smoothstep_n_floats:
3773                 opText = opArg1 + " = smoothstep(" + opArg1 + ", " + opArg2 + ", " + opArg3 + ")";
3774                 break;
3775 
3776             case POp::jump:
3777             case POp::branch_if_all_lanes_active:
3778             case POp::branch_if_any_lanes_active:
3779             case POp::branch_if_no_lanes_active:
3780             case POp::invoke_shader:
3781             case POp::invoke_color_filter:
3782             case POp::invoke_blender:
3783                 opText = std::string(opName) + " " + opArg1;
3784                 break;
3785 
3786             case POp::invoke_to_linear_srgb:
3787                 opText = opArg1 + " = toLinearSrgb(" + opArg1 + ")";
3788                 break;
3789 
3790             case POp::invoke_from_linear_srgb:
3791                 opText = opArg1 + " = fromLinearSrgb(" + opArg1 + ")";
3792                 break;
3793 
3794             case POp::branch_if_no_active_lanes_eq:
3795                 opText = "branch " + opArg1 + " if no lanes of " + opArg2 + " == " + opArg3;
3796                 break;
3797 
3798             case POp::label:
3799                 opText = "label " + opArg1;
3800                 break;
3801 
3802             case POp::case_op:
3803                 opText = "if (" + opArg1 + " == " + opArg3 +
3804                          ") { LoopMask = true; " + opArg2 + " = false; }";
3805                 break;
3806 
3807             case POp::continue_op:
3808                 opText = opArg1 +
3809                          " |= Mask(0xFFFFFFFF); LoopMask &= ~(CondMask & LoopMask & RetMask)";
3810                 break;
3811 
3812             default:
3813                 break;
3814         }
3815 
3816         opName = opName.substr(0, 30);
3817         if (!opText.empty()) {
3818             out->writeText(SkSL::String::printf("%-30.*s %s\n",
3819                                                 (int)opName.size(), opName.data(),
3820                                                 opText.c_str()).c_str());
3821         } else {
3822             out->writeText(SkSL::String::printf("%.*s\n",
3823                                                 (int)opName.size(), opName.data()).c_str());
3824         }
3825     }
3826 }
3827 
// Writes a human-readable listing of this program's ops to `out`.
// All formatting state lives in the Dumper helper; this wrapper just
// constructs one for `*this` and forwards both arguments.
// NOTE(review): `writeInstructionCount` presumably requests a leading
// instruction-count summary -- confirm against Dumper::dump.
void Program::dump(SkWStream* out, bool writeInstructionCount) const {
    Dumper(*this).dump(out, writeInstructionCount);
}
3831 
3832 }  // namespace SkSL::RP
3833