/*
 * Copyright 2023 Google LLC
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#ifndef skgpu_graphite_compute_VelloComputeSteps_DEFINED
#define skgpu_graphite_compute_VelloComputeSteps_DEFINED

#include "include/core/SkColorType.h"
#include "include/core/SkSize.h"
#include "include/core/SkSpan.h"
#include "include/private/base/SkTArray.h"
#include "src/gpu/graphite/ComputeTypes.h"
#include "src/gpu/graphite/compute/ComputeStep.h"

#include "third_party/vello/cpp/vello.h"

#include <string_view>

namespace skgpu::graphite {

// This file defines ComputeSteps for all Vello compute stages and their permutations. The
// declaration of each ComputeStep subclass mirrors the name of the pipeline stage as defined in the
// shader metadata.
//
// The compute stages all operate over a shared set of buffer and image resources. The
// `kVelloSlot_*` constant definitions below each uniquely identify a shared resource that must be
// instantiated when assembling the ComputeSteps into a DispatchGroup.
//
// === Monoids and Prefix Sums ===
//
// Vello's GPU algorithms make repeated use of parallel prefix sum techniques. These occur
// frequently in path rasterization (e.g. winding number accumulation across a scanline can be
// thought of as a per-pixel prefix sum) but Vello also uses them to calculate buffer offsets for
// associated entries across its variable-length encoding streams.
//
// For instance, given a scene that contains Bézier paths, each path gets encoded as a transform,
// a sequence of path tags (verbs), and zero or more 2-D points associated with each tag. N paths
// will often map to N transforms, N + M tags, and N + M + L points (where N > 0, M > 0, L >= 0).
// These entries are stored in separate parallel transform, path tag, and path data streams. The
// correspondence between entries of these independent streams is implicit. To keep CPU encoding
// of these streams fast, the offsets into each buffer for a given "path object" are computed
// dynamically and in parallel on the GPU. Since the offsets for each object build additively on
// offsets that appear before it in the stream, parallel computation of offsets can be treated as
// a dynamic programming problem that maps well to parallel prefix sums, where each object is a
// "monoid" (https://en.wikipedia.org/wiki/Monoid) that supports algebraic addition/subtraction
// over data encoded in the path tags themselves.
//
// Once computed, a monoid contains the offsets into the input (and sometimes output) buffers for a
// given object. The parallel prefix sum operation is defined as a monoidal reduce + pre-scan pair.
// (Prefix Sums and Their Applications, Blelloch, G., https://www.cs.cmu.edu/~guyb/papers/Ble93.pdf)
//
// While these concepts are an implementation detail, they are core to the Vello algorithm and are
// reflected in the pipeline names and data slot definitions.
//
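// As a rough illustration (not actual shader code, just the shape of the problem), the offset of
// each path tag into the point stream is an exclusive prefix sum over the number of points that
// each tag consumes:
//
//   uint32_t pointCounts[] = {1, 1, 2, 3};  // e.g. moveTo, lineTo, quadTo, cubicTo
//   uint32_t offsets[4];
//   uint32_t running = 0;
//   for (int i = 0; i < 4; ++i) {
//       offsets[i] = running;        // exclusive scan: start offset for tag i
//       running += pointCounts[i];   // the monoid "sum" accumulated so far
//   }
//   // offsets == {0, 1, 2, 4}; the stages below compute this in parallel via reduce + scan.
//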
// === Full Pipeline ===
//
// The full Vello pipeline stages are as follows and should be dispatched in the following order:
//
// I. Build the path monoid stream:
//   If the input fits within the workgroup size:
//     pathtag_reduce, pathtag_scan_small
//   else
//     pathtag_reduce, pathtag_reduce2, pathtag_scan1, pathtag_scan_large
//
// II. Compute path bounding boxes, flatten path segments into a line soup:
//   bbox_clear, flatten
//
// III. Process the draw object stream to build the draw monoids and inputs to the clip stage:
//   draw_reduce, draw_leaf
//
// IV. Compute the bounding boxes for the clip stack from the input stream, if the scene contains
// clips:
//   clip_reduce, clip_leaf
//
// V. Allocate tile and segment buffers for the individual bins and count the path segments that
// fall into each tile (the *_setup stages compute the arguments for the indirect dispatches that
// follow them):
//   binning, tile_alloc, path_count_setup, path_count
//
// VI. Coarse rasterization:
//   backdrop_dyn, coarse
//
// VII. Assemble the per-tile path segments for fine rasterization:
//   path_tiling_setup, path_tiling
//
// VIII. Fine rasterization:
//   fine
//
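// For orientation, assembling these ComputeSteps into a DispatchGroup looks roughly like the
// pseudocode below (illustrative names only; the real assembly, including workgroup counts,
// indirect dispatches, and binding the kVelloSlot_* resources, lives in VelloRenderer):
//
//   DispatchGroup::Builder builder(recorder);
//   builder.appendStep(&pathtagReduce);     // I.
//   builder.appendStep(&pathtagScanSmall);  // I.
//   builder.appendStep(&bboxClear);         // II.
//   builder.appendStep(&flatten);           // II.
//   builder.appendStep(&drawReduce);        // III.
//   builder.appendStep(&drawLeaf);          // III.
//   ...                                     // IV. - VII.
//   builder.appendStep(&fineArea);          // VIII.
//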
// TODO: Document the coverage mask pipeline once it has been re-implemented.

// ***
// Shared buffers that are accessed by various stages.
//
// The render configuration uniform buffer.
constexpr int kVelloSlot_ConfigUniform = 0;

// The scene encoding buffer.
constexpr int kVelloSlot_Scene = 1;

// ***
// Buffers used during the element processing stage. This stage converts the stream of
// variable-length path tags, transforms, and brushes into a "path monoid" stream containing
// buffer offsets for the subsequent stages that associate the input streams with individual draw
// elements. This stage performs a parallel prefix sum (reduce + scan), which completes in two
// dispatches if the entire input can be processed by a single workgroup per dispatch. Otherwise,
// the algorithm requires two additional dispatches to continue the traversal (this is due to a
// lack of primitives to synchronize execution across workgroups in MSL and WGSL).
//
// Single-pass variant pipelines: pathtag_reduce, pathtag_scan_small
// Multi-pass variant pipelines: pathtag_reduce, pathtag_reduce2, pathtag_scan1, pathtag_scan_large
constexpr int kVelloSlot_TagMonoid = 2;

// Single-pass variant slots:
constexpr int kVelloSlot_PathtagReduceOutput = 3;

// Multi-pass variant slots:
constexpr int kVelloSlot_LargePathtagReduceFirstPassOutput = kVelloSlot_PathtagReduceOutput;
constexpr int kVelloSlot_LargePathtagReduceSecondPassOutput = 4;
constexpr int kVelloSlot_LargePathtagScanFirstPassOutput = 5;

// ***
// The second part of element processing flattens path elements (moveTo, lineTo, quadTo, etc.) and
// computes their bounding boxes. This stage is where strokes get expanded to fills and stroke
// styles get applied. The output is an unordered "line soup" buffer and the tight device-space
// bounding box of each path.
//
// Pipelines: bbox_clear, flatten
constexpr int kVelloSlot_PathBBoxes = 6;
constexpr int kVelloSlot_Lines = 7;

// ***
// The next part prepares the draw object stream (entries in the per-tile command list aka PTCL)
// and additional metadata for the subsequent clipping and binning stages.
//
// Pipelines: draw_reduce, draw_leaf
constexpr int kVelloSlot_DrawReduceOutput = 8;
constexpr int kVelloSlot_DrawMonoid = 9;
constexpr int kVelloSlot_InfoBinData = 10;
constexpr int kVelloSlot_ClipInput = 11;

// ***
// Clipping. The outputs of this stage are the finalized draw monoid and the clip bounding boxes.
// Clipping involves evaluating the stack monoid; see the following references for the meaning of
// these buffers: https://arxiv.org/pdf/2205.11659.pdf,
// https://en.wikipedia.org/wiki/Bicyclic_semigroup
//
// Pipelines: clip_reduce, clip_leaf
constexpr int kVelloSlot_ClipBicyclic = 12;
constexpr int kVelloSlot_ClipElement = 13;
constexpr int kVelloSlot_ClipBBoxes = 14;

// ***
// Buffers containing bump allocated data, the inputs and outputs to the binning, coarse raster,
// and per-tile segment assembly stages.
//
// Pipelines: binning, tile_alloc, path_count, backdrop, coarse, path_tiling
constexpr int kVelloSlot_DrawBBoxes = 15;
constexpr int kVelloSlot_BumpAlloc = 16;
constexpr int kVelloSlot_BinHeader = 17;

constexpr int kVelloSlot_Path = 18;
constexpr int kVelloSlot_Tile = 19;
constexpr int kVelloSlot_SegmentCounts = 20;
constexpr int kVelloSlot_Segments = 21;
constexpr int kVelloSlot_PTCL = 22;

// ***
// Texture resources used by the fine rasterization stage. The gradient image needs to get
// populated on the CPU with pre-computed gradient ramps. The image atlas is intended to hold
// pre-uploaded images that are composited into the scene.
//
// The output image contains the final render.
constexpr int kVelloSlot_OutputImage = 23;
constexpr int kVelloSlot_GradientImage = 24;
constexpr int kVelloSlot_ImageAtlas = 25;

// ***
// The indirect count buffer is used to issue an indirect dispatch of the path count and path
// tiling stages.
constexpr int kVelloSlot_IndirectCount = 26;

// ***
// The sample mask lookup table used in MSAA modes of the fine rasterization stage.
constexpr int kVelloSlot_MaskLUT = 27;

std::string_view VelloStageName(vello_cpp::ShaderStage);
WorkgroupSize VelloStageLocalSize(vello_cpp::ShaderStage);
skia_private::TArray<ComputeStep::WorkgroupBufferDesc> VelloWorkgroupBuffers(
        vello_cpp::ShaderStage);
ComputeStep::NativeShaderSource VelloNativeShaderSource(vello_cpp::ShaderStage,
                                                        ComputeStep::NativeShaderFormat);

template <vello_cpp::ShaderStage S>
class VelloStep : public ComputeStep {
public:
    ~VelloStep() override = default;

    NativeShaderSource nativeShaderSource(NativeShaderFormat format) const override {
        return VelloNativeShaderSource(S, format);
    }

protected:
    explicit VelloStep(SkSpan<const ResourceDesc> resources)
            : ComputeStep(VelloStageName(S),
                          VelloStageLocalSize(S),
                          resources,
                          AsSpan<ComputeStep::WorkgroupBufferDesc>(VelloWorkgroupBuffers(S)),
                          Flags::kSupportsNativeShader) {}

private:
    // Helper that creates a SkSpan from a universal reference to a container. Generally, creating a
    // SkSpan from an rvalue reference is not safe since the pointer stored in the SkSpan will
    // dangle beyond the constructor expression. In our usage in the constructor above,
    // the lifetime of the temporary TArray should match that of the SkSpan, both of which should
    // live through the constructor call expression.
    //
    // From https://en.cppreference.com/w/cpp/language/reference_initialization#Lifetime_of_a_temporary:
    //
    //     a temporary bound to a reference parameter in a function call exists until the end of the
    //     full expression containing that function call
    //
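    // For example, the span created in the constructor above stays valid while the base class
    // constructor runs, but binding the result to a local variable would be a bug:
    //
    //   auto span = AsSpan<ComputeStep::WorkgroupBufferDesc>(VelloWorkgroupBuffers(S));
    //   // `span` now dangles: the temporary TArray was destroyed at the end of the previous
    //   // full expression.
    //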
    template <typename T, typename C>
    static SkSpan<const T> AsSpan(C&& container) {
        return SkSpan(std::data(container), std::size(container));
    }
};

#define VELLO_COMPUTE_STEP(stage)                                                      \
    class Vello##stage##Step final : public VelloStep<vello_cpp::ShaderStage::stage> { \
    public:                                                                            \
        Vello##stage##Step();                                                          \
    };

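// For example, VELLO_COMPUTE_STEP(Coarse) expands to the following declaration (the constructor
// itself is defined out of line):
//
//   class VelloCoarseStep final : public VelloStep<vello_cpp::ShaderStage::Coarse> {
//   public:
//       VelloCoarseStep();
//   };
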
VELLO_COMPUTE_STEP(BackdropDyn);
VELLO_COMPUTE_STEP(BboxClear);
VELLO_COMPUTE_STEP(Binning);
VELLO_COMPUTE_STEP(ClipLeaf);
VELLO_COMPUTE_STEP(ClipReduce);
VELLO_COMPUTE_STEP(Coarse);
VELLO_COMPUTE_STEP(Flatten);
VELLO_COMPUTE_STEP(DrawLeaf);
VELLO_COMPUTE_STEP(DrawReduce);
VELLO_COMPUTE_STEP(PathCount);
VELLO_COMPUTE_STEP(PathCountSetup);
VELLO_COMPUTE_STEP(PathTiling);
VELLO_COMPUTE_STEP(PathTilingSetup);
VELLO_COMPUTE_STEP(PathtagReduce);
VELLO_COMPUTE_STEP(PathtagReduce2);
VELLO_COMPUTE_STEP(PathtagScan1);
VELLO_COMPUTE_STEP(PathtagScanLarge);
VELLO_COMPUTE_STEP(PathtagScanSmall);
VELLO_COMPUTE_STEP(TileAlloc);

#undef VELLO_COMPUTE_STEP

template <vello_cpp::ShaderStage S, SkColorType T> class VelloFineStepBase : public VelloStep<S> {
public:
    // We need to return a texture format for the bound textures.
    std::tuple<SkISize, SkColorType> calculateTextureParameters(
            int index, const ComputeStep::ResourceDesc&) const override {
        SkASSERT(index == 4);
        // TODO: The texture dimensions are unknown here so this method returns 0 for the texture
        // size. In this case this field is unused since VelloRenderer assigns texture resources
        // directly to the DispatchGroupBuilder. The format must still be queried to describe the
        // ComputeStep's binding layout. This method could be improved to enable conditional
        // querying of optional/dynamic parameters.
        return {{}, T};
    }

protected:
    explicit VelloFineStepBase(SkSpan<const ComputeStep::ResourceDesc> resources)
            : VelloStep<S>(resources) {}
};

template <vello_cpp::ShaderStage S, SkColorType T, ::rust::Vec<uint8_t> (*MaskLutBuilder)()>
class VelloFineMsaaStepBase : public VelloFineStepBase<S, T> {
public:
    size_t calculateBufferSize(int resourceIndex, const ComputeStep::ResourceDesc&) const override {
        SkASSERT(resourceIndex == 5);
        return fMaskLut.size();
    }

    void prepareStorageBuffer(int resourceIndex,
                              const ComputeStep::ResourceDesc&,
                              void* buffer,
                              size_t bufferSize) const override {
        SkASSERT(resourceIndex == 5);
        SkASSERT(fMaskLut.size() == bufferSize);
        memcpy(buffer, fMaskLut.data(), fMaskLut.size());
    }

protected:
    explicit VelloFineMsaaStepBase(SkSpan<const ComputeStep::ResourceDesc> resources)
            : VelloFineStepBase<S, T>(resources), fMaskLut(MaskLutBuilder()) {}

private:
    ::rust::Vec<uint8_t> fMaskLut;
};

class VelloFineAreaStep final
        : public VelloFineStepBase<vello_cpp::ShaderStage::FineArea, kRGBA_8888_SkColorType> {
public:
    VelloFineAreaStep();
};

class VelloFineAreaAlpha8Step final
        : public VelloFineStepBase<vello_cpp::ShaderStage::FineAreaR8, kAlpha_8_SkColorType> {
public:
    VelloFineAreaAlpha8Step();
};

class VelloFineMsaa16Step final : public VelloFineMsaaStepBase<vello_cpp::ShaderStage::FineMsaa16,
                                                               kRGBA_8888_SkColorType,
                                                               vello_cpp::build_mask_lut_16> {
public:
    VelloFineMsaa16Step();
};

class VelloFineMsaa16Alpha8Step final
        : public VelloFineMsaaStepBase<vello_cpp::ShaderStage::FineMsaa16R8,
                                       kAlpha_8_SkColorType,
                                       vello_cpp::build_mask_lut_16> {
public:
    VelloFineMsaa16Alpha8Step();
};

class VelloFineMsaa8Step final : public VelloFineMsaaStepBase<vello_cpp::ShaderStage::FineMsaa8,
                                                              kRGBA_8888_SkColorType,
                                                              vello_cpp::build_mask_lut_8> {
public:
    VelloFineMsaa8Step();
};

class VelloFineMsaa8Alpha8Step final
        : public VelloFineMsaaStepBase<vello_cpp::ShaderStage::FineMsaa8R8,
                                       kAlpha_8_SkColorType,
                                       vello_cpp::build_mask_lut_8> {
public:
    VelloFineMsaa8Alpha8Step();
};

}  // namespace skgpu::graphite

#endif  // skgpu_graphite_compute_VelloComputeSteps_DEFINED