1 /*
2 * Copyright 2024 Google LLC
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "src/core/SkBlurEngine.h"
9
10 #include "include/core/SkAlphaType.h"
11 #include "include/core/SkBitmap.h"
12 #include "include/core/SkBlendMode.h"
13 #include "include/core/SkClipOp.h"
14 #include "include/core/SkColor.h"
15 #include "include/core/SkColorSpace.h" // IWYU pragma: keep
16 #include "include/core/SkColorType.h"
17 #include "include/core/SkImageInfo.h"
18 #include "include/core/SkM44.h"
19 #include "include/core/SkMatrix.h"
20 #include "include/core/SkPaint.h"
21 #include "include/core/SkPoint.h"
22 #include "include/core/SkRect.h"
23 #include "include/core/SkSamplingOptions.h"
24 #include "include/core/SkScalar.h"
25 #include "include/core/SkSurfaceProps.h"
26 #include "include/core/SkTileMode.h"
27 #include "include/effects/SkRuntimeEffect.h"
28 #include "include/private/base/SkAssert.h"
29 #include "include/private/base/SkFeatures.h"
30 #include "include/private/base/SkMalloc.h"
31 #include "include/private/base/SkMath.h"
32 #include "include/private/base/SkTo.h"
33 #include "src/base/SkArenaAlloc.h"
34 #include "src/base/SkVx.h"
35 #include "src/core/SkBitmapDevice.h"
36 #include "src/core/SkDevice.h"
37 #include "src/core/SkKnownRuntimeEffects.h"
38 #include "src/core/SkSpecialImage.h"
39
40 #include <algorithm>
41 #include <array>
42 #include <cmath>
43 #include <cstdint>
44 #include <cstring>
45 #include <utility>
46
47
// SK_PREFETCH(ptr) issues a prefetch hint for 'ptr' when the toolchain offers one: the SSE
// intrinsic when SSE1 or better is enabled, otherwise the GCC/Clang builtin. On any other
// compiler the macro expands to nothing.
#if SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SSE1
    #include <xmmintrin.h>
    #define SK_PREFETCH(ptr) _mm_prefetch(reinterpret_cast<const char*>(ptr), _MM_HINT_T0)
#elif defined(__GNUC__)
    #define SK_PREFETCH(ptr) __builtin_prefetch(ptr)
#else
    #define SK_PREFETCH(ptr)
#endif
56
57 // RasterBlurEngine
58 // ----------------------------------------------------------------------------
59
60 namespace {
61
62 class Pass {
63 public:
Pass(int border)64 explicit Pass(int border) : fBorder(border) {}
65 virtual ~Pass() = default;
66
blur(int srcLeft,int srcRight,int dstRight,const uint32_t * src,int srcStride,uint32_t * dst,int dstStride)67 void blur(int srcLeft, int srcRight, int dstRight,
68 const uint32_t* src, int srcStride,
69 uint32_t* dst, int dstStride) {
70 this->startBlur();
71
72 auto srcStart = srcLeft - fBorder,
73 srcEnd = srcRight - fBorder,
74 dstEnd = dstRight,
75 srcIdx = srcStart,
76 dstIdx = 0;
77
78 const uint32_t* srcCursor = src;
79 uint32_t* dstCursor = dst;
80
81 if (dstIdx < srcIdx) {
82 // The destination pixels are not effected by the src pixels,
83 // change to zero as per the spec.
84 // https://drafts.fxtf.org/filter-effects/#FilterPrimitivesOverviewIntro
85 int commonEnd = std::min(srcIdx, dstEnd);
86 while (dstIdx < commonEnd) {
87 *dstCursor = 0;
88 dstCursor += dstStride;
89 SK_PREFETCH(dstCursor);
90 dstIdx++;
91 }
92 } else if (srcIdx < dstIdx) {
93 // The edge of the source is before the edge of the destination. Calculate the sums for
94 // the pixels before the start of the destination.
95 if (int commonEnd = std::min(dstIdx, srcEnd); srcIdx < commonEnd) {
96 // Preload the blur with values from src before dst is entered.
97 int n = commonEnd - srcIdx;
98 this->blurSegment(n, srcCursor, srcStride, nullptr, 0);
99 srcIdx += n;
100 srcCursor += n * srcStride;
101 }
102 if (srcIdx < dstIdx) {
103 // The weird case where src is out of pixels before dst is even started.
104 int n = dstIdx - srcIdx;
105 this->blurSegment(n, nullptr, 0, nullptr, 0);
106 srcIdx += n;
107 }
108 }
109
110 if (int commonEnd = std::min(dstEnd, srcEnd); dstIdx < commonEnd) {
111 // Both srcIdx and dstIdx are in sync now, and can run in a 1:1 fashion. This is the
112 // normal mode of operation.
113 SkASSERT(srcIdx == dstIdx);
114
115 int n = commonEnd - dstIdx;
116 this->blurSegment(n, srcCursor, srcStride, dstCursor, dstStride);
117 srcCursor += n * srcStride;
118 dstCursor += n * dstStride;
119 dstIdx += n;
120 srcIdx += n;
121 }
122
123 // Drain the remaining blur values into dst assuming 0's for the leading edge.
124 if (dstIdx < dstEnd) {
125 int n = dstEnd - dstIdx;
126 this->blurSegment(n, nullptr, 0, dstCursor, dstStride);
127 }
128 }
129
130 protected:
131 virtual void startBlur() = 0;
132 virtual void blurSegment(
133 int n, const uint32_t* src, int srcStride, uint32_t* dst, int dstStride) = 0;
134
135 private:
136 const int fBorder;
137 };
138
139 class PassMaker {
140 public:
PassMaker(int window)141 explicit PassMaker(int window) : fWindow{window} {}
142 virtual ~PassMaker() = default;
143 virtual Pass* makePass(void* buffer, SkArenaAlloc* alloc) const = 0;
144 virtual size_t bufferSizeBytes() const = 0;
window() const145 int window() const {return fWindow;}
146
147 private:
148 const int fWindow;
149 };
150
151 // Implement a scanline processor that uses a three-box filter to approximate a Gaussian blur.
152 // The GaussPass is limit to processing sigmas < 135.
153 class GaussPass final : public Pass {
154 public:
155 // NB 136 is the largest sigma that will not cause a buffer full of 255 mask values to overflow
156 // using the Gauss filter. It also limits the size of buffers used hold intermediate values.
157 // Explanation of maximums:
158 // sum0 = window * 255
159 // sum1 = window * sum0 -> window * window * 255
160 // sum2 = window * sum1 -> window * window * window * 255 -> window^3 * 255
161 //
162 // The value window^3 * 255 must fit in a uint32_t. So,
163 // window^3 < 2^32. window = 255.
164 //
165 // window = floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5)
166 // For window <= 255, the largest value for sigma is 136.
MakeMaker(float sigma,SkArenaAlloc * alloc)167 static PassMaker* MakeMaker(float sigma, SkArenaAlloc* alloc) {
168 SkASSERT(0 <= sigma);
169 int window = SkBlurEngine::BoxBlurWindow(sigma);
170 if (255 <= window) {
171 return nullptr;
172 }
173
174 class Maker : public PassMaker {
175 public:
176 explicit Maker(int window) : PassMaker{window} {}
177 Pass* makePass(void* buffer, SkArenaAlloc* alloc) const override {
178 return GaussPass::Make(this->window(), buffer, alloc);
179 }
180
181 size_t bufferSizeBytes() const override {
182 int window = this->window();
183 size_t onePassSize = window - 1;
184 // If the window is odd, then there is an obvious middle element. For even sizes
185 // 2 passes are shifted, and the last pass has an extra element. Like this:
186 // S
187 // aaaAaa
188 // bbBbbb
189 // cccCccc
190 // D
191 size_t bufferCount = (window & 1) == 1 ? 3 * onePassSize : 3 * onePassSize + 1;
192 return bufferCount * sizeof(skvx::Vec<4, uint32_t>);
193 }
194 };
195
196 return alloc->make<Maker>(window);
197 }
198
Make(int window,void * buffers,SkArenaAlloc * alloc)199 static GaussPass* Make(int window, void* buffers, SkArenaAlloc* alloc) {
200 // We don't need to store the trailing edge pixel in the buffer;
201 int passSize = window - 1;
202 skvx::Vec<4, uint32_t>* buffer0 = static_cast<skvx::Vec<4, uint32_t>*>(buffers);
203 skvx::Vec<4, uint32_t>* buffer1 = buffer0 + passSize;
204 skvx::Vec<4, uint32_t>* buffer2 = buffer1 + passSize;
205 // If the window is odd just one buffer is needed, but if it's even, then there is one
206 // more element on that pass.
207 skvx::Vec<4, uint32_t>* buffersEnd = buffer2 + ((window & 1) ? passSize : passSize + 1);
208
209 // Calculating the border is tricky. The border is the distance in pixels between the first
210 // dst pixel and the first src pixel (or the last src pixel and the last dst pixel).
211 // I will go through the odd case which is simpler, and then through the even case. Given a
212 // stack of filters seven wide for the odd case of three passes.
213 //
214 // S
215 // aaaAaaa
216 // bbbBbbb
217 // cccCccc
218 // D
219 //
220 // The furthest changed pixel is when the filters are in the following configuration.
221 //
222 // S
223 // aaaAaaa
224 // bbbBbbb
225 // cccCccc
226 // D
227 //
228 // The A pixel is calculated using the value S, the B uses A, and the C uses B, and
229 // finally D is C. So, with a window size of seven the border is nine. In the odd case, the
230 // border is 3*((window - 1)/2).
231 //
232 // For even cases the filter stack is more complicated. The spec specifies two passes
233 // of even filters and a final pass of odd filters. A stack for a width of six looks like
234 // this.
235 //
236 // S
237 // aaaAaa
238 // bbBbbb
239 // cccCccc
240 // D
241 //
242 // The furthest pixel looks like this.
243 //
244 // S
245 // aaaAaa
246 // bbBbbb
247 // cccCccc
248 // D
249 //
250 // For a window of six, the border value is eight. In the even case the border is 3 *
251 // (window/2) - 1.
252 int border = (window & 1) == 1 ? 3 * ((window - 1) / 2) : 3 * (window / 2) - 1;
253
254 // If the window is odd then the divisor is just window ^ 3 otherwise,
255 // it is window * window * (window + 1) = window ^ 3 + window ^ 2;
256 int window2 = window * window;
257 int window3 = window2 * window;
258 int divisor = (window & 1) == 1 ? window3 : window3 + window2;
259 return alloc->make<GaussPass>(buffer0, buffer1, buffer2, buffersEnd, border, divisor);
260 }
261
GaussPass(skvx::Vec<4,uint32_t> * buffer0,skvx::Vec<4,uint32_t> * buffer1,skvx::Vec<4,uint32_t> * buffer2,skvx::Vec<4,uint32_t> * buffersEnd,int border,int divisor)262 GaussPass(skvx::Vec<4, uint32_t>* buffer0,
263 skvx::Vec<4, uint32_t>* buffer1,
264 skvx::Vec<4, uint32_t>* buffer2,
265 skvx::Vec<4, uint32_t>* buffersEnd,
266 int border,
267 int divisor)
268 : Pass{border}
269 , fBuffer0{buffer0}
270 , fBuffer1{buffer1}
271 , fBuffer2{buffer2}
272 , fBuffersEnd{buffersEnd}
273 , fDivider(divisor) {}
274
275 private:
startBlur()276 void startBlur() override {
277 skvx::Vec<4, uint32_t> zero = {0u, 0u, 0u, 0u};
278 zero.store(fSum0);
279 zero.store(fSum1);
280 auto half = fDivider.half();
281 skvx::Vec<4, uint32_t>{half, half, half, half}.store(fSum2);
282 sk_bzero(fBuffer0, (fBuffersEnd - fBuffer0) * sizeof(skvx::Vec<4, uint32_t>));
283
284 fBuffer0Cursor = fBuffer0;
285 fBuffer1Cursor = fBuffer1;
286 fBuffer2Cursor = fBuffer2;
287 }
288
289 // GaussPass implements the common three pass box filter approximation of Gaussian blur,
290 // but combines all three passes into a single pass. This approach is facilitated by three
291 // circular buffers the width of the window which track values for trailing edges of each of
292 // the three passes. This allows the algorithm to use more precision in the calculation
293 // because the values are not rounded each pass. And this implementation also avoids a trap
294 // that's easy to fall into resulting in blending in too many zeroes near the edge.
295 //
296 // In general, a window sum has the form:
297 // sum_n+1 = sum_n + leading_edge - trailing_edge.
298 // If instead we do the subtraction at the end of the previous iteration, we can just
299 // calculate the sums instead of having to do the subtractions too.
300 //
301 // In previous iteration:
302 // sum_n+1 = sum_n - trailing_edge.
303 //
304 // In this iteration:
305 // sum_n+1 = sum_n + leading_edge.
306 //
307 // Now we can stack all three sums and do them at once. Sum0 gets its leading edge from the
308 // actual data. Sum1's leading edge is just Sum0, and Sum2's leading edge is Sum1. So, doing the
309 // three passes at the same time has the form:
310 //
311 // sum0_n+1 = sum0_n + leading edge
312 // sum1_n+1 = sum1_n + sum0_n+1
313 // sum2_n+1 = sum2_n + sum1_n+1
314 //
315 // sum2_n+1 / window^3 is the new value of the destination pixel.
316 //
317 // Reduce the sums by the trailing edges which were stored in the circular buffers for the
318 // next go around. This is the case for odd sized windows, even windows the the third
319 // circular buffer is one larger then the first two circular buffers.
320 //
321 // sum2_n+2 = sum2_n+1 - buffer2[i];
322 // buffer2[i] = sum1;
323 // sum1_n+2 = sum1_n+1 - buffer1[i];
324 // buffer1[i] = sum0;
325 // sum0_n+2 = sum0_n+1 - buffer0[i];
326 // buffer0[i] = leading edge
blurSegment(int n,const uint32_t * src,int srcStride,uint32_t * dst,int dstStride)327 void blurSegment(
328 int n, const uint32_t* src, int srcStride, uint32_t* dst, int dstStride) override {
329 #if SK_CPU_LSX_LEVEL >= SK_CPU_LSX_LEVEL_LSX
330 skvx::Vec<4, uint32_t>* buffer0Cursor = fBuffer0Cursor;
331 skvx::Vec<4, uint32_t>* buffer1Cursor = fBuffer1Cursor;
332 skvx::Vec<4, uint32_t>* buffer2Cursor = fBuffer2Cursor;
333 v4u32 sum0 = __lsx_vld(fSum0, 0); // same as skvx::Vec<4, uint32_t>::Load(fSum0);
334 v4u32 sum1 = __lsx_vld(fSum1, 0);
335 v4u32 sum2 = __lsx_vld(fSum2, 0);
336
337 auto processValue = [&](v4u32& vLeadingEdge){
338 sum0 += vLeadingEdge;
339 sum1 += sum0;
340 sum2 += sum1;
341
342 v4u32 divisorFactor = __lsx_vreplgr2vr_w(fDivider.divisorFactor());
343 v4u32 blurred = __lsx_vmuh_w(divisorFactor, sum2);
344
345 v4u32 buffer2Value = __lsx_vld(buffer2Cursor, 0); //Not fBuffer0Cursor, out of bounds.
346 sum2 -= buffer2Value;
347 __lsx_vst(sum1, (void *)buffer2Cursor, 0);
348 buffer2Cursor = (buffer2Cursor + 1) < fBuffersEnd ? buffer2Cursor + 1 : fBuffer2;
349 v4u32 buffer1Value = __lsx_vld(buffer1Cursor, 0);
350 sum1 -= buffer1Value;
351 __lsx_vst(sum0, (void *)buffer1Cursor, 0);
352 buffer1Cursor = (buffer1Cursor + 1) < fBuffer2 ? buffer1Cursor + 1 : fBuffer1;
353 v4u32 buffer0Value = __lsx_vld(buffer0Cursor, 0);
354 sum0 -= buffer0Value;
355 __lsx_vst(vLeadingEdge, (void *)buffer0Cursor, 0);
356 buffer0Cursor = (buffer0Cursor + 1) < fBuffer1 ? buffer0Cursor + 1 : fBuffer0;
357
358 v16u8 shuf = {0x0,0x4,0x8,0xc,0x0};
359 v16u8 ret = __lsx_vshuf_b(blurred, blurred, shuf);
360 return ret;
361 };
362
363 v4u32 zero = __lsx_vldi(0x0);
364 if (!src && !dst) {
365 while (n --> 0) {
366 (void)processValue(zero);
367 }
368 } else if (src && !dst) {
369 while (n --> 0) {
370 v4u32 edge = __lsx_vinsgr2vr_w(zero, *src, 0);
371 edge = __lsx_vilvl_b(zero, edge);
372 edge = __lsx_vilvl_h(zero, edge);
373 (void)processValue(edge);
374 src += srcStride;
375 }
376 } else if (!src && dst) {
377 while (n --> 0) {
378 v4u32 ret = processValue(zero);
379 __lsx_vstelm_w(ret, dst, 0, 0); // 3rd is offset, 4th is idx.
380 dst += dstStride;
381 }
382 } else if (src && dst) {
383 while (n --> 0) {
384 v4u32 edge = __lsx_vinsgr2vr_w(zero, *src, 0);
385 edge = __lsx_vilvl_b(zero, edge);
386 edge = __lsx_vilvl_h(zero, edge);
387 v4u32 ret = processValue(edge);
388 __lsx_vstelm_w(ret, dst, 0, 0);
389 src += srcStride;
390 dst += dstStride;
391 }
392 }
393
394 // Store the state
395 fBuffer0Cursor = buffer0Cursor;
396 fBuffer1Cursor = buffer1Cursor;
397 fBuffer2Cursor = buffer2Cursor;
398
399 __lsx_vst(sum0, fSum0, 0);
400 __lsx_vst(sum1, fSum1, 0);
401 __lsx_vst(sum2, fSum2, 0);
402 #else
403 skvx::Vec<4, uint32_t>* buffer0Cursor = fBuffer0Cursor;
404 skvx::Vec<4, uint32_t>* buffer1Cursor = fBuffer1Cursor;
405 skvx::Vec<4, uint32_t>* buffer2Cursor = fBuffer2Cursor;
406 skvx::Vec<4, uint32_t> sum0 = skvx::Vec<4, uint32_t>::Load(fSum0);
407 skvx::Vec<4, uint32_t> sum1 = skvx::Vec<4, uint32_t>::Load(fSum1);
408 skvx::Vec<4, uint32_t> sum2 = skvx::Vec<4, uint32_t>::Load(fSum2);
409
410 // Given an expanded input pixel, move the window ahead using the leadingEdge value.
411 auto processValue = [&](const skvx::Vec<4, uint32_t>& leadingEdge) {
412 sum0 += leadingEdge;
413 sum1 += sum0;
414 sum2 += sum1;
415
416 skvx::Vec<4, uint32_t> blurred = fDivider.divide(sum2);
417
418 sum2 -= *buffer2Cursor;
419 *buffer2Cursor = sum1;
420 buffer2Cursor = (buffer2Cursor + 1) < fBuffersEnd ? buffer2Cursor + 1 : fBuffer2;
421 sum1 -= *buffer1Cursor;
422 *buffer1Cursor = sum0;
423 buffer1Cursor = (buffer1Cursor + 1) < fBuffer2 ? buffer1Cursor + 1 : fBuffer1;
424 sum0 -= *buffer0Cursor;
425 *buffer0Cursor = leadingEdge;
426 buffer0Cursor = (buffer0Cursor + 1) < fBuffer1 ? buffer0Cursor + 1 : fBuffer0;
427
428 return skvx::cast<uint8_t>(blurred);
429 };
430
431 auto loadEdge = [&](const uint32_t* srcCursor) {
432 return skvx::cast<uint32_t>(skvx::Vec<4, uint8_t>::Load(srcCursor));
433 };
434
435 if (!src && !dst) {
436 while (n --> 0) {
437 (void)processValue(0);
438 }
439 } else if (src && !dst) {
440 while (n --> 0) {
441 (void)processValue(loadEdge(src));
442 src += srcStride;
443 }
444 } else if (!src && dst) {
445 while (n --> 0) {
446 processValue(0u).store(dst);
447 dst += dstStride;
448 }
449 } else if (src && dst) {
450 while (n --> 0) {
451 processValue(loadEdge(src)).store(dst);
452 src += srcStride;
453 dst += dstStride;
454 }
455 }
456
457 // Store the state
458 fBuffer0Cursor = buffer0Cursor;
459 fBuffer1Cursor = buffer1Cursor;
460 fBuffer2Cursor = buffer2Cursor;
461
462 sum0.store(fSum0);
463 sum1.store(fSum1);
464 sum2.store(fSum2);
465 #endif
466 }
467
468 skvx::Vec<4, uint32_t>* const fBuffer0;
469 skvx::Vec<4, uint32_t>* const fBuffer1;
470 skvx::Vec<4, uint32_t>* const fBuffer2;
471 skvx::Vec<4, uint32_t>* const fBuffersEnd;
472 const skvx::ScaledDividerU32 fDivider;
473
474 // blur state
475 char fSum0[sizeof(skvx::Vec<4, uint32_t>)];
476 char fSum1[sizeof(skvx::Vec<4, uint32_t>)];
477 char fSum2[sizeof(skvx::Vec<4, uint32_t>)];
478 skvx::Vec<4, uint32_t>* fBuffer0Cursor;
479 skvx::Vec<4, uint32_t>* fBuffer1Cursor;
480 skvx::Vec<4, uint32_t>* fBuffer2Cursor;
481 };
482
483 // Implement a scanline processor that uses a two-box filter to approximate a Tent filter.
484 // The TentPass is limit to processing sigmas < 2183.
485 class TentPass final : public Pass {
486 public:
487 // NB 2183 is the largest sigma that will not cause a buffer full of 255 mask values to overflow
488 // using the Tent filter. It also limits the size of buffers used hold intermediate values.
489 // Explanation of maximums:
490 // sum0 = window * 255
491 // sum1 = window * sum0 -> window * window * 255
492 //
493 // The value window^2 * 255 must fit in a uint32_t. So,
494 // window^2 < 2^32. window = 4104.
495 //
496 // window = floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5)
497 // For window <= 4104, the largest value for sigma is 2183.
MakeMaker(float sigma,SkArenaAlloc * alloc)498 static PassMaker* MakeMaker(float sigma, SkArenaAlloc* alloc) {
499 SkASSERT(0 <= sigma);
500 int gaussianWindow = SkBlurEngine::BoxBlurWindow(sigma);
501 // This is a naive method of using the window size for the Gaussian blur to calculate the
502 // window size for the Tent blur. This seems to work well in practice.
503 //
504 // We can use a single pixel to generate the effective blur area given a window size. For
505 // the Gaussian blur this is 3 * window size. For the Tent filter this is 2 * window size.
506 int tentWindow = 3 * gaussianWindow / 2;
507 if (tentWindow >= 4104) {
508 return nullptr;
509 }
510
511 class Maker : public PassMaker {
512 public:
513 explicit Maker(int window) : PassMaker{window} {}
514 Pass* makePass(void* buffer, SkArenaAlloc* alloc) const override {
515 return TentPass::Make(this->window(), buffer, alloc);
516 }
517
518 size_t bufferSizeBytes() const override {
519 size_t onePassSize = this->window() - 1;
520 // If the window is odd, then there is an obvious middle element. For even sizes 2
521 // passes are shifted, and the last pass has an extra element. Like this:
522 // S
523 // aaaAaa
524 // bbBbbb
525 // D
526 size_t bufferCount = 2 * onePassSize;
527 return bufferCount * sizeof(skvx::Vec<4, uint32_t>);
528 }
529 };
530
531 return alloc->make<Maker>(tentWindow);
532 }
533
Make(int window,void * buffers,SkArenaAlloc * alloc)534 static TentPass* Make(int window, void* buffers, SkArenaAlloc* alloc) {
535 if (window > 4104) {
536 return nullptr;
537 }
538
539 // We don't need to store the trailing edge pixel in the buffer;
540 int passSize = window - 1;
541 skvx::Vec<4, uint32_t>* buffer0 = static_cast<skvx::Vec<4, uint32_t>*>(buffers);
542 skvx::Vec<4, uint32_t>* buffer1 = buffer0 + passSize;
543 skvx::Vec<4, uint32_t>* buffersEnd = buffer1 + passSize;
544
545 // Calculating the border is tricky. The border is the distance in pixels between the first
546 // dst pixel and the first src pixel (or the last src pixel and the last dst pixel).
547 // I will go through the odd case which is simpler, and then through the even case. Given a
548 // stack of filters seven wide for the odd case of three passes.
549 //
550 // S
551 // aaaAaaa
552 // bbbBbbb
553 // D
554 //
555 // The furthest changed pixel is when the filters are in the following configuration.
556 //
557 // S
558 // aaaAaaa
559 // bbbBbbb
560 // D
561 //
562 // The A pixel is calculated using the value S, the B uses A, and the D uses B.
563 // So, with a window size of seven the border is nine. In the odd case, the border is
564 // window - 1.
565 //
566 // For even cases the filter stack is more complicated. It uses two passes
567 // of even filters offset from each other. A stack for a width of six looks like
568 // this.
569 //
570 // S
571 // aaaAaa
572 // bbBbbb
573 // D
574 //
575 // The furthest pixel looks like this.
576 //
577 // S
578 // aaaAaa
579 // bbBbbb
580 // D
581 //
582 // For a window of six, the border value is 5. In the even case the border is
583 // window - 1.
584 int border = window - 1;
585
586 int divisor = window * window;
587 return alloc->make<TentPass>(buffer0, buffer1, buffersEnd, border, divisor);
588 }
589
TentPass(skvx::Vec<4,uint32_t> * buffer0,skvx::Vec<4,uint32_t> * buffer1,skvx::Vec<4,uint32_t> * buffersEnd,int border,int divisor)590 TentPass(skvx::Vec<4, uint32_t>* buffer0,
591 skvx::Vec<4, uint32_t>* buffer1,
592 skvx::Vec<4, uint32_t>* buffersEnd,
593 int border,
594 int divisor)
595 : Pass{border}
596 , fBuffer0{buffer0}
597 , fBuffer1{buffer1}
598 , fBuffersEnd{buffersEnd}
599 , fDivider(divisor) {}
600
601 private:
startBlur()602 void startBlur() override {
603 skvx::Vec<4, uint32_t>{0u, 0u, 0u, 0u}.store(fSum0);
604 auto half = fDivider.half();
605 skvx::Vec<4, uint32_t>{half, half, half, half}.store(fSum1);
606 sk_bzero(fBuffer0, (fBuffersEnd - fBuffer0) * sizeof(skvx::Vec<4, uint32_t>));
607
608 fBuffer0Cursor = fBuffer0;
609 fBuffer1Cursor = fBuffer1;
610 }
611
612 // TentPass implements the common two pass box filter approximation of Tent filter,
613 // but combines all both passes into a single pass. This approach is facilitated by two
614 // circular buffers the width of the window which track values for trailing edges of each of
615 // both passes. This allows the algorithm to use more precision in the calculation
616 // because the values are not rounded each pass. And this implementation also avoids a trap
617 // that's easy to fall into resulting in blending in too many zeroes near the edge.
618 //
619 // In general, a window sum has the form:
620 // sum_n+1 = sum_n + leading_edge - trailing_edge.
621 // If instead we do the subtraction at the end of the previous iteration, we can just
622 // calculate the sums instead of having to do the subtractions too.
623 //
624 // In previous iteration:
625 // sum_n+1 = sum_n - trailing_edge.
626 //
627 // In this iteration:
628 // sum_n+1 = sum_n + leading_edge.
629 //
630 // Now we can stack all three sums and do them at once. Sum0 gets its leading edge from the
631 // actual data. Sum1's leading edge is just Sum0, and Sum2's leading edge is Sum1. So, doing the
632 // three passes at the same time has the form:
633 //
634 // sum0_n+1 = sum0_n + leading edge
635 // sum1_n+1 = sum1_n + sum0_n+1
636 //
637 // sum1_n+1 / window^2 is the new value of the destination pixel.
638 //
639 // Reduce the sums by the trailing edges which were stored in the circular buffers for the
640 // next go around.
641 //
642 // sum1_n+2 = sum1_n+1 - buffer1[i];
643 // buffer1[i] = sum0;
644 // sum0_n+2 = sum0_n+1 - buffer0[i];
645 // buffer0[i] = leading edge
blurSegment(int n,const uint32_t * src,int srcStride,uint32_t * dst,int dstStride)646 void blurSegment(
647 int n, const uint32_t* src, int srcStride, uint32_t* dst, int dstStride) override {
648 skvx::Vec<4, uint32_t>* buffer0Cursor = fBuffer0Cursor;
649 skvx::Vec<4, uint32_t>* buffer1Cursor = fBuffer1Cursor;
650 skvx::Vec<4, uint32_t> sum0 = skvx::Vec<4, uint32_t>::Load(fSum0);
651 skvx::Vec<4, uint32_t> sum1 = skvx::Vec<4, uint32_t>::Load(fSum1);
652
653 // Given an expanded input pixel, move the window ahead using the leadingEdge value.
654 auto processValue = [&](const skvx::Vec<4, uint32_t>& leadingEdge) {
655 sum0 += leadingEdge;
656 sum1 += sum0;
657
658 skvx::Vec<4, uint32_t> blurred = fDivider.divide(sum1);
659
660 sum1 -= *buffer1Cursor;
661 *buffer1Cursor = sum0;
662 buffer1Cursor = (buffer1Cursor + 1) < fBuffersEnd ? buffer1Cursor + 1 : fBuffer1;
663 sum0 -= *buffer0Cursor;
664 *buffer0Cursor = leadingEdge;
665 buffer0Cursor = (buffer0Cursor + 1) < fBuffer1 ? buffer0Cursor + 1 : fBuffer0;
666
667 return skvx::cast<uint8_t>(blurred);
668 };
669
670 auto loadEdge = [&](const uint32_t* srcCursor) {
671 return skvx::cast<uint32_t>(skvx::Vec<4, uint8_t>::Load(srcCursor));
672 };
673
674 if (!src && !dst) {
675 while (n --> 0) {
676 (void)processValue(0);
677 }
678 } else if (src && !dst) {
679 while (n --> 0) {
680 (void)processValue(loadEdge(src));
681 src += srcStride;
682 }
683 } else if (!src && dst) {
684 while (n --> 0) {
685 processValue(0u).store(dst);
686 dst += dstStride;
687 }
688 } else if (src && dst) {
689 while (n --> 0) {
690 processValue(loadEdge(src)).store(dst);
691 src += srcStride;
692 dst += dstStride;
693 }
694 }
695
696 // Store the state
697 fBuffer0Cursor = buffer0Cursor;
698 fBuffer1Cursor = buffer1Cursor;
699 sum0.store(fSum0);
700 sum1.store(fSum1);
701 }
702
703 skvx::Vec<4, uint32_t>* const fBuffer0;
704 skvx::Vec<4, uint32_t>* const fBuffer1;
705 skvx::Vec<4, uint32_t>* const fBuffersEnd;
706 const skvx::ScaledDividerU32 fDivider;
707
708 // blur state
709 char fSum0[sizeof(skvx::Vec<4, uint32_t>)];
710 char fSum1[sizeof(skvx::Vec<4, uint32_t>)];
711 skvx::Vec<4, uint32_t>* fBuffer0Cursor;
712 skvx::Vec<4, uint32_t>* fBuffer1Cursor;
713 };
714
715 class Raster8888BlurAlgorithm : public SkBlurEngine::Algorithm {
716 public:
717 // See analysis in description of TentPass for the max supported sigma.
maxSigma() const718 float maxSigma() const override {
719 // TentPass supports a sigma up to 2183, and was added so that the CPU blur algorithm's
720 // blur radius was as large as that supported by the GPU. GaussPass only supports up to 136.
721 // However, there is a very apparent pop in blur weight when switching from successive box
722 // blurs to the tent filter. The TentPass is preserved for legacy blurs, which do not use
723 // FilterResult::rescale(). However, using kMaxSigma = 135 with the raster SkBlurEngine
724 // ensures that the non-legacy raster blurs will always use the GaussPass implementation.
725 // This is about 6-7x faster on large blurs to rescale a few times to a lower resolution
726 // than it is to evaluate the much larger original window.
727 static constexpr float kMaxSigma = 135.f;
728 SkASSERT(SkBlurEngine::BoxBlurWindow(kMaxSigma) <= 255); // see GaussPass::MakeMaker().
729 return kMaxSigma;
730 }
731
732 // TODO: Implement CPU backend for different fTileMode. This is still worth doing inline with
733 // the blur; at the moment the tiling is applied via the CropImageFilter and carried as metadata
734 // on the FilterResult. This is forcefully applied in FilterResult::Builder::blur() when
735 // supportsOnlyDecalTiling() returns true.
supportsOnlyDecalTiling() const736 bool supportsOnlyDecalTiling() const override { return true; }
737
blur(SkSize sigma,sk_sp<SkSpecialImage> input,const SkIRect & originalSrcBounds,SkTileMode tileMode,const SkIRect & originalDstBounds) const738 sk_sp<SkSpecialImage> blur(SkSize sigma,
739 sk_sp<SkSpecialImage> input,
740 const SkIRect& originalSrcBounds,
741 SkTileMode tileMode,
742 const SkIRect& originalDstBounds) const override {
743 // TODO: Enable this assert when the TentPass is no longer used for legacy blurs
744 // (which supports blur sigmas larger than what's reported in maxSigma()).
745 // SkASSERT(sigma.width() <= this->maxSigma() && sigma.height() <= this->maxSigma());
746 SkASSERT(tileMode == SkTileMode::kDecal);
747
748 SkASSERT(SkIRect::MakeSize(input->dimensions()).contains(originalSrcBounds));
749
750 SkBitmap src;
751 if (!SkSpecialImages::AsBitmap(input.get(), &src)) {
752 return nullptr; // Should only have been called by CPU-backed images
753 }
754 // The blur engine should not have picked this algorithm for a non-32-bit color type
755 SkASSERT(src.colorType() == kRGBA_8888_SkColorType ||
756 src.colorType() == kBGRA_8888_SkColorType);
757
758 SkSTArenaAlloc<1024> alloc;
759 auto makeMaker = [&](float sigma) -> PassMaker* {
760 SkASSERT(0 <= sigma && sigma <= 2183); // should be guaranteed after map_sigma
761 if (PassMaker* maker = GaussPass::MakeMaker(sigma, &alloc)) {
762 return maker;
763 }
764 if (PassMaker* maker = TentPass::MakeMaker(sigma, &alloc)) {
765 return maker;
766 }
767 SK_ABORT("Sigma is out of range.");
768 };
769
770 PassMaker* makerX = makeMaker(sigma.width());
771 PassMaker* makerY = makeMaker(sigma.height());
772 // A blur with a sigma smaller than the successive box-blurs accuracy should have been
773 // routed to the shader-based algorithm.
774 SkASSERT(makerX->window() > 1 || makerY->window() > 1);
775
776 SkIRect srcBounds = originalSrcBounds;
777 SkIRect dstBounds = originalDstBounds;
778 if (makerX->window() > 1) {
779 // Inflate the dst by the window required for the Y pass so that the X pass can prepare
780 // it. The Y pass will be offset to only write to the original rows in dstBounds, but
781 // its window will access these extra rows calculated by the X pass. The SpecialImage
782 // factory will then subset the bitmap so it appears to match 'originalDstBounds'
783 // tightly. We make one slightly larger image to hold this extra data instead of two
784 // separate images sized exactly to each pass because the CPU blur can write in place.
785 dstBounds.outset(0, SkBlurEngine::SigmaToRadius(sigma.height()));
786 }
787
788 SkBitmap dst;
789 const SkIPoint dstOrigin = dstBounds.topLeft();
790 if (!dst.tryAllocPixels(src.info().makeWH(dstBounds.width(), dstBounds.height()))) {
791 return nullptr;
792 }
793 dst.eraseColor(SK_ColorTRANSPARENT);
794
795 auto buffer = alloc.makeBytesAlignedTo(std::max(makerX->bufferSizeBytes(),
796 makerY->bufferSizeBytes()),
797 alignof(skvx::Vec<4, uint32_t>));
798
799 // Basic Plan: The three cases to handle
800 // * Horizontal and Vertical - blur horizontally while copying values from the source to
801 // the destination. Then, do an in-place vertical blur.
802 // * Horizontal only - blur horizontally copying values from the source to the destination.
803 // * Vertical only - blur vertically copying values from the source to the destination.
804
805 // Initialize these assuming the Y-only case
806 int loopStart = std::max(srcBounds.left(), dstBounds.left());
807 int loopEnd = std::min(srcBounds.right(), dstBounds.right());
808 int dstYOffset = 0;
809
810 if (makerX->window() > 1) {
811 // First an X-only blur from src into dst, including the extra rows that will become
812 // input for the second Y pass, which will then be performed in place.
813 loopStart = std::max(srcBounds.top(), dstBounds.top());
814 loopEnd = std::min(srcBounds.bottom(), dstBounds.bottom());
815
816 auto srcAddr = src.getAddr32(0, loopStart - srcBounds.top());
817 auto dstAddr = dst.getAddr32(0, loopStart - dstBounds.top());
818
819 // Iterate over each row to calculate 1D blur along X.
820 Pass* pass = makerX->makePass(buffer, &alloc);
821 for (int y = loopStart; y < loopEnd; ++y) {
822 pass->blur(srcBounds.left() - dstBounds.left(),
823 srcBounds.right() - dstBounds.left(),
824 dstBounds.width(),
825 srcAddr, 1,
826 dstAddr, 1);
827 srcAddr += src.rowBytesAsPixels();
828 dstAddr += dst.rowBytesAsPixels();
829 }
830
831 // Set up the Y pass to blur from the full dst into the non-outset portion of dst
832 src = dst;
833 loopStart = originalDstBounds.left();
834 loopEnd = originalDstBounds.right();
835 // The new 'dst' is equal to dst.extractSubset(originalDstBounds.offset(-dstOrigin)),
836 // but by construction only the Y offset has an interesting value so this is a little
837 // more efficient.
838 dstYOffset = originalDstBounds.top() - dstBounds.top();
839
840 srcBounds = dstBounds;
841 dstBounds = originalDstBounds;
842 }
843
844 // Iterate over each column to calculate 1D blur along Y. This is either blurring from src
845 // into dst for a 1D blur; or it's blurring from dst into dst for the second pass of a 2D
846 // blur.
847 if (makerY->window() > 1) {
848 auto srcAddr = src.getAddr32(loopStart - srcBounds.left(), 0);
849 auto dstAddr = dst.getAddr32(loopStart - dstBounds.left(), dstYOffset);
850
851 Pass* pass = makerY->makePass(buffer, &alloc);
852 for (int x = loopStart; x < loopEnd; ++x) {
853 pass->blur(srcBounds.top() - dstBounds.top(),
854 srcBounds.bottom() - dstBounds.top(),
855 dstBounds.height(),
856 srcAddr, src.rowBytesAsPixels(),
857 dstAddr, dst.rowBytesAsPixels());
858 srcAddr += 1;
859 dstAddr += 1;
860 }
861 }
862
863 dstBounds = originalDstBounds.makeOffset(-dstOrigin); // Make relative to dst's pixels
864 return SkSpecialImages::MakeFromRaster(dstBounds, dst, SkSurfaceProps{});
865 }
866
867 };
868
869 class RasterShaderBlurAlgorithm : public SkShaderBlurAlgorithm {
870 public:
makeDevice(const SkImageInfo & imageInfo) const871 sk_sp<SkDevice> makeDevice(const SkImageInfo& imageInfo) const override {
872 // This Device will only be used to draw blurs, so use default SkSurfaceProps. The pixel
873 // geometry and font configuration do not matter. This is not a GPU surface, so DMSAA and
874 // the kAlwaysDither surface property are also irrelevant.
875 return SkBitmapDevice::Create(imageInfo, SkSurfaceProps{});
876 }
877 };
878
879 class RasterBlurEngine : public SkBlurEngine {
880 public:
findAlgorithm(SkSize sigma,SkColorType colorType) const881 const Algorithm* findAlgorithm(SkSize sigma, SkColorType colorType) const override {
882 static constexpr float kBoxBlurMinSigma = 2.f;
883
884 // If the sigma is larger than kBoxBlurMinSigma, we should assume that we won't encounter
885 // an identity window assertion later on.
886 SkASSERT(SkBlurEngine::BoxBlurWindow(kBoxBlurMinSigma) > 1);
887
888 // Using the shader-based blur for small blur sigmas only happens if both axes require a
889 // small blur. It's assumed that any inaccuracy along one axis is hidden by the large enough
890 // blur along the other axis.
891 const bool smallBlur = sigma.width() < kBoxBlurMinSigma &&
892 sigma.height() < kBoxBlurMinSigma;
893 // The box blur doesn't actually care about channel order as long as it's 4 8-bit channels.
894 const bool rgba8Blur = colorType == kRGBA_8888_SkColorType ||
895 colorType == kBGRA_8888_SkColorType;
896 // TODO: Specialize A8 color types as well by reusing the mask filter blur impl
897 if (smallBlur || !rgba8Blur) {
898 return &fShaderBlurAlgorithm;
899 } else {
900 return &fRGBA8BlurAlgorithm;
901 }
902 }
903
904 private:
905 // For small sigmas and non-8888 or A8 color types, use the shader algorithm
906 RasterShaderBlurAlgorithm fShaderBlurAlgorithm;
907 // For large blurs with RGBA8 or BGRA8, use consecutive box blurs
908 Raster8888BlurAlgorithm fRGBA8BlurAlgorithm;
909 };
910
911 } // anonymous namespace
912
GetRasterBlurEngine()913 const SkBlurEngine* SkBlurEngine::GetRasterBlurEngine() {
914 static const RasterBlurEngine kInstance;
915 return &kInstance;
916 }
917
918 // SkShaderBlurAlgorithm
919 // ----------------------------------------------------------------------------
920
Compute2DBlurKernel(SkSize sigma,SkISize radius,SkSpan<float> kernel)921 void SkShaderBlurAlgorithm::Compute2DBlurKernel(SkSize sigma,
922 SkISize radius,
923 SkSpan<float> kernel) {
924 // Callers likely had to calculate the radius prior to filling out the kernel value, which is
925 // why it's provided; but make sure it's consistent with expectations.
926 SkASSERT(SkBlurEngine::SigmaToRadius(sigma.width()) == radius.width() &&
927 SkBlurEngine::SigmaToRadius(sigma.height()) == radius.height());
928
929 // Callers are responsible for downscaling large sigmas to values that can be processed by the
930 // effects, so ensure the radius won't overflow 'kernel'
931 const int width = KernelWidth(radius.width());
932 const int height = KernelWidth(radius.height());
933 const size_t kernelSize = SkTo<size_t>(sk_64_mul(width, height));
934 SkASSERT(kernelSize <= kernel.size());
935
936 // And the definition of an identity blur should be sufficient that 2sigma^2 isn't near zero
937 // when there's a non-trivial radius.
938 const float twoSigmaSqrdX = 2.0f * sigma.width() * sigma.width();
939 const float twoSigmaSqrdY = 2.0f * sigma.height() * sigma.height();
940 SkASSERT((radius.width() == 0 || !SkScalarNearlyZero(twoSigmaSqrdX)) &&
941 (radius.height() == 0 || !SkScalarNearlyZero(twoSigmaSqrdY)));
942
943 // Setting the denominator to 1 when the radius is 0 automatically converts the remaining math
944 // to the 1D Gaussian distribution. When both radii are 0, it correctly computes a weight of 1.0
945 const float sigmaXDenom = radius.width() > 0 ? 1.0f / twoSigmaSqrdX : 1.f;
946 const float sigmaYDenom = radius.height() > 0 ? 1.0f / twoSigmaSqrdY : 1.f;
947
948 float sum = 0.0f;
949 for (int x = 0; x < width; x++) {
950 float xTerm = static_cast<float>(x - radius.width());
951 xTerm = xTerm * xTerm * sigmaXDenom;
952 for (int y = 0; y < height; y++) {
953 float yTerm = static_cast<float>(y - radius.height());
954 float xyTerm = std::exp(-(xTerm + yTerm * yTerm * sigmaYDenom));
955 // Note that the constant term (1/(sqrt(2*pi*sigma^2)) of the Gaussian
956 // is dropped here, since we renormalize the kernel below.
957 kernel[y * width + x] = xyTerm;
958 sum += xyTerm;
959 }
960 }
961 // Normalize the kernel
962 float scale = 1.0f / sum;
963 for (size_t i = 0; i < kernelSize; ++i) {
964 kernel[i] *= scale;
965 }
966 // Zero remainder of the array
967 memset(kernel.data() + kernelSize, 0, sizeof(float)*(kernel.size() - kernelSize));
968 }
969
Compute2DBlurKernel(SkSize sigma,SkISize radii,std::array<SkV4,kMaxSamples/4> & kernel)970 void SkShaderBlurAlgorithm::Compute2DBlurKernel(SkSize sigma,
971 SkISize radii,
972 std::array<SkV4, kMaxSamples/4>& kernel) {
973 static_assert(sizeof(kernel) == sizeof(std::array<float, kMaxSamples>));
974 static_assert(alignof(float) == alignof(SkV4));
975 float* data = kernel[0].ptr();
976 Compute2DBlurKernel(sigma, radii, SkSpan<float>(data, kMaxSamples));
977 }
978
Compute2DBlurOffsets(SkISize radius,std::array<SkV4,kMaxSamples/2> & offsets)979 void SkShaderBlurAlgorithm::Compute2DBlurOffsets(SkISize radius,
980 std::array<SkV4, kMaxSamples/2>& offsets) {
981 const int kernelArea = KernelWidth(radius.width()) * KernelWidth(radius.height());
982 SkASSERT(kernelArea <= kMaxSamples);
983
984 SkSpan<float> offsetView{offsets[0].ptr(), kMaxSamples*2};
985
986 int i = 0;
987 for (int y = -radius.height(); y <= radius.height(); ++y) {
988 for (int x = -radius.width(); x <= radius.width(); ++x) {
989 offsetView[2*i] = x;
990 offsetView[2*i+1] = y;
991 ++i;
992 }
993 }
994 SkASSERT(i == kernelArea);
995 const int lastValidOffset = 2*(kernelArea - 1);
996 for (; i < kMaxSamples; ++i) {
997 offsetView[2*i] = offsetView[lastValidOffset];
998 offsetView[2*i+1] = offsetView[lastValidOffset+1];
999 }
1000 }
1001
Compute1DBlurLinearKernel(float sigma,int radius,std::array<SkV4,kMaxSamples/2> & offsetsAndKernel)1002 void SkShaderBlurAlgorithm::Compute1DBlurLinearKernel(
1003 float sigma,
1004 int radius,
1005 std::array<SkV4, kMaxSamples/2>& offsetsAndKernel) {
1006 SkASSERT(sigma <= kMaxLinearSigma);
1007 SkASSERT(radius == SkBlurEngine::SigmaToRadius(sigma));
1008 SkASSERT(LinearKernelWidth(radius) <= kMaxSamples);
1009
1010 // Given 2 adjacent gaussian points, they are blended as: Wi * Ci + Wj * Cj.
1011 // The GPU will mix Ci and Cj as Ci * (1 - x) + Cj * x during sampling.
1012 // Compute W', x such that W' * (Ci * (1 - x) + Cj * x) = Wi * Ci + Wj * Cj.
1013 // Solving W' * x = Wj, W' * (1 - x) = Wi:
1014 // W' = Wi + Wj
1015 // x = Wj / (Wi + Wj)
1016 auto get_new_weight = [](float* new_w, float* offset, float wi, float wj) {
1017 *new_w = wi + wj;
1018 *offset = wj / (wi + wj);
1019 };
1020
1021 // Create a temporary standard kernel. The maximum blur radius that can be passed to this
1022 // function is (kMaxBlurSamples-1), so make an array large enough to hold the full kernel width.
1023 static constexpr int kMaxKernelWidth = KernelWidth(kMaxSamples - 1);
1024 SkASSERT(KernelWidth(radius) <= kMaxKernelWidth);
1025 std::array<float, kMaxKernelWidth> fullKernel;
1026 Compute1DBlurKernel(sigma, radius, SkSpan<float>{fullKernel.data(), KernelWidth(radius)});
1027
1028 std::array<float, kMaxSamples> kernel;
1029 std::array<float, kMaxSamples> offsets;
1030 // Note that halfsize isn't just size / 2, but radius + 1. This is the size of the output array.
1031 int halfSize = LinearKernelWidth(radius);
1032 int halfRadius = halfSize / 2;
1033 int lowIndex = halfRadius - 1;
1034
1035 // Compute1DGaussianKernel produces a full 2N + 1 kernel. Since the kernel can be mirrored,
1036 // compute only the upper half and mirror to the lower half.
1037
1038 int index = radius;
1039 if (radius & 1) {
1040 // If N is odd, then use two samples.
1041 // The centre texel gets sampled twice, so halve its influence for each sample.
1042 // We essentially sample like this:
1043 // Texel edges
1044 // v v v v
1045 // | | | |
1046 // \-----^---/ Lower sample
1047 // \---^-----/ Upper sample
1048 get_new_weight(&kernel[halfRadius],
1049 &offsets[halfRadius],
1050 fullKernel[index] * 0.5f,
1051 fullKernel[index + 1]);
1052 kernel[lowIndex] = kernel[halfRadius];
1053 offsets[lowIndex] = -offsets[halfRadius];
1054 index++;
1055 lowIndex--;
1056 } else {
1057 // If N is even, then there are an even number of texels on either side of the centre texel.
1058 // Sample the centre texel directly.
1059 kernel[halfRadius] = fullKernel[index];
1060 offsets[halfRadius] = 0.0f;
1061 }
1062 index++;
1063
1064 // Every other pair gets one sample.
1065 for (int i = halfRadius + 1; i < halfSize; index += 2, i++, lowIndex--) {
1066 get_new_weight(&kernel[i], &offsets[i], fullKernel[index], fullKernel[index + 1]);
1067 offsets[i] += static_cast<float>(index - radius);
1068
1069 // Mirror to lower half.
1070 kernel[lowIndex] = kernel[i];
1071 offsets[lowIndex] = -offsets[i];
1072 }
1073
1074 // Zero out remaining values in the kernel
1075 memset(kernel.data() + halfSize, 0, sizeof(float)*(kMaxSamples - halfSize));
1076 // But copy the last valid offset into the remaining offsets, to increase the chance that
1077 // over-iteration in a fragment shader will have a cache hit.
1078 for (int i = halfSize; i < kMaxSamples; ++i) {
1079 offsets[i] = offsets[halfSize - 1];
1080 }
1081
1082 // Interleave into the output array to match the 1D SkSL effect
1083 for (int i = 0; i < kMaxSamples / 2; ++i) {
1084 offsetsAndKernel[i] = SkV4{offsets[2*i], kernel[2*i], offsets[2*i+1], kernel[2*i+1]};
1085 }
1086 }
1087
to_stablekey(int kernelWidth,uint32_t baseKey)1088 static SkKnownRuntimeEffects::StableKey to_stablekey(int kernelWidth, uint32_t baseKey) {
1089 SkASSERT(kernelWidth >= 2 && kernelWidth <= SkShaderBlurAlgorithm::kMaxSamples);
1090 switch(kernelWidth) {
1091 // Batch on multiples of 4 (skipping width=1, since that can't happen)
1092 case 2: [[fallthrough]];
1093 case 3: [[fallthrough]];
1094 case 4: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey);
1095 case 5: [[fallthrough]];
1096 case 6: [[fallthrough]];
1097 case 7: [[fallthrough]];
1098 case 8: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+1);
1099 case 9: [[fallthrough]];
1100 case 10: [[fallthrough]];
1101 case 11: [[fallthrough]];
1102 case 12: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+2);
1103 case 13: [[fallthrough]];
1104 case 14: [[fallthrough]];
1105 case 15: [[fallthrough]];
1106 case 16: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+3);
1107 case 17: [[fallthrough]];
1108 case 18: [[fallthrough]];
1109 case 19: [[fallthrough]];
1110 // With larger kernels, batch on multiples of eight so up to 7 wasted samples.
1111 case 20: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+4);
1112 case 21: [[fallthrough]];
1113 case 22: [[fallthrough]];
1114 case 23: [[fallthrough]];
1115 case 24: [[fallthrough]];
1116 case 25: [[fallthrough]];
1117 case 26: [[fallthrough]];
1118 case 27: [[fallthrough]];
1119 case 28: return static_cast<SkKnownRuntimeEffects::StableKey>(baseKey+5);
1120 default:
1121 SkUNREACHABLE;
1122 }
1123 }
1124
GetLinearBlur1DEffect(int radius)1125 const SkRuntimeEffect* SkShaderBlurAlgorithm::GetLinearBlur1DEffect(int radius) {
1126 return GetKnownRuntimeEffect(
1127 to_stablekey(LinearKernelWidth(radius),
1128 static_cast<uint32_t>(SkKnownRuntimeEffects::StableKey::k1DBlurBase)));
1129 }
1130
GetBlur2DEffect(const SkISize & radii)1131 const SkRuntimeEffect* SkShaderBlurAlgorithm::GetBlur2DEffect(const SkISize& radii) {
1132 int kernelArea = KernelWidth(radii.width()) * KernelWidth(radii.height());
1133 return GetKnownRuntimeEffect(
1134 to_stablekey(kernelArea,
1135 static_cast<uint32_t>(SkKnownRuntimeEffects::StableKey::k2DBlurBase)));
1136 }
1137
renderBlur(SkRuntimeShaderBuilder * blurEffectBuilder,SkFilterMode filter,SkISize radii,sk_sp<SkSpecialImage> input,const SkIRect & srcRect,SkTileMode tileMode,const SkIRect & dstRect) const1138 sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::renderBlur(SkRuntimeShaderBuilder* blurEffectBuilder,
1139 SkFilterMode filter,
1140 SkISize radii,
1141 sk_sp<SkSpecialImage> input,
1142 const SkIRect& srcRect,
1143 SkTileMode tileMode,
1144 const SkIRect& dstRect) const {
1145 SkImageInfo outII = SkImageInfo::Make({dstRect.width(), dstRect.height()},
1146 input->colorType(),
1147 kPremul_SkAlphaType,
1148 input->colorInfo().refColorSpace());
1149 sk_sp<SkDevice> device = this->makeDevice(outII);
1150 if (!device) {
1151 return nullptr;
1152 }
1153
1154 SkIRect subset = SkIRect::MakeSize(dstRect.size());
1155 device->clipRect(SkRect::Make(subset), SkClipOp::kIntersect, /*aa=*/false);
1156 device->setLocalToDevice(SkM44::Translate(-dstRect.left(), -dstRect.top()));
1157
1158 // renderBlur() will either mix multiple fast and strict draws to cover dstRect, or will issue
1159 // a single strict draw. While the SkShader object changes (really just strict mode), the rest
1160 // of the SkPaint remains the same.
1161 SkPaint paint;
1162 paint.setBlendMode(SkBlendMode::kSrc);
1163
1164 SkIRect safeSrcRect = srcRect.makeInset(radii.width(), radii.height());
1165 SkIRect fastDstRect = dstRect;
1166
1167 // Only consider the safeSrcRect for shader-based tiling if the original srcRect is different
1168 // from the backing store dimensions; when they match the full image we can use HW tiling.
1169 if (srcRect != SkIRect::MakeSize(input->backingStoreDimensions())) {
1170 if (fastDstRect.intersect(safeSrcRect)) {
1171 // If the area of the non-clamping shader is small, it's better to just issue a single
1172 // draw that performs shader tiling over the whole dst.
1173 if (fastDstRect != dstRect && fastDstRect.width() * fastDstRect.height() < 128 * 128) {
1174 fastDstRect.setEmpty();
1175 }
1176 } else {
1177 fastDstRect.setEmpty();
1178 }
1179 }
1180
1181 if (!fastDstRect.isEmpty()) {
1182 // Fill as much as possible without adding shader tiling logic to each blur sample,
1183 // switching to clamp tiling if we aren't in this block due to HW tiling.
1184 SkIRect untiledSrcRect = srcRect.makeInset(1, 1);
1185 SkTileMode fastTileMode = untiledSrcRect.contains(fastDstRect) ? SkTileMode::kClamp
1186 : tileMode;
1187 blurEffectBuilder->child("child") = input->asShader(
1188 fastTileMode, filter, SkMatrix::I(), /*strict=*/false);
1189 paint.setShader(blurEffectBuilder->makeShader());
1190 device->drawRect(SkRect::Make(fastDstRect), paint);
1191 }
1192
1193 // Switch to a strict shader if there are remaining pixels to fill
1194 if (fastDstRect != dstRect) {
1195 blurEffectBuilder->child("child") = input->makeSubset(srcRect)->asShader(
1196 tileMode, filter, SkMatrix::Translate(srcRect.left(), srcRect.top()));
1197 paint.setShader(blurEffectBuilder->makeShader());
1198 }
1199
1200 if (fastDstRect.isEmpty()) {
1201 // Fill the entire dst with the strict shader
1202 device->drawRect(SkRect::Make(dstRect), paint);
1203 } else if (fastDstRect != dstRect) {
1204 // There will be up to four additional strict draws to fill in the border. The left and
1205 // right sides will span the full height of the dst rect. The top and bottom will span
1206 // the just the width of the fast interior. Strict border draws with zero width/height
1207 // are skipped.
1208 auto drawBorder = [&](const SkIRect& r) {
1209 if (!r.isEmpty()) {
1210 device->drawRect(SkRect::Make(r), paint);
1211 }
1212 };
1213
1214 drawBorder({dstRect.left(), dstRect.top(),
1215 fastDstRect.left(), dstRect.bottom()}); // Left, spanning full height
1216 drawBorder({fastDstRect.right(), dstRect.top(),
1217 dstRect.right(), dstRect.bottom()}); // Right, spanning full height
1218 drawBorder({fastDstRect.left(), dstRect.top(),
1219 fastDstRect.right(), fastDstRect.top()}); // Top, spanning inner width
1220 drawBorder({fastDstRect.left(), fastDstRect.bottom(),
1221 fastDstRect.right(), dstRect.bottom()}); // Bottom, spanning inner width
1222 }
1223
1224 return device->snapSpecial(subset);
1225 }
1226
evalBlur2D(SkSize sigma,SkISize radii,sk_sp<SkSpecialImage> input,const SkIRect & srcRect,SkTileMode tileMode,const SkIRect & dstRect) const1227 sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::evalBlur2D(SkSize sigma,
1228 SkISize radii,
1229 sk_sp<SkSpecialImage> input,
1230 const SkIRect& srcRect,
1231 SkTileMode tileMode,
1232 const SkIRect& dstRect) const {
1233 std::array<SkV4, kMaxSamples/4> kernel;
1234 std::array<SkV4, kMaxSamples/2> offsets;
1235 Compute2DBlurKernel(sigma, radii, kernel);
1236 Compute2DBlurOffsets(radii, offsets);
1237
1238 SkRuntimeShaderBuilder builder{sk_ref_sp(GetBlur2DEffect(radii))};
1239 builder.uniform("kernel") = kernel;
1240 builder.uniform("offsets") = offsets;
1241 // NOTE: renderBlur() will configure the "child" shader as needed. The 2D blur effect only
1242 // requires nearest-neighbor filtering.
1243 return this->renderBlur(&builder, SkFilterMode::kNearest, radii,
1244 std::move(input), srcRect, tileMode, dstRect);
1245 }
1246
evalBlur1D(float sigma,int radius,SkV2 dir,sk_sp<SkSpecialImage> input,SkIRect srcRect,SkTileMode tileMode,SkIRect dstRect) const1247 sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::evalBlur1D(float sigma,
1248 int radius,
1249 SkV2 dir,
1250 sk_sp<SkSpecialImage> input,
1251 SkIRect srcRect,
1252 SkTileMode tileMode,
1253 SkIRect dstRect) const {
1254 std::array<SkV4, kMaxSamples/2> offsetsAndKernel;
1255 Compute1DBlurLinearKernel(sigma, radius, offsetsAndKernel);
1256
1257 SkRuntimeShaderBuilder builder{sk_ref_sp(GetLinearBlur1DEffect(radius))};
1258 builder.uniform("offsetsAndKernel") = offsetsAndKernel;
1259 builder.uniform("dir") = dir;
1260 // NOTE: renderBlur() will configure the "child" shader as needed. The 1D blur effect requires
1261 // linear filtering. Reconstruct the appropriate "2D" radii inset value from 'dir'.
1262 SkISize radii{dir.x ? radius : 0, dir.y ? radius : 0};
1263 return this->renderBlur(&builder, SkFilterMode::kLinear, radii,
1264 std::move(input), srcRect, tileMode, dstRect);
1265 }
1266
blur(SkSize sigma,sk_sp<SkSpecialImage> src,const SkIRect & srcRect,SkTileMode tileMode,const SkIRect & dstRect) const1267 sk_sp<SkSpecialImage> SkShaderBlurAlgorithm::blur(SkSize sigma,
1268 sk_sp<SkSpecialImage> src,
1269 const SkIRect& srcRect,
1270 SkTileMode tileMode,
1271 const SkIRect& dstRect) const {
1272 SkASSERT(sigma.width() <= kMaxLinearSigma && sigma.height() <= kMaxLinearSigma);
1273
1274 int radiusX = SkBlurEngine::SigmaToRadius(sigma.width());
1275 int radiusY = SkBlurEngine::SigmaToRadius(sigma.height());
1276 const int kernelArea = KernelWidth(radiusX) * KernelWidth(radiusY);
1277 if (kernelArea <= kMaxSamples && radiusX > 0 && radiusY > 0) {
1278 // Use a single-pass 2D kernel if it fits and isn't just 1D already
1279 return this->evalBlur2D(sigma,
1280 {radiusX, radiusY},
1281 std::move(src),
1282 srcRect,
1283 tileMode,
1284 dstRect);
1285 } else {
1286 // Use two passes of a 1D kernel (one per axis).
1287 SkIRect intermediateSrcRect = srcRect;
1288 SkIRect intermediateDstRect = dstRect;
1289 if (radiusX > 0) {
1290 if (radiusY > 0) {
1291 // May need to maintain extra rows above and below 'dstRect' for the follow-up pass.
1292 if (tileMode == SkTileMode::kRepeat || tileMode == SkTileMode::kMirror) {
1293 // If the srcRect and dstRect are aligned, then we don't need extra rows since
1294 // the periodic tiling on srcRect is the same for the intermediate. If they
1295 // are not aligned, then outset by the Y radius.
1296 const int period = srcRect.height() * (tileMode == SkTileMode::kMirror ? 2 : 1);
1297 if (std::abs(dstRect.fTop - srcRect.fTop) % period != 0 ||
1298 dstRect.height() != srcRect.height()) {
1299 intermediateDstRect.outset(0, radiusY);
1300 }
1301 } else {
1302 // For clamp and decal tiling, we outset by the Y radius up to what's available
1303 // from the srcRect. Anything beyond that is identical to tiling the
1304 // intermediate dst image directly.
1305 intermediateDstRect.outset(0, radiusY);
1306 intermediateDstRect.fTop = std::max(intermediateDstRect.fTop, srcRect.fTop);
1307 intermediateDstRect.fBottom =
1308 std::min(intermediateDstRect.fBottom, srcRect.fBottom);
1309 if (intermediateDstRect.fTop >= intermediateDstRect.fBottom) {
1310 return nullptr;
1311 }
1312 }
1313 }
1314
1315 src = this->evalBlur1D(sigma.width(),
1316 radiusX,
1317 /*dir=*/{1.f, 0.f},
1318 std::move(src),
1319 srcRect,
1320 tileMode,
1321 intermediateDstRect);
1322 if (!src) {
1323 return nullptr;
1324 }
1325 intermediateSrcRect = SkIRect::MakeWH(src->width(), src->height());
1326 intermediateDstRect = dstRect.makeOffset(-intermediateDstRect.left(),
1327 -intermediateDstRect.top());
1328 }
1329
1330 if (radiusY > 0) {
1331 src = this->evalBlur1D(sigma.height(),
1332 radiusY,
1333 /*dir=*/{0.f, 1.f},
1334 std::move(src),
1335 intermediateSrcRect,
1336 tileMode,
1337 intermediateDstRect);
1338 }
1339
1340 return src;
1341 }
1342 }
1343