xref: /aosp_15_r20/external/skia/src/core/SkMaskBlurFilter.cpp (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
1 /*
2  * Copyright 2017 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "src/core/SkMaskBlurFilter.h"
9 
10 #include "include/core/SkColorPriv.h"
11 #include "include/private/base/SkMalloc.h"
12 #include "include/private/base/SkTPin.h"
13 #include "include/private/base/SkTemplates.h"
14 #include "include/private/base/SkTo.h"
15 #include "src/base/SkArenaAlloc.h"
16 #include "src/base/SkVx.h"
17 #include "src/core/SkGaussFilter.h"
18 
19 #include <cmath>
20 #include <climits>
21 
22 namespace {
23 static const double kPi = 3.14159265358979323846264338327950288;
24 
25 class PlanGauss final {
26 public:
PlanGauss(double sigma)27     explicit PlanGauss(double sigma) {
28         auto possibleWindow = static_cast<int>(floor(sigma * 3 * sqrt(2 * kPi) / 4 + 0.5));
29         auto window = std::max(1, possibleWindow);
30 
31         fPass0Size = window - 1;
32         fPass1Size = window - 1;
33         fPass2Size = (window & 1) == 1 ? window - 1 : window;
34 
35         // Calculating the border is tricky. I will go through the odd case which is simpler, and
36         // then through the even case. Given a stack of filters seven wide for the odd case of
37         // three passes.
38         //
39         //        S
40         //     aaaAaaa
41         //     bbbBbbb
42         //     cccCccc
43         //        D
44         //
45         // The furthest changed pixel is when the filters are in the following configuration.
46         //
47         //                 S
48         //           aaaAaaa
49         //        bbbBbbb
50         //     cccCccc
51         //        D
52         //
53         //  The A pixel is calculated using the value S, the B uses A, and the C uses B, and
54         // finally D is C. So, with a window size of seven the border is nine. In general, the
55         // border is 3*((window - 1)/2).
56         //
57         // For even cases the filter stack is more complicated. The spec specifies two passes
58         // of even filters and a final pass of odd filters. A stack for a width of six looks like
59         // this.
60         //
61         //       S
62         //    aaaAaa
63         //     bbBbbb
64         //    cccCccc
65         //       D
66         //
67         // The furthest pixel looks like this.
68         //
69         //               S
70         //          aaaAaa
71         //        bbBbbb
72         //    cccCccc
73         //       D
74         //
75         // For a window of size, the border value is seven. In general the border is 3 *
76         // (window/2) -1.
77         fBorder = (window & 1) == 1 ? 3 * ((window - 1) / 2) : 3 * (window / 2) - 1;
78         fSlidingWindow = 2 * fBorder + 1;
79 
80         // If the window is odd then the divisor is just window ^ 3 otherwise,
81         // it is window * window * (window + 1) = window ^ 2 + window ^ 3;
82         auto window2 = window * window;
83         auto window3 = window2 * window;
84         auto divisor = (window & 1) == 1 ? window3 : window3 + window2;
85 
86         fWeight = static_cast<uint64_t>(round(1.0 / divisor * (1ull << 32)));
87     }
88 
bufferSize() const89     size_t bufferSize() const { return fPass0Size + fPass1Size + fPass2Size; }
90 
border() const91     int    border()     const { return fBorder; }
92 
93 public:
94     class Scan {
95     public:
Scan(uint64_t weight,int noChangeCount,uint32_t * buffer0,uint32_t * buffer0End,uint32_t * buffer1,uint32_t * buffer1End,uint32_t * buffer2,uint32_t * buffer2End)96         Scan(uint64_t weight, int noChangeCount,
97              uint32_t* buffer0, uint32_t* buffer0End,
98              uint32_t* buffer1, uint32_t* buffer1End,
99              uint32_t* buffer2, uint32_t* buffer2End)
100             : fWeight{weight}
101             , fNoChangeCount{noChangeCount}
102             , fBuffer0{buffer0}
103             , fBuffer0End{buffer0End}
104             , fBuffer1{buffer1}
105             , fBuffer1End{buffer1End}
106             , fBuffer2{buffer2}
107             , fBuffer2End{buffer2End}
108         { }
109 
blur(const AlphaIter srcBegin,const AlphaIter srcEnd,uint8_t * dst,int dstStride,uint8_t * dstEnd) const110         template <typename AlphaIter> void blur(const AlphaIter srcBegin, const AlphaIter srcEnd,
111                     uint8_t* dst, int dstStride, uint8_t* dstEnd) const {
112             auto buffer0Cursor = fBuffer0;
113             auto buffer1Cursor = fBuffer1;
114             auto buffer2Cursor = fBuffer2;
115 
116             std::memset(fBuffer0, 0x00, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0));
117 
118             uint32_t sum0 = 0;
119             uint32_t sum1 = 0;
120             uint32_t sum2 = 0;
121 
122             // Consume the source generating pixels.
123             for (AlphaIter src = srcBegin; src < srcEnd; ++src, dst += dstStride) {
124                 uint32_t leadingEdge = *src;
125                 sum0 += leadingEdge;
126                 sum1 += sum0;
127                 sum2 += sum1;
128 
129                 *dst = this->finalScale(sum2);
130 
131                 sum2 -= *buffer2Cursor;
132                 *buffer2Cursor = sum1;
133                 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
134 
135                 sum1 -= *buffer1Cursor;
136                 *buffer1Cursor = sum0;
137                 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
138 
139                 sum0 -= *buffer0Cursor;
140                 *buffer0Cursor = leadingEdge;
141                 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
142             }
143 
144             // The leading edge is off the right side of the mask.
145             for (int i = 0; i < fNoChangeCount; i++) {
146                 uint32_t leadingEdge = 0;
147                 sum0 += leadingEdge;
148                 sum1 += sum0;
149                 sum2 += sum1;
150 
151                 *dst = this->finalScale(sum2);
152 
153                 sum2 -= *buffer2Cursor;
154                 *buffer2Cursor = sum1;
155                 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
156 
157                 sum1 -= *buffer1Cursor;
158                 *buffer1Cursor = sum0;
159                 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
160 
161                 sum0 -= *buffer0Cursor;
162                 *buffer0Cursor = leadingEdge;
163                 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
164 
165                 dst += dstStride;
166             }
167 
168             // Starting from the right, fill in the rest of the buffer.
169             std::memset(fBuffer0, 0, (fBuffer2End - fBuffer0) * sizeof(*fBuffer0));
170 
171             sum0 = sum1 = sum2 = 0;
172 
173             uint8_t* dstCursor = dstEnd;
174             AlphaIter src = srcEnd;
175             while (dstCursor > dst) {
176                 dstCursor -= dstStride;
177                 uint32_t leadingEdge = *(--src);
178                 sum0 += leadingEdge;
179                 sum1 += sum0;
180                 sum2 += sum1;
181 
182                 *dstCursor = this->finalScale(sum2);
183 
184                 sum2 -= *buffer2Cursor;
185                 *buffer2Cursor = sum1;
186                 buffer2Cursor = (buffer2Cursor + 1) < fBuffer2End ? buffer2Cursor + 1 : fBuffer2;
187 
188                 sum1 -= *buffer1Cursor;
189                 *buffer1Cursor = sum0;
190                 buffer1Cursor = (buffer1Cursor + 1) < fBuffer1End ? buffer1Cursor + 1 : fBuffer1;
191 
192                 sum0 -= *buffer0Cursor;
193                 *buffer0Cursor = leadingEdge;
194                 buffer0Cursor = (buffer0Cursor + 1) < fBuffer0End ? buffer0Cursor + 1 : fBuffer0;
195             }
196         }
197 
198     private:
199         inline static constexpr uint64_t kHalf = static_cast<uint64_t>(1) << 31;
200 
finalScale(uint32_t sum) const201         uint8_t finalScale(uint32_t sum) const {
202             return SkTo<uint8_t>((fWeight * sum + kHalf) >> 32);
203         }
204 
205         uint64_t  fWeight;
206         int       fNoChangeCount;
207         uint32_t* fBuffer0;
208         uint32_t* fBuffer0End;
209         uint32_t* fBuffer1;
210         uint32_t* fBuffer1End;
211         uint32_t* fBuffer2;
212         uint32_t* fBuffer2End;
213     };
214 
makeBlurScan(int width,uint32_t * buffer) const215     Scan makeBlurScan(int width, uint32_t* buffer) const {
216         uint32_t* buffer0, *buffer0End, *buffer1, *buffer1End, *buffer2, *buffer2End;
217         buffer0 = buffer;
218         buffer0End = buffer1 = buffer0 + fPass0Size;
219         buffer1End = buffer2 = buffer1 + fPass1Size;
220         buffer2End = buffer2 + fPass2Size;
221         int noChangeCount = fSlidingWindow > width ? fSlidingWindow - width : 0;
222 
223         return Scan(
224             fWeight, noChangeCount,
225             buffer0, buffer0End,
226             buffer1, buffer1End,
227             buffer2, buffer2End);
228     }
229 
230     uint64_t fWeight;
231     int      fBorder;
232     int      fSlidingWindow;
233     int      fPass0Size;
234     int      fPass1Size;
235     int      fPass2Size;
236 };
237 
238 } // namespace
239 
240 // NB 135 is the largest sigma that will not cause a buffer full of 255 mask values to overflow
241 // using the Gauss filter. It also limits the size of buffers used hold intermediate values. The
242 // additional + 1 added to window represents adding one more leading element before subtracting the
243 // trailing element.
244 // Explanation of maximums:
245 //   sum0 = (window + 1) * 255
246 //   sum1 = (window + 1) * sum0 -> (window + 1) * (window + 1) * 255
247 //   sum2 = (window + 1) * sum1 -> (window + 1) * (window + 1) * (window + 1) * 255 -> window^3 * 255
248 //
249 //   The value (window + 1)^3 * 255 must fit in a uint32_t. So,
250 //      (window + 1)^3 * 255 < 2^32. window = 255.
251 //
252 //   window = floor(sigma * 3 * sqrt(2 * kPi) / 4)
253 //   For window <= 255, the largest value for sigma is 135.
SkMaskBlurFilter(double sigmaW,double sigmaH)254 SkMaskBlurFilter::SkMaskBlurFilter(double sigmaW, double sigmaH)
255     : fSigmaW{SkTPin(sigmaW, 0.0, 135.0)}
256     , fSigmaH{SkTPin(sigmaH, 0.0, 135.0)}
257 {
258     SkASSERT(sigmaW >= 0);
259     SkASSERT(sigmaH >= 0);
260 }
261 
hasNoBlur() const262 bool SkMaskBlurFilter::hasNoBlur() const {
263     return (3 * fSigmaW <= 1) && (3 * fSigmaH <= 1);
264 }
265 
266 // We favor A8 masks, and if we need to work with another format, we'll convert to A8 first.
267 // Each of these converts width (up to 8) mask values to A8.
bw_to_a8(uint8_t * a8,const uint8_t * from,int width)268 static void bw_to_a8(uint8_t* a8, const uint8_t* from, int width) {
269     SkASSERT(0 < width && width <= 8);
270 
271     uint8_t masks = *from;
272     for (int i = 0; i < width; ++i) {
273         a8[i] = (masks >> (7 - i)) & 1 ? 0xFF
274                                        : 0x00;
275     }
276 }
lcd_to_a8(uint8_t * a8,const uint8_t * from,int width)277 static void lcd_to_a8(uint8_t* a8, const uint8_t* from, int width) {
278     SkASSERT(0 < width && width <= 8);
279 
280     for (int i = 0; i < width; ++i) {
281         unsigned rgb = reinterpret_cast<const uint16_t*>(from)[i],
282                    r = SkPacked16ToR32(rgb),
283                    g = SkPacked16ToG32(rgb),
284                    b = SkPacked16ToB32(rgb);
285         a8[i] = (r + g + b) / 3;
286     }
287 }
argb32_to_a8(uint8_t * a8,const uint8_t * from,int width)288 static void argb32_to_a8(uint8_t* a8, const uint8_t* from, int width) {
289     SkASSERT(0 < width && width <= 8);
290     for (int i = 0; i < width; ++i) {
291         uint32_t rgba = reinterpret_cast<const uint32_t*>(from)[i];
292         a8[i] = SkGetPackedA32(rgba);
293     }
294 }
295 using ToA8 = decltype(bw_to_a8);
296 
297 using fp88 = skvx::Vec<8, uint16_t>; // 8-wide fixed point 8.8
298 
load(const uint8_t * from,int width,ToA8 * toA8)299 static fp88 load(const uint8_t* from, int width, ToA8* toA8) {
300     // Our fast path is a full 8-byte load of A8.
301     // So we'll conditionally handle the two slow paths using tmp:
302     //    - if we have a function to convert another mask to A8, use it;
303     //    - if not but we have less than 8 bytes to load, load them one at a time.
304     uint8_t tmp[8] = {0,0,0,0, 0,0,0,0};
305     if (toA8) {
306         toA8(tmp, from, width);
307         from = tmp;
308     } else if (width < 8) {
309         for (int i = 0; i < width; ++i) {
310             tmp[i] = from[i];
311         }
312         from = tmp;
313     }
314 
315     // Load A8 and convert to 8.8 fixed-point.
316     return skvx::cast<uint16_t>(skvx::byte8::Load(from)) << 8;
317 }
318 
store(uint8_t * to,const fp88 & v,int width)319 static void store(uint8_t* to, const fp88& v, int width) {
320     skvx::byte8 b = skvx::cast<uint8_t>(v >> 8);
321     if (width == 8) {
322         b.store(to);
323     } else {
324         uint8_t buffer[8];
325         b.store(buffer);
326         for (int i = 0; i < width; i++) {
327             to[i] = buffer[i];
328         }
329     }
330 }
331 
static constexpr uint16_t _____ = 0u;    // visual placeholder for a zero vector lane
static constexpr uint16_t kHalf = 0x80u; // one half in 8.8 fixed point, for rounding

// In all the blur_x_radius_N and blur_y_radius_N functions the gaussian values are encoded
// in 0.16 format, none of the values is greater than one. The incoming mask values are in 8.8
// format. The resulting multiply has a 8.24 format, by the mulhi truncates the lower 16 bits
// resulting in a 8.8 format.
//
// The blur_x_radius_N function below blur along a row of pixels using a kernel with radius N. This
// system is setup to minimize the number of multiplies needed.
//
// Explanation:
//    Blurring a specific mask value is given by the following equation where D_n is the resulting
// mask value and S_n is the source value. The example below is for a filter with a radius of 1
// and a width of 3 (radius == (width-1)/2). The indexes for the source and destination are
// aligned. The filter is given by G_n where n is the symmetric filter value.
//
//   D[n] = S[n-1]*G[1] + S[n]*G[0] + S[n+1]*G[1].
//
// We can start the source index at an offset relative to the destination separated by the
// radius. This results in a non-traditional restating of the above filter.
//
//  D[n] = S[n]*G[1] + S[n+1]*G[0] + S[n+2]*G[1]
//
// If we look at three specific consecutive destinations the following equations result:
//
//   D[5] = S[5]*G[1] + S[6]*G[0] + S[7]*G[1]
//   D[6] = S[6]*G[1] + S[7]*G[0] + S[8]*G[1]
//   D[7] = S[7]*G[1] + S[8]*G[0] + S[9]*G[1].
//
// In the above equations, notice that S[7] is used in all three. In particular, two values are
// used: S[7]*G[0] and S[7]*G[1]. So, S[7] is only multiplied twice, but used in D[5], D[6] and
// D[7].
//
// From the point of view of a source value we end up with the following three equations.
//
// Given S[7]:
//   D[5] += S[7]*G[1]
//   D[6] += S[7]*G[0]
//   D[7] += S[7]*G[1]
//
// In General:
//   D[n]   += S[n]*G[1]
//   D[n+1] += S[n]*G[0]
//   D[n+2] += S[n]*G[1]
//
// Now these equations can be ganged using SIMD to form:
//   D[n..n+7]   += S[n..n+7]*G[1]
//   D[n+1..n+8] += S[n..n+7]*G[0]
//   D[n+2..n+9] += S[n..n+7]*G[1]
// The next set of values becomes.
//   D[n+8..n+15]  += S[n+8..n+15]*G[1]
//   D[n+9..n+16]  += S[n+8..n+15]*G[0]
//   D[n+10..n+17] += S[n+8..n+15]*G[1]
// You can see that the D[n+8] and D[n+9] values overlap the two sets, using parts of both
// S[n..7] and S[n+8..n+15].
//
// Just one more transformation allows the code to maintain all working values in
// registers. I introduce the notation {0, S[n..n+7] * G[k]} to mean that the value where 0 is
// prepended to the array of values to form {0, S[n] * G[k], ..., S[n+7]*G[k]}.
//
//   D[n..n+7]  += S[n..n+7] * G[1]
//   D[n..n+8]  += {0, S[n..n+7] * G[0]}
//   D[n..n+9]  += {0, 0, S[n..n+7] * G[1]}
//
// Now we can encode D[n..n+7] in a single Sk8h register called d0, and D[n+8..n+15] in a
// register d8. In addition, S[0..n+7] becomes s0.
//
// The translation of the {0, S[n..n+7] * G[k]} is translated in the following way below.
//
// Sk8h v0 = s0*G[0]
// Sk8h v1 = s0*G[1]
// /* D[n..n+7]  += S[n..n+7] * G[1] */
// d0 += v1;
// /* D[n..n+8]  += {0, S[n..n+7] * G[0]} */
// d0 += {_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]}
// d1 += {v0[7], _____, _____, _____, _____, _____, _____, _____}
// /* D[n..n+9]  += {0, 0, S[n..n+7] * G[1]} */
// d0 += {_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]}
// d1 += {v1[6], v1[7], _____, _____, _____, _____, _____, _____}
// Where we rely on the compiler to generate efficient code for the {____, n, ....} notation.
blur_x_radius_1(const fp88 & s0,const fp88 & g0,const fp88 & g1,const fp88 &,const fp88 &,const fp88 &,fp88 * d0,fp88 * d8)414 static void blur_x_radius_1(
415         const fp88& s0,
416         const fp88& g0, const fp88& g1, const fp88&, const fp88&, const fp88&,
417         fp88* d0, fp88* d8) {
418 
419     auto v1 = mulhi(s0, g1);
420     auto v0 = mulhi(s0, g0);
421 
422     // D[n..n+7]  += S[n..n+7] * G[1]
423     *d0 += v1;
424 
425     //D[n..n+8]  += {0, S[n..n+7] * G[0]}
426     *d0 += fp88{_____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5], v0[6]};
427     *d8 += fp88{v0[7], _____, _____, _____, _____, _____, _____, _____};
428 
429     // D[n..n+9]  += {0, 0, S[n..n+7] * G[1]}
430     *d0 += fp88{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
431     *d8 += fp88{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
432 
433 }
434 
blur_x_radius_2(const fp88 & s0,const fp88 & g0,const fp88 & g1,const fp88 & g2,const fp88 &,const fp88 &,fp88 * d0,fp88 * d8)435 static void blur_x_radius_2(
436         const fp88& s0,
437         const fp88& g0, const fp88& g1, const fp88& g2, const fp88&, const fp88&,
438         fp88* d0, fp88* d8) {
439     auto v0 = mulhi(s0, g0);
440     auto v1 = mulhi(s0, g1);
441     auto v2 = mulhi(s0, g2);
442 
443     // D[n..n+7]  += S[n..n+7] * G[2]
444     *d0 += v2;
445 
446     // D[n..n+8]  += {0, S[n..n+7] * G[1]}
447     *d0 += fp88{_____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5], v1[6]};
448     *d8 += fp88{v1[7], _____, _____, _____, _____, _____, _____, _____};
449 
450     // D[n..n+9]  += {0, 0, S[n..n+7] * G[0]}
451     *d0 += fp88{_____, _____, v0[0], v0[1], v0[2], v0[3], v0[4], v0[5]};
452     *d8 += fp88{v0[6], v0[7], _____, _____, _____, _____, _____, _____};
453 
454     // D[n..n+10]  += {0, 0, 0, S[n..n+7] * G[1]}
455     *d0 += fp88{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
456     *d8 += fp88{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
457 
458     // D[n..n+11]  += {0, 0, 0, 0, S[n..n+7] * G[2]}
459     *d0 += fp88{_____, _____, _____, _____, v2[0], v2[1], v2[2], v2[3]};
460     *d8 += fp88{v2[4], v2[5], v2[6], v2[7], _____, _____, _____, _____};
461 }
462 
blur_x_radius_3(const fp88 & s0,const fp88 & g0,const fp88 & g1,const fp88 & g2,const fp88 & g3,const fp88 &,fp88 * d0,fp88 * d8)463 static void blur_x_radius_3(
464         const fp88& s0,
465         const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88&,
466         fp88* d0, fp88* d8) {
467     auto v0 = mulhi(s0, g0);
468     auto v1 = mulhi(s0, g1);
469     auto v2 = mulhi(s0, g2);
470     auto v3 = mulhi(s0, g3);
471 
472     // D[n..n+7]  += S[n..n+7] * G[3]
473     *d0 += v3;
474 
475     // D[n..n+8]  += {0, S[n..n+7] * G[2]}
476     *d0 += fp88{_____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5], v2[6]};
477     *d8 += fp88{v2[7], _____, _____, _____, _____, _____, _____, _____};
478 
479     // D[n..n+9]  += {0, 0, S[n..n+7] * G[1]}
480     *d0 += fp88{_____, _____, v1[0], v1[1], v1[2], v1[3], v1[4], v1[5]};
481     *d8 += fp88{v1[6], v1[7], _____, _____, _____, _____, _____, _____};
482 
483     // D[n..n+10]  += {0, 0, 0, S[n..n+7] * G[0]}
484     *d0 += fp88{_____, _____, _____, v0[0], v0[1], v0[2], v0[3], v0[4]};
485     *d8 += fp88{v0[5], v0[6], v0[7], _____, _____, _____, _____, _____};
486 
487     // D[n..n+11]  += {0, 0, 0, 0, S[n..n+7] * G[1]}
488     *d0 += fp88{_____, _____, _____, _____, v1[0], v1[1], v1[2], v1[3]};
489     *d8 += fp88{v1[4], v1[5], v1[6], v1[7], _____, _____, _____, _____};
490 
491     // D[n..n+12]  += {0, 0, 0, 0, 0, S[n..n+7] * G[2]}
492     *d0 += fp88{_____, _____, _____, _____, _____, v2[0], v2[1], v2[2]};
493     *d8 += fp88{v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____, _____};
494 
495     // D[n..n+13]  += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
496     *d0 += fp88{_____, _____, _____, _____, _____, _____, v3[0], v3[1]};
497     *d8 += fp88{v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____, _____};
498 }
499 
blur_x_radius_4(const fp88 & s0,const fp88 & g0,const fp88 & g1,const fp88 & g2,const fp88 & g3,const fp88 & g4,fp88 * d0,fp88 * d8)500 static void blur_x_radius_4(
501         const fp88& s0,
502         const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4,
503         fp88* d0, fp88* d8) {
504     auto v0 = mulhi(s0, g0);
505     auto v1 = mulhi(s0, g1);
506     auto v2 = mulhi(s0, g2);
507     auto v3 = mulhi(s0, g3);
508     auto v4 = mulhi(s0, g4);
509 
510     // D[n..n+7]  += S[n..n+7] * G[4]
511     *d0 += v4;
512 
513     // D[n..n+8]  += {0, S[n..n+7] * G[3]}
514     *d0 += fp88{_____, v3[0], v3[1], v3[2], v3[3], v3[4], v3[5], v3[6]};
515     *d8 += fp88{v3[7], _____, _____, _____, _____, _____, _____, _____};
516 
517     // D[n..n+9]  += {0, 0, S[n..n+7] * G[2]}
518     *d0 += fp88{_____, _____, v2[0], v2[1], v2[2], v2[3], v2[4], v2[5]};
519     *d8 += fp88{v2[6], v2[7], _____, _____, _____, _____, _____, _____};
520 
521     // D[n..n+10]  += {0, 0, 0, S[n..n+7] * G[1]}
522     *d0 += fp88{_____, _____, _____, v1[0], v1[1], v1[2], v1[3], v1[4]};
523     *d8 += fp88{v1[5], v1[6], v1[7], _____, _____, _____, _____, _____};
524 
525     // D[n..n+11]  += {0, 0, 0, 0, S[n..n+7] * G[0]}
526     *d0 += fp88{_____, _____, _____, _____, v0[0], v0[1], v0[2], v0[3]};
527     *d8 += fp88{v0[4], v0[5], v0[6], v0[7], _____, _____, _____, _____};
528 
529     // D[n..n+12]  += {0, 0, 0, 0, 0, S[n..n+7] * G[1]}
530     *d0 += fp88{_____, _____, _____, _____, _____, v1[0], v1[1], v1[2]};
531     *d8 += fp88{v1[3], v1[4], v1[5], v1[6], v1[7], _____, _____, _____};
532 
533     // D[n..n+13]  += {0, 0, 0, 0, 0, 0, S[n..n+7] * G[2]}
534     *d0 += fp88{_____, _____, _____, _____, _____, _____, v2[0], v2[1]};
535     *d8 += fp88{v2[2], v2[3], v2[4], v2[5], v2[6], v2[7], _____, _____};
536 
537     // D[n..n+14]  += {0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[3]}
538     *d0 += fp88{_____, _____, _____, _____, _____, _____, _____, v3[0]};
539     *d8 += fp88{v3[1], v3[2], v3[3], v3[4], v3[5], v3[6], v3[7], _____};
540 
541     // D[n..n+15]  += {0, 0, 0, 0, 0, 0, 0, 0, S[n..n+7] * G[4]}
542     *d8 += v4;
543 }
544 
545 using BlurX = decltype(blur_x_radius_1);
546 
547 // BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
blur_row(BlurX blur,const fp88 & g0,const fp88 & g1,const fp88 & g2,const fp88 & g3,const fp88 & g4,const uint8_t * src,int srcW,uint8_t * dst,int dstW)548 static void blur_row(
549         BlurX blur,
550         const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4,
551         const uint8_t* src, int srcW,
552               uint8_t* dst, int dstW) {
553     // Clear the buffer to handle summing wider than source.
554     fp88 d0(kHalf), d8(kHalf);
555 
556     // Go by multiples of 8 in src.
557     int x = 0;
558     for (; x <= srcW - 8; x += 8) {
559         blur(load(src, 8, nullptr), g0, g1, g2, g3, g4, &d0, &d8);
560 
561         store(dst, d0, 8);
562 
563         d0 = d8;
564         d8 = fp88(kHalf);
565 
566         src += 8;
567         dst += 8;
568     }
569 
570     // There are src values left, but the remainder of src values is not a multiple of 8.
571     int srcTail = srcW - x;
572     if (srcTail > 0) {
573 
574         blur(load(src, srcTail, nullptr), g0, g1, g2, g3, g4, &d0, &d8);
575 
576         int dstTail = std::min(8, dstW - x);
577         store(dst, d0, dstTail);
578 
579         d0 = d8;
580         dst += dstTail;
581         x += dstTail;
582     }
583 
584     // There are dst mask values to complete.
585     int dstTail = dstW - x;
586     if (dstTail > 0) {
587         store(dst, d0, dstTail);
588     }
589 }
590 
591 // BlurX will only be one of the functions blur_x_radius_(1|2|3|4).
blur_x_rect(BlurX blur,uint16_t * gauss,const uint8_t * src,size_t srcStride,int srcW,uint8_t * dst,size_t dstStride,int dstW,int dstH)592 static void blur_x_rect(BlurX blur,
593                         uint16_t* gauss,
594                         const uint8_t* src, size_t srcStride, int srcW,
595                         uint8_t* dst, size_t dstStride, int dstW, int dstH) {
596 
597     fp88 g0(gauss[0]),
598          g1(gauss[1]),
599          g2(gauss[2]),
600          g3(gauss[3]),
601          g4(gauss[4]);
602 
603     // Blur *ALL* the rows.
604     for (int y = 0; y < dstH; y++) {
605         blur_row(blur, g0, g1, g2, g3, g4, src, srcW, dst, dstW);
606         src += srcStride;
607         dst += dstStride;
608     }
609 }
610 
direct_blur_x(int radius,uint16_t * gauss,const uint8_t * src,size_t srcStride,int srcW,uint8_t * dst,size_t dstStride,int dstW,int dstH)611 static void direct_blur_x(int radius, uint16_t* gauss,
612                           const uint8_t* src, size_t srcStride, int srcW,
613                           uint8_t* dst, size_t dstStride, int dstW, int dstH) {
614 
615     switch (radius) {
616         case 1:
617             blur_x_rect(blur_x_radius_1, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
618             break;
619 
620         case 2:
621             blur_x_rect(blur_x_radius_2, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
622             break;
623 
624         case 3:
625             blur_x_rect(blur_x_radius_3, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
626             break;
627 
628         case 4:
629             blur_x_rect(blur_x_radius_4, gauss, src, srcStride, srcW, dst, dstStride, dstW, dstH);
630             break;
631 
632         default:
633             SkASSERTF(false, "The radius %d is not handled\n", radius);
634     }
635 }
636 
637 // The operations of the blur_y_radius_N functions work on a theme similar to the blur_x_radius_N
638 // functions, but end up being simpler because there is no complicated shift of registers. We
639 // start with the non-traditional form of the gaussian filter. In the following r is the value
640 // when added generates the next value in the column.
641 //
642 //   D[n+0r] = S[n+0r]*G[1]
643 //           + S[n+1r]*G[0]
644 //           + S[n+2r]*G[1]
645 //
646 // Expanding out in a way similar to blur_x_radius_N for specific values of n.
647 //
648 //   D[n+0r] = S[n-2r]*G[1] + S[n-1r]*G[0] + S[n+0r]*G[1]
649 //   D[n+1r] = S[n-1r]*G[1] + S[n+0r]*G[0] + S[n+1r]*G[1]
650 //   D[n+2r] = S[n+0r]*G[1] + S[n+1r]*G[0] + S[n+2r]*G[1]
651 //
652 // We can see that S[n+0r] is in all three D[] equations, but is only multiplied twice. Now we
653 // can look at the calculation form the point of view of a source value.
654 //
655 //   Given S[n+0r]:
656 //   D[n+0r] += S[n+0r]*G[1];
657 //   /* D[n+0r] is done and can be stored now. */
658 //   D[n+1r] += S[n+0r]*G[0];
659 //   D[n+2r]  = S[n+0r]*G[1];
660 //
661 // Remember, by induction, that D[n+0r] == S[n-2r]*G[1] + S[n-1r]*G[0] before adding in
662 // S[n+0r]*G[1]. So, after the addition D[n+0r] has finished calculation and can be stored. Also,
663 // notice that D[n+2r] is receiving its first value from S[n+0r]*G[1] and is not added in. Notice
664 // how values flow in the following two iterations in source.
665 //
666 //   D[n+0r] += S[n+0r]*G[1]
667 //   D[n+1r] += S[n+0r]*G[0]
668 //   D[n+2r]  = S[n+0r]*G[1]
669 //   /* ------- */
670 //   D[n+1r] += S[n+1r]*G[1]
671 //   D[n+2r] += S[n+1r]*G[0]
672 //   D[n+3r]  = S[n+1r]*G[1]
673 //
674 // Instead of using memory we can introduce temporaries d01 and d12. The update step changes
675 // to the following.
676 //
677 //   answer = d01 + S[n+0r]*G[1]
678 //   d01    = d12 + S[n+0r]*G[0]
679 //   d12    =       S[n+0r]*G[1]
680 //   return answer
681 //
682 // Finally, this can be ganged into SIMD style.
683 //   answer[0..7] = d01[0..7] + S[n+0r..n+0r+7]*G[1]
684 //   d01[0..7]    = d12[0..7] + S[n+0r..n+0r+7]*G[0]
685 //   d12[0..7]    =             S[n+0r..n+0r+7]*G[1]
686 //   return answer[0..7]
blur_y_radius_1(const fp88 & s0,const fp88 & g0,const fp88 & g1,const fp88 &,const fp88 &,const fp88 &,fp88 * d01,fp88 * d12,fp88 *,fp88 *,fp88 *,fp88 *,fp88 *,fp88 *)687 static fp88 blur_y_radius_1(
688         const fp88& s0,
689         const fp88& g0, const fp88& g1, const fp88&, const fp88&, const fp88&,
690         fp88* d01, fp88* d12, fp88*, fp88*, fp88*, fp88*, fp88*, fp88*) {
691     auto v0 = mulhi(s0, g0);
692     auto v1 = mulhi(s0, g1);
693 
694     fp88 answer = *d01 + v1;
695            *d01 = *d12 + v0;
696            *d12 =        v1 + kHalf;
697 
698     return answer;
699 }
700 
// Vertical blur, kernel radius 2; rolls four partial-sum registers d01..d34.
static fp88 blur_y_radius_2(
        const fp88& s0,
        const fp88& g0, const fp88& g1, const fp88& g2, const fp88&, const fp88&,
        fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88*, fp88*, fp88*, fp88*) {
    auto v0 = mulhi(s0, g0);
    auto v1 = mulhi(s0, g1);
    auto v2 = mulhi(s0, g2);

    fp88 answer = *d01 + v2;
           *d01 = *d12 + v1;
           *d12 = *d23 + v0;
           *d23 = *d34 + v1;
           *d34 =        v2 + kHalf;

    return answer;
}
717 
// Vertical blur, kernel radius 3; rolls six partial-sum registers d01..d56.
static fp88 blur_y_radius_3(
        const fp88& s0,
        const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88&,
        fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88* d45, fp88* d56, fp88*, fp88*) {
    auto v0 = mulhi(s0, g0);
    auto v1 = mulhi(s0, g1);
    auto v2 = mulhi(s0, g2);
    auto v3 = mulhi(s0, g3);

    fp88 answer = *d01 + v3;
           *d01 = *d12 + v2;
           *d12 = *d23 + v1;
           *d23 = *d34 + v0;
           *d34 = *d45 + v1;
           *d45 = *d56 + v2;
           *d56 =        v3 + kHalf;

    return answer;
}
737 
// Vertical blur, kernel radius 4; rolls all eight partial-sum registers d01..d78.
static fp88 blur_y_radius_4(
    const fp88& s0,
    const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4,
    fp88* d01, fp88* d12, fp88* d23, fp88* d34, fp88* d45, fp88* d56, fp88* d67, fp88* d78) {
    auto v0 = mulhi(s0, g0);
    auto v1 = mulhi(s0, g1);
    auto v2 = mulhi(s0, g2);
    auto v3 = mulhi(s0, g3);
    auto v4 = mulhi(s0, g4);

    fp88 answer = *d01 + v4;
           *d01 = *d12 + v3;
           *d12 = *d23 + v2;
           *d23 = *d34 + v1;
           *d34 = *d45 + v0;
           *d45 = *d56 + v1;
           *d56 = *d67 + v2;
           *d67 = *d78 + v3;
           *d78 =        v4 + kHalf;

    return answer;
}
760 
// BlurY is the signature shared by blur_y_radius_(1|2|3|4); one of them is
// passed (decayed to a function pointer) into blur_y_rect / blur_column.
using BlurY = decltype(blur_y_radius_1);
762 
763 // BlurY will be one of blur_y_radius_(1|2|3|4).
blur_column(ToA8 toA8,BlurY blur,int radius,int width,const fp88 & g0,const fp88 & g1,const fp88 & g2,const fp88 & g3,const fp88 & g4,const uint8_t * src,size_t srcRB,int srcH,uint8_t * dst,size_t dstRB)764 static void blur_column(
765         ToA8 toA8,
766         BlurY blur, int radius, int width,
767         const fp88& g0, const fp88& g1, const fp88& g2, const fp88& g3, const fp88& g4,
768         const uint8_t* src, size_t srcRB, int srcH,
769         uint8_t* dst, size_t dstRB) {
770     fp88 d01(kHalf), d12(kHalf), d23(kHalf), d34(kHalf),
771          d45(kHalf), d56(kHalf), d67(kHalf), d78(kHalf);
772 
773     auto flush = [&](uint8_t* to, const fp88& v0, const fp88& v1) {
774         store(to, v0, width);
775         to += dstRB;
776         store(to, v1, width);
777         return to + dstRB;
778     };
779 
780     for (int y = 0; y < srcH; y += 1) {
781         auto s = load(src, width, toA8);
782         auto b = blur(s,
783                       g0, g1, g2, g3, g4,
784                       &d01, &d12, &d23, &d34, &d45, &d56, &d67, &d78);
785         store(dst, b, width);
786         src += srcRB;
787         dst += dstRB;
788     }
789 
790     if (radius >= 1) {
791         dst = flush(dst, d01, d12);
792     }
793     if (radius >= 2) {
794         dst = flush(dst, d23, d34);
795     }
796     if (radius >= 3) {
797         dst = flush(dst, d45, d56);
798     }
799     if (radius >= 4) {
800               flush(dst, d67, d78);
801     }
802 }
803 
804 // BlurY will be one of blur_y_radius_(1|2|3|4).
blur_y_rect(ToA8 toA8,const int strideOf8,BlurY blur,int radius,uint16_t * gauss,const uint8_t * src,size_t srcRB,int srcW,int srcH,uint8_t * dst,size_t dstRB)805 static void blur_y_rect(ToA8 toA8, const int strideOf8,
806                         BlurY blur, int radius, uint16_t *gauss,
807                         const uint8_t *src, size_t srcRB, int srcW, int srcH,
808                         uint8_t *dst, size_t dstRB) {
809 
810     fp88 g0(gauss[0]),
811          g1(gauss[1]),
812          g2(gauss[2]),
813          g3(gauss[3]),
814          g4(gauss[4]);
815 
816     int x = 0;
817     for (; x <= srcW - 8; x += 8) {
818         blur_column(toA8, blur, radius, 8,
819                     g0, g1, g2, g3, g4,
820                     src, srcRB, srcH,
821                     dst, dstRB);
822         src += strideOf8;
823         dst += 8;
824     }
825 
826     int xTail = srcW - x;
827     if (xTail > 0) {
828         blur_column(toA8, blur, radius, xTail,
829                     g0, g1, g2, g3, g4,
830                     src, srcRB, srcH,
831                     dst, dstRB);
832     }
833 }
834 
direct_blur_y(ToA8 toA8,const int strideOf8,int radius,uint16_t * gauss,const uint8_t * src,size_t srcRB,int srcW,int srcH,uint8_t * dst,size_t dstRB)835 static void direct_blur_y(ToA8 toA8, const int strideOf8,
836                           int radius, uint16_t* gauss,
837                           const uint8_t* src, size_t srcRB, int srcW, int srcH,
838                           uint8_t* dst, size_t dstRB) {
839 
840     switch (radius) {
841         case 1:
842             blur_y_rect(toA8, strideOf8, blur_y_radius_1, 1, gauss,
843                         src, srcRB, srcW, srcH,
844                         dst, dstRB);
845             break;
846 
847         case 2:
848             blur_y_rect(toA8, strideOf8, blur_y_radius_2, 2, gauss,
849                         src, srcRB, srcW, srcH,
850                         dst, dstRB);
851             break;
852 
853         case 3:
854             blur_y_rect(toA8, strideOf8, blur_y_radius_3, 3, gauss,
855                         src, srcRB, srcW, srcH,
856                         dst, dstRB);
857             break;
858 
859         case 4:
860             blur_y_rect(toA8, strideOf8, blur_y_radius_4, 4, gauss,
861                         src, srcRB, srcW, srcH,
862                         dst, dstRB);
863             break;
864 
865         default:
866             SkASSERTF(false, "The radius %d is not handled\n", radius);
867     }
868 }
869 
small_blur(double sigmaX,double sigmaY,const SkMask & src,SkMaskBuilder * dst)870 static SkIPoint small_blur(double sigmaX, double sigmaY, const SkMask& src, SkMaskBuilder* dst) {
871     SkASSERT(sigmaX == sigmaY); // TODO
872     SkASSERT(0.01 <= sigmaX && sigmaX < 2);
873     SkASSERT(0.01 <= sigmaY && sigmaY < 2);
874 
875     SkGaussFilter filterX{sigmaX},
876                   filterY{sigmaY};
877 
878     int radiusX = filterX.radius(),
879         radiusY = filterY.radius();
880 
881     SkASSERT(radiusX <= 4 && radiusY <= 4);
882 
883     auto prepareGauss = [](const SkGaussFilter& filter, uint16_t* factors) {
884         int i = 0;
885         for (double d : filter) {
886             factors[i++] = static_cast<uint16_t>(round(d * (1 << 16)));
887         }
888     };
889 
890     uint16_t gaussFactorsX[SkGaussFilter::kGaussArrayMax],
891              gaussFactorsY[SkGaussFilter::kGaussArrayMax];
892 
893     prepareGauss(filterX, gaussFactorsX);
894     prepareGauss(filterY, gaussFactorsY);
895 
896     *dst = SkMaskBuilder::PrepareDestination(radiusX, radiusY, src);
897     if (src.fImage == nullptr) {
898         return {SkTo<int32_t>(radiusX), SkTo<int32_t>(radiusY)};
899     }
900     if (dst->fImage == nullptr) {
901         dst->bounds().setEmpty();
902         return {0, 0};
903     }
904 
905     int srcW = src.fBounds.width(),
906         srcH = src.fBounds.height();
907 
908     int dstW = dst->fBounds.width(),
909         dstH = dst->fBounds.height();
910 
911     size_t srcRB = src.fRowBytes,
912            dstRB = dst->fRowBytes;
913 
914     //TODO: handle bluring in only one direction.
915 
916     // Blur vertically and copy to destination.
917     switch (src.fFormat) {
918         case SkMask::kBW_Format:
919             direct_blur_y(bw_to_a8, 1,
920                           radiusY, gaussFactorsY,
921                           src.fImage, srcRB, srcW, srcH,
922                           dst->image() + radiusX, dstRB);
923             break;
924         case SkMask::kA8_Format:
925             direct_blur_y(nullptr, 8,
926                           radiusY, gaussFactorsY,
927                           src.fImage, srcRB, srcW, srcH,
928                           dst->image() + radiusX, dstRB);
929             break;
930         case SkMask::kARGB32_Format:
931             direct_blur_y(argb32_to_a8, 32,
932                           radiusY, gaussFactorsY,
933                           src.fImage, srcRB, srcW, srcH,
934                           dst->image() + radiusX, dstRB);
935             break;
936         case SkMask::kLCD16_Format:
937             direct_blur_y(lcd_to_a8, 16, radiusY, gaussFactorsY,
938                           src.fImage, srcRB, srcW, srcH,
939                           dst->image() + radiusX, dstRB);
940             break;
941         default:
942             SK_ABORT("Unhandled format.");
943     }
944 
945     // Blur horizontally in place.
946     direct_blur_x(radiusX, gaussFactorsX,
947                   dst->fImage + radiusX,  dstRB, srcW,
948                   dst->image(),           dstRB, dstW, dstH);
949 
950     return {radiusX, radiusY};
951 }
952 
953 // TODO: assuming sigmaW = sigmaH. Allow different sigmas. Right now the
954 // API forces the sigmas to be the same.
blur(const SkMask & src,SkMaskBuilder * dst) const955 SkIPoint SkMaskBlurFilter::blur(const SkMask& src, SkMaskBuilder* dst) const {
956 
957     if (fSigmaW < 2.0 && fSigmaH < 2.0) {
958         return small_blur(fSigmaW, fSigmaH, src, dst);
959     }
960 
961     // 1024 is a place holder guess until more analysis can be done.
962     SkSTArenaAlloc<1024> alloc;
963 
964     PlanGauss planW(fSigmaW);
965     PlanGauss planH(fSigmaH);
966 
967     int borderW = planW.border(),
968         borderH = planH.border();
969     SkASSERT(borderH >= 0 && borderW >= 0);
970 
971     *dst = SkMaskBuilder::PrepareDestination(borderW, borderH, src);
972     if (src.fImage == nullptr) {
973         return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)};
974     }
975     if (dst->fImage == nullptr) {
976         dst->bounds().setEmpty();
977         return {0, 0};
978     }
979 
980     int srcW = src.fBounds.width(),
981         srcH = src.fBounds.height(),
982         dstW = dst->fBounds.width(),
983         dstH = dst->fBounds.height();
984     SkASSERT(srcW >= 0 && srcH >= 0 && dstW >= 0 && dstH >= 0);
985 
986     auto bufferSize = std::max(planW.bufferSize(), planH.bufferSize());
987     auto buffer = alloc.makeArrayDefault<uint32_t>(bufferSize);
988 
989     // Blur both directions.
990     int tmpW = srcH,
991         tmpH = dstW;
992 
993     // Make sure not to overflow the multiply for the tmp buffer size.
994     if (tmpH > std::numeric_limits<int>::max() / tmpW) {
995         return {0, 0};
996     }
997     auto tmp = alloc.makeArrayDefault<uint8_t>(tmpW * tmpH);
998 
999     // Blur horizontally, and transpose.
1000     const PlanGauss::Scan& scanW = planW.makeBlurScan(srcW, buffer);
1001     switch (src.fFormat) {
1002         case SkMask::kBW_Format: {
1003             const uint8_t* bwStart = src.fImage;
1004             auto start = SkMask::AlphaIter<SkMask::kBW_Format>(bwStart, 0);
1005             auto end = SkMask::AlphaIter<SkMask::kBW_Format>(bwStart + (srcW / 8), srcW % 8);
1006             for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
1007                 auto tmpStart = &tmp[y];
1008                 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
1009             }
1010         } break;
1011         case SkMask::kA8_Format: {
1012             const uint8_t* a8Start = src.fImage;
1013             auto start = SkMask::AlphaIter<SkMask::kA8_Format>(a8Start);
1014             auto end = SkMask::AlphaIter<SkMask::kA8_Format>(a8Start + srcW);
1015             for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
1016                 auto tmpStart = &tmp[y];
1017                 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
1018             }
1019         } break;
1020         case SkMask::kARGB32_Format: {
1021             const uint32_t* argbStart = reinterpret_cast<const uint32_t*>(src.fImage);
1022             auto start = SkMask::AlphaIter<SkMask::kARGB32_Format>(argbStart);
1023             auto end = SkMask::AlphaIter<SkMask::kARGB32_Format>(argbStart + srcW);
1024             for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
1025                 auto tmpStart = &tmp[y];
1026                 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
1027             }
1028         } break;
1029         case SkMask::kLCD16_Format: {
1030             const uint16_t* lcdStart = reinterpret_cast<const uint16_t*>(src.fImage);
1031             auto start = SkMask::AlphaIter<SkMask::kLCD16_Format>(lcdStart);
1032             auto end = SkMask::AlphaIter<SkMask::kLCD16_Format>(lcdStart + srcW);
1033             for (int y = 0; y < srcH; ++y, start >>= src.fRowBytes, end >>= src.fRowBytes) {
1034                 auto tmpStart = &tmp[y];
1035                 scanW.blur(start, end, tmpStart, tmpW, tmpStart + tmpW * tmpH);
1036             }
1037         } break;
1038         default:
1039             SK_ABORT("Unhandled format.");
1040     }
1041 
1042     // Blur vertically (scan in memory order because of the transposition),
1043     // and transpose back to the original orientation.
1044     const PlanGauss::Scan& scanH = planH.makeBlurScan(tmpW, buffer);
1045     for (int y = 0; y < tmpH; y++) {
1046         auto tmpStart = &tmp[y * tmpW];
1047         auto dstStart = &dst->image()[y];
1048 
1049         scanH.blur(tmpStart, tmpStart + tmpW,
1050                    dstStart, dst->fRowBytes, dstStart + dst->fRowBytes * dstH);
1051     }
1052 
1053     return {SkTo<int32_t>(borderW), SkTo<int32_t>(borderH)};
1054 }
1055