/*
 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <arm_sve.h>
#include <assert.h>
#include <string.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/arm/aom_neon_sve_bridge.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "av1/common/restoration.h"
#include "av1/encoder/pickrst.h"
#include "av1/encoder/arm/pickrst_sve.h"

static inline uint8_t find_average_sve(const uint8_t *src, int src_stride,
                                       int width, int height) {
  uint32x4_t avg_u32 = vdupq_n_u32(0);
  uint8x16_t ones = vdupq_n_u8(1);

  // Use a predicate to compute the last columns.
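  // svwhilelt_b8_u32(0, n) sets exactly the first n lanes of the predicate,
  // so the final partial vector can be loaded without reading past the row.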
  svbool_t pattern = svwhilelt_b8_u32(0, width % 16);

  int h = height;
  do {
    int j = width;
    const uint8_t *src_ptr = src;
    while (j >= 16) {
      uint8x16_t s = vld1q_u8(src_ptr);
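      // Sum each group of four adjacent bytes into one 32-bit accumulator
      // lane by taking the dot product with a vector of ones.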
      avg_u32 = vdotq_u32(avg_u32, s, ones);

      j -= 16;
      src_ptr += 16;
    }
    uint8x16_t s_end = svget_neonq_u8(svld1_u8(pattern, src_ptr));
    avg_u32 = vdotq_u32(avg_u32, s_end, ones);

    src += src_stride;
  } while (--h != 0);
  return (uint8_t)(vaddlvq_u32(avg_u32) / (width * height));
}

static inline void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg,
                                   int16_t *buf_avg, int buf_avg_stride,
                                   int width, int height,
                                   int downsample_factor) {
  uint8x8_t avg_u8 = vdup_n_u8(avg);

  // Use a predicate to compute the last columns.
  svbool_t pattern = svwhilelt_b8_u32(0, width % 8);

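  // Duplicate avg into the active lanes only; the inactive lanes stay zero,
  // so the tail subtraction below writes zeros beyond width.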
  uint8x8_t avg_end = vget_low_u8(svget_neonq_u8(svdup_n_u8_z(pattern, avg)));

  do {
    int j = width;
    const uint8_t *buf_ptr = buf;
    int16_t *buf_avg_ptr = buf_avg;
    while (j >= 8) {
      uint8x8_t d = vld1_u8(buf_ptr);
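      // vsubl_u8 widens to 16 bits before subtracting; reinterpreting the
      // result as signed gives the correct (possibly negative) difference.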
      vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d, avg_u8)));

      j -= 8;
      buf_ptr += 8;
      buf_avg_ptr += 8;
    }
    uint8x8_t d_end = vget_low_u8(svget_neonq_u8(svld1_u8(pattern, buf_ptr)));
    vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d_end, avg_end)));

    buf += buf_stride;
    buf_avg += buf_avg_stride;
    height -= downsample_factor;
  } while (height > 0);
}

static inline void copy_upper_triangle(int64_t *H, int64_t *H_tmp,
                                       const int wiener_win2, const int scale) {
  for (int i = 0; i < wiener_win2 - 2; i = i + 2) {
    // Transpose the first 2x2 square. It needs a special case as its
    // bottom-left element is on the diagonal.
    int64x2_t row0 = vld1q_s64(H_tmp + i * wiener_win2 + i + 1);
    int64x2_t row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + i + 1);

    int64x2_t tr_row = aom_vtrn2q_s64(row0, row1);

    vst1_s64(H_tmp + (i + 1) * wiener_win2 + i, vget_low_s64(row0));
    vst1q_s64(H_tmp + (i + 2) * wiener_win2 + i, tr_row);

    // Transpose and store all the remaining 2x2 squares of the line.
    for (int j = i + 3; j < wiener_win2; j = j + 2) {
      row0 = vld1q_s64(H_tmp + i * wiener_win2 + j);
      row1 = vld1q_s64(H_tmp + (i + 1) * wiener_win2 + j);

      int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1);
      int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1);

      vst1q_s64(H_tmp + j * wiener_win2 + i, tr_row0);
      vst1q_s64(H_tmp + (j + 1) * wiener_win2 + i, tr_row1);
    }
  }
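  // H_tmp now holds the full symmetric matrix; accumulate it into H, scaled
  // by the downsampling factor.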
  for (int i = 0; i < wiener_win2 * wiener_win2; i++) {
    H[i] += H_tmp[i] * scale;
  }
}

// Transpose the matrix that has just been computed and accumulate it in M.
static inline void acc_transpose_M(int64_t *M, const int64_t *M_trn,
                                   const int wiener_win, int scale) {
  for (int i = 0; i < wiener_win; ++i) {
    for (int j = 0; j < wiener_win; ++j) {
      int tr_idx = j * wiener_win + i;
      *M++ += (int64_t)(M_trn[tr_idx] * scale);
    }
  }
}

// This function computes two matrices: the cross-correlation between the src
// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
//
// M is of size 7 * 7. It needs to be filled such that multiplying one element
// from src with each element of a row of the wiener window will fill one
// column of M. However, this is not very convenient in terms of memory
// accesses, as it means we do contiguous loads of dgd but strided stores to M.
// As a result, we use an intermediate matrix M_trn which is instead filled
// such that one row of the wiener window gives one row of M_trn. Once fully
// computed, M_trn is then transposed to return M.
//
// H is of size 49 * 49. It is filled by multiplying every pair of elements of
// the wiener window together. Since it is a symmetric matrix, we only compute
// the upper triangle, and then copy it down to the lower one. Here we fill it
// by taking each different pair of columns, and multiplying all the elements of
// the first one with all the elements of the second one, with a special case
// when multiplying a column by itself.
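//
// Note that for M the dgd vectors are loaded with a stride of one (shifted
// views of the same row), while for H they are loaded with the row stride
// (the same column offset across the rows of the window).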
static inline void compute_stats_win7_downsampled_sve(
    int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride,
    int width, int height, int64_t *M, int64_t *H, int downsample_factor) {
  const int wiener_win = 7;
  const int wiener_win2 = wiener_win * wiener_win;

  // Use a predicate to compute the last columns of the block for H.
  svbool_t pattern = svwhilelt_b16_u32(0, width % 8);

  // Use intermediate matrices for H and M to perform the computation; they
  // are accumulated into the original H and M at the end.
  int64_t M_trn[49];
  memset(M_trn, 0, sizeof(M_trn));

  int64_t H_tmp[49 * 49];
  memset(H_tmp, 0, sizeof(H_tmp));

  assert(height > 0);
  do {
    // Cross-correlation (M).
    for (int row = 0; row < wiener_win; row++) {
      int j = 0;
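      // src_avg is zero-padded beyond width, so lanes of s past the image
      // edge contribute nothing to M and no tail loop is needed here.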
      while (j < width) {
        int16x8_t dgd[7];
        load_s16_8x7(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
                     &dgd[2], &dgd[3], &dgd[4], &dgd[5], &dgd[6]);
        int16x8_t s = vld1q_s16(src_avg + j);

        // Compute all the elements of one row of M.
        compute_M_one_row_win7(s, dgd, M_trn, row);

        j += 8;
      }
    }

    // Auto-covariance (H).
    int j = 0;
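    // Process the bulk of the block with full 8-wide vectors; the remaining
    // columns are handled below with a predicated tail.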
    while (j <= width - 8) {
      for (int col0 = 0; col0 < wiener_win; col0++) {
        int16x8_t dgd0[7];
        load_s16_8x7(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
                     &dgd0[2], &dgd0[3], &dgd0[4], &dgd0[5], &dgd0[6]);

        // Perform computation of the first column with itself (28 elements).
        // For the first column this will fill the upper triangle of the 7x7
        // matrix at the top left of the H matrix. For the next columns this
        // will fill the upper triangle of the other 7x7 matrices around H's
        // diagonal.
        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);

        // All computation next to the matrix diagonal has already been done.
        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
          // Load the second column; scaling by the downsample factor is
          // deferred to the final accumulation.
          int16x8_t dgd1[7];
          load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
                       &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);

          // Compute all elements from the combination of both columns (49
          // elements).
          compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
        }
      }
      j += 8;
    }

    if (j < width) {
      // Process remaining columns using a predicate to discard excess elements.
      for (int col0 = 0; col0 < wiener_win; col0++) {
        // Load first column.
        int16x8_t dgd0[7];
        dgd0[0] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
        dgd0[1] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
        dgd0[2] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
        dgd0[3] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
        dgd0[4] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));
        dgd0[5] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 5 * dgd_avg_stride + j + col0));
        dgd0[6] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 6 * dgd_avg_stride + j + col0));

        // Perform computation of the first column with itself (28 elements).
        // For the first column this will fill the upper triangle of the 7x7
        // matrix at the top left of the H matrix. For the next columns this
        // will fill the upper triangle of the other 7x7 matrices around H's
        // diagonal.
        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);

        // All computation next to the matrix diagonal has already been done.
        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
          // Load the second column; scaling by the downsample factor is
          // deferred to the final accumulation.
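          // Note: dgd1 does not need a predicated load here, since every
          // product pairs it with dgd0, whose excess lanes are already zero.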
          int16x8_t dgd1[7];
          load_s16_8x7(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
                       &dgd1[2], &dgd1[3], &dgd1[4], &dgd1[5], &dgd1[6]);

          // Compute all elements from the combination of both columns (49
          // elements).
          compute_H_two_rows_win7(dgd0, dgd1, col0, col1, H_tmp);
        }
      }
    }
    dgd_avg += downsample_factor * dgd_avg_stride;
    src_avg += src_avg_stride;
  } while (--height != 0);

  // Transpose M_trn.
  acc_transpose_M(M, M_trn, 7, downsample_factor);

  // Copy the upper triangle of H into the lower one.
  copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor);
}

// This function computes two matrices: the cross-correlation between the src
// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H).
//
// M is of size 5 * 5. It needs to be filled such that multiplying one element
// from src with each element of a row of the wiener window will fill one
// column of M. However, this is not very convenient in terms of memory
// accesses, as it means we do contiguous loads of dgd but strided stores to M.
// As a result, we use an intermediate matrix M_trn which is instead filled
// such that one row of the wiener window gives one row of M_trn. Once fully
// computed, M_trn is then transposed to return M.
//
// H is of size 25 * 25. It is filled by multiplying every pair of elements of
// the wiener window together. Since it is a symmetric matrix, we only compute
// the upper triangle, and then copy it down to the lower one. Here we fill it
// by taking each different pair of columns, and multiplying all the elements of
// the first one with all the elements of the second one, with a special case
// when multiplying a column by itself.
static inline void compute_stats_win5_downsampled_sve(
    int16_t *dgd_avg, int dgd_avg_stride, int16_t *src_avg, int src_avg_stride,
    int width, int height, int64_t *M, int64_t *H, int downsample_factor) {
  const int wiener_win = 5;
  const int wiener_win2 = wiener_win * wiener_win;

  // Use a predicate to compute the last columns of the block for H.
  svbool_t pattern = svwhilelt_b16_u32(0, width % 8);

  // Use intermediate matrices for H and M to perform the computation; they
  // are accumulated into the original H and M at the end.
  int64_t M_trn[25];
  memset(M_trn, 0, sizeof(M_trn));

  int64_t H_tmp[25 * 25];
  memset(H_tmp, 0, sizeof(H_tmp));

  assert(height > 0);
  do {
    // Cross-correlation (M).
    for (int row = 0; row < wiener_win; row++) {
      int j = 0;
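      // As in the win7 case, the zero padding of src_avg means lanes past
      // the image edge contribute nothing to M, so no tail loop is needed.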
      while (j < width) {
        int16x8_t dgd[5];
        load_s16_8x5(dgd_avg + row * dgd_avg_stride + j, 1, &dgd[0], &dgd[1],
                     &dgd[2], &dgd[3], &dgd[4]);
        int16x8_t s = vld1q_s16(src_avg + j);

        // Compute all the elements of one row of M.
        compute_M_one_row_win5(s, dgd, M_trn, row);

        j += 8;
      }
    }

    // Auto-covariance (H).
    int j = 0;
    while (j <= width - 8) {
      for (int col0 = 0; col0 < wiener_win; col0++) {
        // Load first column.
        int16x8_t dgd0[5];
        load_s16_8x5(dgd_avg + j + col0, dgd_avg_stride, &dgd0[0], &dgd0[1],
                     &dgd0[2], &dgd0[3], &dgd0[4]);

        // Perform computation of the first column with itself (15 elements).
        // For the first column this will fill the upper triangle of the 5x5
        // matrix at the top left of the H matrix. For the next columns this
        // will fill the upper triangle of the other 5x5 matrices around H's
        // diagonal.
        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);

        // All computation next to the matrix diagonal has already been done.
        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
          // Load the second column; scaling by the downsample factor is
          // deferred to the final accumulation.
          int16x8_t dgd1[5];
          load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
                       &dgd1[2], &dgd1[3], &dgd1[4]);

          // Compute all elements from the combination of both columns (25
          // elements).
          compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
        }
      }
      j += 8;
    }

    // Process remaining columns using a predicate to discard excess elements.
    if (j < width) {
      for (int col0 = 0; col0 < wiener_win; col0++) {
        int16x8_t dgd0[5];
        dgd0[0] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 0 * dgd_avg_stride + j + col0));
        dgd0[1] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 1 * dgd_avg_stride + j + col0));
        dgd0[2] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 2 * dgd_avg_stride + j + col0));
        dgd0[3] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 3 * dgd_avg_stride + j + col0));
        dgd0[4] = svget_neonq_s16(
            svld1_s16(pattern, dgd_avg + 4 * dgd_avg_stride + j + col0));

        // Perform computation of the first column with itself (15 elements).
        // For the first column this will fill the upper triangle of the 5x5
        // matrix at the top left of the H matrix. For the next columns this
        // will fill the upper triangle of the other 5x5 matrices around H's
        // diagonal.
        compute_H_one_col(dgd0, col0, H_tmp, wiener_win, wiener_win2);

        // All computation next to the matrix diagonal has already been done.
        for (int col1 = col0 + 1; col1 < wiener_win; col1++) {
          // Load the second column; scaling by the downsample factor is
          // deferred to the final accumulation.
          int16x8_t dgd1[5];
          load_s16_8x5(dgd_avg + j + col1, dgd_avg_stride, &dgd1[0], &dgd1[1],
                       &dgd1[2], &dgd1[3], &dgd1[4]);

          // Compute all elements from the combination of both columns (25
          // elements).
          compute_H_two_rows_win5(dgd0, dgd1, col0, col1, H_tmp);
        }
      }
    }
    dgd_avg += downsample_factor * dgd_avg_stride;
    src_avg += src_avg_stride;
  } while (--height != 0);

  // Transpose M_trn.
  acc_transpose_M(M, M_trn, 5, downsample_factor);

  // Copy the upper triangle of H into the lower one.
  copy_upper_triangle(H, H_tmp, wiener_win2, downsample_factor);
}

static inline void av1_compute_stats_downsampled_sve(
    int wiener_win, const uint8_t *dgd, const uint8_t *src, int16_t *dgd_avg,
    int16_t *src_avg, int h_start, int h_end, int v_start, int v_end,
    int dgd_stride, int src_stride, int64_t *M, int64_t *H) {
  assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);

  const int wiener_win2 = wiener_win * wiener_win;
  const int wiener_halfwin = wiener_win >> 1;
  const int32_t width = h_end - h_start;
  const int32_t height = v_end - v_start;
  const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start];
  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
  memset(M, 0, sizeof(*M) * wiener_win * wiener_win);

  const uint8_t avg = find_average_sve(dgd_start, dgd_stride, width, height);
  const int downsample_factor = WIENER_STATS_DOWNSAMPLE_FACTOR;

  // dgd_avg and src_avg have been memset to zero before calling this
  // function, so round up the stride to the next multiple of 8 so that we
  // don't have to worry about a tail loop when computing M.
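  // For example, width = 100 and wiener_halfwin = 3 give
  // dgd_avg_stride = ((100 + 6) & ~7) + 8 = 112 and
  // src_avg_stride = (100 & ~7) + 8 = 104.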
  const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8;
  const int src_avg_stride = (width & ~7) + 8;

  // Compute (dgd - avg) and store it in dgd_avg.
  // The wiener window will slide along the dgd frame, centered on each pixel.
  // For the top left pixel and all the pixels on the side of the frame this
  // means half of the window will be outside of the frame. As such the actual
  // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
  // wider and 2 * wiener_halfwin higher than the original dgd buffer.
  const int vert_offset = v_start - wiener_halfwin;
  const int horiz_offset = h_start - wiener_halfwin;
  const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
  compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
                  width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1);

  // Compute (src - avg), downsample, and store in src_avg.
  const uint8_t *src_start = src + h_start + v_start * src_stride;
  compute_sub_avg(src_start, src_stride * downsample_factor, avg, src_avg,
                  src_avg_stride, width, height, downsample_factor);

  const int downsample_height = height / downsample_factor;

  // Since the height is not necessarily a multiple of the downsample factor,
  // the last line of src will be scaled according to how many rows remain.
  const int downsample_remainder = height % downsample_factor;
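  // For example, height = 10 with a downsample factor of 4 gives two full
  // passes scaled by 4, and a final single-row pass scaled by the
  // remainder 2.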

  if (downsample_height > 0) {
    if (wiener_win == WIENER_WIN) {
      compute_stats_win7_downsampled_sve(
          dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, width,
          downsample_height, M, H, downsample_factor);
    } else {
      compute_stats_win5_downsampled_sve(
          dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, width,
          downsample_height, M, H, downsample_factor);
    }
  }

  if (downsample_remainder > 0) {
    const int remainder_offset = height - downsample_remainder;
    if (wiener_win == WIENER_WIN) {
      compute_stats_win7_downsampled_sve(
          dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride,
          src_avg + downsample_height * src_avg_stride, src_avg_stride, width,
          1, M, H, downsample_remainder);
    } else {
      compute_stats_win5_downsampled_sve(
          dgd_avg + remainder_offset * dgd_avg_stride, dgd_avg_stride,
          src_avg + downsample_height * src_avg_stride, src_avg_stride, width,
          1, M, H, downsample_remainder);
    }
  }
}

void av1_compute_stats_sve(int wiener_win, const uint8_t *dgd,
                           const uint8_t *src, int16_t *dgd_avg,
                           int16_t *src_avg, int h_start, int h_end,
                           int v_start, int v_end, int dgd_stride,
                           int src_stride, int64_t *M, int64_t *H,
                           int use_downsampled_wiener_stats) {
  assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA);

  if (use_downsampled_wiener_stats) {
    av1_compute_stats_downsampled_sve(wiener_win, dgd, src, dgd_avg, src_avg,
                                      h_start, h_end, v_start, v_end,
                                      dgd_stride, src_stride, M, H);
    return;
  }

  const int wiener_win2 = wiener_win * wiener_win;
  const int wiener_halfwin = wiener_win >> 1;
  const int32_t width = h_end - h_start;
  const int32_t height = v_end - v_start;
  const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start];
  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);
  memset(M, 0, sizeof(*M) * wiener_win * wiener_win);

  const uint8_t avg = find_average_sve(dgd_start, dgd_stride, width, height);

  // dgd_avg and src_avg have been memset to zero before calling this
  // function, so round up the stride to the next multiple of 8 so that we
  // don't have to worry about a tail loop when computing M.
  const int dgd_avg_stride = ((width + 2 * wiener_halfwin) & ~7) + 8;
  const int src_avg_stride = (width & ~7) + 8;

  // Compute (dgd - avg) and store it in dgd_avg.
  // The wiener window will slide along the dgd frame, centered on each pixel.
  // For the top left pixel and all the pixels on the side of the frame this
  // means half of the window will be outside of the frame. As such the actual
  // buffer that we need to subtract the avg from will be 2 * wiener_halfwin
  // wider and 2 * wiener_halfwin higher than the original dgd buffer.
  const int vert_offset = v_start - wiener_halfwin;
  const int horiz_offset = h_start - wiener_halfwin;
  const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride;
  compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride,
                  width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1);

  // Compute (src - avg) and store in src_avg.
  const uint8_t *src_start = src + h_start + v_start * src_stride;
  compute_sub_avg(src_start, src_stride, avg, src_avg, src_avg_stride, width,
                  height, 1);

  if (wiener_win == WIENER_WIN) {
    compute_stats_win7_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride,
                           width, height, M, H);
  } else {
    compute_stats_win5_sve(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride,
                           width, height, M, H);
  }

  // H is a symmetric matrix, so we only need to fill out the upper triangle.
  // We can copy it down to the lower triangle outside the (i, j) loops.
  diagonal_copy_stats_neon(wiener_win2, H);
}
524