// xref: /aosp_15_r20/external/libaom/aom_dsp/arm/sum_squares_sve.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <arm_neon.h>
13 
14 #include "aom_dsp/arm/aom_neon_sve_bridge.h"
15 #include "aom_dsp/arm/mem_neon.h"
16 #include "config/aom_config.h"
17 #include "config/aom_dsp_rtcd.h"
18 
// Sum of squared samples for a 4-wide block. Two rows are packed into one
// 128-bit vector per iteration, so `height` must be even and non-zero.
static inline uint64_t aom_sum_squares_2d_i16_4xh_sve(const int16_t *src,
                                                      int stride, int height) {
  int64x2_t acc = vdupq_n_s64(0);

  do {
    // Combine two 4-element rows into a single 8-lane vector.
    int16x8_t rows = vcombine_s16(vld1_s16(src), vld1_s16(src + stride));

    // Accumulate the dot product of the vector with itself (sum of squares).
    acc = aom_sdotq_s16(acc, rows, rows);

    src += 2 * stride;
    height -= 2;
  } while (height != 0);

  return (uint64_t)vaddvq_s64(acc);
}
34 
// Sum of squared samples for an 8-wide block. Two rows per iteration, each
// feeding its own accumulator to expose instruction-level parallelism.
// `height` must be even and non-zero.
static inline uint64_t aom_sum_squares_2d_i16_8xh_sve(const int16_t *src,
                                                      int stride, int height) {
  int64x2_t acc0 = vdupq_n_s64(0);
  int64x2_t acc1 = vdupq_n_s64(0);

  do {
    int16x8_t row0 = vld1q_s16(src);
    int16x8_t row1 = vld1q_s16(src + stride);

    acc0 = aom_sdotq_s16(acc0, row0, row0);
    acc1 = aom_sdotq_s16(acc1, row1, row1);

    src += 2 * stride;
    height -= 2;
  } while (height != 0);

  // Merge the two partial sums and reduce across lanes.
  return (uint64_t)vaddvq_s64(vaddq_s64(acc0, acc1));
}
53 
// Sum of squared samples for blocks whose width is a multiple of 16. One row
// per outer iteration, 16 samples per inner step, with two accumulators to
// keep the dot-product pipeline busy. `width` and `height` must be non-zero.
static inline uint64_t aom_sum_squares_2d_i16_large_sve(const int16_t *src,
                                                        int stride, int width,
                                                        int height) {
  int64x2_t acc0 = vdupq_n_s64(0);
  int64x2_t acc1 = vdupq_n_s64(0);

  do {
    const int16_t *row = src;
    int remaining = width;
    do {
      int16x8_t lo = vld1q_s16(row);
      int16x8_t hi = vld1q_s16(row + 8);

      acc0 = aom_sdotq_s16(acc0, lo, lo);
      acc1 = aom_sdotq_s16(acc1, hi, hi);

      row += 16;
      remaining -= 16;
    } while (remaining != 0);

    src += stride;
  } while (--height != 0);

  return (uint64_t)vaddvq_s64(vaddq_s64(acc0, acc1));
}
79 
// Sum of squared samples for widths not handled by the fixed-width kernels.
// Uses a scalable SVE vector with a predicate to mask off lanes beyond
// `width` on the final partial vector of each row.
static inline uint64_t aom_sum_squares_2d_i16_wxh_sve(const int16_t *src,
                                                      int stride, int width,
                                                      int height) {
  svint64_t sum_squares = svdup_n_s64(0);
  // Number of 16-bit lanes in one SVE vector (vector-length dependent).
  uint64_t step = svcnth();

  do {
    const int16_t *src_ptr = src;
    int w = 0;
    do {
      // Active for lanes [w, width); svld1 yields zero in inactive lanes, so
      // the tail beyond `width` contributes nothing to the accumulator.
      svbool_t pred = svwhilelt_b16_u32(w, width);
      svint16_t s0 = svld1_s16(pred, src_ptr);

      sum_squares = svdot_s64(sum_squares, s0, s0);

      src_ptr += step;
      w += step;
    } while (w < width);

    src += stride;
  } while (--height != 0);

  // Horizontal reduction across all 64-bit lanes.
  return (uint64_t)svaddv_s64(svptrue_b64(), sum_squares);
}
104 
// Dispatch to the best sum-of-squares kernel for the given block width.
uint64_t aom_sum_squares_2d_i16_sve(const int16_t *src, int stride, int width,
                                    int height) {
  switch (width) {
    case 4: return aom_sum_squares_2d_i16_4xh_sve(src, stride, height);
    case 8: return aom_sum_squares_2d_i16_8xh_sve(src, stride, height);
    default:
      if (width % 16 == 0) {
        return aom_sum_squares_2d_i16_large_sve(src, stride, width, height);
      }
      // Irregular widths go through the predicated SVE fallback.
      return aom_sum_squares_2d_i16_wxh_sve(src, stride, width, height);
  }
}
118 
// Sum of squared samples over a flat array of n elements. The SIMD path
// consumes 32 samples per iteration into four independent accumulators;
// other sizes fall back to the C reference implementation.
uint64_t aom_sum_squares_i16_sve(const int16_t *src, uint32_t n) {
  // This function seems to be called only for values of N >= 64. See
  // av1/encoder/compound_type.c. Additionally, because N = width x height for
  // width and height between the standard block sizes, N will also be a
  // multiple of 64.
  if (LIKELY(n % 64 == 0)) {
    int64x2_t acc0 = vdupq_n_s64(0);
    int64x2_t acc1 = vdupq_n_s64(0);
    int64x2_t acc2 = vdupq_n_s64(0);
    int64x2_t acc3 = vdupq_n_s64(0);

    do {
      int16x8_t s0 = vld1q_s16(src + 0);
      int16x8_t s1 = vld1q_s16(src + 8);
      int16x8_t s2 = vld1q_s16(src + 16);
      int16x8_t s3 = vld1q_s16(src + 24);

      acc0 = aom_sdotq_s16(acc0, s0, s0);
      acc1 = aom_sdotq_s16(acc1, s1, s1);
      acc2 = aom_sdotq_s16(acc2, s2, s2);
      acc3 = aom_sdotq_s16(acc3, s3, s3);

      src += 32;
      n -= 32;
    } while (n != 0);

    // Pairwise tree reduction of the four accumulators.
    int64x2_t total = vaddq_s64(vaddq_s64(acc0, acc1), vaddq_s64(acc2, acc3));
    return vaddvq_s64(total);
  }
  return aom_sum_squares_i16_c(src, n);
}
150 
// Joint sum and sum-of-squared-errors for a 4-wide block; two rows per
// iteration. The running sample sum is added into *sum; the squared sum is
// returned. `height` must be even and non-zero.
static inline uint64_t aom_sum_sse_2d_i16_4xh_sve(const int16_t *src,
                                                  int stride, int height,
                                                  int *sum) {
  int64x2_t sse_acc = vdupq_n_s64(0);
  int32x4_t sum_acc = vdupq_n_s32(0);

  do {
    int16x8_t rows = vcombine_s16(vld1_s16(src), vld1_s16(src + stride));

    sse_acc = aom_sdotq_s16(sse_acc, rows, rows);

    // Widening pairwise accumulate keeps the running sum in 32 bits.
    sum_acc = vpadalq_s16(sum_acc, rows);

    src += 2 * stride;
    height -= 2;
  } while (height != 0);

  *sum += vaddvq_s32(sum_acc);
  return vaddvq_s64(sse_acc);
}
171 
// Joint sum and SSE for an 8-wide block; two rows per iteration, each row
// with its own accumulator pair. *sum receives the sample sum; the squared
// sum is returned. `height` must be even and non-zero.
static inline uint64_t aom_sum_sse_2d_i16_8xh_sve(const int16_t *src,
                                                  int stride, int height,
                                                  int *sum) {
  int64x2_t sse0 = vdupq_n_s64(0);
  int64x2_t sse1 = vdupq_n_s64(0);
  int32x4_t sum0 = vdupq_n_s32(0);
  int32x4_t sum1 = vdupq_n_s32(0);

  do {
    int16x8_t row0 = vld1q_s16(src);
    int16x8_t row1 = vld1q_s16(src + stride);

    sse0 = aom_sdotq_s16(sse0, row0, row0);
    sse1 = aom_sdotq_s16(sse1, row1, row1);

    sum0 = vpadalq_s16(sum0, row0);
    sum1 = vpadalq_s16(sum1, row1);

    src += 2 * stride;
    height -= 2;
  } while (height != 0);

  *sum += vaddvq_s32(vaddq_s32(sum0, sum1));
  return vaddvq_s64(vaddq_s64(sse0, sse1));
}
195 
// Joint sum and SSE for blocks whose width is a multiple of 16. One row per
// outer iteration, 16 samples per inner step, two accumulator pairs.
static inline uint64_t aom_sum_sse_2d_i16_16xh_sve(const int16_t *src,
                                                   int stride, int width,
                                                   int height, int *sum) {
  int64x2_t sse_lo = vdupq_n_s64(0);
  int64x2_t sse_hi = vdupq_n_s64(0);
  int32x4_t sum_lo = vdupq_n_s32(0);
  int32x4_t sum_hi = vdupq_n_s32(0);

  do {
    const int16_t *row = src;
    int remaining = width;
    do {
      int16x8_t s0 = vld1q_s16(row);
      int16x8_t s1 = vld1q_s16(row + 8);

      sse_lo = aom_sdotq_s16(sse_lo, s0, s0);
      sse_hi = aom_sdotq_s16(sse_hi, s1, s1);

      sum_lo = vpadalq_s16(sum_lo, s0);
      sum_hi = vpadalq_s16(sum_hi, s1);

      row += 16;
      remaining -= 16;
    } while (remaining > 0);

    src += stride;
  } while (--height != 0);

  *sum += vaddvq_s32(vaddq_s32(sum_lo, sum_hi));
  return vaddvq_s64(vaddq_s64(sse_lo, sse_hi));
}
223 
// Dispatch joint sum+SSE on block width; widths that fit none of the SIMD
// kernels fall back to the C reference implementation.
uint64_t aom_sum_sse_2d_i16_sve(const int16_t *src, int stride, int width,
                                int height, int *sum) {
  if (width == 4) {
    return aom_sum_sse_2d_i16_4xh_sve(src, stride, height, sum);
  }
  if (width == 8) {
    return aom_sum_sse_2d_i16_8xh_sve(src, stride, height, sum);
  }
  if (width % 16 == 0) {
    return aom_sum_sse_2d_i16_16xh_sve(src, stride, width, height, sum);
  }
  return aom_sum_sse_2d_i16_c(src, stride, width, height, sum);
}
240 
241 #if CONFIG_AV1_HIGHBITDEPTH
// Variance-style reduction (SSE - sum^2/N) for a 4-wide high-bitdepth block.
// Two rows are packed into one vector per iteration; `height` must be even.
static inline uint64_t aom_var_2d_u16_4xh_sve(uint8_t *src, int src_stride,
                                              int width, int height) {
  // Highbd frame buffers store uint16_t samples behind a uint8_t pointer.
  uint16_t *data = CONVERT_TO_SHORTPTR(src);
  uint32x4_t sum_acc = vdupq_n_u32(0);
  uint64x2_t sse_acc = vdupq_n_u64(0);

  int rows = height;
  do {
    uint16x8_t s = vcombine_u16(vld1_u16(data), vld1_u16(data + src_stride));

    sum_acc = vpadalq_u16(sum_acc, s);
    sse_acc = aom_udotq_u16(sse_acc, s, s);

    data += 2 * src_stride;
    rows -= 2;
  } while (rows != 0);

  uint64_t sum = vaddlvq_u32(sum_acc);
  uint64_t sse = vaddvq_u64(sse_acc);

  return sse - sum * sum / (width * height);
}
268 
// Variance-style reduction (SSE - sum^2/N) for high-bitdepth blocks whose
// width is a multiple of 8. One row per outer iteration, 8 samples per step.
static inline uint64_t aom_var_2d_u16_8xh_sve(uint8_t *src, int src_stride,
                                              int width, int height) {
  // Highbd frame buffers store uint16_t samples behind a uint8_t pointer.
  uint16_t *data = CONVERT_TO_SHORTPTR(src);
  uint32x4_t sum_acc = vdupq_n_u32(0);
  uint64x2_t sse_acc = vdupq_n_u64(0);

  int rows = height;
  do {
    uint16_t *row = data;
    int remaining = width;
    do {
      uint16x8_t s = vld1q_u16(row);

      sum_acc = vpadalq_u16(sum_acc, s);
      sse_acc = aom_udotq_u16(sse_acc, s, s);

      row += 8;
      remaining -= 8;
    } while (remaining != 0);

    data += src_stride;
  } while (--rows != 0);

  uint64_t sum = vaddlvq_u32(sum_acc);
  uint64_t sse = vaddvq_u64(sse_acc);

  return sse - sum * sum / (width * height);
}
300 
// Variance-style reduction (SSE - sum^2/N) for high-bitdepth blocks whose
// width is a multiple of 16, using two accumulator pairs for ILP.
static inline uint64_t aom_var_2d_u16_16xh_sve(uint8_t *src, int src_stride,
                                               int width, int height) {
  // Highbd frame buffers store uint16_t samples behind a uint8_t pointer.
  uint16_t *data = CONVERT_TO_SHORTPTR(src);
  uint32x4_t sum_lo = vdupq_n_u32(0);
  uint32x4_t sum_hi = vdupq_n_u32(0);
  uint64x2_t sse_lo = vdupq_n_u64(0);
  uint64x2_t sse_hi = vdupq_n_u64(0);

  int rows = height;
  do {
    uint16_t *row = data;
    int remaining = width;
    do {
      uint16x8_t s0 = vld1q_u16(row);
      uint16x8_t s1 = vld1q_u16(row + 8);

      sum_lo = vpadalq_u16(sum_lo, s0);
      sum_hi = vpadalq_u16(sum_hi, s1);

      sse_lo = aom_udotq_u16(sse_lo, s0, s0);
      sse_hi = aom_udotq_u16(sse_hi, s1, s1);

      row += 16;
      remaining -= 16;
    } while (remaining != 0);

    data += src_stride;
  } while (--rows != 0);

  uint64_t sum = vaddlvq_u32(vaddq_u32(sum_lo, sum_hi));
  uint64_t sse = vaddvq_u64(vaddq_u64(sse_lo, sse_hi));

  return sse - sum * sum / (width * height);
}
338 
// Variance-style reduction (SSE - sum^2/N) for high-bitdepth blocks whose
// width is a multiple of 32. Four independent accumulator pairs hide the
// dot-product latency.
static inline uint64_t aom_var_2d_u16_large_sve(uint8_t *src, int src_stride,
                                                int width, int height) {
  // Highbd frame buffers store uint16_t samples behind a uint8_t pointer.
  uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
  uint64_t sum = 0;
  uint64_t sse = 0;
  uint32x4_t sum_u32[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
                            vdupq_n_u32(0) };
  uint64x2_t sse_u64[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0),
                            vdupq_n_u64(0) };

  int h = height;
  do {
    int w = width;
    uint16_t *src_ptr = src_u16;
    do {
      // 32 samples per iteration, split across four 8-lane vectors.
      uint16x8_t s0 = vld1q_u16(src_ptr);
      uint16x8_t s1 = vld1q_u16(src_ptr + 8);
      uint16x8_t s2 = vld1q_u16(src_ptr + 16);
      uint16x8_t s3 = vld1q_u16(src_ptr + 24);

      // Widening pairwise accumulate keeps the running sums in 32 bits.
      sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
      sum_u32[1] = vpadalq_u16(sum_u32[1], s1);
      sum_u32[2] = vpadalq_u16(sum_u32[2], s2);
      sum_u32[3] = vpadalq_u16(sum_u32[3], s3);

      // Self dot product accumulates the squared samples into 64 bits.
      sse_u64[0] = aom_udotq_u16(sse_u64[0], s0, s0);
      sse_u64[1] = aom_udotq_u16(sse_u64[1], s1, s1);
      sse_u64[2] = aom_udotq_u16(sse_u64[2], s2, s2);
      sse_u64[3] = aom_udotq_u16(sse_u64[3], s3, s3);

      src_ptr += 32;
      w -= 32;
    } while (w != 0);

    src_u16 += src_stride;
  } while (--h != 0);

  // Pairwise tree reduction of the four partial sums, then reduce to scalar.
  sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[1]);
  sum_u32[2] = vaddq_u32(sum_u32[2], sum_u32[3]);
  sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[2]);
  sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[1]);
  sse_u64[2] = vaddq_u64(sse_u64[2], sse_u64[3]);
  sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[2]);

  sum += vaddlvq_u32(sum_u32[0]);
  sse += vaddvq_u64(sse_u64[0]);

  return sse - sum * sum / (width * height);
}
388 
// Dispatch the high-bitdepth variance helper on block width; widths not
// covered by an SVE kernel fall back to the NEON implementation.
uint64_t aom_var_2d_u16_sve(uint8_t *src, int src_stride, int width,
                            int height) {
  switch (width) {
    case 4: return aom_var_2d_u16_4xh_sve(src, src_stride, width, height);
    case 8: return aom_var_2d_u16_8xh_sve(src, src_stride, width, height);
    case 16: return aom_var_2d_u16_16xh_sve(src, src_stride, width, height);
    default:
      if (width % 32 == 0) {
        return aom_var_2d_u16_large_sve(src, src_stride, width, height);
      }
      return aom_var_2d_u16_neon(src, src_stride, width, height);
  }
}
405 #endif  // CONFIG_AV1_HIGHBITDEPTH
406