xref: /aosp_15_r20/external/libaom/aom_dsp/x86/variance_ssse3.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <stddef.h>
13 #include <stdint.h>
14 
15 #include "config/aom_config.h"
16 #include "config/aom_dsp_rtcd.h"
17 
18 #include "aom_dsp/aom_dsp_common.h"
19 
// Prototypes for the fixed-width, variable-height SSSE3 sub-pixel variance
// helpers (widths 4, 8 and 16). Per the original note these are declarations
// for functions defined in subpel_variance.asm; no definition appears in this
// file.
//
// The two trailing pointer parameters are unused placeholders kept for PIC
// enabled builds.
#define DECL(w, opt)                                                           \
  int aom_sub_pixel_variance##w##xh_##opt(                                     \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,    \
      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
      void *unused0, void *unused)

DECL(4, ssse3);
DECL(8, ssse3);
DECL(16, ssse3);

#undef DECL
35 
/*
 * FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) emits the definition of
 * aom_sub_pixel_variance<w>x<h>_<opt>().
 *
 * The generated function splits the w x h block into (w / wf) vertical strips
 * of width wf and calls aom_sub_pixel_variance<wf>xh_<opt>() on each strip in
 * bands of at most 64 rows. It accumulates the helper's return values into a
 * running sum and the helper's sse outputs into a running sse, stores the
 * total sse through sse_ptr, and returns
 *     sse - ((sum * sum) >> (wlog2 + hlog2)).
 *
 *   w, h         block width and height (also pasted into the function name)
 *   wf           helper width (pasted into the helper name)
 *   wlog2, hlog2 shift amounts; the instantiations in this file pass the
 *                base-2 logarithms of w and h
 *   opt          ISA suffix pasted into both names
 *   cast         cast applied to the sum before it is squared
 *   cast_prod    cast applied to the squared sum before it is shifted
 */
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                      \
  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                       \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
    /* Rows handed to one helper call; capped to avoid overflow in the */     \
    /* helper. */                                                             \
    const int rows_per_call = AOMMIN(h, 64);                                  \
    unsigned int total_sse = 0;                                               \
    int total_sum = 0;                                                        \
    for (int strip = 0; strip < (w) / (wf); ++strip) {                        \
      /* Top of the current wf-wide strip in each buffer. */                  \
      const uint8_t *src_row = src + strip * (wf);                            \
      const uint8_t *dst_row = dst + strip * (wf);                            \
      for (int band = 0; band < (h) / rows_per_call; ++band) {                \
        unsigned int band_sse;                                                \
        const int band_sum = aom_sub_pixel_variance##wf##xh_##opt(            \
            src_row, src_stride, x_offset, y_offset, dst_row, dst_stride,     \
            rows_per_call, &band_sse, NULL, NULL);                            \
        src_row += rows_per_call * src_stride;                                \
        dst_row += rows_per_call * dst_stride;                                \
        total_sum += band_sum;                                                \
        total_sse += band_sse;                                                \
      }                                                                       \
    }                                                                         \
    *sse_ptr = total_sse;                                                     \
    const unsigned int sum_sq_scaled =                                        \
        (unsigned int)(cast_prod(cast total_sum * total_sum) >>               \
                       (wlog2 + hlog2));                                      \
    return total_sse - sum_sq_scaled;                                         \
  }
63 
64 #if !CONFIG_REALTIME_ONLY
65 #define FNS(opt)                                    \
66   FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
67   FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
68   FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
69   FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
70   FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
71   FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
72   FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
73   FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
74   FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
75   FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
76   FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t))    \
77   FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t))     \
78   FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t))      \
79   FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t))      \
80   FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t))      \
81   FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))      \
82   FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t))     \
83   FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t))    \
84   FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t))    \
85   FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t))   \
86   FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t))   \
87   FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
88 #else
89 #define FNS(opt)                                    \
90   FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
91   FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
92   FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
93   FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
94   FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
95   FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
96   FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
97   FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
98   FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
99   FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
100   FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t))    \
101   FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t))     \
102   FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t))      \
103   FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t))      \
104   FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t))      \
105   FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
106 #endif
107 
108 FNS(ssse3)
109 
110 #undef FNS
111 #undef FN
112 
// Prototypes for the fixed-width, variable-height SSSE3 sub-pixel
// averaging-variance helpers (widths 4, 8 and 16). They take an extra `sec`
// buffer and its stride. No definition appears in this file.
//
// The two trailing pointer parameters are unused placeholders kept for PIC
// enabled builds.
#define DECL(w, opt)                                                        \
  int aom_sub_pixel_avg_variance##w##xh_##opt(                              \
      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec,         \
      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
      void *unused)

DECL(4, ssse3);
DECL(8, ssse3);
DECL(16, ssse3);

#undef DECL
128 
/*
 * FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) emits the definition of
 * aom_sub_pixel_avg_variance<w>x<h>_<opt>().
 *
 * The generated function splits the w x h block into (w / wf) vertical strips
 * of width wf and calls aom_sub_pixel_avg_variance<wf>xh_<opt>() on each strip
 * in bands of at most 64 rows. The `sec` buffer is walked with a stride of w.
 * It accumulates the helper's return values into a running sum and the
 * helper's sse outputs into a running sse, stores the total sse through
 * sse_ptr, and returns
 *     sse - ((sum * sum) >> (wlog2 + hlog2)).
 *
 *   w, h         block width and height (also pasted into the function name)
 *   wf           helper width (pasted into the helper name)
 *   wlog2, hlog2 shift amounts; the instantiations in this file pass the
 *                base-2 logarithms of w and h
 *   opt          ISA suffix pasted into both names
 *   cast         cast applied to the sum before it is squared
 *   cast_prod    cast applied to the squared sum before it is shifted
 */
#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                     \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                  \
      const uint8_t *src, int src_stride, int x_offset, int y_offset,        \
      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,             \
      const uint8_t *sec) {                                                  \
    /* Rows handed to one helper call; capped to avoid overflow in the */    \
    /* helper. */                                                            \
    const int rows_per_call = AOMMIN(h, 64);                                 \
    unsigned int total_sse = 0;                                              \
    int total_sum = 0;                                                       \
    for (int strip = 0; strip < (w) / (wf); ++strip) {                       \
      /* Top of the current wf-wide strip in each buffer. */                 \
      const uint8_t *src_row = src + strip * (wf);                           \
      const uint8_t *dst_row = dst + strip * (wf);                           \
      const uint8_t *sec_row = sec + strip * (wf);                           \
      for (int band = 0; band < (h) / rows_per_call; ++band) {               \
        unsigned int band_sse;                                               \
        const int band_sum = aom_sub_pixel_avg_variance##wf##xh_##opt(       \
            src_row, src_stride, x_offset, y_offset, dst_row, dst_stride,    \
            sec_row, w, rows_per_call, &band_sse, NULL, NULL);               \
        src_row += rows_per_call * src_stride;                               \
        dst_row += rows_per_call * dst_stride;                               \
        sec_row += rows_per_call * (w);                                      \
        total_sum += band_sum;                                               \
        total_sse += band_sse;                                               \
      }                                                                      \
    }                                                                        \
    *sse_ptr = total_sse;                                                    \
    const unsigned int sum_sq_scaled =                                       \
        (unsigned int)(cast_prod(cast total_sum * total_sum) >>              \
                       (wlog2 + hlog2));                                     \
    return total_sse - sum_sq_scaled;                                        \
  }
160 
161 #if !CONFIG_REALTIME_ONLY
162 #define FNS(opt)                                    \
163   FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
164   FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
165   FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
166   FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
167   FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
168   FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
169   FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
170   FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
171   FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
172   FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
173   FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t))   \
174   FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t))    \
175   FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t))     \
176   FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t))     \
177   FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t))     \
178   FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))     \
179   FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t))     \
180   FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t))    \
181   FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t))    \
182   FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t))   \
183   FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t))   \
184   FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
185 #else
186 #define FNS(opt)                                    \
187   FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
188   FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
189   FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
190   FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
191   FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
192   FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
193   FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
194   FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
195   FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
196   FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
197   FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t))   \
198   FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t))    \
199   FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t))     \
200   FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t))     \
201   FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t))     \
202   FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
203 #endif
204 
205 FNS(ssse3)
206 
207 #undef FNS
208 #undef FN
209