// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/loop_restoration.h"
#include "src/utils/cpu.h"

#if LIBGAV1_TARGETING_SSE4_1
#include <smmintrin.h>

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

#include "src/dsp/common.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"

namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {

inline void WienerHorizontalClip(const __m128i s[2], const __m128i s_3x128,
                                 int16_t* const wiener_buffer) {
  constexpr int offset =
      1 << (8 + kWienerFilterBits - kInterRoundBitsHorizontal - 1);
  constexpr int limit =
      (1 << (8 + 1 + kWienerFilterBits - kInterRoundBitsHorizontal)) - 1;
  const __m128i offsets = _mm_set1_epi16(-offset);
  const __m128i limits = _mm_set1_epi16(limit - offset);
  // The sum range here is [-128 * 255 + 4, 90 * 255 + 4].
  const __m128i sum = _mm_add_epi16(s[0], s[1]);
  const __m128i rounded_sum0 = _mm_srai_epi16(sum, kInterRoundBitsHorizontal);
  // Add back scaled down offset correction.
  const __m128i rounded_sum1 = _mm_add_epi16(rounded_sum0, s_3x128);
  const __m128i d0 = _mm_max_epi16(rounded_sum1, offsets);
  const __m128i d1 = _mm_min_epi16(d0, limits);
  StoreAligned16(wiener_buffer, d1);
}

inline void WienerHorizontalTap7Kernel(const __m128i s[4],
                                       const __m128i filter[4],
                                       int16_t* const wiener_buffer) {
  __m128i madds[4];
  madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
  madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
  madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
  madds[3] = _mm_maddubs_epi16(s[3], filter[3]);
  madds[0] = _mm_add_epi16(madds[0], madds[2]);
  madds[1] = _mm_add_epi16(madds[1], madds[3]);
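  // Extract the pixel under the center tap (the high byte of each 16-bit pair
  // in s[1]) and scale it by 128 >> kInterRoundBitsHorizontal, adding back the
  // 128 that was subtracted from the center coefficient.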
  const __m128i s_3x128 =
      _mm_slli_epi16(_mm_srli_epi16(s[1], 8), 7 - kInterRoundBitsHorizontal);
  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
}

inline void WienerHorizontalTap5Kernel(const __m128i s[5],
                                       const __m128i filter[3],
                                       int16_t* const wiener_buffer) {
  __m128i madds[3];
  madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
  madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
  madds[2] = _mm_maddubs_epi16(s[2], filter[2]);
  madds[0] = _mm_add_epi16(madds[0], madds[2]);
  const __m128i s_3x128 =
      _mm_srli_epi16(_mm_slli_epi16(s[1], 8), kInterRoundBitsHorizontal + 1);
  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
}

inline void WienerHorizontalTap3Kernel(const __m128i s[2],
                                       const __m128i filter[2],
                                       int16_t* const wiener_buffer) {
  __m128i madds[2];
  madds[0] = _mm_maddubs_epi16(s[0], filter[0]);
  madds[1] = _mm_maddubs_epi16(s[1], filter[1]);
  const __m128i s_3x128 =
      _mm_slli_epi16(_mm_srli_epi16(s[0], 8), 7 - kInterRoundBitsHorizontal);
  WienerHorizontalClip(madds, s_3x128, wiener_buffer);
}

// loading all and unpacking is about 7% faster than using _mm_alignr_epi8().
inline void WienerHorizontalTap7(const uint8_t* src, const ptrdiff_t src_stride,
                                 const ptrdiff_t width, const int height,
                                 const int coefficient0,
                                 const __m128i coefficients,
                                 int16_t** const wiener_buffer) {
  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
  __m128i filter[4];
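  // _mm_maddubs_epi16() works on interleaved byte pairs, so each filter
  // register holds a repeated pair of 8-bit taps: (c0, c1) for pixels 0/1,
  // (c2, c3 - 128) for pixels 2/3, (c2, c1) for the mirrored pixels 4/5, and
  // (c0, 1) for pixel 6 and the rounding constant.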
  filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0200));
  filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
  filter[2] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0204));
  filter[3] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient0));
  for (int y = height; y != 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i s[7], ss[4];
      s[0] = LoadUnaligned16(src + x + 0);
      s[1] = LoadUnaligned16(src + x + 1);
      s[2] = LoadUnaligned16(src + x + 2);
      s[3] = LoadUnaligned16(src + x + 3);
      s[4] = LoadUnaligned16(src + x + 4);
      s[5] = LoadUnaligned16(src + x + 5);
      s[6] = LoadUnaligned16(src + x + 6);
      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
      ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
      ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
      ss[3] = _mm_unpacklo_epi8(s[6], round);
      WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 0);
      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
      ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
      ss[2] = _mm_unpackhi_epi8(s[4], s[5]);
      ss[3] = _mm_unpackhi_epi8(s[6], round);
      WienerHorizontalTap7Kernel(ss, filter, *wiener_buffer + x + 8);
      x += 16;
    } while (x < width);
    src += src_stride;
    *wiener_buffer += width;
  }
}

inline void WienerHorizontalTap5(const uint8_t* src, const ptrdiff_t src_stride,
                                 const ptrdiff_t width, const int height,
                                 const int coefficient1,
                                 const __m128i coefficients,
                                 int16_t** const wiener_buffer) {
  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
  __m128i filter[3];
  filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0402));
  filter[1] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0406));
  filter[2] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient1));
  for (int y = height; y != 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i s[5], ss[3];
      s[0] = LoadUnaligned16(src + x + 0);
      s[1] = LoadUnaligned16(src + x + 1);
      s[2] = LoadUnaligned16(src + x + 2);
      s[3] = LoadUnaligned16(src + x + 3);
      s[4] = LoadUnaligned16(src + x + 4);
      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
      ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
      ss[2] = _mm_unpacklo_epi8(s[4], round);
      WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 0);
      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
      ss[1] = _mm_unpackhi_epi8(s[2], s[3]);
      ss[2] = _mm_unpackhi_epi8(s[4], round);
      WienerHorizontalTap5Kernel(ss, filter, *wiener_buffer + x + 8);
      x += 16;
    } while (x < width);
    src += src_stride;
    *wiener_buffer += width;
  }
}

inline void WienerHorizontalTap3(const uint8_t* src, const ptrdiff_t src_stride,
                                 const ptrdiff_t width, const int height,
                                 const int coefficient2,
                                 const __m128i coefficients,
                                 int16_t** const wiener_buffer) {
  const __m128i round = _mm_set1_epi8(1 << (kInterRoundBitsHorizontal - 1));
  __m128i filter[2];
  filter[0] = _mm_shuffle_epi8(coefficients, _mm_set1_epi16(0x0604));
  filter[1] = _mm_set1_epi16((1 << 8) | static_cast<uint8_t>(coefficient2));
  for (int y = height; y != 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i s[3], ss[2];
      s[0] = LoadUnaligned16(src + x + 0);
      s[1] = LoadUnaligned16(src + x + 1);
      s[2] = LoadUnaligned16(src + x + 2);
      ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
      ss[1] = _mm_unpacklo_epi8(s[2], round);
      WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 0);
      ss[0] = _mm_unpackhi_epi8(s[0], s[1]);
      ss[1] = _mm_unpackhi_epi8(s[2], round);
      WienerHorizontalTap3Kernel(ss, filter, *wiener_buffer + x + 8);
      x += 16;
    } while (x < width);
    src += src_stride;
    *wiener_buffer += width;
  }
}

inline void WienerHorizontalTap1(const uint8_t* src, const ptrdiff_t src_stride,
                                 const ptrdiff_t width, const int height,
                                 int16_t** const wiener_buffer) {
  for (int y = height; y != 0; --y) {
    ptrdiff_t x = 0;
    do {
      const __m128i s = LoadUnaligned16(src + x);
      const __m128i s0 = _mm_unpacklo_epi8(s, _mm_setzero_si128());
      const __m128i s1 = _mm_unpackhi_epi8(s, _mm_setzero_si128());
      const __m128i d0 = _mm_slli_epi16(s0, 4);
      const __m128i d1 = _mm_slli_epi16(s1, 4);
      StoreAligned16(*wiener_buffer + x + 0, d0);
      StoreAligned16(*wiener_buffer + x + 8, d1);
      x += 16;
    } while (x < width);
    src += src_stride;
    *wiener_buffer += width;
  }
}

inline __m128i WienerVertical7(const __m128i a[2], const __m128i filter[2]) {
  const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1));
  const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
  const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
  const __m128i sum0 = _mm_add_epi32(round, madd0);
  const __m128i sum1 = _mm_add_epi32(sum0, madd1);
  return _mm_srai_epi32(sum1, kInterRoundBitsVertical);
}

inline __m128i WienerVertical5(const __m128i a[2], const __m128i filter[2]) {
  const __m128i madd0 = _mm_madd_epi16(a[0], filter[0]);
  const __m128i madd1 = _mm_madd_epi16(a[1], filter[1]);
  const __m128i sum = _mm_add_epi32(madd0, madd1);
  return _mm_srai_epi32(sum, kInterRoundBitsVertical);
}

inline __m128i WienerVertical3(const __m128i a, const __m128i filter) {
  const __m128i round = _mm_set1_epi32(1 << (kInterRoundBitsVertical - 1));
  const __m128i madd = _mm_madd_epi16(a, filter);
  const __m128i sum = _mm_add_epi32(round, madd);
  return _mm_srai_epi32(sum, kInterRoundBitsVertical);
}

inline __m128i WienerVerticalFilter7(const __m128i a[7],
                                     const __m128i filter[2]) {
  __m128i b[2];
  const __m128i a06 = _mm_add_epi16(a[0], a[6]);
  const __m128i a15 = _mm_add_epi16(a[1], a[5]);
  const __m128i a24 = _mm_add_epi16(a[2], a[4]);
  b[0] = _mm_unpacklo_epi16(a06, a15);
  b[1] = _mm_unpacklo_epi16(a24, a[3]);
  const __m128i sum0 = WienerVertical7(b, filter);
  b[0] = _mm_unpackhi_epi16(a06, a15);
  b[1] = _mm_unpackhi_epi16(a24, a[3]);
  const __m128i sum1 = WienerVertical7(b, filter);
  return _mm_packs_epi32(sum0, sum1);
}

inline __m128i WienerVerticalFilter5(const __m128i a[5],
                                     const __m128i filter[2]) {
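  // The rounding constant is interleaved with the center samples, so the 1
  // packed into the upper half of filter[1] adds it during _mm_madd_epi16();
  // WienerVertical5() therefore needs no separate rounding add, unlike
  // WienerVertical7() and WienerVertical3().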
  const __m128i round = _mm_set1_epi16(1 << (kInterRoundBitsVertical - 1));
  __m128i b[2];
  const __m128i a04 = _mm_add_epi16(a[0], a[4]);
  const __m128i a13 = _mm_add_epi16(a[1], a[3]);
  b[0] = _mm_unpacklo_epi16(a04, a13);
  b[1] = _mm_unpacklo_epi16(a[2], round);
  const __m128i sum0 = WienerVertical5(b, filter);
  b[0] = _mm_unpackhi_epi16(a04, a13);
  b[1] = _mm_unpackhi_epi16(a[2], round);
  const __m128i sum1 = WienerVertical5(b, filter);
  return _mm_packs_epi32(sum0, sum1);
}

inline __m128i WienerVerticalFilter3(const __m128i a[3], const __m128i filter) {
  __m128i b;
  const __m128i a02 = _mm_add_epi16(a[0], a[2]);
  b = _mm_unpacklo_epi16(a02, a[1]);
  const __m128i sum0 = WienerVertical3(b, filter);
  b = _mm_unpackhi_epi16(a02, a[1]);
  const __m128i sum1 = WienerVertical3(b, filter);
  return _mm_packs_epi32(sum0, sum1);
}

inline __m128i WienerVerticalTap7Kernel(const int16_t* wiener_buffer,
                                        const ptrdiff_t wiener_stride,
                                        const __m128i filter[2], __m128i a[7]) {
  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
  a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
  a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
  a[6] = LoadAligned16(wiener_buffer + 6 * wiener_stride);
  return WienerVerticalFilter7(a, filter);
}

inline __m128i WienerVerticalTap5Kernel(const int16_t* wiener_buffer,
                                        const ptrdiff_t wiener_stride,
                                        const __m128i filter[2], __m128i a[5]) {
  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
  a[4] = LoadAligned16(wiener_buffer + 4 * wiener_stride);
  return WienerVerticalFilter5(a, filter);
}

inline __m128i WienerVerticalTap3Kernel(const int16_t* wiener_buffer,
                                        const ptrdiff_t wiener_stride,
                                        const __m128i filter, __m128i a[3]) {
  a[0] = LoadAligned16(wiener_buffer + 0 * wiener_stride);
  a[1] = LoadAligned16(wiener_buffer + 1 * wiener_stride);
  a[2] = LoadAligned16(wiener_buffer + 2 * wiener_stride);
  return WienerVerticalFilter3(a, filter);
}

inline void WienerVerticalTap7Kernel2(const int16_t* wiener_buffer,
                                      const ptrdiff_t wiener_stride,
                                      const __m128i filter[2], __m128i d[2]) {
  __m128i a[8];
  d[0] = WienerVerticalTap7Kernel(wiener_buffer, wiener_stride, filter, a);
  a[7] = LoadAligned16(wiener_buffer + 7 * wiener_stride);
  d[1] = WienerVerticalFilter7(a + 1, filter);
}

inline void WienerVerticalTap5Kernel2(const int16_t* wiener_buffer,
                                      const ptrdiff_t wiener_stride,
                                      const __m128i filter[2], __m128i d[2]) {
  __m128i a[6];
  d[0] = WienerVerticalTap5Kernel(wiener_buffer, wiener_stride, filter, a);
  a[5] = LoadAligned16(wiener_buffer + 5 * wiener_stride);
  d[1] = WienerVerticalFilter5(a + 1, filter);
}

inline void WienerVerticalTap3Kernel2(const int16_t* wiener_buffer,
                                      const ptrdiff_t wiener_stride,
                                      const __m128i filter, __m128i d[2]) {
  __m128i a[4];
  d[0] = WienerVerticalTap3Kernel(wiener_buffer, wiener_stride, filter, a);
  a[3] = LoadAligned16(wiener_buffer + 3 * wiener_stride);
  d[1] = WienerVerticalFilter3(a + 1, filter);
}

inline void WienerVerticalTap7(const int16_t* wiener_buffer,
                               const ptrdiff_t width, const int height,
                               const int16_t coefficients[4], uint8_t* dst,
                               const ptrdiff_t dst_stride) {
  const __m128i c = LoadLo8(coefficients);
  __m128i filter[2];
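  // Broadcast the 32-bit (c0, c1) and (c2, c3) coefficient pairs so they line
  // up with the 16-bit sample pairs fed to _mm_madd_epi16().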
  filter[0] = _mm_shuffle_epi32(c, 0x0);
  filter[1] = _mm_shuffle_epi32(c, 0x55);
  for (int y = height >> 1; y > 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i d[2][2];
      WienerVerticalTap7Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
      WienerVerticalTap7Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
      StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
      StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
      x += 16;
    } while (x < width);
    dst += 2 * dst_stride;
    wiener_buffer += 2 * width;
  }

  if ((height & 1) != 0) {
    ptrdiff_t x = 0;
    do {
      __m128i a[7];
      const __m128i d0 =
          WienerVerticalTap7Kernel(wiener_buffer + x + 0, width, filter, a);
      const __m128i d1 =
          WienerVerticalTap7Kernel(wiener_buffer + x + 8, width, filter, a);
      StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
      x += 16;
    } while (x < width);
  }
}

inline void WienerVerticalTap5(const int16_t* wiener_buffer,
                               const ptrdiff_t width, const int height,
                               const int16_t coefficients[3], uint8_t* dst,
                               const ptrdiff_t dst_stride) {
  const __m128i c = Load4(coefficients);
  __m128i filter[2];
  filter[0] = _mm_shuffle_epi32(c, 0);
  filter[1] =
      _mm_set1_epi32((1 << 16) | static_cast<uint16_t>(coefficients[2]));
  for (int y = height >> 1; y > 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i d[2][2];
      WienerVerticalTap5Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
      WienerVerticalTap5Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
      StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
      StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
      x += 16;
    } while (x < width);
    dst += 2 * dst_stride;
    wiener_buffer += 2 * width;
  }

  if ((height & 1) != 0) {
    ptrdiff_t x = 0;
    do {
      __m128i a[5];
      const __m128i d0 =
          WienerVerticalTap5Kernel(wiener_buffer + x + 0, width, filter, a);
      const __m128i d1 =
          WienerVerticalTap5Kernel(wiener_buffer + x + 8, width, filter, a);
      StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
      x += 16;
    } while (x < width);
  }
}

inline void WienerVerticalTap3(const int16_t* wiener_buffer,
                               const ptrdiff_t width, const int height,
                               const int16_t coefficients[2], uint8_t* dst,
                               const ptrdiff_t dst_stride) {
  const __m128i filter =
      _mm_set1_epi32(*reinterpret_cast<const int32_t*>(coefficients));
  for (int y = height >> 1; y > 0; --y) {
    ptrdiff_t x = 0;
    do {
      __m128i d[2][2];
      WienerVerticalTap3Kernel2(wiener_buffer + x + 0, width, filter, d[0]);
      WienerVerticalTap3Kernel2(wiener_buffer + x + 8, width, filter, d[1]);
      StoreAligned16(dst + x, _mm_packus_epi16(d[0][0], d[1][0]));
      StoreAligned16(dst + dst_stride + x, _mm_packus_epi16(d[0][1], d[1][1]));
      x += 16;
    } while (x < width);
    dst += 2 * dst_stride;
    wiener_buffer += 2 * width;
  }

  if ((height & 1) != 0) {
    ptrdiff_t x = 0;
    do {
      __m128i a[3];
      const __m128i d0 =
          WienerVerticalTap3Kernel(wiener_buffer + x + 0, width, filter, a);
      const __m128i d1 =
          WienerVerticalTap3Kernel(wiener_buffer + x + 8, width, filter, a);
      StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
      x += 16;
    } while (x < width);
  }
}

inline void WienerVerticalTap1Kernel(const int16_t* const wiener_buffer,
                                     uint8_t* const dst) {
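  // The horizontal pass scaled its output up by 16 (cf. the << 4 in
  // WienerHorizontalTap1()); remove that gain here with a rounding >> 4 and
  // pack to 8 bits.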
  const __m128i a0 = LoadAligned16(wiener_buffer + 0);
  const __m128i a1 = LoadAligned16(wiener_buffer + 8);
  const __m128i b0 = _mm_add_epi16(a0, _mm_set1_epi16(8));
  const __m128i b1 = _mm_add_epi16(a1, _mm_set1_epi16(8));
  const __m128i c0 = _mm_srai_epi16(b0, 4);
  const __m128i c1 = _mm_srai_epi16(b1, 4);
  const __m128i d = _mm_packus_epi16(c0, c1);
  StoreAligned16(dst, d);
}

inline void WienerVerticalTap1(const int16_t* wiener_buffer,
                               const ptrdiff_t width, const int height,
                               uint8_t* dst, const ptrdiff_t dst_stride) {
  for (int y = height >> 1; y > 0; --y) {
    ptrdiff_t x = 0;
    do {
      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
      WienerVerticalTap1Kernel(wiener_buffer + width + x, dst + dst_stride + x);
      x += 16;
    } while (x < width);
    dst += 2 * dst_stride;
    wiener_buffer += 2 * width;
  }

  if ((height & 1) != 0) {
    ptrdiff_t x = 0;
    do {
      WienerVerticalTap1Kernel(wiener_buffer + x, dst + x);
      x += 16;
    } while (x < width);
  }
}

void WienerFilter_SSE4_1(
    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_border,
    const ptrdiff_t top_border_stride,
    const void* LIBGAV1_RESTRICT const bottom_border,
    const ptrdiff_t bottom_border_stride, const int width, const int height,
    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
    void* LIBGAV1_RESTRICT const dest) {
  const int16_t* const number_leading_zero_coefficients =
      restoration_info.wiener_info.number_leading_zero_coefficients;
  const int number_rows_to_skip = std::max(
      static_cast<int>(number_leading_zero_coefficients[WienerInfo::kVertical]),
      1);
  const ptrdiff_t wiener_stride = Align(width, 16);
  int16_t* const wiener_buffer_vertical = restoration_buffer->wiener_buffer;
  // The values are saturated to 13 bits before storing.
  int16_t* wiener_buffer_horizontal =
      wiener_buffer_vertical + number_rows_to_skip * wiener_stride;

  // horizontal filtering.
  // Over-reads up to 15 - |kRestorationHorizontalBorder| values.
  const int height_horizontal =
      height + kWienerFilterTaps - 1 - 2 * number_rows_to_skip;
  const int height_extra = (height_horizontal - height) >> 1;
  assert(height_extra <= 2);
  const auto* const src = static_cast<const uint8_t*>(source);
  const auto* const top = static_cast<const uint8_t*>(top_border);
  const auto* const bottom = static_cast<const uint8_t*>(bottom_border);
  const int16_t* const filter_horizontal =
      restoration_info.wiener_info.filter[WienerInfo::kHorizontal];
  const __m128i c = LoadLo8(filter_horizontal);
  // In order to keep the horizontal pass intermediate values within 16 bits we
  // offset |filter[3]| by 128. The 128 offset will be added back in the loop.
  const __m128i coefficients_horizontal =
      _mm_sub_epi16(c, _mm_setr_epi16(0, 0, 0, 128, 0, 0, 0, 0));
  if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 0) {
    WienerHorizontalTap7(top + (2 - height_extra) * top_border_stride - 3,
                         top_border_stride, wiener_stride, height_extra,
                         filter_horizontal[0], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap7(src - 3, stride, wiener_stride, height,
                         filter_horizontal[0], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap7(bottom - 3, bottom_border_stride, wiener_stride,
                         height_extra, filter_horizontal[0],
                         coefficients_horizontal, &wiener_buffer_horizontal);
  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 1) {
    WienerHorizontalTap5(top + (2 - height_extra) * top_border_stride - 2,
                         top_border_stride, wiener_stride, height_extra,
                         filter_horizontal[1], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap5(src - 2, stride, wiener_stride, height,
                         filter_horizontal[1], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap5(bottom - 2, bottom_border_stride, wiener_stride,
                         height_extra, filter_horizontal[1],
                         coefficients_horizontal, &wiener_buffer_horizontal);
  } else if (number_leading_zero_coefficients[WienerInfo::kHorizontal] == 2) {
    // The maximum over-reads happen here.
    WienerHorizontalTap3(top + (2 - height_extra) * top_border_stride - 1,
                         top_border_stride, wiener_stride, height_extra,
                         filter_horizontal[2], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap3(src - 1, stride, wiener_stride, height,
                         filter_horizontal[2], coefficients_horizontal,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap3(bottom - 1, bottom_border_stride, wiener_stride,
                         height_extra, filter_horizontal[2],
                         coefficients_horizontal, &wiener_buffer_horizontal);
  } else {
    assert(number_leading_zero_coefficients[WienerInfo::kHorizontal] == 3);
    WienerHorizontalTap1(top + (2 - height_extra) * top_border_stride,
                         top_border_stride, wiener_stride, height_extra,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap1(src, stride, wiener_stride, height,
                         &wiener_buffer_horizontal);
    WienerHorizontalTap1(bottom, bottom_border_stride, wiener_stride,
                         height_extra, &wiener_buffer_horizontal);
  }

  // vertical filtering.
  // Over-writes up to 15 values.
  const int16_t* const filter_vertical =
      restoration_info.wiener_info.filter[WienerInfo::kVertical];
  auto* dst = static_cast<uint8_t*>(dest);
  if (number_leading_zero_coefficients[WienerInfo::kVertical] == 0) {
    // Because the top row of |source| is a duplicate of the second row, and
    // the bottom row of |source| is a duplicate of its above row, we can
    // duplicate the top and bottom row of |wiener_buffer| accordingly.
    memcpy(wiener_buffer_horizontal, wiener_buffer_horizontal - wiener_stride,
           sizeof(*wiener_buffer_horizontal) * wiener_stride);
    memcpy(restoration_buffer->wiener_buffer,
           restoration_buffer->wiener_buffer + wiener_stride,
           sizeof(*restoration_buffer->wiener_buffer) * wiener_stride);
    WienerVerticalTap7(wiener_buffer_vertical, wiener_stride, height,
                       filter_vertical, dst, stride);
  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 1) {
    WienerVerticalTap5(wiener_buffer_vertical + wiener_stride, wiener_stride,
                       height, filter_vertical + 1, dst, stride);
  } else if (number_leading_zero_coefficients[WienerInfo::kVertical] == 2) {
    WienerVerticalTap3(wiener_buffer_vertical + 2 * wiener_stride,
                       wiener_stride, height, filter_vertical + 2, dst, stride);
  } else {
    assert(number_leading_zero_coefficients[WienerInfo::kVertical] == 3);
    WienerVerticalTap1(wiener_buffer_vertical + 3 * wiener_stride,
                       wiener_stride, height, dst, stride);
  }
}

//------------------------------------------------------------------------------
// SGR

// SIMD overreads 16 - (width % 16) - 2 * padding pixels, where padding is 3
// for Pass 1 and 2 for Pass 2.
constexpr int kOverreadInBytesPass1 = 10;
constexpr int kOverreadInBytesPass2 = 12;

inline void LoadAligned16x2U16(const uint16_t* const src[2], const ptrdiff_t x,
                               __m128i dst[2]) {
  dst[0] = LoadAligned16(src[0] + x);
  dst[1] = LoadAligned16(src[1] + x);
}

inline void LoadAligned16x2U16Msan(const uint16_t* const src[2],
                                   const ptrdiff_t x, const ptrdiff_t border,
                                   __m128i dst[2]) {
  dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
  dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
}

inline void LoadAligned16x3U16(const uint16_t* const src[3], const ptrdiff_t x,
                               __m128i dst[3]) {
  dst[0] = LoadAligned16(src[0] + x);
  dst[1] = LoadAligned16(src[1] + x);
  dst[2] = LoadAligned16(src[2] + x);
}

inline void LoadAligned16x3U16Msan(const uint16_t* const src[3],
                                   const ptrdiff_t x, const ptrdiff_t border,
                                   __m128i dst[3]) {
  dst[0] = LoadAligned16Msan(src[0] + x, sizeof(**src) * (x + 8 - border));
  dst[1] = LoadAligned16Msan(src[1] + x, sizeof(**src) * (x + 8 - border));
  dst[2] = LoadAligned16Msan(src[2] + x, sizeof(**src) * (x + 8 - border));
}

inline void LoadAligned32U32(const uint32_t* const src, __m128i dst[2]) {
  dst[0] = LoadAligned16(src + 0);
  dst[1] = LoadAligned16(src + 4);
}

inline void LoadAligned32U32Msan(const uint32_t* const src, const ptrdiff_t x,
                                 const ptrdiff_t border, __m128i dst[2]) {
  dst[0] = LoadAligned16Msan(src + x + 0, sizeof(*src) * (x + 4 - border));
  dst[1] = LoadAligned16Msan(src + x + 4, sizeof(*src) * (x + 8 - border));
}

inline void LoadAligned32x2U32(const uint32_t* const src[2], const ptrdiff_t x,
                               __m128i dst[2][2]) {
  LoadAligned32U32(src[0] + x, dst[0]);
  LoadAligned32U32(src[1] + x, dst[1]);
}

inline void LoadAligned32x2U32Msan(const uint32_t* const src[2],
                                   const ptrdiff_t x, const ptrdiff_t border,
                                   __m128i dst[2][2]) {
  LoadAligned32U32Msan(src[0], x, border, dst[0]);
  LoadAligned32U32Msan(src[1], x, border, dst[1]);
}

inline void LoadAligned32x3U32(const uint32_t* const src[3], const ptrdiff_t x,
                               __m128i dst[3][2]) {
  LoadAligned32U32(src[0] + x, dst[0]);
  LoadAligned32U32(src[1] + x, dst[1]);
  LoadAligned32U32(src[2] + x, dst[2]);
}

inline void LoadAligned32x3U32Msan(const uint32_t* const src[3],
                                   const ptrdiff_t x, const ptrdiff_t border,
                                   __m128i dst[3][2]) {
  LoadAligned32U32Msan(src[0], x, border, dst[0]);
  LoadAligned32U32Msan(src[1], x, border, dst[1]);
  LoadAligned32U32Msan(src[2], x, border, dst[2]);
}

inline void StoreAligned32U16(uint16_t* const dst, const __m128i src[2]) {
  StoreAligned16(dst + 0, src[0]);
  StoreAligned16(dst + 8, src[1]);
}

inline void StoreAligned32U32(uint32_t* const dst, const __m128i src[2]) {
  StoreAligned16(dst + 0, src[0]);
  StoreAligned16(dst + 4, src[1]);
}

inline void StoreAligned64U32(uint32_t* const dst, const __m128i src[4]) {
  StoreAligned32U32(dst + 0, src + 0);
  StoreAligned32U32(dst + 8, src + 2);
}

// Don't use _mm_cvtepu8_epi16() or _mm_cvtepu16_epi32() in the following
// functions. Some compilers may generate super inefficient code and the whole
// decoder could be 15% slower.

inline __m128i VaddlLo8(const __m128i src0, const __m128i src1) {
  const __m128i s0 = _mm_unpacklo_epi8(src0, _mm_setzero_si128());
  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
  return _mm_add_epi16(s0, s1);
}

inline __m128i VaddlHi8(const __m128i src0, const __m128i src1) {
  const __m128i s0 = _mm_unpackhi_epi8(src0, _mm_setzero_si128());
  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
  return _mm_add_epi16(s0, s1);
}

inline __m128i VaddlLo16(const __m128i src0, const __m128i src1) {
  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
  return _mm_add_epi32(s0, s1);
}

inline __m128i VaddlHi16(const __m128i src0, const __m128i src1) {
  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
  return _mm_add_epi32(s0, s1);
}

inline __m128i VaddwLo8(const __m128i src0, const __m128i src1) {
  const __m128i s1 = _mm_unpacklo_epi8(src1, _mm_setzero_si128());
  return _mm_add_epi16(src0, s1);
}

inline __m128i VaddwHi8(const __m128i src0, const __m128i src1) {
  const __m128i s1 = _mm_unpackhi_epi8(src1, _mm_setzero_si128());
  return _mm_add_epi16(src0, s1);
}

inline __m128i VaddwLo16(const __m128i src0, const __m128i src1) {
  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
  return _mm_add_epi32(src0, s1);
}

inline __m128i VaddwHi16(const __m128i src0, const __m128i src1) {
  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
  return _mm_add_epi32(src0, s1);
}

inline __m128i VmullNLo8(const __m128i src0, const int src1) {
  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
  return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
}

inline __m128i VmullNHi8(const __m128i src0, const int src1) {
  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
  return _mm_madd_epi16(s0, _mm_set1_epi32(src1));
}

inline __m128i VmullLo16(const __m128i src0, const __m128i src1) {
  const __m128i s0 = _mm_unpacklo_epi16(src0, _mm_setzero_si128());
  const __m128i s1 = _mm_unpacklo_epi16(src1, _mm_setzero_si128());
  return _mm_madd_epi16(s0, s1);
}

inline __m128i VmullHi16(const __m128i src0, const __m128i src1) {
  const __m128i s0 = _mm_unpackhi_epi16(src0, _mm_setzero_si128());
  const __m128i s1 = _mm_unpackhi_epi16(src1, _mm_setzero_si128());
  return _mm_madd_epi16(s0, s1);
}

inline __m128i VrshrS32(const __m128i src0, const int src1) {
  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
  return _mm_srai_epi32(sum, src1);
}

inline __m128i VrshrU32(const __m128i src0, const int src1) {
  const __m128i sum = _mm_add_epi32(src0, _mm_set1_epi32(1 << (src1 - 1)));
  return _mm_srli_epi32(sum, src1);
}

inline __m128i SquareLo8(const __m128i src) {
  const __m128i s = _mm_unpacklo_epi8(src, _mm_setzero_si128());
  return _mm_mullo_epi16(s, s);
}

inline __m128i SquareHi8(const __m128i src) {
  const __m128i s = _mm_unpackhi_epi8(src, _mm_setzero_si128());
  return _mm_mullo_epi16(s, s);
}

inline void Prepare3Lo8(const __m128i src, __m128i dst[3]) {
  dst[0] = src;
  dst[1] = _mm_srli_si128(src, 1);
  dst[2] = _mm_srli_si128(src, 2);
}

template <int offset>
inline void Prepare3_8(const __m128i src[2], __m128i dst[3]) {
  dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
  dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
  dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
}

inline void Prepare3_16(const __m128i src[2], __m128i dst[3]) {
  dst[0] = src[0];
  dst[1] = _mm_alignr_epi8(src[1], src[0], 2);
  dst[2] = _mm_alignr_epi8(src[1], src[0], 4);
}

inline void Prepare5Lo8(const __m128i src, __m128i dst[5]) {
  dst[0] = src;
  dst[1] = _mm_srli_si128(src, 1);
  dst[2] = _mm_srli_si128(src, 2);
  dst[3] = _mm_srli_si128(src, 3);
  dst[4] = _mm_srli_si128(src, 4);
}

template <int offset>
inline void Prepare5_8(const __m128i src[2], __m128i dst[5]) {
  dst[0] = _mm_alignr_epi8(src[1], src[0], offset + 0);
  dst[1] = _mm_alignr_epi8(src[1], src[0], offset + 1);
  dst[2] = _mm_alignr_epi8(src[1], src[0], offset + 2);
  dst[3] = _mm_alignr_epi8(src[1], src[0], offset + 3);
  dst[4] = _mm_alignr_epi8(src[1], src[0], offset + 4);
}

inline void Prepare5_16(const __m128i src[2], __m128i dst[5]) {
  Prepare3_16(src, dst);
  dst[3] = _mm_alignr_epi8(src[1], src[0], 6);
  dst[4] = _mm_alignr_epi8(src[1], src[0], 8);
}

inline __m128i Sum3_16(const __m128i src0, const __m128i src1,
                       const __m128i src2) {
  const __m128i sum = _mm_add_epi16(src0, src1);
  return _mm_add_epi16(sum, src2);
}

inline __m128i Sum3_16(const __m128i src[3]) {
  return Sum3_16(src[0], src[1], src[2]);
}

inline __m128i Sum3_32(const __m128i src0, const __m128i src1,
                       const __m128i src2) {
  const __m128i sum = _mm_add_epi32(src0, src1);
  return _mm_add_epi32(sum, src2);
}

inline void Sum3_32(const __m128i src[3][2], __m128i dst[2]) {
  dst[0] = Sum3_32(src[0][0], src[1][0], src[2][0]);
  dst[1] = Sum3_32(src[0][1], src[1][1], src[2][1]);
}

inline __m128i Sum3WLo16(const __m128i src[3]) {
  const __m128i sum = VaddlLo8(src[0], src[1]);
  return VaddwLo8(sum, src[2]);
}

inline __m128i Sum3WHi16(const __m128i src[3]) {
  const __m128i sum = VaddlHi8(src[0], src[1]);
  return VaddwHi8(sum, src[2]);
}

inline __m128i Sum3WLo32(const __m128i src[3]) {
  const __m128i sum = VaddlLo16(src[0], src[1]);
  return VaddwLo16(sum, src[2]);
}

inline __m128i Sum3WHi32(const __m128i src[3]) {
  const __m128i sum = VaddlHi16(src[0], src[1]);
  return VaddwHi16(sum, src[2]);
}

inline __m128i Sum5_16(const __m128i src[5]) {
  const __m128i sum01 = _mm_add_epi16(src[0], src[1]);
  const __m128i sum23 = _mm_add_epi16(src[2], src[3]);
  const __m128i sum = _mm_add_epi16(sum01, sum23);
  return _mm_add_epi16(sum, src[4]);
}

inline __m128i Sum5_32(const __m128i* const src0, const __m128i* const src1,
                       const __m128i* const src2, const __m128i* const src3,
                       const __m128i* const src4) {
  const __m128i sum01 = _mm_add_epi32(*src0, *src1);
  const __m128i sum23 = _mm_add_epi32(*src2, *src3);
  const __m128i sum = _mm_add_epi32(sum01, sum23);
  return _mm_add_epi32(sum, *src4);
}

inline void Sum5_32(const __m128i src[5][2], __m128i dst[2]) {
  dst[0] = Sum5_32(&src[0][0], &src[1][0], &src[2][0], &src[3][0], &src[4][0]);
  dst[1] = Sum5_32(&src[0][1], &src[1][1], &src[2][1], &src[3][1], &src[4][1]);
}

inline __m128i Sum5WLo16(const __m128i src[5]) {
  const __m128i sum01 = VaddlLo8(src[0], src[1]);
  const __m128i sum23 = VaddlLo8(src[2], src[3]);
  const __m128i sum = _mm_add_epi16(sum01, sum23);
  return VaddwLo8(sum, src[4]);
}

inline __m128i Sum5WHi16(const __m128i src[5]) {
  const __m128i sum01 = VaddlHi8(src[0], src[1]);
  const __m128i sum23 = VaddlHi8(src[2], src[3]);
  const __m128i sum = _mm_add_epi16(sum01, sum23);
  return VaddwHi8(sum, src[4]);
}

inline __m128i Sum3Horizontal(const __m128i src) {
  __m128i s[3];
  Prepare3Lo8(src, s);
  return Sum3WLo16(s);
}

template <int offset>
inline void Sum3Horizontal(const __m128i src[2], __m128i dst[2]) {
  __m128i s[3];
  Prepare3_8<offset>(src, s);
  dst[0] = Sum3WLo16(s);
  dst[1] = Sum3WHi16(s);
}

inline void Sum3WHorizontal(const __m128i src[2], __m128i dst[2]) {
  __m128i s[3];
  Prepare3_16(src, s);
  dst[0] = Sum3WLo32(s);
  dst[1] = Sum3WHi32(s);
}

inline __m128i Sum5Horizontal(const __m128i src) {
  __m128i s[5];
  Prepare5Lo8(src, s);
  return Sum5WLo16(s);
}

template <int offset>
inline void Sum5Horizontal(const __m128i src[2], __m128i* const dst0,
                           __m128i* const dst1) {
  __m128i s[5];
  Prepare5_8<offset>(src, s);
  *dst0 = Sum5WLo16(s);
  *dst1 = Sum5WHi16(s);
}

inline void Sum5WHorizontal(const __m128i src[2], __m128i dst[2]) {
  __m128i s[5];
  Prepare5_16(src, s);
  const __m128i sum01_lo = VaddlLo16(s[0], s[1]);
  const __m128i sum23_lo = VaddlLo16(s[2], s[3]);
  const __m128i sum0123_lo = _mm_add_epi32(sum01_lo, sum23_lo);
  dst[0] = VaddwLo16(sum0123_lo, s[4]);
  const __m128i sum01_hi = VaddlHi16(s[0], s[1]);
  const __m128i sum23_hi = VaddlHi16(s[2], s[3]);
  const __m128i sum0123_hi = _mm_add_epi32(sum01_hi, sum23_hi);
  dst[1] = VaddwHi16(sum0123_hi, s[4]);
}

void SumHorizontalLo(const __m128i src[5], __m128i* const row_sq3,
                     __m128i* const row_sq5) {
  const __m128i sum04 = VaddlLo16(src[0], src[4]);
  *row_sq3 = Sum3WLo32(src + 1);
  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
}

void SumHorizontalHi(const __m128i src[5], __m128i* const row_sq3,
                     __m128i* const row_sq5) {
  const __m128i sum04 = VaddlHi16(src[0], src[4]);
  *row_sq3 = Sum3WHi32(src + 1);
  *row_sq5 = _mm_add_epi32(sum04, *row_sq3);
}

void SumHorizontalLo(const __m128i src, __m128i* const row3,
                     __m128i* const row5) {
  __m128i s[5];
  Prepare5Lo8(src, s);
  const __m128i sum04 = VaddlLo8(s[0], s[4]);
  *row3 = Sum3WLo16(s + 1);
  *row5 = _mm_add_epi16(sum04, *row3);
}

template <int offset>
void SumHorizontal(const __m128i src[2], __m128i* const row3_0,
                   __m128i* const row3_1, __m128i* const row5_0,
                   __m128i* const row5_1) {
  __m128i s[5];
  Prepare5_8<offset>(src, s);
  const __m128i sum04_lo = VaddlLo8(s[0], s[4]);
  const __m128i sum04_hi = VaddlHi8(s[0], s[4]);
  *row3_0 = Sum3WLo16(s + 1);
  *row3_1 = Sum3WHi16(s + 1);
  *row5_0 = _mm_add_epi16(sum04_lo, *row3_0);
  *row5_1 = _mm_add_epi16(sum04_hi, *row3_1);
}

inline void SumHorizontal(const __m128i src[2], __m128i* const row_sq3_0,
                          __m128i* const row_sq3_1, __m128i* const row_sq5_0,
                          __m128i* const row_sq5_1) {
  __m128i s[5];
  Prepare5_16(src, s);
  SumHorizontalLo(s, row_sq3_0, row_sq5_0);
  SumHorizontalHi(s, row_sq3_1, row_sq5_1);
}

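// Sum343*() and Sum565*() compute the weighted sums 3a + 4b + 3c and
// 5a + 6b + 5c of three adjacent inputs, expressed as 3 * (a + b + c) + b and
// 5 * (a + b + c) + b respectively.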
inline __m128i Sum343Lo(const __m128i ma3[3]) {
  const __m128i sum = Sum3WLo16(ma3);
  const __m128i sum3 = Sum3_16(sum, sum, sum);
  return VaddwLo8(sum3, ma3[1]);
}

inline __m128i Sum343Hi(const __m128i ma3[3]) {
  const __m128i sum = Sum3WHi16(ma3);
  const __m128i sum3 = Sum3_16(sum, sum, sum);
  return VaddwHi8(sum3, ma3[1]);
}

inline __m128i Sum343WLo(const __m128i src[3]) {
  const __m128i sum = Sum3WLo32(src);
  const __m128i sum3 = Sum3_32(sum, sum, sum);
  return VaddwLo16(sum3, src[1]);
}

inline __m128i Sum343WHi(const __m128i src[3]) {
  const __m128i sum = Sum3WHi32(src);
  const __m128i sum3 = Sum3_32(sum, sum, sum);
  return VaddwHi16(sum3, src[1]);
}

inline void Sum343W(const __m128i src[2], __m128i dst[2]) {
  __m128i s[3];
  Prepare3_16(src, s);
  dst[0] = Sum343WLo(s);
  dst[1] = Sum343WHi(s);
}

inline __m128i Sum565Lo(const __m128i src[3]) {
  const __m128i sum = Sum3WLo16(src);
  const __m128i sum4 = _mm_slli_epi16(sum, 2);
  const __m128i sum5 = _mm_add_epi16(sum4, sum);
  return VaddwLo8(sum5, src[1]);
}

inline __m128i Sum565Hi(const __m128i src[3]) {
  const __m128i sum = Sum3WHi16(src);
  const __m128i sum4 = _mm_slli_epi16(sum, 2);
  const __m128i sum5 = _mm_add_epi16(sum4, sum);
  return VaddwHi8(sum5, src[1]);
}

inline __m128i Sum565WLo(const __m128i src[3]) {
  const __m128i sum = Sum3WLo32(src);
  const __m128i sum4 = _mm_slli_epi32(sum, 2);
  const __m128i sum5 = _mm_add_epi32(sum4, sum);
  return VaddwLo16(sum5, src[1]);
}

inline __m128i Sum565WHi(const __m128i src[3]) {
  const __m128i sum = Sum3WHi32(src);
  const __m128i sum4 = _mm_slli_epi32(sum, 2);
  const __m128i sum5 = _mm_add_epi32(sum4, sum);
  return VaddwHi16(sum5, src[1]);
}

inline void Sum565W(const __m128i src[2], __m128i dst[2]) {
  __m128i s[3];
  Prepare3_16(src, s);
  dst[0] = Sum565WLo(s);
  dst[1] = Sum565WHi(s);
}

inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
                   const ptrdiff_t width, const ptrdiff_t sum_stride,
                   const ptrdiff_t sum_width, uint16_t* sum3, uint16_t* sum5,
                   uint32_t* square_sum3, uint32_t* square_sum5) {
  int y = 2;
  do {
    __m128i s[2], sq[3];
    s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass1 - width);
    sq[0] = SquareLo8(s[0]);
    ptrdiff_t x = sum_width;
    do {
      __m128i row3[2], row5[2], row_sq3[2], row_sq5[2];
      x -= 16;
      src += 16;
      s[1] = LoadUnaligned16Msan(src,
                                 sum_width - x + kOverreadInBytesPass1 - width);
      sq[1] = SquareHi8(s[0]);
      sq[2] = SquareLo8(s[1]);
      SumHorizontal<0>(s, &row3[0], &row3[1], &row5[0], &row5[1]);
      StoreAligned32U16(sum3, row3);
      StoreAligned32U16(sum5, row5);
      SumHorizontal(sq + 0, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
      StoreAligned32U32(square_sum3 + 0, row_sq3);
      StoreAligned32U32(square_sum5 + 0, row_sq5);
      SumHorizontal(sq + 1, &row_sq3[0], &row_sq3[1], &row_sq5[0], &row_sq5[1]);
      StoreAligned32U32(square_sum3 + 8, row_sq3);
      StoreAligned32U32(square_sum5 + 8, row_sq5);
      s[0] = s[1];
      sq[0] = sq[2];
      sum3 += 16;
      sum5 += 16;
      square_sum3 += 16;
      square_sum5 += 16;
    } while (x != 0);
    src += src_stride - sum_width;
    sum3 += sum_stride - sum_width;
    sum5 += sum_stride - sum_width;
    square_sum3 += sum_stride - sum_width;
    square_sum5 += sum_stride - sum_width;
  } while (--y != 0);
}

template <int size>
inline void BoxSum(const uint8_t* src, const ptrdiff_t src_stride,
                   const ptrdiff_t width, const ptrdiff_t sum_stride,
                   const ptrdiff_t sum_width, uint16_t* sums,
                   uint32_t* square_sums) {
  static_assert(size == 3 || size == 5, "");
  constexpr int kOverreadInBytes =
      (size == 5) ? kOverreadInBytesPass1 : kOverreadInBytesPass2;
  int y = 2;
  do {
    __m128i s[2], sq[3];
    s[0] = LoadUnaligned16Msan(src, kOverreadInBytes - width);
    sq[0] = SquareLo8(s[0]);
    ptrdiff_t x = sum_width;
    do {
      __m128i row[2], row_sq[4];
      x -= 16;
      src += 16;
      s[1] = LoadUnaligned16Msan(src, sum_width - x + kOverreadInBytes - width);
      sq[1] = SquareHi8(s[0]);
      sq[2] = SquareLo8(s[1]);
      if (size == 3) {
        Sum3Horizontal<0>(s, row);
        Sum3WHorizontal(sq + 0, row_sq + 0);
        Sum3WHorizontal(sq + 1, row_sq + 2);
      } else {
        Sum5Horizontal<0>(s, &row[0], &row[1]);
        Sum5WHorizontal(sq + 0, row_sq + 0);
        Sum5WHorizontal(sq + 1, row_sq + 2);
      }
      StoreAligned32U16(sums, row);
      StoreAligned64U32(square_sums, row_sq);
      s[0] = s[1];
      sq[0] = sq[2];
      sums += 16;
      square_sums += 16;
    } while (x != 0);
    src += src_stride - sum_width;
    sums += sum_stride - sum_width;
    square_sums += sum_stride - sum_width;
  } while (--y != 0);
}

template <int n>
inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq,
                           const uint32_t scale) {
  static_assert(n == 9 || n == 25, "");
  // a = |sum_sq|
  // d = |sum|
  // p = (a * n < d * d) ? 0 : a * n - d * d;
  const __m128i dxd = _mm_madd_epi16(sum, sum);
  // _mm_mullo_epi32() has high latency. Using shifts and additions instead.
  // Some compilers could do this for us but we make this explicit.
  // return _mm_mullo_epi32(sum_sq, _mm_set1_epi32(n));
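  // n * |sum_sq| is built from shifts: 9x = x + 8x, and 25x = 9x + 16x.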
  __m128i axn = _mm_add_epi32(sum_sq, _mm_slli_epi32(sum_sq, 3));
  if (n == 25) axn = _mm_add_epi32(axn, _mm_slli_epi32(sum_sq, 4));
  const __m128i sub = _mm_sub_epi32(axn, dxd);
  const __m128i p = _mm_max_epi32(sub, _mm_setzero_si128());
  const __m128i pxs = _mm_mullo_epi32(p, _mm_set1_epi32(scale));
  return VrshrU32(pxs, kSgrProjScaleBits);
}

template <int n>
inline __m128i CalculateMa(const __m128i sum, const __m128i sum_sq[2],
                           const uint32_t scale) {
  static_assert(n == 9 || n == 25, "");
  const __m128i sum_lo = _mm_unpacklo_epi16(sum, _mm_setzero_si128());
  const __m128i sum_hi = _mm_unpackhi_epi16(sum, _mm_setzero_si128());
  const __m128i z0 = CalculateMa<n>(sum_lo, sum_sq[0], scale);
  const __m128i z1 = CalculateMa<n>(sum_hi, sum_sq[1], scale);
  return _mm_packus_epi32(z0, z1);
}

inline __m128i CalculateB5(const __m128i sum, const __m128i ma) {
  // one_over_n == 164.
  constexpr uint32_t one_over_n =
      ((1 << kSgrProjReciprocalBits) + (25 >> 1)) / 25;
  // one_over_n_quarter == 41.
  constexpr uint32_t one_over_n_quarter = one_over_n >> 2;
  static_assert(one_over_n == one_over_n_quarter << 2, "");
  // |ma| is in range [0, 255].
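  // Multiply by one_over_n / 4 == 41, which fits in the signed bytes required
  // by _mm_maddubs_epi16(), then shift right by two fewer bits below to
  // compensate for the missing factor of 4.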
1176 const __m128i m = _mm_maddubs_epi16(ma, _mm_set1_epi16(one_over_n_quarter));
1177 const __m128i m0 = VmullLo16(m, sum);
1178 const __m128i m1 = VmullHi16(m, sum);
1179 const __m128i b_lo = VrshrU32(m0, kSgrProjReciprocalBits - 2);
1180 const __m128i b_hi = VrshrU32(m1, kSgrProjReciprocalBits - 2);
1181 return _mm_packus_epi32(b_lo, b_hi);
1182 }
1183
CalculateB3(const __m128i sum,const __m128i ma)1184 inline __m128i CalculateB3(const __m128i sum, const __m128i ma) {
1185 // one_over_n == 455.
1186 constexpr uint32_t one_over_n =
1187 ((1 << kSgrProjReciprocalBits) + (9 >> 1)) / 9;
1188 const __m128i m0 = VmullLo16(ma, sum);
1189 const __m128i m1 = VmullHi16(ma, sum);
1190 const __m128i m2 = _mm_mullo_epi32(m0, _mm_set1_epi32(one_over_n));
1191 const __m128i m3 = _mm_mullo_epi32(m1, _mm_set1_epi32(one_over_n));
1192 const __m128i b_lo = VrshrU32(m2, kSgrProjReciprocalBits);
1193 const __m128i b_hi = VrshrU32(m3, kSgrProjReciprocalBits);
1194 return _mm_packus_epi32(b_lo, b_hi);
1195 }
1196
CalculateSumAndIndex5(const __m128i s5[5],const __m128i sq5[5][2],const uint32_t scale,__m128i * const sum,__m128i * const index)1197 inline void CalculateSumAndIndex5(const __m128i s5[5], const __m128i sq5[5][2],
1198 const uint32_t scale, __m128i* const sum,
1199 __m128i* const index) {
1200 __m128i sum_sq[2];
1201 *sum = Sum5_16(s5);
1202 Sum5_32(sq5, sum_sq);
1203 *index = CalculateMa<25>(*sum, sum_sq, scale);
1204 }
1205
CalculateSumAndIndex3(const __m128i s3[3],const __m128i sq3[3][2],const uint32_t scale,__m128i * const sum,__m128i * const index)1206 inline void CalculateSumAndIndex3(const __m128i s3[3], const __m128i sq3[3][2],
1207 const uint32_t scale, __m128i* const sum,
1208 __m128i* const index) {
1209 __m128i sum_sq[2];
1210 *sum = Sum3_16(s3);
1211 Sum3_32(sq3, sum_sq);
1212 *index = CalculateMa<9>(*sum, sum_sq, scale);
1213 }
1214
1215 template <int n, int offset>
LookupIntermediate(const __m128i sum,const __m128i index,__m128i * const ma,__m128i * const b)1216 inline void LookupIntermediate(const __m128i sum, const __m128i index,
1217 __m128i* const ma, __m128i* const b) {
1218 static_assert(n == 9 || n == 25, "");
1219 static_assert(offset == 0 || offset == 8, "");
1220 const __m128i idx = _mm_packus_epi16(index, index);
1221 // Actually it's not stored and loaded. The compiler will use a 64-bit
1222 // general-purpose register to process. Faster than using _mm_extract_epi8().
1223 uint8_t temp[8];
1224 StoreLo8(temp, idx);
1225 // offset == 0 is assumed to be the first call to this function. The value is
1226 // mov'd to avoid -Wuninitialized warnings under gcc. mov should at least
1227 // equivalent if not faster than pinsrb.
1228 if (offset == 0) {
1229 *ma = _mm_cvtsi32_si128(kSgrMaLookup[temp[0]]);
1230 } else {
1231 *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[0]], offset + 0);
1232 }
1233 *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[1]], offset + 1);
1234 *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[2]], offset + 2);
1235 *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[3]], offset + 3);
1236 *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[4]], offset + 4);
1237 *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[5]], offset + 5);
1238 *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[6]], offset + 6);
1239 *ma = _mm_insert_epi8(*ma, kSgrMaLookup[temp[7]], offset + 7);
1240 // b = ma * b * one_over_n
1241 // |ma| = [0, 255]
1242 // |sum| is a box sum with radius 1 or 2.
1243 // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
1244 // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
1245 // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
1246 // When radius is 2 |n| is 25. |one_over_n| is 164.
1247 // When radius is 1 |n| is 9. |one_over_n| is 455.
1248 // |kSgrProjReciprocalBits| is 12.
1249 // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
1250 // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
1251 __m128i maq;
1252 if (offset == 0) {
1253 maq = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
1254 } else {
1255 maq = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
1256 }
1257 *b = (n == 9) ? CalculateB3(sum, maq) : CalculateB5(sum, maq);
1258 }
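
// For reference, a scalar sketch of what LookupIntermediate() computes per
// lane (illustrative only; the SIMD code above is the actual implementation):
//   ma[i] = kSgrMaLookup[index[i]];
//   b[i] = RightShiftWithRounding(ma[i] * sum[i] * one_over_n,
//                                 kSgrProjReciprocalBits);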
1259
1260 // Set the shuffle control mask of indices out of range [0, 15] to (1xxxxxxx)b
1261 // to get value 0 as the shuffle result. The most significant bit 1 comes
1262 // either from the comparison instruction, or from the sign bit of the index.
ShuffleIndex(const __m128i table,const __m128i index)1263 inline __m128i ShuffleIndex(const __m128i table, const __m128i index) {
1264 __m128i mask;
1265 mask = _mm_cmpgt_epi8(index, _mm_set1_epi8(15));
1266 mask = _mm_or_si128(mask, index);
1267 return _mm_shuffle_epi8(table, mask);
1268 }
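
// Example of how CalculateIntermediate() below uses ShuffleIndex(): if a lane
// of the index holds 20, the first lookup (against c0) sees 20 > 15, so the
// comparison sets the most significant bit and the shuffle writes 0. After
// subtracting 16 the lane holds 4, and the second lookup (against c1) returns
// kSgrMaLookup[20]. Once a lane has gone negative its sign bit stays set, so
// later lookups keep producing 0.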
1269
AdjustValue(const __m128i value,const __m128i index,const int threshold)1270 inline __m128i AdjustValue(const __m128i value, const __m128i index,
1271 const int threshold) {
1272 const __m128i thresholds = _mm_set1_epi8(threshold - 128);
1273 const __m128i offset = _mm_cmpgt_epi8(index, thresholds);
1274 return _mm_add_epi8(value, offset);
1275 }
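
// AdjustValue() relies on _mm_cmpgt_epi8() returning -1 (0xff) in each lane
// where |index| exceeds the threshold, so the add decrements exactly those
// lanes by 1. CalculateIntermediate() below starts out-of-table lanes at 5
// and chains five AdjustValue() calls with increasing thresholds, stepping
// the value down towards 0 as the index grows; in-table lanes (index <= 47)
// are below every threshold and are left unchanged.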
1276
CalculateIntermediate(const __m128i sum[2],const __m128i index[2],__m128i * const ma,__m128i * const b0,__m128i * const b1)1277 inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
1278 __m128i* const ma, __m128i* const b0,
1279 __m128i* const b1) {
1280 // Use table lookup to read elements whose indices are less than 48.
1281 const __m128i c0 = LoadAligned16(kSgrMaLookup + 0 * 16);
1282 const __m128i c1 = LoadAligned16(kSgrMaLookup + 1 * 16);
1283 const __m128i c2 = LoadAligned16(kSgrMaLookup + 2 * 16);
1284 const __m128i indices = _mm_packus_epi16(index[0], index[1]);
1285 __m128i idx;
1286 // Clip idx to 127 to apply signed comparison instructions.
1287 idx = _mm_min_epu8(indices, _mm_set1_epi8(127));
1288 // All elements whose indices are larger than 47 are set to 0.
1289 // Get shuffle results for indices in range [0, 15].
1290 *ma = ShuffleIndex(c0, idx);
1291 // Get shuffle results for indices in range [16, 31].
1292 // Subtract 16 to utilize the sign bit of the index.
1293 idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
1294 const __m128i res1 = ShuffleIndex(c1, idx);
1295 // Use OR instruction to combine shuffle results together.
1296 *ma = _mm_or_si128(*ma, res1);
1297 // Get shuffle results for indices in range [32, 47].
1298 // Subtract 16 to utilize the sign bit of the index.
1299 idx = _mm_sub_epi8(idx, _mm_set1_epi8(16));
1300 const __m128i res2 = ShuffleIndex(c2, idx);
1301 *ma = _mm_or_si128(*ma, res2);
1302
1303 // For elements whose indices are larger than 47, the table values change only
1304 // rarely as the index increases, so comparison and arithmetic operations are
1305 // used to calculate their values.
1306 // Add -128 to apply signed comparison instructions.
1307 idx = _mm_add_epi8(indices, _mm_set1_epi8(-128));
1308 // Elements whose indices are larger than 47 (with value 0) are set to 5.
1309 *ma = _mm_max_epu8(*ma, _mm_set1_epi8(5));
1310 *ma = AdjustValue(*ma, idx, 55); // 55 is the last index whose value is 5.
1311 *ma = AdjustValue(*ma, idx, 72); // 72 is the last index whose value is 4.
1312 *ma = AdjustValue(*ma, idx, 101); // 101 is the last index whose value is 3.
1313 *ma = AdjustValue(*ma, idx, 169); // 169 is the last index whose value is 2.
1314 *ma = AdjustValue(*ma, idx, 254); // 254 is the last index whose value is 1.
1315
1316 // b = ma * sum * one_over_n
1317 // |ma| = [0, 255]
1318 // |sum| is a box sum with radius 1 or 2.
1319 // For the first pass radius is 2. Maximum value is 5x5x255 = 6375.
1320 // For the second pass radius is 1. Maximum value is 3x3x255 = 2295.
1321 // |one_over_n| = ((1 << kSgrProjReciprocalBits) + (n >> 1)) / n
1322 // When radius is 2 |n| is 25. |one_over_n| is 164.
1323 // When radius is 1 |n| is 9. |one_over_n| is 455.
1324 // |kSgrProjReciprocalBits| is 12.
1325 // Radius 2: 255 * 6375 * 164 >> 12 = 65088 (16 bits).
1326 // Radius 1: 255 * 2295 * 455 >> 12 = 65009 (16 bits).
1327 const __m128i maq0 = _mm_unpacklo_epi8(*ma, _mm_setzero_si128());
1328 *b0 = CalculateB3(sum[0], maq0);
1329 const __m128i maq1 = _mm_unpackhi_epi8(*ma, _mm_setzero_si128());
1330 *b1 = CalculateB3(sum[1], maq1);
1331 }
1332
CalculateIntermediate(const __m128i sum[2],const __m128i index[2],__m128i ma[2],__m128i b[2])1333 inline void CalculateIntermediate(const __m128i sum[2], const __m128i index[2],
1334 __m128i ma[2], __m128i b[2]) {
1335 __m128i mas;
1336 CalculateIntermediate(sum, index, &mas, &b[0], &b[1]);
1337 ma[0] = _mm_unpacklo_epi64(ma[0], mas);
1338 ma[1] = _mm_srli_si128(mas, 8);
1339 }
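
// The two halves of |mas| belong to adjacent 8-pixel groups: the low half is
// appended to the previous iteration's |ma[0]| so that Prepare3_8() can read
// a contiguous window, and the high half seeds |ma[1]| for the next
// iteration.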
1340
1341 // Note: Calling CalculateIntermediate() instead of the slow
1342 // LookupIntermediate() when calculating 16 intermediate data points has been
1343 // tried. However, the compiler generates even slower code.
1344 template <int offset>
CalculateIntermediate5(const __m128i s5[5],const __m128i sq5[5][2],const uint32_t scale,__m128i * const ma,__m128i * const b)1345 inline void CalculateIntermediate5(const __m128i s5[5], const __m128i sq5[5][2],
1346 const uint32_t scale, __m128i* const ma,
1347 __m128i* const b) {
1348 static_assert(offset == 0 || offset == 8, "");
1349 __m128i sum, index;
1350 CalculateSumAndIndex5(s5, sq5, scale, &sum, &index);
1351 LookupIntermediate<25, offset>(sum, index, ma, b);
1352 }
1353
CalculateIntermediate3(const __m128i s3[3],const __m128i sq3[3][2],const uint32_t scale,__m128i * const ma,__m128i * const b)1354 inline void CalculateIntermediate3(const __m128i s3[3], const __m128i sq3[3][2],
1355 const uint32_t scale, __m128i* const ma,
1356 __m128i* const b) {
1357 __m128i sum, index;
1358 CalculateSumAndIndex3(s3, sq3, scale, &sum, &index);
1359 LookupIntermediate<9, 0>(sum, index, ma, b);
1360 }
1361
Store343_444(const __m128i b3[2],const ptrdiff_t x,__m128i sum_b343[2],__m128i sum_b444[2],uint32_t * const b343,uint32_t * const b444)1362 inline void Store343_444(const __m128i b3[2], const ptrdiff_t x,
1363 __m128i sum_b343[2], __m128i sum_b444[2],
1364 uint32_t* const b343, uint32_t* const b444) {
1365 __m128i b[3], sum_b111[2];
1366 Prepare3_16(b3, b);
1367 sum_b111[0] = Sum3WLo32(b);
1368 sum_b111[1] = Sum3WHi32(b);
1369 sum_b444[0] = _mm_slli_epi32(sum_b111[0], 2);
1370 sum_b444[1] = _mm_slli_epi32(sum_b111[1], 2);
1371 StoreAligned32U32(b444 + x, sum_b444);
1372 sum_b343[0] = _mm_sub_epi32(sum_b444[0], sum_b111[0]);
1373 sum_b343[1] = _mm_sub_epi32(sum_b444[1], sum_b111[1]);
1374 sum_b343[0] = VaddwLo16(sum_b343[0], b[1]);
1375 sum_b343[1] = VaddwHi16(sum_b343[1], b[1]);
1376 StoreAligned32U32(b343 + x, sum_b343);
1377 }
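
// Store343_444() turns three adjacent horizontal values b[0..2] into the two
// weighted sums used by the second pass: sum_b444 = 4 * (b[0] + b[1] + b[2])
// and sum_b343 = 3 * b[0] + 4 * b[1] + 3 * b[2], the latter computed above as
// the 444 sum minus the plain 111 sum plus an extra copy of the middle value.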
1378
Store343_444Lo(const __m128i ma3[3],const __m128i b3[2],const ptrdiff_t x,__m128i * const sum_ma343,__m128i * const sum_ma444,__m128i sum_b343[2],__m128i sum_b444[2],uint16_t * const ma343,uint16_t * const ma444,uint32_t * const b343,uint32_t * const b444)1379 inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
1380 const ptrdiff_t x, __m128i* const sum_ma343,
1381 __m128i* const sum_ma444, __m128i sum_b343[2],
1382 __m128i sum_b444[2], uint16_t* const ma343,
1383 uint16_t* const ma444, uint32_t* const b343,
1384 uint32_t* const b444) {
1385 const __m128i sum_ma111 = Sum3WLo16(ma3);
1386 *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
1387 StoreAligned16(ma444 + x, *sum_ma444);
1388 const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
1389 *sum_ma343 = VaddwLo8(sum333, ma3[1]);
1390 StoreAligned16(ma343 + x, *sum_ma343);
1391 Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
1392 }
1393
Store343_444Hi(const __m128i ma3[3],const __m128i b3[2],const ptrdiff_t x,__m128i * const sum_ma343,__m128i * const sum_ma444,__m128i sum_b343[2],__m128i sum_b444[2],uint16_t * const ma343,uint16_t * const ma444,uint32_t * const b343,uint32_t * const b444)1394 inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
1395 const ptrdiff_t x, __m128i* const sum_ma343,
1396 __m128i* const sum_ma444, __m128i sum_b343[2],
1397 __m128i sum_b444[2], uint16_t* const ma343,
1398 uint16_t* const ma444, uint32_t* const b343,
1399 uint32_t* const b444) {
1400 const __m128i sum_ma111 = Sum3WHi16(ma3);
1401 *sum_ma444 = _mm_slli_epi16(sum_ma111, 2);
1402 StoreAligned16(ma444 + x, *sum_ma444);
1403 const __m128i sum333 = _mm_sub_epi16(*sum_ma444, sum_ma111);
1404 *sum_ma343 = VaddwHi8(sum333, ma3[1]);
1405 StoreAligned16(ma343 + x, *sum_ma343);
1406 Store343_444(b3, x, sum_b343, sum_b444, b343, b444);
1407 }
1408
Store343_444Lo(const __m128i ma3[3],const __m128i b3[2],const ptrdiff_t x,__m128i * const sum_ma343,__m128i sum_b343[2],uint16_t * const ma343,uint16_t * const ma444,uint32_t * const b343,uint32_t * const b444)1409 inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
1410 const ptrdiff_t x, __m128i* const sum_ma343,
1411 __m128i sum_b343[2], uint16_t* const ma343,
1412 uint16_t* const ma444, uint32_t* const b343,
1413 uint32_t* const b444) {
1414 __m128i sum_ma444, sum_b444[2];
1415 Store343_444Lo(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
1416 ma444, b343, b444);
1417 }
1418
Store343_444Hi(const __m128i ma3[3],const __m128i b3[2],const ptrdiff_t x,__m128i * const sum_ma343,__m128i sum_b343[2],uint16_t * const ma343,uint16_t * const ma444,uint32_t * const b343,uint32_t * const b444)1419 inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
1420 const ptrdiff_t x, __m128i* const sum_ma343,
1421 __m128i sum_b343[2], uint16_t* const ma343,
1422 uint16_t* const ma444, uint32_t* const b343,
1423 uint32_t* const b444) {
1424 __m128i sum_ma444, sum_b444[2];
1425 Store343_444Hi(ma3, b3, x, sum_ma343, &sum_ma444, sum_b343, sum_b444, ma343,
1426 ma444, b343, b444);
1427 }
1428
Store343_444Lo(const __m128i ma3[3],const __m128i b3[2],const ptrdiff_t x,uint16_t * const ma343,uint16_t * const ma444,uint32_t * const b343,uint32_t * const b444)1429 inline void Store343_444Lo(const __m128i ma3[3], const __m128i b3[2],
1430 const ptrdiff_t x, uint16_t* const ma343,
1431 uint16_t* const ma444, uint32_t* const b343,
1432 uint32_t* const b444) {
1433 __m128i sum_ma343, sum_b343[2];
1434 Store343_444Lo(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
1435 }
1436
Store343_444Hi(const __m128i ma3[3],const __m128i b3[2],const ptrdiff_t x,uint16_t * const ma343,uint16_t * const ma444,uint32_t * const b343,uint32_t * const b444)1437 inline void Store343_444Hi(const __m128i ma3[3], const __m128i b3[2],
1438 const ptrdiff_t x, uint16_t* const ma343,
1439 uint16_t* const ma444, uint32_t* const b343,
1440 uint32_t* const b444) {
1441 __m128i sum_ma343, sum_b343[2];
1442 Store343_444Hi(ma3, b3, x, &sum_ma343, sum_b343, ma343, ma444, b343, b444);
1443 }
1444
BoxFilterPreProcess5Lo(const __m128i s[2][2],const uint32_t scale,uint16_t * const sum5[5],uint32_t * const square_sum5[5],__m128i sq[2][4],__m128i * const ma,__m128i * const b)1445 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5Lo(
1446 const __m128i s[2][2], const uint32_t scale, uint16_t* const sum5[5],
1447 uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i* const ma,
1448 __m128i* const b) {
1449 __m128i s5[2][5], sq5[5][2];
1450 sq[0][1] = SquareHi8(s[0][0]);
1451 sq[1][1] = SquareHi8(s[1][0]);
1452 s5[0][3] = Sum5Horizontal(s[0][0]);
1453 StoreAligned16(sum5[3], s5[0][3]);
1454 s5[0][4] = Sum5Horizontal(s[1][0]);
1455 StoreAligned16(sum5[4], s5[0][4]);
1456 Sum5WHorizontal(sq[0], sq5[3]);
1457 StoreAligned32U32(square_sum5[3], sq5[3]);
1458 Sum5WHorizontal(sq[1], sq5[4]);
1459 StoreAligned32U32(square_sum5[4], sq5[4]);
1460 LoadAligned16x3U16(sum5, 0, s5[0]);
1461 LoadAligned32x3U32(square_sum5, 0, sq5);
1462 CalculateIntermediate5<0>(s5[0], sq5, scale, ma, b);
1463 }
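
// Note on the ring buffers: the two freshly loaded source rows provide rows 3
// and 4 of the 5-row window (stored to sum5[3]/sum5[4] and square_sum5[3]/[4]
// above), while rows 0-2 are reloaded from sums computed for earlier rows.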
1464
BoxFilterPreProcess5(const __m128i s[2][2],const ptrdiff_t sum_width,const ptrdiff_t x,const uint32_t scale,uint16_t * const sum5[5],uint32_t * const square_sum5[5],__m128i sq[2][4],__m128i ma[2],__m128i b[3])1465 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5(
1466 const __m128i s[2][2], const ptrdiff_t sum_width, const ptrdiff_t x,
1467 const uint32_t scale, uint16_t* const sum5[5],
1468 uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma[2],
1469 __m128i b[3]) {
1470 __m128i s5[2][5], sq5[5][2];
1471 sq[0][2] = SquareLo8(s[0][1]);
1472 sq[1][2] = SquareLo8(s[1][1]);
1473 Sum5Horizontal<8>(s[0], &s5[0][3], &s5[1][3]);
1474 StoreAligned16(sum5[3] + x + 0, s5[0][3]);
1475 StoreAligned16(sum5[3] + x + 8, s5[1][3]);
1476 Sum5Horizontal<8>(s[1], &s5[0][4], &s5[1][4]);
1477 StoreAligned16(sum5[4] + x + 0, s5[0][4]);
1478 StoreAligned16(sum5[4] + x + 8, s5[1][4]);
1479 Sum5WHorizontal(sq[0] + 1, sq5[3]);
1480 StoreAligned32U32(square_sum5[3] + x, sq5[3]);
1481 Sum5WHorizontal(sq[1] + 1, sq5[4]);
1482 StoreAligned32U32(square_sum5[4] + x, sq5[4]);
1483 LoadAligned16x3U16(sum5, x, s5[0]);
1484 LoadAligned32x3U32(square_sum5, x, sq5);
1485 CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
1486
1487 sq[0][3] = SquareHi8(s[0][1]);
1488 sq[1][3] = SquareHi8(s[1][1]);
1489 Sum5WHorizontal(sq[0] + 2, sq5[3]);
1490 StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
1491 Sum5WHorizontal(sq[1] + 2, sq5[4]);
1492 StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
1493 LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
1494 LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
1495 CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
1496 }
1497
BoxFilterPreProcess5LastRowLo(const __m128i s,const uint32_t scale,const uint16_t * const sum5[5],const uint32_t * const square_sum5[5],__m128i sq[2],__m128i * const ma,__m128i * const b)1498 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRowLo(
1499 const __m128i s, const uint32_t scale, const uint16_t* const sum5[5],
1500 const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma,
1501 __m128i* const b) {
1502 __m128i s5[5], sq5[5][2];
1503 sq[1] = SquareHi8(s);
1504 s5[3] = s5[4] = Sum5Horizontal(s);
1505 Sum5WHorizontal(sq, sq5[3]);
1506 sq5[4][0] = sq5[3][0];
1507 sq5[4][1] = sq5[3][1];
1508 LoadAligned16x3U16(sum5, 0, s5);
1509 LoadAligned32x3U32(square_sum5, 0, sq5);
1510 CalculateIntermediate5<0>(s5, sq5, scale, ma, b);
1511 }
1512
BoxFilterPreProcess5LastRow(const __m128i s[2],const ptrdiff_t sum_width,const ptrdiff_t x,const uint32_t scale,const uint16_t * const sum5[5],const uint32_t * const square_sum5[5],__m128i sq[4],__m128i ma[2],__m128i b[3])1513 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess5LastRow(
1514 const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
1515 const uint32_t scale, const uint16_t* const sum5[5],
1516 const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma[2],
1517 __m128i b[3]) {
1518 __m128i s5[2][5], sq5[5][2];
1519 sq[2] = SquareLo8(s[1]);
1520 Sum5Horizontal<8>(s, &s5[0][3], &s5[1][3]);
1521 s5[0][4] = s5[0][3];
1522 s5[1][4] = s5[1][3];
1523 Sum5WHorizontal(sq + 1, sq5[3]);
1524 sq5[4][0] = sq5[3][0];
1525 sq5[4][1] = sq5[3][1];
1526 LoadAligned16x3U16(sum5, x, s5[0]);
1527 LoadAligned32x3U32(square_sum5, x, sq5);
1528 CalculateIntermediate5<8>(s5[0], sq5, scale, &ma[0], &b[1]);
1529
1530 sq[3] = SquareHi8(s[1]);
1531 Sum5WHorizontal(sq + 2, sq5[3]);
1532 sq5[4][0] = sq5[3][0];
1533 sq5[4][1] = sq5[3][1];
1534 LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
1535 LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
1536 CalculateIntermediate5<0>(s5[1], sq5, scale, &ma[1], &b[2]);
1537 }
1538
BoxFilterPreProcess3Lo(const __m128i s,const uint32_t scale,uint16_t * const sum3[3],uint32_t * const square_sum3[3],__m128i sq[2],__m128i * const ma,__m128i * const b)1539 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3Lo(
1540 const __m128i s, const uint32_t scale, uint16_t* const sum3[3],
1541 uint32_t* const square_sum3[3], __m128i sq[2], __m128i* const ma,
1542 __m128i* const b) {
1543 __m128i s3[3], sq3[3][2];
1544 sq[1] = SquareHi8(s);
1545 s3[2] = Sum3Horizontal(s);
1546 StoreAligned16(sum3[2], s3[2]);
1547 Sum3WHorizontal(sq, sq3[2]);
1548 StoreAligned32U32(square_sum3[2], sq3[2]);
1549 LoadAligned16x2U16(sum3, 0, s3);
1550 LoadAligned32x2U32(square_sum3, 0, sq3);
1551 CalculateIntermediate3(s3, sq3, scale, ma, b);
1552 }
1553
BoxFilterPreProcess3(const __m128i s[2],const ptrdiff_t x,const ptrdiff_t sum_width,const uint32_t scale,uint16_t * const sum3[3],uint32_t * const square_sum3[3],__m128i sq[4],__m128i ma[2],__m128i b[3])1554 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess3(
1555 const __m128i s[2], const ptrdiff_t x, const ptrdiff_t sum_width,
1556 const uint32_t scale, uint16_t* const sum3[3],
1557 uint32_t* const square_sum3[3], __m128i sq[4], __m128i ma[2],
1558 __m128i b[3]) {
1559 __m128i s3[4], sq3[3][2], sum[2], index[2];
1560 sq[2] = SquareLo8(s[1]);
1561 Sum3Horizontal<8>(s, s3 + 2);
1562 StoreAligned32U16(sum3[2] + x, s3 + 2);
1563 Sum3WHorizontal(sq + 1, sq3[2]);
1564 StoreAligned32U32(square_sum3[2] + x + 0, sq3[2]);
1565 LoadAligned16x2U16(sum3, x, s3);
1566 LoadAligned32x2U32(square_sum3, x, sq3);
1567 CalculateSumAndIndex3(s3, sq3, scale, &sum[0], &index[0]);
1568
1569 sq[3] = SquareHi8(s[1]);
1570 Sum3WHorizontal(sq + 2, sq3[2]);
1571 StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
1572 LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3 + 1);
1573 LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
1574 CalculateSumAndIndex3(s3 + 1, sq3, scale, &sum[1], &index[1]);
1575 CalculateIntermediate(sum, index, ma, b + 1);
1576 }
1577
BoxFilterPreProcessLo(const __m128i s[2][2],const uint16_t scales[2],uint16_t * const sum3[4],uint16_t * const sum5[5],uint32_t * const square_sum3[4],uint32_t * const square_sum5[5],__m128i sq[2][4],__m128i ma3[2][2],__m128i b3[2][3],__m128i * const ma5,__m128i * const b5)1578 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLo(
1579 const __m128i s[2][2], const uint16_t scales[2], uint16_t* const sum3[4],
1580 uint16_t* const sum5[5], uint32_t* const square_sum3[4],
1581 uint32_t* const square_sum5[5], __m128i sq[2][4], __m128i ma3[2][2],
1582 __m128i b3[2][3], __m128i* const ma5, __m128i* const b5) {
1583 __m128i s3[4], s5[5], sq3[4][2], sq5[5][2], sum[2], index[2];
1584 sq[0][1] = SquareHi8(s[0][0]);
1585 sq[1][1] = SquareHi8(s[1][0]);
1586 SumHorizontalLo(s[0][0], &s3[2], &s5[3]);
1587 SumHorizontalLo(s[1][0], &s3[3], &s5[4]);
1588 StoreAligned16(sum3[2], s3[2]);
1589 StoreAligned16(sum3[3], s3[3]);
1590 StoreAligned16(sum5[3], s5[3]);
1591 StoreAligned16(sum5[4], s5[4]);
1592 SumHorizontal(sq[0], &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1593 StoreAligned32U32(square_sum3[2], sq3[2]);
1594 StoreAligned32U32(square_sum5[3], sq5[3]);
1595 SumHorizontal(sq[1], &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
1596 StoreAligned32U32(square_sum3[3], sq3[3]);
1597 StoreAligned32U32(square_sum5[4], sq5[4]);
1598 LoadAligned16x2U16(sum3, 0, s3);
1599 LoadAligned32x2U32(square_sum3, 0, sq3);
1600 LoadAligned16x3U16(sum5, 0, s5);
1601 LoadAligned32x3U32(square_sum5, 0, sq5);
1602 CalculateSumAndIndex3(s3 + 0, sq3 + 0, scales[1], &sum[0], &index[0]);
1603 CalculateSumAndIndex3(s3 + 1, sq3 + 1, scales[1], &sum[1], &index[1]);
1604 CalculateIntermediate(sum, index, &ma3[0][0], &b3[0][0], &b3[1][0]);
1605 ma3[1][0] = _mm_srli_si128(ma3[0][0], 8);
1606 CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
1607 }
1608
BoxFilterPreProcess(const __m128i s[2][2],const ptrdiff_t x,const uint16_t scales[2],uint16_t * const sum3[4],uint16_t * const sum5[5],uint32_t * const square_sum3[4],uint32_t * const square_sum5[5],const ptrdiff_t sum_width,__m128i sq[2][4],__m128i ma3[2][2],__m128i b3[2][3],__m128i ma5[2],__m128i b5[3])1609 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcess(
1610 const __m128i s[2][2], const ptrdiff_t x, const uint16_t scales[2],
1611 uint16_t* const sum3[4], uint16_t* const sum5[5],
1612 uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
1613 const ptrdiff_t sum_width, __m128i sq[2][4], __m128i ma3[2][2],
1614 __m128i b3[2][3], __m128i ma5[2], __m128i b5[3]) {
1615 __m128i s3[2][4], s5[2][5], sq3[4][2], sq5[5][2], sum[2][2], index[2][2];
1616 SumHorizontal<8>(s[0], &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
1617 StoreAligned16(sum3[2] + x + 0, s3[0][2]);
1618 StoreAligned16(sum3[2] + x + 8, s3[1][2]);
1619 StoreAligned16(sum5[3] + x + 0, s5[0][3]);
1620 StoreAligned16(sum5[3] + x + 8, s5[1][3]);
1621 SumHorizontal<8>(s[1], &s3[0][3], &s3[1][3], &s5[0][4], &s5[1][4]);
1622 StoreAligned16(sum3[3] + x + 0, s3[0][3]);
1623 StoreAligned16(sum3[3] + x + 8, s3[1][3]);
1624 StoreAligned16(sum5[4] + x + 0, s5[0][4]);
1625 StoreAligned16(sum5[4] + x + 8, s5[1][4]);
1626 sq[0][2] = SquareLo8(s[0][1]);
1627 sq[1][2] = SquareLo8(s[1][1]);
1628 SumHorizontal(sq[0] + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1629 StoreAligned32U32(square_sum3[2] + x, sq3[2]);
1630 StoreAligned32U32(square_sum5[3] + x, sq5[3]);
1631 SumHorizontal(sq[1] + 1, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
1632 StoreAligned32U32(square_sum3[3] + x, sq3[3]);
1633 StoreAligned32U32(square_sum5[4] + x, sq5[4]);
1634 LoadAligned16x2U16(sum3, x, s3[0]);
1635 LoadAligned32x2U32(square_sum3, x, sq3);
1636 CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0][0], &index[0][0]);
1637 CalculateSumAndIndex3(s3[0] + 1, sq3 + 1, scales[1], &sum[1][0],
1638 &index[1][0]);
1639 LoadAligned16x3U16(sum5, x, s5[0]);
1640 LoadAligned32x3U32(square_sum5, x, sq5);
1641 CalculateIntermediate5<8>(s5[0], sq5, scales[0], &ma5[0], &b5[1]);
1642
1643 sq[0][3] = SquareHi8(s[0][1]);
1644 sq[1][3] = SquareHi8(s[1][1]);
1645 SumHorizontal(sq[0] + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1646 StoreAligned32U32(square_sum3[2] + x + 8, sq3[2]);
1647 StoreAligned32U32(square_sum5[3] + x + 8, sq5[3]);
1648 SumHorizontal(sq[1] + 2, &sq3[3][0], &sq3[3][1], &sq5[4][0], &sq5[4][1]);
1649 StoreAligned32U32(square_sum3[3] + x + 8, sq3[3]);
1650 StoreAligned32U32(square_sum5[4] + x + 8, sq5[4]);
1651 LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
1652 LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
1653 CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[0][1], &index[0][1]);
1654 CalculateSumAndIndex3(s3[1] + 1, sq3 + 1, scales[1], &sum[1][1],
1655 &index[1][1]);
1656 CalculateIntermediate(sum[0], index[0], ma3[0], b3[0] + 1);
1657 CalculateIntermediate(sum[1], index[1], ma3[1], b3[1] + 1);
1658 LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
1659 LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
1660 CalculateIntermediate5<0>(s5[1], sq5, scales[0], &ma5[1], &b5[2]);
1661 }
1662
BoxFilterPreProcessLastRowLo(const __m128i s,const uint16_t scales[2],const uint16_t * const sum3[4],const uint16_t * const sum5[5],const uint32_t * const square_sum3[4],const uint32_t * const square_sum5[5],__m128i sq[2],__m128i * const ma3,__m128i * const ma5,__m128i * const b3,__m128i * const b5)1663 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRowLo(
1664 const __m128i s, const uint16_t scales[2], const uint16_t* const sum3[4],
1665 const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
1666 const uint32_t* const square_sum5[5], __m128i sq[2], __m128i* const ma3,
1667 __m128i* const ma5, __m128i* const b3, __m128i* const b5) {
1668 __m128i s3[3], s5[5], sq3[3][2], sq5[5][2];
1669 sq[1] = SquareHi8(s);
1670 SumHorizontalLo(s, &s3[2], &s5[3]);
1671 SumHorizontal(sq, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1672 LoadAligned16x3U16(sum5, 0, s5);
1673 s5[4] = s5[3];
1674 LoadAligned32x3U32(square_sum5, 0, sq5);
1675 sq5[4][0] = sq5[3][0];
1676 sq5[4][1] = sq5[3][1];
1677 CalculateIntermediate5<0>(s5, sq5, scales[0], ma5, b5);
1678 LoadAligned16x2U16(sum3, 0, s3);
1679 LoadAligned32x2U32(square_sum3, 0, sq3);
1680 CalculateIntermediate3(s3, sq3, scales[1], ma3, b3);
1681 }
1682
BoxFilterPreProcessLastRow(const __m128i s[2],const ptrdiff_t sum_width,const ptrdiff_t x,const uint16_t scales[2],const uint16_t * const sum3[4],const uint16_t * const sum5[5],const uint32_t * const square_sum3[4],const uint32_t * const square_sum5[5],__m128i sq[4],__m128i ma3[2],__m128i ma5[2],__m128i b3[3],__m128i b5[3])1683 LIBGAV1_ALWAYS_INLINE void BoxFilterPreProcessLastRow(
1684 const __m128i s[2], const ptrdiff_t sum_width, const ptrdiff_t x,
1685 const uint16_t scales[2], const uint16_t* const sum3[4],
1686 const uint16_t* const sum5[5], const uint32_t* const square_sum3[4],
1687 const uint32_t* const square_sum5[5], __m128i sq[4], __m128i ma3[2],
1688 __m128i ma5[2], __m128i b3[3], __m128i b5[3]) {
1689 __m128i s3[2][3], s5[2][5], sq3[3][2], sq5[5][2], sum[2], index[2];
1690 sq[2] = SquareLo8(s[1]);
1691 SumHorizontal<8>(s, &s3[0][2], &s3[1][2], &s5[0][3], &s5[1][3]);
1692 SumHorizontal(sq + 1, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1693 LoadAligned16x3U16(sum5, x, s5[0]);
1694 s5[0][4] = s5[0][3];
1695 LoadAligned32x3U32(square_sum5, x, sq5);
1696 sq5[4][0] = sq5[3][0];
1697 sq5[4][1] = sq5[3][1];
1698 CalculateIntermediate5<8>(s5[0], sq5, scales[0], ma5, b5 + 1);
1699 LoadAligned16x2U16(sum3, x, s3[0]);
1700 LoadAligned32x2U32(square_sum3, x, sq3);
1701 CalculateSumAndIndex3(s3[0], sq3, scales[1], &sum[0], &index[0]);
1702
1703 sq[3] = SquareHi8(s[1]);
1704 SumHorizontal(sq + 2, &sq3[2][0], &sq3[2][1], &sq5[3][0], &sq5[3][1]);
1705 LoadAligned16x3U16Msan(sum5, x + 8, sum_width, s5[1]);
1706 s5[1][4] = s5[1][3];
1707 LoadAligned32x3U32Msan(square_sum5, x + 8, sum_width, sq5);
1708 sq5[4][0] = sq5[3][0];
1709 sq5[4][1] = sq5[3][1];
1710 CalculateIntermediate5<0>(s5[1], sq5, scales[0], ma5 + 1, b5 + 2);
1711 LoadAligned16x2U16Msan(sum3, x + 8, sum_width, s3[1]);
1712 LoadAligned32x2U32Msan(square_sum3, x + 8, sum_width, sq3);
1713 CalculateSumAndIndex3(s3[1], sq3, scales[1], &sum[1], &index[1]);
1714 CalculateIntermediate(sum, index, ma3, b3 + 1);
1715 }
1716
BoxSumFilterPreProcess5(const uint8_t * const src0,const uint8_t * const src1,const int width,const uint32_t scale,uint16_t * const sum5[5],uint32_t * const square_sum5[5],const ptrdiff_t sum_width,uint16_t * ma565,uint32_t * b565)1717 inline void BoxSumFilterPreProcess5(const uint8_t* const src0,
1718 const uint8_t* const src1, const int width,
1719 const uint32_t scale,
1720 uint16_t* const sum5[5],
1721 uint32_t* const square_sum5[5],
1722 const ptrdiff_t sum_width, uint16_t* ma565,
1723 uint32_t* b565) {
1724 __m128i s[2][2], mas[2], sq[2][4], bs[3];
1725 s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
1726 s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
1727 sq[0][0] = SquareLo8(s[0][0]);
1728 sq[1][0] = SquareLo8(s[1][0]);
1729 BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
1730
1731 int x = 0;
1732 do {
1733 __m128i ma5[3], ma[2], b[4];
1734 s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
1735 x + 16 + kOverreadInBytesPass1 - width);
1736 s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
1737 x + 16 + kOverreadInBytesPass1 - width);
1738 BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
1739 bs);
1740 Prepare3_8<0>(mas, ma5);
1741 ma[0] = Sum565Lo(ma5);
1742 ma[1] = Sum565Hi(ma5);
1743 StoreAligned32U16(ma565, ma);
1744 Sum565W(bs + 0, b + 0);
1745 Sum565W(bs + 1, b + 2);
1746 StoreAligned64U32(b565, b);
1747 s[0][0] = s[0][1];
1748 s[1][0] = s[1][1];
1749 sq[0][1] = sq[0][3];
1750 sq[1][1] = sq[1][3];
1751 mas[0] = mas[1];
1752 bs[0] = bs[2];
1753 ma565 += 16;
1754 b565 += 16;
1755 x += 16;
1756 } while (x < width);
1757 }
1758
1759 template <bool calculate444>
BoxSumFilterPreProcess3(const uint8_t * const src,const int width,const uint32_t scale,uint16_t * const sum3[3],uint32_t * const square_sum3[3],const ptrdiff_t sum_width,uint16_t * ma343,uint16_t * ma444,uint32_t * b343,uint32_t * b444)1760 LIBGAV1_ALWAYS_INLINE void BoxSumFilterPreProcess3(
1761 const uint8_t* const src, const int width, const uint32_t scale,
1762 uint16_t* const sum3[3], uint32_t* const square_sum3[3],
1763 const ptrdiff_t sum_width, uint16_t* ma343, uint16_t* ma444, uint32_t* b343,
1764 uint32_t* b444) {
1765 __m128i s[2], mas[2], sq[4], bs[3];
1766 s[0] = LoadUnaligned16Msan(src, kOverreadInBytesPass2 - width);
1767 sq[0] = SquareLo8(s[0]);
1768 BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
1769
1770 int x = 0;
1771 do {
1772 s[1] = LoadUnaligned16Msan(src + x + 16,
1773 x + 16 + kOverreadInBytesPass2 - width);
1774 BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
1775 bs);
1776 __m128i ma3[3];
1777 Prepare3_8<0>(mas, ma3);
1778 if (calculate444) { // NOLINT(readability-simplify-boolean-expr)
1779 Store343_444Lo(ma3, bs + 0, 0, ma343, ma444, b343, b444);
1780 Store343_444Hi(ma3, bs + 1, 8, ma343, ma444, b343, b444);
1781 ma444 += 16;
1782 b444 += 16;
1783 } else {
1784 __m128i ma[2], b[4];
1785 ma[0] = Sum343Lo(ma3);
1786 ma[1] = Sum343Hi(ma3);
1787 StoreAligned32U16(ma343, ma);
1788 Sum343W(bs + 0, b + 0);
1789 Sum343W(bs + 1, b + 2);
1790 StoreAligned64U32(b343, b);
1791 }
1792 s[0] = s[1];
1793 sq[1] = sq[3];
1794 mas[0] = mas[1];
1795 bs[0] = bs[2];
1796 ma343 += 16;
1797 b343 += 16;
1798 x += 16;
1799 } while (x < width);
1800 }
1801
BoxSumFilterPreProcess(const uint8_t * const src0,const uint8_t * const src1,const int width,const uint16_t scales[2],uint16_t * const sum3[4],uint16_t * const sum5[5],uint32_t * const square_sum3[4],uint32_t * const square_sum5[5],const ptrdiff_t sum_width,uint16_t * const ma343[4],uint16_t * const ma444,uint16_t * ma565,uint32_t * const b343[4],uint32_t * const b444,uint32_t * b565)1802 inline void BoxSumFilterPreProcess(
1803 const uint8_t* const src0, const uint8_t* const src1, const int width,
1804 const uint16_t scales[2], uint16_t* const sum3[4], uint16_t* const sum5[5],
1805 uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
1806 const ptrdiff_t sum_width, uint16_t* const ma343[4], uint16_t* const ma444,
1807 uint16_t* ma565, uint32_t* const b343[4], uint32_t* const b444,
1808 uint32_t* b565) {
1809 __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
1810 s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
1811 s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
1812 sq[0][0] = SquareLo8(s[0][0]);
1813 sq[1][0] = SquareLo8(s[1][0]);
1814 BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
1815 ma3, b3, &ma5[0], &b5[0]);
1816
1817 int x = 0;
1818 do {
1819 __m128i ma[2], b[4], ma3x[3], ma5x[3];
1820 s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
1821 x + 16 + kOverreadInBytesPass1 - width);
1822 s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
1823 x + 16 + kOverreadInBytesPass1 - width);
1824 BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
1825 sum_width, sq, ma3, b3, ma5, b5);
1826
1827 Prepare3_8<0>(ma3[0], ma3x);
1828 ma[0] = Sum343Lo(ma3x);
1829 ma[1] = Sum343Hi(ma3x);
1830 StoreAligned32U16(ma343[0] + x, ma);
1831 Sum343W(b3[0] + 0, b + 0);
1832 Sum343W(b3[0] + 1, b + 2);
1833 StoreAligned64U32(b343[0] + x, b);
1834 Sum565W(b5 + 0, b + 0);
1835 Sum565W(b5 + 1, b + 2);
1836 StoreAligned64U32(b565, b);
1837 Prepare3_8<0>(ma3[1], ma3x);
1838 Store343_444Lo(ma3x, b3[1], x, ma343[1], ma444, b343[1], b444);
1839 Store343_444Hi(ma3x, b3[1] + 1, x + 8, ma343[1], ma444, b343[1], b444);
1840 Prepare3_8<0>(ma5, ma5x);
1841 ma[0] = Sum565Lo(ma5x);
1842 ma[1] = Sum565Hi(ma5x);
1843 StoreAligned32U16(ma565, ma);
1844 s[0][0] = s[0][1];
1845 s[1][0] = s[1][1];
1846 sq[0][1] = sq[0][3];
1847 sq[1][1] = sq[1][3];
1848 ma3[0][0] = ma3[0][1];
1849 ma3[1][0] = ma3[1][1];
1850 ma5[0] = ma5[1];
1851 b3[0][0] = b3[0][2];
1852 b3[1][0] = b3[1][2];
1853 b5[0] = b5[2];
1854 ma565 += 16;
1855 b565 += 16;
1856 x += 16;
1857 } while (x < width);
1858 }
1859
1860 template <int shift>
FilterOutput(const __m128i ma_x_src,const __m128i b)1861 inline __m128i FilterOutput(const __m128i ma_x_src, const __m128i b) {
1862 // ma: 255 * 32 = 8160 (13 bits)
1863 // b: 65088 * 32 = 2082816 (21 bits)
1864 // v: b - ma * 255 (22 bits)
1865 const __m128i v = _mm_sub_epi32(b, ma_x_src);
1866 // kSgrProjSgrBits = 8
1867 // kSgrProjRestoreBits = 4
1868 // shift = 4 or 5
1869 // v >> 8 or 9 (13 bits)
1870 return VrshrS32(v, kSgrProjSgrBits + shift - kSgrProjRestoreBits);
1871 }
1872
1873 template <int shift>
CalculateFilteredOutput(const __m128i src,const __m128i ma,const __m128i b[2])1874 inline __m128i CalculateFilteredOutput(const __m128i src, const __m128i ma,
1875 const __m128i b[2]) {
1876 const __m128i ma_x_src_lo = VmullLo16(ma, src);
1877 const __m128i ma_x_src_hi = VmullHi16(ma, src);
1878 const __m128i dst_lo = FilterOutput<shift>(ma_x_src_lo, b[0]);
1879 const __m128i dst_hi = FilterOutput<shift>(ma_x_src_hi, b[1]);
1880 return _mm_packs_epi32(dst_lo, dst_hi); // 13 bits
1881 }
1882
CalculateFilteredOutputPass1(const __m128i src,const __m128i ma[2],const __m128i b[2][2])1883 inline __m128i CalculateFilteredOutputPass1(const __m128i src,
1884 const __m128i ma[2],
1885 const __m128i b[2][2]) {
1886 const __m128i ma_sum = _mm_add_epi16(ma[0], ma[1]);
1887 __m128i b_sum[2];
1888 b_sum[0] = _mm_add_epi32(b[0][0], b[1][0]);
1889 b_sum[1] = _mm_add_epi32(b[0][1], b[1][1]);
1890 return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
1891 }
1892
CalculateFilteredOutputPass2(const __m128i src,const __m128i ma[3],const __m128i b[3][2])1893 inline __m128i CalculateFilteredOutputPass2(const __m128i src,
1894 const __m128i ma[3],
1895 const __m128i b[3][2]) {
1896 const __m128i ma_sum = Sum3_16(ma);
1897 __m128i b_sum[2];
1898 Sum3_32(b, b_sum);
1899 return CalculateFilteredOutput<5>(src, ma_sum, b_sum);
1900 }
1901
SelfGuidedFinal(const __m128i src,const __m128i v[2])1902 inline __m128i SelfGuidedFinal(const __m128i src, const __m128i v[2]) {
1903 const __m128i v_lo =
1904 VrshrS32(v[0], kSgrProjRestoreBits + kSgrProjPrecisionBits);
1905 const __m128i v_hi =
1906 VrshrS32(v[1], kSgrProjRestoreBits + kSgrProjPrecisionBits);
1907 const __m128i vv = _mm_packs_epi32(v_lo, v_hi);
1908 return _mm_add_epi16(src, vv);
1909 }
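
// A scalar sketch of SelfGuidedFinal() (illustrative only; assumes
// kSgrProjRestoreBits == 4 and kSgrProjPrecisionBits == 7): each output
// sample is src + RightShiftWithRounding(v, 4 + 7), where |v| is the
// weighted filter sum produced by the multiplier helpers below.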
1910
SelfGuidedDoubleMultiplier(const __m128i src,const __m128i filter[2],const int w0,const int w2)1911 inline __m128i SelfGuidedDoubleMultiplier(const __m128i src,
1912 const __m128i filter[2], const int w0,
1913 const int w2) {
1914 __m128i v[2];
1915 const __m128i w0_w2 = _mm_set1_epi32((w2 << 16) | static_cast<uint16_t>(w0));
1916 const __m128i f_lo = _mm_unpacklo_epi16(filter[0], filter[1]);
1917 const __m128i f_hi = _mm_unpackhi_epi16(filter[0], filter[1]);
1918 v[0] = _mm_madd_epi16(w0_w2, f_lo);
1919 v[1] = _mm_madd_epi16(w0_w2, f_hi);
1920 return SelfGuidedFinal(src, v);
1921 }
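
// SelfGuidedDoubleMultiplier() packs w0 and w2 into one 32-bit lane so that a
// single _mm_madd_epi16() yields w0 * filter[0] + w2 * filter[1] per pixel,
// i.e. both pass outputs are weighted and summed with one multiply-add.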
1922
SelfGuidedSingleMultiplier(const __m128i src,const __m128i filter,const int w0)1923 inline __m128i SelfGuidedSingleMultiplier(const __m128i src,
1924 const __m128i filter, const int w0) {
1925 // weight: -96 to 96 (Sgrproj_Xqd_Min/Max)
1926 __m128i v[2];
1927 v[0] = VmullNLo8(filter, w0);
1928 v[1] = VmullNHi8(filter, w0);
1929 return SelfGuidedFinal(src, v);
1930 }
1931
BoxFilterPass1(const uint8_t * const src,const uint8_t * const src0,const uint8_t * const src1,const ptrdiff_t stride,uint16_t * const sum5[5],uint32_t * const square_sum5[5],const int width,const ptrdiff_t sum_width,const uint32_t scale,const int16_t w0,uint16_t * const ma565[2],uint32_t * const b565[2],uint8_t * const dst)1932 LIBGAV1_ALWAYS_INLINE void BoxFilterPass1(
1933 const uint8_t* const src, const uint8_t* const src0,
1934 const uint8_t* const src1, const ptrdiff_t stride, uint16_t* const sum5[5],
1935 uint32_t* const square_sum5[5], const int width, const ptrdiff_t sum_width,
1936 const uint32_t scale, const int16_t w0, uint16_t* const ma565[2],
1937 uint32_t* const b565[2], uint8_t* const dst) {
1938 __m128i s[2][2], mas[2], sq[2][4], bs[3];
1939 s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
1940 s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
1941 sq[0][0] = SquareLo8(s[0][0]);
1942 sq[1][0] = SquareLo8(s[1][0]);
1943 BoxFilterPreProcess5Lo(s, scale, sum5, square_sum5, sq, &mas[0], &bs[0]);
1944
1945 int x = 0;
1946 do {
1947 __m128i ma[2], ma5[3], b[2][2], sr[2], p[2];
1948 s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
1949 x + 16 + kOverreadInBytesPass1 - width);
1950 s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
1951 x + 16 + kOverreadInBytesPass1 - width);
1952 BoxFilterPreProcess5(s, sum_width, x + 8, scale, sum5, square_sum5, sq, mas,
1953 bs);
1954 Prepare3_8<0>(mas, ma5);
1955 ma[1] = Sum565Lo(ma5);
1956 StoreAligned16(ma565[1] + x, ma[1]);
1957 Sum565W(bs, b[1]);
1958 StoreAligned32U32(b565[1] + x, b[1]);
1959 sr[0] = LoadAligned16(src + x);
1960 sr[1] = LoadAligned16(src + stride + x);
1961 const __m128i sr0_lo = _mm_unpacklo_epi8(sr[0], _mm_setzero_si128());
1962 const __m128i sr1_lo = _mm_unpacklo_epi8(sr[1], _mm_setzero_si128());
1963 ma[0] = LoadAligned16(ma565[0] + x);
1964 LoadAligned32U32(b565[0] + x, b[0]);
1965 p[0] = CalculateFilteredOutputPass1(sr0_lo, ma, b);
1966 p[1] = CalculateFilteredOutput<4>(sr1_lo, ma[1], b[1]);
1967 const __m128i d00 = SelfGuidedSingleMultiplier(sr0_lo, p[0], w0);
1968 const __m128i d10 = SelfGuidedSingleMultiplier(sr1_lo, p[1], w0);
1969
1970 ma[1] = Sum565Hi(ma5);
1971 StoreAligned16(ma565[1] + x + 8, ma[1]);
1972 Sum565W(bs + 1, b[1]);
1973 StoreAligned32U32(b565[1] + x + 8, b[1]);
1974 const __m128i sr0_hi = _mm_unpackhi_epi8(sr[0], _mm_setzero_si128());
1975 const __m128i sr1_hi = _mm_unpackhi_epi8(sr[1], _mm_setzero_si128());
1976 ma[0] = LoadAligned16(ma565[0] + x + 8);
1977 LoadAligned32U32(b565[0] + x + 8, b[0]);
1978 p[0] = CalculateFilteredOutputPass1(sr0_hi, ma, b);
1979 p[1] = CalculateFilteredOutput<4>(sr1_hi, ma[1], b[1]);
1980 const __m128i d01 = SelfGuidedSingleMultiplier(sr0_hi, p[0], w0);
1981 StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
1982 const __m128i d11 = SelfGuidedSingleMultiplier(sr1_hi, p[1], w0);
1983 StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
1984 s[0][0] = s[0][1];
1985 s[1][0] = s[1][1];
1986 sq[0][1] = sq[0][3];
1987 sq[1][1] = sq[1][3];
1988 mas[0] = mas[1];
1989 bs[0] = bs[2];
1990 x += 16;
1991 } while (x < width);
1992 }
1993
BoxFilterPass1LastRow(const uint8_t * const src,const uint8_t * const src0,const int width,const ptrdiff_t sum_width,const uint32_t scale,const int16_t w0,uint16_t * const sum5[5],uint32_t * const square_sum5[5],uint16_t * ma565,uint32_t * b565,uint8_t * const dst)1994 inline void BoxFilterPass1LastRow(
1995 const uint8_t* const src, const uint8_t* const src0, const int width,
1996 const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
1997 uint16_t* const sum5[5], uint32_t* const square_sum5[5], uint16_t* ma565,
1998 uint32_t* b565, uint8_t* const dst) {
1999 __m128i s[2], mas[2], sq[4], bs[3];
2000 s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
2001 sq[0] = SquareLo8(s[0]);
2002 BoxFilterPreProcess5LastRowLo(s[0], scale, sum5, square_sum5, sq, &mas[0],
2003 &bs[0]);
2004
2005 int x = 0;
2006 do {
2007 __m128i ma[2], ma5[3], b[2][2];
2008 s[1] = LoadUnaligned16Msan(src0 + x + 16,
2009 x + 16 + kOverreadInBytesPass1 - width);
2010 BoxFilterPreProcess5LastRow(s, sum_width, x + 8, scale, sum5, square_sum5,
2011 sq, mas, bs);
2012 Prepare3_8<0>(mas, ma5);
2013 ma[1] = Sum565Lo(ma5);
2014 Sum565W(bs, b[1]);
2015 ma[0] = LoadAligned16(ma565);
2016 LoadAligned32U32(b565, b[0]);
2017 const __m128i sr = LoadAligned16(src + x);
2018 const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
2019 __m128i p = CalculateFilteredOutputPass1(sr_lo, ma, b);
2020 const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p, w0);
2021
2022 ma[1] = Sum565Hi(ma5);
2023 Sum565W(bs + 1, b[1]);
2024 ma[0] = LoadAligned16(ma565 + 8);
2025 LoadAligned32U32(b565 + 8, b[0]);
2026 const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
2027 p = CalculateFilteredOutputPass1(sr_hi, ma, b);
2028 const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p, w0);
2029 StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
2030 s[0] = s[1];
2031 sq[1] = sq[3];
2032 mas[0] = mas[1];
2033 bs[0] = bs[2];
2034 ma565 += 16;
2035 b565 += 16;
2036 x += 16;
2037 } while (x < width);
2038 }
2039
BoxFilterPass2(const uint8_t * const src,const uint8_t * const src0,const int width,const ptrdiff_t sum_width,const uint32_t scale,const int16_t w0,uint16_t * const sum3[3],uint32_t * const square_sum3[3],uint16_t * const ma343[3],uint16_t * const ma444[2],uint32_t * const b343[3],uint32_t * const b444[2],uint8_t * const dst)2040 LIBGAV1_ALWAYS_INLINE void BoxFilterPass2(
2041 const uint8_t* const src, const uint8_t* const src0, const int width,
2042 const ptrdiff_t sum_width, const uint32_t scale, const int16_t w0,
2043 uint16_t* const sum3[3], uint32_t* const square_sum3[3],
2044 uint16_t* const ma343[3], uint16_t* const ma444[2], uint32_t* const b343[3],
2045 uint32_t* const b444[2], uint8_t* const dst) {
2046 __m128i s[2], mas[2], sq[4], bs[3];
2047 s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass2 - width);
2048 sq[0] = SquareLo8(s[0]);
2049 BoxFilterPreProcess3Lo(s[0], scale, sum3, square_sum3, sq, &mas[0], &bs[0]);
2050
2051 int x = 0;
2052 do {
2053 s[1] = LoadUnaligned16Msan(src0 + x + 16,
2054 x + 16 + kOverreadInBytesPass2 - width);
2055 BoxFilterPreProcess3(s, x + 8, sum_width, scale, sum3, square_sum3, sq, mas,
2056 bs);
2057 __m128i ma[3], b[3][2], ma3[3];
2058 Prepare3_8<0>(mas, ma3);
2059 Store343_444Lo(ma3, bs + 0, x, &ma[2], b[2], ma343[2], ma444[1], b343[2],
2060 b444[1]);
2061 const __m128i sr = LoadAligned16(src + x);
2062 const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
2063 ma[0] = LoadAligned16(ma343[0] + x);
2064 ma[1] = LoadAligned16(ma444[0] + x);
2065 LoadAligned32U32(b343[0] + x, b[0]);
2066 LoadAligned32U32(b444[0] + x, b[1]);
2067 const __m128i p0 = CalculateFilteredOutputPass2(sr_lo, ma, b);
2068
2069 Store343_444Hi(ma3, bs + 1, x + 8, &ma[2], b[2], ma343[2], ma444[1],
2070 b343[2], b444[1]);
2071 const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
2072 ma[0] = LoadAligned16(ma343[0] + x + 8);
2073 ma[1] = LoadAligned16(ma444[0] + x + 8);
2074 LoadAligned32U32(b343[0] + x + 8, b[0]);
2075 LoadAligned32U32(b444[0] + x + 8, b[1]);
2076 const __m128i p1 = CalculateFilteredOutputPass2(sr_hi, ma, b);
2077 const __m128i d0 = SelfGuidedSingleMultiplier(sr_lo, p0, w0);
2078 const __m128i d1 = SelfGuidedSingleMultiplier(sr_hi, p1, w0);
2079 StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
2080 s[0] = s[1];
2081 sq[1] = sq[3];
2082 mas[0] = mas[1];
2083 bs[0] = bs[2];
2084 x += 16;
2085 } while (x < width);
2086 }
2087
BoxFilter(const uint8_t * const src,const uint8_t * const src0,const uint8_t * const src1,const ptrdiff_t stride,const int width,const uint16_t scales[2],const int16_t w0,const int16_t w2,uint16_t * const sum3[4],uint16_t * const sum5[5],uint32_t * const square_sum3[4],uint32_t * const square_sum5[5],const ptrdiff_t sum_width,uint16_t * const ma343[4],uint16_t * const ma444[3],uint16_t * const ma565[2],uint32_t * const b343[4],uint32_t * const b444[3],uint32_t * const b565[2],uint8_t * const dst)2088 LIBGAV1_ALWAYS_INLINE void BoxFilter(
2089 const uint8_t* const src, const uint8_t* const src0,
2090 const uint8_t* const src1, const ptrdiff_t stride, const int width,
2091 const uint16_t scales[2], const int16_t w0, const int16_t w2,
2092 uint16_t* const sum3[4], uint16_t* const sum5[5],
2093 uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
2094 const ptrdiff_t sum_width, uint16_t* const ma343[4],
2095 uint16_t* const ma444[3], uint16_t* const ma565[2], uint32_t* const b343[4],
2096 uint32_t* const b444[3], uint32_t* const b565[2], uint8_t* const dst) {
2097 __m128i s[2][2], ma3[2][2], ma5[2], sq[2][4], b3[2][3], b5[3];
2098 ma5[1] = _mm_setzero_si128(); // Quiets -Wmaybe-uninitialized with gcc.
2099 s[0][0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
2100 s[1][0] = LoadUnaligned16Msan(src1, kOverreadInBytesPass1 - width);
2101 sq[0][0] = SquareLo8(s[0][0]);
2102 sq[1][0] = SquareLo8(s[1][0]);
2103 BoxFilterPreProcessLo(s, scales, sum3, sum5, square_sum3, square_sum5, sq,
2104 ma3, b3, &ma5[0], &b5[0]);
2105
2106 int x = 0;
2107 do {
2108 __m128i ma[3][3], b[3][3][2], p[2][2], ma3x[2][3], ma5x[3];
2109 s[0][1] = LoadUnaligned16Msan(src0 + x + 16,
2110 x + 16 + kOverreadInBytesPass1 - width);
2111 s[1][1] = LoadUnaligned16Msan(src1 + x + 16,
2112 x + 16 + kOverreadInBytesPass1 - width);
2113 BoxFilterPreProcess(s, x + 8, scales, sum3, sum5, square_sum3, square_sum5,
2114 sum_width, sq, ma3, b3, ma5, b5);
2115 Prepare3_8<0>(ma3[0], ma3x[0]);
2116 Prepare3_8<0>(ma3[1], ma3x[1]);
2117 Prepare3_8<0>(ma5, ma5x);
2118 Store343_444Lo(ma3x[0], b3[0], x, &ma[1][2], &ma[2][1], b[1][2], b[2][1],
2119 ma343[2], ma444[1], b343[2], b444[1]);
2120 Store343_444Lo(ma3x[1], b3[1], x, &ma[2][2], b[2][2], ma343[3], ma444[2],
2121 b343[3], b444[2]);
2122 ma[0][1] = Sum565Lo(ma5x);
2123 StoreAligned16(ma565[1] + x, ma[0][1]);
2124 Sum565W(b5, b[0][1]);
2125 StoreAligned32U32(b565[1] + x, b[0][1]);
2126 const __m128i sr0 = LoadAligned16(src + x);
2127 const __m128i sr1 = LoadAligned16(src + stride + x);
2128 const __m128i sr0_lo = _mm_unpacklo_epi8(sr0, _mm_setzero_si128());
2129 const __m128i sr1_lo = _mm_unpacklo_epi8(sr1, _mm_setzero_si128());
2130 ma[0][0] = LoadAligned16(ma565[0] + x);
2131 LoadAligned32U32(b565[0] + x, b[0][0]);
2132 p[0][0] = CalculateFilteredOutputPass1(sr0_lo, ma[0], b[0]);
2133 p[1][0] = CalculateFilteredOutput<4>(sr1_lo, ma[0][1], b[0][1]);
2134 ma[1][0] = LoadAligned16(ma343[0] + x);
2135 ma[1][1] = LoadAligned16(ma444[0] + x);
2136 LoadAligned32U32(b343[0] + x, b[1][0]);
2137 LoadAligned32U32(b444[0] + x, b[1][1]);
2138 p[0][1] = CalculateFilteredOutputPass2(sr0_lo, ma[1], b[1]);
2139 const __m128i d00 = SelfGuidedDoubleMultiplier(sr0_lo, p[0], w0, w2);
2140 ma[2][0] = LoadAligned16(ma343[1] + x);
2141 LoadAligned32U32(b343[1] + x, b[2][0]);
2142 p[1][1] = CalculateFilteredOutputPass2(sr1_lo, ma[2], b[2]);
2143 const __m128i d10 = SelfGuidedDoubleMultiplier(sr1_lo, p[1], w0, w2);
2144
2145 Store343_444Hi(ma3x[0], b3[0] + 1, x + 8, &ma[1][2], &ma[2][1], b[1][2],
2146 b[2][1], ma343[2], ma444[1], b343[2], b444[1]);
2147 Store343_444Hi(ma3x[1], b3[1] + 1, x + 8, &ma[2][2], b[2][2], ma343[3],
2148 ma444[2], b343[3], b444[2]);
2149 ma[0][1] = Sum565Hi(ma5x);
2150 StoreAligned16(ma565[1] + x + 8, ma[0][1]);
2151 Sum565W(b5 + 1, b[0][1]);
2152 StoreAligned32U32(b565[1] + x + 8, b[0][1]);
2153 const __m128i sr0_hi = _mm_unpackhi_epi8(sr0, _mm_setzero_si128());
2154 const __m128i sr1_hi = _mm_unpackhi_epi8(sr1, _mm_setzero_si128());
2155 ma[0][0] = LoadAligned16(ma565[0] + x + 8);
2156 LoadAligned32U32(b565[0] + x + 8, b[0][0]);
2157 p[0][0] = CalculateFilteredOutputPass1(sr0_hi, ma[0], b[0]);
2158 p[1][0] = CalculateFilteredOutput<4>(sr1_hi, ma[0][1], b[0][1]);
2159 ma[1][0] = LoadAligned16(ma343[0] + x + 8);
2160 ma[1][1] = LoadAligned16(ma444[0] + x + 8);
2161 LoadAligned32U32(b343[0] + x + 8, b[1][0]);
2162 LoadAligned32U32(b444[0] + x + 8, b[1][1]);
2163 p[0][1] = CalculateFilteredOutputPass2(sr0_hi, ma[1], b[1]);
2164 const __m128i d01 = SelfGuidedDoubleMultiplier(sr0_hi, p[0], w0, w2);
2165 StoreAligned16(dst + x, _mm_packus_epi16(d00, d01));
2166 ma[2][0] = LoadAligned16(ma343[1] + x + 8);
2167 LoadAligned32U32(b343[1] + x + 8, b[2][0]);
2168 p[1][1] = CalculateFilteredOutputPass2(sr1_hi, ma[2], b[2]);
2169 const __m128i d11 = SelfGuidedDoubleMultiplier(sr1_hi, p[1], w0, w2);
2170 StoreAligned16(dst + stride + x, _mm_packus_epi16(d10, d11));
2171 s[0][0] = s[0][1];
2172 s[1][0] = s[1][1];
2173 sq[0][1] = sq[0][3];
2174 sq[1][1] = sq[1][3];
2175 ma3[0][0] = ma3[0][1];
2176 ma3[1][0] = ma3[1][1];
2177 ma5[0] = ma5[1];
2178 b3[0][0] = b3[0][2];
2179 b3[1][0] = b3[1][2];
2180 b5[0] = b5[2];
2181 x += 16;
2182 } while (x < width);
2183 }
2184
BoxFilterLastRow(const uint8_t * const src,const uint8_t * const src0,const int width,const ptrdiff_t sum_width,const uint16_t scales[2],const int16_t w0,const int16_t w2,uint16_t * const sum3[4],uint16_t * const sum5[5],uint32_t * const square_sum3[4],uint32_t * const square_sum5[5],uint16_t * const ma343,uint16_t * const ma444,uint16_t * const ma565,uint32_t * const b343,uint32_t * const b444,uint32_t * const b565,uint8_t * const dst)2185 inline void BoxFilterLastRow(
2186 const uint8_t* const src, const uint8_t* const src0, const int width,
2187 const ptrdiff_t sum_width, const uint16_t scales[2], const int16_t w0,
2188 const int16_t w2, uint16_t* const sum3[4], uint16_t* const sum5[5],
2189 uint32_t* const square_sum3[4], uint32_t* const square_sum5[5],
2190 uint16_t* const ma343, uint16_t* const ma444, uint16_t* const ma565,
2191 uint32_t* const b343, uint32_t* const b444, uint32_t* const b565,
2192 uint8_t* const dst) {
2193 __m128i s[2], ma3[2], ma5[2], sq[4], b3[3], b5[3], ma[3], b[3][2];
2194 s[0] = LoadUnaligned16Msan(src0, kOverreadInBytesPass1 - width);
2195 sq[0] = SquareLo8(s[0]);
2196 BoxFilterPreProcessLastRowLo(s[0], scales, sum3, sum5, square_sum3,
2197 square_sum5, sq, &ma3[0], &ma5[0], &b3[0],
2198 &b5[0]);
2199
2200 int x = 0;
2201 do {
2202 __m128i ma3x[3], ma5x[3], p[2];
2203 s[1] = LoadUnaligned16Msan(src0 + x + 16,
2204 x + 16 + kOverreadInBytesPass1 - width);
2205 BoxFilterPreProcessLastRow(s, sum_width, x + 8, scales, sum3, sum5,
2206 square_sum3, square_sum5, sq, ma3, ma5, b3, b5);
2207 Prepare3_8<0>(ma3, ma3x);
2208 Prepare3_8<0>(ma5, ma5x);
2209 ma[1] = Sum565Lo(ma5x);
2210 Sum565W(b5, b[1]);
2211 ma[2] = Sum343Lo(ma3x);
2212 Sum343W(b3, b[2]);
2213 const __m128i sr = LoadAligned16(src + x);
2214 const __m128i sr_lo = _mm_unpacklo_epi8(sr, _mm_setzero_si128());
2215 ma[0] = LoadAligned16(ma565 + x);
2216 LoadAligned32U32(b565 + x, b[0]);
2217 p[0] = CalculateFilteredOutputPass1(sr_lo, ma, b);
2218 ma[0] = LoadAligned16(ma343 + x);
2219 ma[1] = LoadAligned16(ma444 + x);
2220 LoadAligned32U32(b343 + x, b[0]);
2221 LoadAligned32U32(b444 + x, b[1]);
2222 p[1] = CalculateFilteredOutputPass2(sr_lo, ma, b);
2223 const __m128i d0 = SelfGuidedDoubleMultiplier(sr_lo, p, w0, w2);
2224
2225 ma[1] = Sum565Hi(ma5x);
2226 Sum565W(b5 + 1, b[1]);
2227 ma[2] = Sum343Hi(ma3x);
2228 Sum343W(b3 + 1, b[2]);
2229 const __m128i sr_hi = _mm_unpackhi_epi8(sr, _mm_setzero_si128());
2230 ma[0] = LoadAligned16(ma565 + x + 8);
2231 LoadAligned32U32(b565 + x + 8, b[0]);
2232 p[0] = CalculateFilteredOutputPass1(sr_hi, ma, b);
2233 ma[0] = LoadAligned16(ma343 + x + 8);
2234 ma[1] = LoadAligned16(ma444 + x + 8);
2235 LoadAligned32U32(b343 + x + 8, b[0]);
2236 LoadAligned32U32(b444 + x + 8, b[1]);
2237 p[1] = CalculateFilteredOutputPass2(sr_hi, ma, b);
2238 const __m128i d1 = SelfGuidedDoubleMultiplier(sr_hi, p, w0, w2);
2239 StoreAligned16(dst + x, _mm_packus_epi16(d0, d1));
2240 s[0] = s[1];
2241 sq[1] = sq[3];
2242 ma3[0] = ma3[1];
2243 ma5[0] = ma5[1];
2244 b3[0] = b3[2];
2245 b5[0] = b5[2];
2246 x += 16;
2247 } while (x < width);
2248 }
2249
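// BoxFilterProcess() applies both self-guided passes. Rows are handled two at
// a time: BoxSum() and BoxSumFilterPreProcess() prime the circular sum
// buffers from the top border and the first row pair, the main loop calls
// BoxFilter() while rotating the sum3/sum5 and ma/b row pointers with
// Circulate*PointersBy2() and std::swap(), and the bottom border (plus
// BoxFilterLastRow() when |height| is odd) finishes the remaining rows.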
BoxFilterProcess(const RestorationUnitInfo & restoration_info,const uint8_t * src,const ptrdiff_t stride,const uint8_t * const top_border,const ptrdiff_t top_border_stride,const uint8_t * bottom_border,const ptrdiff_t bottom_border_stride,const int width,const int height,SgrBuffer * const sgr_buffer,uint8_t * dst)2250 LIBGAV1_ALWAYS_INLINE void BoxFilterProcess(
2251 const RestorationUnitInfo& restoration_info, const uint8_t* src,
2252 const ptrdiff_t stride, const uint8_t* const top_border,
2253 const ptrdiff_t top_border_stride, const uint8_t* bottom_border,
2254 const ptrdiff_t bottom_border_stride, const int width, const int height,
2255 SgrBuffer* const sgr_buffer, uint8_t* dst) {
2256 const auto temp_stride = Align<ptrdiff_t>(width, 16);
2257 const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
2258 const auto sum_stride = temp_stride + 16;
2259 const int sgr_proj_index = restoration_info.sgr_proj_info.index;
2260 const uint16_t* const scales = kSgrScaleParameter[sgr_proj_index]; // < 2^12.
2261 const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
2262 const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
2263 const int16_t w2 = (1 << kSgrProjPrecisionBits) - w0 - w1;
2264 uint16_t *sum3[4], *sum5[5], *ma343[4], *ma444[3], *ma565[2];
2265 uint32_t *square_sum3[4], *square_sum5[5], *b343[4], *b444[3], *b565[2];
2266 sum3[0] = sgr_buffer->sum3;
2267 square_sum3[0] = sgr_buffer->square_sum3;
2268 ma343[0] = sgr_buffer->ma343;
2269 b343[0] = sgr_buffer->b343;
2270 for (int i = 1; i <= 3; ++i) {
2271 sum3[i] = sum3[i - 1] + sum_stride;
2272 square_sum3[i] = square_sum3[i - 1] + sum_stride;
2273 ma343[i] = ma343[i - 1] + temp_stride;
2274 b343[i] = b343[i - 1] + temp_stride;
2275 }
2276 sum5[0] = sgr_buffer->sum5;
2277 square_sum5[0] = sgr_buffer->square_sum5;
2278 for (int i = 1; i <= 4; ++i) {
2279 sum5[i] = sum5[i - 1] + sum_stride;
2280 square_sum5[i] = square_sum5[i - 1] + sum_stride;
2281 }
2282 ma444[0] = sgr_buffer->ma444;
2283 b444[0] = sgr_buffer->b444;
2284 for (int i = 1; i <= 2; ++i) {
2285 ma444[i] = ma444[i - 1] + temp_stride;
2286 b444[i] = b444[i - 1] + temp_stride;
2287 }
2288 ma565[0] = sgr_buffer->ma565;
2289 ma565[1] = ma565[0] + temp_stride;
2290 b565[0] = sgr_buffer->b565;
2291 b565[1] = b565[0] + temp_stride;
2292 assert(scales[0] != 0);
2293 assert(scales[1] != 0);
2294 BoxSum(top_border, top_border_stride, width, sum_stride, sum_width, sum3[0],
2295 sum5[1], square_sum3[0], square_sum5[1]);
2296 sum5[0] = sum5[1];
2297 square_sum5[0] = square_sum5[1];
2298 const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
2299 BoxSumFilterPreProcess(src, s, width, scales, sum3, sum5, square_sum3,
2300 square_sum5, sum_width, ma343, ma444[0], ma565[0],
2301 b343, b444[0], b565[0]);
2302 sum5[0] = sgr_buffer->sum5;
2303 square_sum5[0] = sgr_buffer->square_sum5;
2304
2305 for (int y = (height >> 1) - 1; y > 0; --y) {
2306 Circulate4PointersBy2<uint16_t>(sum3);
2307 Circulate4PointersBy2<uint32_t>(square_sum3);
2308 Circulate5PointersBy2<uint16_t>(sum5);
2309 Circulate5PointersBy2<uint32_t>(square_sum5);
2310 BoxFilter(src + 3, src + 2 * stride, src + 3 * stride, stride, width,
2311 scales, w0, w2, sum3, sum5, square_sum3, square_sum5, sum_width,
2312 ma343, ma444, ma565, b343, b444, b565, dst);
2313 src += 2 * stride;
2314 dst += 2 * stride;
2315 Circulate4PointersBy2<uint16_t>(ma343);
2316 Circulate4PointersBy2<uint32_t>(b343);
2317 std::swap(ma444[0], ma444[2]);
2318 std::swap(b444[0], b444[2]);
2319 std::swap(ma565[0], ma565[1]);
2320 std::swap(b565[0], b565[1]);
2321 }
2322
2323 Circulate4PointersBy2<uint16_t>(sum3);
2324 Circulate4PointersBy2<uint32_t>(square_sum3);
2325 Circulate5PointersBy2<uint16_t>(sum5);
2326 Circulate5PointersBy2<uint32_t>(square_sum5);
2327 if ((height & 1) == 0 || height > 1) {
2328 const uint8_t* sr[2];
2329 if ((height & 1) == 0) {
2330 sr[0] = bottom_border;
2331 sr[1] = bottom_border + bottom_border_stride;
2332 } else {
2333 sr[0] = src + 2 * stride;
2334 sr[1] = bottom_border;
2335 }
2336 BoxFilter(src + 3, sr[0], sr[1], stride, width, scales, w0, w2, sum3, sum5,
2337 square_sum3, square_sum5, sum_width, ma343, ma444, ma565, b343,
2338 b444, b565, dst);
2339 }
  if ((height & 1) != 0) {
    if (height > 1) {
      src += 2 * stride;
      dst += 2 * stride;
      Circulate4PointersBy2<uint16_t>(sum3);
      Circulate4PointersBy2<uint32_t>(square_sum3);
      Circulate5PointersBy2<uint16_t>(sum5);
      Circulate5PointersBy2<uint32_t>(square_sum5);
      Circulate4PointersBy2<uint16_t>(ma343);
      Circulate4PointersBy2<uint32_t>(b343);
      std::swap(ma444[0], ma444[2]);
      std::swap(b444[0], b444[2]);
      std::swap(ma565[0], ma565[1]);
      std::swap(b565[0], b565[1]);
    }
    BoxFilterLastRow(src + 3, bottom_border + bottom_border_stride, width,
                     sum_width, scales, w0, w2, sum3, sum5, square_sum3,
                     square_sum5, ma343[0], ma444[0], ma565[0], b343[0],
                     b444[0], b565[0], dst);
  }
}

inline void BoxFilterProcessPass1(const RestorationUnitInfo& restoration_info,
                                  const uint8_t* src, const ptrdiff_t stride,
                                  const uint8_t* const top_border,
                                  const ptrdiff_t top_border_stride,
                                  const uint8_t* bottom_border,
                                  const ptrdiff_t bottom_border_stride,
                                  const int width, const int height,
                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
  const auto temp_stride = Align<ptrdiff_t>(width, 16);
  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
  const auto sum_stride = temp_stride + 16;
  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][0];  // < 2^12.
  const int16_t w0 = restoration_info.sgr_proj_info.multiplier[0];
  uint16_t *sum5[5], *ma565[2];
  uint32_t *square_sum5[5], *b565[2];
  sum5[0] = sgr_buffer->sum5;
  square_sum5[0] = sgr_buffer->square_sum5;
  for (int i = 1; i <= 4; ++i) {
    sum5[i] = sum5[i - 1] + sum_stride;
    square_sum5[i] = square_sum5[i - 1] + sum_stride;
  }
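  // |ma565| and |b565| hold two rows of intermediate pass-1 values and are
  // swapped after every pair of output rows, so one row's results can be
  // reused by the following iteration.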
  ma565[0] = sgr_buffer->ma565;
  ma565[1] = ma565[0] + temp_stride;
  b565[0] = sgr_buffer->b565;
  b565[1] = b565[0] + temp_stride;
  assert(scale != 0);
  BoxSum<5>(top_border, top_border_stride, width, sum_stride, sum_width,
            sum5[1], square_sum5[1]);
  sum5[0] = sum5[1];
  square_sum5[0] = square_sum5[1];
  const uint8_t* const s = (height > 1) ? src + stride : bottom_border;
  BoxSumFilterPreProcess5(src, s, width, scale, sum5, square_sum5, sum_width,
                          ma565[0], b565[0]);
  sum5[0] = sgr_buffer->sum5;
  square_sum5[0] = sgr_buffer->square_sum5;

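  // Main loop: two output rows per iteration. Only the two new input rows
  // (src + 2 * stride and src + 3 * stride) have to be summed; the older rows
  // are already present in the rotated |sum5|/|square_sum5| ring buffers.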
  for (int y = (height >> 1) - 1; y > 0; --y) {
    Circulate5PointersBy2<uint16_t>(sum5);
    Circulate5PointersBy2<uint32_t>(square_sum5);
    BoxFilterPass1(src + 3, src + 2 * stride, src + 3 * stride, stride, sum5,
                   square_sum5, width, sum_width, scale, w0, ma565, b565, dst);
    src += 2 * stride;
    dst += 2 * stride;
    std::swap(ma565[0], ma565[1]);
    std::swap(b565[0], b565[1]);
  }

  Circulate5PointersBy2<uint16_t>(sum5);
  Circulate5PointersBy2<uint32_t>(square_sum5);
  if ((height & 1) == 0 || height > 1) {
    const uint8_t* sr[2];
    if ((height & 1) == 0) {
      sr[0] = bottom_border;
      sr[1] = bottom_border + bottom_border_stride;
    } else {
      sr[0] = src + 2 * stride;
      sr[1] = bottom_border;
    }
    BoxFilterPass1(src + 3, sr[0], sr[1], stride, sum5, square_sum5, width,
                   sum_width, scale, w0, ma565, b565, dst);
  }
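  // Odd |height|: produce the final row with the dedicated last-row path,
  // applying the same +3 column offset that the BoxFilterPass1 calls above
  // pass as |src + 3|.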
  if ((height & 1) != 0) {
    src += 3;
    if (height > 1) {
      src += 2 * stride;
      dst += 2 * stride;
      std::swap(ma565[0], ma565[1]);
      std::swap(b565[0], b565[1]);
      Circulate5PointersBy2<uint16_t>(sum5);
      Circulate5PointersBy2<uint32_t>(square_sum5);
    }
    BoxFilterPass1LastRow(src, bottom_border + bottom_border_stride, width,
                          sum_width, scale, w0, sum5, square_sum5, ma565[0],
                          b565[0], dst);
  }
}

inline void BoxFilterProcessPass2(const RestorationUnitInfo& restoration_info,
                                  const uint8_t* src, const ptrdiff_t stride,
                                  const uint8_t* const top_border,
                                  const ptrdiff_t top_border_stride,
                                  const uint8_t* bottom_border,
                                  const ptrdiff_t bottom_border_stride,
                                  const int width, const int height,
                                  SgrBuffer* const sgr_buffer, uint8_t* dst) {
  assert(restoration_info.sgr_proj_info.multiplier[0] == 0);
  const auto temp_stride = Align<ptrdiff_t>(width, 16);
  const auto sum_width = Align<ptrdiff_t>(width + 8, 16);
  const auto sum_stride = temp_stride + 16;
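  // Pass 0 is disabled here (multiplier[0] == 0, asserted above), so the two
  // blend weights are |w1| and its complement |w0|; together they sum to
  // 1 << kSgrProjPrecisionBits.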
  const int16_t w1 = restoration_info.sgr_proj_info.multiplier[1];
  const int16_t w0 = (1 << kSgrProjPrecisionBits) - w1;
  const int sgr_proj_index = restoration_info.sgr_proj_info.index;
  const uint32_t scale = kSgrScaleParameter[sgr_proj_index][1];  // < 2^12.
  uint16_t *sum3[3], *ma343[3], *ma444[2];
  uint32_t *square_sum3[3], *b343[3], *b444[2];
  sum3[0] = sgr_buffer->sum3;
  square_sum3[0] = sgr_buffer->square_sum3;
  ma343[0] = sgr_buffer->ma343;
  b343[0] = sgr_buffer->b343;
  for (int i = 1; i <= 2; ++i) {
    sum3[i] = sum3[i - 1] + sum_stride;
    square_sum3[i] = square_sum3[i - 1] + sum_stride;
    ma343[i] = ma343[i - 1] + temp_stride;
    b343[i] = b343[i - 1] + temp_stride;
  }
  ma444[0] = sgr_buffer->ma444;
  ma444[1] = ma444[0] + temp_stride;
  b444[0] = sgr_buffer->b444;
  b444[1] = b444[0] + temp_stride;
  assert(scale != 0);
  BoxSum<3>(top_border, top_border_stride, width, sum_stride, sum_width,
            sum3[0], square_sum3[0]);
  BoxSumFilterPreProcess3<false>(src, width, scale, sum3, square_sum3,
                                 sum_width, ma343[0], nullptr, b343[0],
                                 nullptr);
  Circulate3PointersBy1<uint16_t>(sum3);
  Circulate3PointersBy1<uint32_t>(square_sum3);
  const uint8_t* s;
  if (height > 1) {
    s = src + stride;
  } else {
    s = bottom_border;
    bottom_border += bottom_border_stride;
  }
  BoxSumFilterPreProcess3<true>(s, width, scale, sum3, square_sum3, sum_width,
                                ma343[1], ma444[0], b343[1], b444[0]);

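  // Main loop: one output row per iteration, since the 3x3 window only needs a
  // single new input row. The three-entry |sum3|/|ma343|/|b343| buffers are
  // rotated by one row each time and |ma444|/|b444| alternate between their
  // two rows.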
  for (int y = height - 2; y > 0; --y) {
    Circulate3PointersBy1<uint16_t>(sum3);
    Circulate3PointersBy1<uint32_t>(square_sum3);
    BoxFilterPass2(src + 2, src + 2 * stride, width, sum_width, scale, w0, sum3,
                   square_sum3, ma343, ma444, b343, b444, dst);
    src += stride;
    dst += stride;
    Circulate3PointersBy1<uint16_t>(ma343);
    Circulate3PointersBy1<uint32_t>(b343);
    std::swap(ma444[0], ma444[1]);
    std::swap(b444[0], b444[1]);
  }

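  // The last one or two rows take their lower input row from |bottom_border|,
  // which is advanced by |bottom_border_stride| after each row.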
  int y = std::min(height, 2);
  src += 2;
  do {
    Circulate3PointersBy1<uint16_t>(sum3);
    Circulate3PointersBy1<uint32_t>(square_sum3);
    BoxFilterPass2(src, bottom_border, width, sum_width, scale, w0, sum3,
                   square_sum3, ma343, ma444, b343, b444, dst);
    src += stride;
    dst += stride;
    bottom_border += bottom_border_stride;
    Circulate3PointersBy1<uint16_t>(ma343);
    Circulate3PointersBy1<uint32_t>(b343);
    std::swap(ma444[0], ma444[1]);
    std::swap(b444[0], b444[1]);
  } while (--y != 0);
}

// If |width| is not a multiple of 16, up to 15 more pixels are written to
// |dest| at the end of each row. It is safe to overwrite the output as it will
// not be part of the visible frame.
void SelfGuidedFilter_SSE4_1(
    const RestorationUnitInfo& LIBGAV1_RESTRICT restoration_info,
    const void* LIBGAV1_RESTRICT const source, const ptrdiff_t stride,
    const void* LIBGAV1_RESTRICT const top_border,
    const ptrdiff_t top_border_stride,
    const void* LIBGAV1_RESTRICT const bottom_border,
    const ptrdiff_t bottom_border_stride, const int width, const int height,
    RestorationBuffer* LIBGAV1_RESTRICT const restoration_buffer,
    void* LIBGAV1_RESTRICT const dest) {
  const int index = restoration_info.sgr_proj_info.index;
  const int radius_pass_0 = kSgrProjParams[index][0];  // 2 or 0
  const int radius_pass_1 = kSgrProjParams[index][2];  // 1 or 0
  const auto* const src = static_cast<const uint8_t*>(source);
  const auto* top = static_cast<const uint8_t*>(top_border);
  const auto* bottom = static_cast<const uint8_t*>(bottom_border);
  auto* const dst = static_cast<uint8_t*>(dest);
  SgrBuffer* const sgr_buffer = &restoration_buffer->sgr_buffer;
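  // Dispatch on which radii are enabled: only the 5x5 pass
  // (|radius_pass_1| == 0), only the 3x3 pass (|radius_pass_0| == 0), or both.
  // The source and border pointers are shifted left so each path can read the
  // columns to the left of every output pixel.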
  if (radius_pass_1 == 0) {
    // |radius_pass_0| and |radius_pass_1| cannot both be 0, so we have the
    // following assertion.
    assert(radius_pass_0 != 0);
    BoxFilterProcessPass1(restoration_info, src - 3, stride, top - 3,
                          top_border_stride, bottom - 3, bottom_border_stride,
                          width, height, sgr_buffer, dst);
  } else if (radius_pass_0 == 0) {
    BoxFilterProcessPass2(restoration_info, src - 2, stride, top - 2,
                          top_border_stride, bottom - 2, bottom_border_stride,
                          width, height, sgr_buffer, dst);
  } else {
    BoxFilterProcess(restoration_info, src - 3, stride, top - 3,
                     top_border_stride, bottom - 3, bottom_border_stride, width,
                     height, sgr_buffer, dst);
  }
}

void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
  static_cast<void>(dsp);
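  // Register the SSE4.1 restoration filters only when they are enabled for
  // this build; otherwise the casts keep the unused functions from triggering
  // compiler warnings.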
#if DSP_ENABLED_8BPP_SSE4_1(WienerFilter)
  dsp->loop_restorations[0] = WienerFilter_SSE4_1;
#else
  static_cast<void>(WienerFilter_SSE4_1);
#endif
#if DSP_ENABLED_8BPP_SSE4_1(SelfGuidedFilter)
  dsp->loop_restorations[1] = SelfGuidedFilter_SSE4_1;
#else
  static_cast<void>(SelfGuidedFilter_SSE4_1);
#endif
}

}  // namespace
}  // namespace low_bitdepth

void LoopRestorationInit_SSE4_1() { low_bitdepth::Init8bpp(); }

}  // namespace dsp
}  // namespace libgav1

#else  // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {

void LoopRestorationInit_SSE4_1() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_TARGETING_SSE4_1