1*09537850SAkhilesh Sanikop // Copyright 2019 The libgav1 Authors
2*09537850SAkhilesh Sanikop //
3*09537850SAkhilesh Sanikop // Licensed under the Apache License, Version 2.0 (the "License");
4*09537850SAkhilesh Sanikop // you may not use this file except in compliance with the License.
5*09537850SAkhilesh Sanikop // You may obtain a copy of the License at
6*09537850SAkhilesh Sanikop //
7*09537850SAkhilesh Sanikop // http://www.apache.org/licenses/LICENSE-2.0
8*09537850SAkhilesh Sanikop //
9*09537850SAkhilesh Sanikop // Unless required by applicable law or agreed to in writing, software
10*09537850SAkhilesh Sanikop // distributed under the License is distributed on an "AS IS" BASIS,
11*09537850SAkhilesh Sanikop // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*09537850SAkhilesh Sanikop // See the License for the specific language governing permissions and
13*09537850SAkhilesh Sanikop // limitations under the License.
14*09537850SAkhilesh Sanikop
15*09537850SAkhilesh Sanikop #include "src/dsp/convolve.h"
16*09537850SAkhilesh Sanikop #include "src/utils/constants.h"
17*09537850SAkhilesh Sanikop #include "src/utils/cpu.h"
18*09537850SAkhilesh Sanikop
19*09537850SAkhilesh Sanikop #if LIBGAV1_TARGETING_SSE4_1
20*09537850SAkhilesh Sanikop #include <smmintrin.h>
21*09537850SAkhilesh Sanikop
22*09537850SAkhilesh Sanikop #include <algorithm>
23*09537850SAkhilesh Sanikop #include <cassert>
24*09537850SAkhilesh Sanikop #include <cstdint>
25*09537850SAkhilesh Sanikop #include <cstring>
26*09537850SAkhilesh Sanikop
27*09537850SAkhilesh Sanikop #include "src/dsp/constants.h"
28*09537850SAkhilesh Sanikop #include "src/dsp/dsp.h"
29*09537850SAkhilesh Sanikop #include "src/dsp/x86/common_sse4.h"
30*09537850SAkhilesh Sanikop #include "src/utils/common.h"
31*09537850SAkhilesh Sanikop #include "src/utils/compiler_attributes.h"
32*09537850SAkhilesh Sanikop
33*09537850SAkhilesh Sanikop namespace libgav1 {
34*09537850SAkhilesh Sanikop namespace dsp {
35*09537850SAkhilesh Sanikop namespace low_bitdepth {
36*09537850SAkhilesh Sanikop namespace {
37*09537850SAkhilesh Sanikop
38*09537850SAkhilesh Sanikop #include "src/dsp/x86/convolve_sse4.inc"
39*09537850SAkhilesh Sanikop
40*09537850SAkhilesh Sanikop template <int num_taps>
SumHorizontalTaps(const uint8_t * LIBGAV1_RESTRICT const src,const __m128i * const v_tap)41*09537850SAkhilesh Sanikop __m128i SumHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
42*09537850SAkhilesh Sanikop const __m128i* const v_tap) {
43*09537850SAkhilesh Sanikop __m128i v_src[4];
44*09537850SAkhilesh Sanikop const __m128i src_long = LoadUnaligned16(src);
45*09537850SAkhilesh Sanikop const __m128i src_long_dup_lo = _mm_unpacklo_epi8(src_long, src_long);
46*09537850SAkhilesh Sanikop const __m128i src_long_dup_hi = _mm_unpackhi_epi8(src_long, src_long);
47*09537850SAkhilesh Sanikop
48*09537850SAkhilesh Sanikop if (num_taps == 6) {
49*09537850SAkhilesh Sanikop // 6 taps.
50*09537850SAkhilesh Sanikop v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 3); // _21
51*09537850SAkhilesh Sanikop v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
52*09537850SAkhilesh Sanikop v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 11); // _65
53*09537850SAkhilesh Sanikop } else if (num_taps == 8) {
54*09537850SAkhilesh Sanikop // 8 taps.
55*09537850SAkhilesh Sanikop v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 1); // _10
56*09537850SAkhilesh Sanikop v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
57*09537850SAkhilesh Sanikop v_src[2] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
58*09537850SAkhilesh Sanikop v_src[3] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 13); // _76
59*09537850SAkhilesh Sanikop } else if (num_taps == 2) {
60*09537850SAkhilesh Sanikop // 2 taps.
61*09537850SAkhilesh Sanikop v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 7); // _43
62*09537850SAkhilesh Sanikop } else {
63*09537850SAkhilesh Sanikop // 4 taps.
64*09537850SAkhilesh Sanikop v_src[0] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 5); // _32
65*09537850SAkhilesh Sanikop v_src[1] = _mm_alignr_epi8(src_long_dup_hi, src_long_dup_lo, 9); // _54
66*09537850SAkhilesh Sanikop }
67*09537850SAkhilesh Sanikop const __m128i sum = SumOnePassTaps<num_taps>(v_src, v_tap);
68*09537850SAkhilesh Sanikop return sum;
69*09537850SAkhilesh Sanikop }
70*09537850SAkhilesh Sanikop
71*09537850SAkhilesh Sanikop template <int num_taps>
SimpleHorizontalTaps(const uint8_t * LIBGAV1_RESTRICT const src,const __m128i * const v_tap)72*09537850SAkhilesh Sanikop __m128i SimpleHorizontalTaps(const uint8_t* LIBGAV1_RESTRICT const src,
73*09537850SAkhilesh Sanikop const __m128i* const v_tap) {
74*09537850SAkhilesh Sanikop __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
75*09537850SAkhilesh Sanikop
76*09537850SAkhilesh Sanikop // Normally the Horizontal pass does the downshift in two passes:
77*09537850SAkhilesh Sanikop // kInterRoundBitsHorizontal - 1 and then (kFilterBits -
78*09537850SAkhilesh Sanikop // kInterRoundBitsHorizontal). Each one uses a rounding shift. Combining them
79*09537850SAkhilesh Sanikop // requires adding the rounding offset from the skipped shift.
80*09537850SAkhilesh Sanikop constexpr int first_shift_rounding_bit = 1 << (kInterRoundBitsHorizontal - 2);
81*09537850SAkhilesh Sanikop
82*09537850SAkhilesh Sanikop sum = _mm_add_epi16(sum, _mm_set1_epi16(first_shift_rounding_bit));
83*09537850SAkhilesh Sanikop sum = RightShiftWithRounding_S16(sum, kFilterBits - 1);
84*09537850SAkhilesh Sanikop return _mm_packus_epi16(sum, sum);
85*09537850SAkhilesh Sanikop }
86*09537850SAkhilesh Sanikop
87*09537850SAkhilesh Sanikop template <int num_taps>
HorizontalTaps8To16(const uint8_t * LIBGAV1_RESTRICT const src,const __m128i * const v_tap)88*09537850SAkhilesh Sanikop __m128i HorizontalTaps8To16(const uint8_t* LIBGAV1_RESTRICT const src,
89*09537850SAkhilesh Sanikop const __m128i* const v_tap) {
90*09537850SAkhilesh Sanikop const __m128i sum = SumHorizontalTaps<num_taps>(src, v_tap);
91*09537850SAkhilesh Sanikop
92*09537850SAkhilesh Sanikop return RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
93*09537850SAkhilesh Sanikop }
94*09537850SAkhilesh Sanikop
95*09537850SAkhilesh Sanikop template <int num_taps, bool is_2d = false, bool is_compound = false>
FilterHorizontal(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dest,const ptrdiff_t pred_stride,const int width,const int height,const __m128i * const v_tap)96*09537850SAkhilesh Sanikop void FilterHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
97*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
98*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dest,
99*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride, const int width,
100*09537850SAkhilesh Sanikop const int height, const __m128i* const v_tap) {
101*09537850SAkhilesh Sanikop auto* dest8 = static_cast<uint8_t*>(dest);
102*09537850SAkhilesh Sanikop auto* dest16 = static_cast<uint16_t*>(dest);
103*09537850SAkhilesh Sanikop
104*09537850SAkhilesh Sanikop // 4 tap filters are never used when width > 4.
105*09537850SAkhilesh Sanikop if (num_taps != 4 && width > 4) {
106*09537850SAkhilesh Sanikop int y = height;
107*09537850SAkhilesh Sanikop do {
108*09537850SAkhilesh Sanikop int x = 0;
109*09537850SAkhilesh Sanikop do {
110*09537850SAkhilesh Sanikop if (is_2d || is_compound) {
111*09537850SAkhilesh Sanikop const __m128i v_sum = HorizontalTaps8To16<num_taps>(&src[x], v_tap);
112*09537850SAkhilesh Sanikop if (is_2d) {
113*09537850SAkhilesh Sanikop StoreAligned16(&dest16[x], v_sum);
114*09537850SAkhilesh Sanikop } else {
115*09537850SAkhilesh Sanikop StoreUnaligned16(&dest16[x], v_sum);
116*09537850SAkhilesh Sanikop }
117*09537850SAkhilesh Sanikop } else {
118*09537850SAkhilesh Sanikop const __m128i result = SimpleHorizontalTaps<num_taps>(&src[x], v_tap);
119*09537850SAkhilesh Sanikop StoreLo8(&dest8[x], result);
120*09537850SAkhilesh Sanikop }
121*09537850SAkhilesh Sanikop x += 8;
122*09537850SAkhilesh Sanikop } while (x < width);
123*09537850SAkhilesh Sanikop src += src_stride;
124*09537850SAkhilesh Sanikop dest8 += pred_stride;
125*09537850SAkhilesh Sanikop dest16 += pred_stride;
126*09537850SAkhilesh Sanikop } while (--y != 0);
127*09537850SAkhilesh Sanikop return;
128*09537850SAkhilesh Sanikop }
129*09537850SAkhilesh Sanikop
130*09537850SAkhilesh Sanikop // Horizontal passes only needs to account for |num_taps| 2 and 4 when
131*09537850SAkhilesh Sanikop // |width| <= 4.
132*09537850SAkhilesh Sanikop assert(width <= 4);
133*09537850SAkhilesh Sanikop assert(num_taps <= 4);
134*09537850SAkhilesh Sanikop if (num_taps <= 4) {
135*09537850SAkhilesh Sanikop if (width == 4) {
136*09537850SAkhilesh Sanikop int y = height;
137*09537850SAkhilesh Sanikop do {
138*09537850SAkhilesh Sanikop if (is_2d || is_compound) {
139*09537850SAkhilesh Sanikop const __m128i v_sum = HorizontalTaps8To16<num_taps>(src, v_tap);
140*09537850SAkhilesh Sanikop StoreLo8(dest16, v_sum);
141*09537850SAkhilesh Sanikop } else {
142*09537850SAkhilesh Sanikop const __m128i result = SimpleHorizontalTaps<num_taps>(src, v_tap);
143*09537850SAkhilesh Sanikop Store4(&dest8[0], result);
144*09537850SAkhilesh Sanikop }
145*09537850SAkhilesh Sanikop src += src_stride;
146*09537850SAkhilesh Sanikop dest8 += pred_stride;
147*09537850SAkhilesh Sanikop dest16 += pred_stride;
148*09537850SAkhilesh Sanikop } while (--y != 0);
149*09537850SAkhilesh Sanikop return;
150*09537850SAkhilesh Sanikop }
151*09537850SAkhilesh Sanikop
152*09537850SAkhilesh Sanikop if (!is_compound) {
153*09537850SAkhilesh Sanikop int y = height;
154*09537850SAkhilesh Sanikop if (is_2d) y -= 1;
155*09537850SAkhilesh Sanikop do {
156*09537850SAkhilesh Sanikop if (is_2d) {
157*09537850SAkhilesh Sanikop const __m128i sum =
158*09537850SAkhilesh Sanikop HorizontalTaps8To16_2x2<num_taps>(src, src_stride, v_tap);
159*09537850SAkhilesh Sanikop Store4(&dest16[0], sum);
160*09537850SAkhilesh Sanikop dest16 += pred_stride;
161*09537850SAkhilesh Sanikop Store4(&dest16[0], _mm_srli_si128(sum, 8));
162*09537850SAkhilesh Sanikop dest16 += pred_stride;
163*09537850SAkhilesh Sanikop } else {
164*09537850SAkhilesh Sanikop const __m128i sum =
165*09537850SAkhilesh Sanikop SimpleHorizontalTaps2x2<num_taps>(src, src_stride, v_tap);
166*09537850SAkhilesh Sanikop Store2(dest8, sum);
167*09537850SAkhilesh Sanikop dest8 += pred_stride;
168*09537850SAkhilesh Sanikop Store2(dest8, _mm_srli_si128(sum, 4));
169*09537850SAkhilesh Sanikop dest8 += pred_stride;
170*09537850SAkhilesh Sanikop }
171*09537850SAkhilesh Sanikop
172*09537850SAkhilesh Sanikop src += src_stride << 1;
173*09537850SAkhilesh Sanikop y -= 2;
174*09537850SAkhilesh Sanikop } while (y != 0);
175*09537850SAkhilesh Sanikop
176*09537850SAkhilesh Sanikop // The 2d filters have an odd |height| because the horizontal pass
177*09537850SAkhilesh Sanikop // generates context for the vertical pass.
178*09537850SAkhilesh Sanikop if (is_2d) {
179*09537850SAkhilesh Sanikop assert(height % 2 == 1);
180*09537850SAkhilesh Sanikop __m128i sum;
181*09537850SAkhilesh Sanikop const __m128i input = LoadLo8(&src[2]);
182*09537850SAkhilesh Sanikop if (num_taps == 2) {
183*09537850SAkhilesh Sanikop // 03 04 04 05 05 06 06 07 ....
184*09537850SAkhilesh Sanikop const __m128i v_src_43 =
185*09537850SAkhilesh Sanikop _mm_srli_si128(_mm_unpacklo_epi8(input, input), 3);
186*09537850SAkhilesh Sanikop sum = _mm_maddubs_epi16(v_src_43, v_tap[0]); // k4k3
187*09537850SAkhilesh Sanikop } else {
188*09537850SAkhilesh Sanikop // 02 03 03 04 04 05 05 06 06 07 ....
189*09537850SAkhilesh Sanikop const __m128i v_src_32 =
190*09537850SAkhilesh Sanikop _mm_srli_si128(_mm_unpacklo_epi8(input, input), 1);
191*09537850SAkhilesh Sanikop // 04 05 05 06 06 07 07 08 ...
192*09537850SAkhilesh Sanikop const __m128i v_src_54 = _mm_srli_si128(v_src_32, 4);
193*09537850SAkhilesh Sanikop const __m128i v_madd_32 =
194*09537850SAkhilesh Sanikop _mm_maddubs_epi16(v_src_32, v_tap[0]); // k3k2
195*09537850SAkhilesh Sanikop const __m128i v_madd_54 =
196*09537850SAkhilesh Sanikop _mm_maddubs_epi16(v_src_54, v_tap[1]); // k5k4
197*09537850SAkhilesh Sanikop sum = _mm_add_epi16(v_madd_54, v_madd_32);
198*09537850SAkhilesh Sanikop }
199*09537850SAkhilesh Sanikop sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal - 1);
200*09537850SAkhilesh Sanikop Store4(dest16, sum);
201*09537850SAkhilesh Sanikop }
202*09537850SAkhilesh Sanikop }
203*09537850SAkhilesh Sanikop }
204*09537850SAkhilesh Sanikop }
205*09537850SAkhilesh Sanikop
206*09537850SAkhilesh Sanikop template <bool is_2d = false, bool is_compound = false>
DoHorizontalPass(const uint8_t * LIBGAV1_RESTRICT const src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dst,const ptrdiff_t dst_stride,const int width,const int height,const int filter_id,const int filter_index)207*09537850SAkhilesh Sanikop LIBGAV1_ALWAYS_INLINE void DoHorizontalPass(
208*09537850SAkhilesh Sanikop const uint8_t* LIBGAV1_RESTRICT const src, const ptrdiff_t src_stride,
209*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dst, const ptrdiff_t dst_stride,
210*09537850SAkhilesh Sanikop const int width, const int height, const int filter_id,
211*09537850SAkhilesh Sanikop const int filter_index) {
212*09537850SAkhilesh Sanikop assert(filter_id != 0);
213*09537850SAkhilesh Sanikop __m128i v_tap[4];
214*09537850SAkhilesh Sanikop const __m128i v_horizontal_filter =
215*09537850SAkhilesh Sanikop LoadLo8(kHalfSubPixelFilters[filter_index][filter_id]);
216*09537850SAkhilesh Sanikop
217*09537850SAkhilesh Sanikop if (filter_index == 2) { // 8 tap.
218*09537850SAkhilesh Sanikop SetupTaps<8>(&v_horizontal_filter, v_tap);
219*09537850SAkhilesh Sanikop FilterHorizontal<8, is_2d, is_compound>(src, src_stride, dst, dst_stride,
220*09537850SAkhilesh Sanikop width, height, v_tap);
221*09537850SAkhilesh Sanikop } else if (filter_index == 1) { // 6 tap.
222*09537850SAkhilesh Sanikop SetupTaps<6>(&v_horizontal_filter, v_tap);
223*09537850SAkhilesh Sanikop FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
224*09537850SAkhilesh Sanikop width, height, v_tap);
225*09537850SAkhilesh Sanikop } else if (filter_index == 0) { // 6 tap.
226*09537850SAkhilesh Sanikop SetupTaps<6>(&v_horizontal_filter, v_tap);
227*09537850SAkhilesh Sanikop FilterHorizontal<6, is_2d, is_compound>(src, src_stride, dst, dst_stride,
228*09537850SAkhilesh Sanikop width, height, v_tap);
229*09537850SAkhilesh Sanikop } else if ((filter_index & 0x4) != 0) { // 4 tap.
230*09537850SAkhilesh Sanikop // ((filter_index == 4) | (filter_index == 5))
231*09537850SAkhilesh Sanikop SetupTaps<4>(&v_horizontal_filter, v_tap);
232*09537850SAkhilesh Sanikop FilterHorizontal<4, is_2d, is_compound>(src, src_stride, dst, dst_stride,
233*09537850SAkhilesh Sanikop width, height, v_tap);
234*09537850SAkhilesh Sanikop } else { // 2 tap.
235*09537850SAkhilesh Sanikop SetupTaps<2>(&v_horizontal_filter, v_tap);
236*09537850SAkhilesh Sanikop FilterHorizontal<2, is_2d, is_compound>(src, src_stride, dst, dst_stride,
237*09537850SAkhilesh Sanikop width, height, v_tap);
238*09537850SAkhilesh Sanikop }
239*09537850SAkhilesh Sanikop }
240*09537850SAkhilesh Sanikop
Convolve2D_SSE4_1(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int vertical_filter_index,const int horizontal_filter_id,const int vertical_filter_id,const int width,const int height,void * LIBGAV1_RESTRICT prediction,const ptrdiff_t pred_stride)241*09537850SAkhilesh Sanikop void Convolve2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
242*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride,
243*09537850SAkhilesh Sanikop const int horizontal_filter_index,
244*09537850SAkhilesh Sanikop const int vertical_filter_index,
245*09537850SAkhilesh Sanikop const int horizontal_filter_id,
246*09537850SAkhilesh Sanikop const int vertical_filter_id, const int width,
247*09537850SAkhilesh Sanikop const int height, void* LIBGAV1_RESTRICT prediction,
248*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride) {
249*09537850SAkhilesh Sanikop const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
250*09537850SAkhilesh Sanikop const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
251*09537850SAkhilesh Sanikop const int vertical_taps =
252*09537850SAkhilesh Sanikop GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
253*09537850SAkhilesh Sanikop
254*09537850SAkhilesh Sanikop // The output of the horizontal filter is guaranteed to fit in 16 bits.
255*09537850SAkhilesh Sanikop alignas(16) uint16_t
256*09537850SAkhilesh Sanikop intermediate_result[kMaxSuperBlockSizeInPixels *
257*09537850SAkhilesh Sanikop (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
258*09537850SAkhilesh Sanikop #if LIBGAV1_MSAN
259*09537850SAkhilesh Sanikop // Quiet msan warnings. Set with random non-zero value to aid in debugging.
260*09537850SAkhilesh Sanikop memset(intermediate_result, 0x33, sizeof(intermediate_result));
261*09537850SAkhilesh Sanikop #endif
262*09537850SAkhilesh Sanikop const int intermediate_height = height + vertical_taps - 1;
263*09537850SAkhilesh Sanikop
264*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = reference_stride;
265*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference) -
266*09537850SAkhilesh Sanikop (vertical_taps / 2 - 1) * src_stride - kHorizontalOffset;
267*09537850SAkhilesh Sanikop
268*09537850SAkhilesh Sanikop DoHorizontalPass</*is_2d=*/true>(src, src_stride, intermediate_result, width,
269*09537850SAkhilesh Sanikop width, intermediate_height,
270*09537850SAkhilesh Sanikop horizontal_filter_id, horiz_filter_index);
271*09537850SAkhilesh Sanikop
272*09537850SAkhilesh Sanikop // Vertical filter.
273*09537850SAkhilesh Sanikop auto* dest = static_cast<uint8_t*>(prediction);
274*09537850SAkhilesh Sanikop const ptrdiff_t dest_stride = pred_stride;
275*09537850SAkhilesh Sanikop assert(vertical_filter_id != 0);
276*09537850SAkhilesh Sanikop
277*09537850SAkhilesh Sanikop __m128i taps[4];
278*09537850SAkhilesh Sanikop const __m128i v_filter =
279*09537850SAkhilesh Sanikop LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
280*09537850SAkhilesh Sanikop
281*09537850SAkhilesh Sanikop if (vertical_taps == 8) {
282*09537850SAkhilesh Sanikop SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
283*09537850SAkhilesh Sanikop if (width == 2) {
284*09537850SAkhilesh Sanikop Filter2DVertical2xH<8>(intermediate_result, dest, dest_stride, height,
285*09537850SAkhilesh Sanikop taps);
286*09537850SAkhilesh Sanikop } else if (width == 4) {
287*09537850SAkhilesh Sanikop Filter2DVertical4xH<8>(intermediate_result, dest, dest_stride, height,
288*09537850SAkhilesh Sanikop taps);
289*09537850SAkhilesh Sanikop } else {
290*09537850SAkhilesh Sanikop Filter2DVertical<8>(intermediate_result, dest, dest_stride, width, height,
291*09537850SAkhilesh Sanikop taps);
292*09537850SAkhilesh Sanikop }
293*09537850SAkhilesh Sanikop } else if (vertical_taps == 6) {
294*09537850SAkhilesh Sanikop SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
295*09537850SAkhilesh Sanikop if (width == 2) {
296*09537850SAkhilesh Sanikop Filter2DVertical2xH<6>(intermediate_result, dest, dest_stride, height,
297*09537850SAkhilesh Sanikop taps);
298*09537850SAkhilesh Sanikop } else if (width == 4) {
299*09537850SAkhilesh Sanikop Filter2DVertical4xH<6>(intermediate_result, dest, dest_stride, height,
300*09537850SAkhilesh Sanikop taps);
301*09537850SAkhilesh Sanikop } else {
302*09537850SAkhilesh Sanikop Filter2DVertical<6>(intermediate_result, dest, dest_stride, width, height,
303*09537850SAkhilesh Sanikop taps);
304*09537850SAkhilesh Sanikop }
305*09537850SAkhilesh Sanikop } else if (vertical_taps == 4) {
306*09537850SAkhilesh Sanikop SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
307*09537850SAkhilesh Sanikop if (width == 2) {
308*09537850SAkhilesh Sanikop Filter2DVertical2xH<4>(intermediate_result, dest, dest_stride, height,
309*09537850SAkhilesh Sanikop taps);
310*09537850SAkhilesh Sanikop } else if (width == 4) {
311*09537850SAkhilesh Sanikop Filter2DVertical4xH<4>(intermediate_result, dest, dest_stride, height,
312*09537850SAkhilesh Sanikop taps);
313*09537850SAkhilesh Sanikop } else {
314*09537850SAkhilesh Sanikop Filter2DVertical<4>(intermediate_result, dest, dest_stride, width, height,
315*09537850SAkhilesh Sanikop taps);
316*09537850SAkhilesh Sanikop }
317*09537850SAkhilesh Sanikop } else { // |vertical_taps| == 2
318*09537850SAkhilesh Sanikop SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
319*09537850SAkhilesh Sanikop if (width == 2) {
320*09537850SAkhilesh Sanikop Filter2DVertical2xH<2>(intermediate_result, dest, dest_stride, height,
321*09537850SAkhilesh Sanikop taps);
322*09537850SAkhilesh Sanikop } else if (width == 4) {
323*09537850SAkhilesh Sanikop Filter2DVertical4xH<2>(intermediate_result, dest, dest_stride, height,
324*09537850SAkhilesh Sanikop taps);
325*09537850SAkhilesh Sanikop } else {
326*09537850SAkhilesh Sanikop Filter2DVertical<2>(intermediate_result, dest, dest_stride, width, height,
327*09537850SAkhilesh Sanikop taps);
328*09537850SAkhilesh Sanikop }
329*09537850SAkhilesh Sanikop }
330*09537850SAkhilesh Sanikop }
331*09537850SAkhilesh Sanikop
332*09537850SAkhilesh Sanikop template <int num_taps, bool is_compound = false>
FilterVertical(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,void * LIBGAV1_RESTRICT const dst,const ptrdiff_t dst_stride,const int width,const int height,const __m128i * const v_tap)333*09537850SAkhilesh Sanikop void FilterVertical(const uint8_t* LIBGAV1_RESTRICT src,
334*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
335*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const dst,
336*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride, const int width,
337*09537850SAkhilesh Sanikop const int height, const __m128i* const v_tap) {
338*09537850SAkhilesh Sanikop const int next_row = num_taps - 1;
339*09537850SAkhilesh Sanikop auto* dst8 = static_cast<uint8_t*>(dst);
340*09537850SAkhilesh Sanikop auto* dst16 = static_cast<uint16_t*>(dst);
341*09537850SAkhilesh Sanikop assert(width >= 8);
342*09537850SAkhilesh Sanikop
343*09537850SAkhilesh Sanikop int x = 0;
344*09537850SAkhilesh Sanikop do {
345*09537850SAkhilesh Sanikop const uint8_t* src_x = src + x;
346*09537850SAkhilesh Sanikop __m128i srcs[8];
347*09537850SAkhilesh Sanikop srcs[0] = LoadLo8(src_x);
348*09537850SAkhilesh Sanikop src_x += src_stride;
349*09537850SAkhilesh Sanikop if (num_taps >= 4) {
350*09537850SAkhilesh Sanikop srcs[1] = LoadLo8(src_x);
351*09537850SAkhilesh Sanikop src_x += src_stride;
352*09537850SAkhilesh Sanikop srcs[2] = LoadLo8(src_x);
353*09537850SAkhilesh Sanikop src_x += src_stride;
354*09537850SAkhilesh Sanikop if (num_taps >= 6) {
355*09537850SAkhilesh Sanikop srcs[3] = LoadLo8(src_x);
356*09537850SAkhilesh Sanikop src_x += src_stride;
357*09537850SAkhilesh Sanikop srcs[4] = LoadLo8(src_x);
358*09537850SAkhilesh Sanikop src_x += src_stride;
359*09537850SAkhilesh Sanikop if (num_taps == 8) {
360*09537850SAkhilesh Sanikop srcs[5] = LoadLo8(src_x);
361*09537850SAkhilesh Sanikop src_x += src_stride;
362*09537850SAkhilesh Sanikop srcs[6] = LoadLo8(src_x);
363*09537850SAkhilesh Sanikop src_x += src_stride;
364*09537850SAkhilesh Sanikop }
365*09537850SAkhilesh Sanikop }
366*09537850SAkhilesh Sanikop }
367*09537850SAkhilesh Sanikop
368*09537850SAkhilesh Sanikop auto* dst8_x = dst8 + x;
369*09537850SAkhilesh Sanikop auto* dst16_x = dst16 + x;
370*09537850SAkhilesh Sanikop int y = height;
371*09537850SAkhilesh Sanikop do {
372*09537850SAkhilesh Sanikop srcs[next_row] = LoadLo8(src_x);
373*09537850SAkhilesh Sanikop src_x += src_stride;
374*09537850SAkhilesh Sanikop
375*09537850SAkhilesh Sanikop const __m128i sums = SumVerticalTaps<num_taps>(srcs, v_tap);
376*09537850SAkhilesh Sanikop if (is_compound) {
377*09537850SAkhilesh Sanikop const __m128i results = Compound1DShift(sums);
378*09537850SAkhilesh Sanikop StoreUnaligned16(dst16_x, results);
379*09537850SAkhilesh Sanikop dst16_x += dst_stride;
380*09537850SAkhilesh Sanikop } else {
381*09537850SAkhilesh Sanikop const __m128i results =
382*09537850SAkhilesh Sanikop RightShiftWithRounding_S16(sums, kFilterBits - 1);
383*09537850SAkhilesh Sanikop StoreLo8(dst8_x, _mm_packus_epi16(results, results));
384*09537850SAkhilesh Sanikop dst8_x += dst_stride;
385*09537850SAkhilesh Sanikop }
386*09537850SAkhilesh Sanikop
387*09537850SAkhilesh Sanikop srcs[0] = srcs[1];
388*09537850SAkhilesh Sanikop if (num_taps >= 4) {
389*09537850SAkhilesh Sanikop srcs[1] = srcs[2];
390*09537850SAkhilesh Sanikop srcs[2] = srcs[3];
391*09537850SAkhilesh Sanikop if (num_taps >= 6) {
392*09537850SAkhilesh Sanikop srcs[3] = srcs[4];
393*09537850SAkhilesh Sanikop srcs[4] = srcs[5];
394*09537850SAkhilesh Sanikop if (num_taps == 8) {
395*09537850SAkhilesh Sanikop srcs[5] = srcs[6];
396*09537850SAkhilesh Sanikop srcs[6] = srcs[7];
397*09537850SAkhilesh Sanikop }
398*09537850SAkhilesh Sanikop }
399*09537850SAkhilesh Sanikop }
400*09537850SAkhilesh Sanikop } while (--y != 0);
401*09537850SAkhilesh Sanikop x += 8;
402*09537850SAkhilesh Sanikop } while (x < width);
403*09537850SAkhilesh Sanikop }
404*09537850SAkhilesh Sanikop
ConvolveVertical_SSE4_1(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int,const int vertical_filter_index,const int,const int vertical_filter_id,const int width,const int height,void * LIBGAV1_RESTRICT prediction,const ptrdiff_t pred_stride)405*09537850SAkhilesh Sanikop void ConvolveVertical_SSE4_1(
406*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
407*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
408*09537850SAkhilesh Sanikop const int vertical_filter_index, const int /*horizontal_filter_id*/,
409*09537850SAkhilesh Sanikop const int vertical_filter_id, const int width, const int height,
410*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
411*09537850SAkhilesh Sanikop const int filter_index = GetFilterIndex(vertical_filter_index, height);
412*09537850SAkhilesh Sanikop const int vertical_taps =
413*09537850SAkhilesh Sanikop GetNumTapsInFilter(filter_index, vertical_filter_id);
414*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = reference_stride;
415*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference) -
416*09537850SAkhilesh Sanikop (vertical_taps / 2 - 1) * src_stride;
417*09537850SAkhilesh Sanikop auto* dest = static_cast<uint8_t*>(prediction);
418*09537850SAkhilesh Sanikop const ptrdiff_t dest_stride = pred_stride;
419*09537850SAkhilesh Sanikop assert(vertical_filter_id != 0);
420*09537850SAkhilesh Sanikop
421*09537850SAkhilesh Sanikop __m128i taps[4];
422*09537850SAkhilesh Sanikop const __m128i v_filter =
423*09537850SAkhilesh Sanikop LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
424*09537850SAkhilesh Sanikop
425*09537850SAkhilesh Sanikop if (vertical_taps == 6) { // 6 tap.
426*09537850SAkhilesh Sanikop SetupTaps<6>(&v_filter, taps);
427*09537850SAkhilesh Sanikop if (width == 2) {
428*09537850SAkhilesh Sanikop FilterVertical2xH<6>(src, src_stride, dest, dest_stride, height, taps);
429*09537850SAkhilesh Sanikop } else if (width == 4) {
430*09537850SAkhilesh Sanikop FilterVertical4xH<6>(src, src_stride, dest, dest_stride, height, taps);
431*09537850SAkhilesh Sanikop } else {
432*09537850SAkhilesh Sanikop FilterVertical<6>(src, src_stride, dest, dest_stride, width, height,
433*09537850SAkhilesh Sanikop taps);
434*09537850SAkhilesh Sanikop }
435*09537850SAkhilesh Sanikop } else if (vertical_taps == 8) { // 8 tap.
436*09537850SAkhilesh Sanikop SetupTaps<8>(&v_filter, taps);
437*09537850SAkhilesh Sanikop if (width == 2) {
438*09537850SAkhilesh Sanikop FilterVertical2xH<8>(src, src_stride, dest, dest_stride, height, taps);
439*09537850SAkhilesh Sanikop } else if (width == 4) {
440*09537850SAkhilesh Sanikop FilterVertical4xH<8>(src, src_stride, dest, dest_stride, height, taps);
441*09537850SAkhilesh Sanikop } else {
442*09537850SAkhilesh Sanikop FilterVertical<8>(src, src_stride, dest, dest_stride, width, height,
443*09537850SAkhilesh Sanikop taps);
444*09537850SAkhilesh Sanikop }
445*09537850SAkhilesh Sanikop } else if (vertical_taps == 2) { // 2 tap.
446*09537850SAkhilesh Sanikop SetupTaps<2>(&v_filter, taps);
447*09537850SAkhilesh Sanikop if (width == 2) {
448*09537850SAkhilesh Sanikop FilterVertical2xH<2>(src, src_stride, dest, dest_stride, height, taps);
449*09537850SAkhilesh Sanikop } else if (width == 4) {
450*09537850SAkhilesh Sanikop FilterVertical4xH<2>(src, src_stride, dest, dest_stride, height, taps);
451*09537850SAkhilesh Sanikop } else {
452*09537850SAkhilesh Sanikop FilterVertical<2>(src, src_stride, dest, dest_stride, width, height,
453*09537850SAkhilesh Sanikop taps);
454*09537850SAkhilesh Sanikop }
455*09537850SAkhilesh Sanikop } else { // 4 tap
456*09537850SAkhilesh Sanikop SetupTaps<4>(&v_filter, taps);
457*09537850SAkhilesh Sanikop if (width == 2) {
458*09537850SAkhilesh Sanikop FilterVertical2xH<4>(src, src_stride, dest, dest_stride, height, taps);
459*09537850SAkhilesh Sanikop } else if (width == 4) {
460*09537850SAkhilesh Sanikop FilterVertical4xH<4>(src, src_stride, dest, dest_stride, height, taps);
461*09537850SAkhilesh Sanikop } else {
462*09537850SAkhilesh Sanikop FilterVertical<4>(src, src_stride, dest, dest_stride, width, height,
463*09537850SAkhilesh Sanikop taps);
464*09537850SAkhilesh Sanikop }
465*09537850SAkhilesh Sanikop }
466*09537850SAkhilesh Sanikop }
467*09537850SAkhilesh Sanikop
ConvolveCompoundCopy_SSE4_1(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int,const int,const int,const int,const int width,const int height,void * LIBGAV1_RESTRICT prediction,const ptrdiff_t pred_stride)468*09537850SAkhilesh Sanikop void ConvolveCompoundCopy_SSE4_1(
469*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
470*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
471*09537850SAkhilesh Sanikop const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
472*09537850SAkhilesh Sanikop const int /*vertical_filter_id*/, const int width, const int height,
473*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
474*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference);
475*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = reference_stride;
476*09537850SAkhilesh Sanikop auto* dest = static_cast<uint16_t*>(prediction);
477*09537850SAkhilesh Sanikop constexpr int kRoundBitsVertical =
478*09537850SAkhilesh Sanikop kInterRoundBitsVertical - kInterRoundBitsCompoundVertical;
479*09537850SAkhilesh Sanikop if (width >= 16) {
480*09537850SAkhilesh Sanikop int y = height;
481*09537850SAkhilesh Sanikop do {
482*09537850SAkhilesh Sanikop int x = 0;
483*09537850SAkhilesh Sanikop do {
484*09537850SAkhilesh Sanikop const __m128i v_src = LoadUnaligned16(&src[x]);
485*09537850SAkhilesh Sanikop const __m128i v_src_ext_lo = _mm_cvtepu8_epi16(v_src);
486*09537850SAkhilesh Sanikop const __m128i v_src_ext_hi =
487*09537850SAkhilesh Sanikop _mm_cvtepu8_epi16(_mm_srli_si128(v_src, 8));
488*09537850SAkhilesh Sanikop const __m128i v_dest_lo =
489*09537850SAkhilesh Sanikop _mm_slli_epi16(v_src_ext_lo, kRoundBitsVertical);
490*09537850SAkhilesh Sanikop const __m128i v_dest_hi =
491*09537850SAkhilesh Sanikop _mm_slli_epi16(v_src_ext_hi, kRoundBitsVertical);
492*09537850SAkhilesh Sanikop StoreUnaligned16(&dest[x], v_dest_lo);
493*09537850SAkhilesh Sanikop StoreUnaligned16(&dest[x + 8], v_dest_hi);
494*09537850SAkhilesh Sanikop x += 16;
495*09537850SAkhilesh Sanikop } while (x < width);
496*09537850SAkhilesh Sanikop src += src_stride;
497*09537850SAkhilesh Sanikop dest += pred_stride;
498*09537850SAkhilesh Sanikop } while (--y != 0);
499*09537850SAkhilesh Sanikop } else if (width == 8) {
500*09537850SAkhilesh Sanikop int y = height;
501*09537850SAkhilesh Sanikop do {
502*09537850SAkhilesh Sanikop const __m128i v_src = LoadLo8(&src[0]);
503*09537850SAkhilesh Sanikop const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src);
504*09537850SAkhilesh Sanikop const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical);
505*09537850SAkhilesh Sanikop StoreUnaligned16(&dest[0], v_dest);
506*09537850SAkhilesh Sanikop src += src_stride;
507*09537850SAkhilesh Sanikop dest += pred_stride;
508*09537850SAkhilesh Sanikop } while (--y != 0);
509*09537850SAkhilesh Sanikop } else { /* width == 4 */
510*09537850SAkhilesh Sanikop int y = height;
511*09537850SAkhilesh Sanikop do {
512*09537850SAkhilesh Sanikop const __m128i v_src0 = Load4(&src[0]);
513*09537850SAkhilesh Sanikop const __m128i v_src1 = Load4(&src[src_stride]);
514*09537850SAkhilesh Sanikop const __m128i v_src = _mm_unpacklo_epi32(v_src0, v_src1);
515*09537850SAkhilesh Sanikop const __m128i v_src_ext = _mm_cvtepu8_epi16(v_src);
516*09537850SAkhilesh Sanikop const __m128i v_dest = _mm_slli_epi16(v_src_ext, kRoundBitsVertical);
517*09537850SAkhilesh Sanikop StoreLo8(&dest[0], v_dest);
518*09537850SAkhilesh Sanikop StoreHi8(&dest[pred_stride], v_dest);
519*09537850SAkhilesh Sanikop src += src_stride * 2;
520*09537850SAkhilesh Sanikop dest += pred_stride * 2;
521*09537850SAkhilesh Sanikop y -= 2;
522*09537850SAkhilesh Sanikop } while (y != 0);
523*09537850SAkhilesh Sanikop }
524*09537850SAkhilesh Sanikop }
525*09537850SAkhilesh Sanikop
ConvolveCompoundVertical_SSE4_1(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int,const int vertical_filter_index,const int,const int vertical_filter_id,const int width,const int height,void * LIBGAV1_RESTRICT prediction,const ptrdiff_t)526*09537850SAkhilesh Sanikop void ConvolveCompoundVertical_SSE4_1(
527*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
528*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
529*09537850SAkhilesh Sanikop const int vertical_filter_index, const int /*horizontal_filter_id*/,
530*09537850SAkhilesh Sanikop const int vertical_filter_id, const int width, const int height,
531*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
532*09537850SAkhilesh Sanikop const int filter_index = GetFilterIndex(vertical_filter_index, height);
533*09537850SAkhilesh Sanikop const int vertical_taps =
534*09537850SAkhilesh Sanikop GetNumTapsInFilter(filter_index, vertical_filter_id);
535*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = reference_stride;
536*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference) -
537*09537850SAkhilesh Sanikop (vertical_taps / 2 - 1) * src_stride;
538*09537850SAkhilesh Sanikop auto* dest = static_cast<uint16_t*>(prediction);
539*09537850SAkhilesh Sanikop assert(vertical_filter_id != 0);
540*09537850SAkhilesh Sanikop
541*09537850SAkhilesh Sanikop __m128i taps[4];
542*09537850SAkhilesh Sanikop const __m128i v_filter =
543*09537850SAkhilesh Sanikop LoadLo8(kHalfSubPixelFilters[filter_index][vertical_filter_id]);
544*09537850SAkhilesh Sanikop
545*09537850SAkhilesh Sanikop if (vertical_taps == 6) { // 6 tap.
546*09537850SAkhilesh Sanikop SetupTaps<6>(&v_filter, taps);
547*09537850SAkhilesh Sanikop if (width == 4) {
548*09537850SAkhilesh Sanikop FilterVertical4xH<6, /*is_compound=*/true>(src, src_stride, dest, 4,
549*09537850SAkhilesh Sanikop height, taps);
550*09537850SAkhilesh Sanikop } else {
551*09537850SAkhilesh Sanikop FilterVertical<6, /*is_compound=*/true>(src, src_stride, dest, width,
552*09537850SAkhilesh Sanikop width, height, taps);
553*09537850SAkhilesh Sanikop }
554*09537850SAkhilesh Sanikop } else if (vertical_taps == 8) { // 8 tap.
555*09537850SAkhilesh Sanikop SetupTaps<8>(&v_filter, taps);
556*09537850SAkhilesh Sanikop if (width == 4) {
557*09537850SAkhilesh Sanikop FilterVertical4xH<8, /*is_compound=*/true>(src, src_stride, dest, 4,
558*09537850SAkhilesh Sanikop height, taps);
559*09537850SAkhilesh Sanikop } else {
560*09537850SAkhilesh Sanikop FilterVertical<8, /*is_compound=*/true>(src, src_stride, dest, width,
561*09537850SAkhilesh Sanikop width, height, taps);
562*09537850SAkhilesh Sanikop }
563*09537850SAkhilesh Sanikop } else if (vertical_taps == 2) { // 2 tap.
564*09537850SAkhilesh Sanikop SetupTaps<2>(&v_filter, taps);
565*09537850SAkhilesh Sanikop if (width == 4) {
566*09537850SAkhilesh Sanikop FilterVertical4xH<2, /*is_compound=*/true>(src, src_stride, dest, 4,
567*09537850SAkhilesh Sanikop height, taps);
568*09537850SAkhilesh Sanikop } else {
569*09537850SAkhilesh Sanikop FilterVertical<2, /*is_compound=*/true>(src, src_stride, dest, width,
570*09537850SAkhilesh Sanikop width, height, taps);
571*09537850SAkhilesh Sanikop }
572*09537850SAkhilesh Sanikop } else { // 4 tap
573*09537850SAkhilesh Sanikop SetupTaps<4>(&v_filter, taps);
574*09537850SAkhilesh Sanikop if (width == 4) {
575*09537850SAkhilesh Sanikop FilterVertical4xH<4, /*is_compound=*/true>(src, src_stride, dest, 4,
576*09537850SAkhilesh Sanikop height, taps);
577*09537850SAkhilesh Sanikop } else {
578*09537850SAkhilesh Sanikop FilterVertical<4, /*is_compound=*/true>(src, src_stride, dest, width,
579*09537850SAkhilesh Sanikop width, height, taps);
580*09537850SAkhilesh Sanikop }
581*09537850SAkhilesh Sanikop }
582*09537850SAkhilesh Sanikop }
583*09537850SAkhilesh Sanikop
ConvolveHorizontal_SSE4_1(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int,const int horizontal_filter_id,const int,const int width,const int height,void * LIBGAV1_RESTRICT prediction,const ptrdiff_t pred_stride)584*09537850SAkhilesh Sanikop void ConvolveHorizontal_SSE4_1(
585*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
586*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int horizontal_filter_index,
587*09537850SAkhilesh Sanikop const int /*vertical_filter_index*/, const int horizontal_filter_id,
588*09537850SAkhilesh Sanikop const int /*vertical_filter_id*/, const int width, const int height,
589*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT prediction, const ptrdiff_t pred_stride) {
590*09537850SAkhilesh Sanikop const int filter_index = GetFilterIndex(horizontal_filter_index, width);
591*09537850SAkhilesh Sanikop // Set |src| to the outermost tap.
592*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
593*09537850SAkhilesh Sanikop auto* dest = static_cast<uint8_t*>(prediction);
594*09537850SAkhilesh Sanikop
595*09537850SAkhilesh Sanikop DoHorizontalPass(src, reference_stride, dest, pred_stride, width, height,
596*09537850SAkhilesh Sanikop horizontal_filter_id, filter_index);
597*09537850SAkhilesh Sanikop }
598*09537850SAkhilesh Sanikop
ConvolveCompoundHorizontal_SSE4_1(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int,const int horizontal_filter_id,const int,const int width,const int height,void * LIBGAV1_RESTRICT prediction,const ptrdiff_t)599*09537850SAkhilesh Sanikop void ConvolveCompoundHorizontal_SSE4_1(
600*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
601*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int horizontal_filter_index,
602*09537850SAkhilesh Sanikop const int /*vertical_filter_index*/, const int horizontal_filter_id,
603*09537850SAkhilesh Sanikop const int /*vertical_filter_id*/, const int width, const int height,
604*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
605*09537850SAkhilesh Sanikop const int filter_index = GetFilterIndex(horizontal_filter_index, width);
606*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference) - kHorizontalOffset;
607*09537850SAkhilesh Sanikop auto* dest = static_cast<uint16_t*>(prediction);
608*09537850SAkhilesh Sanikop
609*09537850SAkhilesh Sanikop DoHorizontalPass</*is_2d=*/false, /*is_compound=*/true>(
610*09537850SAkhilesh Sanikop src, reference_stride, dest, width, width, height, horizontal_filter_id,
611*09537850SAkhilesh Sanikop filter_index);
612*09537850SAkhilesh Sanikop }
613*09537850SAkhilesh Sanikop
ConvolveCompound2D_SSE4_1(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int vertical_filter_index,const int horizontal_filter_id,const int vertical_filter_id,const int width,const int height,void * LIBGAV1_RESTRICT prediction,const ptrdiff_t)614*09537850SAkhilesh Sanikop void ConvolveCompound2D_SSE4_1(
615*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
616*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int horizontal_filter_index,
617*09537850SAkhilesh Sanikop const int vertical_filter_index, const int horizontal_filter_id,
618*09537850SAkhilesh Sanikop const int vertical_filter_id, const int width, const int height,
619*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT prediction, const ptrdiff_t /*pred_stride*/) {
620*09537850SAkhilesh Sanikop // The output of the horizontal filter, i.e. the intermediate_result, is
621*09537850SAkhilesh Sanikop // guaranteed to fit in int16_t.
622*09537850SAkhilesh Sanikop alignas(16) uint16_t
623*09537850SAkhilesh Sanikop intermediate_result[kMaxSuperBlockSizeInPixels *
624*09537850SAkhilesh Sanikop (kMaxSuperBlockSizeInPixels + kSubPixelTaps - 1)];
625*09537850SAkhilesh Sanikop #if LIBGAV1_MSAN
626*09537850SAkhilesh Sanikop // Quiet msan warnings. Set with random non-zero value to aid in debugging.
627*09537850SAkhilesh Sanikop memset(intermediate_result, 0x33, sizeof(intermediate_result));
628*09537850SAkhilesh Sanikop #endif
629*09537850SAkhilesh Sanikop
630*09537850SAkhilesh Sanikop // Horizontal filter.
631*09537850SAkhilesh Sanikop // Filter types used for width <= 4 are different from those for width > 4.
632*09537850SAkhilesh Sanikop // When width > 4, the valid filter index range is always [0, 3].
633*09537850SAkhilesh Sanikop // When width <= 4, the valid filter index range is always [4, 5].
634*09537850SAkhilesh Sanikop // Similarly for height.
635*09537850SAkhilesh Sanikop const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
636*09537850SAkhilesh Sanikop const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
637*09537850SAkhilesh Sanikop const int vertical_taps =
638*09537850SAkhilesh Sanikop GetNumTapsInFilter(vert_filter_index, vertical_filter_id);
639*09537850SAkhilesh Sanikop const int intermediate_height = height + vertical_taps - 1;
640*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = reference_stride;
641*09537850SAkhilesh Sanikop const auto* const src = static_cast<const uint8_t*>(reference) -
642*09537850SAkhilesh Sanikop (vertical_taps / 2 - 1) * src_stride -
643*09537850SAkhilesh Sanikop kHorizontalOffset;
644*09537850SAkhilesh Sanikop
645*09537850SAkhilesh Sanikop DoHorizontalPass</*is_2d=*/true, /*is_compound=*/true>(
646*09537850SAkhilesh Sanikop src, src_stride, intermediate_result, width, width, intermediate_height,
647*09537850SAkhilesh Sanikop horizontal_filter_id, horiz_filter_index);
648*09537850SAkhilesh Sanikop
649*09537850SAkhilesh Sanikop // Vertical filter.
650*09537850SAkhilesh Sanikop auto* dest = static_cast<uint16_t*>(prediction);
651*09537850SAkhilesh Sanikop assert(vertical_filter_id != 0);
652*09537850SAkhilesh Sanikop
653*09537850SAkhilesh Sanikop const ptrdiff_t dest_stride = width;
654*09537850SAkhilesh Sanikop __m128i taps[4];
655*09537850SAkhilesh Sanikop const __m128i v_filter =
656*09537850SAkhilesh Sanikop LoadLo8(kHalfSubPixelFilters[vert_filter_index][vertical_filter_id]);
657*09537850SAkhilesh Sanikop
658*09537850SAkhilesh Sanikop if (vertical_taps == 8) {
659*09537850SAkhilesh Sanikop SetupTaps<8, /*is_2d_vertical=*/true>(&v_filter, taps);
660*09537850SAkhilesh Sanikop if (width == 4) {
661*09537850SAkhilesh Sanikop Filter2DVertical4xH<8, /*is_compound=*/true>(intermediate_result, dest,
662*09537850SAkhilesh Sanikop dest_stride, height, taps);
663*09537850SAkhilesh Sanikop } else {
664*09537850SAkhilesh Sanikop Filter2DVertical<8, /*is_compound=*/true>(
665*09537850SAkhilesh Sanikop intermediate_result, dest, dest_stride, width, height, taps);
666*09537850SAkhilesh Sanikop }
667*09537850SAkhilesh Sanikop } else if (vertical_taps == 6) {
668*09537850SAkhilesh Sanikop SetupTaps<6, /*is_2d_vertical=*/true>(&v_filter, taps);
669*09537850SAkhilesh Sanikop if (width == 4) {
670*09537850SAkhilesh Sanikop Filter2DVertical4xH<6, /*is_compound=*/true>(intermediate_result, dest,
671*09537850SAkhilesh Sanikop dest_stride, height, taps);
672*09537850SAkhilesh Sanikop } else {
673*09537850SAkhilesh Sanikop Filter2DVertical<6, /*is_compound=*/true>(
674*09537850SAkhilesh Sanikop intermediate_result, dest, dest_stride, width, height, taps);
675*09537850SAkhilesh Sanikop }
676*09537850SAkhilesh Sanikop } else if (vertical_taps == 4) {
677*09537850SAkhilesh Sanikop SetupTaps<4, /*is_2d_vertical=*/true>(&v_filter, taps);
678*09537850SAkhilesh Sanikop if (width == 4) {
679*09537850SAkhilesh Sanikop Filter2DVertical4xH<4, /*is_compound=*/true>(intermediate_result, dest,
680*09537850SAkhilesh Sanikop dest_stride, height, taps);
681*09537850SAkhilesh Sanikop } else {
682*09537850SAkhilesh Sanikop Filter2DVertical<4, /*is_compound=*/true>(
683*09537850SAkhilesh Sanikop intermediate_result, dest, dest_stride, width, height, taps);
684*09537850SAkhilesh Sanikop }
685*09537850SAkhilesh Sanikop } else { // |vertical_taps| == 2
686*09537850SAkhilesh Sanikop SetupTaps<2, /*is_2d_vertical=*/true>(&v_filter, taps);
687*09537850SAkhilesh Sanikop if (width == 4) {
688*09537850SAkhilesh Sanikop Filter2DVertical4xH<2, /*is_compound=*/true>(intermediate_result, dest,
689*09537850SAkhilesh Sanikop dest_stride, height, taps);
690*09537850SAkhilesh Sanikop } else {
691*09537850SAkhilesh Sanikop Filter2DVertical<2, /*is_compound=*/true>(
692*09537850SAkhilesh Sanikop intermediate_result, dest, dest_stride, width, height, taps);
693*09537850SAkhilesh Sanikop }
694*09537850SAkhilesh Sanikop }
695*09537850SAkhilesh Sanikop }
696*09537850SAkhilesh Sanikop
697*09537850SAkhilesh Sanikop // Pre-transposed filters.
698*09537850SAkhilesh Sanikop template <int filter_index>
GetHalfSubPixelFilter(__m128i * output)699*09537850SAkhilesh Sanikop inline void GetHalfSubPixelFilter(__m128i* output) {
700*09537850SAkhilesh Sanikop // Filter 0
701*09537850SAkhilesh Sanikop alignas(
702*09537850SAkhilesh Sanikop 16) static constexpr int8_t kHalfSubPixel6TapSignedFilterColumns[6][16] =
703*09537850SAkhilesh Sanikop {{0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0},
704*09537850SAkhilesh Sanikop {0, -3, -5, -6, -7, -7, -8, -7, -7, -6, -6, -6, -5, -4, -2, -1},
705*09537850SAkhilesh Sanikop {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
706*09537850SAkhilesh Sanikop {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
707*09537850SAkhilesh Sanikop {0, -1, -2, -4, -5, -6, -6, -6, -7, -7, -8, -7, -7, -6, -5, -3},
708*09537850SAkhilesh Sanikop {0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
709*09537850SAkhilesh Sanikop // Filter 1
710*09537850SAkhilesh Sanikop alignas(16) static constexpr int8_t
711*09537850SAkhilesh Sanikop kHalfSubPixel6TapMixedSignedFilterColumns[6][16] = {
712*09537850SAkhilesh Sanikop {0, 1, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 0, 0},
713*09537850SAkhilesh Sanikop {0, 14, 13, 11, 10, 9, 8, 8, 7, 6, 5, 4, 3, 2, 2, 1},
714*09537850SAkhilesh Sanikop {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
715*09537850SAkhilesh Sanikop {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
716*09537850SAkhilesh Sanikop {0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 11, 13, 14},
717*09537850SAkhilesh Sanikop {0, 0, 0, 0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, 1}};
718*09537850SAkhilesh Sanikop // Filter 2
719*09537850SAkhilesh Sanikop alignas(
720*09537850SAkhilesh Sanikop 16) static constexpr int8_t kHalfSubPixel8TapSignedFilterColumns[8][16] =
721*09537850SAkhilesh Sanikop {{0, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1, -1, -1, -1, 0},
722*09537850SAkhilesh Sanikop {0, 1, 3, 4, 5, 5, 5, 5, 6, 5, 4, 4, 3, 3, 2, 1},
723*09537850SAkhilesh Sanikop {0, -3, -6, -9, -11, -11, -12, -12, -12, -11, -10, -9, -7, -5, -3, -1},
724*09537850SAkhilesh Sanikop {64, 63, 62, 60, 58, 54, 50, 45, 40, 35, 30, 24, 19, 13, 8, 4},
725*09537850SAkhilesh Sanikop {0, 4, 8, 13, 19, 24, 30, 35, 40, 45, 50, 54, 58, 60, 62, 63},
726*09537850SAkhilesh Sanikop {0, -1, -3, -5, -7, -9, -10, -11, -12, -12, -12, -11, -11, -9, -6, -3},
727*09537850SAkhilesh Sanikop {0, 1, 2, 3, 3, 4, 4, 5, 6, 5, 5, 5, 5, 4, 3, 1},
728*09537850SAkhilesh Sanikop {0, 0, -1, -1, -1, -1, -1, -1, -2, -2, -2, -2, -2, -1, -1, -1}};
729*09537850SAkhilesh Sanikop // Filter 3
730*09537850SAkhilesh Sanikop alignas(16) static constexpr uint8_t kHalfSubPixel2TapFilterColumns[2][16] = {
731*09537850SAkhilesh Sanikop {64, 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4},
732*09537850SAkhilesh Sanikop {0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60}};
733*09537850SAkhilesh Sanikop // Filter 4
734*09537850SAkhilesh Sanikop alignas(
735*09537850SAkhilesh Sanikop 16) static constexpr int8_t kHalfSubPixel4TapSignedFilterColumns[4][16] =
736*09537850SAkhilesh Sanikop {{0, -2, -4, -5, -6, -6, -7, -6, -6, -5, -5, -5, -4, -3, -2, -1},
737*09537850SAkhilesh Sanikop {64, 63, 61, 58, 55, 51, 47, 42, 38, 33, 29, 24, 19, 14, 9, 4},
738*09537850SAkhilesh Sanikop {0, 4, 9, 14, 19, 24, 29, 33, 38, 42, 47, 51, 55, 58, 61, 63},
739*09537850SAkhilesh Sanikop {0, -1, -2, -3, -4, -5, -5, -5, -6, -6, -7, -6, -6, -5, -4, -2}};
740*09537850SAkhilesh Sanikop // Filter 5
741*09537850SAkhilesh Sanikop alignas(
742*09537850SAkhilesh Sanikop 16) static constexpr uint8_t kSubPixel4TapPositiveFilterColumns[4][16] = {
743*09537850SAkhilesh Sanikop {0, 15, 13, 11, 10, 9, 8, 7, 6, 6, 5, 4, 3, 2, 2, 1},
744*09537850SAkhilesh Sanikop {64, 31, 31, 31, 30, 29, 28, 27, 26, 24, 23, 22, 21, 20, 18, 17},
745*09537850SAkhilesh Sanikop {0, 17, 18, 20, 21, 22, 23, 24, 26, 27, 28, 29, 30, 31, 31, 31},
746*09537850SAkhilesh Sanikop {0, 1, 2, 2, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11, 13, 15}};
747*09537850SAkhilesh Sanikop switch (filter_index) {
748*09537850SAkhilesh Sanikop case 0:
749*09537850SAkhilesh Sanikop output[0] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[0]);
750*09537850SAkhilesh Sanikop output[1] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[1]);
751*09537850SAkhilesh Sanikop output[2] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[2]);
752*09537850SAkhilesh Sanikop output[3] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[3]);
753*09537850SAkhilesh Sanikop output[4] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[4]);
754*09537850SAkhilesh Sanikop output[5] = LoadAligned16(kHalfSubPixel6TapSignedFilterColumns[5]);
755*09537850SAkhilesh Sanikop break;
756*09537850SAkhilesh Sanikop case 1:
757*09537850SAkhilesh Sanikop // The term "mixed" refers to the fact that the outer taps have a mix of
758*09537850SAkhilesh Sanikop // negative and positive values.
759*09537850SAkhilesh Sanikop output[0] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[0]);
760*09537850SAkhilesh Sanikop output[1] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[1]);
761*09537850SAkhilesh Sanikop output[2] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[2]);
762*09537850SAkhilesh Sanikop output[3] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[3]);
763*09537850SAkhilesh Sanikop output[4] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[4]);
764*09537850SAkhilesh Sanikop output[5] = LoadAligned16(kHalfSubPixel6TapMixedSignedFilterColumns[5]);
765*09537850SAkhilesh Sanikop break;
766*09537850SAkhilesh Sanikop case 2:
767*09537850SAkhilesh Sanikop output[0] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[0]);
768*09537850SAkhilesh Sanikop output[1] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[1]);
769*09537850SAkhilesh Sanikop output[2] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[2]);
770*09537850SAkhilesh Sanikop output[3] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[3]);
771*09537850SAkhilesh Sanikop output[4] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[4]);
772*09537850SAkhilesh Sanikop output[5] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[5]);
773*09537850SAkhilesh Sanikop output[6] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[6]);
774*09537850SAkhilesh Sanikop output[7] = LoadAligned16(kHalfSubPixel8TapSignedFilterColumns[7]);
775*09537850SAkhilesh Sanikop break;
776*09537850SAkhilesh Sanikop case 3:
777*09537850SAkhilesh Sanikop output[0] = LoadAligned16(kHalfSubPixel2TapFilterColumns[0]);
778*09537850SAkhilesh Sanikop output[1] = LoadAligned16(kHalfSubPixel2TapFilterColumns[1]);
779*09537850SAkhilesh Sanikop break;
780*09537850SAkhilesh Sanikop case 4:
781*09537850SAkhilesh Sanikop output[0] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[0]);
782*09537850SAkhilesh Sanikop output[1] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[1]);
783*09537850SAkhilesh Sanikop output[2] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[2]);
784*09537850SAkhilesh Sanikop output[3] = LoadAligned16(kHalfSubPixel4TapSignedFilterColumns[3]);
785*09537850SAkhilesh Sanikop break;
786*09537850SAkhilesh Sanikop default:
787*09537850SAkhilesh Sanikop assert(filter_index == 5);
788*09537850SAkhilesh Sanikop output[0] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[0]);
789*09537850SAkhilesh Sanikop output[1] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[1]);
790*09537850SAkhilesh Sanikop output[2] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[2]);
791*09537850SAkhilesh Sanikop output[3] = LoadAligned16(kSubPixel4TapPositiveFilterColumns[3]);
792*09537850SAkhilesh Sanikop break;
793*09537850SAkhilesh Sanikop }
794*09537850SAkhilesh Sanikop }
795*09537850SAkhilesh Sanikop
796*09537850SAkhilesh Sanikop // There are many opportunities for overreading in scaled convolve, because
797*09537850SAkhilesh Sanikop // the range of starting points for filter windows is anywhere from 0 to 16
798*09537850SAkhilesh Sanikop // for 8 destination pixels, and the window sizes range from 2 to 8. To
799*09537850SAkhilesh Sanikop // accommodate this range concisely, we use |grade_x| to mean the most steps
800*09537850SAkhilesh Sanikop // in src that can be traversed in a single |step_x| increment, i.e. 1 or 2.
801*09537850SAkhilesh Sanikop // More importantly, |grade_x| answers the question "how many vector loads are
802*09537850SAkhilesh Sanikop // needed to cover the source values?"
803*09537850SAkhilesh Sanikop // When |grade_x| == 1, the maximum number of source values needed is 8 separate
804*09537850SAkhilesh Sanikop // starting positions plus 7 more to cover taps, all fitting into 16 bytes.
805*09537850SAkhilesh Sanikop // When |grade_x| > 1, we are guaranteed to exceed 8 whole steps in src for
806*09537850SAkhilesh Sanikop // every 8 |step_x| increments, on top of 8 possible taps. The first load covers
807*09537850SAkhilesh Sanikop // the starting sources for each kernel, while the final load covers the taps.
808*09537850SAkhilesh Sanikop // Since the offset value of src_x cannot exceed 8 and |num_taps| does not
809*09537850SAkhilesh Sanikop // exceed 4 when width <= 4, |grade_x| is set to 1 regardless of the value of
810*09537850SAkhilesh Sanikop // |step_x|.
811*09537850SAkhilesh Sanikop template <int num_taps, int grade_x>
PrepareSourceVectors(const uint8_t * LIBGAV1_RESTRICT src,const __m128i src_indices,__m128i * const source)812*09537850SAkhilesh Sanikop inline void PrepareSourceVectors(const uint8_t* LIBGAV1_RESTRICT src,
813*09537850SAkhilesh Sanikop const __m128i src_indices,
814*09537850SAkhilesh Sanikop __m128i* const source /*[num_taps >> 1]*/) {
815*09537850SAkhilesh Sanikop // |used_bytes| is only computed in msan builds. Mask away unused bytes for
816*09537850SAkhilesh Sanikop // msan because it incorrectly models the outcome of the shuffles in some
817*09537850SAkhilesh Sanikop // cases. This has not been reproduced out of context.
818*09537850SAkhilesh Sanikop const int used_bytes = _mm_extract_epi8(src_indices, 15) + 1 + num_taps - 2;
819*09537850SAkhilesh Sanikop const __m128i src_vals = LoadUnaligned16Msan(src, 16 - used_bytes);
820*09537850SAkhilesh Sanikop source[0] = _mm_shuffle_epi8(src_vals, src_indices);
821*09537850SAkhilesh Sanikop if (grade_x == 1) {
822*09537850SAkhilesh Sanikop if (num_taps > 2) {
823*09537850SAkhilesh Sanikop source[1] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 2), src_indices);
824*09537850SAkhilesh Sanikop }
825*09537850SAkhilesh Sanikop if (num_taps > 4) {
826*09537850SAkhilesh Sanikop source[2] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 4), src_indices);
827*09537850SAkhilesh Sanikop }
828*09537850SAkhilesh Sanikop if (num_taps > 6) {
829*09537850SAkhilesh Sanikop source[3] = _mm_shuffle_epi8(_mm_srli_si128(src_vals, 6), src_indices);
830*09537850SAkhilesh Sanikop }
831*09537850SAkhilesh Sanikop } else {
832*09537850SAkhilesh Sanikop assert(grade_x > 1);
833*09537850SAkhilesh Sanikop assert(num_taps != 4);
834*09537850SAkhilesh Sanikop // grade_x > 1 also means width >= 8 && num_taps != 4
835*09537850SAkhilesh Sanikop const __m128i src_vals_ext = LoadLo8Msan(src + 16, 24 - used_bytes);
836*09537850SAkhilesh Sanikop if (num_taps > 2) {
837*09537850SAkhilesh Sanikop source[1] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 2),
838*09537850SAkhilesh Sanikop src_indices);
839*09537850SAkhilesh Sanikop source[2] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 4),
840*09537850SAkhilesh Sanikop src_indices);
841*09537850SAkhilesh Sanikop }
842*09537850SAkhilesh Sanikop if (num_taps > 6) {
843*09537850SAkhilesh Sanikop source[3] = _mm_shuffle_epi8(_mm_alignr_epi8(src_vals_ext, src_vals, 6),
844*09537850SAkhilesh Sanikop src_indices);
845*09537850SAkhilesh Sanikop }
846*09537850SAkhilesh Sanikop }
847*09537850SAkhilesh Sanikop }
848*09537850SAkhilesh Sanikop
849*09537850SAkhilesh Sanikop template <int num_taps>
PrepareHorizontalTaps(const __m128i subpel_indices,const __m128i * filter_taps,__m128i * out_taps)850*09537850SAkhilesh Sanikop inline void PrepareHorizontalTaps(const __m128i subpel_indices,
851*09537850SAkhilesh Sanikop const __m128i* filter_taps,
852*09537850SAkhilesh Sanikop __m128i* out_taps) {
853*09537850SAkhilesh Sanikop const __m128i scale_index_offsets =
854*09537850SAkhilesh Sanikop _mm_srli_epi16(subpel_indices, kFilterIndexShift);
855*09537850SAkhilesh Sanikop const __m128i filter_index_mask = _mm_set1_epi8(kSubPixelMask);
856*09537850SAkhilesh Sanikop const __m128i filter_indices =
857*09537850SAkhilesh Sanikop _mm_and_si128(_mm_packus_epi16(scale_index_offsets, scale_index_offsets),
858*09537850SAkhilesh Sanikop filter_index_mask);
859*09537850SAkhilesh Sanikop // Line up taps for maddubs_epi16.
860*09537850SAkhilesh Sanikop // The unpack is also assumed to be lighter than shift+alignr.
861*09537850SAkhilesh Sanikop for (int k = 0; k < (num_taps >> 1); ++k) {
862*09537850SAkhilesh Sanikop const __m128i taps0 = _mm_shuffle_epi8(filter_taps[2 * k], filter_indices);
863*09537850SAkhilesh Sanikop const __m128i taps1 =
864*09537850SAkhilesh Sanikop _mm_shuffle_epi8(filter_taps[2 * k + 1], filter_indices);
865*09537850SAkhilesh Sanikop out_taps[k] = _mm_unpacklo_epi8(taps0, taps1);
866*09537850SAkhilesh Sanikop }
867*09537850SAkhilesh Sanikop }
868*09537850SAkhilesh Sanikop
// Converts eight 16-bit sub-pixel positions into byte shuffle indices. Each
// lane is shifted right by kScaleSubPixelBits to get an index i, and the
// result interleaves the byte pairs (i, i + 1) for every lane.
inline __m128i HorizontalScaleIndices(const __m128i subpel_indices) {
  const __m128i whole16 = _mm_srli_epi16(subpel_indices, kScaleSubPixelBits);
  const __m128i first = _mm_packus_epi16(whole16, whole16);
  const __m128i second = _mm_add_epi8(first, _mm_set1_epi8(1));
  return _mm_unpacklo_epi8(first, second);
}
876*09537850SAkhilesh Sanikop
877*09537850SAkhilesh Sanikop template <int grade_x, int filter_index, int num_taps>
ConvolveHorizontalScale(const uint8_t * LIBGAV1_RESTRICT src,ptrdiff_t src_stride,int width,int subpixel_x,int step_x,int intermediate_height,int16_t * LIBGAV1_RESTRICT intermediate)878*09537850SAkhilesh Sanikop inline void ConvolveHorizontalScale(const uint8_t* LIBGAV1_RESTRICT src,
879*09537850SAkhilesh Sanikop ptrdiff_t src_stride, int width,
880*09537850SAkhilesh Sanikop int subpixel_x, int step_x,
881*09537850SAkhilesh Sanikop int intermediate_height,
882*09537850SAkhilesh Sanikop int16_t* LIBGAV1_RESTRICT intermediate) {
883*09537850SAkhilesh Sanikop // Account for the 0-taps that precede the 2 nonzero taps.
884*09537850SAkhilesh Sanikop const int kernel_offset = (8 - num_taps) >> 1;
885*09537850SAkhilesh Sanikop const int ref_x = subpixel_x >> kScaleSubPixelBits;
886*09537850SAkhilesh Sanikop const int step_x8 = step_x << 3;
887*09537850SAkhilesh Sanikop __m128i filter_taps[num_taps];
888*09537850SAkhilesh Sanikop GetHalfSubPixelFilter<filter_index>(filter_taps);
889*09537850SAkhilesh Sanikop const __m128i index_steps =
890*09537850SAkhilesh Sanikop _mm_mullo_epi16(_mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0),
891*09537850SAkhilesh Sanikop _mm_set1_epi16(static_cast<int16_t>(step_x)));
892*09537850SAkhilesh Sanikop
893*09537850SAkhilesh Sanikop __m128i taps[num_taps >> 1];
894*09537850SAkhilesh Sanikop __m128i source[num_taps >> 1];
895*09537850SAkhilesh Sanikop int p = subpixel_x;
896*09537850SAkhilesh Sanikop // Case when width <= 4 is possible.
897*09537850SAkhilesh Sanikop if (filter_index >= 3) {
898*09537850SAkhilesh Sanikop if (filter_index > 3 || width <= 4) {
899*09537850SAkhilesh Sanikop const uint8_t* src_x =
900*09537850SAkhilesh Sanikop &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
901*09537850SAkhilesh Sanikop // Only add steps to the 10-bit truncated p to avoid overflow.
902*09537850SAkhilesh Sanikop const __m128i p_fraction = _mm_set1_epi16(p & 1023);
903*09537850SAkhilesh Sanikop const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
904*09537850SAkhilesh Sanikop PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
905*09537850SAkhilesh Sanikop const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
906*09537850SAkhilesh Sanikop
907*09537850SAkhilesh Sanikop int y = intermediate_height;
908*09537850SAkhilesh Sanikop do {
909*09537850SAkhilesh Sanikop // Load and line up source values with the taps. Width 4 means no need
910*09537850SAkhilesh Sanikop // to load extended source.
911*09537850SAkhilesh Sanikop PrepareSourceVectors<num_taps, /*grade_x=*/1>(src_x, packed_indices,
912*09537850SAkhilesh Sanikop source);
913*09537850SAkhilesh Sanikop
914*09537850SAkhilesh Sanikop StoreLo8(intermediate, RightShiftWithRounding_S16(
915*09537850SAkhilesh Sanikop SumOnePassTaps<num_taps>(source, taps),
916*09537850SAkhilesh Sanikop kInterRoundBitsHorizontal - 1));
917*09537850SAkhilesh Sanikop src_x += src_stride;
918*09537850SAkhilesh Sanikop intermediate += kIntermediateStride;
919*09537850SAkhilesh Sanikop } while (--y != 0);
920*09537850SAkhilesh Sanikop return;
921*09537850SAkhilesh Sanikop }
922*09537850SAkhilesh Sanikop }
923*09537850SAkhilesh Sanikop
924*09537850SAkhilesh Sanikop // |width| >= 8
925*09537850SAkhilesh Sanikop int16_t* intermediate_x = intermediate;
926*09537850SAkhilesh Sanikop int x = 0;
927*09537850SAkhilesh Sanikop do {
928*09537850SAkhilesh Sanikop const uint8_t* src_x =
929*09537850SAkhilesh Sanikop &src[(p >> kScaleSubPixelBits) - ref_x + kernel_offset];
930*09537850SAkhilesh Sanikop // Only add steps to the 10-bit truncated p to avoid overflow.
931*09537850SAkhilesh Sanikop const __m128i p_fraction = _mm_set1_epi16(p & 1023);
932*09537850SAkhilesh Sanikop const __m128i subpel_indices = _mm_add_epi16(index_steps, p_fraction);
933*09537850SAkhilesh Sanikop PrepareHorizontalTaps<num_taps>(subpel_indices, filter_taps, taps);
934*09537850SAkhilesh Sanikop const __m128i packed_indices = HorizontalScaleIndices(subpel_indices);
935*09537850SAkhilesh Sanikop
936*09537850SAkhilesh Sanikop int y = intermediate_height;
937*09537850SAkhilesh Sanikop do {
938*09537850SAkhilesh Sanikop // For each x, a lane of src_k[k] contains src_x[k].
939*09537850SAkhilesh Sanikop PrepareSourceVectors<num_taps, grade_x>(src_x, packed_indices, source);
940*09537850SAkhilesh Sanikop
941*09537850SAkhilesh Sanikop // Shift by one less because the taps are halved.
942*09537850SAkhilesh Sanikop StoreAligned16(intermediate_x, RightShiftWithRounding_S16(
943*09537850SAkhilesh Sanikop SumOnePassTaps<num_taps>(source, taps),
944*09537850SAkhilesh Sanikop kInterRoundBitsHorizontal - 1));
945*09537850SAkhilesh Sanikop src_x += src_stride;
946*09537850SAkhilesh Sanikop intermediate_x += kIntermediateStride;
947*09537850SAkhilesh Sanikop } while (--y != 0);
948*09537850SAkhilesh Sanikop x += 8;
949*09537850SAkhilesh Sanikop p += step_x8;
950*09537850SAkhilesh Sanikop } while (x < width);
951*09537850SAkhilesh Sanikop }
952*09537850SAkhilesh Sanikop
953*09537850SAkhilesh Sanikop template <int num_taps>
PrepareVerticalTaps(const int8_t * LIBGAV1_RESTRICT taps,__m128i * output)954*09537850SAkhilesh Sanikop inline void PrepareVerticalTaps(const int8_t* LIBGAV1_RESTRICT taps,
955*09537850SAkhilesh Sanikop __m128i* output) {
956*09537850SAkhilesh Sanikop // Avoid overreading the filter due to starting at kernel_offset.
957*09537850SAkhilesh Sanikop // The only danger of overread is in the final filter, which has 4 taps.
958*09537850SAkhilesh Sanikop const __m128i filter =
959*09537850SAkhilesh Sanikop _mm_cvtepi8_epi16((num_taps > 4) ? LoadLo8(taps) : Load4(taps));
960*09537850SAkhilesh Sanikop output[0] = _mm_shuffle_epi32(filter, 0);
961*09537850SAkhilesh Sanikop if (num_taps > 2) {
962*09537850SAkhilesh Sanikop output[1] = _mm_shuffle_epi32(filter, 0x55);
963*09537850SAkhilesh Sanikop }
964*09537850SAkhilesh Sanikop if (num_taps > 4) {
965*09537850SAkhilesh Sanikop output[2] = _mm_shuffle_epi32(filter, 0xAA);
966*09537850SAkhilesh Sanikop }
967*09537850SAkhilesh Sanikop if (num_taps > 6) {
968*09537850SAkhilesh Sanikop output[3] = _mm_shuffle_epi32(filter, 0xFF);
969*09537850SAkhilesh Sanikop }
970*09537850SAkhilesh Sanikop }
971*09537850SAkhilesh Sanikop
972*09537850SAkhilesh Sanikop // Process eight 16 bit inputs and output eight 16 bit values.
973*09537850SAkhilesh Sanikop template <int num_taps, bool is_compound>
Sum2DVerticalTaps(const __m128i * const src,const __m128i * taps)974*09537850SAkhilesh Sanikop inline __m128i Sum2DVerticalTaps(const __m128i* const src,
975*09537850SAkhilesh Sanikop const __m128i* taps) {
976*09537850SAkhilesh Sanikop const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
977*09537850SAkhilesh Sanikop __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps[0]);
978*09537850SAkhilesh Sanikop const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
979*09537850SAkhilesh Sanikop __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps[0]);
980*09537850SAkhilesh Sanikop if (num_taps > 2) {
981*09537850SAkhilesh Sanikop const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
982*09537850SAkhilesh Sanikop sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps[1]));
983*09537850SAkhilesh Sanikop const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
984*09537850SAkhilesh Sanikop sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps[1]));
985*09537850SAkhilesh Sanikop }
986*09537850SAkhilesh Sanikop if (num_taps > 4) {
987*09537850SAkhilesh Sanikop const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
988*09537850SAkhilesh Sanikop sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps[2]));
989*09537850SAkhilesh Sanikop const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
990*09537850SAkhilesh Sanikop sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps[2]));
991*09537850SAkhilesh Sanikop }
992*09537850SAkhilesh Sanikop if (num_taps > 6) {
993*09537850SAkhilesh Sanikop const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
994*09537850SAkhilesh Sanikop sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps[3]));
995*09537850SAkhilesh Sanikop const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
996*09537850SAkhilesh Sanikop sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps[3]));
997*09537850SAkhilesh Sanikop }
998*09537850SAkhilesh Sanikop if (is_compound) {
999*09537850SAkhilesh Sanikop return _mm_packs_epi32(
1000*09537850SAkhilesh Sanikop RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
1001*09537850SAkhilesh Sanikop RightShiftWithRounding_S32(sum_hi,
1002*09537850SAkhilesh Sanikop kInterRoundBitsCompoundVertical - 1));
1003*09537850SAkhilesh Sanikop }
1004*09537850SAkhilesh Sanikop return _mm_packs_epi32(
1005*09537850SAkhilesh Sanikop RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
1006*09537850SAkhilesh Sanikop RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
1007*09537850SAkhilesh Sanikop }
1008*09537850SAkhilesh Sanikop
1009*09537850SAkhilesh Sanikop // Bottom half of each src[k] is the source for one filter, and the top half
1010*09537850SAkhilesh Sanikop // is the source for the other filter, for the next destination row.
1011*09537850SAkhilesh Sanikop template <int num_taps, bool is_compound>
Sum2DVerticalTaps4x2(const __m128i * const src,const __m128i * taps_lo,const __m128i * taps_hi)1012*09537850SAkhilesh Sanikop __m128i Sum2DVerticalTaps4x2(const __m128i* const src, const __m128i* taps_lo,
1013*09537850SAkhilesh Sanikop const __m128i* taps_hi) {
1014*09537850SAkhilesh Sanikop const __m128i src_lo_01 = _mm_unpacklo_epi16(src[0], src[1]);
1015*09537850SAkhilesh Sanikop __m128i sum_lo = _mm_madd_epi16(src_lo_01, taps_lo[0]);
1016*09537850SAkhilesh Sanikop const __m128i src_hi_01 = _mm_unpackhi_epi16(src[0], src[1]);
1017*09537850SAkhilesh Sanikop __m128i sum_hi = _mm_madd_epi16(src_hi_01, taps_hi[0]);
1018*09537850SAkhilesh Sanikop if (num_taps > 2) {
1019*09537850SAkhilesh Sanikop const __m128i src_lo_23 = _mm_unpacklo_epi16(src[2], src[3]);
1020*09537850SAkhilesh Sanikop sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_23, taps_lo[1]));
1021*09537850SAkhilesh Sanikop const __m128i src_hi_23 = _mm_unpackhi_epi16(src[2], src[3]);
1022*09537850SAkhilesh Sanikop sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_23, taps_hi[1]));
1023*09537850SAkhilesh Sanikop }
1024*09537850SAkhilesh Sanikop if (num_taps > 4) {
1025*09537850SAkhilesh Sanikop const __m128i src_lo_45 = _mm_unpacklo_epi16(src[4], src[5]);
1026*09537850SAkhilesh Sanikop sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_45, taps_lo[2]));
1027*09537850SAkhilesh Sanikop const __m128i src_hi_45 = _mm_unpackhi_epi16(src[4], src[5]);
1028*09537850SAkhilesh Sanikop sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_45, taps_hi[2]));
1029*09537850SAkhilesh Sanikop }
1030*09537850SAkhilesh Sanikop if (num_taps > 6) {
1031*09537850SAkhilesh Sanikop const __m128i src_lo_67 = _mm_unpacklo_epi16(src[6], src[7]);
1032*09537850SAkhilesh Sanikop sum_lo = _mm_add_epi32(sum_lo, _mm_madd_epi16(src_lo_67, taps_lo[3]));
1033*09537850SAkhilesh Sanikop const __m128i src_hi_67 = _mm_unpackhi_epi16(src[6], src[7]);
1034*09537850SAkhilesh Sanikop sum_hi = _mm_add_epi32(sum_hi, _mm_madd_epi16(src_hi_67, taps_hi[3]));
1035*09537850SAkhilesh Sanikop }
1036*09537850SAkhilesh Sanikop
1037*09537850SAkhilesh Sanikop if (is_compound) {
1038*09537850SAkhilesh Sanikop return _mm_packs_epi32(
1039*09537850SAkhilesh Sanikop RightShiftWithRounding_S32(sum_lo, kInterRoundBitsCompoundVertical - 1),
1040*09537850SAkhilesh Sanikop RightShiftWithRounding_S32(sum_hi,
1041*09537850SAkhilesh Sanikop kInterRoundBitsCompoundVertical - 1));
1042*09537850SAkhilesh Sanikop }
1043*09537850SAkhilesh Sanikop return _mm_packs_epi32(
1044*09537850SAkhilesh Sanikop RightShiftWithRounding_S32(sum_lo, kInterRoundBitsVertical - 1),
1045*09537850SAkhilesh Sanikop RightShiftWithRounding_S32(sum_hi, kInterRoundBitsVertical - 1));
1046*09537850SAkhilesh Sanikop }
1047*09537850SAkhilesh Sanikop
1048*09537850SAkhilesh Sanikop // |width_class| is 2, 4, or 8, according to the Store function that should be
1049*09537850SAkhilesh Sanikop // used.
1050*09537850SAkhilesh Sanikop template <int num_taps, int width_class, bool is_compound>
ConvolveVerticalScale(const int16_t * LIBGAV1_RESTRICT src,const int intermediate_height,const int width,const int subpixel_y,const int filter_index,const int step_y,const int height,void * LIBGAV1_RESTRICT dest,const ptrdiff_t dest_stride)1051*09537850SAkhilesh Sanikop inline void ConvolveVerticalScale(const int16_t* LIBGAV1_RESTRICT src,
1052*09537850SAkhilesh Sanikop const int intermediate_height,
1053*09537850SAkhilesh Sanikop const int width, const int subpixel_y,
1054*09537850SAkhilesh Sanikop const int filter_index, const int step_y,
1055*09537850SAkhilesh Sanikop const int height, void* LIBGAV1_RESTRICT dest,
1056*09537850SAkhilesh Sanikop const ptrdiff_t dest_stride) {
1057*09537850SAkhilesh Sanikop constexpr ptrdiff_t src_stride = kIntermediateStride;
1058*09537850SAkhilesh Sanikop constexpr int kernel_offset = (8 - num_taps) / 2;
1059*09537850SAkhilesh Sanikop const int16_t* src_y = src;
1060*09537850SAkhilesh Sanikop // |dest| is 16-bit in compound mode, Pixel otherwise.
1061*09537850SAkhilesh Sanikop auto* dest16_y = static_cast<uint16_t*>(dest);
1062*09537850SAkhilesh Sanikop auto* dest_y = static_cast<uint8_t*>(dest);
1063*09537850SAkhilesh Sanikop __m128i s[num_taps];
1064*09537850SAkhilesh Sanikop
1065*09537850SAkhilesh Sanikop int p = subpixel_y & 1023;
1066*09537850SAkhilesh Sanikop int y = height;
1067*09537850SAkhilesh Sanikop if (width_class <= 4) {
1068*09537850SAkhilesh Sanikop __m128i filter_taps_lo[num_taps >> 1];
1069*09537850SAkhilesh Sanikop __m128i filter_taps_hi[num_taps >> 1];
1070*09537850SAkhilesh Sanikop do { // y > 0
1071*09537850SAkhilesh Sanikop for (int i = 0; i < num_taps; ++i) {
1072*09537850SAkhilesh Sanikop s[i] = LoadLo8(src_y + i * src_stride);
1073*09537850SAkhilesh Sanikop }
1074*09537850SAkhilesh Sanikop int filter_id = (p >> 6) & kSubPixelMask;
1075*09537850SAkhilesh Sanikop const int8_t* filter0 =
1076*09537850SAkhilesh Sanikop kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
1077*09537850SAkhilesh Sanikop PrepareVerticalTaps<num_taps>(filter0, filter_taps_lo);
1078*09537850SAkhilesh Sanikop p += step_y;
1079*09537850SAkhilesh Sanikop src_y = src + (p >> kScaleSubPixelBits) * src_stride;
1080*09537850SAkhilesh Sanikop
1081*09537850SAkhilesh Sanikop for (int i = 0; i < num_taps; ++i) {
1082*09537850SAkhilesh Sanikop s[i] = LoadHi8(s[i], src_y + i * src_stride);
1083*09537850SAkhilesh Sanikop }
1084*09537850SAkhilesh Sanikop filter_id = (p >> 6) & kSubPixelMask;
1085*09537850SAkhilesh Sanikop const int8_t* filter1 =
1086*09537850SAkhilesh Sanikop kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
1087*09537850SAkhilesh Sanikop PrepareVerticalTaps<num_taps>(filter1, filter_taps_hi);
1088*09537850SAkhilesh Sanikop p += step_y;
1089*09537850SAkhilesh Sanikop src_y = src + (p >> kScaleSubPixelBits) * src_stride;
1090*09537850SAkhilesh Sanikop
1091*09537850SAkhilesh Sanikop const __m128i sums = Sum2DVerticalTaps4x2<num_taps, is_compound>(
1092*09537850SAkhilesh Sanikop s, filter_taps_lo, filter_taps_hi);
1093*09537850SAkhilesh Sanikop if (is_compound) {
1094*09537850SAkhilesh Sanikop assert(width_class > 2);
1095*09537850SAkhilesh Sanikop StoreLo8(dest16_y, sums);
1096*09537850SAkhilesh Sanikop dest16_y += dest_stride;
1097*09537850SAkhilesh Sanikop StoreHi8(dest16_y, sums);
1098*09537850SAkhilesh Sanikop dest16_y += dest_stride;
1099*09537850SAkhilesh Sanikop } else {
1100*09537850SAkhilesh Sanikop const __m128i result = _mm_packus_epi16(sums, sums);
1101*09537850SAkhilesh Sanikop if (width_class == 2) {
1102*09537850SAkhilesh Sanikop Store2(dest_y, result);
1103*09537850SAkhilesh Sanikop dest_y += dest_stride;
1104*09537850SAkhilesh Sanikop Store2(dest_y, _mm_srli_si128(result, 4));
1105*09537850SAkhilesh Sanikop } else {
1106*09537850SAkhilesh Sanikop Store4(dest_y, result);
1107*09537850SAkhilesh Sanikop dest_y += dest_stride;
1108*09537850SAkhilesh Sanikop Store4(dest_y, _mm_srli_si128(result, 4));
1109*09537850SAkhilesh Sanikop }
1110*09537850SAkhilesh Sanikop dest_y += dest_stride;
1111*09537850SAkhilesh Sanikop }
1112*09537850SAkhilesh Sanikop y -= 2;
1113*09537850SAkhilesh Sanikop } while (y != 0);
1114*09537850SAkhilesh Sanikop return;
1115*09537850SAkhilesh Sanikop }
1116*09537850SAkhilesh Sanikop
1117*09537850SAkhilesh Sanikop // |width_class| >= 8
1118*09537850SAkhilesh Sanikop __m128i filter_taps[num_taps >> 1];
1119*09537850SAkhilesh Sanikop int x = 0;
1120*09537850SAkhilesh Sanikop do { // x < width
1121*09537850SAkhilesh Sanikop auto* dest_y = static_cast<uint8_t*>(dest) + x;
1122*09537850SAkhilesh Sanikop auto* dest16_y = static_cast<uint16_t*>(dest) + x;
1123*09537850SAkhilesh Sanikop int p = subpixel_y & 1023;
1124*09537850SAkhilesh Sanikop int y = height;
1125*09537850SAkhilesh Sanikop do { // y > 0
1126*09537850SAkhilesh Sanikop const int filter_id = (p >> 6) & kSubPixelMask;
1127*09537850SAkhilesh Sanikop const int8_t* filter =
1128*09537850SAkhilesh Sanikop kHalfSubPixelFilters[filter_index][filter_id] + kernel_offset;
1129*09537850SAkhilesh Sanikop PrepareVerticalTaps<num_taps>(filter, filter_taps);
1130*09537850SAkhilesh Sanikop
1131*09537850SAkhilesh Sanikop src_y = src + (p >> kScaleSubPixelBits) * src_stride;
1132*09537850SAkhilesh Sanikop for (int i = 0; i < num_taps; ++i) {
1133*09537850SAkhilesh Sanikop s[i] = LoadUnaligned16(src_y + i * src_stride);
1134*09537850SAkhilesh Sanikop }
1135*09537850SAkhilesh Sanikop
1136*09537850SAkhilesh Sanikop const __m128i sums =
1137*09537850SAkhilesh Sanikop Sum2DVerticalTaps<num_taps, is_compound>(s, filter_taps);
1138*09537850SAkhilesh Sanikop if (is_compound) {
1139*09537850SAkhilesh Sanikop StoreUnaligned16(dest16_y, sums);
1140*09537850SAkhilesh Sanikop } else {
1141*09537850SAkhilesh Sanikop StoreLo8(dest_y, _mm_packus_epi16(sums, sums));
1142*09537850SAkhilesh Sanikop }
1143*09537850SAkhilesh Sanikop p += step_y;
1144*09537850SAkhilesh Sanikop dest_y += dest_stride;
1145*09537850SAkhilesh Sanikop dest16_y += dest_stride;
1146*09537850SAkhilesh Sanikop } while (--y != 0);
1147*09537850SAkhilesh Sanikop src += kIntermediateStride * intermediate_height;
1148*09537850SAkhilesh Sanikop x += 8;
1149*09537850SAkhilesh Sanikop } while (x < width);
1150*09537850SAkhilesh Sanikop }
1151*09537850SAkhilesh Sanikop
1152*09537850SAkhilesh Sanikop template <bool is_compound>
ConvolveScale2D_SSE4_1(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int horizontal_filter_index,const int vertical_filter_index,const int subpixel_x,const int subpixel_y,const int step_x,const int step_y,const int width,const int height,void * LIBGAV1_RESTRICT prediction,const ptrdiff_t pred_stride)1153*09537850SAkhilesh Sanikop void ConvolveScale2D_SSE4_1(const void* LIBGAV1_RESTRICT const reference,
1154*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride,
1155*09537850SAkhilesh Sanikop const int horizontal_filter_index,
1156*09537850SAkhilesh Sanikop const int vertical_filter_index,
1157*09537850SAkhilesh Sanikop const int subpixel_x, const int subpixel_y,
1158*09537850SAkhilesh Sanikop const int step_x, const int step_y, const int width,
1159*09537850SAkhilesh Sanikop const int height, void* LIBGAV1_RESTRICT prediction,
1160*09537850SAkhilesh Sanikop const ptrdiff_t pred_stride) {
1161*09537850SAkhilesh Sanikop const int horiz_filter_index = GetFilterIndex(horizontal_filter_index, width);
1162*09537850SAkhilesh Sanikop const int vert_filter_index = GetFilterIndex(vertical_filter_index, height);
1163*09537850SAkhilesh Sanikop assert(step_x <= 2048);
1164*09537850SAkhilesh Sanikop // The output of the horizontal filter, i.e. the intermediate_result, is
1165*09537850SAkhilesh Sanikop // guaranteed to fit in int16_t.
1166*09537850SAkhilesh Sanikop alignas(16) int16_t
1167*09537850SAkhilesh Sanikop intermediate_result[kIntermediateAllocWidth *
1168*09537850SAkhilesh Sanikop (2 * kIntermediateAllocWidth + kSubPixelTaps)];
1169*09537850SAkhilesh Sanikop #if LIBGAV1_MSAN
1170*09537850SAkhilesh Sanikop // Quiet msan warnings. Set with random non-zero value to aid in debugging.
1171*09537850SAkhilesh Sanikop memset(intermediate_result, 0x44, sizeof(intermediate_result));
1172*09537850SAkhilesh Sanikop #endif
1173*09537850SAkhilesh Sanikop const int num_vert_taps = dsp::GetNumTapsInFilter(vert_filter_index);
1174*09537850SAkhilesh Sanikop const int intermediate_height =
1175*09537850SAkhilesh Sanikop (((height - 1) * step_y + (1 << kScaleSubPixelBits) - 1) >>
1176*09537850SAkhilesh Sanikop kScaleSubPixelBits) +
1177*09537850SAkhilesh Sanikop num_vert_taps;
1178*09537850SAkhilesh Sanikop
1179*09537850SAkhilesh Sanikop // Horizontal filter.
1180*09537850SAkhilesh Sanikop // Filter types used for width <= 4 are different from those for width > 4.
1181*09537850SAkhilesh Sanikop // When width > 4, the valid filter index range is always [0, 3].
1182*09537850SAkhilesh Sanikop // When width <= 4, the valid filter index range is always [3, 5].
1183*09537850SAkhilesh Sanikop // Similarly for height.
1184*09537850SAkhilesh Sanikop int16_t* intermediate = intermediate_result;
1185*09537850SAkhilesh Sanikop const ptrdiff_t src_stride = reference_stride;
1186*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference);
1187*09537850SAkhilesh Sanikop const int vert_kernel_offset = (8 - num_vert_taps) / 2;
1188*09537850SAkhilesh Sanikop src += vert_kernel_offset * src_stride;
1189*09537850SAkhilesh Sanikop
1190*09537850SAkhilesh Sanikop // Derive the maximum value of |step_x| at which all source values fit in one
1191*09537850SAkhilesh Sanikop // 16-byte load. Final index is src_x + |num_taps| - 1 < 16
1192*09537850SAkhilesh Sanikop // step_x*7 is the final base sub-pixel index for the shuffle mask for filter
1193*09537850SAkhilesh Sanikop // inputs in each iteration on large blocks. When step_x is large, we need a
1194*09537850SAkhilesh Sanikop // second register and alignr in order to gather all filter inputs.
1195*09537850SAkhilesh Sanikop // |num_taps| - 1 is the offset for the shuffle of inputs to the final tap.
1196*09537850SAkhilesh Sanikop const int num_horiz_taps = dsp::GetNumTapsInFilter(horiz_filter_index);
1197*09537850SAkhilesh Sanikop const int kernel_start_ceiling = 16 - num_horiz_taps;
1198*09537850SAkhilesh Sanikop // This truncated quotient |grade_x_threshold| selects |step_x| such that:
1199*09537850SAkhilesh Sanikop // (step_x * 7) >> kScaleSubPixelBits < single load limit
1200*09537850SAkhilesh Sanikop const int grade_x_threshold =
1201*09537850SAkhilesh Sanikop (kernel_start_ceiling << kScaleSubPixelBits) / 7;
1202*09537850SAkhilesh Sanikop switch (horiz_filter_index) {
1203*09537850SAkhilesh Sanikop case 0:
1204*09537850SAkhilesh Sanikop if (step_x > grade_x_threshold) {
1205*09537850SAkhilesh Sanikop ConvolveHorizontalScale<2, 0, 6>(src, src_stride, width, subpixel_x,
1206*09537850SAkhilesh Sanikop step_x, intermediate_height,
1207*09537850SAkhilesh Sanikop intermediate);
1208*09537850SAkhilesh Sanikop } else {
1209*09537850SAkhilesh Sanikop ConvolveHorizontalScale<1, 0, 6>(src, src_stride, width, subpixel_x,
1210*09537850SAkhilesh Sanikop step_x, intermediate_height,
1211*09537850SAkhilesh Sanikop intermediate);
1212*09537850SAkhilesh Sanikop }
1213*09537850SAkhilesh Sanikop break;
1214*09537850SAkhilesh Sanikop case 1:
1215*09537850SAkhilesh Sanikop if (step_x > grade_x_threshold) {
1216*09537850SAkhilesh Sanikop ConvolveHorizontalScale<2, 1, 6>(src, src_stride, width, subpixel_x,
1217*09537850SAkhilesh Sanikop step_x, intermediate_height,
1218*09537850SAkhilesh Sanikop intermediate);
1219*09537850SAkhilesh Sanikop
1220*09537850SAkhilesh Sanikop } else {
1221*09537850SAkhilesh Sanikop ConvolveHorizontalScale<1, 1, 6>(src, src_stride, width, subpixel_x,
1222*09537850SAkhilesh Sanikop step_x, intermediate_height,
1223*09537850SAkhilesh Sanikop intermediate);
1224*09537850SAkhilesh Sanikop }
1225*09537850SAkhilesh Sanikop break;
1226*09537850SAkhilesh Sanikop case 2:
1227*09537850SAkhilesh Sanikop if (step_x > grade_x_threshold) {
1228*09537850SAkhilesh Sanikop ConvolveHorizontalScale<2, 2, 8>(src, src_stride, width, subpixel_x,
1229*09537850SAkhilesh Sanikop step_x, intermediate_height,
1230*09537850SAkhilesh Sanikop intermediate);
1231*09537850SAkhilesh Sanikop } else {
1232*09537850SAkhilesh Sanikop ConvolveHorizontalScale<1, 2, 8>(src, src_stride, width, subpixel_x,
1233*09537850SAkhilesh Sanikop step_x, intermediate_height,
1234*09537850SAkhilesh Sanikop intermediate);
1235*09537850SAkhilesh Sanikop }
1236*09537850SAkhilesh Sanikop break;
1237*09537850SAkhilesh Sanikop case 3:
1238*09537850SAkhilesh Sanikop if (step_x > grade_x_threshold) {
1239*09537850SAkhilesh Sanikop ConvolveHorizontalScale<2, 3, 2>(src, src_stride, width, subpixel_x,
1240*09537850SAkhilesh Sanikop step_x, intermediate_height,
1241*09537850SAkhilesh Sanikop intermediate);
1242*09537850SAkhilesh Sanikop } else {
1243*09537850SAkhilesh Sanikop ConvolveHorizontalScale<1, 3, 2>(src, src_stride, width, subpixel_x,
1244*09537850SAkhilesh Sanikop step_x, intermediate_height,
1245*09537850SAkhilesh Sanikop intermediate);
1246*09537850SAkhilesh Sanikop }
1247*09537850SAkhilesh Sanikop break;
1248*09537850SAkhilesh Sanikop case 4:
1249*09537850SAkhilesh Sanikop assert(width <= 4);
1250*09537850SAkhilesh Sanikop ConvolveHorizontalScale<1, 4, 4>(src, src_stride, width, subpixel_x,
1251*09537850SAkhilesh Sanikop step_x, intermediate_height,
1252*09537850SAkhilesh Sanikop intermediate);
1253*09537850SAkhilesh Sanikop break;
1254*09537850SAkhilesh Sanikop default:
1255*09537850SAkhilesh Sanikop assert(horiz_filter_index == 5);
1256*09537850SAkhilesh Sanikop assert(width <= 4);
1257*09537850SAkhilesh Sanikop ConvolveHorizontalScale<1, 5, 4>(src, src_stride, width, subpixel_x,
1258*09537850SAkhilesh Sanikop step_x, intermediate_height,
1259*09537850SAkhilesh Sanikop intermediate);
1260*09537850SAkhilesh Sanikop }
1261*09537850SAkhilesh Sanikop
1262*09537850SAkhilesh Sanikop // Vertical filter.
1263*09537850SAkhilesh Sanikop intermediate = intermediate_result;
1264*09537850SAkhilesh Sanikop switch (vert_filter_index) {
1265*09537850SAkhilesh Sanikop case 0:
1266*09537850SAkhilesh Sanikop case 1:
1267*09537850SAkhilesh Sanikop if (!is_compound && width == 2) {
1268*09537850SAkhilesh Sanikop ConvolveVerticalScale<6, 2, is_compound>(
1269*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1270*09537850SAkhilesh Sanikop vert_filter_index, step_y, height, prediction, pred_stride);
1271*09537850SAkhilesh Sanikop } else if (width == 4) {
1272*09537850SAkhilesh Sanikop ConvolveVerticalScale<6, 4, is_compound>(
1273*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1274*09537850SAkhilesh Sanikop vert_filter_index, step_y, height, prediction, pred_stride);
1275*09537850SAkhilesh Sanikop } else {
1276*09537850SAkhilesh Sanikop ConvolveVerticalScale<6, 8, is_compound>(
1277*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1278*09537850SAkhilesh Sanikop vert_filter_index, step_y, height, prediction, pred_stride);
1279*09537850SAkhilesh Sanikop }
1280*09537850SAkhilesh Sanikop break;
1281*09537850SAkhilesh Sanikop case 2:
1282*09537850SAkhilesh Sanikop if (!is_compound && width == 2) {
1283*09537850SAkhilesh Sanikop ConvolveVerticalScale<8, 2, is_compound>(
1284*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1285*09537850SAkhilesh Sanikop vert_filter_index, step_y, height, prediction, pred_stride);
1286*09537850SAkhilesh Sanikop } else if (width == 4) {
1287*09537850SAkhilesh Sanikop ConvolveVerticalScale<8, 4, is_compound>(
1288*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1289*09537850SAkhilesh Sanikop vert_filter_index, step_y, height, prediction, pred_stride);
1290*09537850SAkhilesh Sanikop } else {
1291*09537850SAkhilesh Sanikop ConvolveVerticalScale<8, 8, is_compound>(
1292*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1293*09537850SAkhilesh Sanikop vert_filter_index, step_y, height, prediction, pred_stride);
1294*09537850SAkhilesh Sanikop }
1295*09537850SAkhilesh Sanikop break;
1296*09537850SAkhilesh Sanikop case 3:
1297*09537850SAkhilesh Sanikop if (!is_compound && width == 2) {
1298*09537850SAkhilesh Sanikop ConvolveVerticalScale<2, 2, is_compound>(
1299*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1300*09537850SAkhilesh Sanikop vert_filter_index, step_y, height, prediction, pred_stride);
1301*09537850SAkhilesh Sanikop } else if (width == 4) {
1302*09537850SAkhilesh Sanikop ConvolveVerticalScale<2, 4, is_compound>(
1303*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1304*09537850SAkhilesh Sanikop vert_filter_index, step_y, height, prediction, pred_stride);
1305*09537850SAkhilesh Sanikop } else {
1306*09537850SAkhilesh Sanikop ConvolveVerticalScale<2, 8, is_compound>(
1307*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1308*09537850SAkhilesh Sanikop vert_filter_index, step_y, height, prediction, pred_stride);
1309*09537850SAkhilesh Sanikop }
1310*09537850SAkhilesh Sanikop break;
1311*09537850SAkhilesh Sanikop default:
1312*09537850SAkhilesh Sanikop assert(vert_filter_index == 4 || vert_filter_index == 5);
1313*09537850SAkhilesh Sanikop if (!is_compound && width == 2) {
1314*09537850SAkhilesh Sanikop ConvolveVerticalScale<4, 2, is_compound>(
1315*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1316*09537850SAkhilesh Sanikop vert_filter_index, step_y, height, prediction, pred_stride);
1317*09537850SAkhilesh Sanikop } else if (width == 4) {
1318*09537850SAkhilesh Sanikop ConvolveVerticalScale<4, 4, is_compound>(
1319*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1320*09537850SAkhilesh Sanikop vert_filter_index, step_y, height, prediction, pred_stride);
1321*09537850SAkhilesh Sanikop } else {
1322*09537850SAkhilesh Sanikop ConvolveVerticalScale<4, 8, is_compound>(
1323*09537850SAkhilesh Sanikop intermediate, intermediate_height, width, subpixel_y,
1324*09537850SAkhilesh Sanikop vert_filter_index, step_y, height, prediction, pred_stride);
1325*09537850SAkhilesh Sanikop }
1326*09537850SAkhilesh Sanikop }
1327*09537850SAkhilesh Sanikop }
1328*09537850SAkhilesh Sanikop
HalfAddHorizontal(const uint8_t * LIBGAV1_RESTRICT src,uint8_t * LIBGAV1_RESTRICT dst)1329*09537850SAkhilesh Sanikop inline void HalfAddHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
1330*09537850SAkhilesh Sanikop uint8_t* LIBGAV1_RESTRICT dst) {
1331*09537850SAkhilesh Sanikop const __m128i left = LoadUnaligned16(src);
1332*09537850SAkhilesh Sanikop const __m128i right = LoadUnaligned16(src + 1);
1333*09537850SAkhilesh Sanikop StoreUnaligned16(dst, _mm_avg_epu8(left, right));
1334*09537850SAkhilesh Sanikop }
1335*09537850SAkhilesh Sanikop
1336*09537850SAkhilesh Sanikop template <int width>
IntraBlockCopyHorizontal(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,const int height,uint8_t * LIBGAV1_RESTRICT dst,const ptrdiff_t dst_stride)1337*09537850SAkhilesh Sanikop inline void IntraBlockCopyHorizontal(const uint8_t* LIBGAV1_RESTRICT src,
1338*09537850SAkhilesh Sanikop const ptrdiff_t src_stride,
1339*09537850SAkhilesh Sanikop const int height,
1340*09537850SAkhilesh Sanikop uint8_t* LIBGAV1_RESTRICT dst,
1341*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride) {
1342*09537850SAkhilesh Sanikop const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
1343*09537850SAkhilesh Sanikop const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
1344*09537850SAkhilesh Sanikop
1345*09537850SAkhilesh Sanikop int y = height;
1346*09537850SAkhilesh Sanikop do {
1347*09537850SAkhilesh Sanikop HalfAddHorizontal(src, dst);
1348*09537850SAkhilesh Sanikop if (width >= 32) {
1349*09537850SAkhilesh Sanikop src += 16;
1350*09537850SAkhilesh Sanikop dst += 16;
1351*09537850SAkhilesh Sanikop HalfAddHorizontal(src, dst);
1352*09537850SAkhilesh Sanikop if (width >= 64) {
1353*09537850SAkhilesh Sanikop src += 16;
1354*09537850SAkhilesh Sanikop dst += 16;
1355*09537850SAkhilesh Sanikop HalfAddHorizontal(src, dst);
1356*09537850SAkhilesh Sanikop src += 16;
1357*09537850SAkhilesh Sanikop dst += 16;
1358*09537850SAkhilesh Sanikop HalfAddHorizontal(src, dst);
1359*09537850SAkhilesh Sanikop if (width == 128) {
1360*09537850SAkhilesh Sanikop src += 16;
1361*09537850SAkhilesh Sanikop dst += 16;
1362*09537850SAkhilesh Sanikop HalfAddHorizontal(src, dst);
1363*09537850SAkhilesh Sanikop src += 16;
1364*09537850SAkhilesh Sanikop dst += 16;
1365*09537850SAkhilesh Sanikop HalfAddHorizontal(src, dst);
1366*09537850SAkhilesh Sanikop src += 16;
1367*09537850SAkhilesh Sanikop dst += 16;
1368*09537850SAkhilesh Sanikop HalfAddHorizontal(src, dst);
1369*09537850SAkhilesh Sanikop src += 16;
1370*09537850SAkhilesh Sanikop dst += 16;
1371*09537850SAkhilesh Sanikop HalfAddHorizontal(src, dst);
1372*09537850SAkhilesh Sanikop }
1373*09537850SAkhilesh Sanikop }
1374*09537850SAkhilesh Sanikop }
1375*09537850SAkhilesh Sanikop src += src_remainder_stride;
1376*09537850SAkhilesh Sanikop dst += dst_remainder_stride;
1377*09537850SAkhilesh Sanikop } while (--y != 0);
1378*09537850SAkhilesh Sanikop }
1379*09537850SAkhilesh Sanikop
ConvolveIntraBlockCopyHorizontal_SSE4_1(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int,const int,const int,const int,const int width,const int height,void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t pred_stride)1380*09537850SAkhilesh Sanikop void ConvolveIntraBlockCopyHorizontal_SSE4_1(
1381*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
1382*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
1383*09537850SAkhilesh Sanikop const int /*vertical_filter_index*/, const int /*subpixel_x*/,
1384*09537850SAkhilesh Sanikop const int /*subpixel_y*/, const int width, const int height,
1385*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
1386*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference);
1387*09537850SAkhilesh Sanikop auto* dest = static_cast<uint8_t*>(prediction);
1388*09537850SAkhilesh Sanikop
1389*09537850SAkhilesh Sanikop if (width == 128) {
1390*09537850SAkhilesh Sanikop IntraBlockCopyHorizontal<128>(src, reference_stride, height, dest,
1391*09537850SAkhilesh Sanikop pred_stride);
1392*09537850SAkhilesh Sanikop } else if (width == 64) {
1393*09537850SAkhilesh Sanikop IntraBlockCopyHorizontal<64>(src, reference_stride, height, dest,
1394*09537850SAkhilesh Sanikop pred_stride);
1395*09537850SAkhilesh Sanikop } else if (width == 32) {
1396*09537850SAkhilesh Sanikop IntraBlockCopyHorizontal<32>(src, reference_stride, height, dest,
1397*09537850SAkhilesh Sanikop pred_stride);
1398*09537850SAkhilesh Sanikop } else if (width == 16) {
1399*09537850SAkhilesh Sanikop IntraBlockCopyHorizontal<16>(src, reference_stride, height, dest,
1400*09537850SAkhilesh Sanikop pred_stride);
1401*09537850SAkhilesh Sanikop } else if (width == 8) {
1402*09537850SAkhilesh Sanikop int y = height;
1403*09537850SAkhilesh Sanikop do {
1404*09537850SAkhilesh Sanikop const __m128i left = LoadLo8(src);
1405*09537850SAkhilesh Sanikop const __m128i right = LoadLo8(src + 1);
1406*09537850SAkhilesh Sanikop StoreLo8(dest, _mm_avg_epu8(left, right));
1407*09537850SAkhilesh Sanikop
1408*09537850SAkhilesh Sanikop src += reference_stride;
1409*09537850SAkhilesh Sanikop dest += pred_stride;
1410*09537850SAkhilesh Sanikop } while (--y != 0);
1411*09537850SAkhilesh Sanikop } else if (width == 4) {
1412*09537850SAkhilesh Sanikop int y = height;
1413*09537850SAkhilesh Sanikop do {
1414*09537850SAkhilesh Sanikop __m128i left = Load4(src);
1415*09537850SAkhilesh Sanikop __m128i right = Load4(src + 1);
1416*09537850SAkhilesh Sanikop src += reference_stride;
1417*09537850SAkhilesh Sanikop left = _mm_unpacklo_epi32(left, Load4(src));
1418*09537850SAkhilesh Sanikop right = _mm_unpacklo_epi32(right, Load4(src + 1));
1419*09537850SAkhilesh Sanikop src += reference_stride;
1420*09537850SAkhilesh Sanikop
1421*09537850SAkhilesh Sanikop const __m128i result = _mm_avg_epu8(left, right);
1422*09537850SAkhilesh Sanikop
1423*09537850SAkhilesh Sanikop Store4(dest, result);
1424*09537850SAkhilesh Sanikop dest += pred_stride;
1425*09537850SAkhilesh Sanikop Store4(dest, _mm_srli_si128(result, 4));
1426*09537850SAkhilesh Sanikop dest += pred_stride;
1427*09537850SAkhilesh Sanikop y -= 2;
1428*09537850SAkhilesh Sanikop } while (y != 0);
1429*09537850SAkhilesh Sanikop } else {
1430*09537850SAkhilesh Sanikop assert(width == 2);
1431*09537850SAkhilesh Sanikop __m128i left = _mm_setzero_si128();
1432*09537850SAkhilesh Sanikop __m128i right = _mm_setzero_si128();
1433*09537850SAkhilesh Sanikop int y = height;
1434*09537850SAkhilesh Sanikop do {
1435*09537850SAkhilesh Sanikop left = Load2<0>(src, left);
1436*09537850SAkhilesh Sanikop right = Load2<0>(src + 1, right);
1437*09537850SAkhilesh Sanikop src += reference_stride;
1438*09537850SAkhilesh Sanikop left = Load2<1>(src, left);
1439*09537850SAkhilesh Sanikop right = Load2<1>(src + 1, right);
1440*09537850SAkhilesh Sanikop src += reference_stride;
1441*09537850SAkhilesh Sanikop
1442*09537850SAkhilesh Sanikop const __m128i result = _mm_avg_epu8(left, right);
1443*09537850SAkhilesh Sanikop
1444*09537850SAkhilesh Sanikop Store2(dest, result);
1445*09537850SAkhilesh Sanikop dest += pred_stride;
1446*09537850SAkhilesh Sanikop Store2(dest, _mm_srli_si128(result, 2));
1447*09537850SAkhilesh Sanikop dest += pred_stride;
1448*09537850SAkhilesh Sanikop y -= 2;
1449*09537850SAkhilesh Sanikop } while (y != 0);
1450*09537850SAkhilesh Sanikop }
1451*09537850SAkhilesh Sanikop }
1452*09537850SAkhilesh Sanikop
1453*09537850SAkhilesh Sanikop template <int width>
IntraBlockCopyVertical(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,const int height,uint8_t * LIBGAV1_RESTRICT dst,const ptrdiff_t dst_stride)1454*09537850SAkhilesh Sanikop inline void IntraBlockCopyVertical(const uint8_t* LIBGAV1_RESTRICT src,
1455*09537850SAkhilesh Sanikop const ptrdiff_t src_stride, const int height,
1456*09537850SAkhilesh Sanikop uint8_t* LIBGAV1_RESTRICT dst,
1457*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride) {
1458*09537850SAkhilesh Sanikop const ptrdiff_t src_remainder_stride = src_stride - (width - 16);
1459*09537850SAkhilesh Sanikop const ptrdiff_t dst_remainder_stride = dst_stride - (width - 16);
1460*09537850SAkhilesh Sanikop __m128i row[8], below[8];
1461*09537850SAkhilesh Sanikop
1462*09537850SAkhilesh Sanikop row[0] = LoadUnaligned16(src);
1463*09537850SAkhilesh Sanikop if (width >= 32) {
1464*09537850SAkhilesh Sanikop src += 16;
1465*09537850SAkhilesh Sanikop row[1] = LoadUnaligned16(src);
1466*09537850SAkhilesh Sanikop if (width >= 64) {
1467*09537850SAkhilesh Sanikop src += 16;
1468*09537850SAkhilesh Sanikop row[2] = LoadUnaligned16(src);
1469*09537850SAkhilesh Sanikop src += 16;
1470*09537850SAkhilesh Sanikop row[3] = LoadUnaligned16(src);
1471*09537850SAkhilesh Sanikop if (width == 128) {
1472*09537850SAkhilesh Sanikop src += 16;
1473*09537850SAkhilesh Sanikop row[4] = LoadUnaligned16(src);
1474*09537850SAkhilesh Sanikop src += 16;
1475*09537850SAkhilesh Sanikop row[5] = LoadUnaligned16(src);
1476*09537850SAkhilesh Sanikop src += 16;
1477*09537850SAkhilesh Sanikop row[6] = LoadUnaligned16(src);
1478*09537850SAkhilesh Sanikop src += 16;
1479*09537850SAkhilesh Sanikop row[7] = LoadUnaligned16(src);
1480*09537850SAkhilesh Sanikop }
1481*09537850SAkhilesh Sanikop }
1482*09537850SAkhilesh Sanikop }
1483*09537850SAkhilesh Sanikop src += src_remainder_stride;
1484*09537850SAkhilesh Sanikop
1485*09537850SAkhilesh Sanikop int y = height;
1486*09537850SAkhilesh Sanikop do {
1487*09537850SAkhilesh Sanikop below[0] = LoadUnaligned16(src);
1488*09537850SAkhilesh Sanikop if (width >= 32) {
1489*09537850SAkhilesh Sanikop src += 16;
1490*09537850SAkhilesh Sanikop below[1] = LoadUnaligned16(src);
1491*09537850SAkhilesh Sanikop if (width >= 64) {
1492*09537850SAkhilesh Sanikop src += 16;
1493*09537850SAkhilesh Sanikop below[2] = LoadUnaligned16(src);
1494*09537850SAkhilesh Sanikop src += 16;
1495*09537850SAkhilesh Sanikop below[3] = LoadUnaligned16(src);
1496*09537850SAkhilesh Sanikop if (width == 128) {
1497*09537850SAkhilesh Sanikop src += 16;
1498*09537850SAkhilesh Sanikop below[4] = LoadUnaligned16(src);
1499*09537850SAkhilesh Sanikop src += 16;
1500*09537850SAkhilesh Sanikop below[5] = LoadUnaligned16(src);
1501*09537850SAkhilesh Sanikop src += 16;
1502*09537850SAkhilesh Sanikop below[6] = LoadUnaligned16(src);
1503*09537850SAkhilesh Sanikop src += 16;
1504*09537850SAkhilesh Sanikop below[7] = LoadUnaligned16(src);
1505*09537850SAkhilesh Sanikop }
1506*09537850SAkhilesh Sanikop }
1507*09537850SAkhilesh Sanikop }
1508*09537850SAkhilesh Sanikop src += src_remainder_stride;
1509*09537850SAkhilesh Sanikop
1510*09537850SAkhilesh Sanikop StoreUnaligned16(dst, _mm_avg_epu8(row[0], below[0]));
1511*09537850SAkhilesh Sanikop row[0] = below[0];
1512*09537850SAkhilesh Sanikop if (width >= 32) {
1513*09537850SAkhilesh Sanikop dst += 16;
1514*09537850SAkhilesh Sanikop StoreUnaligned16(dst, _mm_avg_epu8(row[1], below[1]));
1515*09537850SAkhilesh Sanikop row[1] = below[1];
1516*09537850SAkhilesh Sanikop if (width >= 64) {
1517*09537850SAkhilesh Sanikop dst += 16;
1518*09537850SAkhilesh Sanikop StoreUnaligned16(dst, _mm_avg_epu8(row[2], below[2]));
1519*09537850SAkhilesh Sanikop row[2] = below[2];
1520*09537850SAkhilesh Sanikop dst += 16;
1521*09537850SAkhilesh Sanikop StoreUnaligned16(dst, _mm_avg_epu8(row[3], below[3]));
1522*09537850SAkhilesh Sanikop row[3] = below[3];
1523*09537850SAkhilesh Sanikop if (width >= 128) {
1524*09537850SAkhilesh Sanikop dst += 16;
1525*09537850SAkhilesh Sanikop StoreUnaligned16(dst, _mm_avg_epu8(row[4], below[4]));
1526*09537850SAkhilesh Sanikop row[4] = below[4];
1527*09537850SAkhilesh Sanikop dst += 16;
1528*09537850SAkhilesh Sanikop StoreUnaligned16(dst, _mm_avg_epu8(row[5], below[5]));
1529*09537850SAkhilesh Sanikop row[5] = below[5];
1530*09537850SAkhilesh Sanikop dst += 16;
1531*09537850SAkhilesh Sanikop StoreUnaligned16(dst, _mm_avg_epu8(row[6], below[6]));
1532*09537850SAkhilesh Sanikop row[6] = below[6];
1533*09537850SAkhilesh Sanikop dst += 16;
1534*09537850SAkhilesh Sanikop StoreUnaligned16(dst, _mm_avg_epu8(row[7], below[7]));
1535*09537850SAkhilesh Sanikop row[7] = below[7];
1536*09537850SAkhilesh Sanikop }
1537*09537850SAkhilesh Sanikop }
1538*09537850SAkhilesh Sanikop }
1539*09537850SAkhilesh Sanikop dst += dst_remainder_stride;
1540*09537850SAkhilesh Sanikop } while (--y != 0);
1541*09537850SAkhilesh Sanikop }
1542*09537850SAkhilesh Sanikop
ConvolveIntraBlockCopyVertical_SSE4_1(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int,const int,const int,const int,const int width,const int height,void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t pred_stride)1543*09537850SAkhilesh Sanikop void ConvolveIntraBlockCopyVertical_SSE4_1(
1544*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
1545*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
1546*09537850SAkhilesh Sanikop const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
1547*09537850SAkhilesh Sanikop const int /*vertical_filter_id*/, const int width, const int height,
1548*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
1549*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference);
1550*09537850SAkhilesh Sanikop auto* dest = static_cast<uint8_t*>(prediction);
1551*09537850SAkhilesh Sanikop
1552*09537850SAkhilesh Sanikop if (width == 128) {
1553*09537850SAkhilesh Sanikop IntraBlockCopyVertical<128>(src, reference_stride, height, dest,
1554*09537850SAkhilesh Sanikop pred_stride);
1555*09537850SAkhilesh Sanikop } else if (width == 64) {
1556*09537850SAkhilesh Sanikop IntraBlockCopyVertical<64>(src, reference_stride, height, dest,
1557*09537850SAkhilesh Sanikop pred_stride);
1558*09537850SAkhilesh Sanikop } else if (width == 32) {
1559*09537850SAkhilesh Sanikop IntraBlockCopyVertical<32>(src, reference_stride, height, dest,
1560*09537850SAkhilesh Sanikop pred_stride);
1561*09537850SAkhilesh Sanikop } else if (width == 16) {
1562*09537850SAkhilesh Sanikop IntraBlockCopyVertical<16>(src, reference_stride, height, dest,
1563*09537850SAkhilesh Sanikop pred_stride);
1564*09537850SAkhilesh Sanikop } else if (width == 8) {
1565*09537850SAkhilesh Sanikop __m128i row, below;
1566*09537850SAkhilesh Sanikop row = LoadLo8(src);
1567*09537850SAkhilesh Sanikop src += reference_stride;
1568*09537850SAkhilesh Sanikop
1569*09537850SAkhilesh Sanikop int y = height;
1570*09537850SAkhilesh Sanikop do {
1571*09537850SAkhilesh Sanikop below = LoadLo8(src);
1572*09537850SAkhilesh Sanikop src += reference_stride;
1573*09537850SAkhilesh Sanikop
1574*09537850SAkhilesh Sanikop StoreLo8(dest, _mm_avg_epu8(row, below));
1575*09537850SAkhilesh Sanikop dest += pred_stride;
1576*09537850SAkhilesh Sanikop
1577*09537850SAkhilesh Sanikop row = below;
1578*09537850SAkhilesh Sanikop } while (--y != 0);
1579*09537850SAkhilesh Sanikop } else if (width == 4) {
1580*09537850SAkhilesh Sanikop __m128i row = Load4(src);
1581*09537850SAkhilesh Sanikop src += reference_stride;
1582*09537850SAkhilesh Sanikop
1583*09537850SAkhilesh Sanikop int y = height;
1584*09537850SAkhilesh Sanikop do {
1585*09537850SAkhilesh Sanikop __m128i below = Load4(src);
1586*09537850SAkhilesh Sanikop src += reference_stride;
1587*09537850SAkhilesh Sanikop
1588*09537850SAkhilesh Sanikop Store4(dest, _mm_avg_epu8(row, below));
1589*09537850SAkhilesh Sanikop dest += pred_stride;
1590*09537850SAkhilesh Sanikop
1591*09537850SAkhilesh Sanikop row = below;
1592*09537850SAkhilesh Sanikop } while (--y != 0);
1593*09537850SAkhilesh Sanikop } else {
1594*09537850SAkhilesh Sanikop assert(width == 2);
1595*09537850SAkhilesh Sanikop __m128i row = Load2(src);
1596*09537850SAkhilesh Sanikop __m128i below = _mm_setzero_si128();
1597*09537850SAkhilesh Sanikop src += reference_stride;
1598*09537850SAkhilesh Sanikop
1599*09537850SAkhilesh Sanikop int y = height;
1600*09537850SAkhilesh Sanikop do {
1601*09537850SAkhilesh Sanikop below = Load2<0>(src, below);
1602*09537850SAkhilesh Sanikop src += reference_stride;
1603*09537850SAkhilesh Sanikop
1604*09537850SAkhilesh Sanikop Store2(dest, _mm_avg_epu8(row, below));
1605*09537850SAkhilesh Sanikop dest += pred_stride;
1606*09537850SAkhilesh Sanikop
1607*09537850SAkhilesh Sanikop row = below;
1608*09537850SAkhilesh Sanikop } while (--y != 0);
1609*09537850SAkhilesh Sanikop }
1610*09537850SAkhilesh Sanikop }
1611*09537850SAkhilesh Sanikop
1612*09537850SAkhilesh Sanikop // Load then add two uint8_t vectors. Return the uint16_t vector result.
LoadU8AndAddLong(const uint8_t * LIBGAV1_RESTRICT src,const uint8_t * LIBGAV1_RESTRICT src1)1613*09537850SAkhilesh Sanikop inline __m128i LoadU8AndAddLong(const uint8_t* LIBGAV1_RESTRICT src,
1614*09537850SAkhilesh Sanikop const uint8_t* LIBGAV1_RESTRICT src1) {
1615*09537850SAkhilesh Sanikop const __m128i a = _mm_cvtepu8_epi16(LoadLo8(src));
1616*09537850SAkhilesh Sanikop const __m128i b = _mm_cvtepu8_epi16(LoadLo8(src1));
1617*09537850SAkhilesh Sanikop return _mm_add_epi16(a, b);
1618*09537850SAkhilesh Sanikop }
1619*09537850SAkhilesh Sanikop
// Adds two vectors of uint16_t lanes, divides each sum by four with rounding
// and packs the results to uint8_t. The eight packed bytes appear in both the
// low and the high half of the returned vector.
inline __m128i AddU16RightShift2AndPack(__m128i v0, __m128i v1) {
  // First shift: plain truncating divide by two.
  const __m128i half_sum = _mm_srli_epi16(_mm_add_epi16(v0, v1), 1);
  // Second shift: averaging with zero yields (x + 1) >> 1, i.e. a rounding
  // divide by two.
  const __m128i quarter_sum = _mm_avg_epu16(half_sum, _mm_setzero_si128());
  return _mm_packus_epi16(quarter_sum, quarter_sum);
}
1627*09537850SAkhilesh Sanikop
1628*09537850SAkhilesh Sanikop template <int width>
IntraBlockCopy2D(const uint8_t * LIBGAV1_RESTRICT src,const ptrdiff_t src_stride,const int height,uint8_t * LIBGAV1_RESTRICT dst,const ptrdiff_t dst_stride)1629*09537850SAkhilesh Sanikop inline void IntraBlockCopy2D(const uint8_t* LIBGAV1_RESTRICT src,
1630*09537850SAkhilesh Sanikop const ptrdiff_t src_stride, const int height,
1631*09537850SAkhilesh Sanikop uint8_t* LIBGAV1_RESTRICT dst,
1632*09537850SAkhilesh Sanikop const ptrdiff_t dst_stride) {
1633*09537850SAkhilesh Sanikop const ptrdiff_t src_remainder_stride = src_stride - (width - 8);
1634*09537850SAkhilesh Sanikop const ptrdiff_t dst_remainder_stride = dst_stride - (width - 8);
1635*09537850SAkhilesh Sanikop __m128i row[16];
1636*09537850SAkhilesh Sanikop row[0] = LoadU8AndAddLong(src, src + 1);
1637*09537850SAkhilesh Sanikop if (width >= 16) {
1638*09537850SAkhilesh Sanikop src += 8;
1639*09537850SAkhilesh Sanikop row[1] = LoadU8AndAddLong(src, src + 1);
1640*09537850SAkhilesh Sanikop if (width >= 32) {
1641*09537850SAkhilesh Sanikop src += 8;
1642*09537850SAkhilesh Sanikop row[2] = LoadU8AndAddLong(src, src + 1);
1643*09537850SAkhilesh Sanikop src += 8;
1644*09537850SAkhilesh Sanikop row[3] = LoadU8AndAddLong(src, src + 1);
1645*09537850SAkhilesh Sanikop if (width >= 64) {
1646*09537850SAkhilesh Sanikop src += 8;
1647*09537850SAkhilesh Sanikop row[4] = LoadU8AndAddLong(src, src + 1);
1648*09537850SAkhilesh Sanikop src += 8;
1649*09537850SAkhilesh Sanikop row[5] = LoadU8AndAddLong(src, src + 1);
1650*09537850SAkhilesh Sanikop src += 8;
1651*09537850SAkhilesh Sanikop row[6] = LoadU8AndAddLong(src, src + 1);
1652*09537850SAkhilesh Sanikop src += 8;
1653*09537850SAkhilesh Sanikop row[7] = LoadU8AndAddLong(src, src + 1);
1654*09537850SAkhilesh Sanikop if (width == 128) {
1655*09537850SAkhilesh Sanikop src += 8;
1656*09537850SAkhilesh Sanikop row[8] = LoadU8AndAddLong(src, src + 1);
1657*09537850SAkhilesh Sanikop src += 8;
1658*09537850SAkhilesh Sanikop row[9] = LoadU8AndAddLong(src, src + 1);
1659*09537850SAkhilesh Sanikop src += 8;
1660*09537850SAkhilesh Sanikop row[10] = LoadU8AndAddLong(src, src + 1);
1661*09537850SAkhilesh Sanikop src += 8;
1662*09537850SAkhilesh Sanikop row[11] = LoadU8AndAddLong(src, src + 1);
1663*09537850SAkhilesh Sanikop src += 8;
1664*09537850SAkhilesh Sanikop row[12] = LoadU8AndAddLong(src, src + 1);
1665*09537850SAkhilesh Sanikop src += 8;
1666*09537850SAkhilesh Sanikop row[13] = LoadU8AndAddLong(src, src + 1);
1667*09537850SAkhilesh Sanikop src += 8;
1668*09537850SAkhilesh Sanikop row[14] = LoadU8AndAddLong(src, src + 1);
1669*09537850SAkhilesh Sanikop src += 8;
1670*09537850SAkhilesh Sanikop row[15] = LoadU8AndAddLong(src, src + 1);
1671*09537850SAkhilesh Sanikop }
1672*09537850SAkhilesh Sanikop }
1673*09537850SAkhilesh Sanikop }
1674*09537850SAkhilesh Sanikop }
1675*09537850SAkhilesh Sanikop src += src_remainder_stride;
1676*09537850SAkhilesh Sanikop
1677*09537850SAkhilesh Sanikop int y = height;
1678*09537850SAkhilesh Sanikop do {
1679*09537850SAkhilesh Sanikop const __m128i below_0 = LoadU8AndAddLong(src, src + 1);
1680*09537850SAkhilesh Sanikop StoreLo8(dst, AddU16RightShift2AndPack(row[0], below_0));
1681*09537850SAkhilesh Sanikop row[0] = below_0;
1682*09537850SAkhilesh Sanikop if (width >= 16) {
1683*09537850SAkhilesh Sanikop src += 8;
1684*09537850SAkhilesh Sanikop dst += 8;
1685*09537850SAkhilesh Sanikop
1686*09537850SAkhilesh Sanikop const __m128i below_1 = LoadU8AndAddLong(src, src + 1);
1687*09537850SAkhilesh Sanikop StoreLo8(dst, AddU16RightShift2AndPack(row[1], below_1));
1688*09537850SAkhilesh Sanikop row[1] = below_1;
1689*09537850SAkhilesh Sanikop if (width >= 32) {
1690*09537850SAkhilesh Sanikop src += 8;
1691*09537850SAkhilesh Sanikop dst += 8;
1692*09537850SAkhilesh Sanikop
1693*09537850SAkhilesh Sanikop const __m128i below_2 = LoadU8AndAddLong(src, src + 1);
1694*09537850SAkhilesh Sanikop StoreLo8(dst, AddU16RightShift2AndPack(row[2], below_2));
1695*09537850SAkhilesh Sanikop row[2] = below_2;
1696*09537850SAkhilesh Sanikop src += 8;
1697*09537850SAkhilesh Sanikop dst += 8;
1698*09537850SAkhilesh Sanikop
1699*09537850SAkhilesh Sanikop const __m128i below_3 = LoadU8AndAddLong(src, src + 1);
1700*09537850SAkhilesh Sanikop StoreLo8(dst, AddU16RightShift2AndPack(row[3], below_3));
1701*09537850SAkhilesh Sanikop row[3] = below_3;
1702*09537850SAkhilesh Sanikop if (width >= 64) {
1703*09537850SAkhilesh Sanikop src += 8;
1704*09537850SAkhilesh Sanikop dst += 8;
1705*09537850SAkhilesh Sanikop
1706*09537850SAkhilesh Sanikop const __m128i below_4 = LoadU8AndAddLong(src, src + 1);
1707*09537850SAkhilesh Sanikop StoreLo8(dst, AddU16RightShift2AndPack(row[4], below_4));
1708*09537850SAkhilesh Sanikop row[4] = below_4;
1709*09537850SAkhilesh Sanikop src += 8;
1710*09537850SAkhilesh Sanikop dst += 8;
1711*09537850SAkhilesh Sanikop
1712*09537850SAkhilesh Sanikop const __m128i below_5 = LoadU8AndAddLong(src, src + 1);
1713*09537850SAkhilesh Sanikop StoreLo8(dst, AddU16RightShift2AndPack(row[5], below_5));
1714*09537850SAkhilesh Sanikop row[5] = below_5;
1715*09537850SAkhilesh Sanikop src += 8;
1716*09537850SAkhilesh Sanikop dst += 8;
1717*09537850SAkhilesh Sanikop
1718*09537850SAkhilesh Sanikop const __m128i below_6 = LoadU8AndAddLong(src, src + 1);
1719*09537850SAkhilesh Sanikop StoreLo8(dst, AddU16RightShift2AndPack(row[6], below_6));
1720*09537850SAkhilesh Sanikop row[6] = below_6;
1721*09537850SAkhilesh Sanikop src += 8;
1722*09537850SAkhilesh Sanikop dst += 8;
1723*09537850SAkhilesh Sanikop
1724*09537850SAkhilesh Sanikop const __m128i below_7 = LoadU8AndAddLong(src, src + 1);
1725*09537850SAkhilesh Sanikop StoreLo8(dst, AddU16RightShift2AndPack(row[7], below_7));
1726*09537850SAkhilesh Sanikop row[7] = below_7;
1727*09537850SAkhilesh Sanikop if (width == 128) {
1728*09537850SAkhilesh Sanikop src += 8;
1729*09537850SAkhilesh Sanikop dst += 8;
1730*09537850SAkhilesh Sanikop
1731*09537850SAkhilesh Sanikop const __m128i below_8 = LoadU8AndAddLong(src, src + 1);
1732*09537850SAkhilesh Sanikop StoreLo8(dst, AddU16RightShift2AndPack(row[8], below_8));
1733*09537850SAkhilesh Sanikop row[8] = below_8;
1734*09537850SAkhilesh Sanikop src += 8;
1735*09537850SAkhilesh Sanikop dst += 8;
1736*09537850SAkhilesh Sanikop
1737*09537850SAkhilesh Sanikop const __m128i below_9 = LoadU8AndAddLong(src, src + 1);
1738*09537850SAkhilesh Sanikop StoreLo8(dst, AddU16RightShift2AndPack(row[9], below_9));
1739*09537850SAkhilesh Sanikop row[9] = below_9;
1740*09537850SAkhilesh Sanikop src += 8;
1741*09537850SAkhilesh Sanikop dst += 8;
1742*09537850SAkhilesh Sanikop
1743*09537850SAkhilesh Sanikop const __m128i below_10 = LoadU8AndAddLong(src, src + 1);
1744*09537850SAkhilesh Sanikop StoreLo8(dst, AddU16RightShift2AndPack(row[10], below_10));
1745*09537850SAkhilesh Sanikop row[10] = below_10;
1746*09537850SAkhilesh Sanikop src += 8;
1747*09537850SAkhilesh Sanikop dst += 8;
1748*09537850SAkhilesh Sanikop
1749*09537850SAkhilesh Sanikop const __m128i below_11 = LoadU8AndAddLong(src, src + 1);
1750*09537850SAkhilesh Sanikop StoreLo8(dst, AddU16RightShift2AndPack(row[11], below_11));
1751*09537850SAkhilesh Sanikop row[11] = below_11;
1752*09537850SAkhilesh Sanikop src += 8;
1753*09537850SAkhilesh Sanikop dst += 8;
1754*09537850SAkhilesh Sanikop
1755*09537850SAkhilesh Sanikop const __m128i below_12 = LoadU8AndAddLong(src, src + 1);
1756*09537850SAkhilesh Sanikop StoreLo8(dst, AddU16RightShift2AndPack(row[12], below_12));
1757*09537850SAkhilesh Sanikop row[12] = below_12;
1758*09537850SAkhilesh Sanikop src += 8;
1759*09537850SAkhilesh Sanikop dst += 8;
1760*09537850SAkhilesh Sanikop
1761*09537850SAkhilesh Sanikop const __m128i below_13 = LoadU8AndAddLong(src, src + 1);
1762*09537850SAkhilesh Sanikop StoreLo8(dst, AddU16RightShift2AndPack(row[13], below_13));
1763*09537850SAkhilesh Sanikop row[13] = below_13;
1764*09537850SAkhilesh Sanikop src += 8;
1765*09537850SAkhilesh Sanikop dst += 8;
1766*09537850SAkhilesh Sanikop
1767*09537850SAkhilesh Sanikop const __m128i below_14 = LoadU8AndAddLong(src, src + 1);
1768*09537850SAkhilesh Sanikop StoreLo8(dst, AddU16RightShift2AndPack(row[14], below_14));
1769*09537850SAkhilesh Sanikop row[14] = below_14;
1770*09537850SAkhilesh Sanikop src += 8;
1771*09537850SAkhilesh Sanikop dst += 8;
1772*09537850SAkhilesh Sanikop
1773*09537850SAkhilesh Sanikop const __m128i below_15 = LoadU8AndAddLong(src, src + 1);
1774*09537850SAkhilesh Sanikop StoreLo8(dst, AddU16RightShift2AndPack(row[15], below_15));
1775*09537850SAkhilesh Sanikop row[15] = below_15;
1776*09537850SAkhilesh Sanikop }
1777*09537850SAkhilesh Sanikop }
1778*09537850SAkhilesh Sanikop }
1779*09537850SAkhilesh Sanikop }
1780*09537850SAkhilesh Sanikop src += src_remainder_stride;
1781*09537850SAkhilesh Sanikop dst += dst_remainder_stride;
1782*09537850SAkhilesh Sanikop } while (--y != 0);
1783*09537850SAkhilesh Sanikop }
1784*09537850SAkhilesh Sanikop
ConvolveIntraBlockCopy2D_SSE4_1(const void * LIBGAV1_RESTRICT const reference,const ptrdiff_t reference_stride,const int,const int,const int,const int,const int width,const int height,void * LIBGAV1_RESTRICT const prediction,const ptrdiff_t pred_stride)1785*09537850SAkhilesh Sanikop void ConvolveIntraBlockCopy2D_SSE4_1(
1786*09537850SAkhilesh Sanikop const void* LIBGAV1_RESTRICT const reference,
1787*09537850SAkhilesh Sanikop const ptrdiff_t reference_stride, const int /*horizontal_filter_index*/,
1788*09537850SAkhilesh Sanikop const int /*vertical_filter_index*/, const int /*horizontal_filter_id*/,
1789*09537850SAkhilesh Sanikop const int /*vertical_filter_id*/, const int width, const int height,
1790*09537850SAkhilesh Sanikop void* LIBGAV1_RESTRICT const prediction, const ptrdiff_t pred_stride) {
1791*09537850SAkhilesh Sanikop const auto* src = static_cast<const uint8_t*>(reference);
1792*09537850SAkhilesh Sanikop auto* dest = static_cast<uint8_t*>(prediction);
1793*09537850SAkhilesh Sanikop // Note: allow vertical access to height + 1. Because this function is only
1794*09537850SAkhilesh Sanikop // for u/v plane of intra block copy, such access is guaranteed to be within
1795*09537850SAkhilesh Sanikop // the prediction block.
1796*09537850SAkhilesh Sanikop
1797*09537850SAkhilesh Sanikop if (width == 128) {
1798*09537850SAkhilesh Sanikop IntraBlockCopy2D<128>(src, reference_stride, height, dest, pred_stride);
1799*09537850SAkhilesh Sanikop } else if (width == 64) {
1800*09537850SAkhilesh Sanikop IntraBlockCopy2D<64>(src, reference_stride, height, dest, pred_stride);
1801*09537850SAkhilesh Sanikop } else if (width == 32) {
1802*09537850SAkhilesh Sanikop IntraBlockCopy2D<32>(src, reference_stride, height, dest, pred_stride);
1803*09537850SAkhilesh Sanikop } else if (width == 16) {
1804*09537850SAkhilesh Sanikop IntraBlockCopy2D<16>(src, reference_stride, height, dest, pred_stride);
1805*09537850SAkhilesh Sanikop } else if (width == 8) {
1806*09537850SAkhilesh Sanikop IntraBlockCopy2D<8>(src, reference_stride, height, dest, pred_stride);
1807*09537850SAkhilesh Sanikop } else if (width == 4) {
1808*09537850SAkhilesh Sanikop __m128i left = _mm_cvtepu8_epi16(Load4(src));
1809*09537850SAkhilesh Sanikop __m128i right = _mm_cvtepu8_epi16(Load4(src + 1));
1810*09537850SAkhilesh Sanikop src += reference_stride;
1811*09537850SAkhilesh Sanikop
1812*09537850SAkhilesh Sanikop __m128i row = _mm_add_epi16(left, right);
1813*09537850SAkhilesh Sanikop
1814*09537850SAkhilesh Sanikop int y = height;
1815*09537850SAkhilesh Sanikop do {
1816*09537850SAkhilesh Sanikop left = Load4(src);
1817*09537850SAkhilesh Sanikop right = Load4(src + 1);
1818*09537850SAkhilesh Sanikop src += reference_stride;
1819*09537850SAkhilesh Sanikop left = _mm_unpacklo_epi32(left, Load4(src));
1820*09537850SAkhilesh Sanikop right = _mm_unpacklo_epi32(right, Load4(src + 1));
1821*09537850SAkhilesh Sanikop src += reference_stride;
1822*09537850SAkhilesh Sanikop
1823*09537850SAkhilesh Sanikop const __m128i below =
1824*09537850SAkhilesh Sanikop _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
1825*09537850SAkhilesh Sanikop const __m128i result =
1826*09537850SAkhilesh Sanikop AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below);
1827*09537850SAkhilesh Sanikop
1828*09537850SAkhilesh Sanikop Store4(dest, result);
1829*09537850SAkhilesh Sanikop dest += pred_stride;
1830*09537850SAkhilesh Sanikop Store4(dest, _mm_srli_si128(result, 4));
1831*09537850SAkhilesh Sanikop dest += pred_stride;
1832*09537850SAkhilesh Sanikop
1833*09537850SAkhilesh Sanikop row = _mm_srli_si128(below, 8);
1834*09537850SAkhilesh Sanikop y -= 2;
1835*09537850SAkhilesh Sanikop } while (y != 0);
1836*09537850SAkhilesh Sanikop } else {
1837*09537850SAkhilesh Sanikop __m128i left = Load2(src);
1838*09537850SAkhilesh Sanikop __m128i right = Load2(src + 1);
1839*09537850SAkhilesh Sanikop src += reference_stride;
1840*09537850SAkhilesh Sanikop
1841*09537850SAkhilesh Sanikop __m128i row =
1842*09537850SAkhilesh Sanikop _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
1843*09537850SAkhilesh Sanikop
1844*09537850SAkhilesh Sanikop int y = height;
1845*09537850SAkhilesh Sanikop do {
1846*09537850SAkhilesh Sanikop left = Load2<0>(src, left);
1847*09537850SAkhilesh Sanikop right = Load2<0>(src + 1, right);
1848*09537850SAkhilesh Sanikop src += reference_stride;
1849*09537850SAkhilesh Sanikop left = Load2<2>(src, left);
1850*09537850SAkhilesh Sanikop right = Load2<2>(src + 1, right);
1851*09537850SAkhilesh Sanikop src += reference_stride;
1852*09537850SAkhilesh Sanikop
1853*09537850SAkhilesh Sanikop const __m128i below =
1854*09537850SAkhilesh Sanikop _mm_add_epi16(_mm_cvtepu8_epi16(left), _mm_cvtepu8_epi16(right));
1855*09537850SAkhilesh Sanikop const __m128i result =
1856*09537850SAkhilesh Sanikop AddU16RightShift2AndPack(_mm_unpacklo_epi64(row, below), below);
1857*09537850SAkhilesh Sanikop
1858*09537850SAkhilesh Sanikop Store2(dest, result);
1859*09537850SAkhilesh Sanikop dest += pred_stride;
1860*09537850SAkhilesh Sanikop Store2(dest, _mm_srli_si128(result, 4));
1861*09537850SAkhilesh Sanikop dest += pred_stride;
1862*09537850SAkhilesh Sanikop
1863*09537850SAkhilesh Sanikop row = _mm_srli_si128(below, 8);
1864*09537850SAkhilesh Sanikop y -= 2;
1865*09537850SAkhilesh Sanikop } while (y != 0);
1866*09537850SAkhilesh Sanikop }
1867*09537850SAkhilesh Sanikop }
1868*09537850SAkhilesh Sanikop
Init8bpp()1869*09537850SAkhilesh Sanikop void Init8bpp() {
1870*09537850SAkhilesh Sanikop Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
1871*09537850SAkhilesh Sanikop assert(dsp != nullptr);
1872*09537850SAkhilesh Sanikop dsp->convolve[0][0][0][1] = ConvolveHorizontal_SSE4_1;
1873*09537850SAkhilesh Sanikop dsp->convolve[0][0][1][0] = ConvolveVertical_SSE4_1;
1874*09537850SAkhilesh Sanikop dsp->convolve[0][0][1][1] = Convolve2D_SSE4_1;
1875*09537850SAkhilesh Sanikop
1876*09537850SAkhilesh Sanikop dsp->convolve[0][1][0][0] = ConvolveCompoundCopy_SSE4_1;
1877*09537850SAkhilesh Sanikop dsp->convolve[0][1][0][1] = ConvolveCompoundHorizontal_SSE4_1;
1878*09537850SAkhilesh Sanikop dsp->convolve[0][1][1][0] = ConvolveCompoundVertical_SSE4_1;
1879*09537850SAkhilesh Sanikop dsp->convolve[0][1][1][1] = ConvolveCompound2D_SSE4_1;
1880*09537850SAkhilesh Sanikop
1881*09537850SAkhilesh Sanikop dsp->convolve[1][0][0][1] = ConvolveIntraBlockCopyHorizontal_SSE4_1;
1882*09537850SAkhilesh Sanikop dsp->convolve[1][0][1][0] = ConvolveIntraBlockCopyVertical_SSE4_1;
1883*09537850SAkhilesh Sanikop dsp->convolve[1][0][1][1] = ConvolveIntraBlockCopy2D_SSE4_1;
1884*09537850SAkhilesh Sanikop
1885*09537850SAkhilesh Sanikop dsp->convolve_scale[0] = ConvolveScale2D_SSE4_1<false>;
1886*09537850SAkhilesh Sanikop dsp->convolve_scale[1] = ConvolveScale2D_SSE4_1<true>;
1887*09537850SAkhilesh Sanikop }
1888*09537850SAkhilesh Sanikop
1889*09537850SAkhilesh Sanikop } // namespace
1890*09537850SAkhilesh Sanikop } // namespace low_bitdepth
1891*09537850SAkhilesh Sanikop
ConvolveInit_SSE4_1()1892*09537850SAkhilesh Sanikop void ConvolveInit_SSE4_1() { low_bitdepth::Init8bpp(); }
1893*09537850SAkhilesh Sanikop
1894*09537850SAkhilesh Sanikop } // namespace dsp
1895*09537850SAkhilesh Sanikop } // namespace libgav1
1896*09537850SAkhilesh Sanikop
1897*09537850SAkhilesh Sanikop #else // !LIBGAV1_TARGETING_SSE4_1
namespace libgav1 {
namespace dsp {

// SSE4.1 is not targeted in this build, so there is nothing to register. The
// function still exists so that callers do not need their own #if guard.
void ConvolveInit_SSE4_1() {}

}  // namespace dsp
}  // namespace libgav1
1905*09537850SAkhilesh Sanikop #endif // LIBGAV1_TARGETING_SSE4_1
1906