/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker
#include <assert.h>
#include <immintrin.h>  // AVX2

#include "aom_dsp/x86/mem_sse2.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"
#include "aom_dsp/x86/transpose_sse2.h"

#include "config/av1_rtcd.h"
#include "av1/common/restoration.h"
#include "av1/encoder/pickrst.h"
21*77c1e3ccSAndroid Build Coastguard Worker
22*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
acc_stat_highbd_avx2(int64_t * dst,const uint16_t * dgd,const __m256i * shuffle,const __m256i * dgd_ijkl)23*77c1e3ccSAndroid Build Coastguard Worker static inline void acc_stat_highbd_avx2(int64_t *dst, const uint16_t *dgd,
24*77c1e3ccSAndroid Build Coastguard Worker const __m256i *shuffle,
25*77c1e3ccSAndroid Build Coastguard Worker const __m256i *dgd_ijkl) {
26*77c1e3ccSAndroid Build Coastguard Worker // Load two 128-bit chunks from dgd
27*77c1e3ccSAndroid Build Coastguard Worker const __m256i s0 = _mm256_inserti128_si256(
28*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)dgd)),
29*77c1e3ccSAndroid Build Coastguard Worker _mm_loadu_si128((__m128i *)(dgd + 4)), 1);
30*77c1e3ccSAndroid Build Coastguard Worker // s0 = [11 10 9 8 7 6 5 4] [7 6 5 4 3 2 1 0] as u16 (values are dgd indices)
31*77c1e3ccSAndroid Build Coastguard Worker // The weird order is so the shuffle stays within 128-bit lanes
32*77c1e3ccSAndroid Build Coastguard Worker
33*77c1e3ccSAndroid Build Coastguard Worker // Shuffle 16x u16 values within lanes according to the mask:
34*77c1e3ccSAndroid Build Coastguard Worker // [0 1 1 2 2 3 3 4] [0 1 1 2 2 3 3 4]
35*77c1e3ccSAndroid Build Coastguard Worker // (Actually we shuffle u8 values as there's no 16-bit shuffle)
36*77c1e3ccSAndroid Build Coastguard Worker const __m256i s1 = _mm256_shuffle_epi8(s0, *shuffle);
37*77c1e3ccSAndroid Build Coastguard Worker // s1 = [8 7 7 6 6 5 5 4] [4 3 3 2 2 1 1 0] as u16 (values are dgd indices)
38*77c1e3ccSAndroid Build Coastguard Worker
39*77c1e3ccSAndroid Build Coastguard Worker // Multiply 16x 16-bit integers in dgd_ijkl and s1, resulting in 16x 32-bit
40*77c1e3ccSAndroid Build Coastguard Worker // integers then horizontally add pairs of these integers resulting in 8x
41*77c1e3ccSAndroid Build Coastguard Worker // 32-bit integers
42*77c1e3ccSAndroid Build Coastguard Worker const __m256i d0 = _mm256_madd_epi16(*dgd_ijkl, s1);
43*77c1e3ccSAndroid Build Coastguard Worker // d0 = [a b c d] [e f g h] as u32
44*77c1e3ccSAndroid Build Coastguard Worker
45*77c1e3ccSAndroid Build Coastguard Worker // Take the lower-half of d0, extend to u64, add it on to dst (H)
46*77c1e3ccSAndroid Build Coastguard Worker const __m256i d0l = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 0));
47*77c1e3ccSAndroid Build Coastguard Worker // d0l = [a b] [c d] as u64
48*77c1e3ccSAndroid Build Coastguard Worker const __m256i dst0 = yy_load_256(dst);
49*77c1e3ccSAndroid Build Coastguard Worker yy_store_256(dst, _mm256_add_epi64(d0l, dst0));
50*77c1e3ccSAndroid Build Coastguard Worker
51*77c1e3ccSAndroid Build Coastguard Worker // Take the upper-half of d0, extend to u64, add it on to dst (H)
52*77c1e3ccSAndroid Build Coastguard Worker const __m256i d0h = _mm256_cvtepu32_epi64(_mm256_extracti128_si256(d0, 1));
53*77c1e3ccSAndroid Build Coastguard Worker // d0h = [e f] [g h] as u64
54*77c1e3ccSAndroid Build Coastguard Worker const __m256i dst1 = yy_load_256(dst + 4);
55*77c1e3ccSAndroid Build Coastguard Worker yy_store_256(dst + 4, _mm256_add_epi64(d0h, dst1));
56*77c1e3ccSAndroid Build Coastguard Worker }
57*77c1e3ccSAndroid Build Coastguard Worker
acc_stat_highbd_win7_one_line_avx2(const uint16_t * dgd,const uint16_t * src,int h_start,int h_end,int dgd_stride,const __m256i * shuffle,int32_t * sumX,int32_t sumY[WIENER_WIN][WIENER_WIN],int64_t M_int[WIENER_WIN][WIENER_WIN],int64_t H_int[WIENER_WIN2][WIENER_WIN * 8])58*77c1e3ccSAndroid Build Coastguard Worker static inline void acc_stat_highbd_win7_one_line_avx2(
59*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *dgd, const uint16_t *src, int h_start, int h_end,
60*77c1e3ccSAndroid Build Coastguard Worker int dgd_stride, const __m256i *shuffle, int32_t *sumX,
61*77c1e3ccSAndroid Build Coastguard Worker int32_t sumY[WIENER_WIN][WIENER_WIN], int64_t M_int[WIENER_WIN][WIENER_WIN],
62*77c1e3ccSAndroid Build Coastguard Worker int64_t H_int[WIENER_WIN2][WIENER_WIN * 8]) {
63*77c1e3ccSAndroid Build Coastguard Worker int j, k, l;
64*77c1e3ccSAndroid Build Coastguard Worker const int wiener_win = WIENER_WIN;
65*77c1e3ccSAndroid Build Coastguard Worker // Main loop handles two pixels at a time
66*77c1e3ccSAndroid Build Coastguard Worker // We can assume that h_start is even, since it will always be aligned to
67*77c1e3ccSAndroid Build Coastguard Worker // a tile edge + some number of restoration units, and both of those will
68*77c1e3ccSAndroid Build Coastguard Worker // be 64-pixel aligned.
69*77c1e3ccSAndroid Build Coastguard Worker // However, at the edge of the image, h_end may be odd, so we need to handle
70*77c1e3ccSAndroid Build Coastguard Worker // that case correctly.
71*77c1e3ccSAndroid Build Coastguard Worker assert(h_start % 2 == 0);
72*77c1e3ccSAndroid Build Coastguard Worker const int h_end_even = h_end & ~1;
73*77c1e3ccSAndroid Build Coastguard Worker const int has_odd_pixel = h_end & 1;
74*77c1e3ccSAndroid Build Coastguard Worker for (j = h_start; j < h_end_even; j += 2) {
75*77c1e3ccSAndroid Build Coastguard Worker const uint16_t X1 = src[j];
76*77c1e3ccSAndroid Build Coastguard Worker const uint16_t X2 = src[j + 1];
77*77c1e3ccSAndroid Build Coastguard Worker *sumX += X1 + X2;
78*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *dgd_ij = dgd + j;
79*77c1e3ccSAndroid Build Coastguard Worker for (k = 0; k < wiener_win; k++) {
80*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
81*77c1e3ccSAndroid Build Coastguard Worker for (l = 0; l < wiener_win; l++) {
82*77c1e3ccSAndroid Build Coastguard Worker int64_t *H_ = &H_int[(l * wiener_win + k)][0];
83*77c1e3ccSAndroid Build Coastguard Worker const uint16_t D1 = dgd_ijk[l];
84*77c1e3ccSAndroid Build Coastguard Worker const uint16_t D2 = dgd_ijk[l + 1];
85*77c1e3ccSAndroid Build Coastguard Worker sumY[k][l] += D1 + D2;
86*77c1e3ccSAndroid Build Coastguard Worker M_int[k][l] += D1 * X1 + D2 * X2;
87*77c1e3ccSAndroid Build Coastguard Worker
88*77c1e3ccSAndroid Build Coastguard Worker // Load two u16 values from dgd_ijkl combined as a u32,
89*77c1e3ccSAndroid Build Coastguard Worker // then broadcast to 8x u32 slots of a 256
90*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l));
91*77c1e3ccSAndroid Build Coastguard Worker // dgd_ijkl = [y x y x y x y x] [y x y x y x y x] where each is a u16
92*77c1e3ccSAndroid Build Coastguard Worker
93*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
94*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
95*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
96*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
97*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
98*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
99*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
100*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
101*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
102*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
103*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle,
104*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
105*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle,
106*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
107*77c1e3ccSAndroid Build Coastguard Worker }
108*77c1e3ccSAndroid Build Coastguard Worker }
109*77c1e3ccSAndroid Build Coastguard Worker }
110*77c1e3ccSAndroid Build Coastguard Worker // If the width is odd, add in the final pixel
111*77c1e3ccSAndroid Build Coastguard Worker if (has_odd_pixel) {
112*77c1e3ccSAndroid Build Coastguard Worker const uint16_t X1 = src[j];
113*77c1e3ccSAndroid Build Coastguard Worker *sumX += X1;
114*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *dgd_ij = dgd + j;
115*77c1e3ccSAndroid Build Coastguard Worker for (k = 0; k < wiener_win; k++) {
116*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
117*77c1e3ccSAndroid Build Coastguard Worker for (l = 0; l < wiener_win; l++) {
118*77c1e3ccSAndroid Build Coastguard Worker int64_t *H_ = &H_int[(l * wiener_win + k)][0];
119*77c1e3ccSAndroid Build Coastguard Worker const uint16_t D1 = dgd_ijk[l];
120*77c1e3ccSAndroid Build Coastguard Worker sumY[k][l] += D1;
121*77c1e3ccSAndroid Build Coastguard Worker M_int[k][l] += D1 * X1;
122*77c1e3ccSAndroid Build Coastguard Worker
123*77c1e3ccSAndroid Build Coastguard Worker // The `acc_stat_highbd_avx2` function wants its input to have
124*77c1e3ccSAndroid Build Coastguard Worker // interleaved copies of two pixels, but we only have one. However, the
125*77c1e3ccSAndroid Build Coastguard Worker // pixels are (effectively) used as inputs to a multiply-accumulate. So
126*77c1e3ccSAndroid Build Coastguard Worker // if we set the extra pixel slot to 0, then it is effectively ignored.
127*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_ijkl = _mm256_set1_epi32((int)D1);
128*77c1e3ccSAndroid Build Coastguard Worker
129*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
130*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
131*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
132*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
133*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
134*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
135*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
136*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
137*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
138*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
139*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 5 * 8, dgd_ij + 5 * dgd_stride, shuffle,
140*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
141*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 6 * 8, dgd_ij + 6 * dgd_stride, shuffle,
142*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
143*77c1e3ccSAndroid Build Coastguard Worker }
144*77c1e3ccSAndroid Build Coastguard Worker }
145*77c1e3ccSAndroid Build Coastguard Worker }
146*77c1e3ccSAndroid Build Coastguard Worker }
147*77c1e3ccSAndroid Build Coastguard Worker
// Computes the Wiener statistics M and H for the 7x7 window over the
// rectangle [h_start, h_end) x [v_start, v_end) of a high-bitdepth frame.
//
//   dgd8, src8 - degraded and source frames, passed as uint8_t pointers and
//                converted with CONVERT_TO_SHORTPTR().
//   dgd_stride,
//   src_stride - row strides in u16 elements.
//   M          - output, WIENER_WIN2 entries; entry (l * WIENER_WIN + k)
//                belongs to window offset (row k, column l).
//   H          - output, WIENER_WIN2 x WIENER_WIN2 entries.
//   bit_depth  - selects the final divisor: 16 for AOM_BITS_12, 4 for
//                AOM_BITS_10, 1 otherwise.
//
// Raw sums are accumulated first. The value returned by find_average_highbd()
// (`avg`) is then removed algebraically:
//   sum((D - avg) * (X - avg)) = sum(D * X) - avg * (sum(D) + sum(X))
//                                + avg * avg * pixel_count
static inline void compute_stats_highbd_win7_opt_avx2(
    const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end,
    int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M,
    int64_t *H, aom_bit_depth_t bit_depth) {
  const int wiener_win = WIENER_WIN;
  const int wiener_win2 = wiener_win * wiener_win;
  const int wiener_halfwin = (wiener_win >> 1);
  const int pixel_count = (h_end - h_start) * (v_end - v_start);
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
  const uint16_t avg =
      find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);

  int64_t M_int[WIENER_WIN][WIENER_WIN] = { { 0 } };
  DECLARE_ALIGNED(32, int64_t, H_int[WIENER_WIN2][WIENER_WIN * 8]) = { { 0 } };
  int32_t sumY[WIENER_WIN][WIENER_WIN] = { { 0 } };
  int32_t sumX = 0;
  // Move from the centre of the window to its top-left corner.
  const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;

  const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data);
  // All accumulators that persist across rows are 64-bit or scalar, so the
  // rows are simply visited top to bottom.
  for (int i = v_start; i < v_end; i++) {
    acc_stat_highbd_win7_one_line_avx2(
        dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
        dgd_stride, &shuffle, &sumX, sumY, M_int, H_int);
  }

  uint8_t bit_depth_divider = 1;
  if (bit_depth == AOM_BITS_12) {
    bit_depth_divider = 16;
  } else if (bit_depth == AOM_BITS_10) {
    bit_depth_divider = 4;
  }

  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
  for (int k = 0; k < wiener_win; k++) {
    for (int l = 0; l < wiener_win; l++) {
      const int32_t idx0 = l * wiener_win + k;
      // Widen before adding so the sum of two 32-bit accumulators cannot
      // overflow int.
      const int64_t sum_xy = (int64_t)sumX + sumY[k][l];
      M[idx0] = (M_int[k][l] + (avg_square_sum - (int64_t)avg * sum_xy)) /
                bit_depth_divider;

      int64_t *H_ = H + idx0 * wiener_win2;
      const int64_t *H_int_ = &H_int[idx0][0];
      for (int m = 0; m < wiener_win; m++) {
        for (int n = 0; n < wiener_win; n++) {
          const int64_t sum_yy = (int64_t)sumY[k][l] + sumY[n][m];
          // H_int stores 8 accumulators per window row, hence the `n * 8`.
          H_[m * wiener_win + n] =
              (H_int_[n * 8 + m] +
               (avg_square_sum - (int64_t)avg * sum_yy)) /
              bit_depth_divider;
        }
      }
    }
  }
}
204*77c1e3ccSAndroid Build Coastguard Worker
acc_stat_highbd_win5_one_line_avx2(const uint16_t * dgd,const uint16_t * src,int h_start,int h_end,int dgd_stride,const __m256i * shuffle,int32_t * sumX,int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8])205*77c1e3ccSAndroid Build Coastguard Worker static inline void acc_stat_highbd_win5_one_line_avx2(
206*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *dgd, const uint16_t *src, int h_start, int h_end,
207*77c1e3ccSAndroid Build Coastguard Worker int dgd_stride, const __m256i *shuffle, int32_t *sumX,
208*77c1e3ccSAndroid Build Coastguard Worker int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
209*77c1e3ccSAndroid Build Coastguard Worker int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA],
210*77c1e3ccSAndroid Build Coastguard Worker int64_t H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) {
211*77c1e3ccSAndroid Build Coastguard Worker int j, k, l;
212*77c1e3ccSAndroid Build Coastguard Worker const int wiener_win = WIENER_WIN_CHROMA;
213*77c1e3ccSAndroid Build Coastguard Worker // Main loop handles two pixels at a time
214*77c1e3ccSAndroid Build Coastguard Worker // We can assume that h_start is even, since it will always be aligned to
215*77c1e3ccSAndroid Build Coastguard Worker // a tile edge + some number of restoration units, and both of those will
216*77c1e3ccSAndroid Build Coastguard Worker // be 64-pixel aligned.
217*77c1e3ccSAndroid Build Coastguard Worker // However, at the edge of the image, h_end may be odd, so we need to handle
218*77c1e3ccSAndroid Build Coastguard Worker // that case correctly.
219*77c1e3ccSAndroid Build Coastguard Worker assert(h_start % 2 == 0);
220*77c1e3ccSAndroid Build Coastguard Worker const int h_end_even = h_end & ~1;
221*77c1e3ccSAndroid Build Coastguard Worker const int has_odd_pixel = h_end & 1;
222*77c1e3ccSAndroid Build Coastguard Worker for (j = h_start; j < h_end_even; j += 2) {
223*77c1e3ccSAndroid Build Coastguard Worker const uint16_t X1 = src[j];
224*77c1e3ccSAndroid Build Coastguard Worker const uint16_t X2 = src[j + 1];
225*77c1e3ccSAndroid Build Coastguard Worker *sumX += X1 + X2;
226*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *dgd_ij = dgd + j;
227*77c1e3ccSAndroid Build Coastguard Worker for (k = 0; k < wiener_win; k++) {
228*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
229*77c1e3ccSAndroid Build Coastguard Worker for (l = 0; l < wiener_win; l++) {
230*77c1e3ccSAndroid Build Coastguard Worker int64_t *H_ = &H_int[(l * wiener_win + k)][0];
231*77c1e3ccSAndroid Build Coastguard Worker const uint16_t D1 = dgd_ijk[l];
232*77c1e3ccSAndroid Build Coastguard Worker const uint16_t D2 = dgd_ijk[l + 1];
233*77c1e3ccSAndroid Build Coastguard Worker sumY[k][l] += D1 + D2;
234*77c1e3ccSAndroid Build Coastguard Worker M_int[k][l] += D1 * X1 + D2 * X2;
235*77c1e3ccSAndroid Build Coastguard Worker
236*77c1e3ccSAndroid Build Coastguard Worker // Load two u16 values from dgd_ijkl combined as a u32,
237*77c1e3ccSAndroid Build Coastguard Worker // then broadcast to 8x u32 slots of a 256
238*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_ijkl = _mm256_set1_epi32(loadu_int32(dgd_ijk + l));
239*77c1e3ccSAndroid Build Coastguard Worker // dgd_ijkl = [x y x y x y x y] [x y x y x y x y] where each is a u16
240*77c1e3ccSAndroid Build Coastguard Worker
241*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
242*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
243*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
244*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
245*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
246*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
247*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
248*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
249*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
250*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
251*77c1e3ccSAndroid Build Coastguard Worker }
252*77c1e3ccSAndroid Build Coastguard Worker }
253*77c1e3ccSAndroid Build Coastguard Worker }
254*77c1e3ccSAndroid Build Coastguard Worker // If the width is odd, add in the final pixel
255*77c1e3ccSAndroid Build Coastguard Worker if (has_odd_pixel) {
256*77c1e3ccSAndroid Build Coastguard Worker const uint16_t X1 = src[j];
257*77c1e3ccSAndroid Build Coastguard Worker *sumX += X1;
258*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *dgd_ij = dgd + j;
259*77c1e3ccSAndroid Build Coastguard Worker for (k = 0; k < wiener_win; k++) {
260*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *dgd_ijk = dgd_ij + k * dgd_stride;
261*77c1e3ccSAndroid Build Coastguard Worker for (l = 0; l < wiener_win; l++) {
262*77c1e3ccSAndroid Build Coastguard Worker int64_t *H_ = &H_int[(l * wiener_win + k)][0];
263*77c1e3ccSAndroid Build Coastguard Worker const uint16_t D1 = dgd_ijk[l];
264*77c1e3ccSAndroid Build Coastguard Worker sumY[k][l] += D1;
265*77c1e3ccSAndroid Build Coastguard Worker M_int[k][l] += D1 * X1;
266*77c1e3ccSAndroid Build Coastguard Worker
267*77c1e3ccSAndroid Build Coastguard Worker // The `acc_stat_highbd_avx2` function wants its input to have
268*77c1e3ccSAndroid Build Coastguard Worker // interleaved copies of two pixels, but we only have one. However, the
269*77c1e3ccSAndroid Build Coastguard Worker // pixels are (effectively) used as inputs to a multiply-accumulate. So
270*77c1e3ccSAndroid Build Coastguard Worker // if we set the extra pixel slot to 0, then it is effectively ignored.
271*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_ijkl = _mm256_set1_epi32((int)D1);
272*77c1e3ccSAndroid Build Coastguard Worker
273*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 0 * 8, dgd_ij + 0 * dgd_stride, shuffle,
274*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
275*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 1 * 8, dgd_ij + 1 * dgd_stride, shuffle,
276*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
277*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 2 * 8, dgd_ij + 2 * dgd_stride, shuffle,
278*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
279*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 3 * 8, dgd_ij + 3 * dgd_stride, shuffle,
280*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
281*77c1e3ccSAndroid Build Coastguard Worker acc_stat_highbd_avx2(H_ + 4 * 8, dgd_ij + 4 * dgd_stride, shuffle,
282*77c1e3ccSAndroid Build Coastguard Worker &dgd_ijkl);
283*77c1e3ccSAndroid Build Coastguard Worker }
284*77c1e3ccSAndroid Build Coastguard Worker }
285*77c1e3ccSAndroid Build Coastguard Worker }
286*77c1e3ccSAndroid Build Coastguard Worker }
287*77c1e3ccSAndroid Build Coastguard Worker
// Computes the Wiener statistics M and H for the 5x5 (chroma) window over the
// rectangle [h_start, h_end) x [v_start, v_end) of a high-bitdepth frame.
//
//   dgd8, src8 - degraded and source frames, passed as uint8_t pointers and
//                converted with CONVERT_TO_SHORTPTR().
//   dgd_stride,
//   src_stride - row strides in u16 elements.
//   M          - output, WIENER_WIN2_CHROMA entries; entry
//                (l * WIENER_WIN_CHROMA + k) belongs to window offset
//                (row k, column l).
//   H          - output, WIENER_WIN2_CHROMA x WIENER_WIN2_CHROMA entries.
//   bit_depth  - selects the final divisor: 16 for AOM_BITS_12, 4 for
//                AOM_BITS_10, 1 otherwise.
//
// Raw sums are accumulated first. The value returned by find_average_highbd()
// (`avg`) is then removed algebraically:
//   sum((D - avg) * (X - avg)) = sum(D * X) - avg * (sum(D) + sum(X))
//                                + avg * avg * pixel_count
static inline void compute_stats_highbd_win5_opt_avx2(
    const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end,
    int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M,
    int64_t *H, aom_bit_depth_t bit_depth) {
  const int wiener_win = WIENER_WIN_CHROMA;
  const int wiener_win2 = wiener_win * wiener_win;
  const int wiener_halfwin = (wiener_win >> 1);
  const int pixel_count = (h_end - h_start) * (v_end - v_start);
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
  const uint16_t avg =
      find_average_highbd(dgd, h_start, h_end, v_start, v_end, dgd_stride);

  int64_t M_int[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
  DECLARE_ALIGNED(
      32, int64_t,
      H_int[WIENER_WIN2_CHROMA][WIENER_WIN_CHROMA * 8]) = { { 0 } };
  int32_t sumY[WIENER_WIN_CHROMA][WIENER_WIN_CHROMA] = { { 0 } };
  int32_t sumX = 0;
  // Move from the centre of the window to its top-left corner.
  const uint16_t *dgd_win = dgd - wiener_halfwin * dgd_stride - wiener_halfwin;

  const __m256i shuffle = yy_loadu_256(g_shuffle_stats_highbd_data);
  // All accumulators that persist across rows are 64-bit or scalar, so the
  // rows are simply visited top to bottom.
  for (int i = v_start; i < v_end; i++) {
    acc_stat_highbd_win5_one_line_avx2(
        dgd_win + i * dgd_stride, src + i * src_stride, h_start, h_end,
        dgd_stride, &shuffle, &sumX, sumY, M_int, H_int);
  }

  uint8_t bit_depth_divider = 1;
  if (bit_depth == AOM_BITS_12) {
    bit_depth_divider = 16;
  } else if (bit_depth == AOM_BITS_10) {
    bit_depth_divider = 4;
  }

  const int64_t avg_square_sum = (int64_t)avg * (int64_t)avg * pixel_count;
  for (int k = 0; k < wiener_win; k++) {
    for (int l = 0; l < wiener_win; l++) {
      const int32_t idx0 = l * wiener_win + k;
      // Widen before adding so the sum of two 32-bit accumulators cannot
      // overflow int.
      const int64_t sum_xy = (int64_t)sumX + sumY[k][l];
      M[idx0] = (M_int[k][l] + (avg_square_sum - (int64_t)avg * sum_xy)) /
                bit_depth_divider;

      int64_t *H_ = H + idx0 * wiener_win2;
      const int64_t *H_int_ = &H_int[idx0][0];
      for (int m = 0; m < wiener_win; m++) {
        for (int n = 0; n < wiener_win; n++) {
          const int64_t sum_yy = (int64_t)sumY[k][l] + sumY[n][m];
          // H_int stores 8 accumulators per window row, hence the `n * 8`.
          H_[m * wiener_win + n] =
              (H_int_[n * 8 + m] +
               (avg_square_sum - (int64_t)avg * sum_yy)) /
              bit_depth_divider;
        }
      }
    }
  }
}
346*77c1e3ccSAndroid Build Coastguard Worker
// AVX2 entry point for the high-bitdepth Wiener statistics.
//
// Dispatches on the window size: WIENER_WIN and WIENER_WIN_CHROMA use the
// specialised kernels in this file; any other size is forwarded to
// av1_compute_stats_highbd_c() with all arguments unchanged.
//
// dgd_avg and src_avg are only consumed by the C fallback; the AVX2 kernels
// do not take them.
void av1_compute_stats_highbd_avx2(int wiener_win, const uint8_t *dgd8,
                                   const uint8_t *src8, int16_t *dgd_avg,
                                   int16_t *src_avg, int h_start, int h_end,
                                   int v_start, int v_end, int dgd_stride,
                                   int src_stride, int64_t *M, int64_t *H,
                                   aom_bit_depth_t bit_depth) {
  // Unused on the two AVX2 paths below.
  (void)dgd_avg;
  (void)src_avg;

  if (wiener_win == WIENER_WIN) {
    compute_stats_highbd_win7_opt_avx2(dgd8, src8, h_start, h_end, v_start,
                                       v_end, dgd_stride, src_stride, M, H,
                                       bit_depth);
  } else if (wiener_win == WIENER_WIN_CHROMA) {
    compute_stats_highbd_win5_opt_avx2(dgd8, src8, h_start, h_end, v_start,
                                       v_end, dgd_stride, src_stride, M, H,
                                       bit_depth);
  } else {
    av1_compute_stats_highbd_c(wiener_win, dgd8, src8, dgd_avg, src_avg,
                               h_start, h_end, v_start, v_end, dgd_stride,
                               src_stride, M, H, bit_depth);
  }
}
371*77c1e3ccSAndroid Build Coastguard Worker #endif // CONFIG_AV1_HIGHBITDEPTH
372*77c1e3ccSAndroid Build Coastguard Worker
// Multiplies src and dgd as 16x signed 16-bit values, adds each adjacent pair
// of products into a 32-bit value, and adds the 8 results to the 8x 32-bit
// accumulators in *sum. The 32-bit addition wraps on overflow.
static inline void madd_and_accum_avx2(__m256i src, __m256i dgd, __m256i *sum) {
  const __m256i pair_products = _mm256_madd_epi16(src, dgd);
  *sum = _mm256_add_epi32(*sum, pair_products);
}
376*77c1e3ccSAndroid Build Coastguard Worker
// Sign-extends the 8x 32-bit values of src to 64 bits and adds the upper
// 128-bit lane to the lower one.
//
// Returns 4x int64: result[i] = (int64_t)src[i] + (int64_t)src[i + 4].
static inline __m256i convert_and_add_avx2(__m256i src) {
  const __m128i lo_lane = _mm256_castsi256_si128(src);
  const __m128i hi_lane = _mm256_extracti128_si256(src, 1);
  return _mm256_add_epi64(_mm256_cvtepi32_epi64(lo_lane),
                          _mm256_cvtepi32_epi64(hi_lane));
}
382*77c1e3ccSAndroid Build Coastguard Worker
hadd_four_32_to_64_avx2(__m256i src0,__m256i src1,__m256i * src2,__m256i * src3)383*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i hadd_four_32_to_64_avx2(__m256i src0, __m256i src1,
384*77c1e3ccSAndroid Build Coastguard Worker __m256i *src2, __m256i *src3) {
385*77c1e3ccSAndroid Build Coastguard Worker // 00 01 10 11 02 03 12 13
386*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_0 = _mm256_hadd_epi32(src0, src1);
387*77c1e3ccSAndroid Build Coastguard Worker // 20 21 30 31 22 23 32 33
388*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_1 = _mm256_hadd_epi32(*src2, *src3);
389*77c1e3ccSAndroid Build Coastguard Worker // 00+01 10+11 20+21 30+31 02+03 12+13 22+23 32+33
390*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_2 = _mm256_hadd_epi32(s_0, s_1);
391*77c1e3ccSAndroid Build Coastguard Worker return convert_and_add_avx2(s_2);
392*77c1e3ccSAndroid Build Coastguard Worker }
393*77c1e3ccSAndroid Build Coastguard Worker
// Horizontally sums the four 64-bit lanes of src0 and of src1. Returns a
// 128-bit vector whose lane 0 is the total of src0 and lane 1 the total of
// src1.
static inline __m128i add_64bit_lvl_avx2(__m256i src0, __m256i src1) {
  // Even-indexed lanes of both inputs: s0[0] s1[0] | s0[2] s1[2]
  const __m256i even_lanes = _mm256_unpacklo_epi64(src0, src1);
  // Odd-indexed lanes of both inputs:  s0[1] s1[1] | s0[3] s1[3]
  const __m256i odd_lanes = _mm256_unpackhi_epi64(src0, src1);
  // s0[0]+s0[1] s1[0]+s1[1] | s0[2]+s0[3] s1[2]+s1[3]
  const __m256i pair_sums = _mm256_add_epi64(even_lanes, odd_lanes);
  // Fold the upper 128 bits onto the lower 128 bits.
  return _mm_add_epi64(_mm256_castsi256_si128(pair_sums),
                       _mm256_extracti128_si256(pair_sums, 1));
}
408*77c1e3ccSAndroid Build Coastguard Worker
convert_32_to_64_add_avx2(__m256i src0,__m256i src1)409*77c1e3ccSAndroid Build Coastguard Worker static inline __m128i convert_32_to_64_add_avx2(__m256i src0, __m256i src1) {
410*77c1e3ccSAndroid Build Coastguard Worker // 00 01 02 03
411*77c1e3ccSAndroid Build Coastguard Worker const __m256i s0 = convert_and_add_avx2(src0);
412*77c1e3ccSAndroid Build Coastguard Worker // 10 11 12 13
413*77c1e3ccSAndroid Build Coastguard Worker const __m256i s1 = convert_and_add_avx2(src1);
414*77c1e3ccSAndroid Build Coastguard Worker return add_64bit_lvl_avx2(s0, s1);
415*77c1e3ccSAndroid Build Coastguard Worker }
416*77c1e3ccSAndroid Build Coastguard Worker
// Returns the sum of the eight 32-bit lanes of src. All additions are 32-bit
// vector adds, so the total wraps modulo 2^32.
static inline int32_t calc_sum_of_register(__m256i src) {
  // 8 lanes -> 4 lanes: add the upper 128 bits to the lower 128 bits.
  const __m128i fold4 = _mm_add_epi32(_mm256_castsi256_si128(src),
                                      _mm256_extracti128_si256(src, 1));
  // 4 lanes -> 2 lanes: add lanes 2,3 onto lanes 0,1.
  const __m128i fold2 = _mm_add_epi32(fold4, _mm_srli_si128(fold4, 8));
  // 2 lanes -> 1 lane: add lane 1 onto lane 0.
  const __m128i fold1 = _mm_add_epi32(fold2, _mm_srli_si128(fold2, 4));
  return _mm_cvtsi128_si32(fold1);
}
425*77c1e3ccSAndroid Build Coastguard Worker
// Transposes a 4x4 matrix of 64-bit elements. src[r] holds row r of the input
// and dst[c] receives column c of the input (i.e. row c of the transpose).
// src and dst are separate four-element arrays; all of src is read before any
// element of dst is written.
static inline void transpose_64bit_4x4_avx2(const __m256i *const src,
                                            __m256i *const dst) {
  // Interleave row pairs within each 128-bit half. With "rc" meaning row r,
  // column c of the input:
  //   lo_01: 00 10 | 02 12      lo_23: 20 30 | 22 32
  //   hi_01: 01 11 | 03 13      hi_23: 21 31 | 23 33
  const __m256i lo_01 = _mm256_unpacklo_epi64(src[0], src[1]);
  const __m256i hi_01 = _mm256_unpackhi_epi64(src[0], src[1]);
  const __m256i lo_23 = _mm256_unpacklo_epi64(src[2], src[3]);
  const __m256i hi_23 = _mm256_unpackhi_epi64(src[2], src[3]);

  // Recombine 128-bit halves. Selector 0x20 takes the lower half of each
  // operand, 0x31 takes the upper half of each operand:
  //   dst[0]: 00 10 20 30      dst[1]: 01 11 21 31
  //   dst[2]: 02 12 22 32      dst[3]: 03 13 23 33
  dst[0] = _mm256_permute2x128_si256(lo_01, lo_23, 0x20);
  dst[1] = _mm256_permute2x128_si256(hi_01, hi_23, 0x20);
  dst[2] = _mm256_permute2x128_si256(lo_01, lo_23, 0x31);
  dst[3] = _mm256_permute2x128_si256(hi_01, hi_23, 0x31);
}
453*77c1e3ccSAndroid Build Coastguard Worker
// Sliding mask for 8-bit data: 16 all-ones bytes followed by 16 zero bytes.
// A 16-byte load starting at index (16 - n), for 0 <= n <= 16, yields n
// leading all-ones bytes followed by (16 - n) zero bytes. ANDing loaded pixels
// with such a mask zeroes the values that were loaded beyond the required
// width.
static const int8_t mask_8bit[32] = {
  -1, -1, -1, -1, -1, -1, -1, -1,  // entries  0..7:  keep
  -1, -1, -1, -1, -1, -1, -1, -1,  // entries  8..15: keep
  0,  0,  0,  0,  0,  0,  0,  0,   // entries 16..23: clear
  0,  0,  0,  0,  0,  0,  0,  0,   // entries 24..31: clear
};
460*77c1e3ccSAndroid Build Coastguard Worker
// Sliding mask for 16-bit data: 16 all-ones int16_t values (32 bytes) followed
// by 16 zero values (32 bytes). A 16-value (256-bit) load starting at index
// (16 - n), for 0 <= n <= 16, yields n leading all-ones lanes followed by
// (16 - n) zero lanes. ANDing loaded values with such a mask zeroes the lanes
// that were loaded beyond the required width.
static const int16_t mask_16bit[32] = {
  -1, -1, -1, -1, -1, -1, -1, -1,  // values  0..7:  keep
  -1, -1, -1, -1, -1, -1, -1, -1,  // values  8..15: keep
  0,  0,  0,  0,  0,  0,  0,  0,   // values 16..23: clear
  0,  0,  0,  0,  0,  0,  0,  0,   // values 24..31: clear
};
467*77c1e3ccSAndroid Build Coastguard Worker
calc_dgd_buf_avg_avx2(const uint8_t * src,int32_t h_start,int32_t h_end,int32_t v_start,int32_t v_end,int32_t stride)468*77c1e3ccSAndroid Build Coastguard Worker static inline uint8_t calc_dgd_buf_avg_avx2(const uint8_t *src, int32_t h_start,
469*77c1e3ccSAndroid Build Coastguard Worker int32_t h_end, int32_t v_start,
470*77c1e3ccSAndroid Build Coastguard Worker int32_t v_end, int32_t stride) {
471*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src_temp = src + v_start * stride + h_start;
472*77c1e3ccSAndroid Build Coastguard Worker const __m256i zero = _mm256_setzero_si256();
473*77c1e3ccSAndroid Build Coastguard Worker const int32_t width = h_end - h_start;
474*77c1e3ccSAndroid Build Coastguard Worker const int32_t height = v_end - v_start;
475*77c1e3ccSAndroid Build Coastguard Worker const int32_t wd_beyond_mul32 = width & 31;
476*77c1e3ccSAndroid Build Coastguard Worker const int32_t wd_mul32 = width - wd_beyond_mul32;
477*77c1e3ccSAndroid Build Coastguard Worker __m128i mask_low, mask_high;
478*77c1e3ccSAndroid Build Coastguard Worker __m256i ss = zero;
479*77c1e3ccSAndroid Build Coastguard Worker
480*77c1e3ccSAndroid Build Coastguard Worker // When width is not multiple of 32, it still loads 32 and to make the data
481*77c1e3ccSAndroid Build Coastguard Worker // which is extra (beyond required) as zero using the below mask.
482*77c1e3ccSAndroid Build Coastguard Worker if (wd_beyond_mul32 >= 16) {
483*77c1e3ccSAndroid Build Coastguard Worker mask_low = _mm_set1_epi8(-1);
484*77c1e3ccSAndroid Build Coastguard Worker mask_high = _mm_loadu_si128((__m128i *)(&mask_8bit[32 - wd_beyond_mul32]));
485*77c1e3ccSAndroid Build Coastguard Worker } else {
486*77c1e3ccSAndroid Build Coastguard Worker mask_low = _mm_loadu_si128((__m128i *)(&mask_8bit[16 - wd_beyond_mul32]));
487*77c1e3ccSAndroid Build Coastguard Worker mask_high = _mm_setzero_si128();
488*77c1e3ccSAndroid Build Coastguard Worker }
489*77c1e3ccSAndroid Build Coastguard Worker const __m256i mask =
490*77c1e3ccSAndroid Build Coastguard Worker _mm256_inserti128_si256(_mm256_castsi128_si256(mask_low), mask_high, 1);
491*77c1e3ccSAndroid Build Coastguard Worker
492*77c1e3ccSAndroid Build Coastguard Worker int32_t proc_ht = 0;
493*77c1e3ccSAndroid Build Coastguard Worker do {
494*77c1e3ccSAndroid Build Coastguard Worker // Process width in multiple of 32.
495*77c1e3ccSAndroid Build Coastguard Worker int32_t proc_wd = 0;
496*77c1e3ccSAndroid Build Coastguard Worker while (proc_wd < wd_mul32) {
497*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_0 = _mm256_loadu_si256((__m256i *)(src_temp + proc_wd));
498*77c1e3ccSAndroid Build Coastguard Worker const __m256i sad_0 = _mm256_sad_epu8(s_0, zero);
499*77c1e3ccSAndroid Build Coastguard Worker ss = _mm256_add_epi32(ss, sad_0);
500*77c1e3ccSAndroid Build Coastguard Worker proc_wd += 32;
501*77c1e3ccSAndroid Build Coastguard Worker }
502*77c1e3ccSAndroid Build Coastguard Worker
503*77c1e3ccSAndroid Build Coastguard Worker // Process the remaining width.
504*77c1e3ccSAndroid Build Coastguard Worker if (wd_beyond_mul32) {
505*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_0 = _mm256_loadu_si256((__m256i *)(src_temp + proc_wd));
506*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_m_0 = _mm256_and_si256(s_0, mask);
507*77c1e3ccSAndroid Build Coastguard Worker const __m256i sad_0 = _mm256_sad_epu8(s_m_0, zero);
508*77c1e3ccSAndroid Build Coastguard Worker ss = _mm256_add_epi32(ss, sad_0);
509*77c1e3ccSAndroid Build Coastguard Worker }
510*77c1e3ccSAndroid Build Coastguard Worker src_temp += stride;
511*77c1e3ccSAndroid Build Coastguard Worker proc_ht++;
512*77c1e3ccSAndroid Build Coastguard Worker } while (proc_ht < height);
513*77c1e3ccSAndroid Build Coastguard Worker
514*77c1e3ccSAndroid Build Coastguard Worker const uint32_t sum = calc_sum_of_register(ss);
515*77c1e3ccSAndroid Build Coastguard Worker const uint8_t avg = sum / (width * height);
516*77c1e3ccSAndroid Build Coastguard Worker return avg;
517*77c1e3ccSAndroid Build Coastguard Worker }
518*77c1e3ccSAndroid Build Coastguard Worker
519*77c1e3ccSAndroid Build Coastguard Worker // Fill (src-avg) or (dgd-avg) buffers. Note that when n = (width % 16) is not
520*77c1e3ccSAndroid Build Coastguard Worker // 0, it writes (16 - n) more data than required.
sub_avg_block_avx2(const uint8_t * src,int32_t src_stride,uint8_t avg,int32_t width,int32_t height,int16_t * dst,int32_t dst_stride,int use_downsampled_wiener_stats)521*77c1e3ccSAndroid Build Coastguard Worker static inline void sub_avg_block_avx2(const uint8_t *src, int32_t src_stride,
522*77c1e3ccSAndroid Build Coastguard Worker uint8_t avg, int32_t width,
523*77c1e3ccSAndroid Build Coastguard Worker int32_t height, int16_t *dst,
524*77c1e3ccSAndroid Build Coastguard Worker int32_t dst_stride,
525*77c1e3ccSAndroid Build Coastguard Worker int use_downsampled_wiener_stats) {
526*77c1e3ccSAndroid Build Coastguard Worker const __m256i avg_reg = _mm256_set1_epi16(avg);
527*77c1e3ccSAndroid Build Coastguard Worker
528*77c1e3ccSAndroid Build Coastguard Worker int32_t proc_ht = 0;
529*77c1e3ccSAndroid Build Coastguard Worker do {
530*77c1e3ccSAndroid Build Coastguard Worker int ds_factor =
531*77c1e3ccSAndroid Build Coastguard Worker use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
532*77c1e3ccSAndroid Build Coastguard Worker if (use_downsampled_wiener_stats &&
533*77c1e3ccSAndroid Build Coastguard Worker (height - proc_ht < WIENER_STATS_DOWNSAMPLE_FACTOR)) {
534*77c1e3ccSAndroid Build Coastguard Worker ds_factor = height - proc_ht;
535*77c1e3ccSAndroid Build Coastguard Worker }
536*77c1e3ccSAndroid Build Coastguard Worker
537*77c1e3ccSAndroid Build Coastguard Worker int32_t proc_wd = 0;
538*77c1e3ccSAndroid Build Coastguard Worker while (proc_wd < width) {
539*77c1e3ccSAndroid Build Coastguard Worker const __m128i s = _mm_loadu_si128((__m128i *)(src + proc_wd));
540*77c1e3ccSAndroid Build Coastguard Worker const __m256i ss = _mm256_cvtepu8_epi16(s);
541*77c1e3ccSAndroid Build Coastguard Worker const __m256i d = _mm256_sub_epi16(ss, avg_reg);
542*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)(dst + proc_wd), d);
543*77c1e3ccSAndroid Build Coastguard Worker proc_wd += 16;
544*77c1e3ccSAndroid Build Coastguard Worker }
545*77c1e3ccSAndroid Build Coastguard Worker
546*77c1e3ccSAndroid Build Coastguard Worker src += ds_factor * src_stride;
547*77c1e3ccSAndroid Build Coastguard Worker dst += ds_factor * dst_stride;
548*77c1e3ccSAndroid Build Coastguard Worker proc_ht += ds_factor;
549*77c1e3ccSAndroid Build Coastguard Worker } while (proc_ht < height);
550*77c1e3ccSAndroid Build Coastguard Worker }
551*77c1e3ccSAndroid Build Coastguard Worker
// Mirrors the upper-triangular elements of the wiener_win2 x wiener_win2
// matrix H (row-major, 64-bit elements) into its lower triangle, working on
// transposed 4x4 tiles. The tiling touches rows and columns i..i+4 for
// i = 0, 4, 8, ..., so it stays inside the matrix only when
// (wiener_win2 - 1) is a multiple of 4 (true for 25 and 49).
static inline void fill_lower_triag_elements_avx2(const int32_t wiener_win2,
                                                  int64_t *const H) {
  for (int32_t i = 0; i < wiener_win2 - 1; i += 4) {
    __m256i rows[4], cols[4];

    // Tile next to the diagonal: rows i..i+3, columns i+1..i+4. All four loads
    // must happen before the stores below, which write into the same rows.
    int64_t *const diag = H + i * wiener_win2 + i;  // &H[i][i]
    for (int r = 0; r < 4; ++r) {
      rows[r] =
          _mm256_loadu_si256((const __m256i *)(diag + r * wiener_win2 + 1));
    }

    transpose_64bit_4x4_avx2(rows, cols);

    // cols[k] is column (i + 1 + k) of rows i..i+3 and belongs in row
    // (i + 1 + k), columns i..i+3. Only the part left of the diagonal is
    // stored for the first two rows; the third store also rewrites the
    // diagonal element H[i+3][i+3] with its own value.
    _mm_storel_epi64((__m128i *)(diag + 1 * wiener_win2),
                     _mm256_castsi256_si128(cols[0]));
    _mm_storeu_si128((__m128i *)(diag + 2 * wiener_win2),
                     _mm256_castsi256_si128(cols[1]));
    _mm256_storeu_si256((__m256i *)(diag + 3 * wiener_win2), cols[2]);
    _mm256_storeu_si256((__m256i *)(diag + 4 * wiener_win2), cols[3]);

    // Remaining tiles to the right: rows i..i+3, columns j..j+3 are written
    // transposed to rows j..j+3, columns i..i+3.
    for (int32_t j = i + 5; j < wiener_win2; j += 4) {
      const int64_t *const upper = H + i * wiener_win2 + j;  // &H[i][j]
      int64_t *const lower = H + j * wiener_win2 + i;        // &H[j][i]

      for (int r = 0; r < 4; ++r) {
        rows[r] =
            _mm256_loadu_si256((const __m256i *)(upper + r * wiener_win2));
      }

      transpose_64bit_4x4_avx2(rows, cols);

      for (int r = 0; r < 4; ++r) {
        _mm256_storeu_si256((__m256i *)(lower + r * wiener_win2), cols[r]);
      }
    }
  }
}
588*77c1e3ccSAndroid Build Coastguard Worker
// Accumulates H statistics for the first loop_count window rows. For each row
// it loads 16 int16_t values from (d) + row * d_stride and adds their
// pairwise products with dgd_mul_df into sum_h[row]. Expects d_stride,
// dgd_mul_df and sum_h in the enclosing scope. Expands to a complete 'for'
// statement, so no trailing ';' is required at the point of use.
#define INIT_H_VALUES(d, loop_count)                                   \
  for (int win_row = 0; win_row < (loop_count); win_row++) {           \
    madd_and_accum_avx2(                                               \
        dgd_mul_df,                                                    \
        _mm256_loadu_si256((__m256i *)((d) + (win_row * d_stride))),   \
        &sum_h[win_row]);                                              \
  }
596*77c1e3ccSAndroid Build Coastguard Worker
// Accumulates M and H statistics for all wiener_win window rows. For each row
// it loads 16 int16_t values from (d) + row * d_stride and adds their
// pairwise products with src_mul_df into sum_m[row] and with dgd_mul_df into
// sum_h[row]. Expects wiener_win, d_stride, src_mul_df, dgd_mul_df, sum_m and
// sum_h in the enclosing scope. Expands to a complete 'for' statement.
#define INIT_MH_VALUES(d)                                              \
  for (int win_row = 0; win_row < wiener_win; win_row++) {             \
    const __m256i dgd_row =                                            \
        _mm256_loadu_si256((__m256i *)((d) + (win_row * d_stride)));   \
    madd_and_accum_avx2(src_mul_df, dgd_row, &sum_m[win_row]);         \
    madd_and_accum_avx2(dgd_mul_df, dgd_row, &sum_h[win_row]);         \
  }
605*77c1e3ccSAndroid Build Coastguard Worker
// Sets up the per-'i' state used by the H computation steps. Assigns the
// existing variables j (= i / wiener_window_sz) and downsample_factor, and
// declares in the current scope:
//   proc_ht       - row cursor, starting at v_start
//   d_window      - d advanced by j columns
//   d_current_row - d_window advanced by (i % wiener_window_sz) rows
//   sum_h         - wiener_window_sz accumulators, cleared to zero
// Expects i, j, d, d_stride, v_start, downsample_factor and
// use_downsampled_wiener_stats in the enclosing scope. The expansion already
// ends with ';'.
#define INITIALIZATION(wiener_window_sz)                                 \
  j = i / (wiener_window_sz);                                            \
  downsample_factor =                                                    \
      use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; \
  int proc_ht = v_start;                                                 \
  const int16_t *d_window = d + j;                                       \
  const int16_t *d_current_row =                                         \
      d_window + ((i % (wiener_window_sz)) * d_stride);                  \
  __m256i sum_h[wiener_window_sz];                                       \
  memset(sum_h, 0, sizeof(sum_h));
617*77c1e3ccSAndroid Build Coastguard Worker
// Per-row setup for the statistics loops. When downsampling is enabled and
// fewer than WIENER_STATS_DOWNSAMPLE_FACTOR rows remain before v_end,
// downsample_factor is reduced to the number of rows left. Declares in the
// current scope proc_wd (column cursor, initialised to 0) and df_reg
// (downsample_factor broadcast to every 16-bit lane). Expects
// use_downsampled_wiener_stats, v_end, proc_ht and downsample_factor in the
// enclosing scope. The expansion already ends with ';', so it may be used
// with or without a trailing ';'.
#define UPDATE_DOWNSAMPLE_FACTOR                                   \
  if (use_downsampled_wiener_stats &&                              \
      (WIENER_STATS_DOWNSAMPLE_FACTOR > (v_end - proc_ht))) {      \
    downsample_factor = v_end - proc_ht;                           \
  }                                                                \
  int proc_wd = 0;                                                 \
  const __m256i df_reg = _mm256_set1_epi16(downsample_factor);
626*77c1e3ccSAndroid Build Coastguard Worker
// Fills the remaining 5-entry groups of row i of H (window size 5). For every
// j from its current value up to wiener_win - 1 it walks the region from
// v_start to v_end, multiplying the samples at d_current_row (scaled by the
// downsample factor) with the samples of five consecutive rows starting at
// d_window + j, and stores the five 64-bit totals to
// H[i * wiener_win2 + wiener_win * j + 0..4]. j is incremented for each
// group, so j == wiener_win on exit.
// Expects i, j, d, d_stride, d_window, d_current_row, sum_h (>= 5 entries),
// proc_ht, downsample_factor, v_start, v_end, wd_mul16, wd_beyond_mul16, mask,
// wiener_win, wiener_win2, H and use_downsampled_wiener_stats in the enclosing
// scope. Expands to a complete 'while' statement.
#define CALCULATE_REMAINING_H_WIN5                                            \
  while (j < wiener_win) {                                                    \
    /* Restart both cursors and the accumulators for this value of j. */      \
    d_window = d;                                                             \
    d_current_row = d + (i / wiener_win) + ((i % wiener_win) * d_stride);     \
    for (int acc_idx = 0; acc_idx < 5; acc_idx++) {                           \
      sum_h[acc_idx] = _mm256_setzero_si256();                                \
    }                                                                         \
    proc_ht = v_start;                                                        \
    downsample_factor =                                                       \
        use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;    \
                                                                              \
    do {                                                                      \
      UPDATE_DOWNSAMPLE_FACTOR                                                \
                                                                              \
      /* Columns that form complete groups of 16. */                          \
      for (; proc_wd < wd_mul16; proc_wd += 16) {                             \
        const __m256i cur_px =                                                \
            _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));         \
        const __m256i dgd_mul_df = _mm256_mullo_epi16(cur_px, df_reg);        \
        INIT_H_VALUES(d_window + j + proc_wd, 5)                              \
      }                                                                       \
                                                                              \
      /* Left-over columns: zero the lanes beyond the region width. */        \
      if (wd_beyond_mul16) {                                                  \
        const __m256i cur_px = _mm256_and_si256(                              \
            _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)), mask);  \
        const __m256i dgd_mul_df = _mm256_mullo_epi16(cur_px, df_reg);        \
        INIT_H_VALUES(d_window + j + proc_wd, 5)                              \
      }                                                                       \
                                                                              \
      proc_ht += downsample_factor;                                           \
      d_window += downsample_factor * d_stride;                               \
      d_current_row += downsample_factor * d_stride;                          \
    } while (proc_ht < v_end);                                                \
                                                                              \
    /* Entries 0..3 via one 256-bit store, entry 4 via a 64-bit store. */     \
    int64_t *const h_out = H + (i * wiener_win2) + (wiener_win * j);          \
    _mm256_storeu_si256(                                                      \
        (__m256i *)h_out,                                                     \
        hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]));   \
    const __m256i wide_h4 = convert_and_add_avx2(sum_h[4]);                   \
    _mm_storel_epi64((__m128i *)(h_out + 4),                                  \
                     add_64bit_lvl_avx2(wide_h4, wide_h4));                   \
    j++;                                                                      \
  }
676*77c1e3ccSAndroid Build Coastguard Worker
// Fills the remaining 7-entry groups of row i of H (window size 7). For every
// j from its current value up to wiener_win - 1 it walks the region from
// v_start to v_end, multiplying the samples at d_current_row (scaled by the
// downsample factor) with the samples of seven consecutive rows starting at
// d_window + j, and stores the 64-bit totals starting at
// H[i * wiener_win2 + wiener_win * j]. j is incremented for each group, so
// j == wiener_win on exit.
// Expects i, j, d, d_stride, d_window, d_current_row, sum_h (>= 7 entries),
// proc_ht, downsample_factor, v_start, v_end, wd_mul16, wd_beyond_mul16, mask,
// wiener_win, wiener_win2, H and use_downsampled_wiener_stats in the enclosing
// scope. Expands to a complete 'while' statement.
// NOTE(review): the second 256-bit store covers offsets +4..+7. Offset +7
// receives a duplicate of the sum_h[6] total and, assuming wiener_win is 7
// here, lies one element past this group's seven entries - confirm that slot
// is always valid and overwritten afterwards by the caller's sequence.
#define CALCULATE_REMAINING_H_WIN7                                            \
  while (j < wiener_win) {                                                    \
    /* Restart both cursors and the accumulators for this value of j. */      \
    d_window = d;                                                             \
    d_current_row = d + (i / wiener_win) + ((i % wiener_win) * d_stride);     \
    for (int acc_idx = 0; acc_idx < 7; acc_idx++) {                           \
      sum_h[acc_idx] = _mm256_setzero_si256();                                \
    }                                                                         \
    proc_ht = v_start;                                                        \
    downsample_factor =                                                       \
        use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;    \
                                                                              \
    do {                                                                      \
      UPDATE_DOWNSAMPLE_FACTOR                                                \
                                                                              \
      /* Columns that form complete groups of 16. */                          \
      for (; proc_wd < wd_mul16; proc_wd += 16) {                             \
        const __m256i cur_px =                                                \
            _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));         \
        const __m256i dgd_mul_df = _mm256_mullo_epi16(cur_px, df_reg);        \
        INIT_H_VALUES(d_window + j + proc_wd, 7)                              \
      }                                                                       \
                                                                              \
      /* Left-over columns: zero the lanes beyond the region width. */        \
      if (wd_beyond_mul16) {                                                  \
        const __m256i cur_px = _mm256_and_si256(                              \
            _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd)), mask);  \
        const __m256i dgd_mul_df = _mm256_mullo_epi16(cur_px, df_reg);        \
        INIT_H_VALUES(d_window + j + proc_wd, 7)                              \
      }                                                                       \
                                                                              \
      proc_ht += downsample_factor;                                           \
      d_window += downsample_factor * d_stride;                               \
      d_current_row += downsample_factor * d_stride;                          \
    } while (proc_ht < v_end);                                                \
                                                                              \
    /* Entries 0..3, then entries 4..6 (sum_h[6] is passed twice to fill */   \
    /* the fourth lane of the second store). */                               \
    int64_t *const h_out = H + (i * wiener_win2) + (wiener_win * j);          \
    _mm256_storeu_si256(                                                      \
        (__m256i *)h_out,                                                     \
        hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]));   \
    _mm256_storeu_si256(                                                      \
        (__m256i *)(h_out + 4),                                               \
        hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]));   \
    j++;                                                                      \
  }
728*77c1e3ccSAndroid Build Coastguard Worker
729*77c1e3ccSAndroid Build Coastguard Worker // The buffers H(auto-covariance) and M(cross-correlation) are used to estimate
730*77c1e3ccSAndroid Build Coastguard Worker // the filter tap values required for wiener filtering. Here, the buffer H is of
731*77c1e3ccSAndroid Build Coastguard Worker // size ((wiener_window_size^2)*(wiener_window_size^2)) and M is of size
732*77c1e3ccSAndroid Build Coastguard Worker // (wiener_window_size*wiener_window_size). H is a symmetric matrix where the
// values above the diagonal (upper triangle) are equal to the values below the
734*77c1e3ccSAndroid Build Coastguard Worker // diagonal (lower triangle). The calculation of elements/stats of H(upper
735*77c1e3ccSAndroid Build Coastguard Worker // triangle) and M is done in steps as described below where each step fills
736*77c1e3ccSAndroid Build Coastguard Worker // specific values of H and M.
737*77c1e3ccSAndroid Build Coastguard Worker // Once the upper triangular elements of H matrix are derived, the same will be
738*77c1e3ccSAndroid Build Coastguard Worker // copied to lower triangular using the function
739*77c1e3ccSAndroid Build Coastguard Worker // fill_lower_triag_elements_avx2().
// Example: Wiener window size = WIENER_WIN_CHROMA (5)
// M buffer = [M0 M1 M2 ---- M23 M24]
// H buffer = Hxy (x-row, y-column)
//   [H00  H01  H02  ---- H023  H024]
//   [H10  H11  H12  ---- H123  H124]
//   [H20  H21  H22  ---- H223  H224]
//   [H30  H31  H32  ---- H323  H324]
//   [H40  H41  H42  ---- H423  H424]
//   [H50  H51  H52  ---- H523  H524]
//   [H60  H61  H62  ---- H623  H624]
//                 ||
//                 ||
//   [H230 H231 H232 ---- H2323 H2324]
//   [H240 H241 H242 ---- H2423 H2424]
751*77c1e3ccSAndroid Build Coastguard Worker // In Step 1, whole M buffers (i.e., M0 to M24) and the first row of H (i.e.,
752*77c1e3ccSAndroid Build Coastguard Worker // H00 to H024) is filled. The remaining rows of H buffer are filled through
753*77c1e3ccSAndroid Build Coastguard Worker // steps 2 to 6.
compute_stats_win5_avx2(const int16_t * const d,int32_t d_stride,const int16_t * const s,int32_t s_stride,int32_t width,int v_start,int v_end,int64_t * const M,int64_t * const H,int use_downsampled_wiener_stats)754*77c1e3ccSAndroid Build Coastguard Worker static void compute_stats_win5_avx2(const int16_t *const d, int32_t d_stride,
755*77c1e3ccSAndroid Build Coastguard Worker const int16_t *const s, int32_t s_stride,
756*77c1e3ccSAndroid Build Coastguard Worker int32_t width, int v_start, int v_end,
757*77c1e3ccSAndroid Build Coastguard Worker int64_t *const M, int64_t *const H,
758*77c1e3ccSAndroid Build Coastguard Worker int use_downsampled_wiener_stats) {
759*77c1e3ccSAndroid Build Coastguard Worker const int32_t wiener_win = WIENER_WIN_CHROMA;
760*77c1e3ccSAndroid Build Coastguard Worker const int32_t wiener_win2 = wiener_win * wiener_win;
761*77c1e3ccSAndroid Build Coastguard Worker // Amount of width which is beyond multiple of 16. This case is handled
762*77c1e3ccSAndroid Build Coastguard Worker // appropriately to process only the required width towards the end.
763*77c1e3ccSAndroid Build Coastguard Worker const int32_t wd_mul16 = width & ~15;
764*77c1e3ccSAndroid Build Coastguard Worker const int32_t wd_beyond_mul16 = width - wd_mul16;
765*77c1e3ccSAndroid Build Coastguard Worker const __m256i mask =
766*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(&mask_16bit[16 - wd_beyond_mul16]));
767*77c1e3ccSAndroid Build Coastguard Worker int downsample_factor;
768*77c1e3ccSAndroid Build Coastguard Worker
769*77c1e3ccSAndroid Build Coastguard Worker // Step 1: Full M (i.e., M0 to M24) and first row H (i.e., H00 to H024)
770*77c1e3ccSAndroid Build Coastguard Worker // values are filled here. Here, the loop over 'j' is executed for values 0
771*77c1e3ccSAndroid Build Coastguard Worker // to 4 (wiener_win-1). When the loop executed for a specific 'j', 5 values of
772*77c1e3ccSAndroid Build Coastguard Worker // M and H are filled as shown below.
773*77c1e3ccSAndroid Build Coastguard Worker // j=0: M0-M4 and H00-H04, j=1: M5-M9 and H05-H09 are filled etc,.
774*77c1e3ccSAndroid Build Coastguard Worker int j = 0;
775*77c1e3ccSAndroid Build Coastguard Worker do {
776*77c1e3ccSAndroid Build Coastguard Worker const int16_t *s_t = s;
777*77c1e3ccSAndroid Build Coastguard Worker const int16_t *d_t = d;
778*77c1e3ccSAndroid Build Coastguard Worker __m256i sum_m[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() };
779*77c1e3ccSAndroid Build Coastguard Worker __m256i sum_h[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() };
780*77c1e3ccSAndroid Build Coastguard Worker downsample_factor =
781*77c1e3ccSAndroid Build Coastguard Worker use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
782*77c1e3ccSAndroid Build Coastguard Worker int proc_ht = v_start;
783*77c1e3ccSAndroid Build Coastguard Worker do {
784*77c1e3ccSAndroid Build Coastguard Worker UPDATE_DOWNSAMPLE_FACTOR
785*77c1e3ccSAndroid Build Coastguard Worker
786*77c1e3ccSAndroid Build Coastguard Worker // Process the amount of width multiple of 16.
787*77c1e3ccSAndroid Build Coastguard Worker while (proc_wd < wd_mul16) {
788*77c1e3ccSAndroid Build Coastguard Worker const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd));
789*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd));
790*77c1e3ccSAndroid Build Coastguard Worker const __m256i src_mul_df = _mm256_mullo_epi16(src, df_reg);
791*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
792*77c1e3ccSAndroid Build Coastguard Worker INIT_MH_VALUES(d_t + j + proc_wd)
793*77c1e3ccSAndroid Build Coastguard Worker
794*77c1e3ccSAndroid Build Coastguard Worker proc_wd += 16;
795*77c1e3ccSAndroid Build Coastguard Worker }
796*77c1e3ccSAndroid Build Coastguard Worker
797*77c1e3ccSAndroid Build Coastguard Worker // Process the remaining width here.
798*77c1e3ccSAndroid Build Coastguard Worker if (wd_beyond_mul16) {
799*77c1e3ccSAndroid Build Coastguard Worker const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd));
800*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd));
801*77c1e3ccSAndroid Build Coastguard Worker const __m256i src_mask = _mm256_and_si256(src, mask);
802*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
803*77c1e3ccSAndroid Build Coastguard Worker const __m256i src_mul_df = _mm256_mullo_epi16(src_mask, df_reg);
804*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
805*77c1e3ccSAndroid Build Coastguard Worker INIT_MH_VALUES(d_t + j + proc_wd)
806*77c1e3ccSAndroid Build Coastguard Worker }
807*77c1e3ccSAndroid Build Coastguard Worker proc_ht += downsample_factor;
808*77c1e3ccSAndroid Build Coastguard Worker s_t += downsample_factor * s_stride;
809*77c1e3ccSAndroid Build Coastguard Worker d_t += downsample_factor * d_stride;
810*77c1e3ccSAndroid Build Coastguard Worker } while (proc_ht < v_end);
811*77c1e3ccSAndroid Build Coastguard Worker
812*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_m =
813*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_m[0], sum_m[1], &sum_m[2], &sum_m[3]);
814*77c1e3ccSAndroid Build Coastguard Worker const __m128i s_m_h = convert_32_to_64_add_avx2(sum_m[4], sum_h[4]);
815*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)(M + wiener_win * j), s_m);
816*77c1e3ccSAndroid Build Coastguard Worker _mm_storel_epi64((__m128i *)&M[wiener_win * j + 4], s_m_h);
817*77c1e3ccSAndroid Build Coastguard Worker
818*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_h =
819*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
820*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)(H + wiener_win * j), s_h);
821*77c1e3ccSAndroid Build Coastguard Worker _mm_storeh_epi64((__m128i *)&H[wiener_win * j + 4], s_m_h);
822*77c1e3ccSAndroid Build Coastguard Worker } while (++j < wiener_win);
823*77c1e3ccSAndroid Build Coastguard Worker
824*77c1e3ccSAndroid Build Coastguard Worker // The below steps are designed to fill remaining rows of H buffer. Here, aim
825*77c1e3ccSAndroid Build Coastguard Worker // is to fill only upper triangle elements correspond to each row and lower
826*77c1e3ccSAndroid Build Coastguard Worker // triangle elements are copied from upper-triangle elements. Also, as
827*77c1e3ccSAndroid Build Coastguard Worker // mentioned in Step 1, the core function is designed to fill 5
828*77c1e3ccSAndroid Build Coastguard Worker // elements/stats/values of H buffer.
829*77c1e3ccSAndroid Build Coastguard Worker //
830*77c1e3ccSAndroid Build Coastguard Worker // Step 2: Here, the rows 1, 6, 11, 16 and 21 are filled. As we need to fill
831*77c1e3ccSAndroid Build Coastguard Worker // only upper-triangle elements, H10 from row1, H60-H64 and H65 from row6,etc,
832*77c1e3ccSAndroid Build Coastguard Worker // are need not be filled. As the core function process 5 values, in first
833*77c1e3ccSAndroid Build Coastguard Worker // iteration of 'j' only 4 values to be filled i.e., H11-H14 from row1,H66-H69
834*77c1e3ccSAndroid Build Coastguard Worker // from row6, etc.
835*77c1e3ccSAndroid Build Coastguard Worker for (int i = 1; i < wiener_win2; i += wiener_win) {
836*77c1e3ccSAndroid Build Coastguard Worker // Update the dgd pointers appropriately and also derive the 'j'th iteration
837*77c1e3ccSAndroid Build Coastguard Worker // from where the H buffer filling needs to be started.
838*77c1e3ccSAndroid Build Coastguard Worker INITIALIZATION(WIENER_WIN_CHROMA)
839*77c1e3ccSAndroid Build Coastguard Worker
840*77c1e3ccSAndroid Build Coastguard Worker do {
841*77c1e3ccSAndroid Build Coastguard Worker UPDATE_DOWNSAMPLE_FACTOR
842*77c1e3ccSAndroid Build Coastguard Worker
843*77c1e3ccSAndroid Build Coastguard Worker // Process the amount of width multiple of 16.
844*77c1e3ccSAndroid Build Coastguard Worker while (proc_wd < wd_mul16) {
845*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
846*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
847*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
848*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 4)
849*77c1e3ccSAndroid Build Coastguard Worker
850*77c1e3ccSAndroid Build Coastguard Worker proc_wd += 16;
851*77c1e3ccSAndroid Build Coastguard Worker }
852*77c1e3ccSAndroid Build Coastguard Worker
853*77c1e3ccSAndroid Build Coastguard Worker // Process the remaining width here.
854*77c1e3ccSAndroid Build Coastguard Worker if (wd_beyond_mul16) {
855*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
856*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
857*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
858*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
859*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 4)
860*77c1e3ccSAndroid Build Coastguard Worker }
861*77c1e3ccSAndroid Build Coastguard Worker proc_ht += downsample_factor;
862*77c1e3ccSAndroid Build Coastguard Worker d_window += downsample_factor * d_stride;
863*77c1e3ccSAndroid Build Coastguard Worker d_current_row += downsample_factor * d_stride;
864*77c1e3ccSAndroid Build Coastguard Worker } while (proc_ht < v_end);
865*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_h =
866*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
867*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
868*77c1e3ccSAndroid Build Coastguard Worker
869*77c1e3ccSAndroid Build Coastguard Worker // process the remaining 'j' iterations.
870*77c1e3ccSAndroid Build Coastguard Worker j++;
871*77c1e3ccSAndroid Build Coastguard Worker CALCULATE_REMAINING_H_WIN5
872*77c1e3ccSAndroid Build Coastguard Worker }
873*77c1e3ccSAndroid Build Coastguard Worker
874*77c1e3ccSAndroid Build Coastguard Worker // Step 3: Here, the rows 2, 7, 12, 17 and 22 are filled. As we need to fill
875*77c1e3ccSAndroid Build Coastguard Worker // only upper-triangle elements, H20-H21 from row2, H70-H74 and H75-H76 from
876*77c1e3ccSAndroid Build Coastguard Worker // row7, etc, are need not be filled. As the core function process 5 values,
877*77c1e3ccSAndroid Build Coastguard Worker // in first iteration of 'j' only 3 values to be filled i.e., H22-H24 from
878*77c1e3ccSAndroid Build Coastguard Worker // row2, H77-H79 from row7, etc.
879*77c1e3ccSAndroid Build Coastguard Worker for (int i = 2; i < wiener_win2; i += wiener_win) {
880*77c1e3ccSAndroid Build Coastguard Worker // Update the dgd pointers appropriately and also derive the 'j'th iteration
881*77c1e3ccSAndroid Build Coastguard Worker // from where the H buffer filling needs to be started.
882*77c1e3ccSAndroid Build Coastguard Worker INITIALIZATION(WIENER_WIN_CHROMA)
883*77c1e3ccSAndroid Build Coastguard Worker
884*77c1e3ccSAndroid Build Coastguard Worker do {
885*77c1e3ccSAndroid Build Coastguard Worker UPDATE_DOWNSAMPLE_FACTOR
886*77c1e3ccSAndroid Build Coastguard Worker
887*77c1e3ccSAndroid Build Coastguard Worker // Process the amount of width multiple of 16.
888*77c1e3ccSAndroid Build Coastguard Worker while (proc_wd < wd_mul16) {
889*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
890*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
891*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
892*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 3)
893*77c1e3ccSAndroid Build Coastguard Worker
894*77c1e3ccSAndroid Build Coastguard Worker proc_wd += 16;
895*77c1e3ccSAndroid Build Coastguard Worker }
896*77c1e3ccSAndroid Build Coastguard Worker
897*77c1e3ccSAndroid Build Coastguard Worker // Process the remaining width here.
898*77c1e3ccSAndroid Build Coastguard Worker if (wd_beyond_mul16) {
899*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
900*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
901*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
902*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
903*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 3)
904*77c1e3ccSAndroid Build Coastguard Worker }
905*77c1e3ccSAndroid Build Coastguard Worker proc_ht += downsample_factor;
906*77c1e3ccSAndroid Build Coastguard Worker d_window += downsample_factor * d_stride;
907*77c1e3ccSAndroid Build Coastguard Worker d_current_row += downsample_factor * d_stride;
908*77c1e3ccSAndroid Build Coastguard Worker } while (proc_ht < v_end);
909*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_h =
910*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
911*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
912*77c1e3ccSAndroid Build Coastguard Worker
913*77c1e3ccSAndroid Build Coastguard Worker // process the remaining 'j' iterations.
914*77c1e3ccSAndroid Build Coastguard Worker j++;
915*77c1e3ccSAndroid Build Coastguard Worker CALCULATE_REMAINING_H_WIN5
916*77c1e3ccSAndroid Build Coastguard Worker }
917*77c1e3ccSAndroid Build Coastguard Worker
918*77c1e3ccSAndroid Build Coastguard Worker // Step 4: Here, the rows 3, 8, 13, 18 and 23 are filled. As we need to fill
919*77c1e3ccSAndroid Build Coastguard Worker // only upper-triangle elements, H30-H32 from row3, H80-H84 and H85-H87 from
920*77c1e3ccSAndroid Build Coastguard Worker // row8, etc, are need not be filled. As the core function process 5 values,
921*77c1e3ccSAndroid Build Coastguard Worker // in first iteration of 'j' only 2 values to be filled i.e., H33-H34 from
922*77c1e3ccSAndroid Build Coastguard Worker // row3, H88-89 from row8, etc.
923*77c1e3ccSAndroid Build Coastguard Worker for (int i = 3; i < wiener_win2; i += wiener_win) {
924*77c1e3ccSAndroid Build Coastguard Worker // Update the dgd pointers appropriately and also derive the 'j'th iteration
925*77c1e3ccSAndroid Build Coastguard Worker // from where the H buffer filling needs to be started.
926*77c1e3ccSAndroid Build Coastguard Worker INITIALIZATION(WIENER_WIN_CHROMA)
927*77c1e3ccSAndroid Build Coastguard Worker
928*77c1e3ccSAndroid Build Coastguard Worker do {
929*77c1e3ccSAndroid Build Coastguard Worker UPDATE_DOWNSAMPLE_FACTOR
930*77c1e3ccSAndroid Build Coastguard Worker
931*77c1e3ccSAndroid Build Coastguard Worker // Process the amount of width multiple of 16.
932*77c1e3ccSAndroid Build Coastguard Worker while (proc_wd < wd_mul16) {
933*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
934*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
935*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
936*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 2)
937*77c1e3ccSAndroid Build Coastguard Worker
938*77c1e3ccSAndroid Build Coastguard Worker proc_wd += 16;
939*77c1e3ccSAndroid Build Coastguard Worker }
940*77c1e3ccSAndroid Build Coastguard Worker
941*77c1e3ccSAndroid Build Coastguard Worker // Process the remaining width here.
942*77c1e3ccSAndroid Build Coastguard Worker if (wd_beyond_mul16) {
943*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
944*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
945*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
946*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
947*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 2)
948*77c1e3ccSAndroid Build Coastguard Worker }
949*77c1e3ccSAndroid Build Coastguard Worker proc_ht += downsample_factor;
950*77c1e3ccSAndroid Build Coastguard Worker d_window += downsample_factor * d_stride;
951*77c1e3ccSAndroid Build Coastguard Worker d_current_row += downsample_factor * d_stride;
952*77c1e3ccSAndroid Build Coastguard Worker } while (proc_ht < v_end);
953*77c1e3ccSAndroid Build Coastguard Worker const __m128i s_h = convert_32_to_64_add_avx2(sum_h[0], sum_h[1]);
954*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i), s_h);
955*77c1e3ccSAndroid Build Coastguard Worker
956*77c1e3ccSAndroid Build Coastguard Worker // process the remaining 'j' iterations.
957*77c1e3ccSAndroid Build Coastguard Worker j++;
958*77c1e3ccSAndroid Build Coastguard Worker CALCULATE_REMAINING_H_WIN5
959*77c1e3ccSAndroid Build Coastguard Worker }
960*77c1e3ccSAndroid Build Coastguard Worker
961*77c1e3ccSAndroid Build Coastguard Worker // Step 5: Here, the rows 4, 9, 14, 19 and 24 are filled. As we need to fill
962*77c1e3ccSAndroid Build Coastguard Worker // only upper-triangle elements, H40-H43 from row4, H90-H94 and H95-H98 from
963*77c1e3ccSAndroid Build Coastguard Worker // row9, etc, are need not be filled. As the core function process 5 values,
964*77c1e3ccSAndroid Build Coastguard Worker // in first iteration of 'j' only 1 values to be filled i.e., H44 from row4,
965*77c1e3ccSAndroid Build Coastguard Worker // H99 from row9, etc.
966*77c1e3ccSAndroid Build Coastguard Worker for (int i = 4; i < wiener_win2; i += wiener_win) {
967*77c1e3ccSAndroid Build Coastguard Worker // Update the dgd pointers appropriately and also derive the 'j'th iteration
968*77c1e3ccSAndroid Build Coastguard Worker // from where the H buffer filling needs to be started.
969*77c1e3ccSAndroid Build Coastguard Worker INITIALIZATION(WIENER_WIN_CHROMA)
970*77c1e3ccSAndroid Build Coastguard Worker do {
971*77c1e3ccSAndroid Build Coastguard Worker UPDATE_DOWNSAMPLE_FACTOR
972*77c1e3ccSAndroid Build Coastguard Worker
973*77c1e3ccSAndroid Build Coastguard Worker // Process the amount of width multiple of 16.
974*77c1e3ccSAndroid Build Coastguard Worker while (proc_wd < wd_mul16) {
975*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
976*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
977*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
978*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 1)
979*77c1e3ccSAndroid Build Coastguard Worker
980*77c1e3ccSAndroid Build Coastguard Worker proc_wd += 16;
981*77c1e3ccSAndroid Build Coastguard Worker }
982*77c1e3ccSAndroid Build Coastguard Worker
983*77c1e3ccSAndroid Build Coastguard Worker // Process the remaining width here.
984*77c1e3ccSAndroid Build Coastguard Worker if (wd_beyond_mul16) {
985*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
986*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
987*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
988*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
989*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 1)
990*77c1e3ccSAndroid Build Coastguard Worker }
991*77c1e3ccSAndroid Build Coastguard Worker proc_ht += downsample_factor;
992*77c1e3ccSAndroid Build Coastguard Worker d_window += downsample_factor * d_stride;
993*77c1e3ccSAndroid Build Coastguard Worker d_current_row += downsample_factor * d_stride;
994*77c1e3ccSAndroid Build Coastguard Worker } while (proc_ht < v_end);
995*77c1e3ccSAndroid Build Coastguard Worker const __m128i s_h = convert_32_to_64_add_avx2(sum_h[0], sum_h[1]);
996*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i), s_h);
997*77c1e3ccSAndroid Build Coastguard Worker
998*77c1e3ccSAndroid Build Coastguard Worker // process the remaining 'j' iterations.
999*77c1e3ccSAndroid Build Coastguard Worker j++;
1000*77c1e3ccSAndroid Build Coastguard Worker CALCULATE_REMAINING_H_WIN5
1001*77c1e3ccSAndroid Build Coastguard Worker }
1002*77c1e3ccSAndroid Build Coastguard Worker
1003*77c1e3ccSAndroid Build Coastguard Worker // Step 6: Here, the rows 5, 10, 15 and 20 are filled. As we need to fill only
1004*77c1e3ccSAndroid Build Coastguard Worker // upper-triangle elements, H50-H54 from row5, H100-H104 and H105-H109 from
1005*77c1e3ccSAndroid Build Coastguard Worker // row10,etc, are need not be filled. The first iteration of 'j' fills H55-H59
1006*77c1e3ccSAndroid Build Coastguard Worker // from row5 and H1010-H1014 from row10, etc.
1007*77c1e3ccSAndroid Build Coastguard Worker for (int i = 5; i < wiener_win2; i += wiener_win) {
1008*77c1e3ccSAndroid Build Coastguard Worker // Derive j'th iteration from where the H buffer filling needs to be
1009*77c1e3ccSAndroid Build Coastguard Worker // started.
1010*77c1e3ccSAndroid Build Coastguard Worker j = i / wiener_win;
1011*77c1e3ccSAndroid Build Coastguard Worker int shift = 0;
1012*77c1e3ccSAndroid Build Coastguard Worker do {
1013*77c1e3ccSAndroid Build Coastguard Worker // Update the dgd pointers appropriately.
1014*77c1e3ccSAndroid Build Coastguard Worker int proc_ht = v_start;
1015*77c1e3ccSAndroid Build Coastguard Worker const int16_t *d_window = d + (i / wiener_win);
1016*77c1e3ccSAndroid Build Coastguard Worker const int16_t *d_current_row =
1017*77c1e3ccSAndroid Build Coastguard Worker d + (i / wiener_win) + ((i % wiener_win) * d_stride);
1018*77c1e3ccSAndroid Build Coastguard Worker downsample_factor =
1019*77c1e3ccSAndroid Build Coastguard Worker use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
1020*77c1e3ccSAndroid Build Coastguard Worker __m256i sum_h[WIENER_WIN_CHROMA] = { _mm256_setzero_si256() };
1021*77c1e3ccSAndroid Build Coastguard Worker do {
1022*77c1e3ccSAndroid Build Coastguard Worker UPDATE_DOWNSAMPLE_FACTOR
1023*77c1e3ccSAndroid Build Coastguard Worker
1024*77c1e3ccSAndroid Build Coastguard Worker // Process the amount of width multiple of 16.
1025*77c1e3ccSAndroid Build Coastguard Worker while (proc_wd < wd_mul16) {
1026*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
1027*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
1028*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
1029*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + shift + proc_wd, 5)
1030*77c1e3ccSAndroid Build Coastguard Worker
1031*77c1e3ccSAndroid Build Coastguard Worker proc_wd += 16;
1032*77c1e3ccSAndroid Build Coastguard Worker }
1033*77c1e3ccSAndroid Build Coastguard Worker
1034*77c1e3ccSAndroid Build Coastguard Worker // Process the remaining width here.
1035*77c1e3ccSAndroid Build Coastguard Worker if (wd_beyond_mul16) {
1036*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
1037*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
1038*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
1039*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
1040*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + shift + proc_wd, 5)
1041*77c1e3ccSAndroid Build Coastguard Worker }
1042*77c1e3ccSAndroid Build Coastguard Worker proc_ht += downsample_factor;
1043*77c1e3ccSAndroid Build Coastguard Worker d_window += downsample_factor * d_stride;
1044*77c1e3ccSAndroid Build Coastguard Worker d_current_row += downsample_factor * d_stride;
1045*77c1e3ccSAndroid Build Coastguard Worker } while (proc_ht < v_end);
1046*77c1e3ccSAndroid Build Coastguard Worker
1047*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_h =
1048*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
1049*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)),
1050*77c1e3ccSAndroid Build Coastguard Worker s_h);
1051*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_m_h = convert_and_add_avx2(sum_h[4]);
1052*77c1e3ccSAndroid Build Coastguard Worker const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h);
1053*77c1e3ccSAndroid Build Coastguard Worker _mm_storel_epi64(
1054*77c1e3ccSAndroid Build Coastguard Worker (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4), s_m_h0);
1055*77c1e3ccSAndroid Build Coastguard Worker shift++;
1056*77c1e3ccSAndroid Build Coastguard Worker } while (++j < wiener_win);
1057*77c1e3ccSAndroid Build Coastguard Worker }
1058*77c1e3ccSAndroid Build Coastguard Worker
1059*77c1e3ccSAndroid Build Coastguard Worker fill_lower_triag_elements_avx2(wiener_win2, H);
1060*77c1e3ccSAndroid Build Coastguard Worker }
1061*77c1e3ccSAndroid Build Coastguard Worker
// The buffers H (auto-covariance) and M (cross-correlation) are used to
// estimate the filter tap values required for Wiener filtering. Here, the
// buffer H is of size ((wiener_window_size^2)*(wiener_window_size^2)) and M is
// of size (wiener_window_size*wiener_window_size). H is a symmetric matrix
// where the values above the diagonal (upper triangle) are equal to the values
// below the diagonal (lower triangle). The calculation of the elements/stats
// of H (upper triangle) and M is done in steps as described below, where each
// step fills specific values of H and M.
// Example:
// Wiener window size = WIENER_WIN (7)
// M buffer = [M0 M1 M2 ---- M47 M48]
// H buffer = Hxy (x-row, y-column)
// [H00 H01 H02 ---- H047 H048]
// [H10 H11 H12 ---- H147 H148]
// [H20 H21 H22 ---- H247 H248]
// [H30 H31 H32 ---- H347 H348]
// [H40 H41 H42 ---- H447 H448]
// [H50 H51 H52 ---- H547 H548]
// [H60 H61 H62 ---- H647 H648]
// ||
// ||
// [H470 H471 H472 ---- H4747 H4748]
// [H480 H481 H482 ---- H4847 H4848]
// In Step 1, the whole M buffer (i.e., M0 to M48) and the first row of H
// (i.e., H00 to H048) are filled. The remaining rows of the H buffer are
// filled through steps 2 to 8.
compute_stats_win7_avx2(const int16_t * const d,int32_t d_stride,const int16_t * const s,int32_t s_stride,int32_t width,int v_start,int v_end,int64_t * const M,int64_t * const H,int use_downsampled_wiener_stats)1087*77c1e3ccSAndroid Build Coastguard Worker static void compute_stats_win7_avx2(const int16_t *const d, int32_t d_stride,
1088*77c1e3ccSAndroid Build Coastguard Worker const int16_t *const s, int32_t s_stride,
1089*77c1e3ccSAndroid Build Coastguard Worker int32_t width, int v_start, int v_end,
1090*77c1e3ccSAndroid Build Coastguard Worker int64_t *const M, int64_t *const H,
1091*77c1e3ccSAndroid Build Coastguard Worker int use_downsampled_wiener_stats) {
1092*77c1e3ccSAndroid Build Coastguard Worker const int32_t wiener_win = WIENER_WIN;
1093*77c1e3ccSAndroid Build Coastguard Worker const int32_t wiener_win2 = wiener_win * wiener_win;
1094*77c1e3ccSAndroid Build Coastguard Worker // Amount of width which is beyond multiple of 16. This case is handled
1095*77c1e3ccSAndroid Build Coastguard Worker // appropriately to process only the required width towards the end.
1096*77c1e3ccSAndroid Build Coastguard Worker const int32_t wd_mul16 = width & ~15;
1097*77c1e3ccSAndroid Build Coastguard Worker const int32_t wd_beyond_mul16 = width - wd_mul16;
1098*77c1e3ccSAndroid Build Coastguard Worker const __m256i mask =
1099*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(&mask_16bit[16 - wd_beyond_mul16]));
1100*77c1e3ccSAndroid Build Coastguard Worker int downsample_factor;
1101*77c1e3ccSAndroid Build Coastguard Worker
1102*77c1e3ccSAndroid Build Coastguard Worker // Step 1: Full M (i.e., M0 to M48) and first row H (i.e., H00 to H048)
1103*77c1e3ccSAndroid Build Coastguard Worker // values are filled here. Here, the loop over 'j' is executed for values 0
1104*77c1e3ccSAndroid Build Coastguard Worker // to 6. When the loop executed for a specific 'j', 7 values of M and H are
1105*77c1e3ccSAndroid Build Coastguard Worker // filled as shown below.
1106*77c1e3ccSAndroid Build Coastguard Worker // j=0: M0-M6 and H00-H06, j=1: M7-M13 and H07-H013 are filled etc,.
1107*77c1e3ccSAndroid Build Coastguard Worker int j = 0;
1108*77c1e3ccSAndroid Build Coastguard Worker do {
1109*77c1e3ccSAndroid Build Coastguard Worker const int16_t *s_t = s;
1110*77c1e3ccSAndroid Build Coastguard Worker const int16_t *d_t = d;
1111*77c1e3ccSAndroid Build Coastguard Worker __m256i sum_m[WIENER_WIN] = { _mm256_setzero_si256() };
1112*77c1e3ccSAndroid Build Coastguard Worker __m256i sum_h[WIENER_WIN] = { _mm256_setzero_si256() };
1113*77c1e3ccSAndroid Build Coastguard Worker downsample_factor =
1114*77c1e3ccSAndroid Build Coastguard Worker use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
1115*77c1e3ccSAndroid Build Coastguard Worker int proc_ht = v_start;
1116*77c1e3ccSAndroid Build Coastguard Worker do {
1117*77c1e3ccSAndroid Build Coastguard Worker UPDATE_DOWNSAMPLE_FACTOR
1118*77c1e3ccSAndroid Build Coastguard Worker
1119*77c1e3ccSAndroid Build Coastguard Worker // Process the amount of width multiple of 16.
1120*77c1e3ccSAndroid Build Coastguard Worker while (proc_wd < wd_mul16) {
1121*77c1e3ccSAndroid Build Coastguard Worker const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd));
1122*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd));
1123*77c1e3ccSAndroid Build Coastguard Worker const __m256i src_mul_df = _mm256_mullo_epi16(src, df_reg);
1124*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
1125*77c1e3ccSAndroid Build Coastguard Worker INIT_MH_VALUES(d_t + j + proc_wd)
1126*77c1e3ccSAndroid Build Coastguard Worker
1127*77c1e3ccSAndroid Build Coastguard Worker proc_wd += 16;
1128*77c1e3ccSAndroid Build Coastguard Worker }
1129*77c1e3ccSAndroid Build Coastguard Worker
1130*77c1e3ccSAndroid Build Coastguard Worker if (wd_beyond_mul16) {
1131*77c1e3ccSAndroid Build Coastguard Worker const __m256i src = _mm256_loadu_si256((__m256i *)(s_t + proc_wd));
1132*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd = _mm256_loadu_si256((__m256i *)(d_t + proc_wd));
1133*77c1e3ccSAndroid Build Coastguard Worker const __m256i src_mask = _mm256_and_si256(src, mask);
1134*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
1135*77c1e3ccSAndroid Build Coastguard Worker const __m256i src_mul_df = _mm256_mullo_epi16(src_mask, df_reg);
1136*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
1137*77c1e3ccSAndroid Build Coastguard Worker INIT_MH_VALUES(d_t + j + proc_wd)
1138*77c1e3ccSAndroid Build Coastguard Worker }
1139*77c1e3ccSAndroid Build Coastguard Worker proc_ht += downsample_factor;
1140*77c1e3ccSAndroid Build Coastguard Worker s_t += downsample_factor * s_stride;
1141*77c1e3ccSAndroid Build Coastguard Worker d_t += downsample_factor * d_stride;
1142*77c1e3ccSAndroid Build Coastguard Worker } while (proc_ht < v_end);
1143*77c1e3ccSAndroid Build Coastguard Worker
1144*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_m0 =
1145*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_m[0], sum_m[1], &sum_m[2], &sum_m[3]);
1146*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_m1 =
1147*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_m[4], sum_m[5], &sum_m[6], &sum_m[6]);
1148*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)(M + wiener_win * j + 0), s_m0);
1149*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)(M + wiener_win * j + 4),
1150*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(s_m1));
1151*77c1e3ccSAndroid Build Coastguard Worker _mm_storel_epi64((__m128i *)&M[wiener_win * j + 6],
1152*77c1e3ccSAndroid Build Coastguard Worker _mm256_extracti128_si256(s_m1, 1));
1153*77c1e3ccSAndroid Build Coastguard Worker
1154*77c1e3ccSAndroid Build Coastguard Worker const __m256i sh_0 =
1155*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
1156*77c1e3ccSAndroid Build Coastguard Worker const __m256i sh_1 =
1157*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]);
1158*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)(H + wiener_win * j + 0), sh_0);
1159*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)(H + wiener_win * j + 4),
1160*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(sh_1));
1161*77c1e3ccSAndroid Build Coastguard Worker _mm_storel_epi64((__m128i *)&H[wiener_win * j + 6],
1162*77c1e3ccSAndroid Build Coastguard Worker _mm256_extracti128_si256(sh_1, 1));
1163*77c1e3ccSAndroid Build Coastguard Worker } while (++j < wiener_win);
1164*77c1e3ccSAndroid Build Coastguard Worker
1165*77c1e3ccSAndroid Build Coastguard Worker // The below steps are designed to fill remaining rows of H buffer. Here, aim
1166*77c1e3ccSAndroid Build Coastguard Worker // is to fill only upper triangle elements correspond to each row and lower
1167*77c1e3ccSAndroid Build Coastguard Worker // triangle elements are copied from upper-triangle elements. Also, as
1168*77c1e3ccSAndroid Build Coastguard Worker // mentioned in Step 1, the core function is designed to fill 7
1169*77c1e3ccSAndroid Build Coastguard Worker // elements/stats/values of H buffer.
1170*77c1e3ccSAndroid Build Coastguard Worker //
1171*77c1e3ccSAndroid Build Coastguard Worker // Step 2: Here, the rows 1, 8, 15, 22, 29, 36 and 43 are filled. As we need
1172*77c1e3ccSAndroid Build Coastguard Worker // to fill only upper-triangle elements, H10 from row1, H80-H86 and H87 from
1173*77c1e3ccSAndroid Build Coastguard Worker // row8, etc. are need not be filled. As the core function process 7 values,
1174*77c1e3ccSAndroid Build Coastguard Worker // in first iteration of 'j' only 6 values to be filled i.e., H11-H16 from
1175*77c1e3ccSAndroid Build Coastguard Worker // row1 and H88-H813 from row8, etc.
1176*77c1e3ccSAndroid Build Coastguard Worker for (int i = 1; i < wiener_win2; i += wiener_win) {
1177*77c1e3ccSAndroid Build Coastguard Worker // Update the dgd pointers appropriately and also derive the 'j'th iteration
1178*77c1e3ccSAndroid Build Coastguard Worker // from where the H buffer filling needs to be started.
1179*77c1e3ccSAndroid Build Coastguard Worker INITIALIZATION(WIENER_WIN)
1180*77c1e3ccSAndroid Build Coastguard Worker
1181*77c1e3ccSAndroid Build Coastguard Worker do {
1182*77c1e3ccSAndroid Build Coastguard Worker UPDATE_DOWNSAMPLE_FACTOR
1183*77c1e3ccSAndroid Build Coastguard Worker
1184*77c1e3ccSAndroid Build Coastguard Worker // Process the amount of width multiple of 16.
1185*77c1e3ccSAndroid Build Coastguard Worker while (proc_wd < wd_mul16) {
1186*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
1187*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
1188*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
1189*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 6)
1190*77c1e3ccSAndroid Build Coastguard Worker
1191*77c1e3ccSAndroid Build Coastguard Worker proc_wd += 16;
1192*77c1e3ccSAndroid Build Coastguard Worker }
1193*77c1e3ccSAndroid Build Coastguard Worker
1194*77c1e3ccSAndroid Build Coastguard Worker // Process the remaining width here.
1195*77c1e3ccSAndroid Build Coastguard Worker if (wd_beyond_mul16) {
1196*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
1197*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
1198*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
1199*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
1200*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (1 * d_stride), 6)
1201*77c1e3ccSAndroid Build Coastguard Worker }
1202*77c1e3ccSAndroid Build Coastguard Worker proc_ht += downsample_factor;
1203*77c1e3ccSAndroid Build Coastguard Worker d_window += downsample_factor * d_stride;
1204*77c1e3ccSAndroid Build Coastguard Worker d_current_row += downsample_factor * d_stride;
1205*77c1e3ccSAndroid Build Coastguard Worker } while (proc_ht < v_end);
1206*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_h =
1207*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
1208*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
1209*77c1e3ccSAndroid Build Coastguard Worker const __m128i s_h0 = convert_32_to_64_add_avx2(sum_h[4], sum_h[5]);
1210*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128((__m128i *)(H + (i * wiener_win2) + i + 4), s_h0);
1211*77c1e3ccSAndroid Build Coastguard Worker
1212*77c1e3ccSAndroid Build Coastguard Worker // process the remaining 'j' iterations.
1213*77c1e3ccSAndroid Build Coastguard Worker j++;
1214*77c1e3ccSAndroid Build Coastguard Worker CALCULATE_REMAINING_H_WIN7
1215*77c1e3ccSAndroid Build Coastguard Worker }
1216*77c1e3ccSAndroid Build Coastguard Worker
1217*77c1e3ccSAndroid Build Coastguard Worker // Step 3: Here, the rows 2, 9, 16, 23, 30, 37 and 44 are filled. As we need
1218*77c1e3ccSAndroid Build Coastguard Worker // to fill only upper-triangle elements, H20-H21 from row2, H90-H96 and
1219*77c1e3ccSAndroid Build Coastguard Worker // H97-H98 from row9, etc. are need not be filled. As the core function
1220*77c1e3ccSAndroid Build Coastguard Worker // process 7 values, in first iteration of 'j' only 5 values to be filled
1221*77c1e3ccSAndroid Build Coastguard Worker // i.e., H22-H26 from row2 and H99-H913 from row9, etc.
1222*77c1e3ccSAndroid Build Coastguard Worker for (int i = 2; i < wiener_win2; i += wiener_win) {
1223*77c1e3ccSAndroid Build Coastguard Worker // Update the dgd pointers appropriately and also derive the 'j'th iteration
1224*77c1e3ccSAndroid Build Coastguard Worker // from where the H buffer filling needs to be started.
1225*77c1e3ccSAndroid Build Coastguard Worker INITIALIZATION(WIENER_WIN)
1226*77c1e3ccSAndroid Build Coastguard Worker do {
1227*77c1e3ccSAndroid Build Coastguard Worker UPDATE_DOWNSAMPLE_FACTOR
1228*77c1e3ccSAndroid Build Coastguard Worker
1229*77c1e3ccSAndroid Build Coastguard Worker // Process the amount of width multiple of 16.
1230*77c1e3ccSAndroid Build Coastguard Worker while (proc_wd < wd_mul16) {
1231*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
1232*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
1233*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
1234*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 5)
1235*77c1e3ccSAndroid Build Coastguard Worker
1236*77c1e3ccSAndroid Build Coastguard Worker proc_wd += 16;
1237*77c1e3ccSAndroid Build Coastguard Worker }
1238*77c1e3ccSAndroid Build Coastguard Worker
1239*77c1e3ccSAndroid Build Coastguard Worker // Process the remaining width here.
1240*77c1e3ccSAndroid Build Coastguard Worker if (wd_beyond_mul16) {
1241*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
1242*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
1243*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
1244*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
1245*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (2 * d_stride), 5)
1246*77c1e3ccSAndroid Build Coastguard Worker }
1247*77c1e3ccSAndroid Build Coastguard Worker proc_ht += downsample_factor;
1248*77c1e3ccSAndroid Build Coastguard Worker d_window += downsample_factor * d_stride;
1249*77c1e3ccSAndroid Build Coastguard Worker d_current_row += downsample_factor * d_stride;
1250*77c1e3ccSAndroid Build Coastguard Worker } while (proc_ht < v_end);
1251*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_h =
1252*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
1253*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
1254*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_m_h = convert_and_add_avx2(sum_h[4]);
1255*77c1e3ccSAndroid Build Coastguard Worker const __m128i s_m_h0 = add_64bit_lvl_avx2(s_m_h, s_m_h);
1256*77c1e3ccSAndroid Build Coastguard Worker _mm_storel_epi64((__m128i *)(H + (i * wiener_win2) + i + 4), s_m_h0);
1257*77c1e3ccSAndroid Build Coastguard Worker
1258*77c1e3ccSAndroid Build Coastguard Worker // process the remaining 'j' iterations.
1259*77c1e3ccSAndroid Build Coastguard Worker j++;
1260*77c1e3ccSAndroid Build Coastguard Worker CALCULATE_REMAINING_H_WIN7
1261*77c1e3ccSAndroid Build Coastguard Worker }
1262*77c1e3ccSAndroid Build Coastguard Worker
1263*77c1e3ccSAndroid Build Coastguard Worker // Step 4: Here, the rows 3, 10, 17, 24, 31, 38 and 45 are filled. As we need
1264*77c1e3ccSAndroid Build Coastguard Worker // to fill only upper-triangle elements, H30-H32 from row3, H100-H106 and
1265*77c1e3ccSAndroid Build Coastguard Worker // H107-H109 from row10, etc. are need not be filled. As the core function
1266*77c1e3ccSAndroid Build Coastguard Worker // process 7 values, in first iteration of 'j' only 4 values to be filled
1267*77c1e3ccSAndroid Build Coastguard Worker // i.e., H33-H36 from row3 and H1010-H1013 from row10, etc.
1268*77c1e3ccSAndroid Build Coastguard Worker for (int i = 3; i < wiener_win2; i += wiener_win) {
1269*77c1e3ccSAndroid Build Coastguard Worker // Update the dgd pointers appropriately and also derive the 'j'th iteration
1270*77c1e3ccSAndroid Build Coastguard Worker // from where the H buffer filling needs to be started.
1271*77c1e3ccSAndroid Build Coastguard Worker INITIALIZATION(WIENER_WIN)
1272*77c1e3ccSAndroid Build Coastguard Worker
1273*77c1e3ccSAndroid Build Coastguard Worker do {
1274*77c1e3ccSAndroid Build Coastguard Worker UPDATE_DOWNSAMPLE_FACTOR
1275*77c1e3ccSAndroid Build Coastguard Worker
1276*77c1e3ccSAndroid Build Coastguard Worker // Process the amount of width multiple of 16.
1277*77c1e3ccSAndroid Build Coastguard Worker while (proc_wd < wd_mul16) {
1278*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
1279*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
1280*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
1281*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 4)
1282*77c1e3ccSAndroid Build Coastguard Worker
1283*77c1e3ccSAndroid Build Coastguard Worker proc_wd += 16;
1284*77c1e3ccSAndroid Build Coastguard Worker }
1285*77c1e3ccSAndroid Build Coastguard Worker
1286*77c1e3ccSAndroid Build Coastguard Worker // Process the remaining width here.
1287*77c1e3ccSAndroid Build Coastguard Worker if (wd_beyond_mul16) {
1288*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
1289*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
1290*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
1291*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
1292*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (3 * d_stride), 4)
1293*77c1e3ccSAndroid Build Coastguard Worker }
1294*77c1e3ccSAndroid Build Coastguard Worker proc_ht += downsample_factor;
1295*77c1e3ccSAndroid Build Coastguard Worker d_window += downsample_factor * d_stride;
1296*77c1e3ccSAndroid Build Coastguard Worker d_current_row += downsample_factor * d_stride;
1297*77c1e3ccSAndroid Build Coastguard Worker } while (proc_ht < v_end);
1298*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_h =
1299*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
1300*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
1301*77c1e3ccSAndroid Build Coastguard Worker
1302*77c1e3ccSAndroid Build Coastguard Worker // process the remaining 'j' iterations.
1303*77c1e3ccSAndroid Build Coastguard Worker j++;
1304*77c1e3ccSAndroid Build Coastguard Worker CALCULATE_REMAINING_H_WIN7
1305*77c1e3ccSAndroid Build Coastguard Worker }
1306*77c1e3ccSAndroid Build Coastguard Worker
  // Step 5: Here, the rows 4, 11, 18, 25, 32, 39 and 46 are filled. As we need
  // to fill only upper-triangle elements, H40-H43 from row4, H110-H116 and
  // H117-H1110 from row11, etc. need not be filled. As the core function
  // processes 7 values, in the first iteration of 'j' only 3 values are to be
  // filled, i.e., H44-H46 from row4 and H1111-H1113 from row11, etc.
1312*77c1e3ccSAndroid Build Coastguard Worker for (int i = 4; i < wiener_win2; i += wiener_win) {
1313*77c1e3ccSAndroid Build Coastguard Worker // Update the dgd pointers appropriately and also derive the 'j'th iteration
1314*77c1e3ccSAndroid Build Coastguard Worker // from where the H buffer filling needs to be started.
1315*77c1e3ccSAndroid Build Coastguard Worker INITIALIZATION(WIENER_WIN)
1316*77c1e3ccSAndroid Build Coastguard Worker
1317*77c1e3ccSAndroid Build Coastguard Worker do {
1318*77c1e3ccSAndroid Build Coastguard Worker UPDATE_DOWNSAMPLE_FACTOR
1319*77c1e3ccSAndroid Build Coastguard Worker
1320*77c1e3ccSAndroid Build Coastguard Worker // Process the amount of width multiple of 16.
1321*77c1e3ccSAndroid Build Coastguard Worker while (proc_wd < wd_mul16) {
1322*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
1323*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
1324*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
1325*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 3)
1326*77c1e3ccSAndroid Build Coastguard Worker
1327*77c1e3ccSAndroid Build Coastguard Worker proc_wd += 16;
1328*77c1e3ccSAndroid Build Coastguard Worker }
1329*77c1e3ccSAndroid Build Coastguard Worker
1330*77c1e3ccSAndroid Build Coastguard Worker // Process the remaining width here.
1331*77c1e3ccSAndroid Build Coastguard Worker if (wd_beyond_mul16) {
1332*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
1333*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
1334*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
1335*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
1336*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (4 * d_stride), 3)
1337*77c1e3ccSAndroid Build Coastguard Worker }
1338*77c1e3ccSAndroid Build Coastguard Worker proc_ht += downsample_factor;
1339*77c1e3ccSAndroid Build Coastguard Worker d_window += downsample_factor * d_stride;
1340*77c1e3ccSAndroid Build Coastguard Worker d_current_row += downsample_factor * d_stride;
1341*77c1e3ccSAndroid Build Coastguard Worker } while (proc_ht < v_end);
1342*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_h =
1343*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
1344*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
1345*77c1e3ccSAndroid Build Coastguard Worker
1346*77c1e3ccSAndroid Build Coastguard Worker // process the remaining 'j' iterations.
1347*77c1e3ccSAndroid Build Coastguard Worker j++;
1348*77c1e3ccSAndroid Build Coastguard Worker CALCULATE_REMAINING_H_WIN7
1349*77c1e3ccSAndroid Build Coastguard Worker }
1350*77c1e3ccSAndroid Build Coastguard Worker
1351*77c1e3ccSAndroid Build Coastguard Worker // Step 6: Here, the rows 5, 12, 19, 26, 33, 40 and 47 are filled. As we need
1352*77c1e3ccSAndroid Build Coastguard Worker // to fill only upper-triangle elements, H50-H54 from row5, H120-H126 and
1353*77c1e3ccSAndroid Build Coastguard Worker // H127-H1211 from row12, etc. are need not be filled. As the core function
1354*77c1e3ccSAndroid Build Coastguard Worker // process 7 values, in first iteration of 'j' only 2 values to be filled
1355*77c1e3ccSAndroid Build Coastguard Worker // i.e., H55-H56 from row5 and H1212-H1213 from row12, etc.
1356*77c1e3ccSAndroid Build Coastguard Worker for (int i = 5; i < wiener_win2; i += wiener_win) {
1357*77c1e3ccSAndroid Build Coastguard Worker // Update the dgd pointers appropriately and also derive the 'j'th iteration
1358*77c1e3ccSAndroid Build Coastguard Worker // from where the H buffer filling needs to be started.
1359*77c1e3ccSAndroid Build Coastguard Worker INITIALIZATION(WIENER_WIN)
1360*77c1e3ccSAndroid Build Coastguard Worker do {
1361*77c1e3ccSAndroid Build Coastguard Worker UPDATE_DOWNSAMPLE_FACTOR
1362*77c1e3ccSAndroid Build Coastguard Worker
1363*77c1e3ccSAndroid Build Coastguard Worker // Process the amount of width multiple of 16.
1364*77c1e3ccSAndroid Build Coastguard Worker while (proc_wd < wd_mul16) {
1365*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
1366*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
1367*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
1368*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (5 * d_stride), 2)
1369*77c1e3ccSAndroid Build Coastguard Worker
1370*77c1e3ccSAndroid Build Coastguard Worker proc_wd += 16;
1371*77c1e3ccSAndroid Build Coastguard Worker }
1372*77c1e3ccSAndroid Build Coastguard Worker
1373*77c1e3ccSAndroid Build Coastguard Worker // Process the remaining width here.
1374*77c1e3ccSAndroid Build Coastguard Worker if (wd_beyond_mul16) {
1375*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
1376*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
1377*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
1378*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
1379*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (5 * d_stride), 2)
1380*77c1e3ccSAndroid Build Coastguard Worker }
1381*77c1e3ccSAndroid Build Coastguard Worker proc_ht += downsample_factor;
1382*77c1e3ccSAndroid Build Coastguard Worker d_window += downsample_factor * d_stride;
1383*77c1e3ccSAndroid Build Coastguard Worker d_current_row += downsample_factor * d_stride;
1384*77c1e3ccSAndroid Build Coastguard Worker } while (proc_ht < v_end);
1385*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_h =
1386*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
1387*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + i), s_h);
1388*77c1e3ccSAndroid Build Coastguard Worker
1389*77c1e3ccSAndroid Build Coastguard Worker // process the remaining 'j' iterations.
1390*77c1e3ccSAndroid Build Coastguard Worker j++;
1391*77c1e3ccSAndroid Build Coastguard Worker CALCULATE_REMAINING_H_WIN7
1392*77c1e3ccSAndroid Build Coastguard Worker }
1393*77c1e3ccSAndroid Build Coastguard Worker
1394*77c1e3ccSAndroid Build Coastguard Worker // Step 7: Here, the rows 6, 13, 20, 27, 34, 41 and 48 are filled. As we need
1395*77c1e3ccSAndroid Build Coastguard Worker // to fill only upper-triangle elements, H60-H65 from row6, H130-H136 and
1396*77c1e3ccSAndroid Build Coastguard Worker // H137-H1312 from row13, etc. are need not be filled. As the core function
1397*77c1e3ccSAndroid Build Coastguard Worker // process 7 values, in first iteration of 'j' only 1 value to be filled
1398*77c1e3ccSAndroid Build Coastguard Worker // i.e., H66 from row6 and H1313 from row13, etc.
1399*77c1e3ccSAndroid Build Coastguard Worker for (int i = 6; i < wiener_win2; i += wiener_win) {
1400*77c1e3ccSAndroid Build Coastguard Worker // Update the dgd pointers appropriately and also derive the 'j'th iteration
1401*77c1e3ccSAndroid Build Coastguard Worker // from where the H buffer filling needs to be started.
1402*77c1e3ccSAndroid Build Coastguard Worker INITIALIZATION(WIENER_WIN)
1403*77c1e3ccSAndroid Build Coastguard Worker do {
1404*77c1e3ccSAndroid Build Coastguard Worker UPDATE_DOWNSAMPLE_FACTOR
1405*77c1e3ccSAndroid Build Coastguard Worker
1406*77c1e3ccSAndroid Build Coastguard Worker // Process the amount of width multiple of 16.
1407*77c1e3ccSAndroid Build Coastguard Worker while (proc_wd < wd_mul16) {
1408*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
1409*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
1410*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
1411*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (6 * d_stride), 1)
1412*77c1e3ccSAndroid Build Coastguard Worker
1413*77c1e3ccSAndroid Build Coastguard Worker proc_wd += 16;
1414*77c1e3ccSAndroid Build Coastguard Worker }
1415*77c1e3ccSAndroid Build Coastguard Worker
1416*77c1e3ccSAndroid Build Coastguard Worker // Process the remaining width here.
1417*77c1e3ccSAndroid Build Coastguard Worker if (wd_beyond_mul16) {
1418*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
1419*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
1420*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
1421*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
1422*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + proc_wd + (6 * d_stride), 1)
1423*77c1e3ccSAndroid Build Coastguard Worker }
1424*77c1e3ccSAndroid Build Coastguard Worker proc_ht += downsample_factor;
1425*77c1e3ccSAndroid Build Coastguard Worker d_window += downsample_factor * d_stride;
1426*77c1e3ccSAndroid Build Coastguard Worker d_current_row += downsample_factor * d_stride;
1427*77c1e3ccSAndroid Build Coastguard Worker } while (proc_ht < v_end);
1428*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_h =
1429*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
1430*77c1e3ccSAndroid Build Coastguard Worker xx_storel_64(&H[(i * wiener_win2) + i], _mm256_castsi256_si128(s_h));
1431*77c1e3ccSAndroid Build Coastguard Worker
1432*77c1e3ccSAndroid Build Coastguard Worker // process the remaining 'j' iterations.
1433*77c1e3ccSAndroid Build Coastguard Worker j++;
1434*77c1e3ccSAndroid Build Coastguard Worker CALCULATE_REMAINING_H_WIN7
1435*77c1e3ccSAndroid Build Coastguard Worker }
1436*77c1e3ccSAndroid Build Coastguard Worker
  // Step 8: Here, the rows 7, 14, 21, 28, 35 and 42 are filled. As we need
  // to fill only upper-triangle elements, H70-H76 from row7, H140-H146 and
  // H147-H1413 from row14, etc. need not be filled. The first iteration of
  // 'j' fills H77-H713 from row7 and H1414-H1420 from row14, etc.
1441*77c1e3ccSAndroid Build Coastguard Worker for (int i = 7; i < wiener_win2; i += wiener_win) {
1442*77c1e3ccSAndroid Build Coastguard Worker // Derive j'th iteration from where the H buffer filling needs to be
1443*77c1e3ccSAndroid Build Coastguard Worker // started.
1444*77c1e3ccSAndroid Build Coastguard Worker j = i / wiener_win;
1445*77c1e3ccSAndroid Build Coastguard Worker int shift = 0;
1446*77c1e3ccSAndroid Build Coastguard Worker do {
1447*77c1e3ccSAndroid Build Coastguard Worker // Update the dgd pointers appropriately.
1448*77c1e3ccSAndroid Build Coastguard Worker int proc_ht = v_start;
1449*77c1e3ccSAndroid Build Coastguard Worker const int16_t *d_window = d + (i / WIENER_WIN);
1450*77c1e3ccSAndroid Build Coastguard Worker const int16_t *d_current_row =
1451*77c1e3ccSAndroid Build Coastguard Worker d + (i / WIENER_WIN) + ((i % WIENER_WIN) * d_stride);
1452*77c1e3ccSAndroid Build Coastguard Worker downsample_factor =
1453*77c1e3ccSAndroid Build Coastguard Worker use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1;
1454*77c1e3ccSAndroid Build Coastguard Worker __m256i sum_h[WIENER_WIN] = { _mm256_setzero_si256() };
1455*77c1e3ccSAndroid Build Coastguard Worker do {
1456*77c1e3ccSAndroid Build Coastguard Worker UPDATE_DOWNSAMPLE_FACTOR
1457*77c1e3ccSAndroid Build Coastguard Worker
1458*77c1e3ccSAndroid Build Coastguard Worker // Process the amount of width multiple of 16.
1459*77c1e3ccSAndroid Build Coastguard Worker while (proc_wd < wd_mul16) {
1460*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
1461*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
1462*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd, df_reg);
1463*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + shift + proc_wd, 7)
1464*77c1e3ccSAndroid Build Coastguard Worker
1465*77c1e3ccSAndroid Build Coastguard Worker proc_wd += 16;
1466*77c1e3ccSAndroid Build Coastguard Worker }
1467*77c1e3ccSAndroid Build Coastguard Worker
1468*77c1e3ccSAndroid Build Coastguard Worker // Process the remaining width here.
1469*77c1e3ccSAndroid Build Coastguard Worker if (wd_beyond_mul16) {
1470*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd =
1471*77c1e3ccSAndroid Build Coastguard Worker _mm256_loadu_si256((__m256i *)(d_current_row + proc_wd));
1472*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mask = _mm256_and_si256(dgd, mask);
1473*77c1e3ccSAndroid Build Coastguard Worker const __m256i dgd_mul_df = _mm256_mullo_epi16(dgd_mask, df_reg);
1474*77c1e3ccSAndroid Build Coastguard Worker INIT_H_VALUES(d_window + shift + proc_wd, 7)
1475*77c1e3ccSAndroid Build Coastguard Worker }
1476*77c1e3ccSAndroid Build Coastguard Worker proc_ht += downsample_factor;
1477*77c1e3ccSAndroid Build Coastguard Worker d_window += downsample_factor * d_stride;
1478*77c1e3ccSAndroid Build Coastguard Worker d_current_row += downsample_factor * d_stride;
1479*77c1e3ccSAndroid Build Coastguard Worker } while (proc_ht < v_end);
1480*77c1e3ccSAndroid Build Coastguard Worker
1481*77c1e3ccSAndroid Build Coastguard Worker const __m256i sh_0 =
1482*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_h[0], sum_h[1], &sum_h[2], &sum_h[3]);
1483*77c1e3ccSAndroid Build Coastguard Worker const __m256i sh_1 =
1484*77c1e3ccSAndroid Build Coastguard Worker hadd_four_32_to_64_avx2(sum_h[4], sum_h[5], &sum_h[6], &sum_h[6]);
1485*77c1e3ccSAndroid Build Coastguard Worker _mm256_storeu_si256((__m256i *)(H + (i * wiener_win2) + (wiener_win * j)),
1486*77c1e3ccSAndroid Build Coastguard Worker sh_0);
1487*77c1e3ccSAndroid Build Coastguard Worker _mm_storeu_si128(
1488*77c1e3ccSAndroid Build Coastguard Worker (__m128i *)(H + (i * wiener_win2) + (wiener_win * j) + 4),
1489*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(sh_1));
1490*77c1e3ccSAndroid Build Coastguard Worker _mm_storel_epi64((__m128i *)&H[(i * wiener_win2) + (wiener_win * j) + 6],
1491*77c1e3ccSAndroid Build Coastguard Worker _mm256_extracti128_si256(sh_1, 1));
1492*77c1e3ccSAndroid Build Coastguard Worker shift++;
1493*77c1e3ccSAndroid Build Coastguard Worker } while (++j < wiener_win);
1494*77c1e3ccSAndroid Build Coastguard Worker }
1495*77c1e3ccSAndroid Build Coastguard Worker
1496*77c1e3ccSAndroid Build Coastguard Worker fill_lower_triag_elements_avx2(wiener_win2, H);
1497*77c1e3ccSAndroid Build Coastguard Worker }
1498*77c1e3ccSAndroid Build Coastguard Worker
av1_compute_stats_avx2(int wiener_win,const uint8_t * dgd,const uint8_t * src,int16_t * dgd_avg,int16_t * src_avg,int h_start,int h_end,int v_start,int v_end,int dgd_stride,int src_stride,int64_t * M,int64_t * H,int use_downsampled_wiener_stats)1499*77c1e3ccSAndroid Build Coastguard Worker void av1_compute_stats_avx2(int wiener_win, const uint8_t *dgd,
1500*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src, int16_t *dgd_avg,
1501*77c1e3ccSAndroid Build Coastguard Worker int16_t *src_avg, int h_start, int h_end,
1502*77c1e3ccSAndroid Build Coastguard Worker int v_start, int v_end, int dgd_stride,
1503*77c1e3ccSAndroid Build Coastguard Worker int src_stride, int64_t *M, int64_t *H,
1504*77c1e3ccSAndroid Build Coastguard Worker int use_downsampled_wiener_stats) {
1505*77c1e3ccSAndroid Build Coastguard Worker if (wiener_win != WIENER_WIN && wiener_win != WIENER_WIN_CHROMA) {
1506*77c1e3ccSAndroid Build Coastguard Worker // Currently, libaom supports Wiener filter processing with window sizes as
1507*77c1e3ccSAndroid Build Coastguard Worker // WIENER_WIN_CHROMA(5) and WIENER_WIN(7). For any other window size, SIMD
1508*77c1e3ccSAndroid Build Coastguard Worker // support is not facilitated. Hence, invoke C function for the same.
1509*77c1e3ccSAndroid Build Coastguard Worker av1_compute_stats_c(wiener_win, dgd, src, dgd_avg, src_avg, h_start, h_end,
1510*77c1e3ccSAndroid Build Coastguard Worker v_start, v_end, dgd_stride, src_stride, M, H,
1511*77c1e3ccSAndroid Build Coastguard Worker use_downsampled_wiener_stats);
1512*77c1e3ccSAndroid Build Coastguard Worker return;
1513*77c1e3ccSAndroid Build Coastguard Worker }
1514*77c1e3ccSAndroid Build Coastguard Worker
1515*77c1e3ccSAndroid Build Coastguard Worker const int32_t wiener_halfwin = wiener_win >> 1;
1516*77c1e3ccSAndroid Build Coastguard Worker const uint8_t avg =
1517*77c1e3ccSAndroid Build Coastguard Worker calc_dgd_buf_avg_avx2(dgd, h_start, h_end, v_start, v_end, dgd_stride);
1518*77c1e3ccSAndroid Build Coastguard Worker const int32_t width = h_end - h_start;
1519*77c1e3ccSAndroid Build Coastguard Worker const int32_t height = v_end - v_start;
1520*77c1e3ccSAndroid Build Coastguard Worker const int32_t d_stride = (width + 2 * wiener_halfwin + 15) & ~15;
1521*77c1e3ccSAndroid Build Coastguard Worker const int32_t s_stride = (width + 15) & ~15;
1522*77c1e3ccSAndroid Build Coastguard Worker
1523*77c1e3ccSAndroid Build Coastguard Worker // Based on the sf 'use_downsampled_wiener_stats', process either once for
1524*77c1e3ccSAndroid Build Coastguard Worker // UPDATE_DOWNSAMPLE_FACTOR or for each row.
1525*77c1e3ccSAndroid Build Coastguard Worker sub_avg_block_avx2(src + v_start * src_stride + h_start, src_stride, avg,
1526*77c1e3ccSAndroid Build Coastguard Worker width, height, src_avg, s_stride,
1527*77c1e3ccSAndroid Build Coastguard Worker use_downsampled_wiener_stats);
1528*77c1e3ccSAndroid Build Coastguard Worker
1529*77c1e3ccSAndroid Build Coastguard Worker // Compute (dgd-avg) buffer here which is used to fill H buffer.
1530*77c1e3ccSAndroid Build Coastguard Worker sub_avg_block_avx2(
1531*77c1e3ccSAndroid Build Coastguard Worker dgd + (v_start - wiener_halfwin) * dgd_stride + h_start - wiener_halfwin,
1532*77c1e3ccSAndroid Build Coastguard Worker dgd_stride, avg, width + 2 * wiener_halfwin, height + 2 * wiener_halfwin,
1533*77c1e3ccSAndroid Build Coastguard Worker dgd_avg, d_stride, 0);
1534*77c1e3ccSAndroid Build Coastguard Worker if (wiener_win == WIENER_WIN) {
1535*77c1e3ccSAndroid Build Coastguard Worker compute_stats_win7_avx2(dgd_avg, d_stride, src_avg, s_stride, width,
1536*77c1e3ccSAndroid Build Coastguard Worker v_start, v_end, M, H, use_downsampled_wiener_stats);
1537*77c1e3ccSAndroid Build Coastguard Worker } else if (wiener_win == WIENER_WIN_CHROMA) {
1538*77c1e3ccSAndroid Build Coastguard Worker compute_stats_win5_avx2(dgd_avg, d_stride, src_avg, s_stride, width,
1539*77c1e3ccSAndroid Build Coastguard Worker v_start, v_end, M, H, use_downsampled_wiener_stats);
1540*77c1e3ccSAndroid Build Coastguard Worker }
1541*77c1e3ccSAndroid Build Coastguard Worker }
1542*77c1e3ccSAndroid Build Coastguard Worker
pair_set_epi16(int a,int b)1543*77c1e3ccSAndroid Build Coastguard Worker static inline __m256i pair_set_epi16(int a, int b) {
1544*77c1e3ccSAndroid Build Coastguard Worker return _mm256_set1_epi32(
1545*77c1e3ccSAndroid Build Coastguard Worker (int32_t)(((uint16_t)(a)) | (((uint32_t)(uint16_t)(b)) << 16)));
1546*77c1e3ccSAndroid Build Coastguard Worker }
1547*77c1e3ccSAndroid Build Coastguard Worker
av1_lowbd_pixel_proj_error_avx2(const uint8_t * src8,int width,int height,int src_stride,const uint8_t * dat8,int dat_stride,int32_t * flt0,int flt0_stride,int32_t * flt1,int flt1_stride,int xq[2],const sgr_params_type * params)1548*77c1e3ccSAndroid Build Coastguard Worker int64_t av1_lowbd_pixel_proj_error_avx2(
1549*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src8, int width, int height, int src_stride,
1550*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
1551*77c1e3ccSAndroid Build Coastguard Worker int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
1552*77c1e3ccSAndroid Build Coastguard Worker int i, j, k;
1553*77c1e3ccSAndroid Build Coastguard Worker const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
1554*77c1e3ccSAndroid Build Coastguard Worker const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
1555*77c1e3ccSAndroid Build Coastguard Worker __m256i sum64 = _mm256_setzero_si256();
1556*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src = src8;
1557*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *dat = dat8;
1558*77c1e3ccSAndroid Build Coastguard Worker int64_t err = 0;
1559*77c1e3ccSAndroid Build Coastguard Worker if (params->r[0] > 0 && params->r[1] > 0) {
1560*77c1e3ccSAndroid Build Coastguard Worker __m256i xq_coeff = pair_set_epi16(xq[0], xq[1]);
1561*77c1e3ccSAndroid Build Coastguard Worker for (i = 0; i < height; ++i) {
1562*77c1e3ccSAndroid Build Coastguard Worker __m256i sum32 = _mm256_setzero_si256();
1563*77c1e3ccSAndroid Build Coastguard Worker for (j = 0; j <= width - 16; j += 16) {
1564*77c1e3ccSAndroid Build Coastguard Worker const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
1565*77c1e3ccSAndroid Build Coastguard Worker const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
1566*77c1e3ccSAndroid Build Coastguard Worker const __m256i flt0_16b = _mm256_permute4x64_epi64(
1567*77c1e3ccSAndroid Build Coastguard Worker _mm256_packs_epi32(yy_loadu_256(flt0 + j),
1568*77c1e3ccSAndroid Build Coastguard Worker yy_loadu_256(flt0 + j + 8)),
1569*77c1e3ccSAndroid Build Coastguard Worker 0xd8);
1570*77c1e3ccSAndroid Build Coastguard Worker const __m256i flt1_16b = _mm256_permute4x64_epi64(
1571*77c1e3ccSAndroid Build Coastguard Worker _mm256_packs_epi32(yy_loadu_256(flt1 + j),
1572*77c1e3ccSAndroid Build Coastguard Worker yy_loadu_256(flt1 + j + 8)),
1573*77c1e3ccSAndroid Build Coastguard Worker 0xd8);
1574*77c1e3ccSAndroid Build Coastguard Worker const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
1575*77c1e3ccSAndroid Build Coastguard Worker const __m256i flt0_0_sub_u = _mm256_sub_epi16(flt0_16b, u0);
1576*77c1e3ccSAndroid Build Coastguard Worker const __m256i flt1_0_sub_u = _mm256_sub_epi16(flt1_16b, u0);
1577*77c1e3ccSAndroid Build Coastguard Worker const __m256i v0 = _mm256_madd_epi16(
1578*77c1e3ccSAndroid Build Coastguard Worker xq_coeff, _mm256_unpacklo_epi16(flt0_0_sub_u, flt1_0_sub_u));
1579*77c1e3ccSAndroid Build Coastguard Worker const __m256i v1 = _mm256_madd_epi16(
1580*77c1e3ccSAndroid Build Coastguard Worker xq_coeff, _mm256_unpackhi_epi16(flt0_0_sub_u, flt1_0_sub_u));
1581*77c1e3ccSAndroid Build Coastguard Worker const __m256i vr0 =
1582*77c1e3ccSAndroid Build Coastguard Worker _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
1583*77c1e3ccSAndroid Build Coastguard Worker const __m256i vr1 =
1584*77c1e3ccSAndroid Build Coastguard Worker _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
1585*77c1e3ccSAndroid Build Coastguard Worker const __m256i e0 = _mm256_sub_epi16(
1586*77c1e3ccSAndroid Build Coastguard Worker _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
1587*77c1e3ccSAndroid Build Coastguard Worker const __m256i err0 = _mm256_madd_epi16(e0, e0);
1588*77c1e3ccSAndroid Build Coastguard Worker sum32 = _mm256_add_epi32(sum32, err0);
1589*77c1e3ccSAndroid Build Coastguard Worker }
1590*77c1e3ccSAndroid Build Coastguard Worker for (k = j; k < width; ++k) {
1591*77c1e3ccSAndroid Build Coastguard Worker const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
1592*77c1e3ccSAndroid Build Coastguard Worker int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
1593*77c1e3ccSAndroid Build Coastguard Worker const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
1594*77c1e3ccSAndroid Build Coastguard Worker err += ((int64_t)e * e);
1595*77c1e3ccSAndroid Build Coastguard Worker }
1596*77c1e3ccSAndroid Build Coastguard Worker dat += dat_stride;
1597*77c1e3ccSAndroid Build Coastguard Worker src += src_stride;
1598*77c1e3ccSAndroid Build Coastguard Worker flt0 += flt0_stride;
1599*77c1e3ccSAndroid Build Coastguard Worker flt1 += flt1_stride;
1600*77c1e3ccSAndroid Build Coastguard Worker const __m256i sum64_0 =
1601*77c1e3ccSAndroid Build Coastguard Worker _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
1602*77c1e3ccSAndroid Build Coastguard Worker const __m256i sum64_1 =
1603*77c1e3ccSAndroid Build Coastguard Worker _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
1604*77c1e3ccSAndroid Build Coastguard Worker sum64 = _mm256_add_epi64(sum64, sum64_0);
1605*77c1e3ccSAndroid Build Coastguard Worker sum64 = _mm256_add_epi64(sum64, sum64_1);
1606*77c1e3ccSAndroid Build Coastguard Worker }
1607*77c1e3ccSAndroid Build Coastguard Worker } else if (params->r[0] > 0 || params->r[1] > 0) {
1608*77c1e3ccSAndroid Build Coastguard Worker const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1];
1609*77c1e3ccSAndroid Build Coastguard Worker const __m256i xq_coeff =
1610*77c1e3ccSAndroid Build Coastguard Worker pair_set_epi16(xq_active, -xq_active * (1 << SGRPROJ_RST_BITS));
1611*77c1e3ccSAndroid Build Coastguard Worker const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
1612*77c1e3ccSAndroid Build Coastguard Worker const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
1613*77c1e3ccSAndroid Build Coastguard Worker for (i = 0; i < height; ++i) {
1614*77c1e3ccSAndroid Build Coastguard Worker __m256i sum32 = _mm256_setzero_si256();
1615*77c1e3ccSAndroid Build Coastguard Worker for (j = 0; j <= width - 16; j += 16) {
1616*77c1e3ccSAndroid Build Coastguard Worker const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
1617*77c1e3ccSAndroid Build Coastguard Worker const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
1618*77c1e3ccSAndroid Build Coastguard Worker const __m256i flt_16b = _mm256_permute4x64_epi64(
1619*77c1e3ccSAndroid Build Coastguard Worker _mm256_packs_epi32(yy_loadu_256(flt + j),
1620*77c1e3ccSAndroid Build Coastguard Worker yy_loadu_256(flt + j + 8)),
1621*77c1e3ccSAndroid Build Coastguard Worker 0xd8);
1622*77c1e3ccSAndroid Build Coastguard Worker const __m256i v0 =
1623*77c1e3ccSAndroid Build Coastguard Worker _mm256_madd_epi16(xq_coeff, _mm256_unpacklo_epi16(flt_16b, d0));
1624*77c1e3ccSAndroid Build Coastguard Worker const __m256i v1 =
1625*77c1e3ccSAndroid Build Coastguard Worker _mm256_madd_epi16(xq_coeff, _mm256_unpackhi_epi16(flt_16b, d0));
1626*77c1e3ccSAndroid Build Coastguard Worker const __m256i vr0 =
1627*77c1e3ccSAndroid Build Coastguard Worker _mm256_srai_epi32(_mm256_add_epi32(v0, rounding), shift);
1628*77c1e3ccSAndroid Build Coastguard Worker const __m256i vr1 =
1629*77c1e3ccSAndroid Build Coastguard Worker _mm256_srai_epi32(_mm256_add_epi32(v1, rounding), shift);
1630*77c1e3ccSAndroid Build Coastguard Worker const __m256i e0 = _mm256_sub_epi16(
1631*77c1e3ccSAndroid Build Coastguard Worker _mm256_add_epi16(_mm256_packs_epi32(vr0, vr1), d0), s0);
1632*77c1e3ccSAndroid Build Coastguard Worker const __m256i err0 = _mm256_madd_epi16(e0, e0);
1633*77c1e3ccSAndroid Build Coastguard Worker sum32 = _mm256_add_epi32(sum32, err0);
1634*77c1e3ccSAndroid Build Coastguard Worker }
1635*77c1e3ccSAndroid Build Coastguard Worker for (k = j; k < width; ++k) {
1636*77c1e3ccSAndroid Build Coastguard Worker const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
1637*77c1e3ccSAndroid Build Coastguard Worker int32_t v = xq_active * (flt[k] - u);
1638*77c1e3ccSAndroid Build Coastguard Worker const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
1639*77c1e3ccSAndroid Build Coastguard Worker err += ((int64_t)e * e);
1640*77c1e3ccSAndroid Build Coastguard Worker }
1641*77c1e3ccSAndroid Build Coastguard Worker dat += dat_stride;
1642*77c1e3ccSAndroid Build Coastguard Worker src += src_stride;
1643*77c1e3ccSAndroid Build Coastguard Worker flt += flt_stride;
1644*77c1e3ccSAndroid Build Coastguard Worker const __m256i sum64_0 =
1645*77c1e3ccSAndroid Build Coastguard Worker _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
1646*77c1e3ccSAndroid Build Coastguard Worker const __m256i sum64_1 =
1647*77c1e3ccSAndroid Build Coastguard Worker _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
1648*77c1e3ccSAndroid Build Coastguard Worker sum64 = _mm256_add_epi64(sum64, sum64_0);
1649*77c1e3ccSAndroid Build Coastguard Worker sum64 = _mm256_add_epi64(sum64, sum64_1);
1650*77c1e3ccSAndroid Build Coastguard Worker }
1651*77c1e3ccSAndroid Build Coastguard Worker } else {
1652*77c1e3ccSAndroid Build Coastguard Worker __m256i sum32 = _mm256_setzero_si256();
1653*77c1e3ccSAndroid Build Coastguard Worker for (i = 0; i < height; ++i) {
1654*77c1e3ccSAndroid Build Coastguard Worker for (j = 0; j <= width - 16; j += 16) {
1655*77c1e3ccSAndroid Build Coastguard Worker const __m256i d0 = _mm256_cvtepu8_epi16(xx_loadu_128(dat + j));
1656*77c1e3ccSAndroid Build Coastguard Worker const __m256i s0 = _mm256_cvtepu8_epi16(xx_loadu_128(src + j));
1657*77c1e3ccSAndroid Build Coastguard Worker const __m256i diff0 = _mm256_sub_epi16(d0, s0);
1658*77c1e3ccSAndroid Build Coastguard Worker const __m256i err0 = _mm256_madd_epi16(diff0, diff0);
1659*77c1e3ccSAndroid Build Coastguard Worker sum32 = _mm256_add_epi32(sum32, err0);
1660*77c1e3ccSAndroid Build Coastguard Worker }
1661*77c1e3ccSAndroid Build Coastguard Worker for (k = j; k < width; ++k) {
1662*77c1e3ccSAndroid Build Coastguard Worker const int32_t e = (int32_t)(dat[k]) - src[k];
1663*77c1e3ccSAndroid Build Coastguard Worker err += ((int64_t)e * e);
1664*77c1e3ccSAndroid Build Coastguard Worker }
1665*77c1e3ccSAndroid Build Coastguard Worker dat += dat_stride;
1666*77c1e3ccSAndroid Build Coastguard Worker src += src_stride;
1667*77c1e3ccSAndroid Build Coastguard Worker }
1668*77c1e3ccSAndroid Build Coastguard Worker const __m256i sum64_0 =
1669*77c1e3ccSAndroid Build Coastguard Worker _mm256_cvtepi32_epi64(_mm256_castsi256_si128(sum32));
1670*77c1e3ccSAndroid Build Coastguard Worker const __m256i sum64_1 =
1671*77c1e3ccSAndroid Build Coastguard Worker _mm256_cvtepi32_epi64(_mm256_extracti128_si256(sum32, 1));
1672*77c1e3ccSAndroid Build Coastguard Worker sum64 = _mm256_add_epi64(sum64_0, sum64_1);
1673*77c1e3ccSAndroid Build Coastguard Worker }
1674*77c1e3ccSAndroid Build Coastguard Worker int64_t sum[4];
1675*77c1e3ccSAndroid Build Coastguard Worker yy_storeu_256(sum, sum64);
1676*77c1e3ccSAndroid Build Coastguard Worker err += sum[0] + sum[1] + sum[2] + sum[3];
1677*77c1e3ccSAndroid Build Coastguard Worker return err;
1678*77c1e3ccSAndroid Build Coastguard Worker }
1679*77c1e3ccSAndroid Build Coastguard Worker
1680*77c1e3ccSAndroid Build Coastguard Worker // When params->r[0] > 0 and params->r[1] > 0. In this case all elements of
1681*77c1e3ccSAndroid Build Coastguard Worker // C and H need to be computed.
calc_proj_params_r0_r1_avx2(const uint8_t * src8,int width,int height,int src_stride,const uint8_t * dat8,int dat_stride,int32_t * flt0,int flt0_stride,int32_t * flt1,int flt1_stride,int64_t H[2][2],int64_t C[2])1682*77c1e3ccSAndroid Build Coastguard Worker static inline void calc_proj_params_r0_r1_avx2(
1683*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src8, int width, int height, int src_stride,
1684*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
1685*77c1e3ccSAndroid Build Coastguard Worker int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
1686*77c1e3ccSAndroid Build Coastguard Worker const int size = width * height;
1687*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src = src8;
1688*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *dat = dat8;
1689*77c1e3ccSAndroid Build Coastguard Worker __m256i h00, h01, h11, c0, c1;
1690*77c1e3ccSAndroid Build Coastguard Worker const __m256i zero = _mm256_setzero_si256();
1691*77c1e3ccSAndroid Build Coastguard Worker h01 = h11 = c0 = c1 = h00 = zero;
1692*77c1e3ccSAndroid Build Coastguard Worker
1693*77c1e3ccSAndroid Build Coastguard Worker for (int i = 0; i < height; ++i) {
1694*77c1e3ccSAndroid Build Coastguard Worker for (int j = 0; j < width; j += 8) {
1695*77c1e3ccSAndroid Build Coastguard Worker const __m256i u_load = _mm256_cvtepu8_epi32(
1696*77c1e3ccSAndroid Build Coastguard Worker _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
1697*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_load = _mm256_cvtepu8_epi32(
1698*77c1e3ccSAndroid Build Coastguard Worker _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
1699*77c1e3ccSAndroid Build Coastguard Worker __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
1700*77c1e3ccSAndroid Build Coastguard Worker __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
1701*77c1e3ccSAndroid Build Coastguard Worker __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
1702*77c1e3ccSAndroid Build Coastguard Worker __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
1703*77c1e3ccSAndroid Build Coastguard Worker s = _mm256_sub_epi32(s, d);
1704*77c1e3ccSAndroid Build Coastguard Worker f1 = _mm256_sub_epi32(f1, d);
1705*77c1e3ccSAndroid Build Coastguard Worker f2 = _mm256_sub_epi32(f2, d);
1706*77c1e3ccSAndroid Build Coastguard Worker
1707*77c1e3ccSAndroid Build Coastguard Worker const __m256i h00_even = _mm256_mul_epi32(f1, f1);
1708*77c1e3ccSAndroid Build Coastguard Worker const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
1709*77c1e3ccSAndroid Build Coastguard Worker _mm256_srli_epi64(f1, 32));
1710*77c1e3ccSAndroid Build Coastguard Worker h00 = _mm256_add_epi64(h00, h00_even);
1711*77c1e3ccSAndroid Build Coastguard Worker h00 = _mm256_add_epi64(h00, h00_odd);
1712*77c1e3ccSAndroid Build Coastguard Worker
1713*77c1e3ccSAndroid Build Coastguard Worker const __m256i h01_even = _mm256_mul_epi32(f1, f2);
1714*77c1e3ccSAndroid Build Coastguard Worker const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
1715*77c1e3ccSAndroid Build Coastguard Worker _mm256_srli_epi64(f2, 32));
1716*77c1e3ccSAndroid Build Coastguard Worker h01 = _mm256_add_epi64(h01, h01_even);
1717*77c1e3ccSAndroid Build Coastguard Worker h01 = _mm256_add_epi64(h01, h01_odd);
1718*77c1e3ccSAndroid Build Coastguard Worker
1719*77c1e3ccSAndroid Build Coastguard Worker const __m256i h11_even = _mm256_mul_epi32(f2, f2);
1720*77c1e3ccSAndroid Build Coastguard Worker const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
1721*77c1e3ccSAndroid Build Coastguard Worker _mm256_srli_epi64(f2, 32));
1722*77c1e3ccSAndroid Build Coastguard Worker h11 = _mm256_add_epi64(h11, h11_even);
1723*77c1e3ccSAndroid Build Coastguard Worker h11 = _mm256_add_epi64(h11, h11_odd);
1724*77c1e3ccSAndroid Build Coastguard Worker
1725*77c1e3ccSAndroid Build Coastguard Worker const __m256i c0_even = _mm256_mul_epi32(f1, s);
1726*77c1e3ccSAndroid Build Coastguard Worker const __m256i c0_odd =
1727*77c1e3ccSAndroid Build Coastguard Worker _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
1728*77c1e3ccSAndroid Build Coastguard Worker c0 = _mm256_add_epi64(c0, c0_even);
1729*77c1e3ccSAndroid Build Coastguard Worker c0 = _mm256_add_epi64(c0, c0_odd);
1730*77c1e3ccSAndroid Build Coastguard Worker
1731*77c1e3ccSAndroid Build Coastguard Worker const __m256i c1_even = _mm256_mul_epi32(f2, s);
1732*77c1e3ccSAndroid Build Coastguard Worker const __m256i c1_odd =
1733*77c1e3ccSAndroid Build Coastguard Worker _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
1734*77c1e3ccSAndroid Build Coastguard Worker c1 = _mm256_add_epi64(c1, c1_even);
1735*77c1e3ccSAndroid Build Coastguard Worker c1 = _mm256_add_epi64(c1, c1_odd);
1736*77c1e3ccSAndroid Build Coastguard Worker }
1737*77c1e3ccSAndroid Build Coastguard Worker }
1738*77c1e3ccSAndroid Build Coastguard Worker
1739*77c1e3ccSAndroid Build Coastguard Worker __m256i c_low = _mm256_unpacklo_epi64(c0, c1);
1740*77c1e3ccSAndroid Build Coastguard Worker const __m256i c_high = _mm256_unpackhi_epi64(c0, c1);
1741*77c1e3ccSAndroid Build Coastguard Worker c_low = _mm256_add_epi64(c_low, c_high);
1742*77c1e3ccSAndroid Build Coastguard Worker const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1),
1743*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(c_low));
1744*77c1e3ccSAndroid Build Coastguard Worker
1745*77c1e3ccSAndroid Build Coastguard Worker __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01);
1746*77c1e3ccSAndroid Build Coastguard Worker const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01);
1747*77c1e3ccSAndroid Build Coastguard Worker h0x_low = _mm256_add_epi64(h0x_low, h0x_high);
1748*77c1e3ccSAndroid Build Coastguard Worker const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1),
1749*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(h0x_low));
1750*77c1e3ccSAndroid Build Coastguard Worker
1751*77c1e3ccSAndroid Build Coastguard Worker // Using the symmetric properties of H, calculations of H[1][0] are not
1752*77c1e3ccSAndroid Build Coastguard Worker // needed.
1753*77c1e3ccSAndroid Build Coastguard Worker __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11);
1754*77c1e3ccSAndroid Build Coastguard Worker const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11);
1755*77c1e3ccSAndroid Build Coastguard Worker h1x_low = _mm256_add_epi64(h1x_low, h1x_high);
1756*77c1e3ccSAndroid Build Coastguard Worker const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1),
1757*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(h1x_low));
1758*77c1e3ccSAndroid Build Coastguard Worker
1759*77c1e3ccSAndroid Build Coastguard Worker xx_storeu_128(C, c_128bit);
1760*77c1e3ccSAndroid Build Coastguard Worker xx_storeu_128(H[0], h0x_128bit);
1761*77c1e3ccSAndroid Build Coastguard Worker xx_storeu_128(H[1], h1x_128bit);
1762*77c1e3ccSAndroid Build Coastguard Worker
1763*77c1e3ccSAndroid Build Coastguard Worker H[0][0] /= size;
1764*77c1e3ccSAndroid Build Coastguard Worker H[0][1] /= size;
1765*77c1e3ccSAndroid Build Coastguard Worker H[1][1] /= size;
1766*77c1e3ccSAndroid Build Coastguard Worker
1767*77c1e3ccSAndroid Build Coastguard Worker // Since H is a symmetric matrix
1768*77c1e3ccSAndroid Build Coastguard Worker H[1][0] = H[0][1];
1769*77c1e3ccSAndroid Build Coastguard Worker C[0] /= size;
1770*77c1e3ccSAndroid Build Coastguard Worker C[1] /= size;
1771*77c1e3ccSAndroid Build Coastguard Worker }
1772*77c1e3ccSAndroid Build Coastguard Worker
1773*77c1e3ccSAndroid Build Coastguard Worker // When only params->r[0] > 0. In this case only H[0][0] and C[0] are
1774*77c1e3ccSAndroid Build Coastguard Worker // non-zero and need to be computed.
calc_proj_params_r0_avx2(const uint8_t * src8,int width,int height,int src_stride,const uint8_t * dat8,int dat_stride,int32_t * flt0,int flt0_stride,int64_t H[2][2],int64_t C[2])1775*77c1e3ccSAndroid Build Coastguard Worker static inline void calc_proj_params_r0_avx2(const uint8_t *src8, int width,
1776*77c1e3ccSAndroid Build Coastguard Worker int height, int src_stride,
1777*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *dat8, int dat_stride,
1778*77c1e3ccSAndroid Build Coastguard Worker int32_t *flt0, int flt0_stride,
1779*77c1e3ccSAndroid Build Coastguard Worker int64_t H[2][2], int64_t C[2]) {
1780*77c1e3ccSAndroid Build Coastguard Worker const int size = width * height;
1781*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src = src8;
1782*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *dat = dat8;
1783*77c1e3ccSAndroid Build Coastguard Worker __m256i h00, c0;
1784*77c1e3ccSAndroid Build Coastguard Worker const __m256i zero = _mm256_setzero_si256();
1785*77c1e3ccSAndroid Build Coastguard Worker c0 = h00 = zero;
1786*77c1e3ccSAndroid Build Coastguard Worker
1787*77c1e3ccSAndroid Build Coastguard Worker for (int i = 0; i < height; ++i) {
1788*77c1e3ccSAndroid Build Coastguard Worker for (int j = 0; j < width; j += 8) {
1789*77c1e3ccSAndroid Build Coastguard Worker const __m256i u_load = _mm256_cvtepu8_epi32(
1790*77c1e3ccSAndroid Build Coastguard Worker _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
1791*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_load = _mm256_cvtepu8_epi32(
1792*77c1e3ccSAndroid Build Coastguard Worker _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
1793*77c1e3ccSAndroid Build Coastguard Worker __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
1794*77c1e3ccSAndroid Build Coastguard Worker __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
1795*77c1e3ccSAndroid Build Coastguard Worker __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
1796*77c1e3ccSAndroid Build Coastguard Worker s = _mm256_sub_epi32(s, d);
1797*77c1e3ccSAndroid Build Coastguard Worker f1 = _mm256_sub_epi32(f1, d);
1798*77c1e3ccSAndroid Build Coastguard Worker
1799*77c1e3ccSAndroid Build Coastguard Worker const __m256i h00_even = _mm256_mul_epi32(f1, f1);
1800*77c1e3ccSAndroid Build Coastguard Worker const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
1801*77c1e3ccSAndroid Build Coastguard Worker _mm256_srli_epi64(f1, 32));
1802*77c1e3ccSAndroid Build Coastguard Worker h00 = _mm256_add_epi64(h00, h00_even);
1803*77c1e3ccSAndroid Build Coastguard Worker h00 = _mm256_add_epi64(h00, h00_odd);
1804*77c1e3ccSAndroid Build Coastguard Worker
1805*77c1e3ccSAndroid Build Coastguard Worker const __m256i c0_even = _mm256_mul_epi32(f1, s);
1806*77c1e3ccSAndroid Build Coastguard Worker const __m256i c0_odd =
1807*77c1e3ccSAndroid Build Coastguard Worker _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
1808*77c1e3ccSAndroid Build Coastguard Worker c0 = _mm256_add_epi64(c0, c0_even);
1809*77c1e3ccSAndroid Build Coastguard Worker c0 = _mm256_add_epi64(c0, c0_odd);
1810*77c1e3ccSAndroid Build Coastguard Worker }
1811*77c1e3ccSAndroid Build Coastguard Worker }
1812*77c1e3ccSAndroid Build Coastguard Worker const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1),
1813*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(h00));
1814*77c1e3ccSAndroid Build Coastguard Worker const __m128i h00_val =
1815*77c1e3ccSAndroid Build Coastguard Worker _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8));
1816*77c1e3ccSAndroid Build Coastguard Worker
1817*77c1e3ccSAndroid Build Coastguard Worker const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1),
1818*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(c0));
1819*77c1e3ccSAndroid Build Coastguard Worker const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8));
1820*77c1e3ccSAndroid Build Coastguard Worker
1821*77c1e3ccSAndroid Build Coastguard Worker const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero));
1822*77c1e3ccSAndroid Build Coastguard Worker const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero));
1823*77c1e3ccSAndroid Build Coastguard Worker
1824*77c1e3ccSAndroid Build Coastguard Worker xx_storeu_128(C, c);
1825*77c1e3ccSAndroid Build Coastguard Worker xx_storeu_128(H[0], h0x);
1826*77c1e3ccSAndroid Build Coastguard Worker
1827*77c1e3ccSAndroid Build Coastguard Worker H[0][0] /= size;
1828*77c1e3ccSAndroid Build Coastguard Worker C[0] /= size;
1829*77c1e3ccSAndroid Build Coastguard Worker }
1830*77c1e3ccSAndroid Build Coastguard Worker
1831*77c1e3ccSAndroid Build Coastguard Worker // When only params->r[1] > 0. In this case only H[1][1] and C[1] are
1832*77c1e3ccSAndroid Build Coastguard Worker // non-zero and need to be computed.
calc_proj_params_r1_avx2(const uint8_t * src8,int width,int height,int src_stride,const uint8_t * dat8,int dat_stride,int32_t * flt1,int flt1_stride,int64_t H[2][2],int64_t C[2])1833*77c1e3ccSAndroid Build Coastguard Worker static inline void calc_proj_params_r1_avx2(const uint8_t *src8, int width,
1834*77c1e3ccSAndroid Build Coastguard Worker int height, int src_stride,
1835*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *dat8, int dat_stride,
1836*77c1e3ccSAndroid Build Coastguard Worker int32_t *flt1, int flt1_stride,
1837*77c1e3ccSAndroid Build Coastguard Worker int64_t H[2][2], int64_t C[2]) {
1838*77c1e3ccSAndroid Build Coastguard Worker const int size = width * height;
1839*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src = src8;
1840*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *dat = dat8;
1841*77c1e3ccSAndroid Build Coastguard Worker __m256i h11, c1;
1842*77c1e3ccSAndroid Build Coastguard Worker const __m256i zero = _mm256_setzero_si256();
1843*77c1e3ccSAndroid Build Coastguard Worker c1 = h11 = zero;
1844*77c1e3ccSAndroid Build Coastguard Worker
1845*77c1e3ccSAndroid Build Coastguard Worker for (int i = 0; i < height; ++i) {
1846*77c1e3ccSAndroid Build Coastguard Worker for (int j = 0; j < width; j += 8) {
1847*77c1e3ccSAndroid Build Coastguard Worker const __m256i u_load = _mm256_cvtepu8_epi32(
1848*77c1e3ccSAndroid Build Coastguard Worker _mm_loadl_epi64((__m128i *)(dat + i * dat_stride + j)));
1849*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_load = _mm256_cvtepu8_epi32(
1850*77c1e3ccSAndroid Build Coastguard Worker _mm_loadl_epi64((__m128i *)(src + i * src_stride + j)));
1851*77c1e3ccSAndroid Build Coastguard Worker __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
1852*77c1e3ccSAndroid Build Coastguard Worker __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
1853*77c1e3ccSAndroid Build Coastguard Worker __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
1854*77c1e3ccSAndroid Build Coastguard Worker s = _mm256_sub_epi32(s, d);
1855*77c1e3ccSAndroid Build Coastguard Worker f2 = _mm256_sub_epi32(f2, d);
1856*77c1e3ccSAndroid Build Coastguard Worker
1857*77c1e3ccSAndroid Build Coastguard Worker const __m256i h11_even = _mm256_mul_epi32(f2, f2);
1858*77c1e3ccSAndroid Build Coastguard Worker const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
1859*77c1e3ccSAndroid Build Coastguard Worker _mm256_srli_epi64(f2, 32));
1860*77c1e3ccSAndroid Build Coastguard Worker h11 = _mm256_add_epi64(h11, h11_even);
1861*77c1e3ccSAndroid Build Coastguard Worker h11 = _mm256_add_epi64(h11, h11_odd);
1862*77c1e3ccSAndroid Build Coastguard Worker
1863*77c1e3ccSAndroid Build Coastguard Worker const __m256i c1_even = _mm256_mul_epi32(f2, s);
1864*77c1e3ccSAndroid Build Coastguard Worker const __m256i c1_odd =
1865*77c1e3ccSAndroid Build Coastguard Worker _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
1866*77c1e3ccSAndroid Build Coastguard Worker c1 = _mm256_add_epi64(c1, c1_even);
1867*77c1e3ccSAndroid Build Coastguard Worker c1 = _mm256_add_epi64(c1, c1_odd);
1868*77c1e3ccSAndroid Build Coastguard Worker }
1869*77c1e3ccSAndroid Build Coastguard Worker }
1870*77c1e3ccSAndroid Build Coastguard Worker
1871*77c1e3ccSAndroid Build Coastguard Worker const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1),
1872*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(h11));
1873*77c1e3ccSAndroid Build Coastguard Worker const __m128i h11_val =
1874*77c1e3ccSAndroid Build Coastguard Worker _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8));
1875*77c1e3ccSAndroid Build Coastguard Worker
1876*77c1e3ccSAndroid Build Coastguard Worker const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1),
1877*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(c1));
1878*77c1e3ccSAndroid Build Coastguard Worker const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8));
1879*77c1e3ccSAndroid Build Coastguard Worker
1880*77c1e3ccSAndroid Build Coastguard Worker const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val);
1881*77c1e3ccSAndroid Build Coastguard Worker const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val);
1882*77c1e3ccSAndroid Build Coastguard Worker
1883*77c1e3ccSAndroid Build Coastguard Worker xx_storeu_128(C, c);
1884*77c1e3ccSAndroid Build Coastguard Worker xx_storeu_128(H[1], h1x);
1885*77c1e3ccSAndroid Build Coastguard Worker
1886*77c1e3ccSAndroid Build Coastguard Worker H[1][1] /= size;
1887*77c1e3ccSAndroid Build Coastguard Worker C[1] /= size;
1888*77c1e3ccSAndroid Build Coastguard Worker }
1889*77c1e3ccSAndroid Build Coastguard Worker
1890*77c1e3ccSAndroid Build Coastguard Worker // AVX2 variant of av1_calc_proj_params_c.
av1_calc_proj_params_avx2(const uint8_t * src8,int width,int height,int src_stride,const uint8_t * dat8,int dat_stride,int32_t * flt0,int flt0_stride,int32_t * flt1,int flt1_stride,int64_t H[2][2],int64_t C[2],const sgr_params_type * params)1891*77c1e3ccSAndroid Build Coastguard Worker void av1_calc_proj_params_avx2(const uint8_t *src8, int width, int height,
1892*77c1e3ccSAndroid Build Coastguard Worker int src_stride, const uint8_t *dat8,
1893*77c1e3ccSAndroid Build Coastguard Worker int dat_stride, int32_t *flt0, int flt0_stride,
1894*77c1e3ccSAndroid Build Coastguard Worker int32_t *flt1, int flt1_stride, int64_t H[2][2],
1895*77c1e3ccSAndroid Build Coastguard Worker int64_t C[2], const sgr_params_type *params) {
1896*77c1e3ccSAndroid Build Coastguard Worker if ((params->r[0] > 0) && (params->r[1] > 0)) {
1897*77c1e3ccSAndroid Build Coastguard Worker calc_proj_params_r0_r1_avx2(src8, width, height, src_stride, dat8,
1898*77c1e3ccSAndroid Build Coastguard Worker dat_stride, flt0, flt0_stride, flt1,
1899*77c1e3ccSAndroid Build Coastguard Worker flt1_stride, H, C);
1900*77c1e3ccSAndroid Build Coastguard Worker } else if (params->r[0] > 0) {
1901*77c1e3ccSAndroid Build Coastguard Worker calc_proj_params_r0_avx2(src8, width, height, src_stride, dat8, dat_stride,
1902*77c1e3ccSAndroid Build Coastguard Worker flt0, flt0_stride, H, C);
1903*77c1e3ccSAndroid Build Coastguard Worker } else if (params->r[1] > 0) {
1904*77c1e3ccSAndroid Build Coastguard Worker calc_proj_params_r1_avx2(src8, width, height, src_stride, dat8, dat_stride,
1905*77c1e3ccSAndroid Build Coastguard Worker flt1, flt1_stride, H, C);
1906*77c1e3ccSAndroid Build Coastguard Worker }
1907*77c1e3ccSAndroid Build Coastguard Worker }
1908*77c1e3ccSAndroid Build Coastguard Worker
1909*77c1e3ccSAndroid Build Coastguard Worker #if CONFIG_AV1_HIGHBITDEPTH
calc_proj_params_r0_r1_high_bd_avx2(const uint8_t * src8,int width,int height,int src_stride,const uint8_t * dat8,int dat_stride,int32_t * flt0,int flt0_stride,int32_t * flt1,int flt1_stride,int64_t H[2][2],int64_t C[2])1910*77c1e3ccSAndroid Build Coastguard Worker static inline void calc_proj_params_r0_r1_high_bd_avx2(
1911*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src8, int width, int height, int src_stride,
1912*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
1913*77c1e3ccSAndroid Build Coastguard Worker int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2]) {
1914*77c1e3ccSAndroid Build Coastguard Worker const int size = width * height;
1915*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
1916*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
1917*77c1e3ccSAndroid Build Coastguard Worker __m256i h00, h01, h11, c0, c1;
1918*77c1e3ccSAndroid Build Coastguard Worker const __m256i zero = _mm256_setzero_si256();
1919*77c1e3ccSAndroid Build Coastguard Worker h01 = h11 = c0 = c1 = h00 = zero;
1920*77c1e3ccSAndroid Build Coastguard Worker
1921*77c1e3ccSAndroid Build Coastguard Worker for (int i = 0; i < height; ++i) {
1922*77c1e3ccSAndroid Build Coastguard Worker for (int j = 0; j < width; j += 8) {
1923*77c1e3ccSAndroid Build Coastguard Worker const __m256i u_load = _mm256_cvtepu16_epi32(
1924*77c1e3ccSAndroid Build Coastguard Worker _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
1925*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_load = _mm256_cvtepu16_epi32(
1926*77c1e3ccSAndroid Build Coastguard Worker _mm_load_si128((__m128i *)(src + i * src_stride + j)));
1927*77c1e3ccSAndroid Build Coastguard Worker __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
1928*77c1e3ccSAndroid Build Coastguard Worker __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
1929*77c1e3ccSAndroid Build Coastguard Worker __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
1930*77c1e3ccSAndroid Build Coastguard Worker __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
1931*77c1e3ccSAndroid Build Coastguard Worker s = _mm256_sub_epi32(s, d);
1932*77c1e3ccSAndroid Build Coastguard Worker f1 = _mm256_sub_epi32(f1, d);
1933*77c1e3ccSAndroid Build Coastguard Worker f2 = _mm256_sub_epi32(f2, d);
1934*77c1e3ccSAndroid Build Coastguard Worker
1935*77c1e3ccSAndroid Build Coastguard Worker const __m256i h00_even = _mm256_mul_epi32(f1, f1);
1936*77c1e3ccSAndroid Build Coastguard Worker const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
1937*77c1e3ccSAndroid Build Coastguard Worker _mm256_srli_epi64(f1, 32));
1938*77c1e3ccSAndroid Build Coastguard Worker h00 = _mm256_add_epi64(h00, h00_even);
1939*77c1e3ccSAndroid Build Coastguard Worker h00 = _mm256_add_epi64(h00, h00_odd);
1940*77c1e3ccSAndroid Build Coastguard Worker
1941*77c1e3ccSAndroid Build Coastguard Worker const __m256i h01_even = _mm256_mul_epi32(f1, f2);
1942*77c1e3ccSAndroid Build Coastguard Worker const __m256i h01_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
1943*77c1e3ccSAndroid Build Coastguard Worker _mm256_srli_epi64(f2, 32));
1944*77c1e3ccSAndroid Build Coastguard Worker h01 = _mm256_add_epi64(h01, h01_even);
1945*77c1e3ccSAndroid Build Coastguard Worker h01 = _mm256_add_epi64(h01, h01_odd);
1946*77c1e3ccSAndroid Build Coastguard Worker
1947*77c1e3ccSAndroid Build Coastguard Worker const __m256i h11_even = _mm256_mul_epi32(f2, f2);
1948*77c1e3ccSAndroid Build Coastguard Worker const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
1949*77c1e3ccSAndroid Build Coastguard Worker _mm256_srli_epi64(f2, 32));
1950*77c1e3ccSAndroid Build Coastguard Worker h11 = _mm256_add_epi64(h11, h11_even);
1951*77c1e3ccSAndroid Build Coastguard Worker h11 = _mm256_add_epi64(h11, h11_odd);
1952*77c1e3ccSAndroid Build Coastguard Worker
1953*77c1e3ccSAndroid Build Coastguard Worker const __m256i c0_even = _mm256_mul_epi32(f1, s);
1954*77c1e3ccSAndroid Build Coastguard Worker const __m256i c0_odd =
1955*77c1e3ccSAndroid Build Coastguard Worker _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
1956*77c1e3ccSAndroid Build Coastguard Worker c0 = _mm256_add_epi64(c0, c0_even);
1957*77c1e3ccSAndroid Build Coastguard Worker c0 = _mm256_add_epi64(c0, c0_odd);
1958*77c1e3ccSAndroid Build Coastguard Worker
1959*77c1e3ccSAndroid Build Coastguard Worker const __m256i c1_even = _mm256_mul_epi32(f2, s);
1960*77c1e3ccSAndroid Build Coastguard Worker const __m256i c1_odd =
1961*77c1e3ccSAndroid Build Coastguard Worker _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
1962*77c1e3ccSAndroid Build Coastguard Worker c1 = _mm256_add_epi64(c1, c1_even);
1963*77c1e3ccSAndroid Build Coastguard Worker c1 = _mm256_add_epi64(c1, c1_odd);
1964*77c1e3ccSAndroid Build Coastguard Worker }
1965*77c1e3ccSAndroid Build Coastguard Worker }
1966*77c1e3ccSAndroid Build Coastguard Worker
1967*77c1e3ccSAndroid Build Coastguard Worker __m256i c_low = _mm256_unpacklo_epi64(c0, c1);
1968*77c1e3ccSAndroid Build Coastguard Worker const __m256i c_high = _mm256_unpackhi_epi64(c0, c1);
1969*77c1e3ccSAndroid Build Coastguard Worker c_low = _mm256_add_epi64(c_low, c_high);
1970*77c1e3ccSAndroid Build Coastguard Worker const __m128i c_128bit = _mm_add_epi64(_mm256_extracti128_si256(c_low, 1),
1971*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(c_low));
1972*77c1e3ccSAndroid Build Coastguard Worker
1973*77c1e3ccSAndroid Build Coastguard Worker __m256i h0x_low = _mm256_unpacklo_epi64(h00, h01);
1974*77c1e3ccSAndroid Build Coastguard Worker const __m256i h0x_high = _mm256_unpackhi_epi64(h00, h01);
1975*77c1e3ccSAndroid Build Coastguard Worker h0x_low = _mm256_add_epi64(h0x_low, h0x_high);
1976*77c1e3ccSAndroid Build Coastguard Worker const __m128i h0x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h0x_low, 1),
1977*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(h0x_low));
1978*77c1e3ccSAndroid Build Coastguard Worker
1979*77c1e3ccSAndroid Build Coastguard Worker // Using the symmetric properties of H, calculations of H[1][0] are not
1980*77c1e3ccSAndroid Build Coastguard Worker // needed.
1981*77c1e3ccSAndroid Build Coastguard Worker __m256i h1x_low = _mm256_unpacklo_epi64(zero, h11);
1982*77c1e3ccSAndroid Build Coastguard Worker const __m256i h1x_high = _mm256_unpackhi_epi64(zero, h11);
1983*77c1e3ccSAndroid Build Coastguard Worker h1x_low = _mm256_add_epi64(h1x_low, h1x_high);
1984*77c1e3ccSAndroid Build Coastguard Worker const __m128i h1x_128bit = _mm_add_epi64(_mm256_extracti128_si256(h1x_low, 1),
1985*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(h1x_low));
1986*77c1e3ccSAndroid Build Coastguard Worker
1987*77c1e3ccSAndroid Build Coastguard Worker xx_storeu_128(C, c_128bit);
1988*77c1e3ccSAndroid Build Coastguard Worker xx_storeu_128(H[0], h0x_128bit);
1989*77c1e3ccSAndroid Build Coastguard Worker xx_storeu_128(H[1], h1x_128bit);
1990*77c1e3ccSAndroid Build Coastguard Worker
1991*77c1e3ccSAndroid Build Coastguard Worker H[0][0] /= size;
1992*77c1e3ccSAndroid Build Coastguard Worker H[0][1] /= size;
1993*77c1e3ccSAndroid Build Coastguard Worker H[1][1] /= size;
1994*77c1e3ccSAndroid Build Coastguard Worker
1995*77c1e3ccSAndroid Build Coastguard Worker // Since H is a symmetric matrix
1996*77c1e3ccSAndroid Build Coastguard Worker H[1][0] = H[0][1];
1997*77c1e3ccSAndroid Build Coastguard Worker C[0] /= size;
1998*77c1e3ccSAndroid Build Coastguard Worker C[1] /= size;
1999*77c1e3ccSAndroid Build Coastguard Worker }
2000*77c1e3ccSAndroid Build Coastguard Worker
calc_proj_params_r0_high_bd_avx2(const uint8_t * src8,int width,int height,int src_stride,const uint8_t * dat8,int dat_stride,int32_t * flt0,int flt0_stride,int64_t H[2][2],int64_t C[2])2001*77c1e3ccSAndroid Build Coastguard Worker static inline void calc_proj_params_r0_high_bd_avx2(
2002*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src8, int width, int height, int src_stride,
2003*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
2004*77c1e3ccSAndroid Build Coastguard Worker int64_t H[2][2], int64_t C[2]) {
2005*77c1e3ccSAndroid Build Coastguard Worker const int size = width * height;
2006*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
2007*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
2008*77c1e3ccSAndroid Build Coastguard Worker __m256i h00, c0;
2009*77c1e3ccSAndroid Build Coastguard Worker const __m256i zero = _mm256_setzero_si256();
2010*77c1e3ccSAndroid Build Coastguard Worker c0 = h00 = zero;
2011*77c1e3ccSAndroid Build Coastguard Worker
2012*77c1e3ccSAndroid Build Coastguard Worker for (int i = 0; i < height; ++i) {
2013*77c1e3ccSAndroid Build Coastguard Worker for (int j = 0; j < width; j += 8) {
2014*77c1e3ccSAndroid Build Coastguard Worker const __m256i u_load = _mm256_cvtepu16_epi32(
2015*77c1e3ccSAndroid Build Coastguard Worker _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
2016*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_load = _mm256_cvtepu16_epi32(
2017*77c1e3ccSAndroid Build Coastguard Worker _mm_load_si128((__m128i *)(src + i * src_stride + j)));
2018*77c1e3ccSAndroid Build Coastguard Worker __m256i f1 = _mm256_loadu_si256((__m256i *)(flt0 + i * flt0_stride + j));
2019*77c1e3ccSAndroid Build Coastguard Worker __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
2020*77c1e3ccSAndroid Build Coastguard Worker __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
2021*77c1e3ccSAndroid Build Coastguard Worker s = _mm256_sub_epi32(s, d);
2022*77c1e3ccSAndroid Build Coastguard Worker f1 = _mm256_sub_epi32(f1, d);
2023*77c1e3ccSAndroid Build Coastguard Worker
2024*77c1e3ccSAndroid Build Coastguard Worker const __m256i h00_even = _mm256_mul_epi32(f1, f1);
2025*77c1e3ccSAndroid Build Coastguard Worker const __m256i h00_odd = _mm256_mul_epi32(_mm256_srli_epi64(f1, 32),
2026*77c1e3ccSAndroid Build Coastguard Worker _mm256_srli_epi64(f1, 32));
2027*77c1e3ccSAndroid Build Coastguard Worker h00 = _mm256_add_epi64(h00, h00_even);
2028*77c1e3ccSAndroid Build Coastguard Worker h00 = _mm256_add_epi64(h00, h00_odd);
2029*77c1e3ccSAndroid Build Coastguard Worker
2030*77c1e3ccSAndroid Build Coastguard Worker const __m256i c0_even = _mm256_mul_epi32(f1, s);
2031*77c1e3ccSAndroid Build Coastguard Worker const __m256i c0_odd =
2032*77c1e3ccSAndroid Build Coastguard Worker _mm256_mul_epi32(_mm256_srli_epi64(f1, 32), _mm256_srli_epi64(s, 32));
2033*77c1e3ccSAndroid Build Coastguard Worker c0 = _mm256_add_epi64(c0, c0_even);
2034*77c1e3ccSAndroid Build Coastguard Worker c0 = _mm256_add_epi64(c0, c0_odd);
2035*77c1e3ccSAndroid Build Coastguard Worker }
2036*77c1e3ccSAndroid Build Coastguard Worker }
2037*77c1e3ccSAndroid Build Coastguard Worker const __m128i h00_128bit = _mm_add_epi64(_mm256_extracti128_si256(h00, 1),
2038*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(h00));
2039*77c1e3ccSAndroid Build Coastguard Worker const __m128i h00_val =
2040*77c1e3ccSAndroid Build Coastguard Worker _mm_add_epi64(h00_128bit, _mm_srli_si128(h00_128bit, 8));
2041*77c1e3ccSAndroid Build Coastguard Worker
2042*77c1e3ccSAndroid Build Coastguard Worker const __m128i c0_128bit = _mm_add_epi64(_mm256_extracti128_si256(c0, 1),
2043*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(c0));
2044*77c1e3ccSAndroid Build Coastguard Worker const __m128i c0_val = _mm_add_epi64(c0_128bit, _mm_srli_si128(c0_128bit, 8));
2045*77c1e3ccSAndroid Build Coastguard Worker
2046*77c1e3ccSAndroid Build Coastguard Worker const __m128i c = _mm_unpacklo_epi64(c0_val, _mm256_castsi256_si128(zero));
2047*77c1e3ccSAndroid Build Coastguard Worker const __m128i h0x = _mm_unpacklo_epi64(h00_val, _mm256_castsi256_si128(zero));
2048*77c1e3ccSAndroid Build Coastguard Worker
2049*77c1e3ccSAndroid Build Coastguard Worker xx_storeu_128(C, c);
2050*77c1e3ccSAndroid Build Coastguard Worker xx_storeu_128(H[0], h0x);
2051*77c1e3ccSAndroid Build Coastguard Worker
2052*77c1e3ccSAndroid Build Coastguard Worker H[0][0] /= size;
2053*77c1e3ccSAndroid Build Coastguard Worker C[0] /= size;
2054*77c1e3ccSAndroid Build Coastguard Worker }
2055*77c1e3ccSAndroid Build Coastguard Worker
calc_proj_params_r1_high_bd_avx2(const uint8_t * src8,int width,int height,int src_stride,const uint8_t * dat8,int dat_stride,int32_t * flt1,int flt1_stride,int64_t H[2][2],int64_t C[2])2056*77c1e3ccSAndroid Build Coastguard Worker static inline void calc_proj_params_r1_high_bd_avx2(
2057*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src8, int width, int height, int src_stride,
2058*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *dat8, int dat_stride, int32_t *flt1, int flt1_stride,
2059*77c1e3ccSAndroid Build Coastguard Worker int64_t H[2][2], int64_t C[2]) {
2060*77c1e3ccSAndroid Build Coastguard Worker const int size = width * height;
2061*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
2062*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
2063*77c1e3ccSAndroid Build Coastguard Worker __m256i h11, c1;
2064*77c1e3ccSAndroid Build Coastguard Worker const __m256i zero = _mm256_setzero_si256();
2065*77c1e3ccSAndroid Build Coastguard Worker c1 = h11 = zero;
2066*77c1e3ccSAndroid Build Coastguard Worker
2067*77c1e3ccSAndroid Build Coastguard Worker for (int i = 0; i < height; ++i) {
2068*77c1e3ccSAndroid Build Coastguard Worker for (int j = 0; j < width; j += 8) {
2069*77c1e3ccSAndroid Build Coastguard Worker const __m256i u_load = _mm256_cvtepu16_epi32(
2070*77c1e3ccSAndroid Build Coastguard Worker _mm_load_si128((__m128i *)(dat + i * dat_stride + j)));
2071*77c1e3ccSAndroid Build Coastguard Worker const __m256i s_load = _mm256_cvtepu16_epi32(
2072*77c1e3ccSAndroid Build Coastguard Worker _mm_load_si128((__m128i *)(src + i * src_stride + j)));
2073*77c1e3ccSAndroid Build Coastguard Worker __m256i f2 = _mm256_loadu_si256((__m256i *)(flt1 + i * flt1_stride + j));
2074*77c1e3ccSAndroid Build Coastguard Worker __m256i d = _mm256_slli_epi32(u_load, SGRPROJ_RST_BITS);
2075*77c1e3ccSAndroid Build Coastguard Worker __m256i s = _mm256_slli_epi32(s_load, SGRPROJ_RST_BITS);
2076*77c1e3ccSAndroid Build Coastguard Worker s = _mm256_sub_epi32(s, d);
2077*77c1e3ccSAndroid Build Coastguard Worker f2 = _mm256_sub_epi32(f2, d);
2078*77c1e3ccSAndroid Build Coastguard Worker
2079*77c1e3ccSAndroid Build Coastguard Worker const __m256i h11_even = _mm256_mul_epi32(f2, f2);
2080*77c1e3ccSAndroid Build Coastguard Worker const __m256i h11_odd = _mm256_mul_epi32(_mm256_srli_epi64(f2, 32),
2081*77c1e3ccSAndroid Build Coastguard Worker _mm256_srli_epi64(f2, 32));
2082*77c1e3ccSAndroid Build Coastguard Worker h11 = _mm256_add_epi64(h11, h11_even);
2083*77c1e3ccSAndroid Build Coastguard Worker h11 = _mm256_add_epi64(h11, h11_odd);
2084*77c1e3ccSAndroid Build Coastguard Worker
2085*77c1e3ccSAndroid Build Coastguard Worker const __m256i c1_even = _mm256_mul_epi32(f2, s);
2086*77c1e3ccSAndroid Build Coastguard Worker const __m256i c1_odd =
2087*77c1e3ccSAndroid Build Coastguard Worker _mm256_mul_epi32(_mm256_srli_epi64(f2, 32), _mm256_srli_epi64(s, 32));
2088*77c1e3ccSAndroid Build Coastguard Worker c1 = _mm256_add_epi64(c1, c1_even);
2089*77c1e3ccSAndroid Build Coastguard Worker c1 = _mm256_add_epi64(c1, c1_odd);
2090*77c1e3ccSAndroid Build Coastguard Worker }
2091*77c1e3ccSAndroid Build Coastguard Worker }
2092*77c1e3ccSAndroid Build Coastguard Worker
2093*77c1e3ccSAndroid Build Coastguard Worker const __m128i h11_128bit = _mm_add_epi64(_mm256_extracti128_si256(h11, 1),
2094*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(h11));
2095*77c1e3ccSAndroid Build Coastguard Worker const __m128i h11_val =
2096*77c1e3ccSAndroid Build Coastguard Worker _mm_add_epi64(h11_128bit, _mm_srli_si128(h11_128bit, 8));
2097*77c1e3ccSAndroid Build Coastguard Worker
2098*77c1e3ccSAndroid Build Coastguard Worker const __m128i c1_128bit = _mm_add_epi64(_mm256_extracti128_si256(c1, 1),
2099*77c1e3ccSAndroid Build Coastguard Worker _mm256_castsi256_si128(c1));
2100*77c1e3ccSAndroid Build Coastguard Worker const __m128i c1_val = _mm_add_epi64(c1_128bit, _mm_srli_si128(c1_128bit, 8));
2101*77c1e3ccSAndroid Build Coastguard Worker
2102*77c1e3ccSAndroid Build Coastguard Worker const __m128i c = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), c1_val);
2103*77c1e3ccSAndroid Build Coastguard Worker const __m128i h1x = _mm_unpacklo_epi64(_mm256_castsi256_si128(zero), h11_val);
2104*77c1e3ccSAndroid Build Coastguard Worker
2105*77c1e3ccSAndroid Build Coastguard Worker xx_storeu_128(C, c);
2106*77c1e3ccSAndroid Build Coastguard Worker xx_storeu_128(H[1], h1x);
2107*77c1e3ccSAndroid Build Coastguard Worker
2108*77c1e3ccSAndroid Build Coastguard Worker H[1][1] /= size;
2109*77c1e3ccSAndroid Build Coastguard Worker C[1] /= size;
2110*77c1e3ccSAndroid Build Coastguard Worker }
2111*77c1e3ccSAndroid Build Coastguard Worker
2112*77c1e3ccSAndroid Build Coastguard Worker // AVX2 variant of av1_calc_proj_params_high_bd_c.
av1_calc_proj_params_high_bd_avx2(const uint8_t * src8,int width,int height,int src_stride,const uint8_t * dat8,int dat_stride,int32_t * flt0,int flt0_stride,int32_t * flt1,int flt1_stride,int64_t H[2][2],int64_t C[2],const sgr_params_type * params)2113*77c1e3ccSAndroid Build Coastguard Worker void av1_calc_proj_params_high_bd_avx2(const uint8_t *src8, int width,
2114*77c1e3ccSAndroid Build Coastguard Worker int height, int src_stride,
2115*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *dat8, int dat_stride,
2116*77c1e3ccSAndroid Build Coastguard Worker int32_t *flt0, int flt0_stride,
2117*77c1e3ccSAndroid Build Coastguard Worker int32_t *flt1, int flt1_stride,
2118*77c1e3ccSAndroid Build Coastguard Worker int64_t H[2][2], int64_t C[2],
2119*77c1e3ccSAndroid Build Coastguard Worker const sgr_params_type *params) {
2120*77c1e3ccSAndroid Build Coastguard Worker if ((params->r[0] > 0) && (params->r[1] > 0)) {
2121*77c1e3ccSAndroid Build Coastguard Worker calc_proj_params_r0_r1_high_bd_avx2(src8, width, height, src_stride, dat8,
2122*77c1e3ccSAndroid Build Coastguard Worker dat_stride, flt0, flt0_stride, flt1,
2123*77c1e3ccSAndroid Build Coastguard Worker flt1_stride, H, C);
2124*77c1e3ccSAndroid Build Coastguard Worker } else if (params->r[0] > 0) {
2125*77c1e3ccSAndroid Build Coastguard Worker calc_proj_params_r0_high_bd_avx2(src8, width, height, src_stride, dat8,
2126*77c1e3ccSAndroid Build Coastguard Worker dat_stride, flt0, flt0_stride, H, C);
2127*77c1e3ccSAndroid Build Coastguard Worker } else if (params->r[1] > 0) {
2128*77c1e3ccSAndroid Build Coastguard Worker calc_proj_params_r1_high_bd_avx2(src8, width, height, src_stride, dat8,
2129*77c1e3ccSAndroid Build Coastguard Worker dat_stride, flt1, flt1_stride, H, C);
2130*77c1e3ccSAndroid Build Coastguard Worker }
2131*77c1e3ccSAndroid Build Coastguard Worker }
2132*77c1e3ccSAndroid Build Coastguard Worker
av1_highbd_pixel_proj_error_avx2(const uint8_t * src8,int width,int height,int src_stride,const uint8_t * dat8,int dat_stride,int32_t * flt0,int flt0_stride,int32_t * flt1,int flt1_stride,int xq[2],const sgr_params_type * params)2133*77c1e3ccSAndroid Build Coastguard Worker int64_t av1_highbd_pixel_proj_error_avx2(
2134*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *src8, int width, int height, int src_stride,
2135*77c1e3ccSAndroid Build Coastguard Worker const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride,
2136*77c1e3ccSAndroid Build Coastguard Worker int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) {
2137*77c1e3ccSAndroid Build Coastguard Worker int i, j, k;
2138*77c1e3ccSAndroid Build Coastguard Worker const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS;
2139*77c1e3ccSAndroid Build Coastguard Worker const __m256i rounding = _mm256_set1_epi32(1 << (shift - 1));
2140*77c1e3ccSAndroid Build Coastguard Worker __m256i sum64 = _mm256_setzero_si256();
2141*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
2142*77c1e3ccSAndroid Build Coastguard Worker const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8);
2143*77c1e3ccSAndroid Build Coastguard Worker int64_t err = 0;
2144*77c1e3ccSAndroid Build Coastguard Worker if (params->r[0] > 0 && params->r[1] > 0) { // Both filters are enabled
2145*77c1e3ccSAndroid Build Coastguard Worker const __m256i xq0 = _mm256_set1_epi32(xq[0]);
2146*77c1e3ccSAndroid Build Coastguard Worker const __m256i xq1 = _mm256_set1_epi32(xq[1]);
2147*77c1e3ccSAndroid Build Coastguard Worker for (i = 0; i < height; ++i) {
2148*77c1e3ccSAndroid Build Coastguard Worker __m256i sum32 = _mm256_setzero_si256();
2149*77c1e3ccSAndroid Build Coastguard Worker for (j = 0; j <= width - 16; j += 16) { // Process 16 pixels at a time
2150*77c1e3ccSAndroid Build Coastguard Worker // Load 16 pixels each from source image and corrupted image
2151*77c1e3ccSAndroid Build Coastguard Worker const __m256i s0 = yy_loadu_256(src + j);
2152*77c1e3ccSAndroid Build Coastguard Worker const __m256i d0 = yy_loadu_256(dat + j);
2153*77c1e3ccSAndroid Build Coastguard Worker // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16 (indices)
2154*77c1e3ccSAndroid Build Coastguard Worker
2155*77c1e3ccSAndroid Build Coastguard Worker // Shift-up each pixel to match filtered image scaling
2156*77c1e3ccSAndroid Build Coastguard Worker const __m256i u0 = _mm256_slli_epi16(d0, SGRPROJ_RST_BITS);
2157*77c1e3ccSAndroid Build Coastguard Worker
2158*77c1e3ccSAndroid Build Coastguard Worker // Split u0 into two halves and pad each from u16 to i32
2159*77c1e3ccSAndroid Build Coastguard Worker const __m256i u0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(u0));
2160*77c1e3ccSAndroid Build Coastguard Worker const __m256i u0h =
2161*77c1e3ccSAndroid Build Coastguard Worker _mm256_cvtepu16_epi32(_mm256_extracti128_si256(u0, 1));
2162*77c1e3ccSAndroid Build Coastguard Worker // u0h, u0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32
2163*77c1e3ccSAndroid Build Coastguard Worker
2164*77c1e3ccSAndroid Build Coastguard Worker // Load 16 pixels from each filtered image
2165*77c1e3ccSAndroid Build Coastguard Worker const __m256i flt0l = yy_loadu_256(flt0 + j);
2166*77c1e3ccSAndroid Build Coastguard Worker const __m256i flt0h = yy_loadu_256(flt0 + j + 8);
2167*77c1e3ccSAndroid Build Coastguard Worker const __m256i flt1l = yy_loadu_256(flt1 + j);
2168*77c1e3ccSAndroid Build Coastguard Worker const __m256i flt1h = yy_loadu_256(flt1 + j + 8);
2169*77c1e3ccSAndroid Build Coastguard Worker // flt?l, flt?h = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as u32
2170*77c1e3ccSAndroid Build Coastguard Worker
2171*77c1e3ccSAndroid Build Coastguard Worker // Subtract shifted corrupt image from each filtered image
2172*77c1e3ccSAndroid Build Coastguard Worker const __m256i flt0l_subu = _mm256_sub_epi32(flt0l, u0l);
2173*77c1e3ccSAndroid Build Coastguard Worker const __m256i flt0h_subu = _mm256_sub_epi32(flt0h, u0h);
2174*77c1e3ccSAndroid Build Coastguard Worker const __m256i flt1l_subu = _mm256_sub_epi32(flt1l, u0l);
2175*77c1e3ccSAndroid Build Coastguard Worker const __m256i flt1h_subu = _mm256_sub_epi32(flt1h, u0h);
2176*77c1e3ccSAndroid Build Coastguard Worker
2177*77c1e3ccSAndroid Build Coastguard Worker // Multiply basis vectors by appropriate coefficients
2178*77c1e3ccSAndroid Build Coastguard Worker const __m256i v0l = _mm256_mullo_epi32(flt0l_subu, xq0);
2179*77c1e3ccSAndroid Build Coastguard Worker const __m256i v0h = _mm256_mullo_epi32(flt0h_subu, xq0);
2180*77c1e3ccSAndroid Build Coastguard Worker const __m256i v1l = _mm256_mullo_epi32(flt1l_subu, xq1);
2181*77c1e3ccSAndroid Build Coastguard Worker const __m256i v1h = _mm256_mullo_epi32(flt1h_subu, xq1);
2182*77c1e3ccSAndroid Build Coastguard Worker
2183*77c1e3ccSAndroid Build Coastguard Worker // Add together the contributions from the two basis vectors
2184*77c1e3ccSAndroid Build Coastguard Worker const __m256i vl = _mm256_add_epi32(v0l, v1l);
2185*77c1e3ccSAndroid Build Coastguard Worker const __m256i vh = _mm256_add_epi32(v0h, v1h);
2186*77c1e3ccSAndroid Build Coastguard Worker
2187*77c1e3ccSAndroid Build Coastguard Worker // Right-shift v with appropriate rounding
2188*77c1e3ccSAndroid Build Coastguard Worker const __m256i vrl =
2189*77c1e3ccSAndroid Build Coastguard Worker _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift);
2190*77c1e3ccSAndroid Build Coastguard Worker const __m256i vrh =
2191*77c1e3ccSAndroid Build Coastguard Worker _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift);
2192*77c1e3ccSAndroid Build Coastguard Worker // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0]
2193*77c1e3ccSAndroid Build Coastguard Worker
2194*77c1e3ccSAndroid Build Coastguard Worker // Saturate each i32 to an i16 then combine both halves
2195*77c1e3ccSAndroid Build Coastguard Worker // The permute (control=[3 1 2 0]) fixes weird ordering from AVX lanes
2196*77c1e3ccSAndroid Build Coastguard Worker const __m256i vr =
2197*77c1e3ccSAndroid Build Coastguard Worker _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8);
2198*77c1e3ccSAndroid Build Coastguard Worker // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0]
2199*77c1e3ccSAndroid Build Coastguard Worker // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0]
2200*77c1e3ccSAndroid Build Coastguard Worker
2201*77c1e3ccSAndroid Build Coastguard Worker // Add twin-subspace-sgr-filter to corrupt image then subtract source
2202*77c1e3ccSAndroid Build Coastguard Worker const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0);
2203*77c1e3ccSAndroid Build Coastguard Worker
2204*77c1e3ccSAndroid Build Coastguard Worker // Calculate squared error and add adjacent values
2205*77c1e3ccSAndroid Build Coastguard Worker const __m256i err0 = _mm256_madd_epi16(e0, e0);
2206*77c1e3ccSAndroid Build Coastguard Worker
2207*77c1e3ccSAndroid Build Coastguard Worker sum32 = _mm256_add_epi32(sum32, err0);
2208*77c1e3ccSAndroid Build Coastguard Worker }
2209*77c1e3ccSAndroid Build Coastguard Worker
2210*77c1e3ccSAndroid Build Coastguard Worker const __m256i sum32l =
2211*77c1e3ccSAndroid Build Coastguard Worker _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
2212*77c1e3ccSAndroid Build Coastguard Worker sum64 = _mm256_add_epi64(sum64, sum32l);
2213*77c1e3ccSAndroid Build Coastguard Worker const __m256i sum32h =
2214*77c1e3ccSAndroid Build Coastguard Worker _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
2215*77c1e3ccSAndroid Build Coastguard Worker sum64 = _mm256_add_epi64(sum64, sum32h);
2216*77c1e3ccSAndroid Build Coastguard Worker
2217*77c1e3ccSAndroid Build Coastguard Worker // Process remaining pixels in this row (modulo 16)
2218*77c1e3ccSAndroid Build Coastguard Worker for (k = j; k < width; ++k) {
2219*77c1e3ccSAndroid Build Coastguard Worker const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
2220*77c1e3ccSAndroid Build Coastguard Worker int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u);
2221*77c1e3ccSAndroid Build Coastguard Worker const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
2222*77c1e3ccSAndroid Build Coastguard Worker err += ((int64_t)e * e);
2223*77c1e3ccSAndroid Build Coastguard Worker }
2224*77c1e3ccSAndroid Build Coastguard Worker dat += dat_stride;
2225*77c1e3ccSAndroid Build Coastguard Worker src += src_stride;
2226*77c1e3ccSAndroid Build Coastguard Worker flt0 += flt0_stride;
2227*77c1e3ccSAndroid Build Coastguard Worker flt1 += flt1_stride;
2228*77c1e3ccSAndroid Build Coastguard Worker }
2229*77c1e3ccSAndroid Build Coastguard Worker } else if (params->r[0] > 0 || params->r[1] > 0) { // Only one filter enabled
2230*77c1e3ccSAndroid Build Coastguard Worker const int32_t xq_on = (params->r[0] > 0) ? xq[0] : xq[1];
2231*77c1e3ccSAndroid Build Coastguard Worker const __m256i xq_active = _mm256_set1_epi32(xq_on);
2232*77c1e3ccSAndroid Build Coastguard Worker const __m256i xq_inactive =
2233*77c1e3ccSAndroid Build Coastguard Worker _mm256_set1_epi32(-xq_on * (1 << SGRPROJ_RST_BITS));
2234*77c1e3ccSAndroid Build Coastguard Worker const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1;
2235*77c1e3ccSAndroid Build Coastguard Worker const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride;
2236*77c1e3ccSAndroid Build Coastguard Worker for (i = 0; i < height; ++i) {
2237*77c1e3ccSAndroid Build Coastguard Worker __m256i sum32 = _mm256_setzero_si256();
2238*77c1e3ccSAndroid Build Coastguard Worker for (j = 0; j <= width - 16; j += 16) {
2239*77c1e3ccSAndroid Build Coastguard Worker // Load 16 pixels from source image
2240*77c1e3ccSAndroid Build Coastguard Worker const __m256i s0 = yy_loadu_256(src + j);
2241*77c1e3ccSAndroid Build Coastguard Worker // s0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16
2242*77c1e3ccSAndroid Build Coastguard Worker
2243*77c1e3ccSAndroid Build Coastguard Worker // Load 16 pixels from corrupted image and pad each u16 to i32
2244*77c1e3ccSAndroid Build Coastguard Worker const __m256i d0 = yy_loadu_256(dat + j);
2245*77c1e3ccSAndroid Build Coastguard Worker const __m256i d0h =
2246*77c1e3ccSAndroid Build Coastguard Worker _mm256_cvtepu16_epi32(_mm256_extracti128_si256(d0, 1));
2247*77c1e3ccSAndroid Build Coastguard Worker const __m256i d0l = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(d0));
2248*77c1e3ccSAndroid Build Coastguard Worker // d0 = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as u16
2249*77c1e3ccSAndroid Build Coastguard Worker // d0h, d0l = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32
2250*77c1e3ccSAndroid Build Coastguard Worker
2251*77c1e3ccSAndroid Build Coastguard Worker // Load 16 pixels from the filtered image
2252*77c1e3ccSAndroid Build Coastguard Worker const __m256i flth = yy_loadu_256(flt + j + 8);
2253*77c1e3ccSAndroid Build Coastguard Worker const __m256i fltl = yy_loadu_256(flt + j);
2254*77c1e3ccSAndroid Build Coastguard Worker // flth, fltl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32
2255*77c1e3ccSAndroid Build Coastguard Worker
2256*77c1e3ccSAndroid Build Coastguard Worker const __m256i flth_xq = _mm256_mullo_epi32(flth, xq_active);
2257*77c1e3ccSAndroid Build Coastguard Worker const __m256i fltl_xq = _mm256_mullo_epi32(fltl, xq_active);
2258*77c1e3ccSAndroid Build Coastguard Worker const __m256i d0h_xq = _mm256_mullo_epi32(d0h, xq_inactive);
2259*77c1e3ccSAndroid Build Coastguard Worker const __m256i d0l_xq = _mm256_mullo_epi32(d0l, xq_inactive);
2260*77c1e3ccSAndroid Build Coastguard Worker
2261*77c1e3ccSAndroid Build Coastguard Worker const __m256i vh = _mm256_add_epi32(flth_xq, d0h_xq);
2262*77c1e3ccSAndroid Build Coastguard Worker const __m256i vl = _mm256_add_epi32(fltl_xq, d0l_xq);
2263*77c1e3ccSAndroid Build Coastguard Worker
2264*77c1e3ccSAndroid Build Coastguard Worker // Shift this down with appropriate rounding
2265*77c1e3ccSAndroid Build Coastguard Worker const __m256i vrh =
2266*77c1e3ccSAndroid Build Coastguard Worker _mm256_srai_epi32(_mm256_add_epi32(vh, rounding), shift);
2267*77c1e3ccSAndroid Build Coastguard Worker const __m256i vrl =
2268*77c1e3ccSAndroid Build Coastguard Worker _mm256_srai_epi32(_mm256_add_epi32(vl, rounding), shift);
2269*77c1e3ccSAndroid Build Coastguard Worker // vrh, vrl = [15 14 13 12] [11 10 9 8], [7 6 5 4] [3 2 1 0] as i32
2270*77c1e3ccSAndroid Build Coastguard Worker
2271*77c1e3ccSAndroid Build Coastguard Worker // Saturate each i32 to an i16 then combine both halves
2272*77c1e3ccSAndroid Build Coastguard Worker // The permute (control=[3 1 2 0]) fixes weird ordering from AVX lanes
2273*77c1e3ccSAndroid Build Coastguard Worker const __m256i vr =
2274*77c1e3ccSAndroid Build Coastguard Worker _mm256_permute4x64_epi64(_mm256_packs_epi32(vrl, vrh), 0xd8);
2275*77c1e3ccSAndroid Build Coastguard Worker // intermediate = [15 14 13 12 7 6 5 4] [11 10 9 8 3 2 1 0] as i16
2276*77c1e3ccSAndroid Build Coastguard Worker // vr = [15 14 13 12 11 10 9 8] [7 6 5 4 3 2 1 0] as i16
2277*77c1e3ccSAndroid Build Coastguard Worker
2278*77c1e3ccSAndroid Build Coastguard Worker // Subtract twin-subspace-sgr filtered from source image to get error
2279*77c1e3ccSAndroid Build Coastguard Worker const __m256i e0 = _mm256_sub_epi16(_mm256_add_epi16(vr, d0), s0);
2280*77c1e3ccSAndroid Build Coastguard Worker
2281*77c1e3ccSAndroid Build Coastguard Worker // Calculate squared error and add adjacent values
2282*77c1e3ccSAndroid Build Coastguard Worker const __m256i err0 = _mm256_madd_epi16(e0, e0);
2283*77c1e3ccSAndroid Build Coastguard Worker
2284*77c1e3ccSAndroid Build Coastguard Worker sum32 = _mm256_add_epi32(sum32, err0);
2285*77c1e3ccSAndroid Build Coastguard Worker }
2286*77c1e3ccSAndroid Build Coastguard Worker
2287*77c1e3ccSAndroid Build Coastguard Worker const __m256i sum32l =
2288*77c1e3ccSAndroid Build Coastguard Worker _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
2289*77c1e3ccSAndroid Build Coastguard Worker sum64 = _mm256_add_epi64(sum64, sum32l);
2290*77c1e3ccSAndroid Build Coastguard Worker const __m256i sum32h =
2291*77c1e3ccSAndroid Build Coastguard Worker _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
2292*77c1e3ccSAndroid Build Coastguard Worker sum64 = _mm256_add_epi64(sum64, sum32h);
2293*77c1e3ccSAndroid Build Coastguard Worker
2294*77c1e3ccSAndroid Build Coastguard Worker // Process remaining pixels in this row (modulo 16)
2295*77c1e3ccSAndroid Build Coastguard Worker for (k = j; k < width; ++k) {
2296*77c1e3ccSAndroid Build Coastguard Worker const int32_t u = (int32_t)(dat[k] << SGRPROJ_RST_BITS);
2297*77c1e3ccSAndroid Build Coastguard Worker int32_t v = xq_on * (flt[k] - u);
2298*77c1e3ccSAndroid Build Coastguard Worker const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k];
2299*77c1e3ccSAndroid Build Coastguard Worker err += ((int64_t)e * e);
2300*77c1e3ccSAndroid Build Coastguard Worker }
2301*77c1e3ccSAndroid Build Coastguard Worker dat += dat_stride;
2302*77c1e3ccSAndroid Build Coastguard Worker src += src_stride;
2303*77c1e3ccSAndroid Build Coastguard Worker flt += flt_stride;
2304*77c1e3ccSAndroid Build Coastguard Worker }
2305*77c1e3ccSAndroid Build Coastguard Worker } else { // Neither filter is enabled
2306*77c1e3ccSAndroid Build Coastguard Worker for (i = 0; i < height; ++i) {
2307*77c1e3ccSAndroid Build Coastguard Worker __m256i sum32 = _mm256_setzero_si256();
2308*77c1e3ccSAndroid Build Coastguard Worker for (j = 0; j <= width - 32; j += 32) {
2309*77c1e3ccSAndroid Build Coastguard Worker // Load 2x16 u16 from source image
2310*77c1e3ccSAndroid Build Coastguard Worker const __m256i s0l = yy_loadu_256(src + j);
2311*77c1e3ccSAndroid Build Coastguard Worker const __m256i s0h = yy_loadu_256(src + j + 16);
2312*77c1e3ccSAndroid Build Coastguard Worker
2313*77c1e3ccSAndroid Build Coastguard Worker // Load 2x16 u16 from corrupted image
2314*77c1e3ccSAndroid Build Coastguard Worker const __m256i d0l = yy_loadu_256(dat + j);
2315*77c1e3ccSAndroid Build Coastguard Worker const __m256i d0h = yy_loadu_256(dat + j + 16);
2316*77c1e3ccSAndroid Build Coastguard Worker
2317*77c1e3ccSAndroid Build Coastguard Worker // Subtract corrupted image from source image
2318*77c1e3ccSAndroid Build Coastguard Worker const __m256i diffl = _mm256_sub_epi16(d0l, s0l);
2319*77c1e3ccSAndroid Build Coastguard Worker const __m256i diffh = _mm256_sub_epi16(d0h, s0h);
2320*77c1e3ccSAndroid Build Coastguard Worker
2321*77c1e3ccSAndroid Build Coastguard Worker // Square error and add adjacent values
2322*77c1e3ccSAndroid Build Coastguard Worker const __m256i err0l = _mm256_madd_epi16(diffl, diffl);
2323*77c1e3ccSAndroid Build Coastguard Worker const __m256i err0h = _mm256_madd_epi16(diffh, diffh);
2324*77c1e3ccSAndroid Build Coastguard Worker
2325*77c1e3ccSAndroid Build Coastguard Worker sum32 = _mm256_add_epi32(sum32, err0l);
2326*77c1e3ccSAndroid Build Coastguard Worker sum32 = _mm256_add_epi32(sum32, err0h);
2327*77c1e3ccSAndroid Build Coastguard Worker }
2328*77c1e3ccSAndroid Build Coastguard Worker
2329*77c1e3ccSAndroid Build Coastguard Worker const __m256i sum32l =
2330*77c1e3ccSAndroid Build Coastguard Worker _mm256_cvtepu32_epi64(_mm256_castsi256_si128(sum32));
2331*77c1e3ccSAndroid Build Coastguard Worker sum64 = _mm256_add_epi64(sum64, sum32l);
2332*77c1e3ccSAndroid Build Coastguard Worker const __m256i sum32h =
2333*77c1e3ccSAndroid Build Coastguard Worker _mm256_cvtepu32_epi64(_mm256_extracti128_si256(sum32, 1));
2334*77c1e3ccSAndroid Build Coastguard Worker sum64 = _mm256_add_epi64(sum64, sum32h);
2335*77c1e3ccSAndroid Build Coastguard Worker
2336*77c1e3ccSAndroid Build Coastguard Worker // Process remaining pixels in this row (modulo 32)
2337*77c1e3ccSAndroid Build Coastguard Worker for (k = j; k < width; ++k) {
2338*77c1e3ccSAndroid Build Coastguard Worker const int32_t e = (int32_t)(dat[k]) - src[k];
2339*77c1e3ccSAndroid Build Coastguard Worker err += ((int64_t)e * e);
2340*77c1e3ccSAndroid Build Coastguard Worker }
2341*77c1e3ccSAndroid Build Coastguard Worker dat += dat_stride;
2342*77c1e3ccSAndroid Build Coastguard Worker src += src_stride;
2343*77c1e3ccSAndroid Build Coastguard Worker }
2344*77c1e3ccSAndroid Build Coastguard Worker }
2345*77c1e3ccSAndroid Build Coastguard Worker
2346*77c1e3ccSAndroid Build Coastguard Worker // Sum the 4 i64 lanes of sum64 into err
2347*77c1e3ccSAndroid Build Coastguard Worker int64_t sum[4];
2348*77c1e3ccSAndroid Build Coastguard Worker yy_storeu_256(sum, sum64);
2349*77c1e3ccSAndroid Build Coastguard Worker err += sum[0] + sum[1] + sum[2] + sum[3];
2350*77c1e3ccSAndroid Build Coastguard Worker return err;
2351*77c1e3ccSAndroid Build Coastguard Worker }
2352*77c1e3ccSAndroid Build Coastguard Worker #endif // CONFIG_AV1_HIGHBITDEPTH
2353