xref: /aosp_15_r20/external/libvpx/vp8/encoder/x86/denoising_sse2.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1*fb1b10abSAndroid Build Coastguard Worker /*
2*fb1b10abSAndroid Build Coastguard Worker  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3*fb1b10abSAndroid Build Coastguard Worker  *
4*fb1b10abSAndroid Build Coastguard Worker  *  Use of this source code is governed by a BSD-style license
5*fb1b10abSAndroid Build Coastguard Worker  *  that can be found in the LICENSE file in the root of the source
6*fb1b10abSAndroid Build Coastguard Worker  *  tree. An additional intellectual property rights grant can be found
7*fb1b10abSAndroid Build Coastguard Worker  *  in the file PATENTS.  All contributing project authors may
8*fb1b10abSAndroid Build Coastguard Worker  *  be found in the AUTHORS file in the root of the source tree.
9*fb1b10abSAndroid Build Coastguard Worker  */
10*fb1b10abSAndroid Build Coastguard Worker 
11*fb1b10abSAndroid Build Coastguard Worker #include "vp8/encoder/denoising.h"
12*fb1b10abSAndroid Build Coastguard Worker #include "vp8/common/reconinter.h"
13*fb1b10abSAndroid Build Coastguard Worker #include "vpx/vpx_integer.h"
14*fb1b10abSAndroid Build Coastguard Worker #include "vpx_mem/vpx_mem.h"
15*fb1b10abSAndroid Build Coastguard Worker #include "vp8_rtcd.h"
16*fb1b10abSAndroid Build Coastguard Worker 
17*fb1b10abSAndroid Build Coastguard Worker #include <emmintrin.h>
18*fb1b10abSAndroid Build Coastguard Worker #include "vpx_ports/emmintrin_compat.h"
19*fb1b10abSAndroid Build Coastguard Worker 
/* Treat acc_diff as 16 signed bytes, add them all together and return the
 * absolute value of that total. */
static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) {
  const __m128i ones = _mm_set1_epi16(1);
  /* Interleaving a register with itself puts each byte in the upper half of
   * a 16-bit lane; the arithmetic shift by 8 then sign-extends it. */
  const __m128i lo_s16 =
      _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
  const __m128i hi_s16 =
      _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
  const __m128i sum_s16 = _mm_add_epi16(lo_s16, hi_s16);
  /* Multiplying by 1 with madd adds neighbouring 16-bit lanes, leaving four
   * 32-bit partial sums. */
  const __m128i sum_x4 = _mm_madd_epi16(sum_s16, ones);
  /* Fold the upper 64 bits onto the lower, then the upper 32 onto the
   * lowest lane. */
  const __m128i sum_x2 = _mm_add_epi32(sum_x4, _mm_srli_si128(sum_x4, 8));
  const __m128i sum_x1 = _mm_add_epi32(sum_x2, _mm_srli_si128(sum_x2, 4));

  return (unsigned int)abs(_mm_cvtsi128_si32(sum_x1));
}
37*fb1b10abSAndroid Build Coastguard Worker 
vp8_denoiser_filter_sse2(unsigned char * mc_running_avg_y,int mc_avg_y_stride,unsigned char * running_avg_y,int avg_y_stride,unsigned char * sig,int sig_stride,unsigned int motion_magnitude,int increase_denoising)38*fb1b10abSAndroid Build Coastguard Worker int vp8_denoiser_filter_sse2(unsigned char *mc_running_avg_y,
39*fb1b10abSAndroid Build Coastguard Worker                              int mc_avg_y_stride, unsigned char *running_avg_y,
40*fb1b10abSAndroid Build Coastguard Worker                              int avg_y_stride, unsigned char *sig,
41*fb1b10abSAndroid Build Coastguard Worker                              int sig_stride, unsigned int motion_magnitude,
42*fb1b10abSAndroid Build Coastguard Worker                              int increase_denoising) {
43*fb1b10abSAndroid Build Coastguard Worker   unsigned char *running_avg_y_start = running_avg_y;
44*fb1b10abSAndroid Build Coastguard Worker   unsigned char *sig_start = sig;
45*fb1b10abSAndroid Build Coastguard Worker   unsigned int sum_diff_thresh;
46*fb1b10abSAndroid Build Coastguard Worker   int r;
47*fb1b10abSAndroid Build Coastguard Worker   int shift_inc =
48*fb1b10abSAndroid Build Coastguard Worker       (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
49*fb1b10abSAndroid Build Coastguard Worker           ? 1
50*fb1b10abSAndroid Build Coastguard Worker           : 0;
51*fb1b10abSAndroid Build Coastguard Worker   __m128i acc_diff = _mm_setzero_si128();
52*fb1b10abSAndroid Build Coastguard Worker   const __m128i k_0 = _mm_setzero_si128();
53*fb1b10abSAndroid Build Coastguard Worker   const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
54*fb1b10abSAndroid Build Coastguard Worker   const __m128i k_8 = _mm_set1_epi8(8);
55*fb1b10abSAndroid Build Coastguard Worker   const __m128i k_16 = _mm_set1_epi8(16);
56*fb1b10abSAndroid Build Coastguard Worker   /* Modify each level's adjustment according to motion_magnitude. */
57*fb1b10abSAndroid Build Coastguard Worker   const __m128i l3 = _mm_set1_epi8(
58*fb1b10abSAndroid Build Coastguard Worker       (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
59*fb1b10abSAndroid Build Coastguard Worker   /* Difference between level 3 and level 2 is 2. */
60*fb1b10abSAndroid Build Coastguard Worker   const __m128i l32 = _mm_set1_epi8(2);
61*fb1b10abSAndroid Build Coastguard Worker   /* Difference between level 2 and level 1 is 1. */
62*fb1b10abSAndroid Build Coastguard Worker   const __m128i l21 = _mm_set1_epi8(1);
63*fb1b10abSAndroid Build Coastguard Worker 
64*fb1b10abSAndroid Build Coastguard Worker   for (r = 0; r < 16; ++r) {
65*fb1b10abSAndroid Build Coastguard Worker     /* Calculate differences */
66*fb1b10abSAndroid Build Coastguard Worker     const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
67*fb1b10abSAndroid Build Coastguard Worker     const __m128i v_mc_running_avg_y =
68*fb1b10abSAndroid Build Coastguard Worker         _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0]));
69*fb1b10abSAndroid Build Coastguard Worker     __m128i v_running_avg_y;
70*fb1b10abSAndroid Build Coastguard Worker     const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
71*fb1b10abSAndroid Build Coastguard Worker     const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
72*fb1b10abSAndroid Build Coastguard Worker     /* Obtain the sign. FF if diff is negative. */
73*fb1b10abSAndroid Build Coastguard Worker     const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
74*fb1b10abSAndroid Build Coastguard Worker     /* Clamp absolute difference to 16 to be used to get mask. Doing this
75*fb1b10abSAndroid Build Coastguard Worker      * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
76*fb1b10abSAndroid Build Coastguard Worker     const __m128i clamped_absdiff =
77*fb1b10abSAndroid Build Coastguard Worker         _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16);
78*fb1b10abSAndroid Build Coastguard Worker     /* Get masks for l2 l1 and l0 adjustments */
79*fb1b10abSAndroid Build Coastguard Worker     const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
80*fb1b10abSAndroid Build Coastguard Worker     const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
81*fb1b10abSAndroid Build Coastguard Worker     const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
82*fb1b10abSAndroid Build Coastguard Worker     /* Get adjustments for l2, l1, and l0 */
83*fb1b10abSAndroid Build Coastguard Worker     __m128i adj2 = _mm_and_si128(mask2, l32);
84*fb1b10abSAndroid Build Coastguard Worker     const __m128i adj1 = _mm_and_si128(mask1, l21);
85*fb1b10abSAndroid Build Coastguard Worker     const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
86*fb1b10abSAndroid Build Coastguard Worker     __m128i adj, padj, nadj;
87*fb1b10abSAndroid Build Coastguard Worker 
88*fb1b10abSAndroid Build Coastguard Worker     /* Combine the adjustments and get absolute adjustments. */
89*fb1b10abSAndroid Build Coastguard Worker     adj2 = _mm_add_epi8(adj2, adj1);
90*fb1b10abSAndroid Build Coastguard Worker     adj = _mm_sub_epi8(l3, adj2);
91*fb1b10abSAndroid Build Coastguard Worker     adj = _mm_andnot_si128(mask0, adj);
92*fb1b10abSAndroid Build Coastguard Worker     adj = _mm_or_si128(adj, adj0);
93*fb1b10abSAndroid Build Coastguard Worker 
94*fb1b10abSAndroid Build Coastguard Worker     /* Restore the sign and get positive and negative adjustments. */
95*fb1b10abSAndroid Build Coastguard Worker     padj = _mm_andnot_si128(diff_sign, adj);
96*fb1b10abSAndroid Build Coastguard Worker     nadj = _mm_and_si128(diff_sign, adj);
97*fb1b10abSAndroid Build Coastguard Worker 
98*fb1b10abSAndroid Build Coastguard Worker     /* Calculate filtered value. */
99*fb1b10abSAndroid Build Coastguard Worker     v_running_avg_y = _mm_adds_epu8(v_sig, padj);
100*fb1b10abSAndroid Build Coastguard Worker     v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
101*fb1b10abSAndroid Build Coastguard Worker     _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
102*fb1b10abSAndroid Build Coastguard Worker 
103*fb1b10abSAndroid Build Coastguard Worker     /* Adjustments <=7, and each element in acc_diff can fit in signed
104*fb1b10abSAndroid Build Coastguard Worker      * char.
105*fb1b10abSAndroid Build Coastguard Worker      */
106*fb1b10abSAndroid Build Coastguard Worker     acc_diff = _mm_adds_epi8(acc_diff, padj);
107*fb1b10abSAndroid Build Coastguard Worker     acc_diff = _mm_subs_epi8(acc_diff, nadj);
108*fb1b10abSAndroid Build Coastguard Worker 
109*fb1b10abSAndroid Build Coastguard Worker     /* Update pointers for next iteration. */
110*fb1b10abSAndroid Build Coastguard Worker     sig += sig_stride;
111*fb1b10abSAndroid Build Coastguard Worker     mc_running_avg_y += mc_avg_y_stride;
112*fb1b10abSAndroid Build Coastguard Worker     running_avg_y += avg_y_stride;
113*fb1b10abSAndroid Build Coastguard Worker   }
114*fb1b10abSAndroid Build Coastguard Worker 
115*fb1b10abSAndroid Build Coastguard Worker   {
116*fb1b10abSAndroid Build Coastguard Worker     /* Compute the sum of all pixel differences of this MB. */
117*fb1b10abSAndroid Build Coastguard Worker     unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
118*fb1b10abSAndroid Build Coastguard Worker     sum_diff_thresh = SUM_DIFF_THRESHOLD;
119*fb1b10abSAndroid Build Coastguard Worker     if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
120*fb1b10abSAndroid Build Coastguard Worker     if (abs_sum_diff > sum_diff_thresh) {
121*fb1b10abSAndroid Build Coastguard Worker       // Before returning to copy the block (i.e., apply no denoising),
122*fb1b10abSAndroid Build Coastguard Worker       // check if we can still apply some (weaker) temporal filtering to
123*fb1b10abSAndroid Build Coastguard Worker       // this block, that would otherwise not be denoised at all. Simplest
124*fb1b10abSAndroid Build Coastguard Worker       // is to apply an additional adjustment to running_avg_y to bring it
125*fb1b10abSAndroid Build Coastguard Worker       // closer to sig. The adjustment is capped by a maximum delta, and
126*fb1b10abSAndroid Build Coastguard Worker       // chosen such that in most cases the resulting sum_diff will be
127*fb1b10abSAndroid Build Coastguard Worker       // within the acceptable range given by sum_diff_thresh.
128*fb1b10abSAndroid Build Coastguard Worker 
129*fb1b10abSAndroid Build Coastguard Worker       // The delta is set by the excess of absolute pixel diff over the
130*fb1b10abSAndroid Build Coastguard Worker       // threshold.
131*fb1b10abSAndroid Build Coastguard Worker       int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
132*fb1b10abSAndroid Build Coastguard Worker       // Only apply the adjustment for max delta up to 3.
133*fb1b10abSAndroid Build Coastguard Worker       if (delta < 4) {
134*fb1b10abSAndroid Build Coastguard Worker         const __m128i k_delta = _mm_set1_epi8(delta);
135*fb1b10abSAndroid Build Coastguard Worker         sig -= sig_stride * 16;
136*fb1b10abSAndroid Build Coastguard Worker         mc_running_avg_y -= mc_avg_y_stride * 16;
137*fb1b10abSAndroid Build Coastguard Worker         running_avg_y -= avg_y_stride * 16;
138*fb1b10abSAndroid Build Coastguard Worker         for (r = 0; r < 16; ++r) {
139*fb1b10abSAndroid Build Coastguard Worker           __m128i v_running_avg_y =
140*fb1b10abSAndroid Build Coastguard Worker               _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
141*fb1b10abSAndroid Build Coastguard Worker           // Calculate differences.
142*fb1b10abSAndroid Build Coastguard Worker           const __m128i v_sig = _mm_loadu_si128((__m128i *)(&sig[0]));
143*fb1b10abSAndroid Build Coastguard Worker           const __m128i v_mc_running_avg_y =
144*fb1b10abSAndroid Build Coastguard Worker               _mm_loadu_si128((__m128i *)(&mc_running_avg_y[0]));
145*fb1b10abSAndroid Build Coastguard Worker           const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
146*fb1b10abSAndroid Build Coastguard Worker           const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
147*fb1b10abSAndroid Build Coastguard Worker           // Obtain the sign. FF if diff is negative.
148*fb1b10abSAndroid Build Coastguard Worker           const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
149*fb1b10abSAndroid Build Coastguard Worker           // Clamp absolute difference to delta to get the adjustment.
150*fb1b10abSAndroid Build Coastguard Worker           const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
151*fb1b10abSAndroid Build Coastguard Worker           // Restore the sign and get positive and negative adjustments.
152*fb1b10abSAndroid Build Coastguard Worker           __m128i padj, nadj;
153*fb1b10abSAndroid Build Coastguard Worker           padj = _mm_andnot_si128(diff_sign, adj);
154*fb1b10abSAndroid Build Coastguard Worker           nadj = _mm_and_si128(diff_sign, adj);
155*fb1b10abSAndroid Build Coastguard Worker           // Calculate filtered value.
156*fb1b10abSAndroid Build Coastguard Worker           v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
157*fb1b10abSAndroid Build Coastguard Worker           v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
158*fb1b10abSAndroid Build Coastguard Worker           _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
159*fb1b10abSAndroid Build Coastguard Worker 
160*fb1b10abSAndroid Build Coastguard Worker           // Accumulate the adjustments.
161*fb1b10abSAndroid Build Coastguard Worker           acc_diff = _mm_subs_epi8(acc_diff, padj);
162*fb1b10abSAndroid Build Coastguard Worker           acc_diff = _mm_adds_epi8(acc_diff, nadj);
163*fb1b10abSAndroid Build Coastguard Worker 
164*fb1b10abSAndroid Build Coastguard Worker           // Update pointers for next iteration.
165*fb1b10abSAndroid Build Coastguard Worker           sig += sig_stride;
166*fb1b10abSAndroid Build Coastguard Worker           mc_running_avg_y += mc_avg_y_stride;
167*fb1b10abSAndroid Build Coastguard Worker           running_avg_y += avg_y_stride;
168*fb1b10abSAndroid Build Coastguard Worker         }
169*fb1b10abSAndroid Build Coastguard Worker         abs_sum_diff = abs_sum_diff_16x1(acc_diff);
170*fb1b10abSAndroid Build Coastguard Worker         if (abs_sum_diff > sum_diff_thresh) {
171*fb1b10abSAndroid Build Coastguard Worker           return COPY_BLOCK;
172*fb1b10abSAndroid Build Coastguard Worker         }
173*fb1b10abSAndroid Build Coastguard Worker       } else {
174*fb1b10abSAndroid Build Coastguard Worker         return COPY_BLOCK;
175*fb1b10abSAndroid Build Coastguard Worker       }
176*fb1b10abSAndroid Build Coastguard Worker     }
177*fb1b10abSAndroid Build Coastguard Worker   }
178*fb1b10abSAndroid Build Coastguard Worker 
179*fb1b10abSAndroid Build Coastguard Worker   vp8_copy_mem16x16(running_avg_y_start, avg_y_stride, sig_start, sig_stride);
180*fb1b10abSAndroid Build Coastguard Worker   return FILTER_BLOCK;
181*fb1b10abSAndroid Build Coastguard Worker }
182*fb1b10abSAndroid Build Coastguard Worker 
vp8_denoiser_filter_uv_sse2(unsigned char * mc_running_avg,int mc_avg_stride,unsigned char * running_avg,int avg_stride,unsigned char * sig,int sig_stride,unsigned int motion_magnitude,int increase_denoising)183*fb1b10abSAndroid Build Coastguard Worker int vp8_denoiser_filter_uv_sse2(unsigned char *mc_running_avg,
184*fb1b10abSAndroid Build Coastguard Worker                                 int mc_avg_stride, unsigned char *running_avg,
185*fb1b10abSAndroid Build Coastguard Worker                                 int avg_stride, unsigned char *sig,
186*fb1b10abSAndroid Build Coastguard Worker                                 int sig_stride, unsigned int motion_magnitude,
187*fb1b10abSAndroid Build Coastguard Worker                                 int increase_denoising) {
188*fb1b10abSAndroid Build Coastguard Worker   unsigned char *running_avg_start = running_avg;
189*fb1b10abSAndroid Build Coastguard Worker   unsigned char *sig_start = sig;
190*fb1b10abSAndroid Build Coastguard Worker   unsigned int sum_diff_thresh;
191*fb1b10abSAndroid Build Coastguard Worker   int r;
192*fb1b10abSAndroid Build Coastguard Worker   int shift_inc =
193*fb1b10abSAndroid Build Coastguard Worker       (increase_denoising && motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV)
194*fb1b10abSAndroid Build Coastguard Worker           ? 1
195*fb1b10abSAndroid Build Coastguard Worker           : 0;
196*fb1b10abSAndroid Build Coastguard Worker   __m128i acc_diff = _mm_setzero_si128();
197*fb1b10abSAndroid Build Coastguard Worker   const __m128i k_0 = _mm_setzero_si128();
198*fb1b10abSAndroid Build Coastguard Worker   const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
199*fb1b10abSAndroid Build Coastguard Worker   const __m128i k_8 = _mm_set1_epi8(8);
200*fb1b10abSAndroid Build Coastguard Worker   const __m128i k_16 = _mm_set1_epi8(16);
201*fb1b10abSAndroid Build Coastguard Worker   /* Modify each level's adjustment according to motion_magnitude. */
202*fb1b10abSAndroid Build Coastguard Worker   const __m128i l3 = _mm_set1_epi8(
203*fb1b10abSAndroid Build Coastguard Worker       (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD_UV) ? 7 + shift_inc : 6);
204*fb1b10abSAndroid Build Coastguard Worker   /* Difference between level 3 and level 2 is 2. */
205*fb1b10abSAndroid Build Coastguard Worker   const __m128i l32 = _mm_set1_epi8(2);
206*fb1b10abSAndroid Build Coastguard Worker   /* Difference between level 2 and level 1 is 1. */
207*fb1b10abSAndroid Build Coastguard Worker   const __m128i l21 = _mm_set1_epi8(1);
208*fb1b10abSAndroid Build Coastguard Worker 
209*fb1b10abSAndroid Build Coastguard Worker   {
210*fb1b10abSAndroid Build Coastguard Worker     const __m128i k_1 = _mm_set1_epi16(1);
211*fb1b10abSAndroid Build Coastguard Worker     __m128i vec_sum_block = _mm_setzero_si128();
212*fb1b10abSAndroid Build Coastguard Worker 
213*fb1b10abSAndroid Build Coastguard Worker     // Avoid denoising color signal if its close to average level.
214*fb1b10abSAndroid Build Coastguard Worker     for (r = 0; r < 8; ++r) {
215*fb1b10abSAndroid Build Coastguard Worker       const __m128i v_sig = _mm_loadl_epi64((__m128i *)(&sig[0]));
216*fb1b10abSAndroid Build Coastguard Worker       const __m128i v_sig_unpack = _mm_unpacklo_epi8(v_sig, k_0);
217*fb1b10abSAndroid Build Coastguard Worker       vec_sum_block = _mm_add_epi16(vec_sum_block, v_sig_unpack);
218*fb1b10abSAndroid Build Coastguard Worker       sig += sig_stride;
219*fb1b10abSAndroid Build Coastguard Worker     }
220*fb1b10abSAndroid Build Coastguard Worker     sig -= sig_stride * 8;
221*fb1b10abSAndroid Build Coastguard Worker     {
222*fb1b10abSAndroid Build Coastguard Worker       const __m128i hg_fe_dc_ba = _mm_madd_epi16(vec_sum_block, k_1);
223*fb1b10abSAndroid Build Coastguard Worker       const __m128i hgfe_dcba =
224*fb1b10abSAndroid Build Coastguard Worker           _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
225*fb1b10abSAndroid Build Coastguard Worker       const __m128i hgfedcba =
226*fb1b10abSAndroid Build Coastguard Worker           _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
227*fb1b10abSAndroid Build Coastguard Worker       const int sum_block = _mm_cvtsi128_si32(hgfedcba);
228*fb1b10abSAndroid Build Coastguard Worker       if (abs(sum_block - (128 * 8 * 8)) < SUM_DIFF_FROM_AVG_THRESH_UV) {
229*fb1b10abSAndroid Build Coastguard Worker         return COPY_BLOCK;
230*fb1b10abSAndroid Build Coastguard Worker       }
231*fb1b10abSAndroid Build Coastguard Worker     }
232*fb1b10abSAndroid Build Coastguard Worker   }
233*fb1b10abSAndroid Build Coastguard Worker 
234*fb1b10abSAndroid Build Coastguard Worker   for (r = 0; r < 4; ++r) {
235*fb1b10abSAndroid Build Coastguard Worker     /* Calculate differences */
236*fb1b10abSAndroid Build Coastguard Worker     const __m128i v_sig_low =
237*fb1b10abSAndroid Build Coastguard Worker         _mm_castpd_si128(_mm_load_sd((double *)(&sig[0])));
238*fb1b10abSAndroid Build Coastguard Worker     const __m128i v_sig = _mm_castpd_si128(_mm_loadh_pd(
239*fb1b10abSAndroid Build Coastguard Worker         _mm_castsi128_pd(v_sig_low), (double *)(&sig[sig_stride])));
240*fb1b10abSAndroid Build Coastguard Worker     const __m128i v_mc_running_avg_low =
241*fb1b10abSAndroid Build Coastguard Worker         _mm_castpd_si128(_mm_load_sd((double *)(&mc_running_avg[0])));
242*fb1b10abSAndroid Build Coastguard Worker     const __m128i v_mc_running_avg = _mm_castpd_si128(
243*fb1b10abSAndroid Build Coastguard Worker         _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
244*fb1b10abSAndroid Build Coastguard Worker                      (double *)(&mc_running_avg[mc_avg_stride])));
245*fb1b10abSAndroid Build Coastguard Worker     const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
246*fb1b10abSAndroid Build Coastguard Worker     const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
247*fb1b10abSAndroid Build Coastguard Worker     /* Obtain the sign. FF if diff is negative. */
248*fb1b10abSAndroid Build Coastguard Worker     const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
249*fb1b10abSAndroid Build Coastguard Worker     /* Clamp absolute difference to 16 to be used to get mask. Doing this
250*fb1b10abSAndroid Build Coastguard Worker      * allows us to use _mm_cmpgt_epi8, which operates on signed byte. */
251*fb1b10abSAndroid Build Coastguard Worker     const __m128i clamped_absdiff =
252*fb1b10abSAndroid Build Coastguard Worker         _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_16);
253*fb1b10abSAndroid Build Coastguard Worker     /* Get masks for l2 l1 and l0 adjustments */
254*fb1b10abSAndroid Build Coastguard Worker     const __m128i mask2 = _mm_cmpgt_epi8(k_16, clamped_absdiff);
255*fb1b10abSAndroid Build Coastguard Worker     const __m128i mask1 = _mm_cmpgt_epi8(k_8, clamped_absdiff);
256*fb1b10abSAndroid Build Coastguard Worker     const __m128i mask0 = _mm_cmpgt_epi8(k_4, clamped_absdiff);
257*fb1b10abSAndroid Build Coastguard Worker     /* Get adjustments for l2, l1, and l0 */
258*fb1b10abSAndroid Build Coastguard Worker     __m128i adj2 = _mm_and_si128(mask2, l32);
259*fb1b10abSAndroid Build Coastguard Worker     const __m128i adj1 = _mm_and_si128(mask1, l21);
260*fb1b10abSAndroid Build Coastguard Worker     const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
261*fb1b10abSAndroid Build Coastguard Worker     __m128i adj, padj, nadj;
262*fb1b10abSAndroid Build Coastguard Worker     __m128i v_running_avg;
263*fb1b10abSAndroid Build Coastguard Worker 
264*fb1b10abSAndroid Build Coastguard Worker     /* Combine the adjustments and get absolute adjustments. */
265*fb1b10abSAndroid Build Coastguard Worker     adj2 = _mm_add_epi8(adj2, adj1);
266*fb1b10abSAndroid Build Coastguard Worker     adj = _mm_sub_epi8(l3, adj2);
267*fb1b10abSAndroid Build Coastguard Worker     adj = _mm_andnot_si128(mask0, adj);
268*fb1b10abSAndroid Build Coastguard Worker     adj = _mm_or_si128(adj, adj0);
269*fb1b10abSAndroid Build Coastguard Worker 
270*fb1b10abSAndroid Build Coastguard Worker     /* Restore the sign and get positive and negative adjustments. */
271*fb1b10abSAndroid Build Coastguard Worker     padj = _mm_andnot_si128(diff_sign, adj);
272*fb1b10abSAndroid Build Coastguard Worker     nadj = _mm_and_si128(diff_sign, adj);
273*fb1b10abSAndroid Build Coastguard Worker 
274*fb1b10abSAndroid Build Coastguard Worker     /* Calculate filtered value. */
275*fb1b10abSAndroid Build Coastguard Worker     v_running_avg = _mm_adds_epu8(v_sig, padj);
276*fb1b10abSAndroid Build Coastguard Worker     v_running_avg = _mm_subs_epu8(v_running_avg, nadj);
277*fb1b10abSAndroid Build Coastguard Worker 
278*fb1b10abSAndroid Build Coastguard Worker     _mm_storel_pd((double *)&running_avg[0], _mm_castsi128_pd(v_running_avg));
279*fb1b10abSAndroid Build Coastguard Worker     _mm_storeh_pd((double *)&running_avg[avg_stride],
280*fb1b10abSAndroid Build Coastguard Worker                   _mm_castsi128_pd(v_running_avg));
281*fb1b10abSAndroid Build Coastguard Worker 
282*fb1b10abSAndroid Build Coastguard Worker     /* Adjustments <=7, and each element in acc_diff can fit in signed
283*fb1b10abSAndroid Build Coastguard Worker      * char.
284*fb1b10abSAndroid Build Coastguard Worker      */
285*fb1b10abSAndroid Build Coastguard Worker     acc_diff = _mm_adds_epi8(acc_diff, padj);
286*fb1b10abSAndroid Build Coastguard Worker     acc_diff = _mm_subs_epi8(acc_diff, nadj);
287*fb1b10abSAndroid Build Coastguard Worker 
288*fb1b10abSAndroid Build Coastguard Worker     /* Update pointers for next iteration. */
289*fb1b10abSAndroid Build Coastguard Worker     sig += sig_stride * 2;
290*fb1b10abSAndroid Build Coastguard Worker     mc_running_avg += mc_avg_stride * 2;
291*fb1b10abSAndroid Build Coastguard Worker     running_avg += avg_stride * 2;
292*fb1b10abSAndroid Build Coastguard Worker   }
293*fb1b10abSAndroid Build Coastguard Worker 
294*fb1b10abSAndroid Build Coastguard Worker   {
295*fb1b10abSAndroid Build Coastguard Worker     unsigned int abs_sum_diff = abs_sum_diff_16x1(acc_diff);
296*fb1b10abSAndroid Build Coastguard Worker     sum_diff_thresh = SUM_DIFF_THRESHOLD_UV;
297*fb1b10abSAndroid Build Coastguard Worker     if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH_UV;
298*fb1b10abSAndroid Build Coastguard Worker     if (abs_sum_diff > sum_diff_thresh) {
299*fb1b10abSAndroid Build Coastguard Worker       // Before returning to copy the block (i.e., apply no denoising),
300*fb1b10abSAndroid Build Coastguard Worker       // check if we can still apply some (weaker) temporal filtering to
301*fb1b10abSAndroid Build Coastguard Worker       // this block, that would otherwise not be denoised at all. Simplest
302*fb1b10abSAndroid Build Coastguard Worker       // is to apply an additional adjustment to running_avg_y to bring it
303*fb1b10abSAndroid Build Coastguard Worker       // closer to sig. The adjustment is capped by a maximum delta, and
304*fb1b10abSAndroid Build Coastguard Worker       // chosen such that in most cases the resulting sum_diff will be
305*fb1b10abSAndroid Build Coastguard Worker       // within the acceptable range given by sum_diff_thresh.
306*fb1b10abSAndroid Build Coastguard Worker 
307*fb1b10abSAndroid Build Coastguard Worker       // The delta is set by the excess of absolute pixel diff over the
308*fb1b10abSAndroid Build Coastguard Worker       // threshold.
309*fb1b10abSAndroid Build Coastguard Worker       int delta = ((abs_sum_diff - sum_diff_thresh) >> 8) + 1;
310*fb1b10abSAndroid Build Coastguard Worker       // Only apply the adjustment for max delta up to 3.
311*fb1b10abSAndroid Build Coastguard Worker       if (delta < 4) {
312*fb1b10abSAndroid Build Coastguard Worker         const __m128i k_delta = _mm_set1_epi8(delta);
313*fb1b10abSAndroid Build Coastguard Worker         sig -= sig_stride * 8;
314*fb1b10abSAndroid Build Coastguard Worker         mc_running_avg -= mc_avg_stride * 8;
315*fb1b10abSAndroid Build Coastguard Worker         running_avg -= avg_stride * 8;
316*fb1b10abSAndroid Build Coastguard Worker         for (r = 0; r < 4; ++r) {
317*fb1b10abSAndroid Build Coastguard Worker           // Calculate differences.
318*fb1b10abSAndroid Build Coastguard Worker           const __m128i v_sig_low =
319*fb1b10abSAndroid Build Coastguard Worker               _mm_castpd_si128(_mm_load_sd((double *)(&sig[0])));
320*fb1b10abSAndroid Build Coastguard Worker           const __m128i v_sig = _mm_castpd_si128(_mm_loadh_pd(
321*fb1b10abSAndroid Build Coastguard Worker               _mm_castsi128_pd(v_sig_low), (double *)(&sig[sig_stride])));
322*fb1b10abSAndroid Build Coastguard Worker           const __m128i v_mc_running_avg_low =
323*fb1b10abSAndroid Build Coastguard Worker               _mm_castpd_si128(_mm_load_sd((double *)(&mc_running_avg[0])));
324*fb1b10abSAndroid Build Coastguard Worker           const __m128i v_mc_running_avg = _mm_castpd_si128(
325*fb1b10abSAndroid Build Coastguard Worker               _mm_loadh_pd(_mm_castsi128_pd(v_mc_running_avg_low),
326*fb1b10abSAndroid Build Coastguard Worker                            (double *)(&mc_running_avg[mc_avg_stride])));
327*fb1b10abSAndroid Build Coastguard Worker           const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg, v_sig);
328*fb1b10abSAndroid Build Coastguard Worker           const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg);
329*fb1b10abSAndroid Build Coastguard Worker           // Obtain the sign. FF if diff is negative.
330*fb1b10abSAndroid Build Coastguard Worker           const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
331*fb1b10abSAndroid Build Coastguard Worker           // Clamp absolute difference to delta to get the adjustment.
332*fb1b10abSAndroid Build Coastguard Worker           const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
333*fb1b10abSAndroid Build Coastguard Worker           // Restore the sign and get positive and negative adjustments.
334*fb1b10abSAndroid Build Coastguard Worker           __m128i padj, nadj;
335*fb1b10abSAndroid Build Coastguard Worker           const __m128i v_running_avg_low =
336*fb1b10abSAndroid Build Coastguard Worker               _mm_castpd_si128(_mm_load_sd((double *)(&running_avg[0])));
337*fb1b10abSAndroid Build Coastguard Worker           __m128i v_running_avg = _mm_castpd_si128(
338*fb1b10abSAndroid Build Coastguard Worker               _mm_loadh_pd(_mm_castsi128_pd(v_running_avg_low),
339*fb1b10abSAndroid Build Coastguard Worker                            (double *)(&running_avg[avg_stride])));
340*fb1b10abSAndroid Build Coastguard Worker           padj = _mm_andnot_si128(diff_sign, adj);
341*fb1b10abSAndroid Build Coastguard Worker           nadj = _mm_and_si128(diff_sign, adj);
342*fb1b10abSAndroid Build Coastguard Worker           // Calculate filtered value.
343*fb1b10abSAndroid Build Coastguard Worker           v_running_avg = _mm_subs_epu8(v_running_avg, padj);
344*fb1b10abSAndroid Build Coastguard Worker           v_running_avg = _mm_adds_epu8(v_running_avg, nadj);
345*fb1b10abSAndroid Build Coastguard Worker 
346*fb1b10abSAndroid Build Coastguard Worker           _mm_storel_pd((double *)&running_avg[0],
347*fb1b10abSAndroid Build Coastguard Worker                         _mm_castsi128_pd(v_running_avg));
348*fb1b10abSAndroid Build Coastguard Worker           _mm_storeh_pd((double *)&running_avg[avg_stride],
349*fb1b10abSAndroid Build Coastguard Worker                         _mm_castsi128_pd(v_running_avg));
350*fb1b10abSAndroid Build Coastguard Worker 
351*fb1b10abSAndroid Build Coastguard Worker           // Accumulate the adjustments.
352*fb1b10abSAndroid Build Coastguard Worker           acc_diff = _mm_subs_epi8(acc_diff, padj);
353*fb1b10abSAndroid Build Coastguard Worker           acc_diff = _mm_adds_epi8(acc_diff, nadj);
354*fb1b10abSAndroid Build Coastguard Worker 
355*fb1b10abSAndroid Build Coastguard Worker           // Update pointers for next iteration.
356*fb1b10abSAndroid Build Coastguard Worker           sig += sig_stride * 2;
357*fb1b10abSAndroid Build Coastguard Worker           mc_running_avg += mc_avg_stride * 2;
358*fb1b10abSAndroid Build Coastguard Worker           running_avg += avg_stride * 2;
359*fb1b10abSAndroid Build Coastguard Worker         }
360*fb1b10abSAndroid Build Coastguard Worker         abs_sum_diff = abs_sum_diff_16x1(acc_diff);
361*fb1b10abSAndroid Build Coastguard Worker         if (abs_sum_diff > sum_diff_thresh) {
362*fb1b10abSAndroid Build Coastguard Worker           return COPY_BLOCK;
363*fb1b10abSAndroid Build Coastguard Worker         }
364*fb1b10abSAndroid Build Coastguard Worker       } else {
365*fb1b10abSAndroid Build Coastguard Worker         return COPY_BLOCK;
366*fb1b10abSAndroid Build Coastguard Worker       }
367*fb1b10abSAndroid Build Coastguard Worker     }
368*fb1b10abSAndroid Build Coastguard Worker   }
369*fb1b10abSAndroid Build Coastguard Worker 
370*fb1b10abSAndroid Build Coastguard Worker   vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride);
371*fb1b10abSAndroid Build Coastguard Worker   return FILTER_BLOCK;
372*fb1b10abSAndroid Build Coastguard Worker }
373