xref: /aosp_15_r20/external/libmpeg2/common/x86/ideint_cac_ssse3.c (revision a97c2a1f0a796dc32bed80d3353c69c5fc07c750)
1*a97c2a1fSXin Li /******************************************************************************
2*a97c2a1fSXin Li  *
3*a97c2a1fSXin Li  * Copyright (C) 2015 The Android Open Source Project
4*a97c2a1fSXin Li  *
5*a97c2a1fSXin Li  * Licensed under the Apache License, Version 2.0 (the "License");
6*a97c2a1fSXin Li  * you may not use this file except in compliance with the License.
7*a97c2a1fSXin Li  * You may obtain a copy of the License at:
8*a97c2a1fSXin Li  *
9*a97c2a1fSXin Li  * http://www.apache.org/licenses/LICENSE-2.0
10*a97c2a1fSXin Li  *
11*a97c2a1fSXin Li  * Unless required by applicable law or agreed to in writing, software
12*a97c2a1fSXin Li  * distributed under the License is distributed on an "AS IS" BASIS,
13*a97c2a1fSXin Li  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*a97c2a1fSXin Li  * See the License for the specific language governing permissions and
15*a97c2a1fSXin Li  * limitations under the License.
16*a97c2a1fSXin Li  *
17*a97c2a1fSXin Li  *****************************************************************************
18*a97c2a1fSXin Li  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*a97c2a1fSXin Li */
/**
*******************************************************************************
* @file
*  ideint_cac_ssse3.c
*
* @brief
*  This file includes the definition of the combing artifact check function
* of the de-interlacer, implemented with x86 SSE intrinsics.
*
* @author
*  Ittiam
*
* @par List of Functions:
*  ideint_cac_8x8_ssse3()
*
* @remarks
*  In the de-interlacer workspace, cac is not a separate assembly module as
* it comes along with the de_int_decision() function. But in the C-Model, to
* keep things cleaner, it was made a separate function during cac
* experiments, long after the assembly was written by Mudit.
*
*******************************************************************************
*/
44*a97c2a1fSXin Li /*****************************************************************************/
45*a97c2a1fSXin Li /* File Includes                                                             */
46*a97c2a1fSXin Li /*****************************************************************************/
47*a97c2a1fSXin Li /* System include files */
48*a97c2a1fSXin Li #include <stdio.h>
49*a97c2a1fSXin Li #include <stdint.h>
50*a97c2a1fSXin Li #include <string.h>
51*a97c2a1fSXin Li #include <stdlib.h>
52*a97c2a1fSXin Li #include <immintrin.h>
53*a97c2a1fSXin Li 
54*a97c2a1fSXin Li /* User include files */
55*a97c2a1fSXin Li #include "icv_datatypes.h"
56*a97c2a1fSXin Li #include "icv_macros.h"
57*a97c2a1fSXin Li #include "icv.h"
58*a97c2a1fSXin Li #include "icv_variance.h"
59*a97c2a1fSXin Li #include "icv_sad.h"
60*a97c2a1fSXin Li #include "ideint.h"
61*a97c2a1fSXin Li #include "ideint_defs.h"
62*a97c2a1fSXin Li #include "ideint_structs.h"
63*a97c2a1fSXin Li #include "ideint_cac.h"
64*a97c2a1fSXin Li 
65*a97c2a1fSXin Li /**
66*a97c2a1fSXin Li *******************************************************************************
67*a97c2a1fSXin Li *
68*a97c2a1fSXin Li * @brief
69*a97c2a1fSXin Li * Combing artifact check function for 8x8 block
70*a97c2a1fSXin Li *
71*a97c2a1fSXin Li * @par   Description
72*a97c2a1fSXin Li * Determines CAC for 8x8 block by calling 8x4 CAC function
73*a97c2a1fSXin Li *
74*a97c2a1fSXin Li * @param[in] pu1_top
75*a97c2a1fSXin Li *  Top field
76*a97c2a1fSXin Li *
77*a97c2a1fSXin Li * @param[in] pu1_bot
78*a97c2a1fSXin Li *  Bottom field
79*a97c2a1fSXin Li *
80*a97c2a1fSXin Li * @param[in] top_strd
81*a97c2a1fSXin Li *  Top field Stride
82*a97c2a1fSXin Li *
83*a97c2a1fSXin Li * @param[in] bot_strd
84*a97c2a1fSXin Li *  Bottom field stride
85*a97c2a1fSXin Li *
86*a97c2a1fSXin Li * @returns
87*a97c2a1fSXin Li * combing artifact flag (1 = detected, 0 = not detected)
88*a97c2a1fSXin Li *
89*a97c2a1fSXin Li * @remarks
90*a97c2a1fSXin Li *
91*a97c2a1fSXin Li *******************************************************************************
92*a97c2a1fSXin Li */
ideint_cac_8x8_ssse3(UWORD8 * pu1_top,UWORD8 * pu1_bot,WORD32 top_strd,WORD32 bot_strd)93*a97c2a1fSXin Li WORD32 ideint_cac_8x8_ssse3(UWORD8 *pu1_top,
94*a97c2a1fSXin Li                             UWORD8 *pu1_bot,
95*a97c2a1fSXin Li                             WORD32 top_strd,
96*a97c2a1fSXin Li                             WORD32 bot_strd)
97*a97c2a1fSXin Li {
98*a97c2a1fSXin Li     WORD32 ca;        /* combing artifact result                          */
99*a97c2a1fSXin Li     WORD32 i;
100*a97c2a1fSXin Li     WORD32 adj[2] = {0};
101*a97c2a1fSXin Li     WORD32 alt[2] = {0};
102*a97c2a1fSXin Li     WORD32 sum_1, sum_2, sum_3, sum_4;
103*a97c2a1fSXin Li     WORD32 sum_diff, diff_sum;
104*a97c2a1fSXin Li 
105*a97c2a1fSXin Li     __m128i top[4];
106*a97c2a1fSXin Li     __m128i bot[4];
107*a97c2a1fSXin Li     __m128i sum_t[4];
108*a97c2a1fSXin Li     __m128i sum_b[4];
109*a97c2a1fSXin Li     __m128i zero;
110*a97c2a1fSXin Li 
111*a97c2a1fSXin Li 
112*a97c2a1fSXin Li     zero = _mm_setzero_si128();
113*a97c2a1fSXin Li 
114*a97c2a1fSXin Li     for(i = 0; i < 4; i++)
115*a97c2a1fSXin Li     {
116*a97c2a1fSXin Li         /* Load top */
117*a97c2a1fSXin Li         top[i] = (__m128i)_mm_loadl_epi64((__m128i *) (pu1_top));
118*a97c2a1fSXin Li         pu1_top += top_strd;
119*a97c2a1fSXin Li 
120*a97c2a1fSXin Li         /* Load bottom */
121*a97c2a1fSXin Li         bot[i] = (__m128i)_mm_loadl_epi64((__m128i *) (pu1_bot));
122*a97c2a1fSXin Li         pu1_bot += bot_strd;
123*a97c2a1fSXin Li 
124*a97c2a1fSXin Li         /* Unpack */
125*a97c2a1fSXin Li         top[i] = _mm_unpacklo_epi8(top[i], zero);
126*a97c2a1fSXin Li         bot[i] = _mm_unpacklo_epi8(bot[i], zero);
127*a97c2a1fSXin Li 
128*a97c2a1fSXin Li         /* Compute row sums */
129*a97c2a1fSXin Li         sum_t[i]  = _mm_sad_epu8(top[i], zero);
130*a97c2a1fSXin Li         sum_b[i]  = _mm_sad_epu8(bot[i], zero);
131*a97c2a1fSXin Li     }
132*a97c2a1fSXin Li 
133*a97c2a1fSXin Li     /* Compute row based alt and adj */
134*a97c2a1fSXin Li     for(i = 0; i < 4; i += 2)
135*a97c2a1fSXin Li     {
136*a97c2a1fSXin Li         sum_1 = _mm_cvtsi128_si32(sum_t[i + 0]);
137*a97c2a1fSXin Li         sum_2 = _mm_cvtsi128_si32(sum_b[i + 0]);
138*a97c2a1fSXin Li         sum_diff = ABS_DIF(sum_1, sum_2);
139*a97c2a1fSXin Li         if(sum_diff >= RSUM_CSUM_THRESH)
140*a97c2a1fSXin Li             adj[0] += sum_diff;
141*a97c2a1fSXin Li 
142*a97c2a1fSXin Li         sum_3 = _mm_cvtsi128_si32(sum_t[i + 1]);
143*a97c2a1fSXin Li         sum_4 = _mm_cvtsi128_si32(sum_b[i + 1]);
144*a97c2a1fSXin Li         sum_diff = ABS_DIF(sum_3, sum_4);
145*a97c2a1fSXin Li         if(sum_diff >= RSUM_CSUM_THRESH)
146*a97c2a1fSXin Li             adj[0] += sum_diff;
147*a97c2a1fSXin Li 
148*a97c2a1fSXin Li         alt[0] += ABS_DIF(sum_1, sum_3);
149*a97c2a1fSXin Li         alt[0] += ABS_DIF(sum_2, sum_4);
150*a97c2a1fSXin Li 
151*a97c2a1fSXin Li         sum_1 = _mm_cvtsi128_si32(_mm_srli_si128(sum_t[i + 0], 8));
152*a97c2a1fSXin Li         sum_2 = _mm_cvtsi128_si32(_mm_srli_si128(sum_b[i + 0], 8));
153*a97c2a1fSXin Li         sum_diff = ABS_DIF(sum_1, sum_2);
154*a97c2a1fSXin Li         if(sum_diff >= RSUM_CSUM_THRESH)
155*a97c2a1fSXin Li             adj[1] += sum_diff;
156*a97c2a1fSXin Li 
157*a97c2a1fSXin Li         sum_3 = _mm_cvtsi128_si32(_mm_srli_si128(sum_t[i + 1], 8));
158*a97c2a1fSXin Li         sum_4 = _mm_cvtsi128_si32(_mm_srli_si128(sum_b[i + 1], 8));
159*a97c2a1fSXin Li         sum_diff = ABS_DIF(sum_3, sum_4);
160*a97c2a1fSXin Li         if(sum_diff >= RSUM_CSUM_THRESH)
161*a97c2a1fSXin Li             adj[1] += sum_diff;
162*a97c2a1fSXin Li 
163*a97c2a1fSXin Li         alt[1] += ABS_DIF(sum_1, sum_3);
164*a97c2a1fSXin Li         alt[1] += ABS_DIF(sum_2, sum_4);
165*a97c2a1fSXin Li     }
166*a97c2a1fSXin Li 
167*a97c2a1fSXin Li     /* Compute column based adj */
168*a97c2a1fSXin Li     {
169*a97c2a1fSXin Li         __m128i avg1, avg2;
170*a97c2a1fSXin Li         __m128i top_avg, bot_avg;
171*a97c2a1fSXin Li         __m128i min, max, diff, thresh;
172*a97c2a1fSXin Li         __m128i mask;
173*a97c2a1fSXin Li         avg1 = _mm_avg_epu8(top[0], top[1]);
174*a97c2a1fSXin Li         avg2 = _mm_avg_epu8(top[2], top[3]);
175*a97c2a1fSXin Li         top_avg = _mm_avg_epu8(avg1, avg2);
176*a97c2a1fSXin Li 
177*a97c2a1fSXin Li         avg1 = _mm_avg_epu8(bot[0], bot[1]);
178*a97c2a1fSXin Li         avg2 = _mm_avg_epu8(bot[2], bot[3]);
179*a97c2a1fSXin Li         bot_avg = _mm_avg_epu8(avg1, avg2);
180*a97c2a1fSXin Li 
181*a97c2a1fSXin Li         min = _mm_min_epu8(top_avg, bot_avg);
182*a97c2a1fSXin Li         max = _mm_max_epu8(top_avg, bot_avg);
183*a97c2a1fSXin Li 
184*a97c2a1fSXin Li         diff = _mm_sub_epi16(max, min);
185*a97c2a1fSXin Li         thresh = _mm_set1_epi16((RSUM_CSUM_THRESH >> 2) - 1);
186*a97c2a1fSXin Li 
187*a97c2a1fSXin Li         mask = _mm_cmpgt_epi16(diff, thresh);
188*a97c2a1fSXin Li         diff = _mm_and_si128(diff, mask);
189*a97c2a1fSXin Li 
190*a97c2a1fSXin Li         diff_sum = _mm_extract_epi16(diff, 0);
191*a97c2a1fSXin Li         diff_sum += _mm_extract_epi16(diff, 1);
192*a97c2a1fSXin Li         diff_sum += _mm_extract_epi16(diff, 2);
193*a97c2a1fSXin Li         diff_sum += _mm_extract_epi16(diff, 3);
194*a97c2a1fSXin Li 
195*a97c2a1fSXin Li         adj[0] += diff_sum << 2;
196*a97c2a1fSXin Li 
197*a97c2a1fSXin Li         diff_sum = _mm_extract_epi16(diff, 4);
198*a97c2a1fSXin Li         diff_sum += _mm_extract_epi16(diff, 5);
199*a97c2a1fSXin Li         diff_sum += _mm_extract_epi16(diff, 6);
200*a97c2a1fSXin Li         diff_sum += _mm_extract_epi16(diff, 7);
201*a97c2a1fSXin Li 
202*a97c2a1fSXin Li         adj[1] += diff_sum << 2;
203*a97c2a1fSXin Li 
204*a97c2a1fSXin Li     }
205*a97c2a1fSXin Li 
206*a97c2a1fSXin Li     /* Compute column based alt */
207*a97c2a1fSXin Li     {
208*a97c2a1fSXin Li         __m128i avg1, avg2;
209*a97c2a1fSXin Li         __m128i even_avg, odd_avg, diff;
210*a97c2a1fSXin Li         avg1 = _mm_avg_epu8(top[0], bot[0]);
211*a97c2a1fSXin Li         avg2 = _mm_avg_epu8(top[2], bot[2]);
212*a97c2a1fSXin Li         even_avg = _mm_avg_epu8(avg1, avg2);
213*a97c2a1fSXin Li 
214*a97c2a1fSXin Li         avg1 = _mm_avg_epu8(top[1], bot[1]);
215*a97c2a1fSXin Li         avg2 = _mm_avg_epu8(top[3], bot[3]);
216*a97c2a1fSXin Li         odd_avg = _mm_avg_epu8(avg1, avg2);
217*a97c2a1fSXin Li 
218*a97c2a1fSXin Li         diff = _mm_sad_epu8(even_avg, odd_avg);
219*a97c2a1fSXin Li 
220*a97c2a1fSXin Li 
221*a97c2a1fSXin Li         diff_sum = _mm_cvtsi128_si32(diff);
222*a97c2a1fSXin Li         alt[0] += diff_sum << 2;
223*a97c2a1fSXin Li 
224*a97c2a1fSXin Li         diff_sum = _mm_cvtsi128_si32(_mm_srli_si128(diff, 8));
225*a97c2a1fSXin Li         alt[1] += diff_sum << 2;
226*a97c2a1fSXin Li 
227*a97c2a1fSXin Li     }
228*a97c2a1fSXin Li     alt[0] += (alt[0] >> SAD_BIAS_MULT_SHIFT) + (SAD_BIAS_ADDITIVE >> 1);
229*a97c2a1fSXin Li     alt[1] += (alt[1] >> SAD_BIAS_MULT_SHIFT) + (SAD_BIAS_ADDITIVE >> 1);
230*a97c2a1fSXin Li 
231*a97c2a1fSXin Li     ca    = (alt[0] < adj[0]);
232*a97c2a1fSXin Li     ca   |= (alt[1] < adj[1]);
233*a97c2a1fSXin Li 
234*a97c2a1fSXin Li     return ca;
235*a97c2a1fSXin Li }
236*a97c2a1fSXin Li 
237