xref: /aosp_15_r20/external/libmpeg2/common/x86/icv_sad_ssse3.c (revision a97c2a1f0a796dc32bed80d3353c69c5fc07c750)
1*a97c2a1fSXin Li /******************************************************************************
2*a97c2a1fSXin Li  *
3*a97c2a1fSXin Li  * Copyright (C) 2015 The Android Open Source Project
4*a97c2a1fSXin Li  *
5*a97c2a1fSXin Li  * Licensed under the Apache License, Version 2.0 (the "License");
6*a97c2a1fSXin Li  * you may not use this file except in compliance with the License.
7*a97c2a1fSXin Li  * You may obtain a copy of the License at:
8*a97c2a1fSXin Li  *
9*a97c2a1fSXin Li  * http://www.apache.org/licenses/LICENSE-2.0
10*a97c2a1fSXin Li  *
11*a97c2a1fSXin Li  * Unless required by applicable law or agreed to in writing, software
12*a97c2a1fSXin Li  * distributed under the License is distributed on an "AS IS" BASIS,
13*a97c2a1fSXin Li  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14*a97c2a1fSXin Li  * See the License for the specific language governing permissions and
15*a97c2a1fSXin Li  * limitations under the License.
16*a97c2a1fSXin Li  *
17*a97c2a1fSXin Li  *****************************************************************************
18*a97c2a1fSXin Li  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19*a97c2a1fSXin Li */
20*a97c2a1fSXin Li /**
21*a97c2a1fSXin Li *******************************************************************************
22*a97c2a1fSXin Li * @file
23*a97c2a1fSXin Li *  icv_sad_ssse3.c
24*a97c2a1fSXin Li *
25*a97c2a1fSXin Li * @brief
26*a97c2a1fSXin Li *  This file contains the functions to compute SAD
27*a97c2a1fSXin Li *
28*a97c2a1fSXin Li * @author
29*a97c2a1fSXin Li *  Ittiam
30*a97c2a1fSXin Li *
31*a97c2a1fSXin Li * @par List of Functions:
32*a97c2a1fSXin Li *  icv_sad_8x4_ssse3()
33*a97c2a1fSXin Li *
34*a97c2a1fSXin Li * @remarks
35*a97c2a1fSXin Li *  None
36*a97c2a1fSXin Li *
37*a97c2a1fSXin Li *******************************************************************************
38*a97c2a1fSXin Li */
39*a97c2a1fSXin Li /*****************************************************************************/
40*a97c2a1fSXin Li /* File Includes                                                             */
41*a97c2a1fSXin Li /*****************************************************************************/
42*a97c2a1fSXin Li /* System include files */
43*a97c2a1fSXin Li #include <stdio.h>
44*a97c2a1fSXin Li #include <stdint.h>
45*a97c2a1fSXin Li #include <string.h>
46*a97c2a1fSXin Li #include <stdlib.h>
47*a97c2a1fSXin Li #include <assert.h>
48*a97c2a1fSXin Li #include <immintrin.h>
49*a97c2a1fSXin Li 
50*a97c2a1fSXin Li /* User include files */
51*a97c2a1fSXin Li #include "icv_datatypes.h"
52*a97c2a1fSXin Li #include "icv_macros.h"
53*a97c2a1fSXin Li #include "icv_platform_macros.h"
54*a97c2a1fSXin Li #include "icv.h"
55*a97c2a1fSXin Li 
56*a97c2a1fSXin Li /**
57*a97c2a1fSXin Li *******************************************************************************
58*a97c2a1fSXin Li *
59*a97c2a1fSXin Li * @brief
60*a97c2a1fSXin Li *  Compute 8x4 SAD
61*a97c2a1fSXin Li *
62*a97c2a1fSXin Li * @par   Description
63*a97c2a1fSXin Li *  Compute 8x4 sum of absolute differences between source and reference block
64*a97c2a1fSXin Li *
65*a97c2a1fSXin Li * @param[in] pu1_src
66*a97c2a1fSXin Li *  Source buffer
67*a97c2a1fSXin Li *
68*a97c2a1fSXin Li * @param[in] pu1_ref
69*a97c2a1fSXin Li *  Reference buffer
70*a97c2a1fSXin Li *
71*a97c2a1fSXin Li * @param[in] src_strd
72*a97c2a1fSXin Li *  Source stride
73*a97c2a1fSXin Li *
74*a97c2a1fSXin Li * @param[in] ref_strd
75*a97c2a1fSXin Li *  Reference stride
76*a97c2a1fSXin Li *
77*a97c2a1fSXin Li * @param[in] wd
78*a97c2a1fSXin Li *  Assumed to be 8
79*a97c2a1fSXin Li *
80*a97c2a1fSXin Li * @param[in] ht
81*a97c2a1fSXin Li *  Assumed to be 4
82*a97c2a1fSXin Li 
83*a97c2a1fSXin Li * @returns
84*a97c2a1fSXin Li *  SAD
85*a97c2a1fSXin Li *
86*a97c2a1fSXin Li * @remarks
87*a97c2a1fSXin Li *
88*a97c2a1fSXin Li *******************************************************************************
89*a97c2a1fSXin Li */
icv_sad_8x4_ssse3(UWORD8 * pu1_src,UWORD8 * pu1_ref,WORD32 src_strd,WORD32 ref_strd,WORD32 wd,WORD32 ht)90*a97c2a1fSXin Li WORD32 icv_sad_8x4_ssse3(UWORD8 *pu1_src,
91*a97c2a1fSXin Li                          UWORD8 *pu1_ref,
92*a97c2a1fSXin Li                          WORD32 src_strd,
93*a97c2a1fSXin Li                          WORD32 ref_strd,
94*a97c2a1fSXin Li                          WORD32 wd,
95*a97c2a1fSXin Li                          WORD32 ht)
96*a97c2a1fSXin Li {
97*a97c2a1fSXin Li     WORD32 sad;
98*a97c2a1fSXin Li     __m128 src_r0, src_r1;
99*a97c2a1fSXin Li     __m128 ref_r0, ref_r1;
100*a97c2a1fSXin Li     __m128i res_r0, res_r1;
101*a97c2a1fSXin Li 
102*a97c2a1fSXin Li     UNUSED(wd);
103*a97c2a1fSXin Li     UNUSED(ht);
104*a97c2a1fSXin Li     ASSERT(wd == 8);
105*a97c2a1fSXin Li     ASSERT(ht == 4);
106*a97c2a1fSXin Li 
107*a97c2a1fSXin Li     /* Load source */
108*a97c2a1fSXin Li     src_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
109*a97c2a1fSXin Li     pu1_src += src_strd;
110*a97c2a1fSXin Li 
111*a97c2a1fSXin Li     src_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_src));
112*a97c2a1fSXin Li     pu1_src += src_strd;
113*a97c2a1fSXin Li 
114*a97c2a1fSXin Li     src_r0 = _mm_loadh_pi (src_r0, (__m64 *) (pu1_src));
115*a97c2a1fSXin Li     pu1_src += src_strd;
116*a97c2a1fSXin Li 
117*a97c2a1fSXin Li     src_r1 = _mm_loadh_pi (src_r1, (__m64 *) (pu1_src));
118*a97c2a1fSXin Li     pu1_src += src_strd;
119*a97c2a1fSXin Li 
120*a97c2a1fSXin Li 
121*a97c2a1fSXin Li     /* Load reference */
122*a97c2a1fSXin Li     ref_r0 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_ref));
123*a97c2a1fSXin Li     pu1_ref += ref_strd;
124*a97c2a1fSXin Li 
125*a97c2a1fSXin Li     ref_r1 = (__m128)_mm_loadl_epi64((__m128i *) (pu1_ref));
126*a97c2a1fSXin Li     pu1_ref += ref_strd;
127*a97c2a1fSXin Li 
128*a97c2a1fSXin Li     ref_r0 = _mm_loadh_pi (ref_r0, (__m64 *) (pu1_ref));
129*a97c2a1fSXin Li     pu1_ref += ref_strd;
130*a97c2a1fSXin Li 
131*a97c2a1fSXin Li     ref_r1 = _mm_loadh_pi (ref_r1, (__m64 *) (pu1_ref));
132*a97c2a1fSXin Li     pu1_ref += ref_strd;
133*a97c2a1fSXin Li 
134*a97c2a1fSXin Li     /* Compute SAD for each row */
135*a97c2a1fSXin Li     res_r0 = _mm_sad_epu8((__m128i)src_r0, (__m128i)ref_r0);
136*a97c2a1fSXin Li     res_r1 = _mm_sad_epu8((__m128i)src_r1, (__m128i)ref_r1);
137*a97c2a1fSXin Li 
138*a97c2a1fSXin Li     /* Accumulate SAD */
139*a97c2a1fSXin Li     res_r0 = _mm_add_epi64(res_r0,  res_r1);
140*a97c2a1fSXin Li     res_r0 = _mm_add_epi64(res_r0, _mm_srli_si128(res_r0, 8));
141*a97c2a1fSXin Li 
142*a97c2a1fSXin Li     sad  = _mm_cvtsi128_si32(res_r0);
143*a97c2a1fSXin Li 
144*a97c2a1fSXin Li     return sad;
145*a97c2a1fSXin Li }
146