/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/*!
 **************************************************************************
 * \file isvc_intra_resample_sse42.c
 *
 * \brief
 *    Contains SSE4.2 intrinsics routines for SVC intra (dyadic) resampling
 *
 **************************************************************************
 */
#include <immintrin.h>

#include "ih264_typedefs.h"
#include "ih264_debug.h"
#include "isvc_intra_resample.h"

void isvc_interpolate_base_luma_dyadic_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                             UWORD8 *pu1_out_buf, WORD32 i4_out_stride)
{
    WORD32 i4_y;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp, *pu1_out;
    WORD16 *pi2_tmp;

    __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3;
    __m128i i4_samp_8x16b_0, i4_samp_8x16b_1, i4_samp_8x16b_2, i4_samp_8x16b_3;
    __m128i i4_res_8x16b_r1_1, i4_res_8x16b_r1_2, i4_res_8x16b_r1_3;
    __m128i i4_res_8x16b_r2_1, i4_res_8x16b_r2_2, i4_res_8x16b_r2_3;

    /* Filter coefficient values for phase 4 */
    __m128i i4_coeff_8x16b_0 = _mm_set1_epi16(-3);
    __m128i i4_coeff_8x16b_1 = _mm_set1_epi16(28);
    i4_filt_stride = 12;
    i4_src_stride = DYADIC_REF_W_Y;
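
    /* For dyadic (2x) luma upsampling only two vertical phases occur, and each is a 4-tap
     * filter: phase 4 applies the taps (-3, 28, 8, -1) and phase 12 applies (-1, 8, 28, -3)
     * to four consecutive reference rows. Only the taps -3 and 28 need multiplies; the 8 tap
     * is realised as a shift and the -1 tap as a subtract, which is why just these two
     * coefficient registers are materialised here. */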

    /* Initializing pointers */
    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    pu1_out = pu1_out_buf;

    /* Vertical interpolation */
    /* First 64 bits (columns 0-7) */
    /* y = 0, y_phase = 12 */
    i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
    i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
    i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
    i4_samp_16x8b_3 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
    pu1_inp += (i4_src_stride << 2);
    i4_samp_8x16b_0 = _mm_cvtepu8_epi16(i4_samp_16x8b_0);
    i4_samp_8x16b_1 = _mm_cvtepu8_epi16(i4_samp_16x8b_1);
    i4_samp_8x16b_2 = _mm_cvtepu8_epi16(i4_samp_16x8b_2);
    i4_samp_8x16b_3 = _mm_cvtepu8_epi16(i4_samp_16x8b_3);

    /* Since y_phase is 12 for y = 0 */
    /* Multiply by 8 => left shift by 3 */
    i4_res_8x16b_r1_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
    i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
    i4_res_8x16b_r1_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
    i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_0);
    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

    _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
    pi2_tmp += i4_filt_stride;

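    /* Steady-state vertical filtering: each iteration pulls in one new reference row and
     * emits two intermediate rows into the 12-column 16-bit buffer, first the phase-4 row
     * (-3, 28, 8, -1) and then the phase-12 row (-1, 8, 28, -3). */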
    for(i4_y = 1; i4_y < 15; i4_y += 2)
    {
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;
    } /* End of loop over y */

    /* y = 15, y_phase = 4 */
    i4_samp_8x16b_0 = i4_samp_8x16b_1;
    i4_samp_8x16b_1 = i4_samp_8x16b_2;
    i4_samp_8x16b_2 = i4_samp_8x16b_3;
    i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

    i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
    i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
    i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
    i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);

    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

    /* Store the output */
    _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);

    /* Reinitializing the ptrs */
    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;

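    /* The intermediate buffer is 12 columns wide, but the 128-bit stores above cover only the
     * first 8 results of each row, so the same vertical filtering is repeated below for the
     * remaining columns 8-11 using 64-bit stores. */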
    /* Remaining 32 bits (columns 8-11) */
    pu1_inp += 8;
    pi2_tmp += 8;

    /* y = 0, y_phase = 12 */
    i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
    i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
    i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
    i4_samp_16x8b_3 =
        _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
    pu1_inp += (i4_src_stride << 2);
    i4_samp_8x16b_0 = _mm_cvtepu8_epi16(i4_samp_16x8b_0);
    i4_samp_8x16b_1 = _mm_cvtepu8_epi16(i4_samp_16x8b_1);
    i4_samp_8x16b_2 = _mm_cvtepu8_epi16(i4_samp_16x8b_2);
    i4_samp_8x16b_3 = _mm_cvtepu8_epi16(i4_samp_16x8b_3);

    /* Since y_phase is 12 for y = 0 */
    /* Multiply by 8 => left shift by 3 */
    i4_res_8x16b_r1_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
    i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
    i4_res_8x16b_r1_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
    i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_0);
    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

    _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
    pi2_tmp += i4_filt_stride;

    for(i4_y = 1; i4_y < 15; i4_y += 2)
    {
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storel_epi64((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

    } /* End of loop over y */

    /* y = 15, y_phase = 4 */
    i4_samp_8x16b_0 = i4_samp_8x16b_1;
    i4_samp_8x16b_1 = i4_samp_8x16b_2;
    i4_samp_8x16b_2 = i4_samp_8x16b_3;
    i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

    i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
    i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
    i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
    i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);

    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

    /* Store the output */
    _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);

    /* Reinitializing the ptrs */
    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;

    {
        __m128i coeff_c0_c1_8x16b = _mm_set_epi16(28, -3, 28, -3, 28, -3, 28, -3);
        __m128i coeff_c2_c3_8x16b = _mm_set_epi16(-1, 8, -1, 8, -1, 8, -1, 8);
        __m128i coeff_c3_c2_8x16b = _mm_set_epi16(8, -1, 8, -1, 8, -1, 8, -1);
        __m128i coeff_c1_c0_8x16b = _mm_set_epi16(-3, 28, -3, 28, -3, 28, -3, 28);

        __m128i i4_samp_8x16b_rpart1_0, i4_samp_8x16b_rpart2_0;
        __m128i i4_samp_8x16b_rpart1_1, i4_samp_8x16b_rpart2_1;
        __m128i i4_samp_8x16b_rpart1_2, i4_samp_8x16b_rpart2_2;
        __m128i i4_samp_8x16b_rpart1_3, i4_samp_8x16b_rpart2_3;
        __m128i i4_samp_8x16b_rpart1_4, i4_samp_8x16b_rpart2_4;

        __m128i i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart2_0;
        __m128i i4_res_4x32b_rpart1_1, i4_res_4x32b_rpart2_1;
        __m128i i4_res_4x32b_rpart1_2, i4_res_4x32b_rpart2_2;
        __m128i i4_res_4x32b_rpart1_3, i4_res_4x32b_rpart2_3;

        __m128i res_512 = _mm_set1_epi32(512);
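
        /* Horizontal pass over the 16-bit vertical results: output columns again alternate
         * between the two 4-tap x phases, and _mm_madd_epi16 with the paired coefficient
         * vectors above evaluates two taps per lane. Each 1-D pass has a gain of 32, so the
         * combined gain is 1024; adding 512 and shifting right by 10 rounds the result back
         * to pixel range before it is packed to 8 bits. */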
        /* Horizontal interpolation */
        for(i4_y = 0; i4_y < 16; i4_y++)
        {
            i4_samp_8x16b_rpart1_0 = _mm_loadu_si128((__m128i *) pi2_tmp);
            i4_samp_8x16b_rpart2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 4));

            i4_samp_8x16b_rpart1_1 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 2);
            i4_samp_8x16b_rpart1_2 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 4);
            i4_samp_8x16b_rpart1_3 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 6);
            i4_samp_8x16b_rpart1_4 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 8);

            i4_samp_8x16b_rpart2_1 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 2);
            i4_samp_8x16b_rpart2_2 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 4);
            i4_samp_8x16b_rpart2_3 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 6);
            i4_samp_8x16b_rpart2_4 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 8);

            i4_samp_8x16b_rpart1_0 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_0, i4_samp_8x16b_rpart1_1);
            i4_samp_8x16b_rpart1_1 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_1, i4_samp_8x16b_rpart1_2);
            i4_samp_8x16b_rpart1_2 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_2, i4_samp_8x16b_rpart1_3);
            i4_samp_8x16b_rpart1_3 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_3, i4_samp_8x16b_rpart1_4);

            i4_samp_8x16b_rpart2_0 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_0, i4_samp_8x16b_rpart2_1);
            i4_samp_8x16b_rpart2_1 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_1, i4_samp_8x16b_rpart2_2);
            i4_samp_8x16b_rpart2_2 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_2, i4_samp_8x16b_rpart2_3);
            i4_samp_8x16b_rpart2_3 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_3, i4_samp_8x16b_rpart2_4);

            i4_res_4x32b_rpart1_0 = _mm_madd_epi16(i4_samp_8x16b_rpart1_0, coeff_c3_c2_8x16b);
            i4_res_4x32b_rpart1_2 = _mm_madd_epi16(i4_samp_8x16b_rpart1_2, coeff_c1_c0_8x16b);

            i4_res_4x32b_rpart1_1 = _mm_madd_epi16(i4_samp_8x16b_rpart1_1, coeff_c0_c1_8x16b);
            i4_res_4x32b_rpart1_3 = _mm_madd_epi16(i4_samp_8x16b_rpart1_3, coeff_c2_c3_8x16b);

            i4_res_4x32b_rpart2_0 = _mm_madd_epi16(i4_samp_8x16b_rpart2_0, coeff_c3_c2_8x16b);
            i4_res_4x32b_rpart2_2 = _mm_madd_epi16(i4_samp_8x16b_rpart2_2, coeff_c1_c0_8x16b);

            i4_res_4x32b_rpart2_1 = _mm_madd_epi16(i4_samp_8x16b_rpart2_1, coeff_c0_c1_8x16b);
            i4_res_4x32b_rpart2_3 = _mm_madd_epi16(i4_samp_8x16b_rpart2_3, coeff_c2_c3_8x16b);

            i4_res_4x32b_rpart1_0 = _mm_add_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_2);
            i4_res_4x32b_rpart1_1 = _mm_add_epi32(i4_res_4x32b_rpart1_1, i4_res_4x32b_rpart1_3);

            i4_res_4x32b_rpart2_0 = _mm_add_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_2);
            i4_res_4x32b_rpart2_1 = _mm_add_epi32(i4_res_4x32b_rpart2_1, i4_res_4x32b_rpart2_3);

            i4_res_4x32b_rpart1_2 =
                _mm_unpacklo_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1);
            i4_res_4x32b_rpart1_3 =
                _mm_unpackhi_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1);

            i4_res_4x32b_rpart2_2 =
                _mm_unpacklo_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1);
            i4_res_4x32b_rpart2_3 =
                _mm_unpackhi_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1);

            i4_res_4x32b_rpart1_0 = _mm_add_epi32(i4_res_4x32b_rpart1_2, res_512);
            i4_res_4x32b_rpart1_1 = _mm_add_epi32(i4_res_4x32b_rpart1_3, res_512);

            i4_res_4x32b_rpart1_0 = _mm_srai_epi32(i4_res_4x32b_rpart1_0, 10);
            i4_res_4x32b_rpart1_1 = _mm_srai_epi32(i4_res_4x32b_rpart1_1, 10);

            i4_res_4x32b_rpart2_0 = _mm_add_epi32(i4_res_4x32b_rpart2_2, res_512);
            i4_res_4x32b_rpart2_1 = _mm_add_epi32(i4_res_4x32b_rpart2_3, res_512);

            i4_res_4x32b_rpart2_0 = _mm_srai_epi32(i4_res_4x32b_rpart2_0, 10);
            i4_res_4x32b_rpart2_1 = _mm_srai_epi32(i4_res_4x32b_rpart2_1, 10);

            _mm_storeu_si128(
                (__m128i *) pu1_out,
                _mm_packus_epi16(_mm_packus_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1),
                                 _mm_packus_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1)));

            pi2_tmp += i4_filt_stride;
            pu1_out += i4_out_stride;

        } /* End of loop over y */
    }
}

void isvc_vert_interpol_chroma_dyadic_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                            WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    WORD8 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp;
    WORD16 *pi2_tmp;
    __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3, i4_samp_16x8b_4,
        i4_samp_16x8b_5;
    __m128i i4_res_8x16b_r0, i4_res_8x16b_r1, i4_res_8x16b_r2, i4_res_8x16b_r3, i4_res_8x16b_r4,
        i4_res_8x16b_r5, i4_res_8x16b_r6, i4_res_8x16b_r7;
    __m128i i4_res_8x16b_r7_temp;
    __m128i i4_c0_c1_16x8b, i4_c2_c3_16x8b;

    i4_coeff_0 = (WORD8) (16 - i4_phase_0);
    i4_coeff_1 = (WORD8) (i4_phase_0);
    i4_coeff_2 = (WORD8) (16 - i4_phase_1);
    i4_coeff_3 = (WORD8) (i4_phase_1);
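
    /* Chroma resampling is a 2-tap bilinear filter: each output row is a weighted average of
     * two adjacent reference rows with weights (16 - phase, phase). Even output rows use the
     * (c0, c1) pair derived from i4_phase_0 and odd rows the (c2, c3) pair derived from
     * i4_phase_1; the byte-coefficient vectors below feed _mm_maddubs_epi16 so one
     * multiply-add filters a whole row of interleaved sample pairs. */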

    i4_c0_c1_16x8b =
        _mm_set_epi8(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                     i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                     i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
    i4_c2_c3_16x8b =
        _mm_set_epi8(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                     i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                     i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2);

    /* Initializing pointers */
    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    i4_filt_stride = 6;
    i4_src_stride = DYADIC_REF_W_C;

    i4_samp_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_inp));
    i4_samp_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_inp + i4_src_stride));
    i4_samp_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1)));
    i4_samp_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
    i4_samp_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 2)));
    i4_samp_16x8b_5 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 2) + i4_src_stride));

    i4_samp_16x8b_0 = _mm_unpacklo_epi8(i4_samp_16x8b_0, i4_samp_16x8b_1);
    i4_res_8x16b_r0 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp), i4_res_8x16b_r0);

    i4_samp_16x8b_1 = _mm_unpacklo_epi8(i4_samp_16x8b_1, i4_samp_16x8b_2);
    i4_res_8x16b_r1 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r1);

    i4_res_8x16b_r2 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1)), i4_res_8x16b_r2);

    i4_samp_16x8b_2 = _mm_unpacklo_epi8(i4_samp_16x8b_2, i4_samp_16x8b_3);
    i4_res_8x16b_r3 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1) + i4_filt_stride),
                     i4_res_8x16b_r3);

    i4_res_8x16b_r4 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2)), i4_res_8x16b_r4);

    i4_samp_16x8b_3 = _mm_unpacklo_epi8(i4_samp_16x8b_3, i4_samp_16x8b_4);
    i4_res_8x16b_r5 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + i4_filt_stride),
                     i4_res_8x16b_r5);

    i4_res_8x16b_r6 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c0_c1_16x8b);
    _mm_storel_epi64((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1)),
                     i4_res_8x16b_r6);

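    /* Rows 6 and 7 end the 8x6 16-bit intermediate block. Row 6 was written above with a
     * 64-bit store (its first 4 results); the shuffles and blend below pack row 6's last 2
     * results together with all 6 results of row 7 into a single 128-bit store at offset 40,
     * which keeps the write within the 48-element temporary buffer. */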
    i4_res_8x16b_r6 = _mm_shuffle_epi32(i4_res_8x16b_r6, 78);

    i4_samp_16x8b_4 = _mm_unpacklo_epi8(i4_samp_16x8b_4, i4_samp_16x8b_5);

    i4_res_8x16b_r7 = _mm_maddubs_epi16(i4_samp_16x8b_4, i4_c2_c3_16x8b);

    i4_res_8x16b_r7 = _mm_shuffle_epi32(i4_res_8x16b_r7, 147);

    i4_res_8x16b_r7_temp = _mm_blend_epi16(i4_res_8x16b_r6, i4_res_8x16b_r7, 252);

    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1) + 4),
                     i4_res_8x16b_r7_temp);
}

void isvc_horz_interpol_chroma_dyadic_sse42(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
                                            WORD32 i4_out_stride, WORD32 i4_phase_0,
                                            WORD32 i4_phase_1)
{
    WORD32 i4_dst_stride, i4_dst_stride2, i4_dst_stride4;
    UWORD8 *pu1_out;
    WORD16 *pi2_tmp;

    __m128i i4_samp_8x16b_r1_0, i4_samp_8x16b_r1_1, i4_samp_8x16b_r1_2;
    __m128i i4_samp_8x16b_r2_0, i4_samp_8x16b_r2_1, i4_samp_8x16b_r2_2;
    __m128i i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1, i4_samp_8x16b_r3_2;
    __m128i i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1, i4_samp_8x16b_r4_2;
    __m128i i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1, i4_samp_8x16b_r5_2;
    __m128i i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1, i4_samp_8x16b_r6_2;
    __m128i i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1, i4_samp_8x16b_r7_2;
    __m128i i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1, i4_samp_8x16b_r8_2;

    __m128i i4_res_4x32b_r1_0, i4_res_4x32b_r1_1;
    __m128i i4_res_4x32b_r2_0, i4_res_4x32b_r2_1;
    __m128i i4_res_4x32b_r3_0, i4_res_4x32b_r3_1;
    __m128i i4_res_4x32b_r4_0, i4_res_4x32b_r4_1;
    __m128i i4_res_4x32b_r5_0, i4_res_4x32b_r5_1;
    __m128i i4_res_4x32b_r6_0, i4_res_4x32b_r6_1;
    __m128i i4_res_4x32b_r7_0, i4_res_4x32b_r7_1;
    __m128i i4_res_4x32b_r8_0, i4_res_4x32b_r8_1;

    __m128i i4_res_final_8x16b_r1, i4_res_final_8x16b_r2, i4_res_final_8x16b_r3,
        i4_res_final_8x16b_r4, i4_res_final_8x16b_r5, i4_res_final_8x16b_r6, i4_res_final_8x16b_r7,
        i4_res_final_8x16b_r8;

    __m128i out_16x8b_r1, out_16x8b_r2, out_16x8b_r3, out_16x8b_r4, out_16x8b_r5, out_16x8b_r6,
        out_16x8b_r7, out_16x8b_r8;

    __m128i i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1;
    __m128i i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1;
    __m128i i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1;
    __m128i i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1;
    __m128i chroma_mask, chroma_mask2;

    WORD32 i4_coeff_0 = 16 - i4_phase_0;
    WORD32 i4_coeff_1 = i4_phase_0;
    WORD32 i4_coeff_2 = 16 - i4_phase_1;
    WORD32 i4_coeff_3 = i4_phase_1;
    __m128i coeff_c0_c1_8x16b = _mm_set_epi16(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                                              i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
    __m128i coeff_c2_c3_8x16b = _mm_set_epi16(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                                              i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2);
    __m128i res_128 = _mm_set1_epi32(128);
    UWORD32 u4_norm_factor = 8;
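
    /* Horizontal pass of the chroma bilinear filter: pairs of adjacent 16-bit column samples
     * are combined with _mm_madd_epi16 using (c0, c1) for even output columns and (c2, c3)
     * for odd ones. The vertical and horizontal weights each sum to 16, so the combined gain
     * is 256; adding 128 and shifting right by u4_norm_factor (8) rounds back to pixel range. */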

    /* Initializing pointers */
    pu1_out = pu1_out_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    i4_dst_stride = i4_out_stride;

    i4_dst_stride2 = i4_dst_stride << 1;
    i4_dst_stride4 = i4_dst_stride << 2;

    /* Horizontal interpolation */
    i4_samp_8x16b_r1_0 = _mm_loadu_si128((__m128i *) pi2_tmp);
    i4_samp_8x16b_r2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 6));
    i4_samp_8x16b_r3_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 12));
    i4_samp_8x16b_r4_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 18));
    i4_samp_8x16b_r5_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 24));
    i4_samp_8x16b_r6_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 30));
    i4_samp_8x16b_r7_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 36));
    i4_samp_8x16b_r8_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 42));

    i4_samp_8x16b_r1_1 = _mm_srli_si128(i4_samp_8x16b_r1_0, 2);
    i4_samp_8x16b_r1_2 = _mm_srli_si128(i4_samp_8x16b_r1_0, 4);

    i4_samp_8x16b_r2_1 = _mm_srli_si128(i4_samp_8x16b_r2_0, 2);
    i4_samp_8x16b_r2_2 = _mm_srli_si128(i4_samp_8x16b_r2_0, 4);

    i4_samp_8x16b_r3_1 = _mm_srli_si128(i4_samp_8x16b_r3_0, 2);
    i4_samp_8x16b_r3_2 = _mm_srli_si128(i4_samp_8x16b_r3_0, 4);

    i4_samp_8x16b_r4_1 = _mm_srli_si128(i4_samp_8x16b_r4_0, 2);
    i4_samp_8x16b_r4_2 = _mm_srli_si128(i4_samp_8x16b_r4_0, 4);

    i4_samp_8x16b_r5_1 = _mm_srli_si128(i4_samp_8x16b_r5_0, 2);
    i4_samp_8x16b_r5_2 = _mm_srli_si128(i4_samp_8x16b_r5_0, 4);

    i4_samp_8x16b_r6_1 = _mm_srli_si128(i4_samp_8x16b_r6_0, 2);
    i4_samp_8x16b_r6_2 = _mm_srli_si128(i4_samp_8x16b_r6_0, 4);

    i4_samp_8x16b_r7_1 = _mm_srli_si128(i4_samp_8x16b_r7_0, 2);
    i4_samp_8x16b_r7_2 = _mm_srli_si128(i4_samp_8x16b_r7_0, 4);

    i4_samp_8x16b_r8_1 = _mm_srli_si128(i4_samp_8x16b_r8_0, 2);
    i4_samp_8x16b_r8_2 = _mm_srli_si128(i4_samp_8x16b_r8_0, 4);

    i4_samp_8x16b_r1_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_0, i4_samp_8x16b_r1_1);
    i4_samp_8x16b_r2_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_0, i4_samp_8x16b_r2_1);
    i4_samp_8x16b_r3_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1);
    i4_samp_8x16b_r4_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1);
    i4_samp_8x16b_r5_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1);
    i4_samp_8x16b_r6_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1);
    i4_samp_8x16b_r7_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1);
    i4_samp_8x16b_r8_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1);

    i4_samp_8x16b_r1_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_1, i4_samp_8x16b_r1_2);
    i4_samp_8x16b_r2_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_1, i4_samp_8x16b_r2_2);
    i4_samp_8x16b_r3_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_1, i4_samp_8x16b_r3_2);
    i4_samp_8x16b_r4_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_1, i4_samp_8x16b_r4_2);
    i4_samp_8x16b_r5_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_1, i4_samp_8x16b_r5_2);
    i4_samp_8x16b_r6_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_1, i4_samp_8x16b_r6_2);
    i4_samp_8x16b_r7_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_1, i4_samp_8x16b_r7_2);
    i4_samp_8x16b_r8_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_1, i4_samp_8x16b_r8_2);

    /* a0c0 + a1c1  a1c0 + a2c1  a2c0 + a3c1  a3c0 + a4c1 */
    i4_res_4x32b_r1_0 = _mm_madd_epi16(i4_samp_8x16b_r1_0, coeff_c0_c1_8x16b);
    /* b0c0 + b1c1  b1c0 + b2c1  b2c0 + b3c1  b3c0 + b4c1 */
    i4_res_4x32b_r2_0 = _mm_madd_epi16(i4_samp_8x16b_r2_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r3_0 = _mm_madd_epi16(i4_samp_8x16b_r3_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r4_0 = _mm_madd_epi16(i4_samp_8x16b_r4_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r5_0 = _mm_madd_epi16(i4_samp_8x16b_r5_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r6_0 = _mm_madd_epi16(i4_samp_8x16b_r6_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r7_0 = _mm_madd_epi16(i4_samp_8x16b_r7_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r8_0 = _mm_madd_epi16(i4_samp_8x16b_r8_0, coeff_c0_c1_8x16b);

    /* a1c2 + a2c3  a2c2 + a3c3  a3c2 + a4c3  a4c2 + a5c3 */
    i4_res_4x32b_r1_1 = _mm_madd_epi16(i4_samp_8x16b_r1_1, coeff_c2_c3_8x16b);
    /* b1c2 + b2c3  b2c2 + b3c3  b3c2 + b4c3  b4c2 + b5c3 */
    i4_res_4x32b_r2_1 = _mm_madd_epi16(i4_samp_8x16b_r2_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r3_1 = _mm_madd_epi16(i4_samp_8x16b_r3_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r4_1 = _mm_madd_epi16(i4_samp_8x16b_r4_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r5_1 = _mm_madd_epi16(i4_samp_8x16b_r5_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r6_1 = _mm_madd_epi16(i4_samp_8x16b_r6_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r7_1 = _mm_madd_epi16(i4_samp_8x16b_r7_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r8_1 = _mm_madd_epi16(i4_samp_8x16b_r8_1, coeff_c2_c3_8x16b);

    i4_res_4x32b_r1_0 = _mm_add_epi32(i4_res_4x32b_r1_0, res_128);
    i4_res_4x32b_r2_0 = _mm_add_epi32(i4_res_4x32b_r2_0, res_128);
    i4_res_4x32b_r3_0 = _mm_add_epi32(i4_res_4x32b_r3_0, res_128);
    i4_res_4x32b_r4_0 = _mm_add_epi32(i4_res_4x32b_r4_0, res_128);
    i4_res_4x32b_r5_0 = _mm_add_epi32(i4_res_4x32b_r5_0, res_128);
    i4_res_4x32b_r6_0 = _mm_add_epi32(i4_res_4x32b_r6_0, res_128);
    i4_res_4x32b_r7_0 = _mm_add_epi32(i4_res_4x32b_r7_0, res_128);
    i4_res_4x32b_r8_0 = _mm_add_epi32(i4_res_4x32b_r8_0, res_128);

    i4_res_4x32b_r1_1 = _mm_add_epi32(i4_res_4x32b_r1_1, res_128);
    i4_res_4x32b_r2_1 = _mm_add_epi32(i4_res_4x32b_r2_1, res_128);
    i4_res_4x32b_r3_1 = _mm_add_epi32(i4_res_4x32b_r3_1, res_128);
    i4_res_4x32b_r4_1 = _mm_add_epi32(i4_res_4x32b_r4_1, res_128);
    i4_res_4x32b_r5_1 = _mm_add_epi32(i4_res_4x32b_r5_1, res_128);
    i4_res_4x32b_r6_1 = _mm_add_epi32(i4_res_4x32b_r6_1, res_128);
    i4_res_4x32b_r7_1 = _mm_add_epi32(i4_res_4x32b_r7_1, res_128);
    i4_res_4x32b_r8_1 = _mm_add_epi32(i4_res_4x32b_r8_1, res_128);

    i4_res_4x32b_r1_0 = _mm_srai_epi32(i4_res_4x32b_r1_0, u4_norm_factor);
    i4_res_4x32b_r2_0 = _mm_srai_epi32(i4_res_4x32b_r2_0, u4_norm_factor);
    i4_res_4x32b_r3_0 = _mm_srai_epi32(i4_res_4x32b_r3_0, u4_norm_factor);
    i4_res_4x32b_r4_0 = _mm_srai_epi32(i4_res_4x32b_r4_0, u4_norm_factor);
    i4_res_4x32b_r5_0 = _mm_srai_epi32(i4_res_4x32b_r5_0, u4_norm_factor);
    i4_res_4x32b_r6_0 = _mm_srai_epi32(i4_res_4x32b_r6_0, u4_norm_factor);
    i4_res_4x32b_r7_0 = _mm_srai_epi32(i4_res_4x32b_r7_0, u4_norm_factor);
    i4_res_4x32b_r8_0 = _mm_srai_epi32(i4_res_4x32b_r8_0, u4_norm_factor);

    i4_res_4x32b_r1_1 = _mm_srai_epi32(i4_res_4x32b_r1_1, u4_norm_factor);
    i4_res_4x32b_r2_1 = _mm_srai_epi32(i4_res_4x32b_r2_1, u4_norm_factor);
    i4_res_4x32b_r3_1 = _mm_srai_epi32(i4_res_4x32b_r3_1, u4_norm_factor);
    i4_res_4x32b_r4_1 = _mm_srai_epi32(i4_res_4x32b_r4_1, u4_norm_factor);
    i4_res_4x32b_r5_1 = _mm_srai_epi32(i4_res_4x32b_r5_1, u4_norm_factor);
    i4_res_4x32b_r6_1 = _mm_srai_epi32(i4_res_4x32b_r6_1, u4_norm_factor);
    i4_res_4x32b_r7_1 = _mm_srai_epi32(i4_res_4x32b_r7_1, u4_norm_factor);
    i4_res_4x32b_r8_1 = _mm_srai_epi32(i4_res_4x32b_r8_1, u4_norm_factor);

    i4_res_final_8x16b_r12_0 = _mm_packs_epi32(i4_res_4x32b_r1_0, i4_res_4x32b_r2_0);
    i4_res_final_8x16b_r34_0 = _mm_packs_epi32(i4_res_4x32b_r3_0, i4_res_4x32b_r4_0);
    i4_res_final_8x16b_r56_0 = _mm_packs_epi32(i4_res_4x32b_r5_0, i4_res_4x32b_r6_0);
    i4_res_final_8x16b_r67_0 = _mm_packs_epi32(i4_res_4x32b_r7_0, i4_res_4x32b_r8_0);

    i4_res_final_8x16b_r12_1 = _mm_packs_epi32(i4_res_4x32b_r1_1, i4_res_4x32b_r2_1);
    i4_res_final_8x16b_r34_1 = _mm_packs_epi32(i4_res_4x32b_r3_1, i4_res_4x32b_r4_1);
    i4_res_final_8x16b_r56_1 = _mm_packs_epi32(i4_res_4x32b_r5_1, i4_res_4x32b_r6_1);
    i4_res_final_8x16b_r67_1 = _mm_packs_epi32(i4_res_4x32b_r7_1, i4_res_4x32b_r8_1);

    i4_res_final_8x16b_r1 = _mm_unpacklo_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
    i4_res_final_8x16b_r2 = _mm_unpackhi_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
    i4_res_final_8x16b_r3 = _mm_unpacklo_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
    i4_res_final_8x16b_r4 = _mm_unpackhi_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
    i4_res_final_8x16b_r5 = _mm_unpacklo_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
    i4_res_final_8x16b_r6 = _mm_unpackhi_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
    i4_res_final_8x16b_r7 = _mm_unpacklo_epi16(i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1);
    i4_res_final_8x16b_r8 = _mm_unpackhi_epi16(i4_res_final_8x16b_r67_0, i4_res_final_8x16b_r67_1);

    chroma_mask = _mm_set1_epi16(0xFF00);
    chroma_mask2 = _mm_set1_epi16(0x00FF);
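
    /* The destination holds interleaved Cb/Cr bytes and this call writes only one of the two
     * components. The existing output rows are loaded, chroma_mask (0xFF00) keeps the other
     * component's bytes, chroma_mask2 (0x00FF) keeps the low byte of each new 16-bit result,
     * and the adds below merge them so the companion chroma samples are left untouched. */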
    out_16x8b_r1 = _mm_loadu_si128((__m128i *) (&pu1_out[0]));
    out_16x8b_r2 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride]));
    out_16x8b_r3 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2]));
    out_16x8b_r4 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2 + i4_dst_stride]));
    out_16x8b_r5 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4]));
    out_16x8b_r6 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride]));
    out_16x8b_r7 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2]));
    out_16x8b_r8 =
        _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2 + i4_dst_stride]));

    out_16x8b_r1 = _mm_and_si128(out_16x8b_r1, chroma_mask);
    out_16x8b_r2 = _mm_and_si128(out_16x8b_r2, chroma_mask);
    out_16x8b_r3 = _mm_and_si128(out_16x8b_r3, chroma_mask);
    out_16x8b_r4 = _mm_and_si128(out_16x8b_r4, chroma_mask);
    out_16x8b_r5 = _mm_and_si128(out_16x8b_r5, chroma_mask);
    out_16x8b_r6 = _mm_and_si128(out_16x8b_r6, chroma_mask);
    out_16x8b_r7 = _mm_and_si128(out_16x8b_r7, chroma_mask);
    out_16x8b_r8 = _mm_and_si128(out_16x8b_r8, chroma_mask);

    i4_res_final_8x16b_r1 = _mm_and_si128(i4_res_final_8x16b_r1, chroma_mask2);
    i4_res_final_8x16b_r2 = _mm_and_si128(i4_res_final_8x16b_r2, chroma_mask2);
    i4_res_final_8x16b_r3 = _mm_and_si128(i4_res_final_8x16b_r3, chroma_mask2);
    i4_res_final_8x16b_r4 = _mm_and_si128(i4_res_final_8x16b_r4, chroma_mask2);
    i4_res_final_8x16b_r5 = _mm_and_si128(i4_res_final_8x16b_r5, chroma_mask2);
    i4_res_final_8x16b_r6 = _mm_and_si128(i4_res_final_8x16b_r6, chroma_mask2);
    i4_res_final_8x16b_r7 = _mm_and_si128(i4_res_final_8x16b_r7, chroma_mask2);
    i4_res_final_8x16b_r8 = _mm_and_si128(i4_res_final_8x16b_r8, chroma_mask2);

    out_16x8b_r1 = _mm_add_epi8(i4_res_final_8x16b_r1, out_16x8b_r1);
    out_16x8b_r2 = _mm_add_epi8(i4_res_final_8x16b_r2, out_16x8b_r2);
    out_16x8b_r3 = _mm_add_epi8(i4_res_final_8x16b_r3, out_16x8b_r3);
    out_16x8b_r4 = _mm_add_epi8(i4_res_final_8x16b_r4, out_16x8b_r4);
    out_16x8b_r5 = _mm_add_epi8(i4_res_final_8x16b_r5, out_16x8b_r5);
    out_16x8b_r6 = _mm_add_epi8(i4_res_final_8x16b_r6, out_16x8b_r6);
    out_16x8b_r7 = _mm_add_epi8(i4_res_final_8x16b_r7, out_16x8b_r7);
    out_16x8b_r8 = _mm_add_epi8(i4_res_final_8x16b_r8, out_16x8b_r8);

    _mm_storeu_si128((__m128i *) pu1_out, out_16x8b_r1);
    _mm_storeu_si128((__m128i *) (pu1_out + i4_dst_stride), out_16x8b_r2);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 2)), out_16x8b_r3);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 3)), out_16x8b_r4);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 4)), out_16x8b_r5);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 5)), out_16x8b_r6);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 6)), out_16x8b_r7);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 7)), out_16x8b_r8);
}