/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */
/*!
 **************************************************************************
 * \file ih264d_resamp_svc.c
 *
 * \brief
 *    Contains SSE4.2 routines for SVC dyadic intra resampling
 *
 **************************************************************************
 */
#include <immintrin.h>

#include "ih264_typedefs.h"
#include "ih264_debug.h"
#include "isvc_intra_resample.h"

void isvc_interpolate_base_luma_dyadic_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                             UWORD8 *pu1_out_buf, WORD32 i4_out_stride)
{
    WORD32 i4_y;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp, *pu1_out;
    WORD16 *pi2_tmp;

    __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3;
    __m128i i4_samp_8x16b_0, i4_samp_8x16b_1, i4_samp_8x16b_2, i4_samp_8x16b_3;
    __m128i i4_res_8x16b_r1_1, i4_res_8x16b_r1_2, i4_res_8x16b_r1_3;
    __m128i i4_res_8x16b_r2_1, i4_res_8x16b_r2_2, i4_res_8x16b_r2_3;

    /* Filter coefficient values for phase 4 */
    __m128i i4_coeff_8x16b_0 = _mm_set1_epi16(-3);
    __m128i i4_coeff_8x16b_1 = _mm_set1_epi16(28);
    i4_filt_stride = 12;
    i4_src_stride = DYADIC_REF_W_Y;
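
    /*
     * Dyadic 2x luma upsampling uses the 4-tap filters for phases 4 and 12,
     * {-3, 28, 8, -1} and {-1, 8, 28, -3}. Only the -3 and 28 taps are kept
     * in registers: the 8 tap is realised as a left shift by 3 and the -1
     * tap as a subtraction. As a scalar sketch (illustrative only), each
     * intermediate row of the vertical pass is
     *
     *     phase  4: tmp[x] = -3 * s0[x] + 28 * s1[x] + 8 * s2[x] - s3[x];
     *     phase 12: tmp[x] = -s0[x] + 8 * s1[x] + 28 * s2[x] - 3 * s3[x];
     *
     * where s0..s3 are four vertically adjacent reference-layer samples.
     */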

    /* Initializing pointers */
    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    pu1_out = pu1_out_buf;

    /* Vertical interpolation */
    /* First 64 bits */
    /* y = 0, y_phase = 12 */
    i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
    i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
    i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
    i4_samp_16x8b_3 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
    pu1_inp += (i4_src_stride << 2);
    i4_samp_8x16b_0 = _mm_cvtepu8_epi16(i4_samp_16x8b_0);
    i4_samp_8x16b_1 = _mm_cvtepu8_epi16(i4_samp_16x8b_1);
    i4_samp_8x16b_2 = _mm_cvtepu8_epi16(i4_samp_16x8b_2);
    i4_samp_8x16b_3 = _mm_cvtepu8_epi16(i4_samp_16x8b_3);

    /* y_phase is 12 for y = 0 */
    /* Multiply by 8 => left shift by 3 */
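    /* tmp[x] = -s0[x] + 8 * s1[x] + 28 * s2[x] - 3 * s3[x] */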
    i4_res_8x16b_r1_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
    i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
    i4_res_8x16b_r1_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
    i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_0);
    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

    _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
    pi2_tmp += i4_filt_stride;

    for(i4_y = 1; i4_y < 15; i4_y += 2)
    {
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
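        /* Row 1 (phase 4):  tmp[x] = -3 * s0[x] + 28 * s1[x] + 8 * s2[x] - s3[x] */
        /* Row 2 (phase 12): tmp[x] = -s0[x] + 8 * s1[x] + 28 * s2[x] - 3 * s3[x] */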
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

    } /* End of loop over y */

    /* y = 15, y_phase = 4 */
    i4_samp_8x16b_0 = i4_samp_8x16b_1;
    i4_samp_8x16b_1 = i4_samp_8x16b_2;
    i4_samp_8x16b_2 = i4_samp_8x16b_3;
    i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

    i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
    i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
    i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
    i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);

    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

    /* Store the output */
    _mm_storeu_si128((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);

    /* Reinitializing the pointers */
    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;

    /* Remaining 4 columns: repeat the same vertical filtering for columns
       8-11 of the 12-wide intermediate buffer */
    pu1_inp += 8;
    pi2_tmp += 8;

    /* y = 0, y_phase = 12 */
    i4_samp_16x8b_0 = _mm_loadl_epi64((__m128i *) (pu1_inp));
    i4_samp_16x8b_1 = _mm_loadl_epi64((__m128i *) (pu1_inp + i4_src_stride));
    i4_samp_16x8b_2 = _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1)));
    i4_samp_16x8b_3 =
        _mm_loadl_epi64((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
    pu1_inp += (i4_src_stride << 2);
    i4_samp_8x16b_0 = _mm_cvtepu8_epi16(i4_samp_16x8b_0);
    i4_samp_8x16b_1 = _mm_cvtepu8_epi16(i4_samp_16x8b_1);
    i4_samp_8x16b_2 = _mm_cvtepu8_epi16(i4_samp_16x8b_2);
    i4_samp_8x16b_3 = _mm_cvtepu8_epi16(i4_samp_16x8b_3);

    /* y_phase is 12 for y = 0 */
    /* Multiply by 8 => left shift by 3 */
    i4_res_8x16b_r1_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
    i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
    i4_res_8x16b_r1_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
    i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_0);
    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

    _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
    pi2_tmp += i4_filt_stride;

    for(i4_y = 1; i4_y < 15; i4_y += 2)
    {
        i4_samp_8x16b_0 = i4_samp_8x16b_1;
        i4_samp_8x16b_1 = i4_samp_8x16b_2;
        i4_samp_8x16b_2 = i4_samp_8x16b_3;
        i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

        /* y_phase is 4 for odd values of y */
        /* and 12 for even values of y */
        /* Multiply by 8 => left shift by 3 */
        i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
        i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
        i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);

        i4_res_8x16b_r2_1 = _mm_slli_epi16(i4_samp_8x16b_1, 3);
        i4_res_8x16b_r2_2 = _mm_mullo_epi16(i4_samp_8x16b_2, i4_coeff_8x16b_1);
        i4_res_8x16b_r2_3 = _mm_mullo_epi16(i4_samp_8x16b_3, i4_coeff_8x16b_0);

        i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);
        i4_res_8x16b_r2_3 = _mm_subs_epi16(i4_res_8x16b_r2_3, i4_samp_8x16b_0);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_2);

        i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);
        i4_res_8x16b_r2_1 = _mm_adds_epi16(i4_res_8x16b_r2_1, i4_res_8x16b_r2_3);

        /* Storing the results */
        _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);
        _mm_storel_epi64((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r2_1);
        pi2_tmp += (i4_filt_stride << 1);
        pu1_inp += i4_src_stride;

    } /* End of loop over y */

    /* y = 15, y_phase = 4 */
    i4_samp_8x16b_0 = i4_samp_8x16b_1;
    i4_samp_8x16b_1 = i4_samp_8x16b_2;
    i4_samp_8x16b_2 = i4_samp_8x16b_3;
    i4_samp_8x16b_3 = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i *) (pu1_inp)));

    i4_res_8x16b_r1_1 = _mm_mullo_epi16(i4_samp_8x16b_0, i4_coeff_8x16b_0);
    i4_res_8x16b_r1_2 = _mm_mullo_epi16(i4_samp_8x16b_1, i4_coeff_8x16b_1);
    i4_res_8x16b_r1_3 = _mm_slli_epi16(i4_samp_8x16b_2, 3);
    i4_res_8x16b_r1_3 = _mm_subs_epi16(i4_res_8x16b_r1_3, i4_samp_8x16b_3);

    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_2);
    i4_res_8x16b_r1_1 = _mm_adds_epi16(i4_res_8x16b_r1_1, i4_res_8x16b_r1_3);

    /* Store the output */
    _mm_storel_epi64((__m128i *) pi2_tmp, i4_res_8x16b_r1_1);

    /* Reinitializing the pointers */
    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;

    {
        __m128i coeff_c0_c1_8x16b = _mm_set_epi16(28, -3, 28, -3, 28, -3, 28, -3);
        __m128i coeff_c2_c3_8x16b = _mm_set_epi16(-1, 8, -1, 8, -1, 8, -1, 8);
        __m128i coeff_c3_c2_8x16b = _mm_set_epi16(8, -1, 8, -1, 8, -1, 8, -1);
        __m128i coeff_c1_c0_8x16b = _mm_set_epi16(-3, 28, -3, 28, -3, 28, -3, 28);
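
        /*
         * The horizontal pass uses _mm_madd_epi16, which sums pairs of 16-bit
         * products into 32-bit lanes. The taps {c0, c1, c2, c3} = {-3, 28, 8, -1}
         * are therefore kept in both pairings and both orders: even output
         * pixels use the phase-12 filter {-1, 8, 28, -3} and odd output pixels
         * the phase-4 filter {-3, 28, 8, -1} over the same sliding window.
         */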

        __m128i i4_samp_8x16b_rpart1_0, i4_samp_8x16b_rpart2_0;
        __m128i i4_samp_8x16b_rpart1_1, i4_samp_8x16b_rpart2_1;
        __m128i i4_samp_8x16b_rpart1_2, i4_samp_8x16b_rpart2_2;
        __m128i i4_samp_8x16b_rpart1_3, i4_samp_8x16b_rpart2_3;
        __m128i i4_samp_8x16b_rpart1_4, i4_samp_8x16b_rpart2_4;

        __m128i i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart2_0;
        __m128i i4_res_4x32b_rpart1_1, i4_res_4x32b_rpart2_1;
        __m128i i4_res_4x32b_rpart1_2, i4_res_4x32b_rpart2_2;
        __m128i i4_res_4x32b_rpart1_3, i4_res_4x32b_rpart2_3;

        __m128i res_512 = _mm_set1_epi32(512);
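        /* Each pass has a DC gain of 32 (the taps sum to 32), so the two
           passes combine to a gain of 1024: round with 512, shift right by 10 */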
        /* Horizontal interpolation */
        for(i4_y = 0; i4_y < 16; i4_y++)
        {
            i4_samp_8x16b_rpart1_0 = _mm_loadu_si128((__m128i *) pi2_tmp);
            i4_samp_8x16b_rpart2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 4));

            i4_samp_8x16b_rpart1_1 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 2);
            i4_samp_8x16b_rpart1_2 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 4);
            i4_samp_8x16b_rpart1_3 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 6);
            i4_samp_8x16b_rpart1_4 = _mm_srli_si128(i4_samp_8x16b_rpart1_0, 8);

            i4_samp_8x16b_rpart2_1 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 2);
            i4_samp_8x16b_rpart2_2 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 4);
            i4_samp_8x16b_rpart2_3 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 6);
            i4_samp_8x16b_rpart2_4 = _mm_srli_si128(i4_samp_8x16b_rpart2_0, 8);

            i4_samp_8x16b_rpart1_0 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_0, i4_samp_8x16b_rpart1_1);
            i4_samp_8x16b_rpart1_1 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_1, i4_samp_8x16b_rpart1_2);
            i4_samp_8x16b_rpart1_2 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_2, i4_samp_8x16b_rpart1_3);
            i4_samp_8x16b_rpart1_3 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart1_3, i4_samp_8x16b_rpart1_4);

            i4_samp_8x16b_rpart2_0 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_0, i4_samp_8x16b_rpart2_1);
            i4_samp_8x16b_rpart2_1 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_1, i4_samp_8x16b_rpart2_2);
            i4_samp_8x16b_rpart2_2 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_2, i4_samp_8x16b_rpart2_3);
            i4_samp_8x16b_rpart2_3 =
                _mm_unpacklo_epi16(i4_samp_8x16b_rpart2_3, i4_samp_8x16b_rpart2_4);

            i4_res_4x32b_rpart1_0 = _mm_madd_epi16(i4_samp_8x16b_rpart1_0, coeff_c3_c2_8x16b);
            i4_res_4x32b_rpart1_2 = _mm_madd_epi16(i4_samp_8x16b_rpart1_2, coeff_c1_c0_8x16b);

            i4_res_4x32b_rpart1_1 = _mm_madd_epi16(i4_samp_8x16b_rpart1_1, coeff_c0_c1_8x16b);
            i4_res_4x32b_rpart1_3 = _mm_madd_epi16(i4_samp_8x16b_rpart1_3, coeff_c2_c3_8x16b);

            i4_res_4x32b_rpart2_0 = _mm_madd_epi16(i4_samp_8x16b_rpart2_0, coeff_c3_c2_8x16b);
            i4_res_4x32b_rpart2_2 = _mm_madd_epi16(i4_samp_8x16b_rpart2_2, coeff_c1_c0_8x16b);

            i4_res_4x32b_rpart2_1 = _mm_madd_epi16(i4_samp_8x16b_rpart2_1, coeff_c0_c1_8x16b);
            i4_res_4x32b_rpart2_3 = _mm_madd_epi16(i4_samp_8x16b_rpart2_3, coeff_c2_c3_8x16b);

            i4_res_4x32b_rpart1_0 = _mm_add_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_2);
            i4_res_4x32b_rpart1_1 = _mm_add_epi32(i4_res_4x32b_rpart1_1, i4_res_4x32b_rpart1_3);

            i4_res_4x32b_rpart2_0 = _mm_add_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_2);
            i4_res_4x32b_rpart2_1 = _mm_add_epi32(i4_res_4x32b_rpart2_1, i4_res_4x32b_rpart2_3);

            i4_res_4x32b_rpart1_2 =
                _mm_unpacklo_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1);
            i4_res_4x32b_rpart1_3 =
                _mm_unpackhi_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1);

            i4_res_4x32b_rpart2_2 =
                _mm_unpacklo_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1);
            i4_res_4x32b_rpart2_3 =
                _mm_unpackhi_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1);

            i4_res_4x32b_rpart1_0 = _mm_add_epi32(i4_res_4x32b_rpart1_2, res_512);
            i4_res_4x32b_rpart1_1 = _mm_add_epi32(i4_res_4x32b_rpart1_3, res_512);

            i4_res_4x32b_rpart1_0 = _mm_srai_epi32(i4_res_4x32b_rpart1_0, 10);
            i4_res_4x32b_rpart1_1 = _mm_srai_epi32(i4_res_4x32b_rpart1_1, 10);

            i4_res_4x32b_rpart2_0 = _mm_add_epi32(i4_res_4x32b_rpart2_2, res_512);
            i4_res_4x32b_rpart2_1 = _mm_add_epi32(i4_res_4x32b_rpart2_3, res_512);

            i4_res_4x32b_rpart2_0 = _mm_srai_epi32(i4_res_4x32b_rpart2_0, 10);
            i4_res_4x32b_rpart2_1 = _mm_srai_epi32(i4_res_4x32b_rpart2_1, 10);

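            /* Pack the results to bytes with unsigned saturation and store one
               16-pixel output row */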
            _mm_storeu_si128(
                (__m128i *) pu1_out,
                _mm_packus_epi16(_mm_packus_epi32(i4_res_4x32b_rpart1_0, i4_res_4x32b_rpart1_1),
                                 _mm_packus_epi32(i4_res_4x32b_rpart2_0, i4_res_4x32b_rpart2_1)));

            pi2_tmp += i4_filt_stride;
            pu1_out += i4_out_stride;

        } /* End of loop over y */
    }
}

void isvc_vert_interpol_chroma_dyadic_sse42(UWORD8 *pu1_inp_buf, WORD16 *pi2_tmp_filt_buf,
                                            WORD32 i4_phase_0, WORD32 i4_phase_1)
{
    WORD8 i4_coeff_0, i4_coeff_1, i4_coeff_2, i4_coeff_3;
    WORD32 i4_filt_stride, i4_src_stride;
    UWORD8 *pu1_inp;
    WORD16 *pi2_tmp;
    __m128i i4_samp_16x8b_0, i4_samp_16x8b_1, i4_samp_16x8b_2, i4_samp_16x8b_3, i4_samp_16x8b_4,
        i4_samp_16x8b_5;
    __m128i i4_res_8x16b_r0, i4_res_8x16b_r1, i4_res_8x16b_r2, i4_res_8x16b_r3, i4_res_8x16b_r4,
        i4_res_8x16b_r5, i4_res_8x16b_r6, i4_res_8x16b_r7;
    __m128i i4_res_8x16b_r7_temp;
    __m128i i4_c0_c1_16x8b, i4_c2_c3_16x8b;

    i4_coeff_0 = (WORD8) (16 - i4_phase_0);
    i4_coeff_1 = (WORD8) (i4_phase_0);
    i4_coeff_2 = (WORD8) (16 - i4_phase_1);
    i4_coeff_3 = (WORD8) (i4_phase_1);

    i4_c0_c1_16x8b =
        _mm_set_epi8(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                     i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                     i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
    i4_c2_c3_16x8b =
        _mm_set_epi8(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                     i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                     i4_coeff_3, i4_coeff_2);
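
    /* Bilinear chroma filtering: the taps for each phase are (16 - phase,
       phase), so every filtered sample carries a gain of 16 */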

    /* Initializing pointers */
    pu1_inp = pu1_inp_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    i4_filt_stride = 6;
    i4_src_stride = DYADIC_REF_W_C;

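    /* Each pair of adjacent input rows yields two intermediate rows, one per
       phase: _mm_maddubs_epi16 multiplies the byte-interleaved row pair by the
       interleaved (c0, c1) or (c2, c3) taps and adds adjacent products */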
    i4_samp_16x8b_0 = _mm_loadu_si128((__m128i *) (pu1_inp));
    i4_samp_16x8b_1 = _mm_loadu_si128((__m128i *) (pu1_inp + i4_src_stride));
    i4_samp_16x8b_2 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1)));
    i4_samp_16x8b_3 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 1) + i4_src_stride));
    i4_samp_16x8b_4 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 2)));
    i4_samp_16x8b_5 = _mm_loadu_si128((__m128i *) (pu1_inp + (i4_src_stride << 2) + i4_src_stride));

    i4_samp_16x8b_0 = _mm_unpacklo_epi8(i4_samp_16x8b_0, i4_samp_16x8b_1);
    i4_res_8x16b_r0 = _mm_maddubs_epi16(i4_samp_16x8b_0, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp), i4_res_8x16b_r0);

    i4_samp_16x8b_1 = _mm_unpacklo_epi8(i4_samp_16x8b_1, i4_samp_16x8b_2);
    i4_res_8x16b_r1 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + i4_filt_stride), i4_res_8x16b_r1);

    i4_res_8x16b_r2 = _mm_maddubs_epi16(i4_samp_16x8b_1, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1)), i4_res_8x16b_r2);

    i4_samp_16x8b_2 = _mm_unpacklo_epi8(i4_samp_16x8b_2, i4_samp_16x8b_3);
    i4_res_8x16b_r3 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 1) + i4_filt_stride),
                     i4_res_8x16b_r3);

    i4_res_8x16b_r4 = _mm_maddubs_epi16(i4_samp_16x8b_2, i4_c0_c1_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2)), i4_res_8x16b_r4);

    i4_samp_16x8b_3 = _mm_unpacklo_epi8(i4_samp_16x8b_3, i4_samp_16x8b_4);
    i4_res_8x16b_r5 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c2_c3_16x8b);
    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + i4_filt_stride),
                     i4_res_8x16b_r5);

    i4_res_8x16b_r6 = _mm_maddubs_epi16(i4_samp_16x8b_3, i4_c0_c1_16x8b);
    _mm_storel_epi64((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1)),
                     i4_res_8x16b_r6);

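    /* The last two intermediate rows share one 128-bit store so that a full
       store of row 7 cannot run past the 8x6 temp buffer: shuffle 0x4E (78)
       swaps the 64-bit halves of row 6, shuffle 0x93 (147) rotates row 7 left
       by 32 bits, and blend mask 252 keeps words 0-1 from row 6 and words 2-7
       from row 7; the merged vector then fills pi2_tmp[40..47] */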
    i4_res_8x16b_r6 = _mm_shuffle_epi32(i4_res_8x16b_r6, 78);

    i4_samp_16x8b_4 = _mm_unpacklo_epi8(i4_samp_16x8b_4, i4_samp_16x8b_5);

    i4_res_8x16b_r7 = _mm_maddubs_epi16(i4_samp_16x8b_4, i4_c2_c3_16x8b);

    i4_res_8x16b_r7 = _mm_shuffle_epi32(i4_res_8x16b_r7, 147);

    i4_res_8x16b_r7_temp = _mm_blend_epi16(i4_res_8x16b_r6, i4_res_8x16b_r7, 252);

    _mm_storeu_si128((__m128i *) (pi2_tmp + (i4_filt_stride << 2) + (i4_filt_stride << 1) + 4),
                     i4_res_8x16b_r7_temp);
}

void isvc_horz_interpol_chroma_dyadic_sse42(WORD16 *pi2_tmp_filt_buf, UWORD8 *pu1_out_buf,
                                            WORD32 i4_out_stride, WORD32 i4_phase_0,
                                            WORD32 i4_phase_1)
{
    WORD32 i4_dst_stride, i4_dst_stride2, i4_dst_stride4;
    UWORD8 *pu1_out;
    WORD16 *pi2_tmp;

    __m128i i4_samp_8x16b_r1_0, i4_samp_8x16b_r1_1, i4_samp_8x16b_r1_2;
    __m128i i4_samp_8x16b_r2_0, i4_samp_8x16b_r2_1, i4_samp_8x16b_r2_2;
    __m128i i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1, i4_samp_8x16b_r3_2;
    __m128i i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1, i4_samp_8x16b_r4_2;
    __m128i i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1, i4_samp_8x16b_r5_2;
    __m128i i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1, i4_samp_8x16b_r6_2;
    __m128i i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1, i4_samp_8x16b_r7_2;
    __m128i i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1, i4_samp_8x16b_r8_2;

    __m128i i4_res_4x32b_r1_0, i4_res_4x32b_r1_1;
    __m128i i4_res_4x32b_r2_0, i4_res_4x32b_r2_1;
    __m128i i4_res_4x32b_r3_0, i4_res_4x32b_r3_1;
    __m128i i4_res_4x32b_r4_0, i4_res_4x32b_r4_1;
    __m128i i4_res_4x32b_r5_0, i4_res_4x32b_r5_1;
    __m128i i4_res_4x32b_r6_0, i4_res_4x32b_r6_1;
    __m128i i4_res_4x32b_r7_0, i4_res_4x32b_r7_1;
    __m128i i4_res_4x32b_r8_0, i4_res_4x32b_r8_1;

    __m128i i4_res_final_8x16b_r1, i4_res_final_8x16b_r2, i4_res_final_8x16b_r3,
        i4_res_final_8x16b_r4, i4_res_final_8x16b_r5, i4_res_final_8x16b_r6, i4_res_final_8x16b_r7,
        i4_res_final_8x16b_r8;

    __m128i out_16x8b_r1, out_16x8b_r2, out_16x8b_r3, out_16x8b_r4, out_16x8b_r5, out_16x8b_r6,
        out_16x8b_r7, out_16x8b_r8;

    __m128i i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1;
    __m128i i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1;
    __m128i i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1;
    __m128i i4_res_final_8x16b_r78_0, i4_res_final_8x16b_r78_1;
    __m128i chroma_mask, chroma_mask2;

    WORD32 i4_coeff_0 = 16 - i4_phase_0;
    WORD32 i4_coeff_1 = i4_phase_0;
    WORD32 i4_coeff_2 = 16 - i4_phase_1;
    WORD32 i4_coeff_3 = i4_phase_1;
    __m128i coeff_c0_c1_8x16b = _mm_set_epi16(i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0,
                                              i4_coeff_1, i4_coeff_0, i4_coeff_1, i4_coeff_0);
    __m128i coeff_c2_c3_8x16b = _mm_set_epi16(i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2,
                                              i4_coeff_3, i4_coeff_2, i4_coeff_3, i4_coeff_2);
    __m128i res_128 = _mm_set1_epi32(128);
    UWORD32 u4_norm_factor = 8;
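    /* After the vertical and horizontal bilinear passes the combined gain is
       16 * 16 = 256, so results are rounded with 128 and shifted right by
       u4_norm_factor = 8 */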

    /* Initializing pointers */
    pu1_out = pu1_out_buf;
    pi2_tmp = pi2_tmp_filt_buf;
    i4_dst_stride = i4_out_stride;

    i4_dst_stride2 = i4_dst_stride << 1;
    i4_dst_stride4 = i4_dst_stride << 2;

    /* Horizontal interpolation */
    i4_samp_8x16b_r1_0 = _mm_loadu_si128((__m128i *) pi2_tmp);
    i4_samp_8x16b_r2_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 6));
    i4_samp_8x16b_r3_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 12));
    i4_samp_8x16b_r4_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 18));
    i4_samp_8x16b_r5_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 24));
    i4_samp_8x16b_r6_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 30));
    i4_samp_8x16b_r7_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 36));
    i4_samp_8x16b_r8_0 = _mm_loadu_si128((__m128i *) (pi2_tmp + 42));

    i4_samp_8x16b_r1_1 = _mm_srli_si128(i4_samp_8x16b_r1_0, 2);
    i4_samp_8x16b_r1_2 = _mm_srli_si128(i4_samp_8x16b_r1_0, 4);

    i4_samp_8x16b_r2_1 = _mm_srli_si128(i4_samp_8x16b_r2_0, 2);
    i4_samp_8x16b_r2_2 = _mm_srli_si128(i4_samp_8x16b_r2_0, 4);

    i4_samp_8x16b_r3_1 = _mm_srli_si128(i4_samp_8x16b_r3_0, 2);
    i4_samp_8x16b_r3_2 = _mm_srli_si128(i4_samp_8x16b_r3_0, 4);

    i4_samp_8x16b_r4_1 = _mm_srli_si128(i4_samp_8x16b_r4_0, 2);
    i4_samp_8x16b_r4_2 = _mm_srli_si128(i4_samp_8x16b_r4_0, 4);

    i4_samp_8x16b_r5_1 = _mm_srli_si128(i4_samp_8x16b_r5_0, 2);
    i4_samp_8x16b_r5_2 = _mm_srli_si128(i4_samp_8x16b_r5_0, 4);

    i4_samp_8x16b_r6_1 = _mm_srli_si128(i4_samp_8x16b_r6_0, 2);
    i4_samp_8x16b_r6_2 = _mm_srli_si128(i4_samp_8x16b_r6_0, 4);

    i4_samp_8x16b_r7_1 = _mm_srli_si128(i4_samp_8x16b_r7_0, 2);
    i4_samp_8x16b_r7_2 = _mm_srli_si128(i4_samp_8x16b_r7_0, 4);

    i4_samp_8x16b_r8_1 = _mm_srli_si128(i4_samp_8x16b_r8_0, 2);
    i4_samp_8x16b_r8_2 = _mm_srli_si128(i4_samp_8x16b_r8_0, 4);

    i4_samp_8x16b_r1_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_0, i4_samp_8x16b_r1_1);
    i4_samp_8x16b_r2_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_0, i4_samp_8x16b_r2_1);
    i4_samp_8x16b_r3_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_0, i4_samp_8x16b_r3_1);
    i4_samp_8x16b_r4_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_0, i4_samp_8x16b_r4_1);
    i4_samp_8x16b_r5_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_0, i4_samp_8x16b_r5_1);
    i4_samp_8x16b_r6_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_0, i4_samp_8x16b_r6_1);
    i4_samp_8x16b_r7_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_0, i4_samp_8x16b_r7_1);
    i4_samp_8x16b_r8_0 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_0, i4_samp_8x16b_r8_1);

    i4_samp_8x16b_r1_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r1_1, i4_samp_8x16b_r1_2);
    i4_samp_8x16b_r2_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r2_1, i4_samp_8x16b_r2_2);
    i4_samp_8x16b_r3_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r3_1, i4_samp_8x16b_r3_2);
    i4_samp_8x16b_r4_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r4_1, i4_samp_8x16b_r4_2);
    i4_samp_8x16b_r5_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r5_1, i4_samp_8x16b_r5_2);
    i4_samp_8x16b_r6_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r6_1, i4_samp_8x16b_r6_2);
    i4_samp_8x16b_r7_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r7_1, i4_samp_8x16b_r7_2);
    i4_samp_8x16b_r8_1 = _mm_unpacklo_epi16(i4_samp_8x16b_r8_1, i4_samp_8x16b_r8_2);

    /* a0*c0 + a1*c1, a1*c0 + a2*c1, a2*c0 + a3*c1, a3*c0 + a4*c1 */
    i4_res_4x32b_r1_0 = _mm_madd_epi16(i4_samp_8x16b_r1_0, coeff_c0_c1_8x16b);
    /* b0*c0 + b1*c1, b1*c0 + b2*c1, b2*c0 + b3*c1, b3*c0 + b4*c1 */
    i4_res_4x32b_r2_0 = _mm_madd_epi16(i4_samp_8x16b_r2_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r3_0 = _mm_madd_epi16(i4_samp_8x16b_r3_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r4_0 = _mm_madd_epi16(i4_samp_8x16b_r4_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r5_0 = _mm_madd_epi16(i4_samp_8x16b_r5_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r6_0 = _mm_madd_epi16(i4_samp_8x16b_r6_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r7_0 = _mm_madd_epi16(i4_samp_8x16b_r7_0, coeff_c0_c1_8x16b);
    i4_res_4x32b_r8_0 = _mm_madd_epi16(i4_samp_8x16b_r8_0, coeff_c0_c1_8x16b);

    /* a1*c2 + a2*c3, a2*c2 + a3*c3, a3*c2 + a4*c3, a4*c2 + a5*c3 */
    i4_res_4x32b_r1_1 = _mm_madd_epi16(i4_samp_8x16b_r1_1, coeff_c2_c3_8x16b);
    /* b1*c2 + b2*c3, b2*c2 + b3*c3, b3*c2 + b4*c3, b4*c2 + b5*c3 */
    i4_res_4x32b_r2_1 = _mm_madd_epi16(i4_samp_8x16b_r2_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r3_1 = _mm_madd_epi16(i4_samp_8x16b_r3_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r4_1 = _mm_madd_epi16(i4_samp_8x16b_r4_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r5_1 = _mm_madd_epi16(i4_samp_8x16b_r5_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r6_1 = _mm_madd_epi16(i4_samp_8x16b_r6_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r7_1 = _mm_madd_epi16(i4_samp_8x16b_r7_1, coeff_c2_c3_8x16b);
    i4_res_4x32b_r8_1 = _mm_madd_epi16(i4_samp_8x16b_r8_1, coeff_c2_c3_8x16b);

    i4_res_4x32b_r1_0 = _mm_add_epi32(i4_res_4x32b_r1_0, res_128);
    i4_res_4x32b_r2_0 = _mm_add_epi32(i4_res_4x32b_r2_0, res_128);
    i4_res_4x32b_r3_0 = _mm_add_epi32(i4_res_4x32b_r3_0, res_128);
    i4_res_4x32b_r4_0 = _mm_add_epi32(i4_res_4x32b_r4_0, res_128);
    i4_res_4x32b_r5_0 = _mm_add_epi32(i4_res_4x32b_r5_0, res_128);
    i4_res_4x32b_r6_0 = _mm_add_epi32(i4_res_4x32b_r6_0, res_128);
    i4_res_4x32b_r7_0 = _mm_add_epi32(i4_res_4x32b_r7_0, res_128);
    i4_res_4x32b_r8_0 = _mm_add_epi32(i4_res_4x32b_r8_0, res_128);

    i4_res_4x32b_r1_1 = _mm_add_epi32(i4_res_4x32b_r1_1, res_128);
    i4_res_4x32b_r2_1 = _mm_add_epi32(i4_res_4x32b_r2_1, res_128);
    i4_res_4x32b_r3_1 = _mm_add_epi32(i4_res_4x32b_r3_1, res_128);
    i4_res_4x32b_r4_1 = _mm_add_epi32(i4_res_4x32b_r4_1, res_128);
    i4_res_4x32b_r5_1 = _mm_add_epi32(i4_res_4x32b_r5_1, res_128);
    i4_res_4x32b_r6_1 = _mm_add_epi32(i4_res_4x32b_r6_1, res_128);
    i4_res_4x32b_r7_1 = _mm_add_epi32(i4_res_4x32b_r7_1, res_128);
    i4_res_4x32b_r8_1 = _mm_add_epi32(i4_res_4x32b_r8_1, res_128);

    i4_res_4x32b_r1_0 = _mm_srai_epi32(i4_res_4x32b_r1_0, u4_norm_factor);
    i4_res_4x32b_r2_0 = _mm_srai_epi32(i4_res_4x32b_r2_0, u4_norm_factor);
    i4_res_4x32b_r3_0 = _mm_srai_epi32(i4_res_4x32b_r3_0, u4_norm_factor);
    i4_res_4x32b_r4_0 = _mm_srai_epi32(i4_res_4x32b_r4_0, u4_norm_factor);
    i4_res_4x32b_r5_0 = _mm_srai_epi32(i4_res_4x32b_r5_0, u4_norm_factor);
    i4_res_4x32b_r6_0 = _mm_srai_epi32(i4_res_4x32b_r6_0, u4_norm_factor);
    i4_res_4x32b_r7_0 = _mm_srai_epi32(i4_res_4x32b_r7_0, u4_norm_factor);
    i4_res_4x32b_r8_0 = _mm_srai_epi32(i4_res_4x32b_r8_0, u4_norm_factor);

    i4_res_4x32b_r1_1 = _mm_srai_epi32(i4_res_4x32b_r1_1, u4_norm_factor);
    i4_res_4x32b_r2_1 = _mm_srai_epi32(i4_res_4x32b_r2_1, u4_norm_factor);
    i4_res_4x32b_r3_1 = _mm_srai_epi32(i4_res_4x32b_r3_1, u4_norm_factor);
    i4_res_4x32b_r4_1 = _mm_srai_epi32(i4_res_4x32b_r4_1, u4_norm_factor);
    i4_res_4x32b_r5_1 = _mm_srai_epi32(i4_res_4x32b_r5_1, u4_norm_factor);
    i4_res_4x32b_r6_1 = _mm_srai_epi32(i4_res_4x32b_r6_1, u4_norm_factor);
    i4_res_4x32b_r7_1 = _mm_srai_epi32(i4_res_4x32b_r7_1, u4_norm_factor);
    i4_res_4x32b_r8_1 = _mm_srai_epi32(i4_res_4x32b_r8_1, u4_norm_factor);

    i4_res_final_8x16b_r12_0 = _mm_packs_epi32(i4_res_4x32b_r1_0, i4_res_4x32b_r2_0);
    i4_res_final_8x16b_r34_0 = _mm_packs_epi32(i4_res_4x32b_r3_0, i4_res_4x32b_r4_0);
    i4_res_final_8x16b_r56_0 = _mm_packs_epi32(i4_res_4x32b_r5_0, i4_res_4x32b_r6_0);
    i4_res_final_8x16b_r78_0 = _mm_packs_epi32(i4_res_4x32b_r7_0, i4_res_4x32b_r8_0);

    i4_res_final_8x16b_r12_1 = _mm_packs_epi32(i4_res_4x32b_r1_1, i4_res_4x32b_r2_1);
    i4_res_final_8x16b_r34_1 = _mm_packs_epi32(i4_res_4x32b_r3_1, i4_res_4x32b_r4_1);
    i4_res_final_8x16b_r56_1 = _mm_packs_epi32(i4_res_4x32b_r5_1, i4_res_4x32b_r6_1);
    i4_res_final_8x16b_r78_1 = _mm_packs_epi32(i4_res_4x32b_r7_1, i4_res_4x32b_r8_1);

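    /* Interleave the even-phase (_0) and odd-phase (_1) results back into
       horizontal pixel order for each of the eight rows */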
    i4_res_final_8x16b_r1 = _mm_unpacklo_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
    i4_res_final_8x16b_r2 = _mm_unpackhi_epi16(i4_res_final_8x16b_r12_0, i4_res_final_8x16b_r12_1);
    i4_res_final_8x16b_r3 = _mm_unpacklo_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
    i4_res_final_8x16b_r4 = _mm_unpackhi_epi16(i4_res_final_8x16b_r34_0, i4_res_final_8x16b_r34_1);
    i4_res_final_8x16b_r5 = _mm_unpacklo_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
    i4_res_final_8x16b_r6 = _mm_unpackhi_epi16(i4_res_final_8x16b_r56_0, i4_res_final_8x16b_r56_1);
    i4_res_final_8x16b_r7 = _mm_unpacklo_epi16(i4_res_final_8x16b_r78_0, i4_res_final_8x16b_r78_1);
    i4_res_final_8x16b_r8 = _mm_unpackhi_epi16(i4_res_final_8x16b_r78_0, i4_res_final_8x16b_r78_1);

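    /* The destination appears to hold the two chroma components interleaved,
       so only alternate bytes belong to this component: chroma_mask keeps the
       other component's bytes from the destination, chroma_mask2 keeps the low
       byte of each 16-bit result, and the byte-wise add merges the two */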
    chroma_mask = _mm_set1_epi16(0xFF00);
    chroma_mask2 = _mm_set1_epi16(0x00FF);
    out_16x8b_r1 = _mm_loadu_si128((__m128i *) (&pu1_out[0]));
    out_16x8b_r2 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride]));
    out_16x8b_r3 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2]));
    out_16x8b_r4 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride2 + i4_dst_stride]));
    out_16x8b_r5 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4]));
    out_16x8b_r6 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride]));
    out_16x8b_r7 = _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2]));
    out_16x8b_r8 =
        _mm_loadu_si128((__m128i *) (&pu1_out[i4_dst_stride4 + i4_dst_stride2 + i4_dst_stride]));

    out_16x8b_r1 = _mm_and_si128(out_16x8b_r1, chroma_mask);
    out_16x8b_r2 = _mm_and_si128(out_16x8b_r2, chroma_mask);
    out_16x8b_r3 = _mm_and_si128(out_16x8b_r3, chroma_mask);
    out_16x8b_r4 = _mm_and_si128(out_16x8b_r4, chroma_mask);
    out_16x8b_r5 = _mm_and_si128(out_16x8b_r5, chroma_mask);
    out_16x8b_r6 = _mm_and_si128(out_16x8b_r6, chroma_mask);
    out_16x8b_r7 = _mm_and_si128(out_16x8b_r7, chroma_mask);
    out_16x8b_r8 = _mm_and_si128(out_16x8b_r8, chroma_mask);

    i4_res_final_8x16b_r1 = _mm_and_si128(i4_res_final_8x16b_r1, chroma_mask2);
    i4_res_final_8x16b_r2 = _mm_and_si128(i4_res_final_8x16b_r2, chroma_mask2);
    i4_res_final_8x16b_r3 = _mm_and_si128(i4_res_final_8x16b_r3, chroma_mask2);
    i4_res_final_8x16b_r4 = _mm_and_si128(i4_res_final_8x16b_r4, chroma_mask2);
    i4_res_final_8x16b_r5 = _mm_and_si128(i4_res_final_8x16b_r5, chroma_mask2);
    i4_res_final_8x16b_r6 = _mm_and_si128(i4_res_final_8x16b_r6, chroma_mask2);
    i4_res_final_8x16b_r7 = _mm_and_si128(i4_res_final_8x16b_r7, chroma_mask2);
    i4_res_final_8x16b_r8 = _mm_and_si128(i4_res_final_8x16b_r8, chroma_mask2);

    out_16x8b_r1 = _mm_add_epi8(i4_res_final_8x16b_r1, out_16x8b_r1);
    out_16x8b_r2 = _mm_add_epi8(i4_res_final_8x16b_r2, out_16x8b_r2);
    out_16x8b_r3 = _mm_add_epi8(i4_res_final_8x16b_r3, out_16x8b_r3);
    out_16x8b_r4 = _mm_add_epi8(i4_res_final_8x16b_r4, out_16x8b_r4);
    out_16x8b_r5 = _mm_add_epi8(i4_res_final_8x16b_r5, out_16x8b_r5);
    out_16x8b_r6 = _mm_add_epi8(i4_res_final_8x16b_r6, out_16x8b_r6);
    out_16x8b_r7 = _mm_add_epi8(i4_res_final_8x16b_r7, out_16x8b_r7);
    out_16x8b_r8 = _mm_add_epi8(i4_res_final_8x16b_r8, out_16x8b_r8);

    _mm_storeu_si128((__m128i *) pu1_out, out_16x8b_r1);
    _mm_storeu_si128((__m128i *) (pu1_out + i4_dst_stride), out_16x8b_r2);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 2)), out_16x8b_r3);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 3)), out_16x8b_r4);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 4)), out_16x8b_r5);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 5)), out_16x8b_r6);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 6)), out_16x8b_r7);
    _mm_storeu_si128((__m128i *) (pu1_out + (i4_dst_stride * 7)), out_16x8b_r8);
}