/******************************************************************************
 *
 * Copyright (C) 2022 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 *****************************************************************************
 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
 */

/**
 ******************************************************************************
 * @file isvce_downscaler_sse42.c
 *
 * @brief
 *  This file contains the x86 SIMD version of the function which does
 *  horizontal scaling and transpose
 *
 * @author
 *  Ittiam
 *
 * @par List of Functions:
 *  - isvce_horizontal_downscale_and_transpose_sse42()
 *
 * @remarks
 *  None
 *
 *******************************************************************************
 */

/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/

/* System include files */
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>

/* User include files */
#include "ih264_typedefs.h"
#include "isvc_macros.h"
#include "ih264_platform_macros.h"
#include "isvc_defs.h"
#include "isvce_defs.h"
#include "isvc_structs.h"
#include "isvce_downscaler_private_defs.h"

/*****************************************************************************/
/* Function Definitions                                                      */
/*****************************************************************************/

/**
 *******************************************************************************
 *
 * @brief
 *  horizontal scaler function
 *
 * @par Description:
 *  Does horizontal scaling for the given block
 *
 * @param[in] ps_scaler
 *  pointer to downscaler context
 *
 * @param[in] ps_src
 *  pointer to source buffer container
 *
 * @param[in] ps_dst
 *  pointer to destination buffer container
 *
 * @param[in] pai1_filters
 *  pointer to array of downscaler filters
 *
 * @param[in] u4_blk_wd
 *  width of the block after horizontal scaling (output block width)
 *
 * @param[in] u4_blk_ht
 *  height of the current block (input block height)
 *
 * @param[in] u1_is_chroma
 *  flag suggesting whether the buffer is luma or chroma
 *
 *
 * @returns
 *
 * @remarks
 *  The same function is used for vertical scaling too, as the
 *  horizontally scaled output is stored in a transposed fashion.
 *
 *******************************************************************************
 */
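
/*
 * Illustrative usage sketch (not part of the original source; every identifier
 * other than the function itself is a placeholder): the caller is expected to
 * have set up the downscaler context, buffer containers and filter table, then
 * invoke the routine once per plane, e.g.
 *
 *   isvce_horizontal_downscale_and_transpose_sse42(ps_scaler, ps_src_y, ps_dst_y,
 *                                                  pai1_filters, u4_out_wd, u4_in_ht, 0);
 *   isvce_horizontal_downscale_and_transpose_sse42(ps_scaler, ps_src_uv, ps_dst_uv,
 *                                                  pai1_filters, u4_out_wd_c, u4_in_ht_c, 1);
 */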

void isvce_horizontal_downscale_and_transpose_sse42(
    downscaler_ctxt_t *ps_scaler, buffer_container_t *ps_src, buffer_container_t *ps_dst,
    FILTER_COEFF_ARRAY pai1_filters, UWORD32 u4_blk_wd, UWORD32 u4_blk_ht, UWORD8 u1_is_chroma)
{
    WORD32 i, j;
    UWORD8 u1_phase;
    UWORD8 *pu1_src_j, *pu1_dst_j;
    WORD32 i4_temp_pixel_holder;
    UWORD32 u4_num_iterations_vertical_by_16;
    UWORD32 u4_rem_vert_loop;
    UWORD8 *pu1_in_pixel;
    UWORD8 *pu1_out_pixel;
    WORD8 *pi1_filter_for_grid;
    UWORD16 u2_full_pixel_inc;

    __m128i src_temp_0, src_temp_1, src_temp_2, src_temp_3, src_temp_4, src_temp_5, src_temp_6,
        src_temp_7;

    __m128i reg_all_1s, reg_64val_32bit, reg_all_0s, filt_coeff_grid, reg_shuffle;

    __m128i reg_01_16x8b, reg_02_16x8b, reg_03_16x8b, reg_04_16x8b, reg_05_16x8b;

    downscaler_state_t *ps_scaler_state = (downscaler_state_t *) ps_scaler->pv_scaler_state;

    UWORD32 u4_center_pixel_pos = ps_scaler_state->i4_init_offset;
    UWORD32 u4_src_vert_increments = ps_scaler_state->u4_vert_increment;
    UWORD32 u4_src_horz_increments = ps_scaler_state->u4_horz_increment;

    UWORD8 *pu1_src = ps_src->pv_data;
    UWORD32 u4_in_stride = ps_src->i4_data_stride;
    UWORD8 *pu1_dst = ps_dst->pv_data;
    UWORD32 u4_out_stride = ps_dst->i4_data_stride;
    UWORD32 u4_center_pixel_pos_src = u4_center_pixel_pos;

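    /* This SIMD path assumes the vertical increment is exactly one full pixel in
     * Q-format, i.e. this pass does no vertical resampling; vertical scaling is
     * done by running the same function again on the transposed output */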
    ASSERT((1 << DOWNSCALER_Q) == u4_src_vert_increments);

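    /* Constant registers:
     *  reg_all_1s      - 16-bit ones; used with _mm_madd_epi16 to sum adjacent 16-bit sums
     *  reg_64val_32bit - rounding offset for the Q7 filter (add 64, then shift right by 7)
     *  reg_shuffle     - de-interleaves chroma bytes: even bytes (Cb) to the lower half,
     *                    odd bytes (Cr) to the upper half */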
    reg_all_1s = _mm_set1_epi16((short) 1);
    reg_64val_32bit = _mm_set1_epi32((int) 64);
    reg_all_0s = _mm_setzero_si128();
    reg_shuffle = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);

    u4_num_iterations_vertical_by_16 = u4_blk_ht >> 4;
    u4_rem_vert_loop = u4_blk_ht % 16;
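
    /* The block is processed 16 input rows at a time; any leftover rows are
     * handled by the remainder loops further below */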

    /* Offset the input so that the input pixel to be processed
       coincides with the centre of the filter (4th coefficient) */
    pu1_src += (1 + u1_is_chroma);

    if(!u1_is_chroma)
    {
        for(j = 0; j < (WORD32) u4_num_iterations_vertical_by_16; j++)
        {
            pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
            pu1_dst_j = pu1_dst + (j << 4);

            u4_center_pixel_pos = u4_center_pixel_pos_src;

            for(i = 0; i < (WORD32) u4_blk_wd; i++)
            {
                u1_phase = get_filter_phase(u4_center_pixel_pos);
                pi1_filter_for_grid = pai1_filters[u1_phase];

                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);

                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid);
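                /* The 16-byte filter grid holds the 8 taps of the current phase duplicated
                 * in both halves, so a single maddubs filters two rows (luma) or the Cb and
                 * Cr halves (chroma) at once */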
                /******************************************************/
                /* This loop is going vertically in bottom direction  */
                /* but the output pixels are stored in horizontal     */
                /* direction in transpose manner                      */
                /******************************************************/

                /* For rows 0,1 */
                src_temp_0 = _mm_loadl_epi64((__m128i *) pu1_in_pixel);
                src_temp_1 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride));
                /* move the 8 pixels of row 1 into the upper 64 bits of src_temp_0 */
                src_temp_0 = _mm_unpacklo_epi64(src_temp_0, src_temp_1);

                /* For rows 2,3 */
                src_temp_2 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 2));

                src_temp_3 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 3));

                src_temp_2 = _mm_unpacklo_epi64(src_temp_2, src_temp_3);

                reg_01_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);

                /* multiply with filter coeffs to get 16 bit results */
                reg_02_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid);

                reg_01_16x8b = _mm_hadd_epi16(reg_01_16x8b, reg_02_16x8b);
                /* add adjacent 16 bit values to get 32 bit values */
                reg_01_16x8b = _mm_madd_epi16(reg_01_16x8b, reg_all_1s);

                /* Add offset of 64 for rounding each out pixel value */
                reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit);
                /* Divide each out pixel value by 128 */
                reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, 7);

                /* For rows 4,5 */
                src_temp_4 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 4));

                src_temp_5 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 5));

                src_temp_4 = _mm_unpacklo_epi64(src_temp_4, src_temp_5);

                /* For rows 6,7 */
                src_temp_6 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 6));

                src_temp_7 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 7));

                src_temp_6 = _mm_unpacklo_epi64(src_temp_6, src_temp_7);

                reg_03_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid);

                reg_04_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid);

                reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_04_16x8b);

                reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s);

                /* next add 2 adjacent 32 bit values to get a single 32 bit
                ** value in each row
                */

                /* Add offset of 64 for rounding each out pixel value */
                reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit);
                /* Divide each out pixel value by 128 */
                reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, 7);

                /* pack the lower 16 bit values corresponding to the 8 output
                   pixels from reg_01 and reg_03 */
                reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_03_16x8b);

                /* For rows 8,9 */
                src_temp_0 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + 8 * u4_in_stride));

                src_temp_1 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + 9 * u4_in_stride));

                /* move the 8 pixels of row 9 into the upper 64 bits of src_temp_0 */
                src_temp_0 = _mm_unpacklo_epi64(src_temp_0, src_temp_1);

                /* For rows 10,11 */
                src_temp_2 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 10));

                src_temp_3 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 11));

                src_temp_2 = _mm_unpacklo_epi64(src_temp_2, src_temp_3);

                reg_02_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);

                /* multiply with filter coeffs to get 16 bit results */
                reg_03_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid);

                reg_02_16x8b = _mm_hadd_epi16(reg_02_16x8b, reg_03_16x8b);
                /* add adjacent 16 bit values to get 32 bit values */
                reg_02_16x8b = _mm_madd_epi16(reg_02_16x8b, reg_all_1s);

                /* next add 2 adjacent 32 bit values to get a single
                   32 bit value in each row */

                /* Add offset of 64 for rounding each out pixel value */
                reg_02_16x8b = _mm_add_epi32(reg_02_16x8b, reg_64val_32bit);
                /* Divide each out pixel value by 128 */
                reg_02_16x8b = _mm_srli_epi32(reg_02_16x8b, 7);

                /* For rows 12,13 */
                src_temp_4 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 12));

                src_temp_5 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 13));

                src_temp_4 = _mm_unpacklo_epi64(src_temp_4, src_temp_5);

                /* For rows 14,15 */
                src_temp_6 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 14));

                src_temp_7 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 15));

                src_temp_6 = _mm_unpacklo_epi64(src_temp_6, src_temp_7);

                reg_04_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid);

                reg_05_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid);

                reg_04_16x8b = _mm_hadd_epi16(reg_04_16x8b, reg_05_16x8b);
                /* add adjacent 16 bit values to get 32 bit values */
                reg_04_16x8b = _mm_madd_epi16(reg_04_16x8b, reg_all_1s);

                /* next add 2 adjacent 32 bit values to get a single
                   32 bit value in each row */

                /* Add offset of 64 for rounding each out pixel value */
                reg_04_16x8b = _mm_add_epi32(reg_04_16x8b, reg_64val_32bit);
                /* Divide each out pixel value by 128 */
                reg_04_16x8b = _mm_srli_epi32(reg_04_16x8b, 7);

                /* pack the lower 16 bit values corresponding to the 8 output
                   pixels from reg_02 and reg_04 */
                reg_02_16x8b = _mm_packus_epi32(reg_02_16x8b, reg_04_16x8b);

                /* next get saturated 8 bit output pixel values for rows 0-15 */
                reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_02_16x8b);

                /* Store the 16 output values */
                _mm_storeu_si128((__m128i *) pu1_out_pixel, reg_01_16x8b);

                pu1_out_pixel += 16;

                pu1_in_pixel += ((u4_src_vert_increments * (u4_in_stride << 4)) >> DOWNSCALER_Q);

                /* Update the context for next Loop Count */
                u4_center_pixel_pos += u4_src_horz_increments;
            }
        }

        /* If the height is not a multiple of 16, process the remaining
           rows one at a time */
        if(u4_rem_vert_loop)
        {
            pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
            pu1_dst_j = pu1_dst + (j << 4);

            u4_center_pixel_pos = u4_center_pixel_pos_src;

            for(i = 0; i < (WORD32) u4_blk_wd; i++)
            {
                u1_phase = get_filter_phase(u4_center_pixel_pos);
                pi1_filter_for_grid = pai1_filters[u1_phase];

                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);

                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid);

                for(j = u4_rem_vert_loop; j > 0; j--)
                {
                    src_temp_0 = _mm_loadl_epi64((__m128i const *) pu1_in_pixel);

                    src_temp_0 = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);

                    src_temp_0 = _mm_madd_epi16(src_temp_0, reg_all_1s);

                    reg_01_16x8b = _mm_hadd_epi32(src_temp_0, reg_all_0s);

                    /* Add offset of 64 for rounding each out pixel value */
                    reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit);
                    /* Divide each out pixel value by 128 */
                    reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, (int) 7);

                    reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_all_0s);

                    /* next get the saturated 8 bit output pixel value */
                    reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_all_0s);

                    /* Store the single output value */
                    *pu1_out_pixel = (UWORD8) _mm_cvtsi128_si32(reg_01_16x8b);

                    pu1_in_pixel += (u4_src_vert_increments * u4_in_stride) >> DOWNSCALER_Q;

                    pu1_out_pixel++;
                }
                /* Update the context for next Loop Count */
                u4_center_pixel_pos += u4_src_horz_increments;
            }
        }
    }

    else /* for chroma */
    {
        for(j = 0; j < (WORD32) u4_num_iterations_vertical_by_16; j++)
        {
            pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
            pu1_dst_j = pu1_dst + (j << 4);

            u4_center_pixel_pos = u4_center_pixel_pos_src;

            for(i = 0; i < (WORD32) u4_blk_wd; i++)
            {
                u1_phase = get_filter_phase(u4_center_pixel_pos);
                pi1_filter_for_grid = pai1_filters[u1_phase];

                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);

                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid);
                /******************************************************/
                /* This loop is going vertically in bottom direction  */
                /* but the output pixels are stored in horizontal     */
                /* direction in transpose manner                      */
                /******************************************************/

                /* Load 16 values, shuffle to separate Cb and Cr, and process */

                src_temp_0 = _mm_loadu_si128((__m128i *) pu1_in_pixel);
                src_temp_1 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride));

                src_temp_2 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 2));

                src_temp_3 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 3));

                src_temp_0 = _mm_shuffle_epi8(src_temp_0, reg_shuffle);
                src_temp_1 = _mm_shuffle_epi8(src_temp_1, reg_shuffle);
                src_temp_2 = _mm_shuffle_epi8(src_temp_2, reg_shuffle);
                src_temp_3 = _mm_shuffle_epi8(src_temp_3, reg_shuffle);

                reg_01_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
                reg_02_16x8b = _mm_maddubs_epi16(src_temp_1, filt_coeff_grid);

                reg_01_16x8b = _mm_hadd_epi16(reg_01_16x8b, reg_02_16x8b);

                reg_01_16x8b = _mm_madd_epi16(reg_01_16x8b, reg_all_1s);

                reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit);

                reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, (int) 7);

                reg_03_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid);
                reg_04_16x8b = _mm_maddubs_epi16(src_temp_3, filt_coeff_grid);

                src_temp_4 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 4));

                src_temp_5 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 5));

                src_temp_6 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 6));

                src_temp_7 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 7));

                src_temp_4 = _mm_shuffle_epi8(src_temp_4, reg_shuffle);
                src_temp_5 = _mm_shuffle_epi8(src_temp_5, reg_shuffle);
                src_temp_6 = _mm_shuffle_epi8(src_temp_6, reg_shuffle);
                src_temp_7 = _mm_shuffle_epi8(src_temp_7, reg_shuffle);

                reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_04_16x8b);

                reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s);

                reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit);

                reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, (int) 7);

                reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_03_16x8b);

                reg_02_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid);
                reg_04_16x8b = _mm_maddubs_epi16(src_temp_5, filt_coeff_grid);

                reg_02_16x8b = _mm_hadd_epi16(reg_02_16x8b, reg_04_16x8b);

                reg_02_16x8b = _mm_madd_epi16(reg_02_16x8b, reg_all_1s);

                reg_02_16x8b = _mm_add_epi32(reg_02_16x8b, reg_64val_32bit);

                reg_02_16x8b = _mm_srli_epi32(reg_02_16x8b, (int) 7);

                reg_03_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid);
                reg_04_16x8b = _mm_maddubs_epi16(src_temp_7, filt_coeff_grid);

                reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_04_16x8b);

                reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s);

                reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit);

                reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, (int) 7);

                reg_02_16x8b = _mm_packus_epi32(reg_02_16x8b, reg_03_16x8b);

                reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_02_16x8b);

                reg_01_16x8b = _mm_shuffle_epi8(reg_01_16x8b, reg_shuffle);

                src_temp_0 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + 8 * u4_in_stride));

                src_temp_1 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + 9 * u4_in_stride));

                src_temp_2 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 10));

                src_temp_3 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 11));

                src_temp_0 = _mm_shuffle_epi8(src_temp_0, reg_shuffle);
                src_temp_1 = _mm_shuffle_epi8(src_temp_1, reg_shuffle);
                src_temp_2 = _mm_shuffle_epi8(src_temp_2, reg_shuffle);
                src_temp_3 = _mm_shuffle_epi8(src_temp_3, reg_shuffle);

                reg_02_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
                reg_03_16x8b = _mm_maddubs_epi16(src_temp_1, filt_coeff_grid);

                reg_02_16x8b = _mm_hadd_epi16(reg_02_16x8b, reg_03_16x8b);

                reg_02_16x8b = _mm_madd_epi16(reg_02_16x8b, reg_all_1s);

                reg_02_16x8b = _mm_add_epi32(reg_02_16x8b, reg_64val_32bit);

                reg_02_16x8b = _mm_srli_epi32(reg_02_16x8b, (int) 7);

                reg_04_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid);
                reg_05_16x8b = _mm_maddubs_epi16(src_temp_3, filt_coeff_grid);

                src_temp_4 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 12));

                src_temp_5 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 13));

                src_temp_6 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 14));

                src_temp_7 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 15));

                src_temp_4 = _mm_shuffle_epi8(src_temp_4, reg_shuffle);
                src_temp_5 = _mm_shuffle_epi8(src_temp_5, reg_shuffle);
                src_temp_6 = _mm_shuffle_epi8(src_temp_6, reg_shuffle);
                src_temp_7 = _mm_shuffle_epi8(src_temp_7, reg_shuffle);

                reg_04_16x8b = _mm_hadd_epi16(reg_04_16x8b, reg_05_16x8b);

                reg_04_16x8b = _mm_madd_epi16(reg_04_16x8b, reg_all_1s);

                reg_04_16x8b = _mm_add_epi32(reg_04_16x8b, reg_64val_32bit);

                reg_04_16x8b = _mm_srli_epi32(reg_04_16x8b, (int) 7);

                reg_02_16x8b = _mm_packus_epi32(reg_02_16x8b, reg_04_16x8b);

                reg_03_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid);
                reg_05_16x8b = _mm_maddubs_epi16(src_temp_5, filt_coeff_grid);

                reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_05_16x8b);

                reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s);

                reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit);

                reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, (int) 7);

                reg_04_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid);
                reg_05_16x8b = _mm_maddubs_epi16(src_temp_7, filt_coeff_grid);

                reg_04_16x8b = _mm_hadd_epi16(reg_04_16x8b, reg_05_16x8b);

                reg_04_16x8b = _mm_madd_epi16(reg_04_16x8b, reg_all_1s);

                reg_04_16x8b = _mm_add_epi32(reg_04_16x8b, reg_64val_32bit);

                reg_04_16x8b = _mm_srli_epi32(reg_04_16x8b, (int) 7);

                reg_03_16x8b = _mm_packus_epi32(reg_03_16x8b, reg_04_16x8b);

                reg_02_16x8b = _mm_packus_epi16(reg_02_16x8b, reg_03_16x8b);

                reg_02_16x8b = _mm_shuffle_epi8(reg_02_16x8b, reg_shuffle);

                reg_03_16x8b = _mm_unpacklo_epi64(reg_01_16x8b, reg_02_16x8b);

                reg_04_16x8b = _mm_unpackhi_epi64(reg_01_16x8b, reg_02_16x8b);

                /* Store after shuffling again: the Cb results go to the current
                   output row, the Cr results to the next output row */

                _mm_storeu_si128((__m128i *) pu1_out_pixel, reg_03_16x8b);
                _mm_storeu_si128((__m128i *) (pu1_out_pixel + u4_out_stride), reg_04_16x8b);

                pu1_out_pixel += 16;

                pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride << 4)) >> DOWNSCALER_Q;

                /* Update the context for next Loop Count */
                u4_center_pixel_pos += u4_src_horz_increments;
            }
        }

        /* If the height is not a multiple of 16, process the remaining
           rows two at a time */
        if(u4_rem_vert_loop)
        {
            pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
            pu1_dst_j = pu1_dst + (j << 4);

            u4_center_pixel_pos = u4_center_pixel_pos_src;
            for(i = 0; i < (WORD32) u4_blk_wd; i++)
            {
                UWORD8 u1_phase = get_filter_phase(u4_center_pixel_pos);
                pi1_filter_for_grid = pai1_filters[u1_phase];

                u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;

                pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);

                pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);

                filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid);

                for(j = u4_rem_vert_loop; j > 0; j = j - 2)
                {
                    src_temp_0 = _mm_loadu_si128((__m128i const *) pu1_in_pixel);
                    src_temp_0 = _mm_shuffle_epi8(src_temp_0, reg_shuffle);

                    src_temp_1 = _mm_loadu_si128((__m128i const *) (pu1_in_pixel + u4_in_stride));

                    src_temp_1 = _mm_shuffle_epi8(src_temp_1, reg_shuffle);

                    src_temp_0 = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
                    src_temp_1 = _mm_maddubs_epi16(src_temp_1, filt_coeff_grid);

                    reg_01_16x8b = _mm_hadd_epi16(src_temp_0, src_temp_1);

                    reg_01_16x8b = _mm_madd_epi16(reg_01_16x8b, reg_all_1s);

                    /* Add offset of 64 for rounding each out pixel value */
                    reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit);
                    /* Divide each out pixel value by 128 */
                    reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, (int) 7);

                    reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_all_0s);

                    /* next get saturated 8 bit output pixel values */
                    reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_all_0s);

                    reg_01_16x8b = _mm_shuffle_epi8(reg_01_16x8b, reg_shuffle);

                    reg_02_16x8b = _mm_srli_si128(reg_01_16x8b, (int) 8);

                    /* Store the two Cb output values on this row and the two Cr
                       output values on the next row */
                    i4_temp_pixel_holder = _mm_cvtsi128_si32(reg_01_16x8b);

                    *pu1_out_pixel = (UWORD8) i4_temp_pixel_holder;
                    i4_temp_pixel_holder >>= 8;

                    *(pu1_out_pixel + 1) = (UWORD8) i4_temp_pixel_holder;

                    i4_temp_pixel_holder = _mm_cvtsi128_si32(reg_02_16x8b);

                    *(pu1_out_pixel + u4_out_stride) = (UWORD8) i4_temp_pixel_holder;
                    i4_temp_pixel_holder >>= 8;

                    *(pu1_out_pixel + u4_out_stride + 1) = (UWORD8) i4_temp_pixel_holder;

                    pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride << 1)) >> DOWNSCALER_Q;
                    pu1_out_pixel += 2;
                }
                /* Update the context for next Loop Count */
                u4_center_pixel_pos += u4_src_horz_increments;
            }
        }
    }
}