xref: /aosp_15_r20/external/libavc/encoder/x86/svc/isvce_downscaler_sse42.c (revision 495ae853bb871d1e5a258cb02c2cc13cde8ddb9a)
1 /******************************************************************************
2  *
3  * Copyright (C) 2022 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19  */
20 
21 /**
22 ******************************************************************************
23 * @file isvce_downscaler_sse42.c
24 *
25 * @brief
26 *  This file contains the x86 SIMD version of the function which does
27 *  horizontal scaling and transpose
28 *
29 * @author
30 *  Ittiam
31 *
32 * @par List of Functions:
33 *  - isvce_horizontal_downscale_and_transpose_sse42()
34 *
35 * @remarks
36 *  None
37 *
38 *******************************************************************************
39 */
40 
41 /*****************************************************************************/
42 /* File Includes                                                             */
43 /*****************************************************************************/
44 
45 /* System include files */
46 #include <stdio.h>
47 #include <stdlib.h>
48 #include <immintrin.h>
49 
50 /* User include files */
51 #include "ih264_typedefs.h"
52 #include "isvc_macros.h"
53 #include "ih264_platform_macros.h"
54 #include "isvc_defs.h"
55 #include "isvce_defs.h"
56 #include "isvc_structs.h"
57 #include "isvce_downscaler_private_defs.h"
58 
59 /*****************************************************************************/
60 /* Function Definitions                                                      */
61 /*****************************************************************************/
62 
63 /**
64 *******************************************************************************
65 *
66 * @brief
67 *   horizontal scaler function
68 *
69 * @par Description:
70 *   Does horizontal scaling for the given block
71 *
72 * @param[in] ps_scaler
73 *  pointer to downscaler context
74 *
75 * @param[in] ps_src
76 *  pointer to source buffer container
77 *
78 * @param[in] ps_dst
79 *  pointer to destination buffer container
80 *
81 * @param[in] pai1_filters
82 *  pointer to array of downscaler filters
83 *
84 * @param[in] u4_blk_wd
85 *  width of the block after horizontal scaling (output block width)
86 *
87 * @param[in] u4_blk_ht
88 *  height of the current block (input block height)
89 *
90 * @param[in] u1_is_chroma
91 *  flag suggesting whether the buffer is luma or chroma
92 *
93 *
94 * @returns
95 *
96 * @remarks
97 *  The same function is used for vertical scaling too as
98 *  the horizontally scaled input in stored in transpose fashion.
99 *
100 *******************************************************************************
101 */
102 
isvce_horizontal_downscale_and_transpose_sse42(downscaler_ctxt_t * ps_scaler,buffer_container_t * ps_src,buffer_container_t * ps_dst,FILTER_COEFF_ARRAY pai1_filters,UWORD32 u4_blk_wd,UWORD32 u4_blk_ht,UWORD8 u1_is_chroma)103 void isvce_horizontal_downscale_and_transpose_sse42(
104     downscaler_ctxt_t *ps_scaler, buffer_container_t *ps_src, buffer_container_t *ps_dst,
105     FILTER_COEFF_ARRAY pai1_filters, UWORD32 u4_blk_wd, UWORD32 u4_blk_ht, UWORD8 u1_is_chroma)
106 {
107     WORD32 i, j;
108     UWORD8 u1_phase;
109     UWORD8 *pu1_src_j, *pu1_dst_j;
110     WORD32 i4_temp_pixel_holder;
111     UWORD32 u4_num_iterations_vertical_by_16;
112     UWORD32 u4_rem_vert_loop;
113     UWORD8 *pu1_in_pixel;
114     UWORD8 *pu1_out_pixel;
115     WORD8 *pi1_filter_for_grid;
116     UWORD16 u2_full_pixel_inc;
117 
118     __m128i src_temp_0, src_temp_1, src_temp_2, src_temp_3, src_temp_4, src_temp_5, src_temp_6,
119         src_temp_7;
120 
121     __m128i reg_all_1s, reg_64val_32bit, reg_all_0s, filt_coeff_grid, reg_shuffle;
122 
123     __m128i reg_01_16x8b, reg_02_16x8b, reg_03_16x8b, reg_04_16x8b, reg_05_16x8b;
124 
125     downscaler_state_t *ps_scaler_state = (downscaler_state_t *) ps_scaler->pv_scaler_state;
126 
127     UWORD32 u4_center_pixel_pos = ps_scaler_state->i4_init_offset;
128     UWORD32 u4_src_vert_increments = ps_scaler_state->u4_vert_increment;
129     UWORD32 u4_src_horz_increments = ps_scaler_state->u4_horz_increment;
130 
131     UWORD8 *pu1_src = ps_src->pv_data;
132     UWORD32 u4_in_stride = ps_src->i4_data_stride;
133     UWORD8 *pu1_dst = ps_dst->pv_data;
134     UWORD32 u4_out_stride = ps_dst->i4_data_stride;
135     UWORD32 u4_center_pixel_pos_src = u4_center_pixel_pos;
136 
137     ASSERT((1 << DOWNSCALER_Q) == u4_src_vert_increments);
138 
139     reg_all_1s = _mm_set1_epi16((short) 1);
140     reg_64val_32bit = _mm_set1_epi32((int) 64);
141     reg_all_0s = _mm_setzero_si128();
142     reg_shuffle = _mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
143 
144     u4_num_iterations_vertical_by_16 = u4_blk_ht >> 4;
145     u4_rem_vert_loop = u4_blk_ht % 16;
146 
147     /* Offset the input so that the input pixel to be processed
148        co-incides with the centre of filter (4th coefficient)*/
149     pu1_src += (1 + u1_is_chroma);
150 
151     if(!u1_is_chroma)
152     {
153         for(j = 0; j < (WORD32) u4_num_iterations_vertical_by_16; j++)
154         {
155             pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
156             pu1_dst_j = pu1_dst + (j << 4);
157 
158             u4_center_pixel_pos = u4_center_pixel_pos_src;
159 
160             for(i = 0; i < (WORD32) u4_blk_wd; i++)
161             {
162                 u1_phase = get_filter_phase(u4_center_pixel_pos);
163                 pi1_filter_for_grid = pai1_filters[u1_phase];
164 
165                 u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;
166 
167                 pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
168 
169                 pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);
170 
171                 filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid);
172                 /******************************************************/
173                 /* This loop is going vertically in bottom direction */
174                 /* but the output pixels are stored in horizontal    */
175                 /* direction in transpose manner                     */
176                 /******************************************************/
177 
178                 /*For row 0,1*/
179                 src_temp_0 = _mm_loadl_epi64((__m128i *) pu1_in_pixel);
180                 src_temp_1 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride));
181                 /*next transfer the 8 pixels from temp_2 to temp_1 higher bits 64-127*/
182                 src_temp_0 = _mm_unpacklo_epi64(src_temp_0, src_temp_1);
183 
184                 /*For row 2,3*/
185                 src_temp_2 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 2));
186 
187                 src_temp_3 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 3));
188 
189                 src_temp_2 = _mm_unpacklo_epi64(src_temp_2, src_temp_3);
190 
191                 reg_01_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
192 
193                 /*multiply with filter coeffs to get 16 bit results*/
194                 reg_02_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid);
195 
196                 reg_01_16x8b = _mm_hadd_epi16(reg_01_16x8b, reg_02_16x8b);
197                 /*add adjacent 16 bit values to get 32 bit values*/
198                 reg_01_16x8b = _mm_madd_epi16(reg_01_16x8b, reg_all_1s);
199 
200                 /*Add offset of 64 for rounding each out pixel value*/
201                 reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit);
202                 /*Divide by 128 each out pixel value*/
203                 reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, 7);
204 
205                 /*For row 4,5*/
206                 src_temp_4 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 4));
207 
208                 src_temp_5 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 5));
209 
210                 src_temp_4 = _mm_unpacklo_epi64(src_temp_4, src_temp_5);
211 
212                 /*For row 6,7*/
213                 src_temp_6 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 6));
214 
215                 src_temp_7 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 7));
216 
217                 src_temp_6 = _mm_unpacklo_epi64(src_temp_6, src_temp_7);
218 
219                 reg_03_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid);
220 
221                 reg_04_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid);
222 
223                 reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_04_16x8b);
224 
225                 reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s);
226 
227                 /*next add 2 adjacent 32 bit values to get a single 32 bit
228                 **value in each row
229                 */
230 
231                 /*Add offset of 64 for rounding each out pixel value*/
232                 reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit);
233                 /*Divide by 128 each out pixel value*/
234                 reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, 7);
235 
236                 /*pack the lower 16 bit values corresponding to the 8 output
237                 pixels from reg1 and reg 2*/
238                 reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_03_16x8b);
239 
240                 /*For row 8,9*/
241                 src_temp_0 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + 8 * u4_in_stride));
242 
243                 src_temp_1 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + 9 * u4_in_stride));
244 
245                 /*next transfer the 8 pixels from temp_2 to temp_1 higher bits 64-127*/
246                 src_temp_0 = _mm_unpacklo_epi64(src_temp_0, src_temp_1);
247 
248                 /*For row 10,11*/
249                 src_temp_2 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 10));
250 
251                 src_temp_3 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 11));
252 
253                 src_temp_2 = _mm_unpacklo_epi64(src_temp_2, src_temp_3);
254 
255                 reg_02_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
256 
257                 /*multiply with filter coeffs to get 16 bit results*/
258                 reg_03_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid);
259 
260                 reg_02_16x8b = _mm_hadd_epi16(reg_02_16x8b, reg_03_16x8b);
261                 /*add adjacent 16 bit values to get 32 bit values*/
262                 reg_02_16x8b = _mm_madd_epi16(reg_02_16x8b, reg_all_1s);
263 
264                 /*next add 2 adjacent 32 bit values to get a single
265                 32 bit value in each row*/
266 
267                 /*Add offset of 64 for rounding each out pixel value*/
268                 reg_02_16x8b = _mm_add_epi32(reg_02_16x8b, reg_64val_32bit);
269                 /*Divide by 128 each out pixel value*/
270                 reg_02_16x8b = _mm_srli_epi32(reg_02_16x8b, 7);
271 
272                 /*For row 12,13*/
273                 src_temp_4 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 12));
274 
275                 src_temp_5 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 13));
276 
277                 src_temp_4 = _mm_unpacklo_epi64(src_temp_4, src_temp_5);
278 
279                 /*For row 14,15*/
280                 src_temp_6 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 14));
281 
282                 src_temp_7 = _mm_loadl_epi64((__m128i *) (pu1_in_pixel + u4_in_stride * 15));
283 
284                 src_temp_6 = _mm_unpacklo_epi64(src_temp_6, src_temp_7);
285 
286                 reg_04_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid);
287 
288                 reg_05_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid);
289 
290                 reg_04_16x8b = _mm_hadd_epi16(reg_04_16x8b, reg_05_16x8b);
291                 /*add adjacent 16 bit values to get 32 bit values*/
292                 reg_04_16x8b = _mm_madd_epi16(reg_04_16x8b, reg_all_1s);
293 
294                 /*next add 2 adjacent 32 bit values to get a single
295                 32 bit value in each row*/
296 
297                 /*Add offset of 64 for rounding each out pixel value*/
298                 reg_04_16x8b = _mm_add_epi32(reg_04_16x8b, reg_64val_32bit);
299                 /*Divide by 128 each out pixel value*/
300                 reg_04_16x8b = _mm_srli_epi32(reg_04_16x8b, 7);
301 
302                 /*pack the lower 16 bit values corresponding to the 8 output
303                 pixels from reg1 and reg 2*/
304                 reg_02_16x8b = _mm_packus_epi32(reg_02_16x8b, reg_04_16x8b);
305 
306                 /*next get saturated 8 bit output pixel values for row 0-15*/
307                 reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_02_16x8b);
308 
309                 /*Store the 16 output values*/
310                 _mm_storeu_si128((__m128i *) pu1_out_pixel, reg_01_16x8b);
311 
312                 pu1_out_pixel += 16;
313 
314                 pu1_in_pixel += ((u4_src_vert_increments * (u4_in_stride << 4)) >> DOWNSCALER_Q);
315 
316                 /* Update the context for next Loop Count */
317                 u4_center_pixel_pos += u4_src_horz_increments;
318             }
319         }
320 
321         /*if height is not a multiple of 8 process 2 rows at a
322         time for the remaining rows*/
323         if(u4_rem_vert_loop)
324         {
325             pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
326             pu1_dst_j = pu1_dst + (j << 4);
327 
328             u4_center_pixel_pos = u4_center_pixel_pos_src;
329 
330             for(i = 0; i < (WORD32) u4_blk_wd; i++)
331             {
332                 u1_phase = get_filter_phase(u4_center_pixel_pos);
333                 pi1_filter_for_grid = pai1_filters[u1_phase];
334 
335                 u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;
336 
337                 pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
338 
339                 pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);
340 
341                 filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid);
342 
343                 for(j = u4_rem_vert_loop; j > 0; j--)
344                 {
345                     src_temp_0 = _mm_loadl_epi64((__m128i const *) pu1_in_pixel);
346 
347                     src_temp_0 = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
348 
349                     src_temp_0 = _mm_madd_epi16(src_temp_0, reg_all_1s);
350 
351                     reg_01_16x8b = _mm_hadd_epi32(src_temp_0, reg_all_0s);
352 
353                     /*Add offset of 64 for rounding each out pixel value*/
354                     reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit);
355                     /*Divide by 128 each out pixel value*/
356                     reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, (int) 7);
357 
358                     reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_all_0s);
359 
360                     /*next get saturated 8 bit output pixel values*/
361                     reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_all_0s);
362 
363                     /*Store the 1 output value*/
364                     *pu1_out_pixel = (UWORD8) _mm_cvtsi128_si32(reg_01_16x8b);
365 
366                     pu1_in_pixel += (u4_src_vert_increments * u4_in_stride) >> DOWNSCALER_Q;
367 
368                     pu1_out_pixel++;
369                 }
370                 /* Update the context for next Loop Count */
371                 u4_center_pixel_pos += u4_src_horz_increments;
372             }
373         }
374     }
375 
376     else /* for chroma */
377     {
378         for(j = 0; j < (WORD32) u4_num_iterations_vertical_by_16; j++)
379         {
380             pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
381             pu1_dst_j = pu1_dst + (j << 4);
382 
383             u4_center_pixel_pos = u4_center_pixel_pos_src;
384 
385             for(i = 0; i < (WORD32) u4_blk_wd; i++)
386             {
387                 u1_phase = get_filter_phase(u4_center_pixel_pos);
388                 pi1_filter_for_grid = pai1_filters[u1_phase];
389 
390                 u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;
391 
392                 pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
393 
394                 pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);
395 
396                 filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid);
397                 /******************************************************/
398                 /* This loop is going vertically in bottom direction */
399                 /* but the output pixels are stored in horizontal    */
400                 /* direction in transpose manner                     */
401                 /******************************************************/
402 
403                 /*Load 16 values shuffle to separate Cb and Cr and process*/
404 
405                 src_temp_0 = _mm_loadu_si128((__m128i *) pu1_in_pixel);
406                 src_temp_1 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride));
407 
408                 src_temp_2 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 2));
409 
410                 src_temp_3 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 3));
411 
412                 src_temp_0 = _mm_shuffle_epi8(src_temp_0, reg_shuffle);
413                 src_temp_1 = _mm_shuffle_epi8(src_temp_1, reg_shuffle);
414                 src_temp_2 = _mm_shuffle_epi8(src_temp_2, reg_shuffle);
415                 src_temp_3 = _mm_shuffle_epi8(src_temp_3, reg_shuffle);
416 
417                 reg_01_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
418                 reg_02_16x8b = _mm_maddubs_epi16(src_temp_1, filt_coeff_grid);
419 
420                 reg_01_16x8b = _mm_hadd_epi16(reg_01_16x8b, reg_02_16x8b);
421 
422                 reg_01_16x8b = _mm_madd_epi16(reg_01_16x8b, reg_all_1s);
423 
424                 reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit);
425 
426                 reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, (int) 7);
427 
428                 reg_03_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid);
429                 reg_04_16x8b = _mm_maddubs_epi16(src_temp_3, filt_coeff_grid);
430 
431                 src_temp_4 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 4));
432 
433                 src_temp_5 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 5));
434 
435                 src_temp_6 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 6));
436 
437                 src_temp_7 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 7));
438 
439                 src_temp_4 = _mm_shuffle_epi8(src_temp_4, reg_shuffle);
440                 src_temp_5 = _mm_shuffle_epi8(src_temp_5, reg_shuffle);
441                 src_temp_6 = _mm_shuffle_epi8(src_temp_6, reg_shuffle);
442                 src_temp_7 = _mm_shuffle_epi8(src_temp_7, reg_shuffle);
443 
444                 reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_04_16x8b);
445 
446                 reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s);
447 
448                 reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit);
449 
450                 reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, (int) 7);
451 
452                 reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_03_16x8b);
453 
454                 reg_02_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid);
455                 reg_04_16x8b = _mm_maddubs_epi16(src_temp_5, filt_coeff_grid);
456 
457                 reg_02_16x8b = _mm_hadd_epi16(reg_02_16x8b, reg_04_16x8b);
458 
459                 reg_02_16x8b = _mm_madd_epi16(reg_02_16x8b, reg_all_1s);
460 
461                 reg_02_16x8b = _mm_add_epi32(reg_02_16x8b, reg_64val_32bit);
462 
463                 reg_02_16x8b = _mm_srli_epi32(reg_02_16x8b, (int) 7);
464 
465                 reg_03_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid);
466                 reg_04_16x8b = _mm_maddubs_epi16(src_temp_7, filt_coeff_grid);
467 
468                 reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_04_16x8b);
469 
470                 reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s);
471 
472                 reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit);
473 
474                 reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, (int) 7);
475 
476                 reg_02_16x8b = _mm_packus_epi32(reg_02_16x8b, reg_03_16x8b);
477 
478                 reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_02_16x8b);
479 
480                 reg_01_16x8b = _mm_shuffle_epi8(reg_01_16x8b, reg_shuffle);
481 
482                 src_temp_0 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + 8 * u4_in_stride));
483 
484                 src_temp_1 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + 9 * u4_in_stride));
485 
486                 src_temp_2 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 10));
487 
488                 src_temp_3 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 11));
489 
490                 src_temp_0 = _mm_shuffle_epi8(src_temp_0, reg_shuffle);
491                 src_temp_1 = _mm_shuffle_epi8(src_temp_1, reg_shuffle);
492                 src_temp_2 = _mm_shuffle_epi8(src_temp_2, reg_shuffle);
493                 src_temp_3 = _mm_shuffle_epi8(src_temp_3, reg_shuffle);
494 
495                 reg_02_16x8b = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
496                 reg_03_16x8b = _mm_maddubs_epi16(src_temp_1, filt_coeff_grid);
497 
498                 reg_02_16x8b = _mm_hadd_epi16(reg_02_16x8b, reg_03_16x8b);
499 
500                 reg_02_16x8b = _mm_madd_epi16(reg_02_16x8b, reg_all_1s);
501 
502                 reg_02_16x8b = _mm_add_epi32(reg_02_16x8b, reg_64val_32bit);
503 
504                 reg_02_16x8b = _mm_srli_epi32(reg_02_16x8b, (int) 7);
505 
506                 reg_04_16x8b = _mm_maddubs_epi16(src_temp_2, filt_coeff_grid);
507                 reg_05_16x8b = _mm_maddubs_epi16(src_temp_3, filt_coeff_grid);
508 
509                 src_temp_4 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 12));
510 
511                 src_temp_5 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 13));
512 
513                 src_temp_6 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 14));
514 
515                 src_temp_7 = _mm_loadu_si128((__m128i *) (pu1_in_pixel + u4_in_stride * 15));
516 
517                 src_temp_4 = _mm_shuffle_epi8(src_temp_4, reg_shuffle);
518                 src_temp_5 = _mm_shuffle_epi8(src_temp_5, reg_shuffle);
519                 src_temp_6 = _mm_shuffle_epi8(src_temp_6, reg_shuffle);
520                 src_temp_7 = _mm_shuffle_epi8(src_temp_7, reg_shuffle);
521 
522                 reg_04_16x8b = _mm_hadd_epi16(reg_04_16x8b, reg_05_16x8b);
523 
524                 reg_04_16x8b = _mm_madd_epi16(reg_04_16x8b, reg_all_1s);
525 
526                 reg_04_16x8b = _mm_add_epi32(reg_04_16x8b, reg_64val_32bit);
527 
528                 reg_04_16x8b = _mm_srli_epi32(reg_04_16x8b, (int) 7);
529 
530                 reg_02_16x8b = _mm_packus_epi32(reg_02_16x8b, reg_04_16x8b);
531 
532                 reg_03_16x8b = _mm_maddubs_epi16(src_temp_4, filt_coeff_grid);
533                 reg_05_16x8b = _mm_maddubs_epi16(src_temp_5, filt_coeff_grid);
534 
535                 reg_03_16x8b = _mm_hadd_epi16(reg_03_16x8b, reg_05_16x8b);
536 
537                 reg_03_16x8b = _mm_madd_epi16(reg_03_16x8b, reg_all_1s);
538 
539                 reg_03_16x8b = _mm_add_epi32(reg_03_16x8b, reg_64val_32bit);
540 
541                 reg_03_16x8b = _mm_srli_epi32(reg_03_16x8b, (int) 7);
542 
543                 reg_04_16x8b = _mm_maddubs_epi16(src_temp_6, filt_coeff_grid);
544                 reg_05_16x8b = _mm_maddubs_epi16(src_temp_7, filt_coeff_grid);
545 
546                 reg_04_16x8b = _mm_hadd_epi16(reg_04_16x8b, reg_05_16x8b);
547 
548                 reg_04_16x8b = _mm_madd_epi16(reg_04_16x8b, reg_all_1s);
549 
550                 reg_04_16x8b = _mm_add_epi32(reg_04_16x8b, reg_64val_32bit);
551 
552                 reg_04_16x8b = _mm_srli_epi32(reg_04_16x8b, (int) 7);
553 
554                 reg_03_16x8b = _mm_packus_epi32(reg_03_16x8b, reg_04_16x8b);
555 
556                 reg_02_16x8b = _mm_packus_epi16(reg_02_16x8b, reg_03_16x8b);
557 
558                 reg_02_16x8b = _mm_shuffle_epi8(reg_02_16x8b, reg_shuffle);
559 
560                 reg_03_16x8b = _mm_unpacklo_epi64(reg_01_16x8b, reg_02_16x8b);
561 
562                 reg_04_16x8b = _mm_unpackhi_epi64(reg_01_16x8b, reg_02_16x8b);
563 
564                 /*Storing after shuffling again*/
565 
566                 _mm_storeu_si128((__m128i *) pu1_out_pixel, reg_03_16x8b);
567                 _mm_storeu_si128((__m128i *) (pu1_out_pixel + u4_out_stride), reg_04_16x8b);
568 
569                 pu1_out_pixel += 16;
570 
571                 pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride << 4)) >> DOWNSCALER_Q;
572 
573                 /* Update the context for next Loop Count */
574                 u4_center_pixel_pos += u4_src_horz_increments;
575             }
576         }
577 
578         /*if height is not a multiple of 8 process 2 rows at a
579         time for the remaining rows*/
580         if(u4_rem_vert_loop)
581         {
582             pu1_src_j = pu1_src + ((j << 4) * u4_in_stride);
583             pu1_dst_j = pu1_dst + (j << 4);
584 
585             u4_center_pixel_pos = u4_center_pixel_pos_src;
586             for(i = 0; i < (WORD32) u4_blk_wd; i++)
587             {
588                 UWORD8 u1_phase = get_filter_phase(u4_center_pixel_pos);
589                 pi1_filter_for_grid = pai1_filters[u1_phase];
590 
591                 u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;
592 
593                 pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
594 
595                 pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);
596 
597                 filt_coeff_grid = _mm_loadu_si128((__m128i *) pi1_filter_for_grid);
598 
599                 for(j = u4_rem_vert_loop; j > 0; j = j - 2)
600                 {
601                     src_temp_0 = _mm_loadu_si128((__m128i const *) pu1_in_pixel);
602                     src_temp_0 = _mm_shuffle_epi8(src_temp_0, reg_shuffle);
603 
604                     src_temp_1 = _mm_loadu_si128((__m128i const *) (pu1_in_pixel + u4_in_stride));
605 
606                     src_temp_1 = _mm_shuffle_epi8(src_temp_1, reg_shuffle);
607 
608                     src_temp_0 = _mm_maddubs_epi16(src_temp_0, filt_coeff_grid);
609                     src_temp_1 = _mm_maddubs_epi16(src_temp_1, filt_coeff_grid);
610 
611                     reg_01_16x8b = _mm_hadd_epi16(src_temp_0, src_temp_1);
612 
613                     reg_01_16x8b = _mm_madd_epi16(reg_01_16x8b, reg_all_1s);
614 
615                     /*Add offset of 64 for rounding each out pixel value*/
616                     reg_01_16x8b = _mm_add_epi32(reg_01_16x8b, reg_64val_32bit);
617                     /*Divide by 128 each out pixel value*/
618                     reg_01_16x8b = _mm_srli_epi32(reg_01_16x8b, (int) 7);
619 
620                     reg_01_16x8b = _mm_packus_epi32(reg_01_16x8b, reg_all_0s);
621 
622                     /*next get saturated 8 bit output pixel values*/
623                     reg_01_16x8b = _mm_packus_epi16(reg_01_16x8b, reg_all_0s);
624 
625                     reg_01_16x8b = _mm_shuffle_epi8(reg_01_16x8b, reg_shuffle);
626 
627                     reg_02_16x8b = _mm_srli_si128(reg_01_16x8b, (int) 8);
628 
629                     /*Store the 2 output values*/
630                     i4_temp_pixel_holder = _mm_cvtsi128_si32(reg_01_16x8b);
631 
632                     *pu1_out_pixel = (UWORD8) i4_temp_pixel_holder;
633                     i4_temp_pixel_holder >>= 8;
634 
635                     *(pu1_out_pixel + 1) = (UWORD8) i4_temp_pixel_holder;
636 
637                     i4_temp_pixel_holder = _mm_cvtsi128_si32(reg_02_16x8b);
638 
639                     *(pu1_out_pixel + u4_out_stride) = (UWORD8) i4_temp_pixel_holder;
640                     i4_temp_pixel_holder >>= 8;
641 
642                     *(pu1_out_pixel + u4_out_stride + 1) = (UWORD8) i4_temp_pixel_holder;
643 
644                     pu1_in_pixel += (u4_src_vert_increments * (u4_in_stride << 1)) >> DOWNSCALER_Q;
645                     pu1_out_pixel += 2;
646                 }
647                 /* Update the context for next Loop Count */
648                 u4_center_pixel_pos += u4_src_horz_increments;
649             }
650         }
651     }
652 }
653