/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_intra_pred_filters_neon_intr.c
*
* @brief
*  Contains function definitions for intra prediction interpolation filters
*
* @author
*  Yogeswaran RS
*
* @par List of Functions:
*  - ihevc_intra_pred_luma_planar()
*  - ihevc_intra_pred_luma_dc()
*  - ihevc_intra_pred_luma_horz()
*  - ihevc_intra_pred_luma_ver()
*  - ihevc_intra_pred_luma_mode2()
*  - ihevc_intra_pred_luma_mode_18_34()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>

#include "ihevc_typedefs.h"
#include "ihevc_intra_pred.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "arm_neon.h"
#include "ihevc_platform_macros.h"
#include "ihevc_common_tables.h"

/****************************************************************************/
/* Constant Macros                                                          */
/****************************************************************************/
#define MAX_CU_SIZE 64
#define BIT_DEPTH 8
#define T32_4NT 128
#define T16_4NT 64

/*****************************************************************************/
/* Function Definitions                                                     */
/*****************************************************************************/

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for pu1_ref substitution
*
* @par Description:
*  Reference substitution process for samples unavailable for prediction.
*  Refer to section 8.4.4.2.2 in the standard.
*
* @param[in] pu1_top_left
*  UWORD8 pointer to the top-left
*
* @param[in] pu1_top
*  UWORD8 pointer to the top
*
* @param[in] pu1_left
*  UWORD8 pointer to the left
*
* @param[in] src_strd
*  WORD32 Source stride
*
* @param[in] nt
*  WORD32 Transform block size
*
* @param[in] nbr_flags
*  WORD32 neighbor availability flags
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  WORD32 Destination stride
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
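
/* A scalar sketch of the substitution logic implemented below (illustrative
 * only, not the optimized path): when no neighbour is available the reference
 * array is filled with the mid-grey DC value, otherwise unavailable samples
 * are replicated from the nearest available one, scanning upwards from the
 * bottom-left corner of pu1_dst:
 *
 *     if(nbr_flags == 0)
 *         memset(pu1_dst, 1 << (BIT_DEPTH - 1), (4 * nt) + 1);
 */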

void ihevc_intra_pred_luma_ref_substitution_neonintr(UWORD8 *pu1_top_left,
                                                     UWORD8 *pu1_top,
                                                     UWORD8 *pu1_left,
                                                     WORD32 src_strd,
                                                     WORD32 nt,
                                                     WORD32 nbr_flags,
                                                     UWORD8 *pu1_dst,
                                                     WORD32 dst_strd)
{
    UWORD8 pu1_ref;
    WORD32 dc_val, i;
    WORD32 total_samples = (4 * nt) + 1;
    WORD32 two_nt = 2 * nt;
    WORD32 three_nt = 3 * nt;
    WORD32 get_bits;
    WORD32 next;
    WORD32 bot_left, left, top, tp_right, tp_left;
    WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
    UNUSED(dst_strd);
    dc_val = 1 << (BIT_DEPTH - 1);

    /* Neighbor Flag Structure */
    /*    Top-Left | Top-Right | Top | Left | Bottom-Left
              1         4         4     4         4
     */
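
    /* Worked example (illustrative): for nt <= 8 each field above is read as a
     * single bit, so nbr_flags == 0x11111 means every neighbour is available:
     * bit 0 = bottom-left, bit 4 = left, bit 8 = top, bit 12 = top-right and
     * bit 16 = top-left, exactly the masks extracted in the nt <= 8 path below. */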

    /* If no neighbor flags are present, fill the neighbor samples with the DC value */
    if(nbr_flags == 0)
    {
        for(i = 0; i < total_samples; i++)
        {
            pu1_dst[i] = dc_val;
        }
    }
    else
    {
        /* Else fill the corresponding samples */
        pu1_dst[two_nt] = *pu1_top_left;
        UWORD8 *pu1_dst_tmp2 = pu1_dst;
        UWORD8 *pu1_top_tmp = pu1_top;
        pu1_dst_tmp2 += two_nt + 1;

        for(i = 0; i < two_nt; i++)
            pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];

        uint8x8_t src;
        for(i = two_nt; i > 0; i -= 8)
        {
            src = vld1_u8(pu1_top_tmp);
            pu1_top_tmp += 8;
            vst1_u8(pu1_dst_tmp2, src);
            pu1_dst_tmp2 += 8;
        }

        if(nt <= 8)
        {
            /* 1 bit extraction for all the neighboring blocks */
            tp_left = (nbr_flags & 0x10000) >> 16;
            bot_left = nbr_flags & 0x1;
            left = (nbr_flags & 0x10) >> 4;
            top = (nbr_flags & 0x100) >> 8;
            tp_right = (nbr_flags & 0x1000) >> 12;

            next = 1;

            /* If bottom-left is not available, do the reverse substitution process */
            if(bot_left == 0)
            {
                WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right };

                /* Check for the 1st available sample from the bottom-left */
                while(!a_nbr_flag[next])
                    next++;

                /* If left, top-left are available */
                if(next <= 2)
                {
                    idx = nt * next;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
                else /* If top, top-right are available */
                {
                    /* idx is changed to copy 1 pixel value for top-left, if top-left is not available */
                    idx = (nt * (next - 1)) + 1;
                    pu1_ref = pu1_dst[idx];
                    for(i = 0; i < idx; i++)
                        pu1_dst[i] = pu1_ref;
                }
            }

            /* Forward Substitution Process */
            /* If left is unavailable, copy the last bottom-left value */

            if(left == 0)
            {
                uint8x8_t dup_pu1_dst1;
                UWORD8 *pu1_dst_const_nt = pu1_dst;
                pu1_dst_const_nt += nt;

                if(0 == (nt & 7))
                {
                    dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_nt, dup_pu1_dst1);
                        pu1_dst_const_nt += 8;
                    }
                }
                else
                {
                    dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
                    for(i = nt; i > 0; i -= 4)
                    {
                        vst1_lane_u32((uint32_t *)pu1_dst_const_nt, vreinterpret_u32_u8(dup_pu1_dst1), 0);
                        pu1_dst_const_nt += 4;
                    }
                }
            }
            if(tp_left == 0)
                pu1_dst[two_nt] = pu1_dst[two_nt - 1];
            if(top == 0)
            {
                if(0 == (nt & 7))
                {
                    uint8x8_t dup_pu1_dst2;
                    UWORD8 *pu1_dst_const_two_nt_1 = pu1_dst;
                    pu1_dst_const_two_nt_1 += (two_nt + 1);
                    dup_pu1_dst2 = vdup_n_u8(pu1_dst[two_nt]);
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_two_nt_1, dup_pu1_dst2);
                        pu1_dst_const_two_nt_1 += 8;
                    }
                }
                else
                {
                    for(i = 0; i < nt; i++)
                        pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
                }
            }
            if(tp_right == 0)
            {
                uint8x8_t dup_pu1_dst3;
                UWORD8 *pu1_dst_const_three_nt_1 = pu1_dst;
                pu1_dst_const_three_nt_1 += (three_nt + 1);
                dup_pu1_dst3 = vdup_n_u8(pu1_dst[two_nt]);
                if(0 == (nt & 7))
                {
                    for(i = nt; i > 0; i -= 8)
                    {
                        vst1_u8(pu1_dst_const_three_nt_1, dup_pu1_dst3);
                        pu1_dst_const_three_nt_1 += 8;
                    }
                }
                else
                {
                    for(i = nt; i > 0; i -= 4)
                    {
                        vst1_lane_u32((uint32_t *)pu1_dst_const_three_nt_1, vreinterpret_u32_u8(dup_pu1_dst3), 0);
                        pu1_dst_const_three_nt_1 += 4;
                    }
                }
            }
        }
        if(nt == 16)
        {
            WORD32 nbr_flags_temp = 0;
            nbr_flags_temp = (nbr_flags & 0x3) + ((nbr_flags & 0x30) >> 2)
                            + ((nbr_flags & 0x300) >> 4)
                            + ((nbr_flags & 0x3000) >> 6)
                            + ((nbr_flags & 0x10000) >> 8);

            /* Compute trailing zeros based on nbr_flags for the below-left substitution process; see section 8.4.4.2.2. */
            /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but to 1 pel for top-left */
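            /* Example (assuming look_up_trailing_zeros() returns the number of
             * trailing zero bits, 8 for an all-zero argument): if the below-left
             * and left bits are 0xC (binary 1100), the two trailing zeros give
             * nbr_id_from_bl = 2 * 8 = 16, i.e. the 16 pels from the bottom-left
             * corner are unavailable and get replicated below. */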
            {
                nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */

                if(nbr_id_from_bl == 64)
                    nbr_id_from_bl = 32;

                if(nbr_id_from_bl == 32)
                {
                    /* for top left : 1 pel per nbr bit */
                    if(!((nbr_flags_temp >> 8) & 0x1))
                    {
                        nbr_id_from_bl++;
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right; 8 pels per nbr bit */
                    }
                }
                /* Reverse Substitution Process */
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                    {
                        pu1_dst[i] = pu1_ref;
                    }
                }
            }

            /* for the loop of 4*nt+1 pixels (excluding pixels computed from reverse substitution) */
            while(nbr_id_from_bl < ((T16_4NT) + 1))
            {
                /* To obtain the next unavailable idx flag after reverse neighbor substitution */
                /* Divide by 8 to obtain the original index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3);

                /* The top-left flag is at the last bit location of nbr_flags */
                if(nbr_id_from_bl == (T16_4NT / 2))
                {
                    get_bits = GET_BIT(nbr_flags_temp, 8);

                    /* only pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BIT(nbr_flags_temp, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8 pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }
                }
                nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
            }
        }

        if(nt == 32)
        {
            /* Compute trailing zeros based on nbr_flags for the below-left substitution process; see section 8.4.4.2.2. */
            /* Each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right, but to 1 pel for top-left */
            {
                nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */

                if(nbr_id_from_bl == 64)
                {
                    /* for top left : 1 pel per nbr bit */
                    if(!((nbr_flags >> 16) & 0x1))
                    {
                        /* top left not available */
                        nbr_id_from_bl++;
                        /* top and top right; 8 pels per nbr bit */
                        nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
                    }
                }
                /* Reverse Substitution Process */
                if(nbr_id_from_bl)
                {
                    /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
                    pu1_ref = pu1_dst[nbr_id_from_bl];
                    for(i = (nbr_id_from_bl - 1); i >= 0; i--)
                        pu1_dst[i] = pu1_ref;
                }
            }

            /* for the loop of 4*nt+1 pixels (excluding pixels computed from reverse substitution) */
            while(nbr_id_from_bl < ((T32_4NT) + 1))
            {
                /* To obtain the next unavailable idx flag after reverse neighbor substitution */
                /* Divide by 8 to obtain the original index */
                frwd_nbr_flag = (nbr_id_from_bl >> 3);

                /* The top-left flag is at the last bit location of nbr_flags */
                if(nbr_id_from_bl == (T32_4NT / 2))
                {
                    get_bits = GET_BIT(nbr_flags, 16);
                    /* only pel substitution for TL */
                    if(!get_bits)
                        pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
                }
                else
                {
                    get_bits = GET_BIT(nbr_flags, frwd_nbr_flag);
                    if(!get_bits)
                    {
                        /* 8 pel substitution (other than TL) */
                        pu1_ref = pu1_dst[nbr_id_from_bl - 1];
                        for(i = 0; i < 8; i++)
                            pu1_dst[nbr_id_from_bl + i] = pu1_ref;
                    }
                }
                nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
            }
        }
    }
}

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for ref_filtering
*
* @par Description:
*  Reference DC filtering for neighboring samples dependent on TU size and
*  mode. Refer to section 8.4.4.2.3 in the standard.
*
* @param[in] pu1_src
*  UWORD8 pointer to the source
*
* @param[in] nt
*  integer Transform Block size
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] mode
*  integer intra prediction mode
*
* @param[in] strong_intra_smoothing_enable_flag
*  WORD32 flag enabling strong intra smoothing
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
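
/* Scalar equivalent of the smoothing applied below (a sketch for reference):
 * every interior reference sample is filtered with the [1 2 1]/4 kernel,
 *
 *     pu1_dst[i] = (pu1_src[i - 1] + 2 * pu1_src[i] + pu1_src[i + 1] + 2) >> 2
 *
 * for i in [1, 4*nt - 1], while pu1_dst[0] and pu1_dst[4*nt] are copied
 * unfiltered.
 */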

void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src,
                                             WORD32 nt,
                                             UWORD8 *pu1_dst,
                                             WORD32 mode,
                                             WORD32 strong_intra_smoothing_enable_flag)
{
    WORD32 filter_flag;
    WORD32 i = 0;
    WORD32 four_nt = 4 * nt;

    WORD32 src_4nt;
    WORD32 src_0nt;
    /* Pointers are named after the offset they hold, e.g. pu1_src_tmp_1 denotes pu1_src + 1.        */
    /* src_val_1 loads the value from pointer pu1_src_tmp_1; add_res holds the result of adding 2 values. */
    UWORD8 *pu1_src_tmp_0 = pu1_src;
    UWORD8 *pu1_src_tmp_1;
    UWORD8 *pu1_src_tmp_2;
    UWORD8 *pu1_dst_tmp_0 = pu1_dst;
    UWORD8 *pu1_dst_tmp_1;

    uint8x8_t src_val_0, src_val_2;
    uint8x8_t src_val_1, shift_res;
    uint8x8_t dup_const_2;
    uint16x8_t mul_res, add_res;
    WORD32 bi_linear_int_flag = 0;
    WORD32 abs_cond_left_flag = 0;
    WORD32 abs_cond_top_flag = 0;
    WORD32 dc_val = 1 << (BIT_DEPTH - 5);
    shift_res = vdup_n_u8(0);

    filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
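    /* gau1_intra_pred_ref_filter[] holds a per-mode bit mask of block sizes to
     * be filtered; bit (CTZ(nt) - 2) selects the current size, i.e. bit 1 for
     * nt = 8, bit 2 for nt = 16 and bit 3 for nt = 32. */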

    if(0 == filter_flag)
    {
        if(pu1_src == pu1_dst)
        {
            return;
        }
        else
        {
            for(i = four_nt; i > 0; i -= 8)
            {
                src_val_0 = vld1_u8(pu1_src_tmp_0);
                pu1_src_tmp_0 += 8;
                vst1_u8(pu1_dst_tmp_0, src_val_0);
                pu1_dst_tmp_0 += 8;
            }
            pu1_dst[four_nt] = pu1_src[four_nt];
        }
    }
    else
    {
        /* If strong intra smoothing is enabled and the transform size is 32 */
        if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
        {
            /* Strong Intra Filtering */
            abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt]
                            - (2 * pu1_src[3 * nt]))) < dc_val;
            abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0]
                            - (2 * pu1_src[nt]))) < dc_val;

            bi_linear_int_flag = ((1 == abs_cond_left_flag)
                            && (1 == abs_cond_top_flag));
        }
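
        /* The branch below implements the strong (bi-linear) smoothing of the
         * standard; a scalar sketch, with two_nt = 64 since nt == 32 here:
         *
         *     pu1_dst[i]          = ((two_nt - i) * pu1_src[0]
         *                            + i * pu1_src[two_nt] + 32) >> 6;
         *     pu1_dst[two_nt + i] = ((two_nt - i) * pu1_src[two_nt]
         *                            + i * pu1_src[4 * nt] + 32) >> 6;
         *
         * for i in [1, two_nt].
         */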

        src_4nt = pu1_src[4 * nt];
        src_0nt = pu1_src[0];
        /* Strong filtering of reference samples */
        if(1 == bi_linear_int_flag)
        {
            WORD32 two_nt = four_nt >> 1;

            WORD32 pu1_src_0_val = pu1_src[0];
            WORD32 pu1_src_2_nt_val = pu1_src[2 * nt];
            WORD32 pu1_src_4_nt_val = pu1_src[4 * nt];

            WORD32 prod_two_nt_src_0_val = two_nt * pu1_src_0_val;
            uint16x8_t prod_two_nt_src_0_val_t = vdupq_n_u16(prod_two_nt_src_0_val);

            WORD32 prod_two_nt_src_2_nt_val = two_nt * pu1_src_2_nt_val;
            uint16x8_t prod_two_nt_src_2_nt_val_t = vdupq_n_u16(prod_two_nt_src_2_nt_val);

            const UWORD8 *const_col_i;
            uint8x8_t const_col_i_val;
            uint16x8_t prod_val_1;
            uint16x8_t prod_val_2;
            uint16x8_t prod_val_3;
            uint16x8_t prod_val_4;
            uint8x8_t res_val_1;
            uint8x8_t res_val_2;
            uint8x8_t pu1_src_0_val_t = vdup_n_u8(pu1_src_0_val);
            uint8x8_t pu1_src_2_nt_val_t = vdup_n_u8(pu1_src_2_nt_val);
            uint8x8_t pu1_src_4_nt_val_t = vdup_n_u8(pu1_src_4_nt_val);
            pu1_dst_tmp_0 = pu1_dst + 1;
            pu1_dst_tmp_1 = pu1_dst + two_nt + 1;

            const_col_i = gau1_ihevc_planar_factor + 1;

            for(i = two_nt; i > 0; i -= 8)
            {
                const_col_i_val = vld1_u8(const_col_i);
                const_col_i += 8;

                prod_val_1 = vmlsl_u8(prod_two_nt_src_0_val_t, const_col_i_val, pu1_src_0_val_t);
                prod_val_2 = vmlal_u8(prod_val_1, const_col_i_val, pu1_src_2_nt_val_t);

                res_val_1 = vrshrn_n_u16(prod_val_2, 6);
                prod_val_3 = vmlsl_u8(prod_two_nt_src_2_nt_val_t, const_col_i_val, pu1_src_2_nt_val_t);

                vst1_u8(pu1_dst_tmp_0, res_val_1);
                pu1_dst_tmp_0 += 8;
                prod_val_4 = vmlal_u8(prod_val_3, const_col_i_val, pu1_src_4_nt_val_t);

                res_val_2 = vrshrn_n_u16(prod_val_4, 6);
                vst1_u8(pu1_dst_tmp_1, res_val_2);
                pu1_dst_tmp_1 += 8;
            }
            pu1_dst[2 * nt] = pu1_src[2 * nt];
        }
        else
        {
            pu1_src_tmp_1 = pu1_src + 1;
            pu1_src_tmp_2 = pu1_src + 2;
            pu1_dst_tmp_0 += 1;

            dup_const_2 = vdup_n_u8(2);

            /* Extremities untouched */
            pu1_dst[0] = pu1_src[0];

            /* To avoid problems when dest and src have the same pointer, the first load is done
             * outside the loop and the 2nd consecutive load is done before the store of the 1st */

            /* Perform bilinear filtering of reference samples */
            for(i = (four_nt - 1); i > 0; i -= 8)
            {
                src_val_0 = vld1_u8(pu1_src_tmp_0);
                pu1_src_tmp_0 += 8;

                src_val_2 = vld1_u8(pu1_src_tmp_2);
                pu1_src_tmp_2 += 8;

                src_val_1 = vld1_u8(pu1_src_tmp_1);
                pu1_src_tmp_1 += 8;

                if(i < four_nt - 1)
                {
                    vst1_u8(pu1_dst_tmp_0, shift_res);
                    pu1_dst_tmp_0 += 8;
                }

                add_res = vaddl_u8(src_val_0, src_val_2);

                mul_res = vmlal_u8(add_res, src_val_1, dup_const_2);
                shift_res = vrshrn_n_u16(mul_res, 2);
            }
            vst1_u8(pu1_dst_tmp_0, shift_res);
            pu1_dst_tmp_0 += 8;
        }
        pu1_dst[4 * nt] = src_4nt;
        pu1_dst[0] = src_0nt;
    }
}

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for luma planar
*
* @par Description:
*  Planar intraprediction with reference neighboring samples location
*  pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source (reference samples)
*
* @param[in] src_strd
*  integer source stride
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intra prediction mode
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
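
/* The kernel below evaluates the HEVC planar formula (scalar sketch, with
 * left(y) = pu1_ref[2*nt - 1 - y] and top(x) = pu1_ref[2*nt + 1 + x]):
 *
 *     pred(x, y) = ((nt - 1 - x) * left(y) + (x + 1) * pu1_ref[3*nt + 1]
 *                 + (nt - 1 - y) * top(x)  + (y + 1) * pu1_ref[nt - 1]
 *                 + nt) >> (log2(nt) + 1)
 */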

void ihevc_intra_pred_luma_planar_neonintr(UWORD8 *pu1_ref,
                                           WORD32 src_strd,
                                           UWORD8 *pu1_dst,
                                           WORD32 dst_strd,
                                           WORD32 nt,
                                           WORD32 mode)
{
    /* Naming convention: (nt - 1 - col) --> const_nt_1_col (const denotes gau1_ihevc_planar_factor). */
    /* const_nt_1_col values are loaded into a d register.                                            */
    /* Similarly, pu1_ref[nt - 1] --> pu1_ref_nt_1; its value duplicated into a d register is         */
    /* pu1_ref_nt_1_dup. log2nt + 1 is taken care of while assigning the values themselves.           */
    /* In the width-multiple-of-4 case the row loop has also been unrolled by 2, and the store is     */
    /* arranged accordingly.                                                                          */

    WORD32 row, col = 0;
    WORD32 log2nt_plus1 = 6;
    WORD32 two_nt, three_nt;
    UWORD8 *pu1_ref_two_nt_1;
    UWORD8 *pu1_dst_tmp;
    const UWORD8 *const_nt_1_col;
    uint8x8_t const_nt_1_col_t;
    const UWORD8 *const_col_1;
    uint8x8_t const_col_1_t;
    uint8_t const_nt_1_row;
    uint8x8_t const_nt_1_row_dup;
    uint8_t const_row_1;
    uint8x8_t const_row_1_dup;
    uint8_t const_nt = nt;
    uint16x8_t const_nt_dup;
    uint8_t pu1_ref_nt_1 = pu1_ref[nt - 1];
    uint8x8_t pu1_ref_nt_1_dup;
    uint8_t pu1_ref_two_nt_1_row;
    uint8_t pu1_ref_three_nt_1;
    uint8x8_t pu1_ref_two_nt_1_row_dup;
    uint8x8_t pu1_ref_two_nt_1_t;
    uint8x8_t pu1_ref_three_nt_1_dup;
    uint16x8_t prod_t1;
    uint16x8_t prod_t2;
    uint16x8_t sto_res_tmp;
    uint8x8_t sto_res;
    int16x8_t log2nt_dup;
    UNUSED(src_strd);
    UNUSED(mode);
    log2nt_plus1 = 32 - CLZ(nt);
    two_nt = 2 * nt;
    three_nt = 3 * nt;
    /* Loops have been unrolled considering the fact that the width is a multiple of 8 */
    if(0 == (nt & 7))
    {
        pu1_dst_tmp = pu1_dst;
        const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;

        const_col_1 = gau1_ihevc_planar_factor + 1;
        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];

        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
        const_nt_dup = vdupq_n_u16(const_nt);

        log2nt_dup = vdupq_n_s16(log2nt_plus1);
        log2nt_dup = vnegq_s16(log2nt_dup);

        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);

        for(row = 0; row < nt; row++)
        {
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);

            const_nt_1_row = nt - 1 - row;
            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);

            const_row_1 = row + 1;
            const_row_1_dup = vdup_n_u8(const_row_1);

            const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;

            const_col_1 = gau1_ihevc_planar_factor + 1;
            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;

            for(col = nt; col > 0; col -= 8)
            {
                const_nt_1_col_t = vld1_u8(const_nt_1_col);
                const_nt_1_col -= 8;
                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);

                const_col_1_t = vld1_u8(const_col_1);
                const_col_1 += 8;
                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);

                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
                pu1_ref_two_nt_1 += 8;
                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);

                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
                prod_t1 = vaddq_u16(prod_t1, prod_t2);

                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
                sto_res = vmovn_u16(sto_res_tmp);
                vst1_u8(pu1_dst_tmp, sto_res);
                pu1_dst_tmp += 8;
            }
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    /* Loops have been unrolled considering the fact that the width is a multiple of 4 */
    /* If the column count is a multiple of 4, the height should be a multiple of 2    */
    else
    {
        uint8x8_t const_row_1_dup1;
        uint8x8_t pu1_ref_two_nt_1_t1;
        uint8x8_t const_nt_1_col_t1;
        uint8x8_t const_col_1_t1;
        uint8x8_t pu1_ref_two_nt_1_row_dup1;
        uint8x8_t const_nt_1_row_dup1;

        pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];

        pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
        const_nt_dup = vdupq_n_u16(const_nt);

        log2nt_dup = vdupq_n_s16(log2nt_plus1);
        log2nt_dup = vnegq_s16(log2nt_dup);

        pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);

        for(row = 0; row < nt; row += 2)
        {
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
            pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
            pu1_ref_two_nt_1_row = pu1_ref[two_nt - 2 - row];
            pu1_ref_two_nt_1_row_dup1 = vdup_n_u8(pu1_ref_two_nt_1_row);
            pu1_ref_two_nt_1_row_dup = vext_u8(pu1_ref_two_nt_1_row_dup, pu1_ref_two_nt_1_row_dup1, 4);

            const_nt_1_row = nt - 1 - row;
            const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
            const_nt_1_row = nt - 2 - row;
            const_nt_1_row_dup1 = vdup_n_u8(const_nt_1_row);
            const_nt_1_row_dup = vext_u8(const_nt_1_row_dup, const_nt_1_row_dup1, 4);

            const_row_1 = row + 1;
            const_row_1_dup = vdup_n_u8(const_row_1);
            const_row_1 = row + 2;
            const_row_1_dup1 = vdup_n_u8(const_row_1);
            const_row_1_dup = vext_u8(const_row_1_dup, const_row_1_dup1, 4);

            const_nt_1_col = gau1_ihevc_planar_factor + nt - 4;

            const_col_1 = gau1_ihevc_planar_factor + 1;

            pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;

            for(col = nt; col > 0; col -= 4)
            {
                const_nt_1_col_t = vld1_u8(const_nt_1_col);
                const_nt_1_col -= 4;
                const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);

                const_col_1_t = vld1_u8(const_col_1);
                const_col_1 += 4;
                const_nt_1_col_t1 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(const_nt_1_col_t), 32));

                pu1_dst_tmp = pu1_dst;
                const_nt_1_col_t = vext_u8(const_nt_1_col_t, const_nt_1_col_t1, 4);

                const_col_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(const_col_1_t), 32));
                prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);

                pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
                pu1_ref_two_nt_1 += 4;
                const_col_1_t = vext_u8(const_col_1_t1, const_col_1_t, 4);

                pu1_ref_two_nt_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(pu1_ref_two_nt_1_t), 32));
                prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);

                pu1_ref_two_nt_1_t = vext_u8(pu1_ref_two_nt_1_t1, pu1_ref_two_nt_1_t, 4);
                prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);

                prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
                prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
                prod_t1 = vaddq_u16(prod_t1, prod_t2);

                sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
                sto_res = vmovn_u16(sto_res_tmp);

                vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
                pu1_dst_tmp += dst_strd;

                vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
                pu1_dst += 4;
            }
            pu1_dst += 2 * dst_strd - nt;
        }
    }
}
/* INTRA_PRED_LUMA_PLANAR */

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for luma dc
*
* @par Description:
*  Intraprediction for DC mode with reference neighboring samples location
*  pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source (reference samples)
*
* @param[in] src_strd
*  integer source stride
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intra prediction mode
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
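
/* Scalar sketch of the DC rule implemented below: the DC value is the rounded
 * mean of the nt left and nt top neighbours,
 *
 *     dc_val = (sum(pu1_ref[nt .. 2*nt - 1])
 *             + sum(pu1_ref[2*nt + 1 .. 3*nt]) + nt) >> (log2(nt) + 1),
 *
 * and for nt < 32 the first row and column are additionally smoothed towards
 * the reference, e.g. pu1_dst[col] = (pu1_ref[2*nt + 1 + col] + 3 * dc_val + 2) >> 2.
 */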

void ihevc_intra_pred_luma_dc_neonintr(UWORD8 *pu1_ref,
                                       WORD32 src_strd,
                                       UWORD8 *pu1_dst,
                                       WORD32 dst_strd,
                                       WORD32 nt,
                                       WORD32 mode)
{
    WORD32 dc_val = 0, two_dc_val = 0, three_dc_val = 0;
    WORD32 i = 0;
    WORD32 row = 0, col = 0, col_count;
    WORD32 log2nt_plus1 = 6;
    WORD32 two_nt = 0;
    uint16x8_t ref_load_q;
    uint16x8_t three_dc_val_t;
    uint8x8_t sto_res_tmp;
    uint8x8_t sto_res_tmp1;
    uint8x8_t sto_res_tmp2;
    uint8x8_t sto_res_tmp3;
    uint8x8_t sto_res_tmp4;
    uint8x8_t dc_val_t;

    UWORD8 *pu1_ref_tmp;
    UWORD8 *pu1_ref_tmp1;
    UWORD8 *pu1_dst_tmp;
    UWORD8 *pu1_dst_tmp1;
    UWORD8 *pu1_dst_tmp2;
    UNUSED(src_strd);
    UNUSED(mode);

    /* log2nt + 1 is taken care of while assigning the values themselves. */
    log2nt_plus1 = 32 - CLZ(nt);

    /* Loops have been unrolled considering the fact that the width is a multiple of 8 */
    if(0 == (nt & 7))
    {
        uint8x8_t ref_load1;
        uint8x8_t ref_load2;
        uint16x4_t acc_dc_pair1;
        uint32x2_t acc_dc_pair2;
        uint64x1_t acc_dc = vdup_n_u64(col);

        two_nt = 2 * nt;
        pu1_ref_tmp = pu1_ref + nt;
        pu1_ref_tmp1 = pu1_ref + two_nt + 1;

        for(i = two_nt; i > nt; i -= 8)
        {
            ref_load1 = vld1_u8(pu1_ref_tmp);
            pu1_ref_tmp += 8;
            acc_dc_pair1 = vpaddl_u8(ref_load1);

            ref_load2 = vld1_u8(pu1_ref_tmp1);
            pu1_ref_tmp1 += 8;

            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);

            acc_dc_pair1 = vpaddl_u8(ref_load2);
            acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
            acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
        }

        dc_val = (vget_lane_u32(vreinterpret_u32_u64(acc_dc), 0) + nt) >> (log2nt_plus1);
        dc_val_t = vdup_n_u8(dc_val);
        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;
        three_dc_val += 2;

        three_dc_val_t = vdupq_n_u16((WORD16)three_dc_val);
        pu1_ref_tmp = pu1_ref + two_nt + 1;
        pu1_dst_tmp = pu1_dst;

        if(nt == 32)
        {
            for(row = 0; row < nt; row++)
            {
                for(col = nt; col > 0; col -= 8)
                {
                    vst1_u8(pu1_dst_tmp, dc_val_t);
                    pu1_dst_tmp += 8;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
        }
        else
        {
            for(col = nt; col > 0; col -= 8)
            {
                ref_load1 = vld1_u8(pu1_ref_tmp);
                pu1_ref_tmp += 8;
                ref_load_q = vmovl_u8(ref_load1);
                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
                ref_load_q = vshrq_n_u16(ref_load_q, 2);
                sto_res_tmp = vmovn_u16(ref_load_q);
                vst1_u8(pu1_dst_tmp, sto_res_tmp);
                pu1_dst_tmp += 8;
            }

            pu1_ref_tmp = pu1_ref + two_nt - 9;
            pu1_dst_tmp = pu1_dst + dst_strd;
            col_count = nt - 8;

            /* Except the first row, the remaining rows are done here.                          */
            /* Both column and row have been unrolled by 8, and the stores are arranged for     */
            /* this unrolling. Except for the 1st column of the remaining rows (other than the  */
            /* 1st row), the values are constant, so they are stored as a duplicated constant   */
            /* value. If the column count is greater than 8, the remaining values are also      */
            /* constant, which is taken care of in the inner for loop.                          */

            for(row = nt; row > 0; row -= 8)
            {
                pu1_dst_tmp1 = pu1_dst_tmp + 8;
                ref_load1 = vld1_u8(pu1_ref_tmp);
                pu1_ref_tmp -= 8;
                ref_load_q = vmovl_u8(ref_load1);
                ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
                ref_load_q = vshrq_n_u16(ref_load_q, 2);
                sto_res_tmp = vmovn_u16(ref_load_q);

                sto_res_tmp1 = vext_u8(sto_res_tmp, dc_val_t, 7);

                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 8));
                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 16));
                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 24));
                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 32));
                sto_res_tmp1 = vext_u8(sto_res_tmp1, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp4);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 40));
                sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp1);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 48));
                sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp2);
                pu1_dst_tmp += dst_strd;

                sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 56));
                sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
                vst1_u8(pu1_dst_tmp, sto_res_tmp3);
                pu1_dst_tmp += dst_strd;
                /* For the last set of 8 rows only 7 rows need to be updated since the first row is already written */
                if(row != 8)
                    vst1_u8(pu1_dst_tmp, sto_res_tmp4);
                pu1_dst_tmp += dst_strd;

                for(col = col_count; col > 0; col -= 8)
                {
                    pu1_dst_tmp2 = pu1_dst_tmp1;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;
                    vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 += dst_strd;

                    /* For the last set of 8 rows only 7 rows need to be updated since the first row is already written */
                    if(row != 8)
                        vst1_u8(pu1_dst_tmp1, dc_val_t);
                    pu1_dst_tmp1 = pu1_dst_tmp2 + 8;
                }
            }
            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
        }
    }
    /* Loops have been unrolled considering the fact that the width is a multiple of 4 */
    else
    {
        WORD32 acc_dc;
        two_nt = 2 * nt;

        acc_dc = 0;
        pu1_ref_tmp = pu1_ref + nt + 1;
        for(i = nt; i < two_nt; i++)
        {
            acc_dc += pu1_ref[i];
            acc_dc += pu1_ref_tmp[i];
        }
        dc_val = (acc_dc + nt) >> (log2nt_plus1);
        two_dc_val = 2 * dc_val;
        three_dc_val = 3 * dc_val;
        three_dc_val = three_dc_val + 2;
        dc_val_t = vdup_n_u8(dc_val);

        if(nt == 32)
        {
            pu1_dst_tmp = pu1_dst;
            for(row = 0; row < nt; row++)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
                    pu1_dst_tmp += 4;
                }
                pu1_dst_tmp += dst_strd - nt;
            }
        }
        else
        {
            for(col = 1; col < nt; col++)
            {
                pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val) >> 2;
            }

            pu1_dst_tmp = pu1_dst + dst_strd;
            /* Since the first row is already updated above, the loop count is nt - 1 */
            for(row = nt - 1; row > 0; row -= 1)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
                    pu1_dst_tmp += 4;
                }
                pu1_dst_tmp += dst_strd - nt;
            }

            for(row = 1; row < nt; row++)
            {
                pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val) >> 2;
            }
            pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
        }
    }
}
/* INTRA_PRED_LUMA_DC */

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for luma horz
*
* @par Description:
*  Horizontal intraprediction with reference neighboring samples location
*  pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source (reference samples)
*
* @param[in] src_strd
*  integer source stride
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intra prediction mode
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
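
/* Scalar sketch of the rule implemented below: every row is a copy of its
 * left neighbour,
 *
 *     pu1_dst[row * dst_strd + col] = pu1_ref[2*nt - 1 - row];
 *
 * and for nt < 32 the first row is additionally gradient-filtered
 * (CLIP_U8() denoting clipping to [0, 255]) as
 *
 *     pu1_dst[col] = CLIP_U8(pu1_ref[2*nt - 1]
 *                    + ((pu1_ref[2*nt + 1 + col] - pu1_ref[2*nt]) >> 1));
 */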

void ihevc_intra_pred_luma_horz_neonintr(UWORD8 *pu1_ref,
                                         WORD32 src_strd,
                                         UWORD8 *pu1_dst,
                                         WORD32 dst_strd,
                                         WORD32 nt,
                                         WORD32 mode)
{
    WORD32 row, col;
    WORD32 two_nt;
    UNUSED(src_strd);
    UNUSED(mode);

    two_nt = 2 * nt;

    UWORD8 *pu1_dst_tmp = pu1_dst;
    UWORD32 pu1_val;
    uint8x8_t pu1_val_two_nt_1_row;
    if(nt == 32)
    {
        pu1_dst_tmp = pu1_dst;
        for(row = 0; row < nt; row++)
        {
            pu1_val = pu1_ref[two_nt - 1 - row];
            pu1_val_two_nt_1_row = vdup_n_u8(pu1_val);
            for(col = nt; col > 0; col -= 8)
            {
                vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_row);
                pu1_dst_tmp += 8;
            }
            pu1_dst_tmp += dst_strd - nt;
        }
    }
    else
    /* The row loop has been unrolled, hence the pu1_ref_val1 and pu1_ref_val2 variables.                */
    /* Variables are named according to the operation (instruction) they perform                         */
    /* (e.g. shift_val contains the shifted value, add_sat the add-and-saturate result).                 */
    /* Loops are unrolled by 4 and 8, considering the fact that the input width is a multiple of 4 or 8. */
    /* Rows and columns are unrolled by 4 when the width is a multiple of 4.                             */
    {
        if(0 != (nt & 7))      /* cond for multiple of 4 */
        {
            UWORD8 *pu1_ref_4_two_nt_plus1 = pu1_ref;
            UWORD8 *pu1_ref_4_two_nt_minus_nt = pu1_ref;
            UWORD8 *pu1_dst_4 = pu1_dst;
            UWORD8 *pu1_dst_4_tmp = pu1_dst;

            uint32x2_t pu1_ref_val1, pu1_ref_val2;
            uint8x8_t dup_sub, round_val, dup_val;
            uint16x8_t dup_add, sub_val;
            int16x8_t shift_val, add_sat;

            pu1_ref_val1 = vdup_n_u32(0);
            pu1_ref_val2 = vdup_n_u32(0);

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);

            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);

            pu1_ref_4_two_nt_plus1 += (two_nt + 1);

            pu1_ref_4_two_nt_minus_nt += (two_nt - nt);

            for(row = nt; row > 0; row -= 4)
            {
                for(col = nt; col > 0; col -= 4)
                {
                    pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_plus1, pu1_ref_val1, 0);
                    sub_val = vsubl_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_sub);
                    shift_val  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);

                    add_sat = vqaddq_s16(shift_val, vreinterpretq_s16_u16(dup_add));
                    round_val = vqmovun_s16(add_sat);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(round_val), 0);
                    pu1_dst_4 += dst_strd;

                    pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_minus_nt, pu1_ref_val2, 0);
                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 2);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;

                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 1);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;

                    dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 0);
                    vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
                    pu1_dst_4 += dst_strd;
                }
                /* worst cases */
                pu1_ref_4_two_nt_minus_nt += 3;
                pu1_ref_4_two_nt_plus1 += 4;
                pu1_dst_4 = (pu1_dst_4_tmp + 4);
            }
        }

        /* dup_1 - dup_8 are variables that hold the duplicated values from the loaded source.            */
        /* Variables are named according to the operation (instruction) they perform.                     */
        /* Loops are unrolled by 4 and 8, considering the fact that the input width is a multiple of 4 or 8. */
        /* Rows and columns are unrolled by 8 when the width is a multiple of 8.                           */
        else
        {
            UWORD8 *pu1_ref_tmp_1 = pu1_ref;
            UWORD8 *pu1_ref_tmp_2 = pu1_ref;

            UWORD8 *pu1_dst_tmp_1 = pu1_dst;
            UWORD8 *pu1_dst_tmp_2 = pu1_dst + dst_strd;
            UWORD8 *pu1_dst_tmp_3 = pu1_dst + dst_strd;

            uint8x8_t dup_sub, src_tmp, src_tmp_1, round_val, dup_1, dup_2, dup_3, dup_4, dup_5, dup_6, dup_7, dup_8, rev_res;
            uint16x8_t sub_res, dup_add;
            int16x8_t shift_res, add_res;

            dup_sub = vdup_n_u8(pu1_ref[two_nt]);
            dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);

            pu1_ref_tmp_1 += (two_nt + 1);
            pu1_ref_tmp_2 += (two_nt - 1);

            for(col = nt; col > 0; col -= 8)
            {
                src_tmp = vld1_u8(pu1_ref_tmp_1);
                pu1_ref_tmp_1 += 8;

                sub_res = vsubl_u8(src_tmp, dup_sub);
                shift_res  = vshrq_n_s16(vreinterpretq_s16_u16(sub_res), 1);
                add_res = vqaddq_s16(shift_res, vreinterpretq_s16_u16(dup_add));
                round_val = vqmovun_s16(add_res);
                vst1_u8(pu1_dst_tmp_1, round_val);
                pu1_dst_tmp_1 += 8;
            }

            for(row = nt; row > 0; row -= 8)
            {
                pu1_ref_tmp_2 -= 8;

                src_tmp_1 = vld1_u8(pu1_ref_tmp_2);
                rev_res = vrev64_u8(src_tmp_1); /* Reversing the loaded values */

                dup_1 = vdup_lane_u8(rev_res, 0);
                dup_2 = vdup_lane_u8(rev_res, 1);
                dup_3 = vdup_lane_u8(rev_res, 2);
                dup_4 = vdup_lane_u8(rev_res, 3);
                dup_5 = vdup_lane_u8(rev_res, 4);
                dup_6 = vdup_lane_u8(rev_res, 5);
                dup_7 = vdup_lane_u8(rev_res, 6);
                dup_8 = vdup_lane_u8(rev_res, 7);

                for(col = nt; col > 0; col -= 8)
                {
                    pu1_dst_tmp_2 = pu1_dst_tmp_3;

                    vst1_u8(pu1_dst_tmp_2, dup_1);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_2);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_3);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_4);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_5);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_6);
                    pu1_dst_tmp_2 += dst_strd;

                    vst1_u8(pu1_dst_tmp_2, dup_7);
                    pu1_dst_tmp_2 += dst_strd;

                    /* For the last set of 8 rows only 7 rows need to be updated since the first row is already written */
                    if(row != 8)
                        vst1_u8(pu1_dst_tmp_2, dup_8);
                    pu1_dst_tmp_2 += dst_strd;

                    pu1_dst_tmp_3 += 8;
                }
                pu1_dst_tmp_2 -= (nt - 8);
                pu1_dst_tmp_3 = pu1_dst_tmp_2;
            }
        }
    }
}
/* INTRA_PRED_LUMA_HORZ */

/**
*******************************************************************************
*
* @brief
*  Intra prediction interpolation filter for luma ver
*
* @par Description:
*  Vertical intraprediction with reference neighboring samples location
*  pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
*
* @param[in] pu1_ref
*  UWORD8 pointer to the source (reference samples)
*
* @param[in] src_strd
*  integer source stride
*
* @param[out] pu1_dst
*  UWORD8 pointer to the destination
*
* @param[in] dst_strd
*  integer destination stride
*
* @param[in] nt
*  integer Transform Block size
*
* @param[in] mode
*  integer intra prediction mode
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
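
/* Scalar sketch of the rule implemented below: every column is a copy of the
 * sample above it,
 *
 *     pu1_dst[row * dst_strd + col] = pu1_ref[2*nt + 1 + col];
 *
 * and for nt < 32 the first column is additionally gradient-filtered as
 *
 *     pu1_dst[row * dst_strd] = CLIP_U8(pu1_ref[2*nt + 1]
 *                      + ((pu1_ref[2*nt - 1 - row] - pu1_ref[2*nt]) >> 1));
 */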
1409 
ihevc_intra_pred_luma_ver_neonintr(UWORD8 * pu1_ref,WORD32 src_strd,UWORD8 * pu1_dst,WORD32 dst_strd,WORD32 nt,WORD32 mode)1410 void ihevc_intra_pred_luma_ver_neonintr(UWORD8 *pu1_ref,
1411                                         WORD32 src_strd,
1412                                         UWORD8 *pu1_dst,
1413                                         WORD32 dst_strd,
1414                                         WORD32 nt,
1415                                         WORD32 mode)
1416 {
1417     WORD32 row, col;
1418     WORD32 two_nt;
1419     UNUSED(src_strd);
1420     UNUSED(mode);
1421 
1422     two_nt = 2 * nt;
1423 
1424     UWORD8 *pu1_dst_tmp = pu1_dst;
1425     UWORD8 *pu1_ref_tmp_1 = pu1_ref + two_nt + 1;
1426     uint8x8_t pu1_val_two_nt_1_col;
1427     if(nt == 32)
1428     {
1429         pu1_dst_tmp = pu1_dst;
1430         for(row = 0; row < nt; row++)
1431         {
1432             for(col = nt; col > 0; col -= 8)
1433             {
1434                 pu1_val_two_nt_1_col = vld1_u8(pu1_ref_tmp_1);
1435                 pu1_ref_tmp_1 += 8;
1436                 vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_col);
1437                 pu1_dst_tmp += 8;
1438             }
1439             pu1_ref_tmp_1 -= nt;
1440             pu1_dst_tmp += dst_strd - nt;
1441         }
1442     }
1443     else
1444     {
1446         /* Variables are named according to the operation (instruction) they perform         */
1447         /* (e.g. shift_val holds the shifted value,                                          */
1448         /* add_sat holds the add-and-saturate result)                                        */
1449         /* Loops are unrolled by 4 and 8 since the input width is a multiple of 4 or 8       */
1450         /* rows and columns are unrolled by 4 when the width is a multiple of 4              */
1451 
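        /* Sketch of the first-column filtering unrolled below (scalar form
         * quoted in the loop comments):
         *     s2_predpixel = pu1_ref[two_nt + 1]
         *         + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
         * Each row's filtered value is brought into the top lane with
         * vshl_n_u64 and merged as the first output byte via vext, while the
         * remaining bytes copy the reference row above the block.
         */
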
1452         if(0 != (nt & 7))
1453         {
1454             WORD32 cond_4 = 0;
1455             UWORD8 *pu1_ref_val1 = pu1_ref;
1456             UWORD8 *pu1_ref_val2 = pu1_ref;
1457             UWORD8 *pu1_ref_val3 = pu1_ref;
1458 
1459             UWORD8 *pu1_dst_val1 = pu1_dst;
1460             UWORD8 *pu1_dst_val2 = pu1_dst;
1461             UWORD8 *pu1_dst_val3 = pu1_dst;
1462 
1463             uint8x8_t dup_2_sub, round_val, vext_val;
1464             uint16x8_t dup_2_add;
1465             uint32x2_t src_val1, src_val2, src_val3;
1466             uint16x8_t sub_val;
1467             int16x8_t shift_val1, add_sat;
1468             uint64x1_t shift_val2;
1469 
1470             src_val1 = vdup_n_u32(0);
1471             src_val2 = vdup_n_u32(0);
1472             src_val3 = vdup_n_u32(0);
1473             pu1_ref_val1 += (two_nt - nt);
1474             pu1_ref_val3 += (two_nt + 2);
1475             pu1_ref_val2 += (two_nt + 1);
1476 
1477             dup_2_sub = vdup_n_u8(pu1_ref[two_nt]);
1478             dup_2_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
1479 
1480             /* loops to store the first nt sets of values in the destination */
1481 
1482             for(row = nt; row > 0; row -= 4)
1483             {
1484                 for(col = nt; (col > 0) && (cond_4 == 0); col -= 4)
1485                 {
1486                     /*  unrolling s2_predpixel = pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1); here*/
1487                     src_val1 = vld1_lane_u32((uint32_t *)pu1_ref_val1, src_val1, 1);
1488                     sub_val = vsubl_u8(vreinterpret_u8_u32(src_val1), dup_2_sub);
1489                     shift_val1  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
1490                     add_sat = vqaddq_s16(shift_val1, vreinterpretq_s16_u16(dup_2_add));
1491                     round_val = vqmovun_s16(add_sat);
1492 
1493                     /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/
1494                     src_val2 = vld1_lane_u32((uint32_t *)pu1_ref_val3, src_val2, 0);
1495                     vext_val = vext_u8(round_val, vreinterpret_u8_u32(src_val2), 7);
1496                     vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
1497                     pu1_dst_val1 += dst_strd;
1498 
1499                     shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
1500 
1501                     vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
1502                     vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
1503                     pu1_dst_val1 += dst_strd;
1504 
1505                     shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
1506 
1507                     vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
1508                     vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
1509                     pu1_dst_val1 += dst_strd;
1510 
1511                     shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
1512 
1513                     vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
1514                     vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
1515                     pu1_dst_val1 += dst_strd;
1516 
1517                     pu1_ref_val1  -= 4;
1518                 }
1519 
1520                 /* loop to store next sets of eight values in the destination */
1521 
1522                 for(col = nt - 3; (col > 0) && (cond_4 == 1); col -= 4)
1523                 {
1524                     src_val3 = vld1_lane_u32((uint32_t *)pu1_ref_val2, src_val3, 0);
1525 
1526                     vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
1527                     pu1_dst_val2 += dst_strd;
1528 
1529                     vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
1530                     pu1_dst_val2 += dst_strd;
1531 
1532                     vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
1533                     pu1_dst_val2 += dst_strd;
1534 
1535                     vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
1536                     pu1_dst_val2 += dst_strd;
1537                 }
1538                 pu1_ref_val2 += 4;
1539                 pu1_dst_val3 += 4;
1540                 pu1_dst_val2 = pu1_dst_val3;
1541                 cond_4 = 1;
1542             }
1543         }
1544 
1545         /* rows and columns are unrolled by 8, when the width is multiple of 8          */
1546         else
1547         {
1548             WORD32 cond = 0, col_1;
1549             UWORD8 *pu1_dst_tmp_1 = pu1_dst;
1550             UWORD8 *pu1_dst_tmp_2 = pu1_dst;
1551             UWORD8 *pu1_dst_tmp_3 = pu1_dst;
1552 
1553             UWORD8 *pu1_ref_tmp_1 = pu1_ref;
1554             UWORD8 *pu1_ref_tmp_2 = pu1_ref;
1555             UWORD8 *pu1_ref_tmp_3 = pu1_ref;
1556 
1557             uint8x8_t pu1_src_tmp1;
1558             uint8x8_t pu1_src_tmp2;
1559 
1560             uint8x8_t dup_sub;
1561             uint16x8_t dup_add;
1562             int16x8_t subsh_val;
1563             int16x8_t addsat_val;
1564             uint16x8_t sub_val;
1565             uint8x8_t round_val;
1566             uint8x8_t vext_t;
1567             uint64x1_t shift_64;
1568 
1569             dup_sub = vdup_n_u8(pu1_ref[two_nt]);
1570             dup_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
1571 
1572             pu1_ref_tmp_1 += (two_nt);
1573             pu1_ref_tmp_1 -= 8;
1574             pu1_ref_tmp_2 += (two_nt + 2);
1575             pu1_ref_tmp_3 += (two_nt + 1);
1576 
1577             /* loops to store the first nt sets of values in the destination */
1578 
1579             for(row = nt; row > 0; row -= 8)
1580             {
1581                 for(col = (nt - 1); (col > 0) && (cond == 0); col -= 8)
1582                 {
1583                     pu1_src_tmp1 = vld1_u8(pu1_ref_tmp_1);
1584 
1585                     sub_val = vsubl_u8(pu1_src_tmp1, dup_sub);
1586                     subsh_val  = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
1587                     addsat_val = vqaddq_s16(subsh_val, vreinterpretq_s16_u16(dup_add));
1588                     round_val = vqmovun_s16(addsat_val);
1589 
1590                     /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/
1591 
1592                     pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_2);
1593                     vext_t = vext_u8(round_val, pu1_src_tmp2, 7);
1594                     vst1_u8(pu1_dst_tmp_1, vext_t);
1595                     pu1_dst_tmp_1 += dst_strd;
1596 
1597                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
1598 
1599                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1600                     vst1_u8(pu1_dst_tmp_1, vext_t);
1601                     pu1_dst_tmp_1 += dst_strd;
1602 
1603                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
1604                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1605                     vst1_u8(pu1_dst_tmp_1, vext_t);
1606                     pu1_dst_tmp_1 += dst_strd;
1607 
1608                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
1609                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1610                     vst1_u8(pu1_dst_tmp_1, vext_t);
1611                     pu1_dst_tmp_1 += dst_strd;
1612 
1613                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 32);
1614                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1615                     vst1_u8(pu1_dst_tmp_1, vext_t);
1616                     pu1_dst_tmp_1 += dst_strd;
1617 
1618                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 40);
1619                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1620                     vst1_u8(pu1_dst_tmp_1, vext_t);
1621                     pu1_dst_tmp_1 += dst_strd;
1622 
1623                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 48);
1624                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1625                     vst1_u8(pu1_dst_tmp_1, vext_t);
1626                     pu1_dst_tmp_1 += dst_strd;
1627 
1628                     shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 56);
1629                     vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1630                     vst1_u8(pu1_dst_tmp_1, vext_t);
1631                     pu1_dst_tmp_1 += dst_strd;
1632 
1633                     pu1_ref_tmp_1 -= 8;
1634                 }
1635 
1636                 /* loop to store next sets of eight values in the destination */
1637 
1638                 for(col_1 = nt - 7; (col_1 > 0) && (cond == 1); col_1 -= 8)
1639                 {
1640                     pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_3);
1641 
1642                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1643                     pu1_dst_tmp_2 += dst_strd;
1644 
1645                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1646                     pu1_dst_tmp_2 += dst_strd;
1647 
1648                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1649                     pu1_dst_tmp_2 += dst_strd;
1650 
1651                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1652                     pu1_dst_tmp_2 += dst_strd;
1653 
1654                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1655                     pu1_dst_tmp_2 += dst_strd;
1656 
1657                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1658                     pu1_dst_tmp_2 += dst_strd;
1659 
1660                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1661                     pu1_dst_tmp_2 += dst_strd;
1662 
1663                     vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1664                     pu1_dst_tmp_2 += dst_strd;
1665                 }
1666                 pu1_ref_tmp_3 += 8;
1667                 pu1_dst_tmp_3 += 8;
1668                 pu1_dst_tmp_2 = pu1_dst_tmp_3;
1669                 cond = 1;
1670             }
1671         }
1672     }
1673 }
1674 /* INTRA_PRED_LUMA_VER */
1675 
1676 /**
1677 *******************************************************************************
1678 *
1679 * @brief
1680 *    Intra prediction interpolation filter for luma mode2.
1681 *
1682 * @par Description:
1683 *    Intraprediction for mode 2 (sw angle) with reference  neighboring samples
1684 *    location pointed by 'pu1_ref' to the  TU block location pointed by
1685 *    'pu1_dst'
1686 *
1687 * @param[in] pu1_ref
1688 *  UWORD8 pointer to the reference
1689 *
1690 * @param[out] pu1_dst
1691 *  UWORD8 pointer to the destination
1692 *
1693 * @param[in] src_strd
1694 *  integer source stride
1695 *
1696 * @param[in] dst_strd
1697 *  integer destination stride
1698 *
1699 * @param[in] nt
1700 *  integer Transform Block size
1701 *
1702 * @param[in] mode
1703 *  integer intraprediction mode
1704 *
1705 * @returns
1706 *
1707 * @remarks
1708 *  None
1709 *
1710 *******************************************************************************
1711 */
1712 
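/* A minimal scalar sketch of mode 2 (45 degree sw diagonal), matching the
 * unrolled form quoted in the loop comments below:
 *
 *     for(row = 0; row < nt; row++)
 *         for(col = 0; col < nt; col++)
 *             pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt - row - col - 2];
 *
 * The NEON paths realise this by loading the left reference backwards,
 * reversing it with vrev64_u8 and shifting it by one sample per row.
 */
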
1713 void ihevc_intra_pred_luma_mode2_neonintr(UWORD8 *pu1_ref,
1714                                           WORD32 src_strd,
1715                                           UWORD8 *pu1_dst,
1716                                           WORD32 dst_strd,
1717                                           WORD32 nt,
1718                                           WORD32 mode)
1719 {
1720 
1721     WORD32 row, col;
1722     WORD32 two_nt;
1723     UNUSED(src_strd);
1724     UNUSED(mode);
1725 
1726     /* rev_res is named so because it holds the reversed result value                    */
1727     /* Loops are unrolled by 4 and 8 since the input width is a multiple of 4 or 8       */
1728     /* rows and columns are unrolled by 4 when the width is a multiple of 4              */
1729 
1730     if(0 != (nt & 7))
1731     {
1732         UWORD8 *pu1_ref_tmp = pu1_ref;
1733         UWORD8 *pu1_dst_tmp = pu1_dst;
1734         uint8x8_t pu1_src_val, rev_res;
1735         uint64x1_t shift_res;
1736 
1737         for(col = nt; col > 0; col -= 4)
1738         {
1739             for(row = nt; row > 0; row -= 4)
1740             {
1741                 /* unrolling all col & rows for pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - idx - 1]; */
1742 
1743                 pu1_src_val = vld1_u8(pu1_ref_tmp);
1744                 shift_res = vshl_n_u64(vreinterpret_u64_u8(pu1_src_val), 8);
1745                 rev_res = vrev64_u8(vreinterpret_u8_u64(shift_res));
1746 
1747                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(rev_res), 0);
1748                 pu1_dst_tmp += dst_strd;
1749 
1750                 shift_res = vshr_n_u64(vreinterpret_u64_u8(rev_res), 8);
1751                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
1752                 pu1_dst_tmp += dst_strd;
1753 
1754                 shift_res = vshr_n_u64(shift_res, 8);
1755                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
1756                 pu1_dst_tmp += dst_strd;
1757 
1758                 shift_res = vshr_n_u64(shift_res, 8);
1759                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
1760                 pu1_dst_tmp += dst_strd;
1761             }
1762         }
1763     }
1764 
1765     /* rev_val_second, rev_val_first reverse the loaded values to get them in the right order */
1766     /* shift_val shifts the reversed 2nd set of values to get the value we need               */
1767     /* rows and columns are unrolled by 8 when the width is a multiple of 8                   */
1768 
1769     else
1770     {
1771         UWORD8 *pu1_ref_two_nt_minus2 = pu1_ref;
1772         UWORD8 *pu1_dst_tmp = pu1_dst;
1773         UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
1774 
1775         uint8x8_t pu1_src_val1, pu1_src_val2, vext_t, rev_val_second, rev_val_first;
1776         uint64x1_t shift_val;
1777 
1778         two_nt = 2 * nt;
1779         pu1_ref_two_nt_minus2 += (two_nt);
1780         pu1_ref_two_nt_minus2 -= 8;
1781 
1782         for(col = nt; col > 0; col -= 8)
1783         {
1784             for(row = nt; row > 0; row -= 8)
1785             {
1786                 pu1_src_val2 = vld1_u8(pu1_ref_two_nt_minus2);
1787                 rev_val_first = vrev64_u8(pu1_src_val2);
1788 
1789                 pu1_ref_two_nt_minus2 -= 8;
1790                 pu1_src_val1 = vld1_u8(pu1_ref_two_nt_minus2);
1791                 rev_val_second = vrev64_u8(pu1_src_val1);
1792 
1793                 vext_t = vext_u8(rev_val_first, rev_val_second, 1);
1794                 vst1_u8(pu1_dst_tmp, vext_t);
1795                 pu1_dst_tmp += dst_strd;
1796 
1797                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 8);
1798                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1799                 vst1_u8(pu1_dst_tmp, vext_t);
1800                 pu1_dst_tmp += dst_strd;
1801 
1802                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 16);
1803                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1804                 vst1_u8(pu1_dst_tmp, vext_t);
1805                 pu1_dst_tmp += dst_strd;
1806 
1807                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 24);
1808                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1809                 vst1_u8(pu1_dst_tmp, vext_t);
1810                 pu1_dst_tmp += dst_strd;
1811 
1812                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 32);
1813                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1814                 vst1_u8(pu1_dst_tmp, vext_t);
1815                 pu1_dst_tmp += dst_strd;
1816 
1817                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 40);
1818                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1819                 vst1_u8(pu1_dst_tmp, vext_t);
1820                 pu1_dst_tmp += dst_strd;
1821 
1822                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 48);
1823                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1824                 vst1_u8(pu1_dst_tmp, vext_t);
1825                 pu1_dst_tmp += dst_strd;
1826 
1827                 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 56);
1828                 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1829                 vst1_u8(pu1_dst_tmp, vext_t);
1830                 pu1_dst_tmp += dst_strd;
1831             }
1832             pu1_dst_tmp_plus8 += 8;
1833             pu1_dst_tmp = pu1_dst_tmp_plus8;
1834             pu1_ref_two_nt_minus2 += (nt - 8);
1835         }
1836     }
1837 }
1838 /* INTRA_PRED_LUMA_MODE2 */
1839 
1840 /**
1841 *******************************************************************************
1842 *
1843 * @brief
1844 *   Intra prediction interpolation filter for luma mode 18 & mode 34.
1845 *
1846 * @par Description:
1847 *    Intraprediction for mode 18 and mode 34 (ne angle) with reference
1848 *    neighboring samples location pointed by 'pu1_ref' to the TU block
1849 *    location pointed by 'pu1_dst'
1850 *
1851 * @param[in] pu1_ref
1852 *  UWORD8 pointer to the reference
1853 *
1854 * @param[out] pu1_dst
1855 *  UWORD8 pointer to the destination
1856 *
1857 * @param[in] src_strd
1858 *  integer source stride
1859 *
1860 * @param[in] dst_strd
1861 *  integer destination stride
1862 *
1863 * @param[in] nt
1864 *  integer Transform Block size
1865 *
1866 * @param[in] mode
1867 *  integer intraprediction mode
1868 *
1869 * @returns
1870 *
1871 * @remarks
1872 *  None
1873 *
1874 *******************************************************************************
1875 */
1876 
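/* A minimal scalar sketch of modes 18 and 34, matching the unrolled form
 * quoted in the loop comments below; intraPredAngle is -32 for mode 18 and
 * +32 for mode 34, so idx steps by -1 or +1 per row:
 *
 *     for(row = 0; row < nt; row++)
 *     {
 *         idx = ((row + 1) * intraPredAngle) >> 5;
 *         for(col = 0; col < nt; col++)
 *             pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1];
 *     }
 */
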
1877 void ihevc_intra_pred_luma_mode_18_34_neonintr(UWORD8 *pu1_ref,
1878                                                WORD32 src_strd,
1879                                                UWORD8 *pu1_dst,
1880                                                WORD32 dst_strd,
1881                                                WORD32 nt,
1882                                                WORD32 mode)
1883 {
1884 
1885     WORD32 row, col, idx;
1886     WORD32 intraPredAngle = 32;
1887     WORD32 two_nt;
1888     UNUSED(src_strd);
1889     two_nt = 2 * nt;
1890 
1891     UWORD8 *pu1_ref_tmp = pu1_ref;
1892     UWORD8 *pu1_ref_tmp1 = pu1_ref;
1893     UWORD8 *pu1_dst_tmp = pu1_dst;
1894     UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
1895 
1896     uint8x8_t src_tmp_1st, src_tmp_2nd, vext1, vext2, vext3, vext4, vext5, vext6, vext7;
1897 
1898     /* src_tmp_1st, src_tmp_2nd hold the 1st eight and the next 8 values loaded from the source (pu1_ref) */
1899     /* vext1 - vext7 hold the vext results between the 2 loaded values, arranged to allow dual issue      */
1900     /* Loops are unrolled by 4 and 8 since the input width is a multiple of 4 or 8                        */
1901     /* rows and columns are unrolled by 8 when the width is a multiple of 8                               */
1902     /* loops are maintained separately for mode 18 and mode 34                                            */
1903 
1904     /* cond to allow multiples of 8 */
1905     if(0 == (nt & 7))
1906     {
1907         if(mode == 34)
1908         {
1909             pu1_ref_tmp += (two_nt + 2);
1910 
1911             for(row = nt; row > 0; row -= 8)
1912             {
1913                 for(col = nt; col > 0; col -= 8)
1914                 {
1915                     /* Loading 1st eight values */
1916                     src_tmp_1st = vld1_u8(pu1_ref_tmp);
1917                     pu1_ref_tmp += 8;
1918 
1919                     /* Loading next eight values */
1920                     src_tmp_2nd = vld1_u8(pu1_ref_tmp);
1921 
1922                     /* UNROLLED  pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
1923                     vext1 = vext_u8(src_tmp_1st, src_tmp_2nd, 1);
1924                     vst1_u8(pu1_dst_tmp, src_tmp_1st);
1925                     pu1_dst_tmp += dst_strd;
1926 
1927                     vext2 = vext_u8(src_tmp_1st, src_tmp_2nd, 2);
1928                     vst1_u8(pu1_dst_tmp, vext1);
1929                     pu1_dst_tmp += dst_strd;
1930 
1931                     vext3 = vext_u8(src_tmp_1st, src_tmp_2nd, 3);
1932                     vst1_u8(pu1_dst_tmp, vext2);
1933                     pu1_dst_tmp += dst_strd;
1934 
1935                     vext4 = vext_u8(src_tmp_1st, src_tmp_2nd, 4);
1936                     vst1_u8(pu1_dst_tmp, vext3);
1937                     pu1_dst_tmp += dst_strd;
1938 
1939                     vext5 = vext_u8(src_tmp_1st, src_tmp_2nd, 5);
1940                     vst1_u8(pu1_dst_tmp, vext4);
1941                     pu1_dst_tmp += dst_strd;
1942 
1943                     vext6 = vext_u8(src_tmp_1st, src_tmp_2nd, 6);
1944                     vst1_u8(pu1_dst_tmp, vext5);
1945                     pu1_dst_tmp += dst_strd;
1946 
1947                     vext7 = vext_u8(src_tmp_1st, src_tmp_2nd, 7);
1948                     vst1_u8(pu1_dst_tmp, vext6);
1949                     pu1_dst_tmp += dst_strd;
1950 
1951                     vst1_u8(pu1_dst_tmp, vext7);
1952                     pu1_dst_tmp += dst_strd;
1953                 }
1954 
1955                 pu1_dst_tmp_plus8 += 8;
1956                 pu1_dst_tmp = pu1_dst_tmp_plus8;
1957                 pu1_ref_tmp -= (nt - 8);
1958             }
1959         }
1960         else /* Loop for mode 18 */
1961         {
1962             pu1_ref_tmp += (two_nt);
1963 
1964             for(row = nt; row > 0; row -= 8)
1965             {
1966                 for(col = nt; col > 0; col -= 8)
1967                 {
1968                     /* Loading 1st eight values */
1969                     src_tmp_1st = vld1_u8(pu1_ref_tmp);
1970                     pu1_ref_tmp -= 8;
1971 
1972                     /* Loading next eight values */
1973                     src_tmp_2nd = vld1_u8(pu1_ref_tmp);
1974 
1975                     /* UNROLLED  pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
1976                     vext1 = vext_u8(src_tmp_2nd, src_tmp_1st, 7);
1977                     vst1_u8(pu1_dst_tmp, src_tmp_1st);
1978                     pu1_dst_tmp += dst_strd;
1979 
1980                     vext2 = vext_u8(src_tmp_2nd, src_tmp_1st, 6);
1981                     vst1_u8(pu1_dst_tmp, vext1);
1982                     pu1_dst_tmp += dst_strd;
1983 
1984                     vext3 = vext_u8(src_tmp_2nd, src_tmp_1st, 5);
1985                     vst1_u8(pu1_dst_tmp, vext2);
1986                     pu1_dst_tmp += dst_strd;
1987 
1988                     vext4 = vext_u8(src_tmp_2nd, src_tmp_1st, 4);
1989                     vst1_u8(pu1_dst_tmp, vext3);
1990                     pu1_dst_tmp += dst_strd;
1991 
1992                     vext5 = vext_u8(src_tmp_2nd, src_tmp_1st, 3);
1993                     vst1_u8(pu1_dst_tmp, vext4);
1994                     pu1_dst_tmp += dst_strd;
1995 
1996                     vext6 = vext_u8(src_tmp_2nd, src_tmp_1st, 2);
1997                     vst1_u8(pu1_dst_tmp, vext5);
1998                     pu1_dst_tmp += dst_strd;
1999 
2000                     vext7 = vext_u8(src_tmp_2nd, src_tmp_1st, 1);
2001                     vst1_u8(pu1_dst_tmp, vext6);
2002                     pu1_dst_tmp += dst_strd;
2003 
2004                     vst1_u8(pu1_dst_tmp, vext7);
2005                     pu1_dst_tmp += dst_strd;
2006                 }
2007                 pu1_dst_tmp_plus8 += 8;
2008                 pu1_dst_tmp = pu1_dst_tmp_plus8;
2009                 pu1_ref_tmp += (nt + 8);
2010             }
2011         }
2012     }
2013 
2014     /* rows and columns are unrolled by 4 when the width is a multiple of 4  */
2015 
2016     else /* loop for multiples of 4 */
2017     {
2018         uint8x8_t src_val1;
2019         uint8x8_t src_val2;
2020 
2021         if(mode == 18)
2022             intraPredAngle = -32;
2023         else if(mode == 34)
2024             intraPredAngle = 32;
2025 
2026         for(row = 0; row < nt; row += 2)
2027         {
2028             /* unrolling 2 rows */
2029             idx = ((row + 1) * intraPredAngle) >> 5;
2030             pu1_ref_tmp = pu1_ref + two_nt + idx + 1;
2031             src_val1 = vld1_u8(pu1_ref_tmp);
2032 
2033             idx = ((row + 2) * intraPredAngle) >> 5;
2034             pu1_ref_tmp1 = pu1_ref + two_nt + idx + 1;
2035             src_val2 = vld1_u8(pu1_ref_tmp1);
2036 
2037             /* unrolling 4 col */
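            /* Note: this path is reached only when nt is not a multiple of 8
             * (nt == 4 for the sizes used here), so the column loop below
             * executes exactly once and src_val1/src_val2 need not advance.
             */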
2038             for(col = nt; col > 0; col -= 4)
2039             {
2040                 pu1_dst_tmp = pu1_dst;
2041                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val1), 0);
2042                 pu1_dst_tmp += dst_strd;
2043                 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val2), 0);
2044                 pu1_dst += 4;
2045             }
2046             pu1_dst += 2 * dst_strd - nt;
2047         }
2048     }
2049 }
2050 /* INTRA_PRED_LUMA_MODE_18_34 */
2051 
2052 /**
2053  *******************************************************************************
2054  *
2055  * @brief
2056  *    Intra prediction interpolation filter for luma mode 3 to mode 9
2057  *
2058  * @par Description:
2059  *    Intraprediction for mode 3 to 9  (positive angle, horizontal mode ) with
2060  *    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
2061  *    block location pointed by 'pu1_dst'
2062  *
2063  * @param[in] pu1_ref
2064  *  UWORD8 pointer to the reference
2065  *
2066  * @param[out] pu1_dst
2067  *  UWORD8 pointer to the destination
2068  *
2069  * @param[in] src_strd
2070  *  integer source stride
2071  *
2072  * @param[in] dst_strd
2073  *  integer destination stride
2074  *
2075  * @param[in] nt
2076  *  integer Transform Block size
2077  *
2078  * @param[in] mode
2079  *  integer intraprediction mode
2080  *
2081  * @returns
2082  *
2083  * @remarks
2084  *  None
2085  *
2086  *******************************************************************************
2087  */
2088 
2089 
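/* A minimal scalar sketch of the two-tap filtering performed below. With
 * pos = (col + 1) * intra_pred_ang and fract = pos & 31, each destination
 * sample blends two adjacent reference samples:
 *
 *     pu1_dst[...] = (ref_main_idx * (32 - fract)
 *                   + ref_main_idx_1 * fract + 16) >> 5;
 *
 * which is what the vmull_u8 / vaddq_u16 / vrshrn_n_u16(.., 5) sequence
 * computes. Since these are horizontal modes, each vector of filtered
 * samples is written down a destination column one lane at a time, and the
 * fract_prev/fract comparison tracks when the integer sample offset changes
 * so the reference pointers can be nudged instead of recomputing pos >> 5.
 */
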
2090 void ihevc_intra_pred_luma_mode_3_to_9_neonintr(UWORD8 *pu1_ref,
2091                                                 WORD32 src_strd,
2092                                                 UWORD8 *pu1_dst,
2093                                                 WORD32 dst_strd,
2094                                                 WORD32 nt,
2095                                                 WORD32 mode)
2096 {
2097 
2098     WORD32 row, col;
2099     WORD32 intra_pred_ang;
2100     WORD32 pos, fract = 100, fract_prev;
2101     UNUSED(src_strd);
2102     if(0 == (nt & 7))
2103     {
2104 
2105         UWORD8 *pu1_ref_main_idx = pu1_ref;
2106         UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
2107 
2108         UWORD8 *pu1_dst_tmp1 = pu1_dst;
2109         UWORD8 *pu1_dst_tmp2 = pu1_dst;
2110 
2111         WORD32 two_nt = 2 * nt;
2112 
2113         pu1_ref_main_idx += two_nt;
2114         pu1_ref_main_idx_1 += two_nt - 1;
2115 
2116         uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
2117         uint8x8_t shift_res;
2118         uint16x8_t mul_res1, mul_res2, add_res;
2119 
2120         /* Intra Pred Angle according to the mode */
2121         intra_pred_ang = gai4_ihevc_ang_table[mode];
2122 
2123         pu1_ref_main_idx -= 8;
2124         pu1_ref_main_idx_1 -= 8;
2125 
2126         for(col = 0; col < nt; col++)
2127         {
2128             fract_prev = fract;
2129 
2130             pos = ((col + 1) * intra_pred_ang);
2131             fract = pos & (31);
2132 
2133             if(fract_prev < fract)
2134             {
2135                 pu1_ref_main_idx += 1;
2136                 pu1_ref_main_idx_1 += 1;
2137             }
2138 
2139             dup_const_fract = vdup_n_u8((uint8_t)fract);
2140             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2141 
2142             for(row = nt; row > 0; row -= 8)
2143             {
2144                 ref_main_idx = vld1_u8(pu1_ref_main_idx);
2145                 ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
2146 
2147                 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
2148                 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
2149 
2150                 add_res = vaddq_u16(mul_res1, mul_res2);
2151 
2152                 shift_res = vrshrn_n_u16(add_res, 5);
2153 
2154                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
2155                 pu1_dst_tmp1 += dst_strd;
2156 
2157                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
2158                 pu1_dst_tmp1 += dst_strd;
2159 
2160                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
2161                 pu1_dst_tmp1 += dst_strd;
2162 
2163                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
2164                 pu1_dst_tmp1 += dst_strd;
2165 
2166                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
2167                 pu1_dst_tmp1 += dst_strd;
2168 
2169                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
2170                 pu1_dst_tmp1 += dst_strd;
2171 
2172                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
2173                 pu1_dst_tmp1 += dst_strd;
2174 
2175                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
2176                 pu1_dst_tmp1 += dst_strd;
2177 
2178                 pu1_ref_main_idx -= 8;
2179                 pu1_ref_main_idx_1 -= 8;
2180 
2181             }
2182             pu1_dst_tmp2 += 1;
2183             pu1_dst_tmp1 = pu1_dst_tmp2;
2184 
2185             pu1_ref_main_idx += nt;
2186             pu1_ref_main_idx_1 += nt;
2187 
2188             pu1_ref_main_idx -= 1;
2189             pu1_ref_main_idx_1 -= 1;
2190 
2191         }
2192     }
2193     else
2194     {
2195         UWORD8 *pu1_ref_tmp1 = pu1_ref;
2196         UWORD8 *pu1_ref_tmp2 = pu1_ref;
2197         UWORD8 *pu1_dst_tmp1 = pu1_dst;
2198         UWORD8 *pu1_dst_tmp2 = pu1_dst;
2199 
2200         pu1_ref_tmp1 += nt;
2201         pu1_ref_tmp2 += (nt - 1);
2202 
2203         uint8x8_t dup_fract, dup_32_fract, shift_res;
2204         uint16x8_t mul_res1, mul_res2, add_res;
2205         uint32x2_t  pu1_ref_val1, pu1_ref_val2;
2206 
2207         pu1_ref_val1 = vdup_n_u32(0);
2208         pu1_ref_val2 = vdup_n_u32(0);
2209 
2210         /* Intra Pred Angle according to the mode */
2211         intra_pred_ang = gai4_ihevc_ang_table[mode];
2212 
2213 
2214         for(col = 0; col < nt; col++)
2215         {
2216             fract_prev = fract;
2217             pos = ((col + 1) * intra_pred_ang);
2218             fract = pos & (31);
2219             if(fract_prev < fract)
2220             {
2221                 pu1_ref_tmp1 += 1;
2222                 pu1_ref_tmp2 += 1;
2223             }
2224             dup_fract = vdup_n_u8((uint8_t)fract);
2225             dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2226 
2227             for(row = nt; row > 0; row -= 4)
2228             {
2229                 pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
2230                 pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
2231 
2232                 mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
2233                 mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
2234 
2235                 add_res = vaddq_u16(mul_res1, mul_res2);
2236 
2237                 shift_res = vrshrn_n_u16(add_res, 5);
2238 
2239                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
2240                 pu1_dst_tmp1 += dst_strd;
2241 
2242                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
2243                 pu1_dst_tmp1 += dst_strd;
2244 
2245                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
2246                 pu1_dst_tmp1 += dst_strd;
2247 
2248                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
2249 
2250             }
2251             pu1_ref_tmp1 -= 1;
2252             pu1_ref_tmp2 -= 1;
2253 
2254             pu1_dst_tmp2 += 1;
2255             pu1_dst_tmp1 = pu1_dst_tmp2;
2256 
2257         }
2258 
2259 
2260     }
2261 
2262 }
2263 
2264 /**
2265  *******************************************************************************
2266  *
2267  * @brief
2268  *   Intra prediction interpolation filter for luma mode 11 to mode 17
2269  *
2270  * @par Description:
2271  *    Intraprediction for mode 11 to 17  (negative angle, horizontal mode )
2272  *    with reference  neighboring samples location pointed by 'pu1_ref' to the
2273  *    TU block location pointed by 'pu1_dst'
2274  *
2275  * @param[in] pu1_ref
2276  *  UWORD8 pointer to the reference
2277  *
2278  * @param[out] pu1_dst
2279  *  UWORD8 pointer to the destination
2280  *
2281  * @param[in] src_strd
2282  *  integer source stride
2283  *
2284  * @param[in] dst_strd
2285  *  integer destination stride
2286  *
2287  * @param[in] nt
2288  *  integer Transform Block size
2289  *
2290  * @param[in] mode
2291  *  integer intraprediction mode
2292  *
2293  * @returns
2294  *
2295  * @remarks
2296  *  None
2297  *
2298  *******************************************************************************
2299  */
2300 
2301 
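/* Sketch of the reference preparation done below for these negative
 * horizontal angles: the left samples are copied (reversed) into ref_temp
 * as the main reference, which is then extended past its start using the
 * inverse angle, per section 8.4.4.2.6:
 *
 *     for(k = -1; k > ref_idx; k--)
 *     {
 *         inv_ang_sum += inv_ang;
 *         ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
 *     }
 */
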
2302 void ihevc_intra_pred_luma_mode_11_to_17_neonintr(UWORD8 *pu1_ref,
2303                                                   WORD32 src_strd,
2304                                                   UWORD8 *pu1_dst,
2305                                                   WORD32 dst_strd,
2306                                                   WORD32 nt,
2307                                                   WORD32 mode)
2308 {
2309 
2310     WORD32 row, col, k;
2311     WORD32 two_nt;
2312     WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
2313     WORD32 pos, fract = 1000, fract_prev;
2314     WORD32  ref_idx;
2315 
2316     UWORD8 *ref_main;
2317     UWORD8 *ref_main_tmp;
2318 
2319     UWORD8 *pu1_ref_tmp1 = pu1_ref;
2320     UWORD8 *pu1_ref_tmp2 = pu1_ref;
2321     UWORD8 *pu1_dst_tmp1 = pu1_dst;
2322     UWORD8 *pu1_dst_tmp2 = pu1_dst;
2323 
2324     UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];
2325 
2326     uint16x8_t mul_res1, mul_res2, add_res;
2327     uint8x8_t dup_const_fract, dup_const_32_fract;
2328     uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
2329     uint8x8_t ref_left_t;
2330     uint32x2_t  ref_left_tmp;
2331     UNUSED(src_strd);
2332     ref_left_tmp = vdup_n_u32(0);
2333 
2334     inv_ang_sum = 128;
2335     two_nt = 2 * nt;
2336 
2337     intra_pred_ang = gai4_ihevc_ang_table[mode];
2338 
2339     inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
2340 
2341     pu1_ref_tmp1 += two_nt;
2342 
2343     ref_main = ref_temp + (nt - 1);
2344     ref_main_tmp = ref_main;
2345 
2346     if(0 == (nt & 7))
2347     {
2348         pu1_ref_tmp2 += (two_nt - 7);
2349 
2350         for(k = nt - 1; k >= 0; k -= 8)
2351         {
2352 
2353             ref_left_t = vld1_u8(pu1_ref_tmp2);
2354 
2355             ref_left_t = vrev64_u8(ref_left_t);
2356             vst1_u8(ref_main_tmp, ref_left_t);
2357             ref_main_tmp += 8;
2358             pu1_ref_tmp2 -= 8;
2359 
2360         }
2361 
2362     }
2363     else
2364     {
2365         uint8x8_t rev_val;
2366         pu1_ref_tmp2 += (two_nt - (nt - 1));
2367 
2368         for(k = nt - 1; k >= 0; k -= 8)
2369         {
2370 
2371             ref_left_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, ref_left_tmp, 1);
2372 
2373             rev_val = vrev64_u8(vreinterpret_u8_u32(ref_left_tmp));
2374             vst1_lane_u32((uint32_t *)ref_main_tmp, vreinterpret_u32_u8(rev_val), 0);
2375 
2376         }
2377 
2378     }
2379 
2380     ref_main[nt] = pu1_ref[two_nt - nt];
2381 
2382     /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
2383 
2384     ref_idx = (nt * intra_pred_ang) >> 5;
2385 
2386     /* SIMD Optimization can be done using look-up table for the loop */
2387     /* For negative angles, derive the main reference samples from the side */
2388     /* reference samples; refer to section 8.4.4.2.6 */
2389     for(k = -1; k > ref_idx; k--)
2390     {
2391         inv_ang_sum += inv_ang;
2392         ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
2393     }
2394 
2395     UWORD8 *ref_main_tmp1 = ref_main;
2396     UWORD8 *ref_main_tmp2 = ref_main;
2397 
2398     ref_main_tmp2 += 1;
2399 
2400     if(0 == (nt & 7))
2401     {
2402         /* For angles other than 45 degrees, interpolate between 2 neighboring */
2403         /* samples, dependent on distance, to obtain the destination sample */
2404         for(col = 0; col < nt; col++)
2405         {
2406 
2407             fract_prev = fract;
2408             pos = ((col + 1) * intra_pred_ang);
2409             fract = pos & (31);
2410 
2411             if(fract_prev < fract)
2412             {
2413                 ref_main_tmp1 -= 1;
2414                 ref_main_tmp2 -= 1;
2415             }
2416 
2417             dup_const_fract = vdup_n_u8((uint8_t)fract);
2418             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2419 
2420             // Do linear filtering
2421             for(row = nt; row > 0; row -= 8)
2422             {
2423                 ref_main_idx = vld1_u8(ref_main_tmp1);
2424 
2425                 ref_main_idx_1 = vld1_u8(ref_main_tmp2);
2426 
2427                 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
2428                 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
2429 
2430                 add_res = vaddq_u16(mul_res1, mul_res2);
2431 
2432                 shift_res = vrshrn_n_u16(add_res, 5);
2433 
2434                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
2435                 pu1_dst_tmp1 += dst_strd;
2436 
2437                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
2438                 pu1_dst_tmp1 += dst_strd;
2439 
2440                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
2441                 pu1_dst_tmp1 += dst_strd;
2442 
2443                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
2444                 pu1_dst_tmp1 += dst_strd;
2445 
2446                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
2447                 pu1_dst_tmp1 += dst_strd;
2448 
2449                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
2450                 pu1_dst_tmp1 += dst_strd;
2451 
2452                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
2453                 pu1_dst_tmp1 += dst_strd;
2454 
2455                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
2456                 pu1_dst_tmp1 += dst_strd;
2457 
2458                 ref_main_tmp1 += 8;
2459                 ref_main_tmp2 += 8;
2460             }
2461 
2462             ref_main_tmp1 -= nt;
2463             ref_main_tmp2 -= nt;
2464 
2465             pu1_dst_tmp2 += 1;
2466             pu1_dst_tmp1 = pu1_dst_tmp2;
2467         }
2468     }
2469     else
2470     {
2471         uint32x2_t ref_main_idx1, ref_main_idx2;
2472 
2473         ref_main_idx1 = vdup_n_u32(0);
2474         ref_main_idx2 = vdup_n_u32(0);
2475 
2476         for(col = 0; col < nt; col++)
2477         {
2478             fract_prev = fract;
2479             pos = ((col + 1) * intra_pred_ang);
2480             fract = pos & (31);
2481 
2482             if(fract_prev < fract)
2483             {
2484                 ref_main_tmp1 -= 1;
2485                 ref_main_tmp2 -= 1;
2486             }
2487 
2488             dup_const_fract = vdup_n_u8((uint8_t)fract);
2489             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2490 
2491             for(row = nt; row > 0; row -= 4)
2492             {
2493 
2494                 ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
2495                 ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
2496 
2497                 mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
2498                 mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
2499 
2500                 add_res = vaddq_u16(mul_res1, mul_res2);
2501 
2502                 shift_res = vrshrn_n_u16(add_res, 5);
2503 
2504                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
2505                 pu1_dst_tmp1 += dst_strd;
2506 
2507                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
2508                 pu1_dst_tmp1 += dst_strd;
2509 
2510                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
2511                 pu1_dst_tmp1 += dst_strd;
2512 
2513                 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
2514                 pu1_dst_tmp1 += dst_strd;
2515 
2516             }
2517 
2518             pu1_dst_tmp2 += 1;
2519             pu1_dst_tmp1 = pu1_dst_tmp2;
2520 
2521         }
2522 
2523     }
2524 }
2525 
2526 /**
2527  *******************************************************************************
2528  *
2529  * @brief
2530  *   Intra prediction interpolation filter for luma mode 19 to mode 25
2531  *
2532  * @par Description:
2533  *    Intraprediction for mode 19 to 25  (negative angle, vertical mode ) with
2534  *    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
2535  *    block location pointed by 'pu1_dst'
2536  *
2537  * @param[in] pu1_ref
2538  *  UWORD8 pointer to the reference
2539  *
2540  * @param[out] pu1_dst
2541  *  UWORD8 pointer to the destination
2542  *
2543  * @param[in] src_strd
2544  *  integer source stride
2545  *
2546  * @param[in] dst_strd
2547  *  integer destination stride
2548  *
2549  * @param[in] nt
2550  *  integer Transform Block size
2551  *
2552  * @param[in] mode
2553  *  integer intraprediction mode
2554  *
2555  * @returns
2556  *
2557  * @remarks
2558  *  None
2559  *
2560  *******************************************************************************
2561  */
2562 
2563 
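/* Sketch of the reference preparation done below for these negative
 * vertical angles: the top samples are copied into ref_temp as the main
 * reference, which is then extended past its start from the left samples
 * using the inverse angle, per section 8.4.4.2.6:
 *
 *     for(k = -1; k > ref_idx; k--)
 *     {
 *         inv_ang_sum += inv_ang;
 *         ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
 *     }
 */
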
2564 void ihevc_intra_pred_luma_mode_19_to_25_neonintr(UWORD8 *pu1_ref,
2565                                                   WORD32 src_strd,
2566                                                   UWORD8 *pu1_dst,
2567                                                   WORD32 dst_strd,
2568                                                   WORD32 nt,
2569                                                   WORD32 mode)
2570 {
2571 
2572     WORD32 row, col, k;
2573     WORD32 two_nt, intra_pred_ang;
2574     WORD32 inv_ang, inv_ang_sum, pos, fract = 1000, fract_prev;
2575     WORD32 ref_idx;
2576     UWORD8 *ref_main;
2577     UWORD8 *ref_main_tmp;
2578     UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1];
2579 
2580     UWORD8 *pu1_ref_tmp1 = pu1_ref;
2581     UWORD8 *pu1_ref_tmp2 = pu1_ref;
2582     UWORD8 *pu1_dst_tmp1 = pu1_dst;
2583 
2584     uint16x8_t mul_res1, mul_res2, add_res;
2585     uint8x8_t dup_const_fract, dup_const_32_fract;
2586     uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
2587     uint8x8_t ref_above_t;
2588     uint32x2_t ref_above_tmp;
2589     UNUSED(src_strd);
2590     ref_above_tmp = vdup_n_u32(0);
2591 
2592     two_nt = 2 * nt;
2593     intra_pred_ang = gai4_ihevc_ang_table[mode];
2594     inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
2595 
2596     /* Intermediate reference samples for negative angle modes */
2597     /* This has to be removed during optimization */
2598     pu1_ref_tmp1 += two_nt;
2599 
2600 
2601     ref_main = ref_temp + (nt - 1);
2602     ref_main_tmp = ref_main;
2603 
2604     if(0 == (nt & 7))
2605     {
2606         pu1_ref_tmp2 += (two_nt - 7);
2607         for(k = nt - 1; k >= 0; k -= 8)
2608         {
2609 
2610             ref_above_t = vld1_u8(pu1_ref_tmp1);
2611             vst1_u8(ref_main_tmp, ref_above_t);
2612             ref_main_tmp += 8;
2613             pu1_ref_tmp1 += 8;
2614 
2615         }
2616 
2617     }
2618     else
2619     {
2620         pu1_ref_tmp2 += (two_nt - (nt - 1));
2621 
2622         for(k = nt - 1; k >= 0; k -= 4)
2623         {
2624 
2625             ref_above_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, ref_above_tmp, 0);
2626             vst1_lane_u32((uint32_t *)ref_main_tmp, ref_above_tmp, 0);
2627 
2628         }
2629 
2630     }
2631 
2632     ref_main[nt] = pu1_ref[two_nt + nt];
2633 
2634     /* For vertical modes, (ref main = ref above) (ref side = ref left) */
2635 
2636     ref_idx = (nt * intra_pred_ang) >> 5;
2637     inv_ang_sum = 128;
2638 
2639     /* SIMD Optimization can be done using look-up table for the loop */
2640     /* For negative angles, derive the main reference samples from the side */
2641     /* reference samples; refer to section 8.4.4.2.6 */
2642     for(k = -1; k > ref_idx; k--)
2643     {
2644         inv_ang_sum += inv_ang;
2645         ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
2646     }
2647 
2648     UWORD8 *ref_main_tmp1 = ref_main;
2649     UWORD8 *ref_main_tmp2 = ref_main;
2650 
2651     ref_main_tmp2 += 1;
2652 
2653     if(0 == (nt & 7))
2654     {
2655         /* For angles other than 45 degrees, interpolate between 2 neighboring */
2656         /* samples, dependent on distance, to obtain the destination sample */
2657         for(row = 0; row < nt; row++)
2658         {
2659 
2660             fract_prev = fract;
2661             pos = ((row + 1) * intra_pred_ang);
2662             fract = pos & (31);
2663 
2664             if(fract_prev < fract)
2665             {
2666                 ref_main_tmp1 -= 1;
2667                 ref_main_tmp2 -= 1;
2668             }
2669 
2670             dup_const_fract = vdup_n_u8((uint8_t)fract);
2671             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2672 
2673             // Do linear filtering
2674             for(col = nt; col > 0; col -= 8)
2675             {
2676                 ref_main_idx = vld1_u8(ref_main_tmp1);
2677 
2678                 ref_main_idx_1 = vld1_u8(ref_main_tmp2);
2679 
2680                 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
2681                 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
2682 
2683                 add_res = vaddq_u16(mul_res1, mul_res2);
2684 
2685                 shift_res = vrshrn_n_u16(add_res, 5);
2686 
2687                 vst1_u8(pu1_dst_tmp1, shift_res);
2688                 pu1_dst_tmp1 += 8;
2689 
2690                 ref_main_tmp1 += 8;
2691                 ref_main_tmp2 += 8;
2692             }
2693 
2694             ref_main_tmp1 -= nt;
2695             ref_main_tmp2 -= nt;
2696 
2697             pu1_dst_tmp1 += (dst_strd - nt);
2698         }
2699     }
2700     else
2701     {
2702         uint32x2_t ref_main_idx1, ref_main_idx2;
2703 
2704         ref_main_idx1 = vdup_n_u32(0);
2705         ref_main_idx2 = vdup_n_u32(0);
2706 
2707         for(row = 0; row < nt; row++)
2708         {
2709             fract_prev = fract;
2710             pos = ((row + 1) * intra_pred_ang);
2711             fract = pos & (31);
2712 
2713             if(fract_prev < fract)
2714             {
2715                 ref_main_tmp1 -= 1;
2716                 ref_main_tmp2 -= 1;
2717             }
2718 
2719             dup_const_fract = vdup_n_u8((uint8_t)fract);
2720             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2721 
2722             for(col = nt; col > 0; col -= 4)
2723             {
2724 
2725                 ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
2726                 ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
2727 
2728                 mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
2729                 mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
2730 
2731                 add_res = vaddq_u16(mul_res1, mul_res2);
2732 
2733                 shift_res = vrshrn_n_u16(add_res, 5);
2734 
2735                 vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
2736                 pu1_dst_tmp1 += 4;
2737 
2738             }
2739             pu1_dst_tmp1 += (dst_strd - nt);
2740         }
2741 
2742     }
2743 
2744 }
2745 
2746 /**
2747  *******************************************************************************
2748  *
2749  * @brief
2750  *    Intra prediction interpolation filter for luma mode 27 to mode 33
2751  *
2752  * @par Description:
2753  *    Intraprediction for mode 27 to 33  (positive angle, vertical mode ) with
2754  *    reference  neighboring samples location pointed by 'pu1_ref' to the  TU
2755  *    block location pointed by 'pu1_dst'
2756  *
2757  * @param[in] pu1_ref
2758  *  UWORD8 pointer to the reference
2759  *
2760  * @param[out] pu1_dst
2761  *  UWORD8 pointer to the destination
2762  *
2763  * @param[in] src_strd
2764  *  integer source stride
2765  *
2766  * @param[in] dst_strd
2767  *  integer destination stride
2768  *
2769  * @param[in] nt
2770  *  integer Transform Block size
2771  *
2772  * @param[in] mode
2773  *  integer intraprediction mode
2774  *
2775  * @returns
2776  *
2777  * @remarks
2778  *  None
2779  *
2780  *******************************************************************************
2781  */
2782 
2783 
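/* A minimal scalar sketch of the positive vertical angles handled below; no
 * main-reference extension is needed since only samples above the block are
 * read (with idx = pos >> 5, tracked here by pointer increments):
 *
 *     pos = (row + 1) * intra_pred_ang;
 *     fract = pos & 31;
 *     pu1_dst[...] = (pu1_ref[two_nt + 1 + col + idx] * (32 - fract)
 *                   + pu1_ref[two_nt + 2 + col + idx] * fract + 16) >> 5;
 */
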
2784 void ihevc_intra_pred_luma_mode_27_to_33_neonintr(UWORD8 *pu1_ref,
2785                                                   WORD32 src_strd,
2786                                                   UWORD8 *pu1_dst,
2787                                                   WORD32 dst_strd,
2788                                                   WORD32 nt,
2789                                                   WORD32 mode)
2790 {
2791 
2792     WORD32 row, col;
2793     WORD32 intra_pred_ang;
2794     WORD32 pos, fract = 0, fract_prev;
2795 
2796     WORD32 two_nt = 2 * nt;
2797     UNUSED(src_strd);
2798     if(0 == (nt & 7))
2799     {
2800 
2801         UWORD8 *pu1_ref_main_idx = pu1_ref;
2802         UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
2803 
2804         UWORD8 *pu1_dst_tmp1 = pu1_dst;
2805         pu1_ref_main_idx += (two_nt + 1);
2806         pu1_ref_main_idx_1 += (two_nt + 2);
2807 
2808         uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
2809         uint8x8_t shift_res;
2810         uint16x8_t mul_res1, mul_res2, add_res;
2811 
2812         /* Intra Pred Angle according to the mode */
2813         intra_pred_ang = gai4_ihevc_ang_table[mode];
2814 
2815         for(row = 0; row < nt; row++)
2816         {
2817             fract_prev = fract;
2818 
2819             pos = ((row + 1) * intra_pred_ang);
2820             fract = pos & (31);
2821 
2822             if(fract_prev > fract)
2823             {
2824                 pu1_ref_main_idx += 1;
2825                 pu1_ref_main_idx_1 += 1;
2826             }
2827 
2828             dup_const_fract = vdup_n_u8((uint8_t)fract);
2829             dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2830 
2831             for(col = nt; col > 0; col -= 8)
2832             {
2833                 ref_main_idx = vld1_u8(pu1_ref_main_idx);
2834                 ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
2835 
2836                 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
2837                 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
2838 
2839                 add_res = vaddq_u16(mul_res1, mul_res2);
2840 
2841                 shift_res = vrshrn_n_u16(add_res, 5);
2842 
2843                 vst1_u8(pu1_dst_tmp1, shift_res);
2844                 pu1_dst_tmp1 += 8;
2845 
2846                 pu1_ref_main_idx += 8;
2847                 pu1_ref_main_idx_1 += 8;
2848             }
2849 
2850             pu1_ref_main_idx -= nt;
2851             pu1_ref_main_idx_1 -= nt;
2852 
2853             pu1_dst_tmp1 += (dst_strd - nt);
2854         }
2855 
2856     }
2857     else
2858     {
2859         UWORD8 *pu1_ref_tmp1 = pu1_ref;
2860         UWORD8 *pu1_ref_tmp2 = pu1_ref;
2861         UWORD8 *pu1_dst_tmp1 = pu1_dst;
2862 
2863         pu1_ref_tmp1 += (two_nt + 1);
2864         pu1_ref_tmp2 += (two_nt + 2);
2865 
2866         uint8x8_t dup_fract, dup_32_fract, shift_res;
2867         uint16x8_t mul_res1, mul_res2, add_res;
2868         uint32x2_t  pu1_ref_val1, pu1_ref_val2;
2869 
2870         pu1_ref_val1 = vdup_n_u32(0);
2871         pu1_ref_val2 = vdup_n_u32(0);
2872 
2873         /* Intra Pred Angle according to the mode */
2874         intra_pred_ang = gai4_ihevc_ang_table[mode];
2875 
2876         for(row = 0; row < nt; row++)
2877         {
2878             fract_prev = fract;
2879             pos = ((row + 1) * intra_pred_ang);
2880             fract = pos & (31);
2881             if(fract_prev > fract)
2882             {
2883                 pu1_ref_tmp1 += 1;
2884                 pu1_ref_tmp2 += 1;
2885             }
2886             dup_fract = vdup_n_u8((uint8_t)fract);
2887             dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2888 
2889             for(col = nt; col > 0; col -= 4)
2890             {
2891                 pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
2892                 pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
2893 
2894                 mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
2895                 mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
2896 
2897                 add_res = vaddq_u16(mul_res1, mul_res2);
2898 
2899                 shift_res = vrshrn_n_u16(add_res, 5);
2900 
2901                 vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
2902                 pu1_dst_tmp1 += 4;
2903 
2904             }
2905 
2906             pu1_dst_tmp1 += (dst_strd - nt);
2907 
2908         }
2909 
2910 
2911     }
2912 
2913 }
2914