1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 * ihevc_intra_pred_filters_neon_intr.c
22 *
23 * @brief
24 * Contains function Definition for intra prediction interpolation filters
25 *
26 *
27 * @author
28 * Yogeswaran RS
29 *
30 * @par List of Functions:
31 * - ihevc_intra_pred_luma_planar()
32 * - ihevc_intra_pred_luma_dc()
33 * - ihevc_intra_pred_luma_horz()
34 * - ihevc_intra_pred_luma_ver()
35 * - ihevc_intra_pred_luma_mode2()
36 * - ihevc_intra_pred_luma_mode_18_34()
37 *
38 * @remarks
39 * None
40 *
41 *******************************************************************************
42 */
43 /*****************************************************************************/
44 /* File Includes */
45 /*****************************************************************************/
46 #include <stdio.h>
47
48 #include "ihevc_typedefs.h"
49 #include "ihevc_intra_pred.h"
50 #include "ihevc_macros.h"
51 #include "ihevc_func_selector.h"
52 #include "arm_neon.h"
53 #include "ihevc_platform_macros.h"
54 #include "ihevc_common_tables.h"
55
56 /****************************************************************************/
57 /* Constant Macros */
58 /****************************************************************************/
59 #define MAX_CU_SIZE 64
60 #define BIT_DEPTH 8
61 #define T32_4NT 128
62 #define T16_4NT 64
63
64 /*****************************************************************************/
65 /* Function Definition */
66 /*****************************************************************************/
67
68 /**
69 *******************************************************************************
70 *
71 * @brief
72 * Intra prediction interpolation filter for pu1_ref substitution
73 *
74 *
75 * @par Description:
76 * Reference substitution process for samples unavailable for prediction
77 * Refer to section 8.4.4.2.2
78 *
79 * @param[in] pu1_top_left
80 * UWORD8 pointer to the top-left
81 *
82 * @param[in] pu1_top
83 * UWORD8 pointer to the top
84 *
85 * @param[in] pu1_left
86 * UWORD8 pointer to the left
87 *
88 * @param[in] src_strd
89 * WORD32 Source stride
90 *
91 * @param[in] nbr_flags
92 * WORD32 neighbor availability flags
93 *
94 * @param[in] nt
95 * WORD32 transform Block size
96 *
97 * @param[in] dst_strd
98 * WORD32 Destination stride
99 *
100 * @returns
101 *
102 * @remarks
103 * None
104 *
105 *******************************************************************************
106 */
107
108
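/* Layout of the substituted reference array pu1_dst (4 * nt + 1 samples) as produced below.
 * Descriptive note only, assuming pu1_left points to the top-most left neighbour sample and
 * walks downwards with src_strd:
 *
 *   pu1_dst[0 .. 2*nt - 1]    : bottom-left and left samples, stored bottom to top
 *   pu1_dst[2*nt]             : top-left sample
 *   pu1_dst[2*nt + 1 .. 4*nt] : top and top-right samples, stored left to right
 */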
void ihevc_intra_pred_luma_ref_substitution_neonintr(UWORD8 *pu1_top_left,
110 UWORD8 *pu1_top,
111 UWORD8 *pu1_left,
112 WORD32 src_strd,
113 WORD32 nt,
114 WORD32 nbr_flags,
115 UWORD8 *pu1_dst,
116 WORD32 dst_strd)
117 {
118 UWORD8 pu1_ref;
119 WORD32 dc_val, i;
120 WORD32 total_samples = (4 * nt) + 1;
121 WORD32 two_nt = 2 * nt;
122 WORD32 three_nt = 3 * nt;
123 WORD32 get_bits;
124 WORD32 next;
125 WORD32 bot_left, left, top, tp_right, tp_left;
126 WORD32 idx, nbr_id_from_bl, frwd_nbr_flag;
127 UNUSED(dst_strd);
128 dc_val = 1 << (BIT_DEPTH - 1);
129
130 /* Neighbor Flag Structure*/
131 /* Top-Left | Top-Right | Top | Left | Bottom-Left
132 1 4 4 4 4
133 */
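/* Illustrative example (an assumption, not from the original source): for nt <= 8 only the
least significant bit of each 4-bit field is used, so nbr_flags == 0x10110 would mean that
top-left, top and left are available while top-right and bottom-left are not. */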
134
135 /* If no neighbor flags are present, fill the neighbor samples with DC value */
136 if(nbr_flags == 0)
137 {
138 for(i = 0; i < total_samples; i++)
139 {
140 pu1_dst[i] = dc_val;
141 }
142 }
143 else
144 {
145 /* Else fill the corresponding samples */
146 pu1_dst[two_nt] = *pu1_top_left;
147 UWORD8 *pu1_dst_tmp2 = pu1_dst;
148 UWORD8 *pu1_top_tmp = pu1_top;
149 pu1_dst_tmp2 += two_nt + 1;
150
151 for(i = 0; i < two_nt; i++)
152 pu1_dst[two_nt - 1 - i] = pu1_left[i * src_strd];
153
154 uint8x8_t src;
155 for(i = two_nt; i > 0; i -= 8)
156 {
157 src = vld1_u8(pu1_top_tmp);
158 pu1_top_tmp += 8;
159 vst1_u8(pu1_dst_tmp2, src);
160 pu1_dst_tmp2 += 8;
161 }
162
163 if(nt <= 8)
164 {
165 /* 1 bit extraction for all the neighboring blocks */
166 tp_left = (nbr_flags & 0x10000) >> 16;
167 bot_left = nbr_flags & 0x1;
168 left = (nbr_flags & 0x10) >> 4;
169 top = (nbr_flags & 0x100) >> 8;
170 tp_right = (nbr_flags & 0x1000) >> 12;
171
172 next = 1;
173
/* If bottom-left is not available, do the reverse substitution process */
175 if(bot_left == 0)
176 {
177 WORD32 a_nbr_flag[5] = { bot_left, left, tp_left, top, tp_right };
178
179 /* Check for the 1st available sample from bottom-left*/
180 while(!a_nbr_flag[next])
181 next++;
182
183 /* If Left, top-left are available*/
184 if(next <= 2)
185 {
186 idx = nt * next;
187 pu1_ref = pu1_dst[idx];
188 for(i = 0; i < idx; i++)
189 pu1_dst[i] = pu1_ref;
190 }
191 else /* If top, top-right are available */
192 {
/* Idx is changed to copy 1 pixel value for top-left, if top-left is not available */
194 idx = (nt * (next - 1)) + 1;
195 pu1_ref = pu1_dst[idx];
196 for(i = 0; i < idx; i++)
197 pu1_dst[i] = pu1_ref;
198 }
199 }
200
201 /* Forward Substitution Process */
202 /* If left is Unavailable, copy the last bottom-left value */
203
204 if(left == 0)
205 {
206 uint8x8_t dup_pu1_dst1;
207 UWORD8 *pu1_dst_const_nt = pu1_dst;
208 pu1_dst_const_nt += nt;
209
210 if(0 == (nt & 7))
211 {
212 dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
213 for(i = nt; i > 0; i -= 8)
214 {
215 vst1_u8(pu1_dst_const_nt, dup_pu1_dst1);
216 pu1_dst_const_nt += 8;
217
218 }
219 }
220 else
221 {
222 //uint32x2_t dup_pu1_dst4;
223 dup_pu1_dst1 = vdup_n_u8(pu1_dst[nt - 1]);
224 //dup_pu1_dst4 = vdup_n_u32((uint32_t) pu1_dst[nt - 1]);
225 for(i = nt; i > 0; i -= 4)
226 {
227 vst1_lane_u32((uint32_t *)pu1_dst_const_nt, vreinterpret_u32_u8(dup_pu1_dst1), 0);
228 pu1_dst_const_nt += 4;
229
230 }
231
232 }
233
234 }
235 if(tp_left == 0)
236 pu1_dst[two_nt] = pu1_dst[two_nt - 1];
237 if(top == 0)
238 {
239
240 if(0 == (nt & 7))
241 {
242 uint8x8_t dup_pu1_dst2;
243 UWORD8 *pu1_dst_const_two_nt_1 = pu1_dst;
244 pu1_dst_const_two_nt_1 += (two_nt + 1);
245 dup_pu1_dst2 = vdup_n_u8(pu1_dst[two_nt]);
246 for(i = nt; i > 0; i -= 8)
247 {
248 vst1_u8(pu1_dst_const_two_nt_1, dup_pu1_dst2);
249 pu1_dst_const_two_nt_1 += 8;
250
251 }
252 }
253 else
254 {
255 for(i = 0; i < nt; i++)
256 pu1_dst[two_nt + 1 + i] = pu1_dst[two_nt];
257 }
258 }
259 if(tp_right == 0)
260 {
261 uint8x8_t dup_pu1_dst3;
262 UWORD8 *pu1_dst_const_three_nt_1 = pu1_dst;
263 pu1_dst_const_three_nt_1 += (three_nt + 1);
264 dup_pu1_dst3 = vdup_n_u8(pu1_dst[two_nt]);
265 if(0 == (nt & 7))
266 {
267 for(i = nt; i > 0; i -= 8)
268 {
269 vst1_u8(pu1_dst_const_three_nt_1, dup_pu1_dst3);
270 pu1_dst_const_three_nt_1 += 8;
271
272 }
273 }
274 else
275 {
276 for(i = nt; i > 0; i -= 4)
277 {
278 vst1_lane_u32((uint32_t *)pu1_dst_const_three_nt_1, vreinterpret_u32_u8(dup_pu1_dst3), 0);
279 pu1_dst_const_three_nt_1 += 4;
280 }
281
282 }
283
284 }
285 }
286 if(nt == 16)
287 {
288 WORD32 nbr_flags_temp = 0;
289 nbr_flags_temp = (nbr_flags & 0x3) + ((nbr_flags & 0x30) >> 2)
290 + ((nbr_flags & 0x300) >> 4)
291 + ((nbr_flags & 0x3000) >> 6)
292 + ((nbr_flags & 0x10000) >> 8);
293
/* Compute trailing zeros based on nbr_flags for the substitution process of bottom-left (refer to section 8.4.4.2.2) */
/* as each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right but 1 pel for top-left */
296 {
297 nbr_id_from_bl = look_up_trailing_zeros(nbr_flags_temp & 0XF) * 8; /* for below left and left */
298
299 if(nbr_id_from_bl == 64)
300 nbr_id_from_bl = 32;
301
302 if(nbr_id_from_bl == 32)
303 {
304 /* for top left : 1 pel per nbr bit */
305 if(!((nbr_flags_temp >> 8) & 0x1))
306 {
307 nbr_id_from_bl++;
308 nbr_id_from_bl += look_up_trailing_zeros((nbr_flags_temp >> 4) & 0xF) * 8; /* top and top right; 8 pels per nbr bit */
309 }
310 }
311 /* Reverse Substitution Process*/
312 if(nbr_id_from_bl)
313 {
314 /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
315 pu1_ref = pu1_dst[nbr_id_from_bl];
316 for(i = (nbr_id_from_bl - 1); i >= 0; i--)
317 {
318 pu1_dst[i] = pu1_ref;
319 }
320 }
321 }
322
323 /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
324 while(nbr_id_from_bl < ((T16_4NT) + 1))
325 {
326 /* To Obtain the next unavailable idx flag after reverse neighbor substitution */
/* Divide by 8 to obtain the original index */
328 frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
329
330 /* The Top-left flag is at the last bit location of nbr_flags*/
331 if(nbr_id_from_bl == (T16_4NT / 2))
332 {
333 get_bits = GET_BIT(nbr_flags_temp, 8);
334
335 /* only pel substitution for TL */
336 if(!get_bits)
337 pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
338 }
339 else
340 {
341 get_bits = GET_BIT(nbr_flags_temp, frwd_nbr_flag);
342 if(!get_bits)
343 {
344 /* 8 pel substitution (other than TL) */
345 pu1_ref = pu1_dst[nbr_id_from_bl - 1];
346 for(i = 0; i < 8; i++)
347 pu1_dst[nbr_id_from_bl + i] = pu1_ref;
348 }
349
350 }
351 nbr_id_from_bl += (nbr_id_from_bl == (T16_4NT / 2)) ? 1 : 8;
352 }
353 }
354
355 if(nt == 32)
356 {
/* Compute trailing zeros based on nbr_flags for the substitution process of bottom-left (refer to section 8.4.4.2.2) */
/* as each bit in nbr_flags corresponds to 8 pels for bot_left, left, top and top-right but 1 pel for top-left */
359 {
360 nbr_id_from_bl = look_up_trailing_zeros((nbr_flags & 0XFF)) * 8; /* for below left and left */
361
362 if(nbr_id_from_bl == 64)
363 {
364 /* for top left : 1 pel per nbr bit */
365 if(!((nbr_flags >> 16) & 0x1))
366 {
367 /* top left not available */
368 nbr_id_from_bl++;
369 /* top and top right; 8 pels per nbr bit */
370 nbr_id_from_bl += look_up_trailing_zeros((nbr_flags >> 8) & 0xFF) * 8;
371 }
372 }
373 /* Reverse Substitution Process*/
374 if(nbr_id_from_bl)
375 {
376 /* Replicate the bottom-left and subsequent unavailable pixels with the 1st available pixel above */
377 pu1_ref = pu1_dst[nbr_id_from_bl];
378 for(i = (nbr_id_from_bl - 1); i >= 0; i--)
379 pu1_dst[i] = pu1_ref;
380 }
381 }
382
383 /* for the loop of 4*Nt+1 pixels (excluding pixels computed from reverse substitution) */
384 while(nbr_id_from_bl < ((T32_4NT)+1))
385 {
386 /* To Obtain the next unavailable idx flag after reverse neighbor substitution */
/* Divide by 8 to obtain the original index */
388 frwd_nbr_flag = (nbr_id_from_bl >> 3); /*+ (nbr_id_from_bl & 0x1);*/
389
390 /* The Top-left flag is at the last bit location of nbr_flags*/
391 if(nbr_id_from_bl == (T32_4NT / 2))
392 {
393 get_bits = GET_BIT(nbr_flags, 16);
394 /* only pel substitution for TL */
395 if(!get_bits)
396 pu1_dst[nbr_id_from_bl] = pu1_dst[nbr_id_from_bl - 1];
397 }
398 else
399 {
400 get_bits = GET_BIT(nbr_flags, frwd_nbr_flag);
401 if(!get_bits)
402 {
403 /* 8 pel substitution (other than TL) */
404 pu1_ref = pu1_dst[nbr_id_from_bl - 1];
405 for(i = 0; i < 8; i++)
406 pu1_dst[nbr_id_from_bl + i] = pu1_ref;
407 }
408
409 }
410 nbr_id_from_bl += (nbr_id_from_bl == (T32_4NT / 2)) ? 1 : 8;
411 }
412 }
413
414 }
415
416 }
417
418 /**
419 *******************************************************************************
420 *
421 * @brief
422 * Intra prediction interpolation filter for ref_filtering
423 *
424 *
425 * @par Description:
426 * Reference DC filtering for neighboring samples dependent on TU size and
427 * mode Refer to section 8.4.4.2.3 in the standard
428 *
429 * @param[in] pu1_src
430 * UWORD8 pointer to the source
431 *
432 * @param[out] pu1_dst
433 * UWORD8 pointer to the destination
434 *
435 * @param[in] nt
436 * integer Transform Block size
437 *
438 * @param[in] mode
439 * integer intraprediction mode
440 *
441 * @returns
442 *
443 * @remarks
444 * None
445 *
446 *******************************************************************************
447 */
448
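/* A scalar sketch of the default [1 2 1]/4 reference smoothing that the NEON loop below
 * implements (illustrative only; the two end samples are left untouched):
 *
 *     pu1_dst[0] = pu1_src[0];
 *     for(i = 1; i < (4 * nt); i++)
 *         pu1_dst[i] = (pu1_src[i - 1] + 2 * pu1_src[i] + pu1_src[i + 1] + 2) >> 2;
 *     pu1_dst[4 * nt] = pu1_src[4 * nt];
 */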
449
void ihevc_intra_pred_ref_filtering_neonintr(UWORD8 *pu1_src,
451 WORD32 nt,
452 UWORD8 *pu1_dst,
453 WORD32 mode,
454 WORD32 strong_intra_smoothing_enable_flag)
455 {
456 WORD32 filter_flag;
457 WORD32 i = 0;
458 WORD32 four_nt = 4 * nt;
459
460 WORD32 src_4nt;
461 WORD32 src_0nt;
/* Naming follows the functionality, e.g. pu1_src_tmp_1 denotes pu1_src + 1 */
/* src_val_1 holds the value loaded through pu1_src_tmp_1, add_res holds the result of adding 2 values */
464 UWORD8 *pu1_src_tmp_0 = pu1_src;
465 UWORD8 *pu1_src_tmp_1;
466 UWORD8 *pu1_src_tmp_2;
467 UWORD8 *pu1_dst_tmp_0 = pu1_dst;
468 UWORD8 *pu1_dst_tmp_1;
469
470 uint8x8_t src_val_0, src_val_2;
471 uint8x8_t src_val_1, shift_res;
472 uint8x8_t dup_const_2;
473 uint16x8_t mul_res, add_res;
474 WORD32 bi_linear_int_flag = 0;
475 WORD32 abs_cond_left_flag = 0;
476 WORD32 abs_cond_top_flag = 0;
477 WORD32 dc_val = 1 << (BIT_DEPTH - 5);
478 shift_res = vdup_n_u8(0);
479
480 filter_flag = gau1_intra_pred_ref_filter[mode] & (1 << (CTZ(nt) - 2));
481
482 if(0 == filter_flag)
483 {
484 if(pu1_src == pu1_dst)
485 {
486 return;
487 }
488 else
489 {
490 for(i = four_nt; i > 0; i -= 8)
491 {
492 src_val_0 = vld1_u8(pu1_src_tmp_0);
493 pu1_src_tmp_0 += 8;
494 vst1_u8(pu1_dst_tmp_0, src_val_0);
495 pu1_dst_tmp_0 += 8;
496 }
497 pu1_dst[four_nt] = pu1_src[four_nt];
498 }
499 }
500
501 else
502 {
/* If strong intra smoothing is enabled and the transform size is 32 */
504 if((1 == strong_intra_smoothing_enable_flag) && (32 == nt))
505 {
506 /*Strong Intra Filtering*/
507 abs_cond_top_flag = (ABS(pu1_src[2 * nt] + pu1_src[4 * nt]
508 - (2 * pu1_src[3 * nt]))) < dc_val;
509 abs_cond_left_flag = (ABS(pu1_src[2 * nt] + pu1_src[0]
510 - (2 * pu1_src[nt]))) < dc_val;
511
512 bi_linear_int_flag = ((1 == abs_cond_left_flag)
513 && (1 == abs_cond_top_flag));
514 }
515
516 src_4nt = pu1_src[4 * nt];
517 src_0nt = pu1_src[0];
518 /* Strong filtering of reference samples */
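/* A scalar sketch of the strong (bi-linear) smoothing applied below for nt == 32
 * (illustrative only; the corner samples pu1_src[0], pu1_src[64] and pu1_src[128]
 * are carried over unchanged):
 *
 *     for(i = 1; i < 64; i++)
 *         pu1_dst[i] = ((64 - i) * pu1_src[0] + i * pu1_src[64] + 32) >> 6;
 *     for(i = 1; i < 64; i++)
 *         pu1_dst[64 + i] = ((64 - i) * pu1_src[64] + i * pu1_src[128] + 32) >> 6;
 */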
519 if(1 == bi_linear_int_flag)
520 {
521 WORD32 two_nt = four_nt >> 1;
522
523 WORD32 pu1_src_0_val = pu1_src[0];
524 WORD32 pu1_src_2_nt_val = pu1_src[2 * nt];
525 WORD32 pu1_src_4_nt_val = pu1_src[4 * nt];
526
527 WORD32 prod_two_nt_src_0_val = two_nt * pu1_src_0_val;
528 uint16x8_t prod_two_nt_src_0_val_t = vdupq_n_u16(prod_two_nt_src_0_val);
529
530 WORD32 prod_two_nt_src_2_nt_val = two_nt * pu1_src_2_nt_val;
531 uint16x8_t prod_two_nt_src_2_nt_val_t = vdupq_n_u16(prod_two_nt_src_2_nt_val);
532
533 const UWORD8 *const_col_i;
534 uint8x8_t const_col_i_val;
535 uint16x8_t prod_val_1;
536 uint16x8_t prod_val_2;
537 uint16x8_t prod_val_3;
538 uint16x8_t prod_val_4;
539 uint8x8_t res_val_1;
540 uint8x8_t res_val_2;
541 uint8x8_t pu1_src_0_val_t = vdup_n_u8(pu1_src_0_val);
542 uint8x8_t pu1_src_2_nt_val_t = vdup_n_u8(pu1_src_2_nt_val);
543 uint8x8_t pu1_src_4_nt_val_t = vdup_n_u8(pu1_src_4_nt_val);
544 pu1_dst_tmp_0 = pu1_dst + 1;
545 pu1_dst_tmp_1 = pu1_dst + two_nt + 1;
546
547 const_col_i = gau1_ihevc_planar_factor + 1;
548
549 for(i = two_nt; i > 0; i -= 8)
550 {
551 const_col_i_val = vld1_u8(const_col_i);
552 const_col_i += 8;
553
554 prod_val_1 = vmlsl_u8(prod_two_nt_src_0_val_t, const_col_i_val, pu1_src_0_val_t);
555 prod_val_2 = vmlal_u8(prod_val_1, const_col_i_val, pu1_src_2_nt_val_t);
556
557 res_val_1 = vrshrn_n_u16(prod_val_2, 6);
558 prod_val_3 = vmlsl_u8(prod_two_nt_src_2_nt_val_t, const_col_i_val, pu1_src_2_nt_val_t);
559
560 vst1_u8(pu1_dst_tmp_0, res_val_1);
561 pu1_dst_tmp_0 += 8;
562 prod_val_4 = vmlal_u8(prod_val_3, const_col_i_val, pu1_src_4_nt_val_t);
563
564 res_val_2 = vrshrn_n_u16(prod_val_4, 6);
565 vst1_u8(pu1_dst_tmp_1, res_val_2);
566 pu1_dst_tmp_1 += 8;
567 }
568 pu1_dst[2 * nt] = pu1_src[2 * nt];
569 }
570 else
571 {
572 pu1_src_tmp_1 = pu1_src + 1;
573 pu1_src_tmp_2 = pu1_src + 2;
574 pu1_dst_tmp_0 += 1;
575
576 dup_const_2 = vdup_n_u8(2);
577
578 /* Extremities Untouched*/
579 pu1_dst[0] = pu1_src[0];
580
/* To avoid the issue when dest and src share the same pointer, the loads for the current
 * iteration are done before the store of the previous iteration's result */
583
584 /* Perform bilinear filtering of Reference Samples */
585 for(i = (four_nt - 1); i > 0; i -= 8)
586 {
587 src_val_0 = vld1_u8(pu1_src_tmp_0);
588 pu1_src_tmp_0 += 8;
589
590 src_val_2 = vld1_u8(pu1_src_tmp_2);
591 pu1_src_tmp_2 += 8;
592
593 src_val_1 = vld1_u8(pu1_src_tmp_1);
594 pu1_src_tmp_1 += 8;
595
596 if(i < four_nt - 1)
597 {
598 vst1_u8(pu1_dst_tmp_0, shift_res);
599 pu1_dst_tmp_0 += 8;
600 }
601
602 add_res = vaddl_u8(src_val_0, src_val_2);
603
604 mul_res = vmlal_u8(add_res, src_val_1, dup_const_2);
605 shift_res = vrshrn_n_u16(mul_res, 2);
606
607 }
608 vst1_u8(pu1_dst_tmp_0, shift_res);
609 pu1_dst_tmp_0 += 8;
610 }
611 pu1_dst[4 * nt] = src_4nt;
612 pu1_dst[0] = src_0nt;
613 }
614
615 }
616
617
618
619 /**
620 *******************************************************************************
621 *
622 * @brief
623 * Intra prediction interpolation filter for luma planar
624 *
625 * @par Description:
626 * Planar Intraprediction with reference neighboring samples location
627 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
628 *
629 * @param[in] pu1_src
630 * UWORD8 pointer to the source
631 *
632 * @param[out] pu1_dst
633 * UWORD8 pointer to the destination
634 *
635 * @param[in] src_strd
636 * integer source stride
637 *
638 * @param[in] dst_strd
639 * integer destination stride
640 *
641 * @param[in] nt
642 * integer Transform Block size
643 *
644 * @param[in] wd
645 * integer width of the array
646 *
647 * @returns
648 *
649 * @remarks
650 * None
651 *
652 *******************************************************************************
653 */
654
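/* A scalar sketch of the planar prediction computed by the NEON loops below (illustrative
 * only), with two_nt = 2 * nt and three_nt = 3 * nt. pu1_ref[two_nt - 1 - row] is the left
 * sample of the row, pu1_ref[three_nt + 1] the top-right sample, pu1_ref[two_nt + 1 + col]
 * the top sample of the column and pu1_ref[nt - 1] the bottom-left sample:
 *
 *     pu1_dst[row * dst_strd + col] =
 *         ((nt - 1 - col) * pu1_ref[two_nt - 1 - row]
 *        + (col + 1)      * pu1_ref[three_nt + 1]
 *        + (nt - 1 - row) * pu1_ref[two_nt + 1 + col]
 *        + (row + 1)      * pu1_ref[nt - 1]
 *        + nt) >> (log2nt + 1);
 */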
void ihevc_intra_pred_luma_planar_neonintr(UWORD8 *pu1_ref,
656 WORD32 src_strd,
657 UWORD8 *pu1_dst,
658 WORD32 dst_strd,
659 WORD32 nt,
660 WORD32 mode)
661 {
/* Naming convention: (nt - 1 - col) --> const_nt_1_col (const denotes gau1_ihevc_planar_factor) */
/* const_nt_1_col values are loaded into a D register */
/* Similarly, pu1_ref[nt - 1] --> pu1_ref_nt_1 */
/* The value of pu1_ref_nt_1 is duplicated across a D register, hence pu1_ref_nt_1_dup */
/* log2nt + 1 is computed directly while assigning the value */
/* When the width is a multiple of 4, the row loop is also unrolled by 2 and the stores account for this */
668
669 WORD32 row, col = 0;
670 WORD32 log2nt_plus1 = 6;
671 WORD32 two_nt, three_nt;
672 UWORD8 *pu1_ref_two_nt_1;
673 UWORD8 *pu1_dst_tmp;
674 const UWORD8 *const_nt_1_col;
675 uint8x8_t const_nt_1_col_t;
676 const UWORD8 *const_col_1;
677 uint8x8_t const_col_1_t;
678 uint8_t const_nt_1_row;
679 uint8x8_t const_nt_1_row_dup;
680 uint8_t const_row_1;
681 uint8x8_t const_row_1_dup;
682 uint8_t const_nt = nt;
683 uint16x8_t const_nt_dup;
684 uint8_t pu1_ref_nt_1 = pu1_ref[nt - 1];
685 uint8x8_t pu1_ref_nt_1_dup;
686 uint8_t pu1_ref_two_nt_1_row;
687 uint8_t pu1_ref_three_nt_1;
688 uint8x8_t pu1_ref_two_nt_1_row_dup;
689 uint8x8_t pu1_ref_two_nt_1_t;
690 uint8x8_t pu1_ref_three_nt_1_dup;
691 uint16x8_t prod_t1;
692 uint16x8_t prod_t2;
693 uint16x8_t sto_res_tmp;
694 uint8x8_t sto_res;
695 int16x8_t log2nt_dup;
696 UNUSED(src_strd);
697 UNUSED(mode);
698 log2nt_plus1 = 32 - CLZ(nt);
699 two_nt = 2 * nt;
700 three_nt = 3 * nt;
/* Loops have been unrolled considering the fact that the width is a multiple of 8 */
702 if(0 == (nt & 7))
703 {
704 pu1_dst_tmp = pu1_dst;
705 const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;
706
707 const_col_1 = gau1_ihevc_planar_factor + 1;
708 pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];
709
710 pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
711 const_nt_dup = vdupq_n_u16(const_nt);
712
713 log2nt_dup = vdupq_n_s16(log2nt_plus1);
714 log2nt_dup = vnegq_s16(log2nt_dup);
715
716 pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);
717
718 for(row = 0; row < nt; row++)
719 {
720 pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
721 pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
722
723 const_nt_1_row = nt - 1 - row;
724 const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
725
726 const_row_1 = row + 1;
727 const_row_1_dup = vdup_n_u8(const_row_1);
728
729 const_nt_1_col = gau1_ihevc_planar_factor + nt - 8;
730
731 const_col_1 = gau1_ihevc_planar_factor + 1;
732 pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;
733
734 for(col = nt; col > 0; col -= 8)
735 {
736 const_nt_1_col_t = vld1_u8(const_nt_1_col);
737 const_nt_1_col -= 8;
738 const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);
739
740 const_col_1_t = vld1_u8(const_col_1);
741 const_col_1 += 8;
742 prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);
743
744 pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
745 pu1_ref_two_nt_1 += 8;
746 prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);
747
748 prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
749 prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
750 prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
751 prod_t1 = vaddq_u16(prod_t1, prod_t2);
752
753 sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
754 sto_res = vmovn_u16(sto_res_tmp);
755 vst1_u8(pu1_dst_tmp, sto_res);
756 pu1_dst_tmp += 8;
757 }
758 pu1_dst_tmp += dst_strd - nt;
759 }
760 }
/* Loops have been unrolled considering the fact that the width is a multiple of 4 */
/* If the width is a multiple of 4, then the height should be a multiple of 2 */
763 else
764 {
765 uint8x8_t const_row_1_dup1;
766 uint8x8_t pu1_ref_two_nt_1_t1;
767 uint8x8_t const_nt_1_col_t1;
768 uint8x8_t const_col_1_t1;
769 uint8x8_t pu1_ref_two_nt_1_row_dup1;
770 uint8x8_t const_nt_1_row_dup1;
771
772 pu1_ref_three_nt_1 = pu1_ref[three_nt + 1];
773
774 pu1_ref_nt_1_dup = vdup_n_u8(pu1_ref_nt_1);
775 const_nt_dup = vdupq_n_u16(const_nt);
776
777 log2nt_dup = vdupq_n_s16(log2nt_plus1);
778 log2nt_dup = vnegq_s16(log2nt_dup);
779
780 pu1_ref_three_nt_1_dup = vdup_n_u8(pu1_ref_three_nt_1);
781
782 for(row = 0; row < nt; row += 2)
783 {
784 pu1_ref_two_nt_1_row = pu1_ref[two_nt - 1 - row];
785 pu1_ref_two_nt_1_row_dup = vdup_n_u8(pu1_ref_two_nt_1_row);
786 pu1_ref_two_nt_1_row = pu1_ref[two_nt - 2 - row];
787 pu1_ref_two_nt_1_row_dup1 = vdup_n_u8(pu1_ref_two_nt_1_row);
788 pu1_ref_two_nt_1_row_dup = vext_u8(pu1_ref_two_nt_1_row_dup, pu1_ref_two_nt_1_row_dup1, 4);
789
790 const_nt_1_row = nt - 1 - row;
791 const_nt_1_row_dup = vdup_n_u8(const_nt_1_row);
792 const_nt_1_row = nt - 2 - row;
793 const_nt_1_row_dup1 = vdup_n_u8(const_nt_1_row);
794 const_nt_1_row_dup = vext_u8(const_nt_1_row_dup, const_nt_1_row_dup1, 4);
795
796 const_row_1 = row + 1;
797 const_row_1_dup = vdup_n_u8(const_row_1);
798 const_row_1 = row + 2;
799 const_row_1_dup1 = vdup_n_u8(const_row_1);
800 const_row_1_dup = vext_u8(const_row_1_dup, const_row_1_dup1, 4);
801
802 const_nt_1_col = gau1_ihevc_planar_factor + nt - 4;
803
804 const_col_1 = gau1_ihevc_planar_factor + 1;
805
806 pu1_ref_two_nt_1 = pu1_ref + two_nt + 1;
807
808 for(col = nt; col > 0; col -= 4)
809 {
810 const_nt_1_col_t = vld1_u8(const_nt_1_col);
811 const_nt_1_col -= 4;
812 const_nt_1_col_t = vrev64_u8(const_nt_1_col_t);
813
814 const_col_1_t = vld1_u8(const_col_1);
815 const_col_1 += 4;
816 const_nt_1_col_t1 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(const_nt_1_col_t), 32));
817
818 pu1_dst_tmp = pu1_dst;
819 const_nt_1_col_t = vext_u8(const_nt_1_col_t, const_nt_1_col_t1, 4);
820
821 const_col_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(const_col_1_t), 32));
822 prod_t1 = vmull_u8(const_nt_1_col_t, pu1_ref_two_nt_1_row_dup);
823
824 pu1_ref_two_nt_1_t = vld1_u8(pu1_ref_two_nt_1);
825 pu1_ref_two_nt_1 += 4;
826 const_col_1_t = vext_u8(const_col_1_t1, const_col_1_t, 4);
827
828 pu1_ref_two_nt_1_t1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(pu1_ref_two_nt_1_t), 32));
829 prod_t2 = vmull_u8(const_col_1_t, pu1_ref_three_nt_1_dup);
830
831 pu1_ref_two_nt_1_t = vext_u8(pu1_ref_two_nt_1_t1, pu1_ref_two_nt_1_t, 4);
832 prod_t2 = vmlal_u8(prod_t2, const_row_1_dup, pu1_ref_nt_1_dup);
833
834 prod_t1 = vmlal_u8(prod_t1, const_nt_1_row_dup, pu1_ref_two_nt_1_t);
835 prod_t1 = vaddq_u16(prod_t1, const_nt_dup);
836 prod_t1 = vaddq_u16(prod_t1, prod_t2);
837
838 sto_res_tmp = vreinterpretq_u16_s16(vshlq_s16(vreinterpretq_s16_u16(prod_t1), log2nt_dup));
839 sto_res = vmovn_u16(sto_res_tmp);
840
841 vst1_lane_u32((uint32_t *)pu1_dst, vreinterpret_u32_u8(sto_res), 0);
842 pu1_dst_tmp += dst_strd;
843
844 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(sto_res), 1);
845 pu1_dst += 4;
846 }
847 pu1_dst += 2 * dst_strd - nt;
848 }
849 }
850
851 }
852 /* INTRA_PRED_LUMA_PLANAR */
853
854 /**
855 *******************************************************************************
856 *
857 * @brief
858 * Intra prediction interpolation filter for luma dc
859 *
860 * @par Description:
861 * Intraprediction for DC mode with reference neighboring samples location
862 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
863 *
864 * @param[in] pu1_src
865 * UWORD8 pointer to the source
866 *
867 * @param[out] pu1_dst
868 * UWORD8 pointer to the destination
869 *
870 * @param[in] src_strd
871 * integer source stride
872 *
873 * @param[in] dst_strd
874 * integer destination stride
875 *
876 * @param[in] nt
877 * integer Transform Block size
878 *
879 * @param[in] wd
880 * integer width of the array
881 *
882 * @returns
883 *
884 * @remarks
885 * None
886 *
887 *******************************************************************************
888 */
889
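/* A scalar sketch of the DC prediction computed below (illustrative only).
 * acc sums pu1_ref[nt .. 2*nt - 1] (left) and pu1_ref[2*nt + 1 .. 3*nt] (top), and for
 * nt < 32 the first row and first column are additionally filtered:
 *
 *     acc = 0;
 *     for(i = nt; i < 2 * nt; i++)
 *         acc += pu1_ref[i];
 *     for(i = 2 * nt + 1; i <= 3 * nt; i++)
 *         acc += pu1_ref[i];
 *     dc_val = (acc + nt) >> (log2nt + 1);
 *
 *     pu1_dst[0] = (pu1_ref[2 * nt - 1] + 2 * dc_val + pu1_ref[2 * nt + 1] + 2) >> 2;
 *     for(col = 1; col < nt; col++)
 *         pu1_dst[col] = (pu1_ref[2 * nt + 1 + col] + 3 * dc_val + 2) >> 2;
 *     for(row = 1; row < nt; row++)
 *         pu1_dst[row * dst_strd] = (pu1_ref[2 * nt - 1 - row] + 3 * dc_val + 2) >> 2;
 *
 * All remaining samples are set to dc_val.
 */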
void ihevc_intra_pred_luma_dc_neonintr(UWORD8 *pu1_ref,
891 WORD32 src_strd,
892 UWORD8 *pu1_dst,
893 WORD32 dst_strd,
894 WORD32 nt,
895 WORD32 mode)
896 {
897 WORD32 dc_val = 0, two_dc_val = 0, three_dc_val = 0;
898 WORD32 i = 0;
899 WORD32 row = 0, col = 0, col_count;
900 WORD32 log2nt_plus1 = 6;
901 WORD32 two_nt = 0;
902 uint16x8_t ref_load_q;
903 uint16x8_t three_dc_val_t;
904 uint8x8_t sto_res_tmp;
905 uint8x8_t sto_res_tmp1;
906 uint8x8_t sto_res_tmp2;
907 uint8x8_t sto_res_tmp3;
908 uint8x8_t sto_res_tmp4;
909 uint8x8_t dc_val_t;
910
911 UWORD8 *pu1_ref_tmp;
912 UWORD8 *pu1_ref_tmp1;
913 UWORD8 *pu1_dst_tmp;
914 UWORD8 *pu1_dst_tmp1;
915 UWORD8 *pu1_dst_tmp2;
916 UNUSED(src_strd);
917 UNUSED(mode);
918
/* log2nt + 1 is computed directly while assigning the value. */
920 log2nt_plus1 = 32 - CLZ(nt);
921
/* Loops have been unrolled considering the fact that the width is a multiple of 8 */
923 if(0 == (nt & 7))
924 {
925 uint8x8_t ref_load1;
926 uint8x8_t ref_load2;
927 uint16x4_t acc_dc_pair1;
928 uint32x2_t acc_dc_pair2;
929 uint64x1_t acc_dc = vdup_n_u64(col);
930
931 two_nt = 2 * nt;
932 pu1_ref_tmp = pu1_ref + nt;
933 pu1_ref_tmp1 = pu1_ref + two_nt + 1;
934
935 for(i = two_nt; i > nt; i -= 8)
936 {
937 ref_load1 = vld1_u8(pu1_ref_tmp);
938 pu1_ref_tmp += 8;
939 acc_dc_pair1 = vpaddl_u8(ref_load1);
940
941 ref_load2 = vld1_u8(pu1_ref_tmp1);
942 pu1_ref_tmp1 += 8;
943
944 acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
945 acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
946
947 acc_dc_pair1 = vpaddl_u8(ref_load2);
948 acc_dc_pair2 = vpaddl_u16(acc_dc_pair1);
949 acc_dc = vpadal_u32(acc_dc, acc_dc_pair2);
950 }
951
952 dc_val = (vget_lane_u32(vreinterpret_u32_u64(acc_dc), 0) + nt) >> (log2nt_plus1);
953 dc_val_t = vdup_n_u8(dc_val);
954 two_dc_val = 2 * dc_val;
955 three_dc_val = 3 * dc_val;
956 three_dc_val += 2;
957
958 three_dc_val_t = vdupq_n_u16((WORD16)three_dc_val);
959 pu1_ref_tmp = pu1_ref + two_nt + 1 + 0;
960 pu1_dst_tmp = pu1_dst;
961
962
963 if(nt == 32)
964 {
965 for(row = 0; row < nt; row++)
966 {
967 for(col = nt; col > 0; col -= 8)
968 {
969 vst1_u8(pu1_dst_tmp, dc_val_t);
970 pu1_dst_tmp += 8;
971 }
972 pu1_dst_tmp += dst_strd - nt;
973 }
974 }
975 else
976
977 {
978 for(col = nt; col > 0; col -= 8)
979 {
980 ref_load1 = vld1_u8(pu1_ref_tmp);
981 pu1_ref_tmp += 8;
982 ref_load_q = vmovl_u8(ref_load1);
983 ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
984 ref_load_q = vshrq_n_u16(ref_load_q, 2);
985 sto_res_tmp = vmovn_u16(ref_load_q);
986 vst1_u8(pu1_dst_tmp, sto_res_tmp);
987 pu1_dst_tmp += 8;
988 }
989
990 pu1_ref_tmp = pu1_ref + two_nt - 9;
991 pu1_dst_tmp = pu1_dst + dst_strd;
992 col_count = nt - 8;
993
/* Except for the first row, the remaining rows are done here */
/* Both the column and the row loops have been unrolled by 8 */
/* The stores account for this unrolling */
/* Except for the 1st column of the remaining rows (other than the 1st row), the values are */
/* constant, hence they are filled with a duplicated constant value and stored */
/* If the width is greater than 8, the remaining values are also constant, which is */
/* handled in the inner for loop */
1001
1002 for(row = nt; row > 0; row -= 8)
1003 {
1004 pu1_dst_tmp1 = pu1_dst_tmp + 8;
1005 ref_load1 = vld1_u8(pu1_ref_tmp);
1006 pu1_ref_tmp -= 8;
1007 ref_load_q = vmovl_u8(ref_load1);
1008 ref_load_q = vaddq_u16(ref_load_q, three_dc_val_t);
1009 ref_load_q = vshrq_n_u16(ref_load_q, 2);
1010 sto_res_tmp = vmovn_u16(ref_load_q);
1011
1012 sto_res_tmp1 = vext_u8(sto_res_tmp, dc_val_t, 7);
1013
1014 sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 8));
1015 sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
1016 vst1_u8(pu1_dst_tmp, sto_res_tmp1);
1017 pu1_dst_tmp += dst_strd;
1018
1019 sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 16));
1020 sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
1021 vst1_u8(pu1_dst_tmp, sto_res_tmp2);
1022 pu1_dst_tmp += dst_strd;
1023
1024 sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 24));
1025 sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
1026 vst1_u8(pu1_dst_tmp, sto_res_tmp3);
1027 pu1_dst_tmp += dst_strd;
1028
1029 sto_res_tmp1 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 32));
1030 sto_res_tmp1 = vext_u8(sto_res_tmp1, dc_val_t, 7);
1031 vst1_u8(pu1_dst_tmp, sto_res_tmp4);
1032 pu1_dst_tmp += dst_strd;
1033
1034 sto_res_tmp2 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 40));
1035 sto_res_tmp2 = vext_u8(sto_res_tmp2, dc_val_t, 7);
1036 vst1_u8(pu1_dst_tmp, sto_res_tmp1);
1037 pu1_dst_tmp += dst_strd;
1038
1039 sto_res_tmp3 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 48));
1040 sto_res_tmp3 = vext_u8(sto_res_tmp3, dc_val_t, 7);
1041 vst1_u8(pu1_dst_tmp, sto_res_tmp2);
1042 pu1_dst_tmp += dst_strd;
1043
1044 sto_res_tmp4 = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(sto_res_tmp), 56));
1045 sto_res_tmp4 = vext_u8(sto_res_tmp4, dc_val_t, 7);
1046 vst1_u8(pu1_dst_tmp, sto_res_tmp3);
1047 pu1_dst_tmp += dst_strd;
1048 /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
1049 if(row != 8)
1050 vst1_u8(pu1_dst_tmp, sto_res_tmp4);
1051 pu1_dst_tmp += dst_strd;
1052
1053 for(col = col_count; col > 0; col -= 8)
1054 {
1055 pu1_dst_tmp2 = pu1_dst_tmp1;
1056 vst1_u8(pu1_dst_tmp1, dc_val_t);
1057 pu1_dst_tmp1 += dst_strd;
1058 vst1_u8(pu1_dst_tmp1, dc_val_t);
1059 pu1_dst_tmp1 += dst_strd;
1060 vst1_u8(pu1_dst_tmp1, dc_val_t);
1061 pu1_dst_tmp1 += dst_strd;
1062 vst1_u8(pu1_dst_tmp1, dc_val_t);
1063 pu1_dst_tmp1 += dst_strd;
1064 vst1_u8(pu1_dst_tmp1, dc_val_t);
1065 pu1_dst_tmp1 += dst_strd;
1066 vst1_u8(pu1_dst_tmp1, dc_val_t);
1067 pu1_dst_tmp1 += dst_strd;
1068 vst1_u8(pu1_dst_tmp1, dc_val_t);
1069 pu1_dst_tmp1 += dst_strd;
1070
1071 /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
1072 if(row != 8)
1073 vst1_u8(pu1_dst_tmp1, dc_val_t);
1074 pu1_dst_tmp1 = pu1_dst_tmp2 + 8;
1075 }
1076 }
1077 pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
1078 }
1079 }
/* Loops have been unrolled considering the fact that the width is a multiple of 4 */
1081 else
1082 {
1083 WORD32 acc_dc;
1084 two_nt = 2 * nt;
1085
1086 acc_dc = 0;
1087 pu1_ref_tmp = pu1_ref + nt + 1;
1088 for(i = nt; i < two_nt; i++)
1089 {
1090 acc_dc += pu1_ref[i];
1091 acc_dc += pu1_ref_tmp[i];
1092 }
1093 dc_val = (acc_dc + nt) >> (log2nt_plus1);
1094 two_dc_val = 2 * dc_val;
1095 three_dc_val = 3 * dc_val;
1096 three_dc_val = three_dc_val + 2;
1097 dc_val_t = vdup_n_u8(dc_val);
1098
1099 if(nt == 32)
1100 {
1101 pu1_dst_tmp = pu1_dst;
1102 for(row = 0; row < nt; row++)
1103 {
1104 for(col = nt; col > 0; col -= 4)
1105 {
1106 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
1107 pu1_dst_tmp += 4;
1108 }
1109 pu1_dst_tmp += dst_strd - nt;
1110 }
1111 }
1112 else
1113
1114 {
1115 for(col = 1; col < nt; col++)
1116 {
1117 pu1_dst[col] = (pu1_ref[two_nt + 1 + col] + three_dc_val) >> 2;
1118 }
1119
1120 pu1_dst_tmp = pu1_dst + dst_strd + 0;
1121 /* Since first row is already updated before, loop count is nt-1 */
1122 for(row = nt - 1; row > 0; row -= 1)
1123 {
1124 for(col = nt; col > 0; col -= 4)
1125 {
1126 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(dc_val_t), 0);
1127 pu1_dst_tmp += 4;
1128 }
1129 pu1_dst_tmp += dst_strd - nt;
1130 }
1131
1132 for(row = 1; row < nt; row++)
1133 {
1134 pu1_dst[row * dst_strd] = (pu1_ref[two_nt - 1 - row] + three_dc_val) >> 2;
1135 }
1136 pu1_dst[0] = (pu1_ref[two_nt - 1] + two_dc_val + pu1_ref[two_nt + 1] + 2) >> 2;
1137 }
1138 }
1139 }
1140 /* INTRA_PRED_LUMA_DC */
1141
1142 /**
1143 *******************************************************************************
1144 *
1145 * @brief
1146 * Intra prediction interpolation filter for horizontal luma variable.
1147 *
1148 * @par Description:
1149 * Horizontal intraprediction with reference neighboring samples location
1150 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
1151 *
1152 * @param[in] pu1_src
1153 * UWORD8 pointer to the source
1154 *
1155 * @param[out] pu1_dst
1156 * UWORD8 pointer to the destination
1157 *
1158 * @param[in] src_strd
1159 * integer source stride
1160 *
1161 * @param[in] dst_strd
1162 * integer destination stride
1163 *
1164 * @param[in] nt
1165 * integer Transform Block size
1166 *
1167 * @param[in] wd
1168 * integer width of the array
1169 *
1170 * @returns
1171 *
1172 * @remarks
1173 * None
1174 *
1175 *******************************************************************************
1176 */
1177
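/* A scalar sketch of the horizontal prediction computed below (illustrative only).
 * Every row is filled with its left reference sample; for nt < 32 the first row is
 * additionally gradient-filtered against the top row and clipped to [0, 255]:
 *
 *     for(col = 0; col < nt; col++)
 *     {
 *         WORD32 s2_predpixel = pu1_ref[two_nt - 1]
 *                             + ((pu1_ref[two_nt + 1 + col] - pu1_ref[two_nt]) >> 1);
 *         pu1_dst[col] = (UWORD8)(s2_predpixel < 0 ? 0 : (s2_predpixel > 255 ? 255 : s2_predpixel));
 *     }
 *     for(row = 1; row < nt; row++)
 *         for(col = 0; col < nt; col++)
 *             pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 1 - row];
 */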
void ihevc_intra_pred_luma_horz_neonintr(UWORD8 *pu1_ref,
1179 WORD32 src_strd,
1180 UWORD8 *pu1_dst,
1181 WORD32 dst_strd,
1182 WORD32 nt,
1183 WORD32 mode)
1184 {
1185
1186 WORD32 row, col;
1187 WORD32 two_nt;
1188 UNUSED(src_strd);
1189 UNUSED(mode);
1190
1191 two_nt = 2 * nt;
1192
1193
1194 UWORD8 *pu1_dst_tmp = pu1_dst;
1195 UWORD32 pu1_val;
1196 uint8x8_t pu1_val_two_nt_1_row;
1197 if(nt == 32)
1198 {
1199 pu1_dst_tmp = pu1_dst;
1200 for(row = 0; row < nt; row++)
1201 {
1202 pu1_val = pu1_ref[two_nt - 1 - row];
1203 pu1_val_two_nt_1_row = vdup_n_u8(pu1_val);
1204 for(col = nt; col > 0; col -= 8)
1205 {
1206 vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_row);
1207 pu1_dst_tmp += 8;
1208 }
1209 pu1_dst_tmp += dst_strd - nt;
1210 }
1211 }
1212 else
1213
1214
/* The row loop has been unrolled, hence the pu1_ref_val1 and pu1_ref_val2 variables */
/* Variables are named according to the operations (instructions) they perform */
/* (e.g. shift_val contains the shifted value, */
/* add_sat the add-and-saturate result) */
/* Loops are unrolled by 4 and 8 considering the fact that the input width is a multiple of either 4 or 8 */
/* Rows and columns are unrolled by 4 when the width is a multiple of 4 */
1221 {
1222 if(0 != (nt & 7)) /* cond for multiple of 4 */
1223 {
1224 UWORD8 *pu1_ref_4_two_nt_plus1 = pu1_ref;
1225 UWORD8 *pu1_ref_4_two_nt_minus_nt = pu1_ref;
1226 UWORD8 *pu1_dst_4 = pu1_dst;
1227 UWORD8 *pu1_dst_4_tmp = pu1_dst;
1228
1229 uint32x2_t pu1_ref_val1, pu1_ref_val2;
1230 uint8x8_t dup_sub, round_val, dup_val;
1231 uint16x8_t dup_add, sub_val;
1232 int16x8_t shift_val, add_sat;
1233
1234 pu1_ref_val1 = vdup_n_u32(0);
1235 pu1_ref_val2 = vdup_n_u32(0);
1236
1237 dup_sub = vdup_n_u8(pu1_ref[two_nt]);
1238
1239 dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);
1240
1241 pu1_ref_4_two_nt_plus1 += (two_nt + 1);
1242
1243 pu1_ref_4_two_nt_minus_nt += (two_nt - nt);
1244
1245 for(row = nt; row > 0; row -= 4)
1246 {
1247 for(col = nt; col > 0; col -= 4)
1248 {
1249 pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_plus1, pu1_ref_val1, 0);
1250 sub_val = vsubl_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_sub);
1251 shift_val = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
1252
1253 add_sat = vqaddq_s16(shift_val, vreinterpretq_s16_u16(dup_add));
1254 round_val = vqmovun_s16(add_sat);
1255 vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(round_val), 0);
1256 pu1_dst_4 += dst_strd;
1257
1258 pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_4_two_nt_minus_nt, pu1_ref_val2, 0);
1259 dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 2);
1260 vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
1261 pu1_dst_4 += dst_strd;
1262
1263 dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 1);
1264 vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
1265 pu1_dst_4 += dst_strd;
1266
1267 dup_val = vdup_lane_u8(vreinterpret_u8_u32(pu1_ref_val2), 0);
1268 vst1_lane_u32((uint32_t *)pu1_dst_4, vreinterpret_u32_u8(dup_val), 0);
1269 pu1_dst_4 += dst_strd;
1270
1271
1272 }
1273 /* worst cases */
1274 pu1_ref_4_two_nt_minus_nt += 3;
1275 pu1_ref_4_two_nt_plus1 += 4;
1276 pu1_dst_4 = (pu1_dst_4_tmp + 4);
1277 }
1278
1279 }
1280
/* dup_1 - dup_8 hold the values duplicated from the loaded source */
/* Variables are named according to the operations (instructions) they perform */
/* Loops are unrolled by 4 and 8 considering the fact that the input width is a multiple of either 4 or 8 */
/* Rows and columns are unrolled by 8 when the width is a multiple of 8 */
1285
1286 else
1287 {
1288 UWORD8 *pu1_ref_tmp_1 = pu1_ref;
1289 UWORD8 *pu1_ref_tmp_2 = pu1_ref;
1290
1291 UWORD8 *pu1_dst_tmp_1 = pu1_dst;
1292 UWORD8 *pu1_dst_tmp_2 = pu1_dst + dst_strd;
1293 UWORD8 *pu1_dst_tmp_3 = pu1_dst + dst_strd;
1294
1295 uint8x8_t dup_sub, src_tmp, src_tmp_1, round_val, dup_1, dup_2, dup_3, dup_4, dup_5, dup_6, dup_7, dup_8, rev_res;
1296 uint16x8_t sub_res, dup_add;
1297 int16x8_t shift_res, add_res;
1298
1299 dup_sub = vdup_n_u8(pu1_ref[two_nt]);
1300 dup_add = vdupq_n_u16(pu1_ref[two_nt - 1]);
1301
1302 pu1_ref_tmp_1 += (two_nt + 1);
1303 pu1_ref_tmp_2 += (two_nt - 1);
1304
1305 for(col = nt; col > 0; col -= 8)
1306 {
1307 src_tmp = vld1_u8(pu1_ref_tmp_1);
1308 pu1_ref_tmp_1 += 8;
1309
1310 sub_res = vsubl_u8(src_tmp, dup_sub);
1311 shift_res = vshrq_n_s16(vreinterpretq_s16_u16(sub_res), 1);
1312 add_res = vqaddq_s16(shift_res, vreinterpretq_s16_u16(dup_add));
1313 round_val = vqmovun_s16(add_res);
1314 vst1_u8(pu1_dst_tmp_1, round_val);
1315 pu1_dst_tmp_1 += 8;
1316 }
1317
1318 for(row = nt; row > 0; row -= 8)
1319 {
1320 pu1_ref_tmp_2 -= 8;
1321
1322 src_tmp_1 = vld1_u8(pu1_ref_tmp_2);
1323 rev_res = vrev64_u8(src_tmp_1); /* Reversing the loaded values */
1324
1325 dup_1 = vdup_lane_u8(rev_res, 0);
1326 dup_2 = vdup_lane_u8(rev_res, 1);
1327 dup_3 = vdup_lane_u8(rev_res, 2);
1328 dup_4 = vdup_lane_u8(rev_res, 3);
1329 dup_5 = vdup_lane_u8(rev_res, 4);
1330 dup_6 = vdup_lane_u8(rev_res, 5);
1331 dup_7 = vdup_lane_u8(rev_res, 6);
1332 dup_8 = vdup_lane_u8(rev_res, 7);
1333
1334 for(col = nt; col > 0; col -= 8)
1335 {
1336 pu1_dst_tmp_2 = pu1_dst_tmp_3;
1337
1338 vst1_u8(pu1_dst_tmp_2, dup_1);
1339 pu1_dst_tmp_2 += dst_strd;
1340
1341 vst1_u8(pu1_dst_tmp_2, dup_2);
1342 pu1_dst_tmp_2 += dst_strd;
1343
1344 vst1_u8(pu1_dst_tmp_2, dup_3);
1345 pu1_dst_tmp_2 += dst_strd;
1346
1347 vst1_u8(pu1_dst_tmp_2, dup_4);
1348 pu1_dst_tmp_2 += dst_strd;
1349
1350 vst1_u8(pu1_dst_tmp_2, dup_5);
1351 pu1_dst_tmp_2 += dst_strd;
1352
1353 vst1_u8(pu1_dst_tmp_2, dup_6);
1354 pu1_dst_tmp_2 += dst_strd;
1355
1356 vst1_u8(pu1_dst_tmp_2, dup_7);
1357 pu1_dst_tmp_2 += dst_strd;
1358
1359 /* For last set of 8 rows only 7 rows need to be updated since first row is already written */
1360 if(row != 8)
1361 vst1_u8(pu1_dst_tmp_2, dup_8);
1362 pu1_dst_tmp_2 += dst_strd;
1363
1364 pu1_dst_tmp_3 += 8;
1365 }
1366 pu1_dst_tmp_2 -= (nt - 8);
1367 pu1_dst_tmp_3 = pu1_dst_tmp_2;
1368 }
1369 }
1370 }
1371 }
1372 /* INTRA_PRED_LUMA_HORZ */
1373
1374 /**
1375 *******************************************************************************
1376 *
1377 * @brief
1378 * Intra prediction interpolation filter for vertical luma variable.
1379 *
1380 * @par Description:
* Vertical intraprediction with reference neighboring samples location
1382 * pointed by 'pu1_ref' to the TU block location pointed by 'pu1_dst'
1383 *
1384 * @param[in] pu1_src
1385 * UWORD8 pointer to the source
1386 *
1387 * @param[out] pu1_dst
1388 * UWORD8 pointer to the destination
1389 *
1390 * @param[in] src_strd
1391 * integer source stride
1392 *
1393 * @param[in] dst_strd
1394 * integer destination stride
1395 *
1396 * @param[in] nt
1397 * integer Transform Block size
1398 *
1399 * @param[in] wd
1400 * integer width of the array
1401 *
1402 * @returns
1403 *
1404 * @remarks
1405 * None
1406 *
1407 *******************************************************************************
1408 */
1409
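/* A scalar sketch of the vertical prediction computed below (illustrative only).
 * Every column is filled with its top reference sample; for nt < 32 the first column is
 * additionally gradient-filtered against the left column and clipped to [0, 255]:
 *
 *     for(row = 0; row < nt; row++)
 *     {
 *         WORD32 s2_predpixel = pu1_ref[two_nt + 1]
 *                             + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1);
 *         pu1_dst[row * dst_strd] = (UWORD8)(s2_predpixel < 0 ? 0 : (s2_predpixel > 255 ? 255 : s2_predpixel));
 *         for(col = 1; col < nt; col++)
 *             pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col];
 *     }
 */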
void ihevc_intra_pred_luma_ver_neonintr(UWORD8 *pu1_ref,
1411 WORD32 src_strd,
1412 UWORD8 *pu1_dst,
1413 WORD32 dst_strd,
1414 WORD32 nt,
1415 WORD32 mode)
1416 {
1417 WORD32 row, col;
1418 WORD32 two_nt;
1419 UNUSED(src_strd);
1420 UNUSED(mode);
1421
1422 two_nt = 2 * nt;
1423
1424 UWORD8 *pu1_dst_tmp = pu1_dst;
1425 UWORD8 *pu1_ref_tmp_1 = pu1_ref + two_nt + 1;
1426 uint8x8_t pu1_val_two_nt_1_col;
1427 if(nt == 32)
1428 {
1429 pu1_dst_tmp = pu1_dst;
1430 for(row = 0; row < nt; row++)
1431 {
1432 for(col = nt; col > 0; col -= 8)
1433 {
1434 pu1_val_two_nt_1_col = vld1_u8(pu1_ref_tmp_1);
1435 pu1_ref_tmp_1 += 8;
1436 vst1_u8(pu1_dst_tmp, pu1_val_two_nt_1_col);
1437 pu1_dst_tmp += 8;
1438 }
1439 pu1_ref_tmp_1 -= nt;
1440 pu1_dst_tmp += dst_strd - nt;
1441 }
1442 }
1443 else
1444
1445 {
/* Variables are named according to the operations (instructions) they perform */
/* (e.g. shift_val contains the shifted value, */
/* add_sat the add-and-saturate result) */
/* Loops are unrolled by 4 and 8 considering the fact that the input width is a multiple of either 4 or 8 */
/* Rows and columns are unrolled by 4 when the width is a multiple of 4 */
1451
1452 if(0 != (nt & 7))
1453 {
1454 WORD32 cond_4 = 0;
1455 UWORD8 *pu1_ref_val1 = pu1_ref;
1456 UWORD8 *pu1_ref_val2 = pu1_ref;
1457 UWORD8 *pu1_ref_val3 = pu1_ref;
1458
1459 UWORD8 *pu1_dst_val1 = pu1_dst;
1460 UWORD8 *pu1_dst_val2 = pu1_dst;
1461 UWORD8 *pu1_dst_val3 = pu1_dst;
1462
1463 uint8x8_t dup_2_sub, round_val, vext_val;
1464 uint16x8_t dup_2_add;
1465 uint32x2_t src_val1, src_val2, src_val3;
1466 uint16x8_t sub_val;
1467 int16x8_t shift_val1, add_sat;
1468 uint64x1_t shift_val2;
1469
1470 src_val1 = vdup_n_u32(0);
1471 src_val2 = vdup_n_u32(0);
1472 src_val3 = vdup_n_u32(0);
1473 pu1_ref_val1 += (two_nt - nt);
1474 pu1_ref_val3 += (two_nt + 2);
1475 pu1_ref_val2 += (two_nt + 1);
1476
1477 dup_2_sub = vdup_n_u8(pu1_ref[two_nt]);
1478 dup_2_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
1479
1480 /* loops to store the first nt sets of values in the destination */
1481
1482 for(row = nt; row > 0; row -= 4)
1483 {
1484 for(col = nt; (col > 0) && (cond_4 == 0); col -= 4)
1485 {
1486 /* unrolling s2_predpixel = pu1_ref[two_nt + 1] + ((pu1_ref[two_nt - 1 - row] - pu1_ref[two_nt]) >> 1); here*/
1487 src_val1 = vld1_lane_u32((uint32_t *)pu1_ref_val1, src_val1, 1);
1488 sub_val = vsubl_u8(vreinterpret_u8_u32(src_val1), dup_2_sub);
1489 shift_val1 = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
1490 add_sat = vqaddq_s16(shift_val1, vreinterpretq_s16_u16(dup_2_add));
1491 round_val = vqmovun_s16(add_sat);
1492
1493 /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/
1494 src_val2 = vld1_lane_u32((uint32_t *)pu1_ref_val3, src_val2, 0);
1495 vext_val = vext_u8(round_val, vreinterpret_u8_u32(src_val2), 7);
1496 vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
1497 pu1_dst_val1 += dst_strd;
1498
1499 shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
1500
1501 vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
1502 vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
1503 pu1_dst_val1 += dst_strd;
1504
1505 shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
1506
1507 vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
1508 vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
1509 pu1_dst_val1 += dst_strd;
1510
1511 shift_val2 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
1512
1513 vext_val = vext_u8(vreinterpret_u8_u64(shift_val2), vreinterpret_u8_u32(src_val2), 7);
1514 vst1_lane_u32((uint32_t *)pu1_dst_val1, vreinterpret_u32_u8(vext_val), 0);
1515 pu1_dst_val1 += dst_strd;
1516
1517 pu1_ref_val1 -= 4;
1518 }
1519
1520 /* loop to store next sets of eight values in the destination */
1521
1522 for(col = nt - 3; (col > 0) && (cond_4 == 1); col -= 4)
1523 {
1524 src_val3 = vld1_lane_u32((uint32_t *)pu1_ref_val2, src_val3, 0);
1525
1526 vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
1527 pu1_dst_val2 += dst_strd;
1528
1529 vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
1530 pu1_dst_val2 += dst_strd;
1531
1532 vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
1533 pu1_dst_val2 += dst_strd;
1534
1535 vst1_u8(pu1_dst_val2, vreinterpret_u8_u32(src_val3));
1536 pu1_dst_val2 += dst_strd;
1537 }
1538 pu1_ref_val2 += 4;
1539 pu1_dst_val3 += 4;
1540 pu1_dst_val2 = pu1_dst_val3;
1541 cond_4 = 1;
1542 }
1543 }
1544
1545 /* rows and columns are unrolled by 8, when the width is multiple of 8 */
1546 else
1547 {
1548 WORD32 cond = 0, col_1;
1549 UWORD8 *pu1_dst_tmp_1 = pu1_dst;
1550 UWORD8 *pu1_dst_tmp_2 = pu1_dst;
1551 UWORD8 *pu1_dst_tmp_3 = pu1_dst;
1552
1553 UWORD8 *pu1_ref_tmp_1 = pu1_ref;
1554 UWORD8 *pu1_ref_tmp_2 = pu1_ref;
1555 UWORD8 *pu1_ref_tmp_3 = pu1_ref;
1556
1557 uint8x8_t pu1_src_tmp1;
1558 uint8x8_t pu1_src_tmp2;
1559
1560 uint8x8_t dup_sub;
1561 uint16x8_t dup_add;
1562 int16x8_t subsh_val;
1563 int16x8_t addsat_val;
1564 uint16x8_t sub_val;
1565 uint8x8_t round_val;
1566 uint8x8_t vext_t;
1567 uint64x1_t shift_64;
1568
1569 dup_sub = vdup_n_u8(pu1_ref[two_nt]);
1570 dup_add = vdupq_n_u16(pu1_ref[two_nt + 1]);
1571
1572 pu1_ref_tmp_1 += (two_nt);
1573 pu1_ref_tmp_1 -= 8;
1574 pu1_ref_tmp_2 += (two_nt + 2);
1575 pu1_ref_tmp_3 += (two_nt + 1);
1576
1577 /* loops to store the first nt sets of values in the destination */
1578
1579 for(row = nt; row > 0; row -= 8)
1580 {
1581 for(col = (nt - 1); (col > 0) && (cond == 0); col -= 8)
1582 {
1583 pu1_src_tmp1 = vld1_u8(pu1_ref_tmp_1);
1584
1585 sub_val = vsubl_u8(pu1_src_tmp1, dup_sub);
1586 subsh_val = vshrq_n_s16(vreinterpretq_s16_u16(sub_val), 1);
1587 addsat_val = vqaddq_s16(subsh_val, vreinterpretq_s16_u16(dup_add));
1588 round_val = vqmovun_s16(addsat_val);
1589
1590 /* unrolling pu1_dst[row * dst_strd + col] = pu1_ref[two_nt + 1 + col]; here*/
1591
1592 pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_2);
1593 vext_t = vext_u8(round_val, pu1_src_tmp2, 7);
1594 vst1_u8(pu1_dst_tmp_1, vext_t);
1595 pu1_dst_tmp_1 += dst_strd;
1596
1597 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 8);
1598
1599 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1600 vst1_u8(pu1_dst_tmp_1, vext_t);
1601 pu1_dst_tmp_1 += dst_strd;
1602
1603 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 16);
1604 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1605 vst1_u8(pu1_dst_tmp_1, vext_t);
1606 pu1_dst_tmp_1 += dst_strd;
1607
1608 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 24);
1609 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1610 vst1_u8(pu1_dst_tmp_1, vext_t);
1611 pu1_dst_tmp_1 += dst_strd;
1612
1613 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 32);
1614 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1615 vst1_u8(pu1_dst_tmp_1, vext_t);
1616 pu1_dst_tmp_1 += dst_strd;
1617
1618 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 40);
1619 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1620 vst1_u8(pu1_dst_tmp_1, vext_t);
1621 pu1_dst_tmp_1 += dst_strd;
1622
1623 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 48);
1624 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1625 vst1_u8(pu1_dst_tmp_1, vext_t);
1626 pu1_dst_tmp_1 += dst_strd;
1627
1628 shift_64 = vshl_n_u64(vreinterpret_u64_u8(round_val), 56);
1629 vext_t = vext_u8(vreinterpret_u8_u64(shift_64), pu1_src_tmp2, 7);
1630 vst1_u8(pu1_dst_tmp_1, vext_t);
1631 pu1_dst_tmp_1 += dst_strd;
1632
1633 pu1_ref_tmp_1 -= 8;
1634 }
1635
1636 /* loop to store next sets of eight values in the destination */
1637
1638 for(col_1 = nt - 7; (col_1 > 0) && (cond == 1); col_1 -= 8)
1639 {
1640 pu1_src_tmp2 = vld1_u8(pu1_ref_tmp_3);
1641
1642 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1643 pu1_dst_tmp_2 += dst_strd;
1644
1645 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1646 pu1_dst_tmp_2 += dst_strd;
1647
1648 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1649 pu1_dst_tmp_2 += dst_strd;
1650
1651 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1652 pu1_dst_tmp_2 += dst_strd;
1653
1654 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1655 pu1_dst_tmp_2 += dst_strd;
1656
1657 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1658 pu1_dst_tmp_2 += dst_strd;
1659
1660 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1661 pu1_dst_tmp_2 += dst_strd;
1662
1663 vst1_u8(pu1_dst_tmp_2, pu1_src_tmp2);
1664 pu1_dst_tmp_2 += dst_strd;
1665 }
1666 pu1_ref_tmp_3 += 8;
1667 pu1_dst_tmp_3 += 8;
1668 pu1_dst_tmp_2 = pu1_dst_tmp_3;
1669 cond = 1;
1670 }
1671 }
1672 }
1673 }
1674 /* INTRA_PRED_LUMA_VER */
1675
1676 /**
1677 *******************************************************************************
1678 *
1679 * @brief
1680 * Intra prediction interpolation filter for luma mode2.
1681 *
1682 * @par Description:
1683 * Intraprediction for mode 2 (sw angle) with reference neighboring samples
1684 * location pointed by 'pu1_ref' to the TU block location pointed by
1685 * 'pu1_dst'
1686 *
1687 * @param[in] pu1_src
1688 * UWORD8 pointer to the source
1689 *
1690 * @param[out] pu1_dst
1691 * UWORD8 pointer to the destination
1692 *
1693 * @param[in] src_strd
1694 * integer source stride
1695 *
1696 * @param[in] dst_strd
1697 * integer destination stride
1698 *
1699 * @param[in] nt
1700 * integer Transform Block size
1701 *
1702 * @param[in] wd
1703 * integer width of the array
1704 *
1705 * @returns
1706 *
1707 * @remarks
1708 * None
1709 *
1710 *******************************************************************************
1711 */
1712
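/* A scalar sketch of the mode 2 prediction computed below (illustrative only): with an
 * angle of +32, each predicted row reads the left reference one sample further down than
 * the row above, which reduces to
 *
 *     pu1_dst[row * dst_strd + col] = pu1_ref[two_nt - 2 - row - col];
 */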
void ihevc_intra_pred_luma_mode2_neonintr(UWORD8 *pu1_ref,
1714 WORD32 src_strd,
1715 UWORD8 *pu1_dst,
1716 WORD32 dst_strd,
1717 WORD32 nt,
1718 WORD32 mode)
1719 {
1720
1721 WORD32 row, col;
1722 WORD32 two_nt;
1723 UNUSED(src_strd);
1724 UNUSED(mode);
1725
/* rev_res is named so because it holds the reversed result value */
/* Loops are unrolled by 4 and 8 considering the fact that the input width is a multiple of either 4 or 8 */
/* Rows and columns are unrolled by 4 when the width is a multiple of 4 */
1729
1730 if(0 != (nt & 7))
1731 {
1732 UWORD8 *pu1_ref_tmp = pu1_ref;
1733 UWORD8 *pu1_dst_tmp = pu1_dst;
1734 uint8x8_t pu1_src_val, rev_res;
1735 uint64x1_t shift_res;
1736
1737 for(col = nt; col > 0; col -= 4)
1738 {
1739 for(row = nt; row > 0; row -= 4)
1740 {
1741 /* unrolling all col & rows for pu1_dst[row + (col * dst_strd)] = pu1_ref[two_nt - col - idx - 1]; */
1742
1743 pu1_src_val = vld1_u8(pu1_ref_tmp);
1744 shift_res = vshl_n_u64(vreinterpret_u64_u8(pu1_src_val), 8);
1745 rev_res = vrev64_u8(vreinterpret_u8_u64(shift_res));
1746
1747 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(rev_res), 0);
1748 pu1_dst_tmp += dst_strd;
1749
1750 shift_res = vshr_n_u64(vreinterpret_u64_u8(rev_res), 8);
1751 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
1752 pu1_dst_tmp += dst_strd;
1753
1754 shift_res = vshr_n_u64(shift_res, 8);
1755 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
1756 pu1_dst_tmp += dst_strd;
1757
1758 shift_res = vshr_n_u64(shift_res, 8);
1759 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u64(shift_res), 0);
1760 pu1_dst_tmp += dst_strd;
1761 }
1762 }
1763 }
1764
/* rev_val_second and rev_val_first reverse the loaded values to get them in the right order */
/* shift_64 shifts the reversed 2nd values to obtain the value that is needed */
/* Rows and columns are unrolled by 8 when the width is a multiple of 8 */
1768
1769 else
1770 {
1771 UWORD8 *pu1_ref_two_nt_minus2 = pu1_ref;
1772 UWORD8 *pu1_dst_tmp = pu1_dst;
1773 UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
1774
1775 uint8x8_t pu1_src_val1, pu1_src_val2, vext_t, rev_val_second, rev_val_first;
1776 uint64x1_t shift_val;
1777
1778 two_nt = 2 * nt;
1779 pu1_ref_two_nt_minus2 += (two_nt);
1780 pu1_ref_two_nt_minus2 -= 8;
1781
1782 for(col = nt; col > 0; col -= 8)
1783 {
1784 for(row = nt; row > 0; row -= 8)
1785 {
1786 pu1_src_val2 = vld1_u8(pu1_ref_two_nt_minus2);
1787 rev_val_first = vrev64_u8(pu1_src_val2);
1788
1789 pu1_ref_two_nt_minus2 -= 8;
1790 pu1_src_val1 = vld1_u8(pu1_ref_two_nt_minus2);
1791 rev_val_second = vrev64_u8(pu1_src_val1);
1792
1793 vext_t = vext_u8(rev_val_first, rev_val_second, 1);
1794 vst1_u8(pu1_dst_tmp, vext_t);
1795 pu1_dst_tmp += dst_strd;
1796
1797 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 8);
1798 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1799 vst1_u8(pu1_dst_tmp, vext_t);
1800 pu1_dst_tmp += dst_strd;
1801
1802 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 16);
1803 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1804 vst1_u8(pu1_dst_tmp, vext_t);
1805 pu1_dst_tmp += dst_strd;
1806
1807 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 24);
1808 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1809 vst1_u8(pu1_dst_tmp, vext_t);
1810 pu1_dst_tmp += dst_strd;
1811
1812 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 32);
1813 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1814 vst1_u8(pu1_dst_tmp, vext_t);
1815 pu1_dst_tmp += dst_strd;
1816
1817 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 40);
1818 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1819 vst1_u8(pu1_dst_tmp, vext_t);
1820 pu1_dst_tmp += dst_strd;
1821
1822 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 48);
1823 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1824 vst1_u8(pu1_dst_tmp, vext_t);
1825 pu1_dst_tmp += dst_strd;
1826
1827 shift_val = vshr_n_u64(vreinterpret_u64_u8(rev_val_second), 56);
1828 vext_t = vext_u8(vext_t, vreinterpret_u8_u64(shift_val), 1);
1829 vst1_u8(pu1_dst_tmp, vext_t);
1830 pu1_dst_tmp += dst_strd;
1831 }
1832 pu1_dst_tmp_plus8 += 8;
1833 pu1_dst_tmp = pu1_dst_tmp_plus8;
1834 pu1_ref_two_nt_minus2 += (nt - 8);
1835 }
1836 }
1837 }
1838 /* INTRA_PRED_LUMA_MODE2 */
1839
1840 /**
1841 *******************************************************************************
1842 *
1843 * @brief
1844 * Intra prediction interpolation filter for luma mode 18 & mode 34.
1845 *
1846 * @par Description:
1847 * Intraprediction for mode 18 and mode 34 (the two 45-degree angles) with
1848 * reference neighboring samples location pointed by 'pu1_ref' to the TU
1849 * block location pointed by 'pu1_dst'
1850 *
1851 * @param[in] pu1_ref
1852 * UWORD8 pointer to the reference samples
1853 *
1854 * @param[out] pu1_dst
1855 * UWORD8 pointer to the destination
1856 *
1857 * @param[in] src_strd
1858 * integer source stride
1859 *
1860 * @param[in] dst_strd
1861 * integer destination stride
1862 *
1863 * @param[in] nt
1864 * integer Transform Block size
1865 *
1866 * @param[in] mode
1867 * integer intraprediction mode
1868 *
1869 * @returns
1870 *
1871 * @remarks
1872 * None
1873 *
1874 *******************************************************************************
1875 */
1876
1877 void ihevc_intra_pred_luma_mode_18_34_neonintr(UWORD8 *pu1_ref,
1878 WORD32 src_strd,
1879 UWORD8 *pu1_dst,
1880 WORD32 dst_strd,
1881 WORD32 nt,
1882 WORD32 mode)
1883 {
1884
1885 WORD32 row, col, idx;
1886 WORD32 intraPredAngle = 32;
1887 WORD32 two_nt;
1888 UNUSED(src_strd);
1889 two_nt = 2 * nt;
1890
1891 UWORD8 *pu1_ref_tmp = pu1_ref;
1892 UWORD8 *pu1_ref_tmp1 = pu1_ref;
1893 UWORD8 *pu1_dst_tmp = pu1_dst;
1894 UWORD8 *pu1_dst_tmp_plus8 = pu1_dst;
1895
1896 uint8x8_t src_tmp_1st, src_tmp_2nd, vext1, vext2, vext3, vext4, vext5, vext6, vext7;
1897
1898 /* src_tmp_1st, src_tmp_2nd hold the 1st eight and the next eight values loaded from the source (pu1_ref) */
1899 /* vext1 - vext7 hold the vext results between the 2 loaded vectors and help the compiler dual-issue */
1900 /* Loops are unrolled by 4 and by 8, since the input width is a multiple of either 4 or 8 */
1901 /* rows and columns are unrolled by 8 when the width is a multiple of 8 */
1902 /* separate loops are maintained for mode 18 and mode 34 */
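/* For reference, the scalar form the unrolled copies below amount to (a sketch; for the */
/* pure 45-degree angles the offset idx is simply +/-(row + 1) and fract is always 0):   */
/*     mode 34: pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + row + 2]        */
/*     mode 18: pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col - row]            */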
1903
1904 /* cond to allow multiples of 8 */
1905 if(0 == (nt & 7))
1906 {
1907 if(mode == 34)
1908 {
1909 pu1_ref_tmp += (two_nt + 2);
1910
1911 for(row = nt; row > 0; row -= 8)
1912 {
1913 for(col = nt; col > 0; col -= 8)
1914 {
1915 /* Loading 1st eight values */
1916 src_tmp_1st = vld1_u8(pu1_ref_tmp);
1917 pu1_ref_tmp += 8;
1918
1919 /* Loading next eight values */
1920 src_tmp_2nd = vld1_u8(pu1_ref_tmp);
1921
1922 /* UNROLLED pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
1923 vext1 = vext_u8(src_tmp_1st, src_tmp_2nd, 1);
1924 vst1_u8(pu1_dst_tmp, src_tmp_1st);
1925 pu1_dst_tmp += dst_strd;
1926
1927 vext2 = vext_u8(src_tmp_1st, src_tmp_2nd, 2);
1928 vst1_u8(pu1_dst_tmp, vext1);
1929 pu1_dst_tmp += dst_strd;
1930
1931 vext3 = vext_u8(src_tmp_1st, src_tmp_2nd, 3);
1932 vst1_u8(pu1_dst_tmp, vext2);
1933 pu1_dst_tmp += dst_strd;
1934
1935 vext4 = vext_u8(src_tmp_1st, src_tmp_2nd, 4);
1936 vst1_u8(pu1_dst_tmp, vext3);
1937 pu1_dst_tmp += dst_strd;
1938
1939 vext5 = vext_u8(src_tmp_1st, src_tmp_2nd, 5);
1940 vst1_u8(pu1_dst_tmp, vext4);
1941 pu1_dst_tmp += dst_strd;
1942
1943 vext6 = vext_u8(src_tmp_1st, src_tmp_2nd, 6);
1944 vst1_u8(pu1_dst_tmp, vext5);
1945 pu1_dst_tmp += dst_strd;
1946
1947 vext7 = vext_u8(src_tmp_1st, src_tmp_2nd, 7);
1948 vst1_u8(pu1_dst_tmp, vext6);
1949 pu1_dst_tmp += dst_strd;
1950
1951 vst1_u8(pu1_dst_tmp, vext7);
1952 pu1_dst_tmp += dst_strd;
1953 }
1954
1955 pu1_dst_tmp_plus8 += 8;
1956 pu1_dst_tmp = pu1_dst_tmp_plus8;
1957 pu1_ref_tmp -= (nt - 8);
1958 }
1959 }
1960 else /* Loop for mode 18 */
1961 {
1962 pu1_ref_tmp += (two_nt);
1963
1964 for(row = nt; row > 0; row -= 8)
1965 {
1966 for(col = nt; col > 0; col -= 8)
1967 {
1968 /* Loading 1st eight values */
1969 src_tmp_1st = vld1_u8(pu1_ref_tmp);
1970 pu1_ref_tmp -= 8;
1971
1972 /* Loading next eight values */
1973 src_tmp_2nd = vld1_u8(pu1_ref_tmp);
1974
1975 /* UNROLLED pu1_dst[col + (row * dst_strd)] = pu1_ref[two_nt + col + idx + 1] */
1976 vext1 = vext_u8(src_tmp_2nd, src_tmp_1st, 7);
1977 vst1_u8(pu1_dst_tmp, src_tmp_1st);
1978 pu1_dst_tmp += dst_strd;
1979
1980 vext2 = vext_u8(src_tmp_2nd, src_tmp_1st, 6);
1981 vst1_u8(pu1_dst_tmp, vext1);
1982 pu1_dst_tmp += dst_strd;
1983
1984 vext3 = vext_u8(src_tmp_2nd, src_tmp_1st, 5);
1985 vst1_u8(pu1_dst_tmp, vext2);
1986 pu1_dst_tmp += dst_strd;
1987
1988 vext4 = vext_u8(src_tmp_2nd, src_tmp_1st, 4);
1989 vst1_u8(pu1_dst_tmp, vext3);
1990 pu1_dst_tmp += dst_strd;
1991
1992 vext5 = vext_u8(src_tmp_2nd, src_tmp_1st, 3);
1993 vst1_u8(pu1_dst_tmp, vext4);
1994 pu1_dst_tmp += dst_strd;
1995
1996 vext6 = vext_u8(src_tmp_2nd, src_tmp_1st, 2);
1997 vst1_u8(pu1_dst_tmp, vext5);
1998 pu1_dst_tmp += dst_strd;
1999
2000 vext7 = vext_u8(src_tmp_2nd, src_tmp_1st, 1);
2001 vst1_u8(pu1_dst_tmp, vext6);
2002 pu1_dst_tmp += dst_strd;
2003
2004 vst1_u8(pu1_dst_tmp, vext7);
2005 pu1_dst_tmp += dst_strd;
2006 }
2007 pu1_dst_tmp_plus8 += 8;
2008 pu1_dst_tmp = pu1_dst_tmp_plus8;
2009 pu1_ref_tmp += (nt + 8);
2010 }
2011 }
2012 }
2013
2014 /* rows and columns are unrolled by 4 when the width is a multiple of 4 */
2015
2016 else /* loop for multiples of 4 */
2017 {
2018 uint8x8_t src_val1;
2019 uint8x8_t src_val2;
2020
2021 if(mode == 18)
2022 intraPredAngle = -32;
2023 else if(mode == 34)
2024 intraPredAngle = 32;
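/* For these two 45-degree angles fract = ((row + 1) * intraPredAngle) & 31 is always 0, */
/* so the narrow (multiple-of-4) path needs no interpolation: it just copies 4 reference */
/* samples per row starting at pu1_ref[two_nt + idx + 1], two rows at a time.            */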
2025
2026 for(row = 0; row < nt; row += 2)
2027 {
2028 /* unrolling 2 rows */
2029 idx = ((row + 1) * intraPredAngle) >> 5;
2030 pu1_ref_tmp = pu1_ref + two_nt + idx + 1;
2031 src_val1 = vld1_u8(pu1_ref_tmp);
2032
2033 idx = ((row + 2) * intraPredAngle) >> 5;
2034 pu1_ref_tmp1 = pu1_ref + two_nt + idx + 1;
2035 src_val2 = vld1_u8(pu1_ref_tmp1);
2036
2037 /* unrolling 4 col */
2038 for(col = nt; col > 0; col -= 4)
2039 {
2040 pu1_dst_tmp = pu1_dst;
2041 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val1), 0);
2042 pu1_dst_tmp += dst_strd;
2043 vst1_lane_u32((uint32_t *)pu1_dst_tmp, vreinterpret_u32_u8(src_val2), 0);
2044 pu1_dst += 4;
2045 }
2046 pu1_dst += 2 * dst_strd - nt;
2047 }
2048 }
2049 }
2050 /* INTRA_PRED_LUMA_MODE_18_34 */
2051
2052 /**
2053 *******************************************************************************
2054 *
2055 * @brief
2056 * Intra prediction interpolation filter for luma mode 3 to mode 9
2057 *
2058 * @par Description:
2059 * Intraprediction for mode 3 to 9 (positive angle, horizontal mode ) with
2060 * reference neighboring samples location pointed by 'pu1_ref' to the TU
2061 * block location pointed by 'pu1_dst'
2062 *
2063 * @param[in] pu1_ref
2064 * UWORD8 pointer to the reference samples
2065 *
2066 * @param[out] pu1_dst
2067 * UWORD8 pointer to the destination
2068 *
2069 * @param[in] src_strd
2070 * integer source stride
2071 *
2072 * @param[in] dst_strd
2073 * integer destination stride
2074 *
2075 * @param[in] nt
2076 * integer Transform Block size
2077 *
2078 * @param[in] mode
2079 * integer intraprediction mode
2080 *
2081 * @returns
2082 *
2083 * @remarks
2084 * None
2085 *
2086 *******************************************************************************
2087 */
2088
2089
2090 void ihevc_intra_pred_luma_mode_3_to_9_neonintr(UWORD8 *pu1_ref,
2091 WORD32 src_strd,
2092 UWORD8 *pu1_dst,
2093 WORD32 dst_strd,
2094 WORD32 nt,
2095 WORD32 mode)
2096 {
2097
2098 WORD32 row, col;
2099 WORD32 intra_pred_ang;
2100 WORD32 pos, fract = 100, fract_prev;
2101 UNUSED(src_strd);
2102 if(0 == (nt & 7))
2103 {
2104
2105 UWORD8 *pu1_ref_main_idx = pu1_ref;
2106 UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
2107
2108 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2109 UWORD8 *pu1_dst_tmp2 = pu1_dst;
2110
2111 WORD32 two_nt = 2 * nt;
2112
2113 pu1_ref_main_idx += two_nt;
2114 pu1_ref_main_idx_1 += two_nt - 1;
2115
2116 uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
2117 uint8x8_t shift_res;
2118 uint16x8_t mul_res1, mul_res2, add_res;
2119
2120 /* Intra Pred Angle according to the mode */
2121 intra_pred_ang = gai4_ihevc_ang_table[mode];
2122
2123 pu1_ref_main_idx -= 8;
2124 pu1_ref_main_idx_1 -= 8;
2125
2126 for(col = 0; col < nt; col++)
2127 {
2128 fract_prev = fract;
2129
2130 pos = ((col + 1) * intra_pred_ang);
2131 fract = pos & (31);
2132
2133 if(fract_prev < fract)
2134 {
2135 pu1_ref_main_idx += 1;
2136 pu1_ref_main_idx_1 += 1;
2137 }
2138
2139 dup_const_fract = vdup_n_u8((uint8_t)fract);
2140 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
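/* The loop below vectorizes the two-tap filter of section 8.4.4.2.6: each predicted     */
/* sample is ((32 - fract) * a + fract * b + 16) >> 5, with a and b two consecutive      */
/* main (left) reference samples. vmull_u8/vaddq_u16 form the weighted sum and           */
/* vrshrn_n_u16(.., 5) performs the rounding shift.                                      */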
2141
2142 for(row = nt; row > 0; row -= 8)
2143 {
2144 ref_main_idx = vld1_u8(pu1_ref_main_idx);
2145 ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
2146
2147 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
2148 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
2149
2150 add_res = vaddq_u16(mul_res1, mul_res2);
2151
2152 shift_res = vrshrn_n_u16(add_res, 5);
2153
2154 vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
2155 pu1_dst_tmp1 += dst_strd;
2156
2157 vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
2158 pu1_dst_tmp1 += dst_strd;
2159
2160 vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
2161 pu1_dst_tmp1 += dst_strd;
2162
2163 vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
2164 pu1_dst_tmp1 += dst_strd;
2165
2166 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
2167 pu1_dst_tmp1 += dst_strd;
2168
2169 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
2170 pu1_dst_tmp1 += dst_strd;
2171
2172 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
2173 pu1_dst_tmp1 += dst_strd;
2174
2175 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
2176 pu1_dst_tmp1 += dst_strd;
2177
2178 pu1_ref_main_idx -= 8;
2179 pu1_ref_main_idx_1 -= 8;
2180
2181 }
2182 pu1_dst_tmp2 += 1;
2183 pu1_dst_tmp1 = pu1_dst_tmp2;
2184
2185 pu1_ref_main_idx += nt;
2186 pu1_ref_main_idx_1 += nt;
2187
2188 pu1_ref_main_idx -= 1;
2189 pu1_ref_main_idx_1 -= 1;
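/* Net pointer movement per column (a reading of the bookkeeping here): the += nt undoes */
/* the row loop, the -= 1 steps one sample further down the left reference, and the      */
/* conditional += 1 earlier cancels that step on columns where fract did not wrap,       */
/* i.e. where the integer offset idx did not advance.                                    */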
2190
2191 }
2192 }
2193 else
2194 {
2195 UWORD8 *pu1_ref_tmp1 = pu1_ref;
2196 UWORD8 *pu1_ref_tmp2 = pu1_ref;
2197 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2198 UWORD8 *pu1_dst_tmp2 = pu1_dst;
2199
2200 pu1_ref_tmp1 += nt;
2201 pu1_ref_tmp2 += (nt - 1);
2202
2203 uint8x8_t dup_fract, dup_32_fract, shift_res;
2204 uint16x8_t mul_res1, mul_res2, add_res;
2205 uint32x2_t pu1_ref_val1, pu1_ref_val2;
2206
2207 pu1_ref_val1 = vdup_n_u32(0);
2208 pu1_ref_val2 = vdup_n_u32(0);
2209
2210 /* Intra Pred Angle according to the mode */
2211 intra_pred_ang = gai4_ihevc_ang_table[mode];
2212
2213
2214 for(col = 0; col < nt; col++)
2215 {
2216 fract_prev = fract;
2217 pos = ((col + 1) * intra_pred_ang);
2218 fract = pos & (31);
2219 if(fract_prev < fract)
2220 {
2221 pu1_ref_tmp1 += 1;
2222 pu1_ref_tmp2 += 1;
2223 }
2224 dup_fract = vdup_n_u8((uint8_t)fract);
2225 dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2226
2227 for(row = nt; row > 0; row -= 4)
2228 {
2229 pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
2230 pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
2231
2232 mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
2233 mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
2234
2235 add_res = vaddq_u16(mul_res1, mul_res2);
2236
2237 shift_res = vrshrn_n_u16(add_res, 5);
2238
2239 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
2240 pu1_dst_tmp1 += dst_strd;
2241
2242 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
2243 pu1_dst_tmp1 += dst_strd;
2244
2245 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
2246 pu1_dst_tmp1 += dst_strd;
2247
2248 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
2249
2250 }
2251 pu1_ref_tmp1 -= 1;
2252 pu1_ref_tmp2 -= 1;
2253
2254 pu1_dst_tmp2 += 1;
2255 pu1_dst_tmp1 = pu1_dst_tmp2;
2256
2257 }
2258
2259
2260 }
2261
2262 }
2263
2264 /**
2265 *******************************************************************************
2266 *
2267 * @brief
2268 * Intra prediction interpolation filter for luma mode 11 to mode 17
2269 *
2270 * @par Description:
2271 * Intraprediction for mode 11 to 17 (negative angle, horizontal mode )
2272 * with reference neighboring samples location pointed by 'pu1_ref' to the
2273 * TU block location pointed by 'pu1_dst'
2274 *
2275 * @param[in] pu1_ref
2276 * UWORD8 pointer to the reference samples
2277 *
2278 * @param[out] pu1_dst
2279 * UWORD8 pointer to the destination
2280 *
2281 * @param[in] src_strd
2282 * integer source stride
2283 *
2284 * @param[in] dst_strd
2285 * integer destination stride
2286 *
2287 * @param[in] nt
2288 * integer Transform Block size
2289 *
2290 * @param[in] mode
2291 * integer intraprediction mode
2292 *
2293 * @returns
2294 *
2295 * @remarks
2296 * None
2297 *
2298 *******************************************************************************
2299 */
2300
2301
2302 void ihevc_intra_pred_luma_mode_11_to_17_neonintr(UWORD8 *pu1_ref,
2303 WORD32 src_strd,
2304 UWORD8 *pu1_dst,
2305 WORD32 dst_strd,
2306 WORD32 nt,
2307 WORD32 mode)
2308 {
2309
2310 WORD32 row, col, k;
2311 WORD32 two_nt;
2312 WORD32 intra_pred_ang, inv_ang, inv_ang_sum;
2313 WORD32 pos, fract = 1000, fract_prev;
2314 WORD32 ref_idx;
2315
2316 UWORD8 *ref_main;
2317 UWORD8 *ref_main_tmp;
2318
2319 UWORD8 *pu1_ref_tmp1 = pu1_ref;
2320 UWORD8 *pu1_ref_tmp2 = pu1_ref;
2321 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2322 UWORD8 *pu1_dst_tmp2 = pu1_dst;
2323
2324 UWORD8 ref_temp[2 * MAX_CU_SIZE + 1];
2325
2326 uint16x8_t mul_res1, mul_res2, add_res;
2327 uint8x8_t dup_const_fract, dup_const_32_fract;
2328 uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
2329 uint8x8_t ref_left_t;
2330 uint32x2_t ref_left_tmp;
2331 UNUSED(src_strd);
2332 ref_left_tmp = vdup_n_u32(0);
2333
2334 inv_ang_sum = 128;
2335 two_nt = 2 * nt;
2336
2337 intra_pred_ang = gai4_ihevc_ang_table[mode];
2338
2339 inv_ang = gai4_ihevc_inv_ang_table[mode - 11];
2340
2341 pu1_ref_tmp1 += two_nt;
2342
2343 ref_main = ref_temp + (nt - 1);
2344 ref_main_tmp = ref_main;
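/* ref_temp layout (for orientation): ref_main points (nt - 1) entries into ref_temp, so */
/* indices -(nt - 1) .. nt are addressable. ref_main[0 .. nt] is filled below with the   */
/* top-left and left neighbours (reversed so that an increasing index walks down the     */
/* left column), and ref_main[-1] downwards is filled from the top row by the            */
/* inverse-angle projection further below.                                               */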
2345
2346 if(0 == (nt & 7))
2347 {
2348 pu1_ref_tmp2 += (two_nt - 7);
2349
2350 for(k = nt - 1; k >= 0; k -= 8)
2351 {
2352
2353 ref_left_t = vld1_u8(pu1_ref_tmp2);
2354
2355 ref_left_t = vrev64_u8(ref_left_t);
2356 vst1_u8(ref_main_tmp, ref_left_t);
2357 ref_main_tmp += 8;
2358 pu1_ref_tmp2 -= 8;
2359
2360 }
2361
2362 }
2363 else
2364 {
2365 uint8x8_t rev_val;
2366 pu1_ref_tmp2 += (two_nt - (nt - 1));
2367
2368 for(k = nt - 1; k >= 0; k -= 8)
2369 {
2370
2371 ref_left_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, ref_left_tmp, 1);
2372
2373 rev_val = vrev64_u8(vreinterpret_u8_u32(ref_left_tmp));
2374 vst1_lane_u32((uint32_t *)ref_main_tmp, vreinterpret_u32_u8(rev_val), 0);
2375
2376 }
2377
2378 }
2379
2380 ref_main[nt] = pu1_ref[two_nt - nt];
2381
2382 /* For horizontal modes, (ref main = ref left) (ref side = ref above) */
2383
2384 ref_idx = (nt * intra_pred_ang) >> 5;
2385
2386 /* SIMD optimization can be done using a look-up table for this loop */
2387 /* For negative angles, derive the main reference samples from the side */
2388 /* reference samples; refer to section 8.4.4.2.6 */
2389 for(k = -1; k > ref_idx; k--)
2390 {
2391 inv_ang_sum += inv_ang;
2392 ref_main[k] = pu1_ref[two_nt + (inv_ang_sum >> 8)];
2393 }
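/* The loop above is the spec's inverse-angle mapping: for k = -1 down to ref_idx + 1,   */
/* ref_main[k] = pu1_ref[two_nt + ((-k * inv_ang + 128) >> 8)], with inv_ang_sum seeded  */
/* at 128 so that the >> 8 rounds to nearest.                                            */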
2394
2395 UWORD8 *ref_main_tmp1 = ref_main;
2396 UWORD8 *ref_main_tmp2 = ref_main;
2397
2398 ref_main_tmp2 += 1;
2399
2400 if(0 == (nt & 7))
2401 {
2402 /* For angles other than 45 degrees, interpolate between 2 neighboring */
2403 /* samples, weighted by the fractional distance, to obtain each destination sample */
2404 for(col = 0; col < nt; col++)
2405 {
2406
2407 fract_prev = fract;
2408 pos = ((col + 1) * intra_pred_ang);
2409 fract = pos & (31);
2410
2411 if(fract_prev < fract)
2412 {
2413 ref_main_tmp1 -= 1;
2414 ref_main_tmp2 -= 1;
2415 }
2416
2417 dup_const_fract = vdup_n_u8((uint8_t)fract);
2418 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2419
2420 // Do linear filtering
2421 for(row = nt; row > 0; row -= 8)
2422 {
2423 ref_main_idx = vld1_u8(ref_main_tmp1);
2424
2425 ref_main_idx_1 = vld1_u8(ref_main_tmp2);
2426
2427 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
2428 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
2429
2430 add_res = vaddq_u16(mul_res1, mul_res2);
2431
2432 shift_res = vrshrn_n_u16(add_res, 5);
2433
2434 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
2435 pu1_dst_tmp1 += dst_strd;
2436
2437 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
2438 pu1_dst_tmp1 += dst_strd;
2439
2440 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
2441 pu1_dst_tmp1 += dst_strd;
2442
2443 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
2444 pu1_dst_tmp1 += dst_strd;
2445
2446 vst1_lane_u8(pu1_dst_tmp1, shift_res, 4);
2447 pu1_dst_tmp1 += dst_strd;
2448
2449 vst1_lane_u8(pu1_dst_tmp1, shift_res, 5);
2450 pu1_dst_tmp1 += dst_strd;
2451
2452 vst1_lane_u8(pu1_dst_tmp1, shift_res, 6);
2453 pu1_dst_tmp1 += dst_strd;
2454
2455 vst1_lane_u8(pu1_dst_tmp1, shift_res, 7);
2456 pu1_dst_tmp1 += dst_strd;
2457
2458 ref_main_tmp1 += 8;
2459 ref_main_tmp2 += 8;
2460 }
2461
2462 ref_main_tmp1 -= nt;
2463 ref_main_tmp2 -= nt;
2464
2465 pu1_dst_tmp2 += 1;
2466 pu1_dst_tmp1 = pu1_dst_tmp2;
2467 }
2468 }
2469 else
2470 {
2471 uint32x2_t ref_main_idx1, ref_main_idx2;
2472
2473 ref_main_idx1 = vdup_n_u32(0);
2474 ref_main_idx2 = vdup_n_u32(0);
2475
2476 for(col = 0; col < nt; col++)
2477 {
2478 fract_prev = fract;
2479 pos = ((col + 1) * intra_pred_ang);
2480 fract = pos & (31);
2481
2482 if(fract_prev < fract)
2483 {
2484 ref_main_tmp1 -= 1;
2485 ref_main_tmp2 -= 1;
2486 }
2487
2488 dup_const_fract = vdup_n_u8((uint8_t)fract);
2489 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2490
2491 for(row = nt; row > 0; row -= 4)
2492 {
2493
2494 ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
2495 ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
2496
2497 mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
2498 mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
2499
2500 add_res = vaddq_u16(mul_res1, mul_res2);
2501
2502 shift_res = vrshrn_n_u16(add_res, 5);
2503
2504 vst1_lane_u8(pu1_dst_tmp1, shift_res, 0);
2505 pu1_dst_tmp1 += dst_strd;
2506
2507 vst1_lane_u8(pu1_dst_tmp1, shift_res, 1);
2508 pu1_dst_tmp1 += dst_strd;
2509
2510 vst1_lane_u8(pu1_dst_tmp1, shift_res, 2);
2511 pu1_dst_tmp1 += dst_strd;
2512
2513 vst1_lane_u8(pu1_dst_tmp1, shift_res, 3);
2514 pu1_dst_tmp1 += dst_strd;
2515
2516 }
2517
2518 pu1_dst_tmp2 += 1;
2519 pu1_dst_tmp1 = pu1_dst_tmp2;
2520
2521 }
2522
2523 }
2524 }
2525
2526 /**
2527 *******************************************************************************
2528 *
2529 * @brief
2530 * Intra prediction interpolation filter for luma mode 19 to mode 25
2531 *
2532 * @par Description:
2533 * Intraprediction for mode 19 to 25 (negative angle, vertical mode ) with
2534 * reference neighboring samples location pointed by 'pu1_ref' to the TU
2535 * block location pointed by 'pu1_dst'
2536 *
2537 * @param[in] pu1_ref
2538 * UWORD8 pointer to the reference samples
2539 *
2540 * @param[out] pu1_dst
2541 * UWORD8 pointer to the destination
2542 *
2543 * @param[in] src_strd
2544 * integer source stride
2545 *
2546 * @param[in] dst_strd
2547 * integer destination stride
2548 *
2549 * @param[in] nt
2550 * integer Transform Block size
2551 *
2552 * @param[in] mode
2553 * integer intraprediction mode
2554 *
2555 * @returns
2556 *
2557 * @remarks
2558 * None
2559 *
2560 *******************************************************************************
2561 */
2562
2563
2564 void ihevc_intra_pred_luma_mode_19_to_25_neonintr(UWORD8 *pu1_ref,
2565 WORD32 src_strd,
2566 UWORD8 *pu1_dst,
2567 WORD32 dst_strd,
2568 WORD32 nt,
2569 WORD32 mode)
2570 {
2571
2572 WORD32 row, col, k;
2573 WORD32 two_nt, intra_pred_ang;
2574 WORD32 inv_ang, inv_ang_sum, pos, fract = 1000, fract_prev;
2575 WORD32 ref_idx;
2576 UWORD8 *ref_main;
2577 UWORD8 *ref_main_tmp;
2578 UWORD8 ref_temp[(2 * MAX_CU_SIZE) + 1];
2579
2580 UWORD8 *pu1_ref_tmp1 = pu1_ref;
2581 UWORD8 *pu1_ref_tmp2 = pu1_ref;
2582 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2583
2584 uint16x8_t mul_res1, mul_res2, add_res;
2585 uint8x8_t dup_const_fract, dup_const_32_fract;
2586 uint8x8_t ref_main_idx, ref_main_idx_1, shift_res;
2587 uint8x8_t ref_above_t;
2588 uint32x2_t ref_above_tmp;
2589 UNUSED(src_strd);
2590 ref_above_tmp = vdup_n_u32(0);
2591
2592 two_nt = 2 * nt;
2593 intra_pred_ang = gai4_ihevc_ang_table[mode];
2594 inv_ang = gai4_ihevc_inv_ang_table[mode - 12];
2595
2596 /* Intermediate reference samples for negative angle modes */
2597 /* These have to be removed during further optimization */
2598 pu1_ref_tmp1 += two_nt;
2599
2600
2601 ref_main = ref_temp + (nt - 1);
2602 ref_main_tmp = ref_main;
2603
2604 if(0 == (nt & 7))
2605 {
2606 pu1_ref_tmp2 += (two_nt - 7);
2607 for(k = nt - 1; k >= 0; k -= 8)
2608 {
2609
2610 ref_above_t = vld1_u8(pu1_ref_tmp1);
2611 vst1_u8(ref_main_tmp, ref_above_t);
2612 ref_main_tmp += 8;
2613 pu1_ref_tmp1 += 8;
2614
2615 }
2616
2617 }
2618 else
2619 {
2620 pu1_ref_tmp2 += (two_nt - (nt - 1));
2621
2622 for(k = nt - 1; k >= 0; k -= 4)
2623 {
2624
2625 ref_above_tmp = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, ref_above_tmp, 0);
2626 vst1_lane_u32((uint32_t *)ref_main_tmp, ref_above_tmp, 0);
2627
2628 }
2629
2630 }
2631
2632 ref_main[nt] = pu1_ref[two_nt + nt];
2633
2634 /* For vertical modes, (ref main = ref above) (ref side = ref left) */
2635
2636 ref_idx = (nt * intra_pred_ang) >> 5;
2637 inv_ang_sum = 128;
2638
2639 /* SIMD optimization can be done using a look-up table for this loop */
2640 /* For negative angles, derive the main reference samples from the side */
2641 /* reference samples; refer to section 8.4.4.2.6 */
2642 for(k = -1; k > ref_idx; k--)
2643 {
2644 inv_ang_sum += inv_ang;
2645 ref_main[k] = pu1_ref[two_nt - (inv_ang_sum >> 8)];
2646 }
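/* Same inverse-angle projection as the horizontal case above, except that the main      */
/* reference is the top row and the projected side samples are taken from the left       */
/* column, pu1_ref[two_nt - (inv_ang_sum >> 8)].                                          */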
2647
2648 UWORD8 *ref_main_tmp1 = ref_main;
2649 UWORD8 *ref_main_tmp2 = ref_main;
2650
2651 ref_main_tmp2 += 1;
2652
2653 if(0 == (nt & 7))
2654 {
2655 /* For angles other than 45 degrees, interpolate between 2 neighboring */
2656 /* samples, weighted by the fractional distance, to obtain each destination sample */
2657 for(row = 0; row < nt; row++)
2658 {
2659
2660 fract_prev = fract;
2661 pos = ((row + 1) * intra_pred_ang);
2662 fract = pos & (31);
2663
2664 if(fract_prev < fract)
2665 {
2666 ref_main_tmp1 -= 1;
2667 ref_main_tmp2 -= 1;
2668 }
2669
2670 dup_const_fract = vdup_n_u8((uint8_t)fract);
2671 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2672
2673 // Do linear filtering
2674 for(col = nt; col > 0; col -= 8)
2675 {
2676 ref_main_idx = vld1_u8(ref_main_tmp1);
2677
2678 ref_main_idx_1 = vld1_u8(ref_main_tmp2);
2679
2680 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
2681 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
2682
2683 add_res = vaddq_u16(mul_res1, mul_res2);
2684
2685 shift_res = vrshrn_n_u16(add_res, 5);
2686
2687 vst1_u8(pu1_dst_tmp1, shift_res);
2688 pu1_dst_tmp1 += 8;
2689
2690 ref_main_tmp1 += 8;
2691 ref_main_tmp2 += 8;
2692 }
2693
2694 ref_main_tmp1 -= nt;
2695 ref_main_tmp2 -= nt;
2696
2697 pu1_dst_tmp1 += (dst_strd - nt);
2698 }
2699 }
2700 else
2701 {
2702 uint32x2_t ref_main_idx1, ref_main_idx2;
2703
2704 ref_main_idx1 = vdup_n_u32(0);
2705 ref_main_idx2 = vdup_n_u32(0);
2706
2707 for(row = 0; row < nt; row++)
2708 {
2709 fract_prev = fract;
2710 pos = ((row + 1) * intra_pred_ang);
2711 fract = pos & (31);
2712
2713 if(fract_prev < fract)
2714 {
2715 ref_main_tmp1 -= 1;
2716 ref_main_tmp2 -= 1;
2717 }
2718
2719 dup_const_fract = vdup_n_u8((uint8_t)fract);
2720 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2721
2722 for(col = nt; col > 0; col -= 4)
2723 {
2724
2725 ref_main_idx1 = vld1_lane_u32((uint32_t *)ref_main_tmp1, ref_main_idx1, 0);
2726 ref_main_idx2 = vld1_lane_u32((uint32_t *)ref_main_tmp2, ref_main_idx2, 0);
2727
2728 mul_res1 = vmull_u8(vreinterpret_u8_u32(ref_main_idx1), dup_const_32_fract);
2729 mul_res2 = vmull_u8(vreinterpret_u8_u32(ref_main_idx2), dup_const_fract);
2730
2731 add_res = vaddq_u16(mul_res1, mul_res2);
2732
2733 shift_res = vrshrn_n_u16(add_res, 5);
2734
2735 vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
2736 pu1_dst_tmp1 += 4;
2737
2738 }
2739 pu1_dst_tmp1 += (dst_strd - nt);
2740 }
2741
2742 }
2743
2744 }
2745
2746 /**
2747 *******************************************************************************
2748 *
2749 * @brief
2750 * Intra prediction interpolation filter for luma mode 27 to mode 33
2751 *
2752 * @par Description:
2753 * Intraprediction for mode 27 to 33 (positive angle, vertical mode ) with
2754 * reference neighboring samples location pointed by 'pu1_ref' to the TU
2755 * block location pointed by 'pu1_dst'
2756 *
2757 * @param[in] pu1_ref
2758 * UWORD8 pointer to the reference samples
2759 *
2760 * @param[out] pu1_dst
2761 * UWORD8 pointer to the destination
2762 *
2763 * @param[in] src_strd
2764 * integer source stride
2765 *
2766 * @param[in] dst_strd
2767 * integer destination stride
2768 *
2769 * @param[in] nt
2770 * integer Transform Block size
2771 *
2772 * @param[in] mode
2773 * integer intraprediction mode
2774 *
2775 * @returns
2776 *
2777 * @remarks
2778 * None
2779 *
2780 *******************************************************************************
2781 */
2782
2783
2784 void ihevc_intra_pred_luma_mode_27_to_33_neonintr(UWORD8 *pu1_ref,
2785 WORD32 src_strd,
2786 UWORD8 *pu1_dst,
2787 WORD32 dst_strd,
2788 WORD32 nt,
2789 WORD32 mode)
2790 {
2791
2792 WORD32 row, col;
2793 WORD32 intra_pred_ang;
2794 WORD32 pos, fract = 0, fract_prev;
2795
2796 WORD32 two_nt = 2 * nt;
2797 UNUSED(src_strd);
2798 if(0 == (nt & 7))
2799 {
2800
2801 UWORD8 *pu1_ref_main_idx = pu1_ref;
2802 UWORD8 *pu1_ref_main_idx_1 = pu1_ref;
2803
2804 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2805 pu1_ref_main_idx += (two_nt + 1);
2806 pu1_ref_main_idx_1 += (two_nt + 2);
2807
2808 uint8x8_t dup_const_fract, dup_const_32_fract, ref_main_idx, ref_main_idx_1;
2809 uint8x8_t shift_res;
2810 uint16x8_t mul_res1, mul_res2, add_res;
2811
2812 /* Intra Pred Angle according to the mode */
2813 intra_pred_ang = gai4_ihevc_ang_table[mode];
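/* For the positive vertical angles (all below 32 for modes 27 to 33) the integer offset */
/* idx = ((row + 1) * intra_pred_ang) >> 5 grows by exactly one whenever the 5-bit       */
/* fraction wraps around; the fract_prev > fract test below advances the reference       */
/* pointers by one sample in exactly that case, avoiding a per-row idx computation.      */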
2814
2815 for(row = 0; row < nt; row++)
2816 {
2817 fract_prev = fract;
2818
2819 pos = ((row + 1) * intra_pred_ang);
2820 fract = pos & (31);
2821
2822 if(fract_prev > fract)
2823 {
2824 pu1_ref_main_idx += 1;
2825 pu1_ref_main_idx_1 += 1;
2826 }
2827
2828 dup_const_fract = vdup_n_u8((uint8_t)fract);
2829 dup_const_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2830
2831 for(col = nt; col > 0; col -= 8)
2832 {
2833 ref_main_idx = vld1_u8(pu1_ref_main_idx);
2834 ref_main_idx_1 = vld1_u8(pu1_ref_main_idx_1);
2835
2836 mul_res1 = vmull_u8(ref_main_idx, dup_const_32_fract);
2837 mul_res2 = vmull_u8(ref_main_idx_1, dup_const_fract);
2838
2839 add_res = vaddq_u16(mul_res1, mul_res2);
2840
2841 shift_res = vrshrn_n_u16(add_res, 5);
2842
2843 vst1_u8(pu1_dst_tmp1, shift_res);
2844 pu1_dst_tmp1 += 8;
2845
2846 pu1_ref_main_idx += 8;
2847 pu1_ref_main_idx_1 += 8;
2848 }
2849
2850 pu1_ref_main_idx -= nt;
2851 pu1_ref_main_idx_1 -= nt;
2852
2853 pu1_dst_tmp1 += (dst_strd - nt);
2854 }
2855
2856 }
2857 else
2858 {
2859 UWORD8 *pu1_ref_tmp1 = pu1_ref;
2860 UWORD8 *pu1_ref_tmp2 = pu1_ref;
2861 UWORD8 *pu1_dst_tmp1 = pu1_dst;
2862
2863 pu1_ref_tmp1 += (two_nt + 1);
2864 pu1_ref_tmp2 += (two_nt + 2);
2865
2866 uint8x8_t dup_fract, dup_32_fract, shift_res;
2867 uint16x8_t mul_res1, mul_res2, add_res;
2868 uint32x2_t pu1_ref_val1, pu1_ref_val2;
2869
2870 pu1_ref_val1 = vdup_n_u32(0);
2871 pu1_ref_val2 = vdup_n_u32(0);
2872
2873 /* Intra Pred Angle according to the mode */
2874 intra_pred_ang = gai4_ihevc_ang_table[mode];
2875
2876 for(row = 0; row < nt; row++)
2877 {
2878 fract_prev = fract;
2879 pos = ((row + 1) * intra_pred_ang);
2880 fract = pos & (31);
2881 if(fract_prev > fract)
2882 {
2883 pu1_ref_tmp1 += 1;
2884 pu1_ref_tmp2 += 1;
2885 }
2886 dup_fract = vdup_n_u8((uint8_t)fract);
2887 dup_32_fract = vdup_n_u8((uint8_t)(32 - fract));
2888
2889 for(col = nt; col > 0; col -= 4)
2890 {
2891 pu1_ref_val1 = vld1_lane_u32((uint32_t *)pu1_ref_tmp1, pu1_ref_val1, 0);
2892 pu1_ref_val2 = vld1_lane_u32((uint32_t *)pu1_ref_tmp2, pu1_ref_val2, 0);
2893
2894 mul_res1 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val1), dup_32_fract);
2895 mul_res2 = vmull_u8(vreinterpret_u8_u32(pu1_ref_val2), dup_fract);
2896
2897 add_res = vaddq_u16(mul_res1, mul_res2);
2898
2899 shift_res = vrshrn_n_u16(add_res, 5);
2900
2901 vst1_lane_u32((uint32_t *)pu1_dst_tmp1, vreinterpret_u32_u8(shift_res), 0);
2902 pu1_dst_tmp1 += 4;
2903
2904 }
2905
2906 pu1_dst_tmp1 += (dst_strd - nt);
2907
2908 }
2909
2910
2911 }
2912
2913 }
2914