xref: /aosp_15_r20/external/libaom/av1/common/arm/resize_neon.h (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
1 /*
2  * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #ifndef AOM_AV1_COMMON_ARM_RESIZE_NEON_H_
13 #define AOM_AV1_COMMON_ARM_RESIZE_NEON_H_
14 
15 #include <arm_neon.h>
16 
17 #include "aom_dsp/aom_filter.h"
18 #include "aom_dsp/arm/mem_neon.h"
19 #include "aom_dsp/arm/transpose_neon.h"
20 
scale_filter6_8(const int16x8_t s0,const int16x8_t s1,const int16x8_t s2,const int16x8_t s3,const int16x8_t s4,const int16x8_t s5,int16x8_t filter)21 static inline uint8x8_t scale_filter6_8(const int16x8_t s0, const int16x8_t s1,
22                                         const int16x8_t s2, const int16x8_t s3,
23                                         const int16x8_t s4, const int16x8_t s5,
24                                         int16x8_t filter) {
25   const int16x4_t filter_lo = vget_low_s16(filter);
26   const int16x4_t filter_hi = vget_high_s16(filter);
27 
28   // Filter values at indices 0 and 7 are 0.
29   int16x8_t sum = vmulq_lane_s16(s0, filter_lo, 1);
30   sum = vmlaq_lane_s16(sum, s1, filter_lo, 2);
31   sum = vmlaq_lane_s16(sum, s2, filter_lo, 3);
32   sum = vmlaq_lane_s16(sum, s3, filter_hi, 0);
33   sum = vmlaq_lane_s16(sum, s4, filter_hi, 1);
34   sum = vmlaq_lane_s16(sum, s5, filter_hi, 2);
35 
36   // We halved the convolution filter values so -1 from the right shift.
37   return vqrshrun_n_s16(sum, FILTER_BITS - 1);
38 }
39 
// Vertical 2:1 downscale with a 6-tap filter.
//
// The image is walked in strips of 8 columns. Within a strip, output row k is
// filtered from source rows 2k .. 2k+5 (counted from `src`), and four output
// rows are produced per inner iteration from a 12-row window.
//
// Both loops are do/while, so one strip and one group of four output rows are
// always processed. `w` is consumed in steps of 8 and `h` in steps of 4, so
// values that are not multiples of those steps are effectively rounded up.
//
// src/src_stride: top-left source sample and distance between source rows.
// w, h:           output width and height driving the two loops.
// dst/dst_stride: top-left output sample and distance between output rows.
// filters:        coefficient vector forwarded unchanged to scale_filter6_8().
static inline void scale_2_to_1_vert_6tap(const uint8_t *src,
                                          const int src_stride, int w, int h,
                                          uint8_t *dst, const int dst_stride,
                                          const int16x8_t filters) {
  do {
    // Prime the window with the first four source rows of this strip.
    uint8x8_t p0, p1, p2, p3;
    load_u8_8x4(src, src_stride, &p0, &p1, &p2, &p3);

    int16x8_t r0 = vreinterpretq_s16_u16(vmovl_u8(p0));
    int16x8_t r1 = vreinterpretq_s16_u16(vmovl_u8(p1));
    int16x8_t r2 = vreinterpretq_s16_u16(vmovl_u8(p2));
    int16x8_t r3 = vreinterpretq_s16_u16(vmovl_u8(p3));

    int rows_left = h;
    uint8_t *out = dst;
    const uint8_t *in = src + 4 * src_stride;

    do {
      // Fetch the next eight source rows and widen them to 16 bits.
      uint8x8_t p4, p5, p6, p7, p8, p9, p10, p11;
      load_u8_8x8(in, src_stride, &p4, &p5, &p6, &p7, &p8, &p9, &p10, &p11);

      const int16x8_t r4 = vreinterpretq_s16_u16(vmovl_u8(p4));
      const int16x8_t r5 = vreinterpretq_s16_u16(vmovl_u8(p5));
      const int16x8_t r6 = vreinterpretq_s16_u16(vmovl_u8(p6));
      const int16x8_t r7 = vreinterpretq_s16_u16(vmovl_u8(p7));
      const int16x8_t r8 = vreinterpretq_s16_u16(vmovl_u8(p8));
      const int16x8_t r9 = vreinterpretq_s16_u16(vmovl_u8(p9));
      const int16x8_t r10 = vreinterpretq_s16_u16(vmovl_u8(p10));
      const int16x8_t r11 = vreinterpretq_s16_u16(vmovl_u8(p11));

      // Each successive output row starts two source rows further down.
      const uint8x8_t out0 = scale_filter6_8(r0, r1, r2, r3, r4, r5, filters);
      const uint8x8_t out1 = scale_filter6_8(r2, r3, r4, r5, r6, r7, filters);
      const uint8x8_t out2 = scale_filter6_8(r4, r5, r6, r7, r8, r9, filters);
      const uint8x8_t out3 =
          scale_filter6_8(r6, r7, r8, r9, r10, r11, filters);

      store_u8_8x4(out, dst_stride, out0, out1, out2, out3);

      // The last four rows of this window are the first four of the next one.
      r0 = r8;
      r1 = r9;
      r2 = r10;
      r3 = r11;

      in += 8 * src_stride;
      out += 4 * dst_stride;
      rows_left -= 4;
    } while (rows_left > 0);

    // Move on to the next strip of 8 columns.
    src += 8;
    dst += 8;
    w -= 8;
  } while (w > 0);
}
92 
// Vertical 4:1 downscale with a 6-tap filter.
//
// The image is walked in strips of 8 columns. Within a strip, output row k is
// filtered from source rows 4k .. 4k+5 (counted from `src`), and two output
// rows are produced per inner iteration from a 10-row window.
//
// Both loops are do/while, so one strip and one pair of output rows are always
// processed. `w` is consumed in steps of 8 and `h` in steps of 2, so values
// that are not multiples of those steps are effectively rounded up.
//
// src/src_stride: top-left source sample and distance between source rows.
// w, h:           output width and height driving the two loops.
// dst/dst_stride: top-left output sample and distance between output rows.
// filters:        coefficient vector forwarded unchanged to scale_filter6_8().
static inline void scale_4_to_1_vert_6tap(const uint8_t *src,
                                          const int src_stride, int w, int h,
                                          uint8_t *dst, const int dst_stride,
                                          const int16x8_t filters) {
  do {
    // Prime the window with the first two source rows of this strip.
    const uint8x8_t p0 = vld1_u8(src + 0 * src_stride);
    const uint8x8_t p1 = vld1_u8(src + 1 * src_stride);

    int16x8_t r0 = vreinterpretq_s16_u16(vmovl_u8(p0));
    int16x8_t r1 = vreinterpretq_s16_u16(vmovl_u8(p1));

    int rows_left = h;
    uint8_t *out = dst;
    const uint8_t *in = src + 2 * src_stride;

    do {
      // Fetch the next eight source rows and widen them to 16 bits.
      uint8x8_t p2, p3, p4, p5, p6, p7, p8, p9;
      load_u8_8x8(in, src_stride, &p2, &p3, &p4, &p5, &p6, &p7, &p8, &p9);

      const int16x8_t r2 = vreinterpretq_s16_u16(vmovl_u8(p2));
      const int16x8_t r3 = vreinterpretq_s16_u16(vmovl_u8(p3));
      const int16x8_t r4 = vreinterpretq_s16_u16(vmovl_u8(p4));
      const int16x8_t r5 = vreinterpretq_s16_u16(vmovl_u8(p5));
      const int16x8_t r6 = vreinterpretq_s16_u16(vmovl_u8(p6));
      const int16x8_t r7 = vreinterpretq_s16_u16(vmovl_u8(p7));
      const int16x8_t r8 = vreinterpretq_s16_u16(vmovl_u8(p8));
      const int16x8_t r9 = vreinterpretq_s16_u16(vmovl_u8(p9));

      // The second output row starts four source rows below the first.
      const uint8x8_t out0 = scale_filter6_8(r0, r1, r2, r3, r4, r5, filters);
      const uint8x8_t out1 = scale_filter6_8(r4, r5, r6, r7, r8, r9, filters);

      store_u8_8x2(out, dst_stride, out0, out1);

      // The last two rows of this window are the first two of the next one.
      r0 = r8;
      r1 = r9;

      in += 8 * src_stride;
      out += 2 * dst_stride;
      rows_left -= 2;
    } while (rows_left > 0);

    // Move on to the next strip of 8 columns.
    dst += 8;
    src += 8;
    w -= 8;
  } while (w > 0);
}
139 
140 #endif  // AOM_AV1_COMMON_ARM_RESIZE_NEON_H_
141