xref: /aosp_15_r20/external/libaom/av1/common/arm/reconinter_neon.c (revision 77c1e3ccc04c968bd2bc212e87364f250e820521)
/*
 *
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
12*77c1e3ccSAndroid Build Coastguard Worker 
13*77c1e3ccSAndroid Build Coastguard Worker #include <arm_neon.h>
14*77c1e3ccSAndroid Build Coastguard Worker #include <assert.h>
15*77c1e3ccSAndroid Build Coastguard Worker #include <stdbool.h>
16*77c1e3ccSAndroid Build Coastguard Worker 
17*77c1e3ccSAndroid Build Coastguard Worker #include "aom/aom_integer.h"
18*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/blend.h"
19*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/mem_neon.h"
20*77c1e3ccSAndroid Build Coastguard Worker #include "aom_ports/mem.h"
21*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/blockd.h"
22*77c1e3ccSAndroid Build Coastguard Worker #include "config/av1_rtcd.h"
23*77c1e3ccSAndroid Build Coastguard Worker 
// Turns eight already-scaled difference terms into mask weights.
// Non-inverse: min(diff + 38, 64). Inverse: saturating (64 - 38) - diff.
static inline uint8x8_t d16_diff_to_mask_u8x8(uint8x8_t diff,
                                              const bool inverse) {
  if (inverse) {
    return vqsub_u8(vdup_n_u8(64 - 38), diff);  // Clamps at 0.
  }
  return vmin_u8(vadd_u8(diff, vdup_n_u8(38)), vdup_n_u8(64));
}

// 16-lane counterpart of d16_diff_to_mask_u8x8().
static inline uint8x16_t d16_diff_to_mask_u8x16(uint8x16_t diff,
                                                const bool inverse) {
  if (inverse) {
    return vqsubq_u8(vdupq_n_u8(64 - 38), diff);  // Clamps at 0.
  }
  return vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
}

// Computes |a - b| per 16-bit lane, applies a rounding right shift
// (`neg_round` holds the negated shift amount, so vrshlq shifts right), then
// shifts right by DIFF_FACTOR_LOG2 while narrowing each lane to 8 bits.
static inline uint8x8_t d16_scaled_abs_diff_u8x8(uint16x8_t a, uint16x8_t b,
                                                 int16x8_t neg_round) {
  const uint16x8_t abs_diff = vabdq_u16(a, b);
  const uint16x8_t rounded = vrshlq_u16(abs_diff, neg_round);
  return vshrn_n_u16(rounded, DIFF_FACTOR_LOG2);
}

// Builds a difference-weighted mask from two 16-bit intermediate buffers.
//
// mask         Output; written contiguously (w bytes per row, no padding).
// inverse      Selects the inverted weight mapping (see helpers above).
// src0, src1   Input buffers, read as unsigned 16-bit elements.
// src0_stride, src1_stride
//              Row pitch of each input, in elements.
// h, w         Block height and width. Only w >= 16, w == 8 and w == 4 are
//              handled; any other width writes nothing. The w >= 16 path
//              consumes 16 columns per step and the w == 4 path consumes two
//              rows per step.
// conv_params  Supplies round_0 / round_1 used to derive the rounding shift.
// bd           Bit depth; (bd - 8) is added to the rounding shift.
static inline void diffwtd_mask_d16_neon(uint8_t *mask, const bool inverse,
                                         const CONV_BUF_TYPE *src0,
                                         int src0_stride,
                                         const CONV_BUF_TYPE *src1,
                                         int src1_stride, int h, int w,
                                         ConvolveParams *conv_params, int bd) {
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
  // Negated so that vrshlq_u16() performs a rounding *right* shift.
  const int16x8_t neg_round = vdupq_n_s16((int16_t)(-round_bits));

  if (w >= 16) {
    int row = 0;
    do {
      int col = 0;
      do {
        const uint8x8_t lo = d16_scaled_abs_diff_u8x8(
            vld1q_u16(src0 + col), vld1q_u16(src1 + col), neg_round);
        const uint8x8_t hi = d16_scaled_abs_diff_u8x8(
            vld1q_u16(src0 + col + 8), vld1q_u16(src1 + col + 8), neg_round);

        vst1q_u8(mask, d16_diff_to_mask_u8x16(vcombine_u8(lo, hi), inverse));

        mask += 16;
        col += 16;
      } while (col < w);
      src0 += src0_stride;
      src1 += src1_stride;
    } while (++row < h);
    return;
  }

  if (w == 8) {
    int row = 0;
    do {
      const uint8x8_t diff = d16_scaled_abs_diff_u8x8(
          vld1q_u16(src0), vld1q_u16(src1), neg_round);

      vst1_u8(mask, d16_diff_to_mask_u8x8(diff, inverse));

      mask += 8;
      src0 += src0_stride;
      src1 += src1_stride;
    } while (++row < h);
    return;
  }

  if (w == 4) {
    int row = 0;
    do {
      // Pack two consecutive 4-wide rows into one 8-lane vector.
      const uint16x8_t rows0 =
          vcombine_u16(vld1_u16(src0), vld1_u16(src0 + src0_stride));
      const uint16x8_t rows1 =
          vcombine_u16(vld1_u16(src1), vld1_u16(src1 + src1_stride));
      const uint8x8_t diff = d16_scaled_abs_diff_u8x8(rows0, rows1, neg_round);

      vst1_u8(mask, d16_diff_to_mask_u8x8(diff, inverse));

      mask += 8;
      src0 += 2 * src0_stride;
      src1 += 2 * src1_stride;
      row += 2;
    } while (row < h);
  }
}
112*77c1e3ccSAndroid Build Coastguard Worker 
// NEON entry point that builds a difference-weighted compound mask from two
// 16-bit intermediate prediction buffers.
//
// mask_type must be DIFFWTD_38 or DIFFWTD_38_INV; h and w must each be at
// least 4 (all three are checked with assert only). The remaining arguments
// are forwarded unchanged to diffwtd_mask_d16_neon().
void av1_build_compound_diffwtd_mask_d16_neon(
    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
    ConvolveParams *conv_params, int bd) {
  assert(h >= 4);
  assert(w >= 4);
  assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38));

  // The `inverse` flag is passed as a literal in each call rather than as a
  // computed value, keeping one call site per variant of the inline helper.
  if (mask_type != DIFFWTD_38) {  // i.e. DIFFWTD_38_INV
    diffwtd_mask_d16_neon(mask, /*inverse=*/true, src0, src0_stride, src1,
                          src1_stride, h, w, conv_params, bd);
    return;
  }

  diffwtd_mask_d16_neon(mask, /*inverse=*/false, src0, src0_stride, src1,
                        src1_stride, h, w, conv_params, bd);
}
129*77c1e3ccSAndroid Build Coastguard Worker 
// Turns sixteen already-scaled difference terms into mask weights.
// Non-inverse: min(diff + 38, 64). Inverse: saturating (64 - 38) - diff.
static inline uint8x16_t lbd_diff_to_mask_u8x16(uint8x16_t diff,
                                                const bool inverse) {
  if (inverse) {
    return vqsubq_u8(vdupq_n_u8(64 - 38), diff);  // Saturating to 0
  }
  return vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
}

// Builds a difference-weighted mask from two 8-bit prediction buffers.
//
// mask         Output; written contiguously (w bytes per row, no padding).
// inverse      Selects the inverted weight mapping (see helper above).
// src0, src1   8-bit input buffers.
// src0_stride, src1_stride
//              Row pitch of each input, in bytes. The two strides are
//              independent and may differ.
// h, w         Block height and width. Only w >= 16, w == 8 and w == 4 are
//              handled; any other width writes nothing. The w >= 16 path
//              consumes 16 columns per step, the w == 8 path two rows per
//              step and the w == 4 path four rows per step.
static inline void diffwtd_mask_neon(uint8_t *mask, const bool inverse,
                                     const uint8_t *src0, int src0_stride,
                                     const uint8_t *src1, int src1_stride,
                                     int h, int w) {
  if (w >= 16) {
    int i = 0;
    do {
      int j = 0;
      do {
        const uint8x16_t s0 = vld1q_u8(src0 + j);
        const uint8x16_t s1 = vld1q_u8(src1 + j);

        const uint8x16_t diff =
            vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
        vst1q_u8(mask, lbd_diff_to_mask_u8x16(diff, inverse));

        mask += 16;
        j += 16;
      } while (j < w);
      src0 += src0_stride;
      src1 += src1_stride;
    } while (++i < h);
  } else if (w == 8) {
    int i = 0;
    do {
      // Pack two consecutive 8-wide rows into one 16-lane vector. Each source
      // must be stepped by its OWN stride: the second src1 row was previously
      // fetched with src0_stride, which read the wrong row whenever the two
      // strides differed.
      const uint8x16_t s0 =
          vcombine_u8(vld1_u8(src0), vld1_u8(src0 + src0_stride));
      const uint8x16_t s1 =
          vcombine_u8(vld1_u8(src1), vld1_u8(src1 + src1_stride));

      const uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
      vst1q_u8(mask, lbd_diff_to_mask_u8x16(diff, inverse));

      mask += 16;
      src0 += 2 * src0_stride;
      src1 += 2 * src1_stride;
      i += 2;
    } while (i < h);
  } else if (w == 4) {
    int i = 0;
    do {
      // NOTE(review): load_unaligned_u8q() is defined outside this file; it is
      // presumably gathering four 4-byte rows into one vector, which matches
      // the 4-row / 16-byte advance below -- confirm against mem_neon.h.
      const uint8x16_t s0 = load_unaligned_u8q(src0, src0_stride);
      const uint8x16_t s1 = load_unaligned_u8q(src1, src1_stride);

      const uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
      vst1q_u8(mask, lbd_diff_to_mask_u8x16(diff, inverse));

      mask += 16;
      src0 += 4 * src0_stride;
      src1 += 4 * src1_stride;
      i += 4;
    } while (i < h);
  }
}
202*77c1e3ccSAndroid Build Coastguard Worker 
// NEON entry point that builds a difference-weighted compound mask from two
// 8-bit prediction buffers.
//
// mask_type must be DIFFWTD_38 or DIFFWTD_38_INV, and both h and w must be
// multiples of 4 (all three are checked with assert only). The remaining
// arguments are forwarded unchanged to diffwtd_mask_neon().
void av1_build_compound_diffwtd_mask_neon(uint8_t *mask,
                                          DIFFWTD_MASK_TYPE mask_type,
                                          const uint8_t *src0, int src0_stride,
                                          const uint8_t *src1, int src1_stride,
                                          int h, int w) {
  assert(h % 4 == 0);
  assert(w % 4 == 0);
  assert(mask_type == DIFFWTD_38_INV || mask_type == DIFFWTD_38);

  // The `inverse` flag is passed as a literal in each call rather than as a
  // computed value, keeping one call site per variant of the inline helper.
  if (mask_type != DIFFWTD_38) {  // i.e. DIFFWTD_38_INV
    diffwtd_mask_neon(mask, /*inverse=*/true, src0, src0_stride, src1,
                      src1_stride, h, w);
    return;
  }

  diffwtd_mask_neon(mask, /*inverse=*/false, src0, src0_stride, src1,
                    src1_stride, h, w);
}
220