/*
 *
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
12*77c1e3ccSAndroid Build Coastguard Worker
13*77c1e3ccSAndroid Build Coastguard Worker #include <arm_neon.h>
14*77c1e3ccSAndroid Build Coastguard Worker #include <assert.h>
15*77c1e3ccSAndroid Build Coastguard Worker #include <stdbool.h>
16*77c1e3ccSAndroid Build Coastguard Worker
17*77c1e3ccSAndroid Build Coastguard Worker #include "aom/aom_integer.h"
18*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/blend.h"
19*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/mem_neon.h"
20*77c1e3ccSAndroid Build Coastguard Worker #include "aom_ports/mem.h"
21*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/blockd.h"
22*77c1e3ccSAndroid Build Coastguard Worker #include "config/av1_rtcd.h"
23*77c1e3ccSAndroid Build Coastguard Worker
// Takes |a - b| per 16-bit lane, applies the rounding shift given per lane in
// `shift` (the caller passes -round, so a positive round is a rounding right
// shift), then shifts right by DIFF_FACTOR_LOG2 while narrowing to 8 bits.
static inline uint8x8_t d16_scaled_abs_diff(uint16x8_t a, uint16x8_t b,
                                            int16x8_t shift) {
  const uint16x8_t rounded = vrshlq_u16(vabdq_u16(a, b), shift);
  return vshrn_n_u16(rounded, DIFF_FACTOR_LOG2);
}

// Maps eight scaled differences to mask values:
//   !inverse: min(d + 38, 64)
//    inverse: (64 - 38) - d, saturating at 0
static inline uint8x8_t d16_mask_from_diff_u8x8(uint8x8_t d,
                                                const bool inverse) {
  if (inverse) {
    return vqsub_u8(vdup_n_u8(64 - 38), d);  // Saturating to 0
  }
  return vmin_u8(vadd_u8(d, vdup_n_u8(38)), vdup_n_u8(64));
}

// 16-lane variant of d16_mask_from_diff_u8x8().
static inline uint8x16_t d16_mask_from_diff_u8x16(uint8x16_t d,
                                                  const bool inverse) {
  if (inverse) {
    return vqsubq_u8(vdupq_n_u8(64 - 38), d);  // Saturating to 0
  }
  return vminq_u8(vaddq_u8(d, vdupq_n_u8(38)), vdupq_n_u8(64));
}

// Builds a difference-weighted mask from two CONV_BUF_TYPE prediction buffers.
//
// The per-sample absolute difference is rounding-shifted by
//   2 * FILTER_BITS - round_0 - round_1 + (bd - 8)
// and then by DIFF_FACTOR_LOG2 before being mapped to a mask value (see the
// helpers above). The mask is written densely: rows are packed back to back
// with no stride.
//
// Widths handled: w >= 16 (16 columns per step), w == 8 (one row per step)
// and w == 4 (two rows per step). Any other width writes nothing.
static inline void diffwtd_mask_d16_neon(uint8_t *mask, const bool inverse,
                                         const CONV_BUF_TYPE *src0,
                                         int src0_stride,
                                         const CONV_BUF_TYPE *src1,
                                         int src1_stride, int h, int w,
                                         ConvolveParams *conv_params, int bd) {
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
  // Negated so that vrshlq performs a rounding shift to the right.
  const int16x8_t neg_round = vdupq_n_s16((int16_t)(-round_bits));

  if (w >= 16) {
    int rows_left = h;
    do {
      for (int col = 0; col < w; col += 16) {
        const uint16x8_t a_lo = vld1q_u16(src0 + col);
        const uint16x8_t b_lo = vld1q_u16(src1 + col);
        const uint16x8_t a_hi = vld1q_u16(src0 + col + 8);
        const uint16x8_t b_hi = vld1q_u16(src1 + col + 8);

        const uint8x16_t d =
            vcombine_u8(d16_scaled_abs_diff(a_lo, b_lo, neg_round),
                        d16_scaled_abs_diff(a_hi, b_hi, neg_round));

        vst1q_u8(mask, d16_mask_from_diff_u8x16(d, inverse));
        mask += 16;
      }
      src0 += src0_stride;
      src1 += src1_stride;
    } while (--rows_left > 0);
    return;
  }

  if (w == 8) {
    int rows_left = h;
    do {
      const uint16x8_t a = vld1q_u16(src0);
      const uint16x8_t b = vld1q_u16(src1);
      const uint8x8_t d = d16_scaled_abs_diff(a, b, neg_round);

      vst1_u8(mask, d16_mask_from_diff_u8x8(d, inverse));

      mask += 8;
      src0 += src0_stride;
      src1 += src1_stride;
    } while (--rows_left > 0);
    return;
  }

  if (w == 4) {
    int rows_left = h;
    do {
      // Two 4-sample rows are packed into one 8-lane vector.
      const uint16x4_t a_row0 = vld1_u16(src0);
      const uint16x4_t a_row1 = vld1_u16(src0 + src0_stride);
      const uint16x4_t b_row0 = vld1_u16(src1);
      const uint16x4_t b_row1 = vld1_u16(src1 + src1_stride);

      const uint8x8_t d =
          d16_scaled_abs_diff(vcombine_u16(a_row0, a_row1),
                              vcombine_u16(b_row0, b_row1), neg_round);

      vst1_u8(mask, d16_mask_from_diff_u8x8(d, inverse));

      mask += 8;
      src0 += 2 * src0_stride;
      src1 += 2 * src1_stride;
      rows_left -= 2;
    } while (rows_left > 0);
  }
}
112*77c1e3ccSAndroid Build Coastguard Worker
// Public entry point: builds the difference-weighted compound mask from two
// CONV_BUF_TYPE buffers. DIFFWTD_38 selects the direct mask; any other value
// (DIFFWTD_38_INV, per the assert) selects the inverse mask.
void av1_build_compound_diffwtd_mask_d16_neon(
    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
    ConvolveParams *conv_params, int bd) {
  assert(h >= 4);
  assert(w >= 4);
  assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38));

  // Each call passes a literal `inverse` so the inlined kernel can be
  // specialised per variant.
  if (mask_type != DIFFWTD_38) {
    // mask_type == DIFFWTD_38_INV
    diffwtd_mask_d16_neon(mask, /*inverse=*/true, src0, src0_stride, src1,
                          src1_stride, h, w, conv_params, bd);
    return;
  }

  diffwtd_mask_d16_neon(mask, /*inverse=*/false, src0, src0_stride, src1,
                        src1_stride, h, w, conv_params, bd);
}
129*77c1e3ccSAndroid Build Coastguard Worker
// Builds a difference-weighted mask from two 8-bit prediction buffers.
//
// For every pixel, d = |src0 - src1| >> DIFF_FACTOR_LOG2 and the mask value is
//   !inverse: min(d + 38, 64)
//    inverse: (64 - 38) - d, saturating at 0
// The mask is written densely: rows are packed back to back with no stride.
//
// Widths handled: w >= 16 (16 columns per step), w == 8 (two rows per step)
// and w == 4 (four rows per step). Any other width writes nothing.
// src0 and src1 are each advanced by their own stride.
static inline void diffwtd_mask_neon(uint8_t *mask, const bool inverse,
                                     const uint8_t *src0, int src0_stride,
                                     const uint8_t *src1, int src1_stride,
                                     int h, int w) {
  if (w >= 16) {
    int i = 0;
    do {
      int j = 0;
      do {
        uint8x16_t s0 = vld1q_u8(src0 + j);
        uint8x16_t s1 = vld1q_u8(src1 + j);

        uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
        uint8x16_t m;
        if (inverse) {
          m = vqsubq_u8(vdupq_n_u8(64 - 38), diff);  // Saturating to 0
        } else {
          m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
        }

        vst1q_u8(mask, m);

        mask += 16;
        j += 16;
      } while (j < w);
      src0 += src0_stride;
      src1 += src1_stride;
    } while (++i < h);
  } else if (w == 8) {
    int i = 0;
    do {
      // Two 8-pixel rows are packed into one 16-lane vector. The second row
      // of each source must be addressed with that source's own stride.
      uint8x16_t s0 = vcombine_u8(vld1_u8(src0), vld1_u8(src0 + src0_stride));
      uint8x16_t s1 = vcombine_u8(vld1_u8(src1), vld1_u8(src1 + src1_stride));

      uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
      uint8x16_t m;
      if (inverse) {
        m = vqsubq_u8(vdupq_n_u8(64 - 38), diff);  // Saturating to 0
      } else {
        m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
      }

      vst1q_u8(mask, m);

      mask += 16;
      src0 += 2 * src0_stride;
      src1 += 2 * src1_stride;
      i += 2;
    } while (i < h);
  } else if (w == 4) {
    int i = 0;
    do {
      // NOTE(review): load_unaligned_u8q() is defined outside this file;
      // presumably it gathers four 4-byte rows `stride` apart, which matches
      // the 4-row advance and 16-byte store below. Confirm against
      // mem_neon.h.
      uint8x16_t s0 = load_unaligned_u8q(src0, src0_stride);
      uint8x16_t s1 = load_unaligned_u8q(src1, src1_stride);

      uint8x16_t diff = vshrq_n_u8(vabdq_u8(s0, s1), DIFF_FACTOR_LOG2);
      uint8x16_t m;
      if (inverse) {
        m = vqsubq_u8(vdupq_n_u8(64 - 38), diff);  // Saturating to 0
      } else {
        m = vminq_u8(vaddq_u8(diff, vdupq_n_u8(38)), vdupq_n_u8(64));
      }

      vst1q_u8(mask, m);

      mask += 16;
      src0 += 4 * src0_stride;
      src1 += 4 * src1_stride;
      i += 4;
    } while (i < h);
  }
}
202*77c1e3ccSAndroid Build Coastguard Worker
// Public entry point: builds the difference-weighted compound mask from two
// 8-bit buffers. DIFFWTD_38 selects the direct mask; any other value
// (DIFFWTD_38_INV, per the assert) selects the inverse mask.
void av1_build_compound_diffwtd_mask_neon(uint8_t *mask,
                                          DIFFWTD_MASK_TYPE mask_type,
                                          const uint8_t *src0, int src0_stride,
                                          const uint8_t *src1, int src1_stride,
                                          int h, int w) {
  assert(h % 4 == 0);
  assert(w % 4 == 0);
  assert(mask_type == DIFFWTD_38_INV || mask_type == DIFFWTD_38);

  // Each call passes a literal `inverse` so the inlined kernel can be
  // specialised per variant.
  if (mask_type != DIFFWTD_38) {
    // mask_type == DIFFWTD_38_INV
    diffwtd_mask_neon(mask, /*inverse=*/true, src0, src0_stride, src1,
                      src1_stride, h, w);
    return;
  }

  diffwtd_mask_neon(mask, /*inverse=*/false, src0, src0_stride, src1,
                    src1_stride, h, w);
}
220