/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker
12*77c1e3ccSAndroid Build Coastguard Worker #include <arm_neon.h>
13*77c1e3ccSAndroid Build Coastguard Worker #include <assert.h>
14*77c1e3ccSAndroid Build Coastguard Worker
15*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_config.h"
16*77c1e3ccSAndroid Build Coastguard Worker #include "config/av1_rtcd.h"
17*77c1e3ccSAndroid Build Coastguard Worker
18*77c1e3ccSAndroid Build Coastguard Worker #include "aom/aom_integer.h"
19*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/mem_neon.h"
20*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/sum_neon.h"
21*77c1e3ccSAndroid Build Coastguard Worker
22*77c1e3ccSAndroid Build Coastguard Worker #define MAX_UPSAMPLE_SZ 16
23*77c1e3ccSAndroid Build Coastguard Worker
// TODO(aomedia:349436249): enable for armv7 after SIGBUS is fixed.
25*77c1e3ccSAndroid Build Coastguard Worker #if AOM_ARCH_AARCH64
26*77c1e3ccSAndroid Build Coastguard Worker
27*77c1e3ccSAndroid Build Coastguard Worker // These kernels are a transposed version of those defined in reconintra.c,
28*77c1e3ccSAndroid Build Coastguard Worker // with the absolute value of the negatives taken in the top row.
29*77c1e3ccSAndroid Build Coastguard Worker DECLARE_ALIGNED(16, const uint8_t,
30*77c1e3ccSAndroid Build Coastguard Worker av1_filter_intra_taps_neon[FILTER_INTRA_MODES][7][8]) = {
31*77c1e3ccSAndroid Build Coastguard Worker // clang-format off
32*77c1e3ccSAndroid Build Coastguard Worker {
33*77c1e3ccSAndroid Build Coastguard Worker { 6, 5, 3, 3, 4, 3, 3, 3 },
34*77c1e3ccSAndroid Build Coastguard Worker { 10, 2, 1, 1, 6, 2, 2, 1 },
35*77c1e3ccSAndroid Build Coastguard Worker { 0, 10, 1, 1, 0, 6, 2, 2 },
36*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 10, 2, 0, 0, 6, 2 },
37*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 10, 0, 0, 0, 6 },
38*77c1e3ccSAndroid Build Coastguard Worker { 12, 9, 7, 5, 2, 2, 2, 3 },
39*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 12, 9, 7, 5 }
40*77c1e3ccSAndroid Build Coastguard Worker },
41*77c1e3ccSAndroid Build Coastguard Worker {
42*77c1e3ccSAndroid Build Coastguard Worker { 10, 6, 4, 2, 10, 6, 4, 2 },
43*77c1e3ccSAndroid Build Coastguard Worker { 16, 0, 0, 0, 16, 0, 0, 0 },
44*77c1e3ccSAndroid Build Coastguard Worker { 0, 16, 0, 0, 0, 16, 0, 0 },
45*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 16, 0, 0, 0, 16, 0 },
46*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 16, 0, 0, 0, 16 },
47*77c1e3ccSAndroid Build Coastguard Worker { 10, 6, 4, 2, 0, 0, 0, 0 },
48*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 10, 6, 4, 2 }
49*77c1e3ccSAndroid Build Coastguard Worker },
50*77c1e3ccSAndroid Build Coastguard Worker {
51*77c1e3ccSAndroid Build Coastguard Worker { 8, 8, 8, 8, 4, 4, 4, 4 },
52*77c1e3ccSAndroid Build Coastguard Worker { 8, 0, 0, 0, 4, 0, 0, 0 },
53*77c1e3ccSAndroid Build Coastguard Worker { 0, 8, 0, 0, 0, 4, 0, 0 },
54*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 8, 0, 0, 0, 4, 0 },
55*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 8, 0, 0, 0, 4 },
56*77c1e3ccSAndroid Build Coastguard Worker { 16, 16, 16, 16, 0, 0, 0, 0 },
57*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 16, 16, 16, 16 }
58*77c1e3ccSAndroid Build Coastguard Worker },
59*77c1e3ccSAndroid Build Coastguard Worker {
60*77c1e3ccSAndroid Build Coastguard Worker { 2, 1, 1, 0, 1, 1, 1, 1 },
61*77c1e3ccSAndroid Build Coastguard Worker { 8, 3, 2, 1, 4, 3, 2, 2 },
62*77c1e3ccSAndroid Build Coastguard Worker { 0, 8, 3, 2, 0, 4, 3, 2 },
63*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 8, 3, 0, 0, 4, 3 },
64*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 8, 0, 0, 0, 4 },
65*77c1e3ccSAndroid Build Coastguard Worker { 10, 6, 4, 2, 3, 4, 4, 3 },
66*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 10, 6, 4, 3 }
67*77c1e3ccSAndroid Build Coastguard Worker },
68*77c1e3ccSAndroid Build Coastguard Worker {
69*77c1e3ccSAndroid Build Coastguard Worker { 12, 10, 9, 8, 10, 9, 8, 7 },
70*77c1e3ccSAndroid Build Coastguard Worker { 14, 0, 0, 0, 12, 1, 0, 0 },
71*77c1e3ccSAndroid Build Coastguard Worker { 0, 14, 0, 0, 0, 12, 0, 0 },
72*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 14, 0, 0, 0, 12, 1 },
73*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 14, 0, 0, 0, 12 },
74*77c1e3ccSAndroid Build Coastguard Worker { 14, 12, 11, 10, 0, 0, 1, 1 },
75*77c1e3ccSAndroid Build Coastguard Worker { 0, 0, 0, 0, 14, 12, 11, 9 }
76*77c1e3ccSAndroid Build Coastguard Worker }
77*77c1e3ccSAndroid Build Coastguard Worker // clang-format on
78*77c1e3ccSAndroid Build Coastguard Worker };
79*77c1e3ccSAndroid Build Coastguard Worker
80*77c1e3ccSAndroid Build Coastguard Worker #define FILTER_INTRA_SCALE_BITS 4
81*77c1e3ccSAndroid Build Coastguard Worker
// Evaluates the 7-tap filter-intra kernel for one 4x2 output patch.
//
// n0..n6 are the seven neighbouring samples, each broadcast across all eight
// lanes, and taps[k] is row k of av1_filter_intra_taps_neon for the selected
// mode. Lanes 0-3 of the result are the first output row and lanes 4-7 the
// second.
static inline uint8x8_t filter_intra_patch_4x2_neon(
    const uint8x8_t taps[7], uint8x8_t n0, uint8x8_t n1, uint8x8_t n2,
    uint8x8_t n3, uint8x8_t n4, uint8x8_t n5, uint8x8_t n6) {
  uint16x8_t acc = vmull_u8(n1, taps[1]);
  // Row 0 of the table holds the magnitude of negative taps, so subtract it.
  acc = vmlsl_u8(acc, n0, taps[0]);
  acc = vmlal_u8(acc, n2, taps[2]);
  acc = vmlal_u8(acc, n3, taps[3]);
  acc = vmlal_u8(acc, n4, taps[4]);
  acc = vmlal_u8(acc, n5, taps[5]);
  acc = vmlal_u8(acc, n6, taps[6]);

  // The accumulator is reinterpreted as signed before the rounding, narrowing
  // shift so that the result saturates to the unsigned 8-bit range.
  return vqrshrun_n_s16(vreinterpretq_s16_u16(acc), FILTER_INTRA_SCALE_BITS);
}

// Filter-intra prediction of a block of tx_size_wide[tx_size] by
// tx_size_high[tx_size] pixels, written to dst with the given stride.
//
//   above - row above the block; above[-1] (the above-left pixel) is read too.
//   left  - column to the left of the block, one byte per row.
//   mode  - first index into av1_filter_intra_taps_neon.
//
// The block is produced in 4x2 patches. Every patch depends on pixels that
// were predicted by earlier patches, so the results are also kept in a local
// scratch array that has one extra row and column for the above/left edges.
void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride,
                                     TX_SIZE tx_size, const uint8_t *above,
                                     const uint8_t *left, int mode) {
  const int width = tx_size_wide[tx_size];
  const int height = tx_size_high[tx_size];
  assert(width <= 32 && height <= 32);

  uint8x8_t taps[7];
  for (int k = 0; k < 7; ++k) {
    taps[k] = vld1_u8(av1_filter_intra_taps_neon[mode][k]);
  }

  // scratch[0][*] is the above row (starting at the above-left pixel) and
  // scratch[*][0] is the left column; the prediction occupies the rest.
  uint8_t scratch[33][33];
  memcpy(scratch[0], &above[-1], (width + 1) * sizeof(uint8_t));
  int row = 0;
  do {
    scratch[row + 1][0] = left[row];
    ++row;
  } while (row < height);

  row = 1;
  if (width <= 8) {
    // For 8-wide blocks, one 4x2 patch per iteration is faster than two.
    do {
      uint8_t *const out = dst + (row - 1) * stride;
      uint8x8_t s0 = vld1_dup_u8(&scratch[row - 1][0]);
      uint8x8_t s5 = vld1_dup_u8(&scratch[row][0]);
      uint8x8_t s6 = vld1_dup_u8(&scratch[row + 1][0]);

      int col = 1;
      do {
        const uint8x8_t top = load_u8_4x1(&scratch[row - 1][col]);
        const uint8x8_t t3 = vdup_lane_u8(top, 3);

        const uint8x8_t patch = filter_intra_patch_4x2_neon(
            taps, s0, vdup_lane_u8(top, 0), vdup_lane_u8(top, 1),
            vdup_lane_u8(top, 2), t3, s5, s6);

        // Keep the patch in scratch rows `row` and `row + 1` for the patches
        // that follow, then write the same pixels to the destination.
        store_u8x4_strided_x2(&scratch[row][col], 33, patch);
        store_u8x4_strided_x2(out + (col - 1), stride, patch);

        // The next patch to the right takes its above-left sample from this
        // patch's last above sample and its two left samples from the last
        // column of this patch.
        s0 = t3;
        s5 = vdup_lane_u8(patch, 3);
        s6 = vdup_lane_u8(patch, 7);
        col += 4;
      } while (col < width + 1);

      row += 2;
    } while (row < height + 1);
  } else {
    do {
      uint8_t *const out = dst + (row - 1) * stride;
      uint8x8_t s0 = vld1_dup_u8(&scratch[row - 1][0]);
      uint8x8_t s5 = vld1_dup_u8(&scratch[row][0]);
      uint8x8_t s6 = vld1_dup_u8(&scratch[row + 1][0]);

      int col = 1;
      do {
        // Eight above samples feed two horizontally adjacent 4x2 patches.
        const uint8x8_t top = vld1_u8(&scratch[row - 1][col]);
        const uint8x8_t t3 = vdup_lane_u8(top, 3);
        const uint8x8_t t7 = vdup_lane_u8(top, 7);

        const uint8x8_t patch_lo = filter_intra_patch_4x2_neon(
            taps, s0, vdup_lane_u8(top, 0), vdup_lane_u8(top, 1),
            vdup_lane_u8(top, 2), t3, s5, s6);

        // The right-hand patch uses the last column of the left-hand one as
        // its left neighbours.
        const uint8x8_t patch_hi = filter_intra_patch_4x2_neon(
            taps, t3, vdup_lane_u8(top, 4), vdup_lane_u8(top, 5),
            vdup_lane_u8(top, 6), t7, vdup_lane_u8(patch_lo, 3),
            vdup_lane_u8(patch_lo, 7));

        // Interleave 32-bit halves so each vector is one 8-pixel output row.
        const uint32x2x2_t zipped = vzip_u32(vreinterpret_u32_u8(patch_lo),
                                             vreinterpret_u32_u8(patch_hi));
        const uint8x8_t row0 = vreinterpret_u8_u32(zipped.val[0]);
        const uint8x8_t row1 = vreinterpret_u8_u32(zipped.val[1]);

        vst1_u8(&scratch[row][col], row0);
        vst1_u8(&scratch[row + 1][col], row1);

        vst1_u8(out + (col - 1), row0);
        vst1_u8(out + stride + (col - 1), row1);

        s0 = t7;
        s5 = vdup_lane_u8(patch_hi, 3);
        s6 = vdup_lane_u8(patch_hi, 7);
        col += 8;
      } while (col < width + 1);

      row += 2;
    } while (row < height + 1);
  }
}
215*77c1e3ccSAndroid Build Coastguard Worker #endif // AOM_ARCH_AARCH64
216*77c1e3ccSAndroid Build Coastguard Worker
// Eight outputs of the 3-tap {4, 8, 4} / 16 edge filter. Output i is computed
// from src[i], src[i + 1] and src[i + 2]; 10 bytes are read from src.
static inline uint8x8_t intra_edge_filter_484_neon(const uint8_t *src) {
  const uint8x8_t s0 = vld1_u8(src);
  const uint8x8_t s1 = vld1_u8(src + 1);
  const uint8x8_t s2 = vld1_u8(src + 2);

  // Make use of the identity:
  // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
  const uint16x8_t outer = vaddl_u8(s0, s2);
  const uint16x8_t centre = vaddl_u8(s1, s1);
  return vrshrn_n_u16(vaddq_u16(outer, centre), 2);
}

// Eight outputs of the 3-tap {5, 6, 5} / 16 edge filter. Output i is computed
// from src[i], src[i + 1] and src[i + 2]; 10 bytes are read from src.
static inline uint8x8_t intra_edge_filter_565_neon(const uint8_t *src) {
  const uint8x8_t s0 = vld1_u8(src);
  const uint8x8_t s1 = vld1_u8(src + 1);
  const uint8x8_t s2 = vld1_u8(src + 2);

  uint16x8_t acc = vmull_u8(s0, vdup_n_u8(5));
  acc = vmlal_u8(acc, s1, vdup_n_u8(6));
  acc = vmlal_u8(acc, s2, vdup_n_u8(5));
  return vrshrn_n_u16(acc, 4);
}

// Eight outputs of the 5-tap {2, 4, 4, 4, 2} / 16 edge filter. Output i is
// computed from src[i] .. src[i + 4]; 12 bytes are read from src.
static inline uint8x8_t intra_edge_filter_24442_neon(const uint8_t *src) {
  const uint8x8_t s0 = vld1_u8(src);
  const uint8x8_t s1 = vld1_u8(src + 1);
  const uint8x8_t s2 = vld1_u8(src + 2);
  const uint8x8_t s3 = vld1_u8(src + 3);
  const uint8x8_t s4 = vld1_u8(src + 4);

  // Make use of the identity:
  // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
  const uint16x8_t outer = vaddl_u8(s0, s4);
  uint16x8_t inner = vaddl_u8(s1, s2);
  inner = vaddw_u8(inner, s3);
  inner = vaddq_u16(inner, inner);
  return vrshrn_n_u16(vaddq_u16(outer, inner), 3);
}

// Writes only the first n (0 < n < 8) lanes of v to dst, so that no byte
// beyond dst[n - 1] is read or written.
static inline void intra_edge_store_partial_neon(uint8_t *dst, uint8x8_t v,
                                                 int n) {
  uint8_t lanes[8];
  vst1_u8(lanes, v);
  memcpy(dst, lanes, (size_t)n);
}

// Smooths the intra edge p[0 .. sz - 1] in place.
//
//   p        - edge samples; p[0] is left untouched, p[1 .. sz - 1] are
//              replaced by their filtered values.
//   sz       - number of samples, 0 <= sz <= 129.
//   strength - 0: no filtering; 1: {4, 8, 4}; 2: {5, 6, 5};
//              any other value: {2, 4, 4, 4, 2}. All kernels are divided by
//              16 with rounding.
//
// Samples outside the edge are taken to be equal to the nearest end sample.
void av1_filter_intra_edge_neon(uint8_t *p, int sz, int strength) {
  if (!strength) return;
  assert(sz >= 0 && sz <= 129);

  // The first sample is never rewritten, so fewer than two samples means
  // there is nothing to do (and nothing valid to pad the copy with).
  if (sz < 2) return;

  // Filter from a private copy so that in-place stores never feed back into
  // later outputs. The copy is larger than the maximum sz so whole vectors
  // can be loaded near its end; lanes computed from the unwritten part of the
  // copy are discarded by the partial store.
  uint8_t edge[160];
  memcpy(edge + 1, p, sz * sizeof(*p));

  // Replicate the end samples: one extra on the left, two on the right.
  edge[0] = edge[1];
  edge[sz + 1] = edge[sz];
  edge[sz + 2] = edge[sz];

  // Don't overwrite first pixel.
  uint8_t *dst = p + 1;
  int remaining = sz - 1;

  if (strength == 1) {
    // 3-tap window for p[1] starts at p[0], i.e. edge + 1.
    const uint8_t *src = edge + 1;
    for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
      vst1_u8(dst, intra_edge_filter_484_neon(src));
    }
    if (remaining > 0) {
      intra_edge_store_partial_neon(dst, intra_edge_filter_484_neon(src),
                                    remaining);
    }
  } else if (strength == 2) {
    const uint8_t *src = edge + 1;
    for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
      vst1_u8(dst, intra_edge_filter_565_neon(src));
    }
    if (remaining > 0) {
      intra_edge_store_partial_neon(dst, intra_edge_filter_565_neon(src),
                                    remaining);
    }
  } else {
    // 5-tap window for p[1] starts one sample before p[0], i.e. at edge.
    const uint8_t *src = edge;
    for (; remaining >= 8; remaining -= 8, src += 8, dst += 8) {
      vst1_u8(dst, intra_edge_filter_24442_neon(src));
    }
    if (remaining > 0) {
      intra_edge_store_partial_neon(dst, intra_edge_filter_24442_neon(src),
                                    remaining);
    }
  }
}
360*77c1e3ccSAndroid Build Coastguard Worker
// Doubles the resolution of the intra edge p[-1 .. sz - 1] in place.
//
//   p  - edge samples. On return p[-2] holds the old p[-1], every even index
//        p[2 * i] holds the old p[i], and every odd index p[2 * i - 1] holds
//        the value interpolated between old p[i - 1] and p[i] with the
//        {-1, 9, 9, -1} / 16 kernel (rounded and clamped to 8 bits), for
//        0 <= i < sz. Exactly p[-2 .. 2 * sz - 2] is written.
//   sz - number of samples starting at p[0], at most MAX_UPSAMPLE_SZ. Nothing
//        is done when sz <= 0.
//
// Samples outside p[-1 .. sz - 1] are taken to be equal to the nearest end
// sample.
void av1_upsample_intra_edge_neon(uint8_t *p, int sz) {
  if (sz <= 0) return;

  assert(sz <= MAX_UPSAMPLE_SZ);

  // Work from a private copy, since the output overlaps the input.
  uint8_t edge[MAX_UPSAMPLE_SZ + 3];
  const uint8_t *src = edge;

  // Copy p[-1..(sz-1)] and pad out both ends. The right-hand padding is
  // continued to the end of the buffer so that every vector load below reads
  // initialised data, whatever sz is.
  edge[0] = p[-1];
  edge[1] = p[-1];
  memcpy(edge + 2, p, sz);
  memset(edge + sz + 2, p[sz - 1], sizeof(edge) - (size_t)(sz + 2));
  p[-2] = p[-1];

  uint8_t *dst = p - 1;

  do {
    const uint8x8_t s0 = vld1_u8(src);
    const uint8x8_t s1 = vld1_u8(src + 1);
    const uint8x8_t s2 = vld1_u8(src + 2);
    const uint8x8_t s3 = vld1_u8(src + 3);

    // 9 * (s1 + s2) - (s0 + s3); the maximum magnitude fits in 16 bits.
    const int16x8_t outer = vreinterpretq_s16_u16(vaddl_u8(s0, s3));
    int16x8_t inner = vreinterpretq_s16_u16(vaddl_u8(s1, s2));
    inner = vmulq_n_s16(inner, 9);
    inner = vsubq_s16(inner, outer);

    // Interleave interpolated samples with the original samples.
    const uint8x8x2_t res = { { vqrshrun_n_s16(inner, 4), s2 } };

    if (sz >= 8) {
      vst2_u8(dst, res);
    } else {
      // Final partial group: only 2 * sz bytes belong to the output, so do
      // not let the 16-byte interleaved store run past them.
      uint8_t tail[16];
      vst2_u8(tail, res);
      memcpy(dst, tail, 2 * (size_t)sz);
    }

    src += 8;
    dst += 16;
    sz -= 8;
  } while (sz > 0);
}
398