/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11*77c1e3ccSAndroid Build Coastguard Worker
12*77c1e3ccSAndroid Build Coastguard Worker #include <arm_neon.h>
13*77c1e3ccSAndroid Build Coastguard Worker
14*77c1e3ccSAndroid Build Coastguard Worker #include <assert.h>
15*77c1e3ccSAndroid Build Coastguard Worker #include <math.h>
16*77c1e3ccSAndroid Build Coastguard Worker
17*77c1e3ccSAndroid Build Coastguard Worker #include "config/aom_config.h"
18*77c1e3ccSAndroid Build Coastguard Worker
19*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/mem_neon.h"
20*77c1e3ccSAndroid Build Coastguard Worker #include "aom_dsp/arm/sum_neon.h"
21*77c1e3ccSAndroid Build Coastguard Worker #include "aom_mem/aom_mem.h"
22*77c1e3ccSAndroid Build Coastguard Worker
23*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/quant_common.h"
24*77c1e3ccSAndroid Build Coastguard Worker #include "av1/common/seg_common.h"
25*77c1e3ccSAndroid Build Coastguard Worker
26*77c1e3ccSAndroid Build Coastguard Worker #include "av1/encoder/av1_quantize.h"
27*77c1e3ccSAndroid Build Coastguard Worker #include "av1/encoder/encoder.h"
28*77c1e3ccSAndroid Build Coastguard Worker #include "av1/encoder/rd.h"
29*77c1e3ccSAndroid Build Coastguard Worker
// Reduces the eight signed 16-bit lanes of |v_eobmax| to their maximum and
// returns it converted to uint16_t.
static inline uint16_t get_max_eob(int16x8_t v_eobmax) {
#if AOM_ARCH_AARCH64
  // AArch64 has a single across-vector maximum.
  return (uint16_t)vmaxvq_s16(v_eobmax);
#else
  // 8 lanes -> 4 lanes: low half against high half.
  const int16x4_t v_max4 =
      vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
  // 4 lanes -> 2 lanes: a 32-bit right shift of the 64-bit view moves lanes
  // 2 and 3 down onto lanes 0 and 1.
  const int64x1_t v_upper_pair = vshr_n_s64(vreinterpret_s64_s16(v_max4), 32);
  const int16x4_t v_max2 = vmax_s16(v_max4, vreinterpret_s16_s64(v_upper_pair));
  // 2 lanes -> 1 lane: a 16-bit right shift moves lane 1 down onto lane 0.
  const int64x1_t v_second = vshr_n_s64(vreinterpret_s64_s16(v_max2), 16);
  const int16x4_t v_max1 = vmax_s16(v_max2, vreinterpret_s16_s64(v_second));
  // Only lane 0 holds the full reduction; the other lanes are don't-care.
  return (uint16_t)vget_lane_s16(v_max1, 0);
#endif
}
47*77c1e3ccSAndroid Build Coastguard Worker
// Updates the per-lane running end-of-block maximum with eight more scan
// positions.
//
// Loads eight entries from |iscan|, adds one to each, replaces the entries
// whose |v_mask| bits are clear with zero, and returns the lane-wise maximum
// of that candidate vector and |v_eobmax|.
static inline int16x8_t get_max_lane_eob(const int16_t *iscan,
                                         int16x8_t v_eobmax,
                                         uint16x8_t v_mask) {
  const int16x8_t v_one = vdupq_n_s16(1);
  const int16x8_t v_none = vdupq_n_s16(0);
  // iscan[i] + 1 is the eob value if lane i is the last non-zero coefficient.
  const int16x8_t v_candidate = vaddq_s16(vld1q_s16(iscan), v_one);
  // Lanes that quantized to zero contribute 0 and so never raise the maximum
  // above what it already is.
  return vmaxq_s16(v_eobmax, vbslq_s16(v_mask, v_candidate, v_none));
}
56*77c1e3ccSAndroid Build Coastguard Worker
// Quantizes and dequantizes eight coefficients with the "fp" quantizer.
//
// Reads eight values from |coeff_ptr|, writes eight quantized values to
// |qcoeff_ptr| and eight dequantized values to |dqcoeff_ptr|. |v_quant|,
// |v_dequant| and |v_round| supply the per-lane constants; |v_zero| must be
// an all-zero vector. Returns a per-lane mask that is all-ones where the
// quantized magnitude is greater than zero.
static inline uint16x8_t quantize_fp_8(const tran_low_t *coeff_ptr,
                                       tran_low_t *qcoeff_ptr,
                                       tran_low_t *dqcoeff_ptr,
                                       int16x8_t v_quant, int16x8_t v_dequant,
                                       int16x8_t v_round, int16x8_t v_zero) {
  const int16x8_t v_in = load_tran_low_to_s16q(coeff_ptr);
  // 0 for non-negative lanes, -1 (all ones) for negative lanes.
  const int16x8_t v_sign = vshrq_n_s16(v_in, 15);
  // |coeff| + round, saturating at INT16_MAX.
  const int16x8_t v_rounded = vqaddq_s16(vabsq_s16(v_in), v_round);
  // vqdmulh yields (2 * a * b) >> 16; the extra shift makes it (a * b) >> 16.
  const int16x8_t v_level = vshrq_n_s16(vqdmulhq_s16(v_rounded, v_quant), 1);
  // Re-apply the sign: (x ^ s) - s negates x when s == -1.
  const int16x8_t v_q = vsubq_s16(veorq_s16(v_level, v_sign), v_sign);

  store_s16q_to_tran_low(qcoeff_ptr, v_q);
  store_s16q_to_tran_low(dqcoeff_ptr, vmulq_s16(v_q, v_dequant));
  return vcgtq_s16(v_level, v_zero);
}
75*77c1e3ccSAndroid Build Coastguard Worker
// NEON "fp" quantizer for a block of |count| coefficients.
//
// Coefficients are processed eight at a time, so |count| is expected to be a
// positive multiple of 8 and all coefficient/iscan buffers must hold at least
// that many elements. |round_ptr|, |quant_ptr| and |dequant_ptr| are each
// read as a full vector of eight int16_t values: the eight lanes are applied
// as-is to the first eight coefficients, and lane 1 is then broadcast and
// used for every remaining coefficient.
//
// On return |qcoeff_ptr| and |dqcoeff_ptr| hold the quantized and
// dequantized values, and |*eob_ptr| is the largest iscan[i] + 1 over all
// lanes that quantized to a positive magnitude, or 0 if there is none.
// |zbin_ptr|, |quant_shift_ptr| and |scan| are unused.
void av1_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
                          const int16_t *zbin_ptr, const int16_t *round_ptr,
                          const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr,
                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan, const int16_t *iscan) {
  // TODO(jingning) Decide the need of these arguments after the
  // quantization process is completed.
  (void)zbin_ptr;
  (void)quant_shift_ptr;
  (void)scan;

  const int16x8_t v_zero = vdupq_n_s16(0);
  int16x8_t v_quant = vld1q_s16(quant_ptr);
  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
  int16x8_t v_round = vld1q_s16(round_ptr);
  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);

  // process dc and the first seven ac coeffs
  uint16x8_t v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
                                       v_quant, v_dequant, v_round, v_zero);
  v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);

  // overwrite the dc constants with ac constants
  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);

  // now process the rest of the ac coeffs. The loop is pre-tested so that a
  // block of exactly eight coefficients does not touch a second vector that
  // would lie past the end of every buffer.
  for (count -= 8; count > 0; count -= 8) {
    coeff_ptr += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    iscan += 8;
    v_nz_mask = quantize_fp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                              v_dequant, v_round, v_zero);
    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210);
}
120*77c1e3ccSAndroid Build Coastguard Worker
// Quantizes and dequantizes eight 16-bit coefficients ("lp" variant, which
// reads and writes int16_t buffers directly).
//
// Reads eight values from |coeff_ptr|, writes eight quantized values to
// |qcoeff_ptr| and eight dequantized values to |dqcoeff_ptr|. |v_zero| must
// be an all-zero vector. Returns a per-lane mask that is all-ones where the
// quantized magnitude is greater than zero.
static inline uint16x8_t quantize_lp_8(const int16_t *coeff_ptr,
                                       int16_t *qcoeff_ptr,
                                       int16_t *dqcoeff_ptr, int16x8_t v_quant,
                                       int16x8_t v_dequant, int16x8_t v_round,
                                       int16x8_t v_zero) {
  const int16x8_t v_in = vld1q_s16(coeff_ptr);
  // 0 for non-negative lanes, -1 (all ones) for negative lanes.
  const int16x8_t v_sign = vshrq_n_s16(v_in, 15);
  // |coeff| + round, saturating at INT16_MAX.
  const int16x8_t v_rounded = vqaddq_s16(vabsq_s16(v_in), v_round);
  // vqdmulh yields (2 * a * b) >> 16; the extra shift makes it (a * b) >> 16.
  const int16x8_t v_level = vshrq_n_s16(vqdmulhq_s16(v_rounded, v_quant), 1);
  // Re-apply the sign: (x ^ s) - s negates x when s == -1.
  const int16x8_t v_q = vsubq_s16(veorq_s16(v_level, v_sign), v_sign);

  vst1q_s16(qcoeff_ptr, v_q);
  vst1q_s16(dqcoeff_ptr, vmulq_s16(v_q, v_dequant));
  return vcgtq_s16(v_level, v_zero);
}
139*77c1e3ccSAndroid Build Coastguard Worker
// NEON "lp" quantizer operating on int16_t coefficient buffers.
//
// Coefficients are processed eight at a time, so |n_coeffs| is expected to be
// a positive multiple of 8 and all coefficient/iscan buffers must hold at
// least that many elements. |round_ptr|, |quant_ptr| and |dequant_ptr| are
// each read as a full vector of eight int16_t values: the eight lanes are
// applied as-is to the first eight coefficients, and lane 1 is then broadcast
// and used for every remaining coefficient.
//
// On return |qcoeff_ptr| and |dqcoeff_ptr| hold the quantized and
// dequantized values, and |*eob_ptr| is the largest iscan[i] + 1 over all
// lanes that quantized to a positive magnitude, or 0 if there is none.
// |scan| is unused.
void av1_quantize_lp_neon(const int16_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *round_ptr, const int16_t *quant_ptr,
                          int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const int16_t *scan, const int16_t *iscan) {
  (void)scan;

  const int16x8_t v_zero = vdupq_n_s16(0);
  int16x8_t v_quant = vld1q_s16(quant_ptr);
  int16x8_t v_dequant = vld1q_s16(dequant_ptr);
  int16x8_t v_round = vld1q_s16(round_ptr);
  int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);

  // process dc and the first seven ac coeffs
  uint16x8_t v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
                                       v_quant, v_dequant, v_round, v_zero);
  v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);

  // overwrite the dc constants with ac constants
  v_quant = vdupq_lane_s16(vget_low_s16(v_quant), 1);
  v_dequant = vdupq_lane_s16(vget_low_s16(v_dequant), 1);
  v_round = vdupq_lane_s16(vget_low_s16(v_round), 1);

  // now process the rest of the ac coeffs. The loop is pre-tested and stops
  // on "> 0" rather than "!= 0": with an exact-equality exit, a count of 8
  // (or any count that is not a multiple of 8) would step over zero and keep
  // reading and writing past the end of the buffers.
  for (intptr_t count = n_coeffs - 8; count > 0; count -= 8) {
    coeff_ptr += 8;
    qcoeff_ptr += 8;
    dqcoeff_ptr += 8;
    iscan += 8;
    v_nz_mask = quantize_lp_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant,
                              v_dequant, v_round, v_zero);
    v_eobmax_76543210 = get_max_lane_eob(iscan, v_eobmax_76543210, v_nz_mask);
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210);
}
179*77c1e3ccSAndroid Build Coastguard Worker
// Quantizes and dequantizes eight coefficients with the "fp" quantizer for a
// transform that carries an extra scale of 2^|log_scale|.
//
// Reads eight values from |coeff_ptr|, writes eight quantized values to
// |qcoeff_ptr| and eight dequantized values to |dqcoeff_ptr|. |v_round| is
// used as given (the caller in this file passes it already scaled down by
// |log_scale|). |v_zero| must be an all-zero vector. Returns a per-lane mask
// that is all-ones where the quantized magnitude is greater than zero.
static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale_8(
    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant,
    int16x8_t v_round, int16x8_t v_zero, int log_scale) {
  // Per-lane shift counts for vshlq; a negative count shifts right.
  const int16x8_t v_pre_shift = vdupq_n_s16(log_scale - 1);
  const int16x8_t v_thresh_shift = vdupq_n_s16(-(1 + log_scale));
  const int16x8_t v_post_shift = vdupq_n_s16(-log_scale);

  const int16x8_t v_in = load_tran_low_to_s16q(coeff_ptr);
  // 0 for non-negative lanes, -1 (all ones) for negative lanes.
  const int16x8_t v_sign = vshrq_n_s16(v_in, 15);
  const int16x8_t v_mag = vabsq_s16(v_in);

  // Lanes with |coeff| < dequant >> (1 + log_scale) are forced to zero.
  const int16x8_t v_thresh = vshlq_s16(v_dequant, v_thresh_shift);
  const uint16x8_t v_keep = vcgeq_s16(v_mag, v_thresh);
  const int16x8_t v_rounded =
      vandq_s16(vqaddq_s16(v_mag, v_round), vreinterpretq_s16_u16(v_keep));

  // vqdmulh yields (2 * a * b) >> 16; pre-shifting a left by (log_scale - 1)
  // therefore produces (rounded * quant) >> (16 - log_scale).
  const int16x8_t v_scaled = vshlq_s16(v_rounded, v_pre_shift);
  const int16x8_t v_level = vqdmulhq_s16(v_scaled, v_quant);
  // Re-apply the sign: (x ^ s) - s negates x when s == -1.
  const int16x8_t v_q = vsubq_s16(veorq_s16(v_level, v_sign), v_sign);

  // level * dequant may occupy all 16 bits, so the right shift by log_scale
  // is done on the unsigned view before the sign is restored.
  const uint16x8_t v_product =
      vreinterpretq_u16_s16(vmulq_s16(v_level, v_dequant));
  const int16x8_t v_dq_mag =
      vreinterpretq_s16_u16(vshlq_u16(v_product, v_post_shift));
  const int16x8_t v_dq = vsubq_s16(veorq_s16(v_dq_mag, v_sign), v_sign);

  store_s16q_to_tran_low(qcoeff_ptr, v_q);
  store_s16q_to_tran_low(dqcoeff_ptr, v_dq);
  return vcgtq_s16(v_level, v_zero);
}
211*77c1e3ccSAndroid Build Coastguard Worker
// Quantizes and dequantizes eight coefficients with the "fp" quantizer,
// specialised for a log scale of 2.
//
// Reads eight values from |coeff_ptr|, writes eight quantized values to
// |qcoeff_ptr| and eight dequantized values to |dqcoeff_ptr|. |v_round| is
// used as given. |v_zero| must be an all-zero vector. Returns a per-lane mask
// that is all-ones where the quantized magnitude is greater than zero.
static AOM_FORCE_INLINE uint16x8_t quantize_fp_logscale2_8(
    const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, int16x8_t v_quant, int16x8_t v_dequant,
    int16x8_t v_round, int16x8_t v_zero) {
  const int16x8_t v_in = load_tran_low_to_s16q(coeff_ptr);
  // 0 for non-negative lanes, -1 (all ones) for negative lanes.
  const int16x8_t v_sign = vshrq_n_s16(v_in, 15);
  const int16x8_t v_mag = vabsq_s16(v_in);

  // Keep a lane only when 2 * |coeff| >= dequant >> 2 (unsigned compare).
  const uint16x8_t v_mag_x2 = vshlq_n_u16(vreinterpretq_u16_s16(v_mag), 1);
  const uint16x8_t v_thresh = vshrq_n_u16(vreinterpretq_u16_s16(v_dequant), 2);
  const uint16x8_t v_keep = vcgeq_u16(v_mag_x2, v_thresh);
  // abs_coeff = keep ? abs_coeff + round : 0
  const int16x8_t v_rounded =
      vandq_s16(vqaddq_s16(v_mag, v_round), vreinterpretq_s16_u16(v_keep));

  // level = (abs_coeff * quant) >> 14, stitched from two 16-bit views of the
  // product: vqdmulh << 1 supplies bits 15 and up, and the low 16 bits of the
  // plain multiply shifted right by 14 fill in the bottom two bits.
  const int16x8_t v_level_hi =
      vshlq_n_s16(vqdmulhq_s16(v_rounded, v_quant), 1);
  const uint16x8_t v_level_lo =
      vshrq_n_u16(vreinterpretq_u16_s16(vmulq_s16(v_rounded, v_quant)), 14);
  const int16x8_t v_level =
      vorrq_s16(v_level_hi, vreinterpretq_s16_u16(v_level_lo));
  // Re-apply the sign: (x ^ s) - s negates x when s == -1.
  const int16x8_t v_q = vsubq_s16(veorq_s16(v_level, v_sign), v_sign);

  // abs_dqcoeff = (level * dequant) >> 2, stitched the same way: vqdmulh << 13
  // supplies the top three bits and the low 16 bits of the plain multiply
  // shifted right by 2 supply the bottom fourteen.
  const int16x8_t v_dq_hi = vshlq_n_s16(vqdmulhq_s16(v_level, v_dequant), 13);
  const uint16x8_t v_dq_lo =
      vshrq_n_u16(vreinterpretq_u16_s16(vmulq_s16(v_level, v_dequant)), 2);
  const int16x8_t v_dq_mag = vorrq_s16(v_dq_hi, vreinterpretq_s16_u16(v_dq_lo));
  const int16x8_t v_dq = vsubq_s16(veorq_s16(v_dq_mag, v_sign), v_sign);

  store_s16q_to_tran_low(qcoeff_ptr, v_q);
  store_s16q_to_tran_low(dqcoeff_ptr, v_dq);
  return vcgtq_s16(v_level, v_zero);
}
244*77c1e3ccSAndroid Build Coastguard Worker
// Shared body of the log-scaled "fp" quantizers (no quantization matrix).
//
// Works in three stages:
//   1. Scans backwards from the end of |coeff_ptr| in groups of 16 and drops
//      every trailing group in which all magnitudes are below the AC zero-bin
//      threshold dequant[1] >> (1 + log_scale).
//   2. Zero-fills the corresponding tail of |qcoeff_ptr| and |dqcoeff_ptr|.
//   3. Quantizes the remaining leading coefficients eight at a time; the
//      first eight are always quantized, even if stage 1 dropped everything.
//
// |round_ptr|, |quant_ptr| and |dequant_ptr| are each read as a full vector
// of eight int16_t values; the eight lanes are used as-is for the first eight
// coefficients and lane 1 is broadcast for the rest. The rounding values are
// scaled down by |log_scale| (with rounding) before use. |*eob_ptr| receives
// the largest iscan[i] + 1 over all lanes that quantized to a positive
// magnitude, or 0 if there is none.
static AOM_FORCE_INLINE void quantize_fp_no_qmatrix_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
    const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
    const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *iscan,
    int log_scale) {
  const int16x8_t v_zero = vdupq_n_s16(0);
  const int16x8_t v_quant_first = vld1q_s16(quant_ptr);
  const int16x8_t v_dequant_first = vld1q_s16(dequant_ptr);
  // Rounding multiply by 2^(15 - log_scale) in Q15 divides by 2^log_scale.
  const int16x8_t v_round_first = vqrdmulhq_n_s16(
      vld1q_s16(round_ptr), (int16_t)(1 << (15 - log_scale)));

  assert(n_coeffs > 16);

  // Pre-scan pass: the AC threshold (lane 1) is applied to every lane.
  const int16x8_t v_zbin = vdupq_lane_s16(
      vget_low_s16(vshlq_s16(v_dequant_first, vdupq_n_s16(-(1 + log_scale)))),
      1);
  intptr_t non_zero_count = n_coeffs;
  do {
    const int16x8_t v_mag_hi =
        vabsq_s16(load_tran_low_to_s16q(coeff_ptr + non_zero_count - 8));
    const int16x8_t v_mag_lo =
        vabsq_s16(load_tran_low_to_s16q(coeff_ptr + non_zero_count - 16));
    const uint16x8_t v_hit_hi = vcgeq_s16(v_mag_hi, v_zbin);
    const uint16x8_t v_hit_lo = vcgeq_s16(v_mag_lo, v_zbin);
    // Stop at the first group (from the end) that has any lane at or above
    // the threshold; a zero sum means every mask lane was clear.
    if (horizontal_long_add_u16x8(v_hit_hi, v_hit_lo) != 0) break;
    non_zero_count -= 16;
  } while (non_zero_count > 0);

  // Everything past |non_zero_count| is known to quantize to zero.
  const intptr_t tail_zeros = n_coeffs - non_zero_count;
  memset(qcoeff_ptr + non_zero_count, 0, tail_zeros * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr + non_zero_count, 0, tail_zeros * sizeof(*dqcoeff_ptr));

  // DC and the first seven AC coefficients, using the tables as loaded.
  uint16x8_t v_nz_mask =
      (log_scale == 2)
          ? quantize_fp_logscale2_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
                                    v_quant_first, v_dequant_first,
                                    v_round_first, v_zero)
          : quantize_fp_logscale_8(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
                                   v_quant_first, v_dequant_first,
                                   v_round_first, v_zero, log_scale);
  int16x8_t v_eobmax_76543210 =
      get_max_lane_eob(iscan, vdupq_n_s16(-1), v_nz_mask);

  // Remaining coefficients all use the AC constants (lane 1).
  const int16x8_t v_quant_ac = vdupq_lane_s16(vget_low_s16(v_quant_first), 1);
  const int16x8_t v_dequant_ac =
      vdupq_lane_s16(vget_low_s16(v_dequant_first), 1);
  const int16x8_t v_round_ac = vdupq_lane_s16(vget_low_s16(v_round_first), 1);

  for (intptr_t offset = 8; offset < non_zero_count; offset += 8) {
    if (log_scale == 2) {
      v_nz_mask = quantize_fp_logscale2_8(
          coeff_ptr + offset, qcoeff_ptr + offset, dqcoeff_ptr + offset,
          v_quant_ac, v_dequant_ac, v_round_ac, v_zero);
    } else {
      v_nz_mask = quantize_fp_logscale_8(
          coeff_ptr + offset, qcoeff_ptr + offset, dqcoeff_ptr + offset,
          v_quant_ac, v_dequant_ac, v_round_ac, v_zero, log_scale);
    }
    v_eobmax_76543210 =
        get_max_lane_eob(iscan + offset, v_eobmax_76543210, v_nz_mask);
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210);
}
321*77c1e3ccSAndroid Build Coastguard Worker
// "fp" quantizer entry point that runs the shared log-scaled kernel with a
// log scale of 1. |zbin_ptr|, |quant_shift_ptr| and |scan| are accepted for
// signature compatibility and ignored.
void av1_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  (void)scan;
  (void)quant_shift_ptr;
  (void)zbin_ptr;

  const int log_scale = 1;
  quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
                              qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
                              iscan, log_scale);
}
337*77c1e3ccSAndroid Build Coastguard Worker
// "fp" quantizer entry point that runs the shared log-scaled kernel with a
// log scale of 2. |zbin_ptr|, |quant_shift_ptr| and |scan| are accepted for
// signature compatibility and ignored.
void av1_quantize_fp_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr,
                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                const int16_t *scan, const int16_t *iscan) {
  (void)scan;
  (void)quant_shift_ptr;
  (void)zbin_ptr;

  const int log_scale = 2;
  quantize_fp_no_qmatrix_neon(coeff_ptr, n_coeffs, round_ptr, quant_ptr,
                              qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr,
                              iscan, log_scale);
}
353*77c1e3ccSAndroid Build Coastguard Worker
// Quantizes and dequantizes a block of coefficients, eight lanes at a time.
//
// Inputs:
//   coeff_ptr        coefficients to quantize; read in groups of 8, the first
//                    group unconditionally (so n_coeffs is expected to be a
//                    non-zero multiple of 8 -- TODO confirm with callers).
//   n_coeffs         number of coefficients.
//   zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
//                    two-entry tables: entry [0] is applied to coefficient 0
//                    only, entry [1] to every other coefficient.
//   iscan            per-coefficient scan position, used for the eob.
//   scan             unused.
// Outputs:
//   qcoeff_ptr       quantized coefficients (zero where |coeff| < zbin).
//   dqcoeff_ptr      dequantized coefficients (zero where |coeff| < zbin).
//   eob_ptr          get_max_eob() of the largest iscan value seen among
//                    lanes that quantized to a positive magnitude, plus 1.
void aom_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const int16_t *zbin_ptr, const int16_t *round_ptr,
                         const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                         uint16_t *eob_ptr, const int16_t *scan,
                         const int16_t *iscan) {
  // Only scan is unused; quant_shift_ptr is read below for vquant_shift.
  (void)scan;

  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };

  // Both outputs start zeroed, so lanes that fail the zbin test (and whole
  // groups that are skipped) stay zero.
  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  // All lanes start at -1 (all bits set), so an all-zero block yields
  // get_max_eob(-1...) + 1.
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));

  // Broadcast the entry-[1] parameters; lane 0 is patched with entry [0] for
  // the first group only.
  int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);  // 0 or -1 per lane.
  int16x8_t v_abs = vabsq_s16(v_coeff);

  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);

  // Per-lane mask: magnitude reaches the zero-bin threshold.
  uint16x8_t vcond = vcgeq_s16(v_abs, vzbins);
  // Narrow the mask to 8 bytes and read it as one 64-bit value: non-zero iff
  // at least one lane passed.
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    vround = vsetq_lane_s16(round_ptr[0], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);

    // tmp = sat(|coeff| + round)
    int16x8_t vtmp = vqaddq_s16(v_abs, vround);
    // tmp2 = tmp + ((tmp * quant) >> 16): doubling-high multiply, then an
    // accumulating shift right by 1.
    int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    // tmp2 = (tmp2 * quant_shift) >> 16
    vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);

    // (x ^ sign) - sign restores the original sign of the coefficient.
    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
    // Low 16 bits of the magnitude times the dequantizer.
    int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);

    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    // Put the entry-[1] parameters back into lane 0 for the remaining groups.
    vround = vsetq_lane_s16(round_ptr[1], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    // Track, per lane, the largest iscan value of a lane that passed the zbin
    // test and quantized to a positive magnitude.
    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);

  // Remaining groups: every lane uses the entry-[1] parameters.
  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);
    vcond = vcgeq_s16(v_abs, vzbins);

    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);
      int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);

      vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
      int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
450*77c1e3ccSAndroid Build Coastguard Worker
// Per-lane multiply of x0 (int16x8_t) by x1 (uint16x8_t) followed by a right
// shift of AOM_QM_BITS, assembled from two 16-bit pieces that are OR'ed
// together:
//   - the doubling-high product of x0 and x1, shifted left by
//     15 - AOM_QM_BITS (the upper bits of the product), and
//   - the low 16 bits of the product, logically shifted right by AOM_QM_BITS.
// The result is an int16x8_t. Each argument is expanded twice, so pass
// side-effect-free expressions only.
#define QM_MULL_SHIFT(x0, x1)                                                 \
  vreinterpretq_s16_u16(vorrq_u16(                                            \
      vreinterpretq_u16_s16(vshlq_n_s16(                                      \
          vqdmulhq_s16((x0), vreinterpretq_s16_u16((x1))),                    \
          15 - AOM_QM_BITS)),                                                 \
      vshrq_n_u16(vmulq_u16(vreinterpretq_u16_s16((x0)), (x1)), AOM_QM_BITS)))
456*77c1e3ccSAndroid Build Coastguard Worker
// Quantizes and dequantizes a block of coefficients, eight lanes at a time,
// with optional quantization-matrix weighting and no log-scale adjustment.
//
// Inputs:
//   coeff_ptr        coefficients to quantize; read in groups of 8, the first
//                    group unconditionally (so n_coeffs is expected to be a
//                    non-zero multiple of 8 -- TODO confirm with callers).
//   n_coeffs         number of coefficients.
//   zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
//                    two-entry tables: entry [0] is applied to coefficient 0
//                    only, entry [1] to every other coefficient.
//   iscan            per-coefficient scan position, used for the eob.
//   qm_ptr           optional per-coefficient 8-bit weights (may be NULL).
//                    When present, the zero-bin test is done on
//                    QM_MULL_SHIFT(|coeff|, weight) and the first quantization
//                    step becomes QM_MULL_SHIFT(tmp, weight) + tmp.
//                    NOTE(review): that path does not read quant_ptr; verify
//                    against the reference implementation before relying on it.
//   iqm_ptr          optional per-coefficient 8-bit inverse weights (may be
//                    NULL); when present the dequantizer of each lane is
//                    QM_MULL_SHIFT(dequant, inverse weight).
//   scan             unused.
// Outputs:
//   qcoeff_ptr       quantized coefficients (zero where the zbin test fails).
//   dqcoeff_ptr      dequantized coefficients (zero where the zbin test fails).
//   eob_ptr          get_max_eob() of the largest iscan value seen among
//                    lanes that quantized to a positive magnitude, plus 1.
static void aom_quantize_b_helper_16x16_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr) {
  (void)scan;

  // Only assigned and read when qm_ptr != NULL.
  uint16x8_t vwt;
  const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };

  // Both outputs start zeroed, so lanes that fail the zbin test (and whole
  // groups that are skipped) stay zero.
  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  // All lanes start at -1 (all bits set).
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));

  // Broadcast the entry-[1] parameters; lane 0 is patched with entry [0] for
  // the first group only.
  int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);  // 0 or -1 per lane.
  int16x8_t v_abs = vabsq_s16(v_coeff);
  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
  uint16x8_t vcond;
  if (qm_ptr == NULL) {
    vcond = vcgeq_s16(v_abs, vzbins);
  } else {
    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
  }
  // Narrow the mask to 8 bytes and read it as one 64-bit value: non-zero iff
  // at least one lane passed.
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    vround = vsetq_lane_s16(round_ptr[0], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);

    // tmp = sat(|coeff| + round)
    int16x8_t vtmp = vqaddq_s16(v_abs, vround);

    int16x8_t vtmp2;
    if (qm_ptr == NULL) {
      // tmp2 = tmp + ((tmp * quant) >> 16)
      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    } else {
      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
      vtmp2 = vaddq_s16(vtmp2, vtmp);
    }

    // tmp2 = (tmp2 * quant_shift) >> 16
    vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
    // (x ^ sign) - sign restores the original sign of the coefficient.
    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);

    // Weight a per-group copy of the dequantizer. Writing the weighted value
    // back into vdequant would leave lanes 1..7 weighted for every later
    // group, compounding the inverse weights.
    int16x8_t vdequant_wt = vdequant;
    if (iqm_ptr != NULL) {
      const uint16x8_t viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
      vdequant_wt = QM_MULL_SHIFT(vdequant, viwt);
    }
    // Low 16 bits of the magnitude times the dequantizer.
    int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant_wt);
    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    // Put the entry-[1] parameters back into lane 0 for the remaining groups.
    vround = vsetq_lane_s16(round_ptr[1], vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    // Track, per lane, the largest iscan value of a lane that passed the zbin
    // test and quantized to a positive magnitude.
    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);

  // Remaining groups: every lane uses the entry-[1] parameters.
  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);

    if (qm_ptr == NULL) {
      vcond = vcgeq_s16(v_abs, vzbins);
    } else {
      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    }
    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);

      int16x8_t vtmp2;
      if (qm_ptr == NULL) {
        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
      } else {
        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
        vtmp2 = vaddq_s16(vtmp2, vtmp);
      }

      vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);

      // Per-group weighted copy; vdequant itself stays unweighted.
      int16x8_t vdequant_wt = vdequant;
      if (iqm_ptr != NULL) {
        const uint16x8_t viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
        vdequant_wt = QM_MULL_SHIFT(vdequant, viwt);
      }
      int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant_wt);
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
585*77c1e3ccSAndroid Build Coastguard Worker
// Quantizes and dequantizes a block of coefficients, eight lanes at a time,
// with optional quantization-matrix weighting and a log scale of 1.
//
// Relative to the unscaled variant, the log scale shows up in three places:
// zbin and round are reduced with ROUND_POWER_OF_TWO(x, 1), the quant_shift
// multiply keeps one more bit ((tmp2 * quant_shift) >> 15), and the
// dequantized product is logically shifted right by 1.
//
// Inputs:
//   coeff_ptr        coefficients to quantize; read in groups of 8, the first
//                    group unconditionally (so n_coeffs is expected to be a
//                    non-zero multiple of 8 -- TODO confirm with callers).
//   n_coeffs         number of coefficients.
//   zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
//                    two-entry tables: entry [0] is applied to coefficient 0
//                    only, entry [1] to every other coefficient.
//   iscan            per-coefficient scan position, used for the eob.
//   qm_ptr           optional per-coefficient 8-bit weights (may be NULL).
//                    When present, the zero-bin test is done on
//                    QM_MULL_SHIFT(|coeff|, weight) and the first quantization
//                    step becomes QM_MULL_SHIFT(tmp, weight) + tmp.
//                    NOTE(review): that path does not read quant_ptr; verify
//                    against the reference implementation before relying on it.
//   iqm_ptr          optional per-coefficient 8-bit inverse weights (may be
//                    NULL); when present the dequantizer of each lane is
//                    QM_MULL_SHIFT(dequant, inverse weight).
//   scan             unused.
// Outputs:
//   qcoeff_ptr       quantized coefficients (zero where the zbin test fails).
//   dqcoeff_ptr      dequantized coefficients (zero where the zbin test fails).
//   eob_ptr          get_max_eob() of the largest iscan value seen among
//                    lanes that quantized to a positive magnitude, plus 1.
static void aom_quantize_b_helper_32x32_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr) {
  (void)scan;

  // Only assigned and read when qm_ptr != NULL.
  uint16x8_t vwt;
  const int log_scale = 1;
  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };

  // Both outputs start zeroed, so lanes that fail the zbin test (and whole
  // groups that are skipped) stay zero.
  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  // All lanes start at -1 (all bits set).
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
  // The same all -1 vector doubles as the shift count for vshlq_u16: a
  // negative count shifts right, here by 1 (= log_scale).
  const int16x8_t v_log_scale = v_eobmax_76543210;

  // Broadcast the entry-[1] parameters; lane 0 is patched with entry [0] for
  // the first group only.
  int16x8_t vzbins = vdupq_n_s16(zbins[1]),
            vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);  // 0 or -1 per lane.
  int16x8_t v_abs = vabsq_s16(v_coeff);
  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
  uint16x8_t vcond;
  if (qm_ptr == NULL) {
    vcond = vcgeq_s16(v_abs, vzbins);
  } else {
    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
  }
  // Narrow the mask to 8 bytes and read it as one 64-bit value: non-zero iff
  // at least one lane passed.
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    vround =
        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);

    // tmp = sat(|coeff| + round)
    int16x8_t vtmp = vqaddq_s16(v_abs, vround);

    int16x8_t vtmp2;
    if (qm_ptr == NULL) {
      // tmp2 = tmp + ((tmp * quant) >> 16)
      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    } else {
      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
      vtmp2 = vaddq_s16(vtmp2, vtmp);
    }

    // tmp2 = (tmp2 * quant_shift) >> 15 (no extra >> 1 in this variant).
    vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);
    // (x ^ sign) - sign restores the original sign of the coefficient.
    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);

    // Weight a per-group copy of the dequantizer. Writing the weighted value
    // back into vdequant would leave lanes 1..7 weighted for every later
    // group, compounding the inverse weights.
    int16x8_t vdequant_wt = vdequant;
    if (iqm_ptr != NULL) {
      const uint16x8_t viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
      vdequant_wt = QM_MULL_SHIFT(vdequant, viwt);
    }
    // Low 16 bits of magnitude * dequantizer, logically shifted right by 1.
    int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
        vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant_wt)), v_log_scale));
    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    // Put the entry-[1] parameters back into lane 0 for the remaining groups.
    // (vzbins lane 0 is restored unconditionally after this block.)
    vround =
        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    // Track, per lane, the largest iscan value of a lane that passed the zbin
    // test and quantized to a positive magnitude.
    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);

  // Remaining groups: every lane uses the entry-[1] parameters.
  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);

    if (qm_ptr == NULL) {
      vcond = vcgeq_s16(v_abs, vzbins);
    } else {
      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    }
    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);

      int16x8_t vtmp2;
      if (qm_ptr == NULL) {
        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
      } else {
        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
        vtmp2 = vaddq_s16(vtmp2, vtmp);
      }
      vtmp2 = vqdmulhq_s16(vtmp2, vquant_shift);

      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);

      // Per-group weighted copy; vdequant itself stays unweighted.
      int16x8_t vdequant_wt = vdequant;
      if (iqm_ptr != NULL) {
        const uint16x8_t viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
        vdequant_wt = QM_MULL_SHIFT(vdequant, viwt);
      }
      int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
          vreinterpretq_u16_s16(vmulq_s16(vtmp2, vdequant_wt)), v_log_scale));
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
723*77c1e3ccSAndroid Build Coastguard Worker
// Quantizes and dequantizes a coefficient block using log_scale == 2, handling
// eight coefficients per step.
//
// coeff_ptr        input coefficients (n_coeffs entries).
// n_coeffs         number of coefficients. The first eight are always
//                  processed and the remainder in steps of eight, so this
//                  presumably must be a multiple of 8 and at least 8 (TODO
//                  confirm against callers).
// zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr, dequant_ptr
//                  two-entry tables: entry [0] is applied to coefficient 0 and
//                  entry [1] to every other coefficient. zbin and round are
//                  scaled down by log_scale before use.
// qcoeff_ptr       output quantized coefficients; zeroed first, then written
//                  only for groups that contain a value passing the zbin test.
// dqcoeff_ptr      output dequantized coefficients; same policy as qcoeff_ptr.
// eob_ptr          receives get_max_eob() of the tracked iscan values, plus 1.
// scan             unused.
// iscan            per-coefficient scan position, used to track the eob.
// qm_ptr, iqm_ptr  optional per-coefficient weights for the quantize and
//                  dequantize steps respectively; either may be NULL.
static void aom_quantize_b_helper_64x64_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr) {
  (void)scan;

  // Only assigned and read when qm_ptr != NULL.
  uint16x8_t vwt;
  const int log_scale = 2;
  // Every lane holds -log_scale; vshlq_u16 with a negative count performs a
  // logical right shift by that amount.
  const int16x8_t v_log_scale = vdupq_n_s16(-log_scale);

  const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
                         ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };

  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));

  const int16x8_t zero = vdupq_n_s16(0);
  // All lanes start at -1 (all bits set); the "+ 1" applied at the end is
  // presumably what turns "nothing found" into an eob of 0.
  int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
  int16x8_t v_ones = vnegq_s16(v_eobmax_76543210);

  // Start with entry [1] of every table in all lanes; lane 0 is patched to
  // entry [0] for the first group only.
  int16x8_t vzbins = vdupq_n_s16(zbins[1]);
  int16x8_t vround = vdupq_n_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale));
  int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
  int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
  int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);

  int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
  int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
  int16x8_t v_abs = vabsq_s16(v_coeff);
  vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
  uint16x8_t vcond;
  if (qm_ptr == NULL) {
    vcond = vcgeq_s16(v_abs, vzbins);
  } else {
    vwt = vmovl_u8(vld1_u8(&qm_ptr[0]));
    vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
  }
  // Narrow the eight lane masks into one 64-bit value so that a single scalar
  // test tells whether any lane passed the zbin comparison.
  uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
  if (nz_check) {
    vround =
        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[0], log_scale), vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
    int16x8_t vtmp = vqaddq_s16(v_abs, vround);

    int16x8_t vtmp2;
    if (qm_ptr == NULL) {
      // vtmp + (doubling-high-half(vtmp * quant) >> 1).
      vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
    } else {
      vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
      vtmp2 = vaddq_s16(vtmp2, vtmp);
    }

    // Build (vtmp2 * quant_shift) >> 14 from two 16-bit pieces: the doubling
    // high half shifted left by one, plus bit 14 of the low half.
    int16x8_t ones =
        vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
    vtmp2 =
        vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
    // Re-apply the sign: (x ^ sign) - sign.
    int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
    int16x8_t coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
    store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);

    // Weight a copy of the dequant values. vdequant itself must keep the raw
    // table entries, otherwise the weights of this group would be compounded
    // into every later group.
    int16x8_t v_dequant_wt = vdequant;
    if (iqm_ptr != NULL) {
      const uint16x8_t viwt = vmovl_u8(vld1_u8(&iqm_ptr[0]));
      v_dequant_wt = QM_MULL_SHIFT(vdequant, viwt);
    }
    // Low 16 bits of the product shifted right by log_scale, OR-ed with the
    // upper product bits taken from the doubling high half.
    int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
        vreinterpretq_u16_s16(vmulq_s16(vtmp2, v_dequant_wt)), v_log_scale));
    v_deq_abs = vorrq_s16(vshlq_n_s16(vqdmulhq_s16(vtmp2, v_dequant_wt), 13),
                          v_deq_abs);
    vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
    coeff_nz_mask =
        vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
    store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);

    // Restore entry [1] in lane 0 for all remaining groups.
    vround =
        vsetq_lane_s16(ROUND_POWER_OF_TWO(round_ptr[1], log_scale), vround, 0);
    vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
    vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
    vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);

    // Keep, per lane, the largest iscan value among lanes that passed the
    // zbin test and quantized to a positive magnitude.
    uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
    const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
    int16x8_t v_iscan = vld1q_s16(&iscan[0]);
    vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
    v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
  }
  vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);

  for (int i = 8; i < n_coeffs; i += 8) {
    v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
    v_coeff_sign = vshrq_n_s16(v_coeff, 15);
    v_abs = vabsq_s16(v_coeff);

    if (qm_ptr == NULL) {
      vcond = vcgeq_s16(v_abs, vzbins);
    } else {
      vwt = vmovl_u8(vld1_u8(&qm_ptr[i]));
      vcond = vcgeq_s16(QM_MULL_SHIFT(v_abs, vwt), vzbins);
    }
    nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
    if (nz_check) {
      int16x8_t vtmp = vqaddq_s16(v_abs, vround);

      int16x8_t vtmp2;
      if (qm_ptr == NULL) {
        vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
      } else {
        vtmp2 = QM_MULL_SHIFT(vtmp, vwt);
        vtmp2 = vaddq_s16(vtmp2, vtmp);
      }

      int16x8_t ones =
          vandq_s16(vshrq_n_s16(vmulq_s16(vtmp2, vquant_shift), 14), v_ones);
      vtmp2 =
          vaddq_s16(vshlq_s16(vqdmulhq_s16(vtmp2, vquant_shift), v_ones), ones);
      int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
      int16x8_t coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
      store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);

      // Weight a per-group copy so vdequant stays unweighted for the next
      // iteration.
      int16x8_t v_dequant_wt = vdequant;
      if (iqm_ptr != NULL) {
        const uint16x8_t viwt = vmovl_u8(vld1_u8(&iqm_ptr[i]));
        v_dequant_wt = QM_MULL_SHIFT(vdequant, viwt);
      }
      int16x8_t v_deq_abs = vreinterpretq_s16_u16(vshlq_u16(
          vreinterpretq_u16_s16(vmulq_s16(vtmp2, v_dequant_wt)), v_log_scale));
      v_deq_abs = vorrq_s16(
          vshlq_n_s16(vqdmulhq_s16(vtmp2, v_dequant_wt), 13), v_deq_abs);
      vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
      coeff_nz_mask =
          vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
      store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);

      uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
      const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
      int16x8_t v_iscan = vld1q_s16(&iscan[i]);
      vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
      v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
    }
  }
  *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
}
872*77c1e3ccSAndroid Build Coastguard Worker
// Dispatches to the NEON quantizer specialised for the given log_scale,
// forwarding every other argument unchanged.
//
// log_scale selects the implementation: 0 -> 16x16 helper, 1 -> 32x32 helper,
// 2 -> 64x64 helper. Any other value is a caller error: it trips the assert in
// debug builds, and in release builds the function returns without writing
// qcoeff_ptr, dqcoeff_ptr or *eob_ptr.
void aom_quantize_b_helper_neon(
    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
    const int16_t *round_ptr, const int16_t *quant_ptr,
    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
    const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
    const qm_val_t *iqm_ptr, const int log_scale) {
  switch (log_scale) {  // log_scale for AV1 encoder can be only 0, 1, 2
    case 0:
      aom_quantize_b_helper_16x16_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
                                       iscan, qm_ptr, iqm_ptr);
      break;
    case 1:
      aom_quantize_b_helper_32x32_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
                                       iscan, qm_ptr, iqm_ptr);
      break;
    case 2:
      aom_quantize_b_helper_64x64_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
                                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
                                       iscan, qm_ptr, iqm_ptr);
      break;
    default:
      // Make an unsupported scale loud in debug builds instead of silently
      // leaving the outputs untouched.
      assert(0 && "log_scale must be 0, 1 or 2");
      break;
  }
}
901*77c1e3ccSAndroid Build Coastguard Worker
// Entry point for the log_scale == 1 quantizer: forwards to
// aom_quantize_b_helper_neon() with no quantization-matrix weights.
void aom_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                               const int16_t *zbin_ptr,
                               const int16_t *round_ptr,
                               const int16_t *quant_ptr,
                               const int16_t *quant_shift_ptr,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
                               const int16_t *scan, const int16_t *iscan) {
  // Neither the forward nor the inverse weight table is supplied.
  const qm_val_t *const no_qm = NULL;
  const qm_val_t *const no_iqm = NULL;
  const int log_scale = 1;

  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
                             no_qm, no_iqm, log_scale);
}
915*77c1e3ccSAndroid Build Coastguard Worker
// Entry point for the log_scale == 2 quantizer: forwards to
// aom_quantize_b_helper_neon() with no quantization-matrix weights.
void aom_quantize_b_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                               const int16_t *zbin_ptr,
                               const int16_t *round_ptr,
                               const int16_t *quant_ptr,
                               const int16_t *quant_shift_ptr,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
                               const int16_t *scan, const int16_t *iscan) {
  // Neither the forward nor the inverse weight table is supplied.
  const qm_val_t *const no_qm = NULL;
  const qm_val_t *const no_iqm = NULL;
  const int log_scale = 2;

  aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                             quant_ptr, quant_shift_ptr, qcoeff_ptr,
                             dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
                             no_qm, no_iqm, log_scale);
}
929