xref: /aosp_15_r20/external/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c (revision fb1b10ab9aebc7c7068eedab379b749d7e3900be)
1 /*
2  *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <emmintrin.h>
13 #include <xmmintrin.h>
14 
15 #include "./vp9_rtcd.h"
16 #include "vpx/vpx_integer.h"
17 #include "vpx_dsp/vpx_dsp_common.h"
18 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
19 #include "vpx_dsp/x86/quantize_sse2.h"
20 #include "vp9/common/vp9_scan.h"
21 #include "vp9/encoder/vp9_block.h"
22 
// SSE2 "fp" quantizer: quantizes, dequantizes and finds the end-of-block
// position for a run of transform coefficients, 16 at a time.
//
//   coeff_ptr    input coefficients; read in groups of 16.
//   n_coeffs     coefficient count. The first 16 are always processed and
//                further groups of 16 are handled while the group start is
//                below n_coeffs, so all three coefficient buffers must hold
//                that many entries.
//   mb_plane     handed to load_fp_values() to obtain the round and quant
//                vectors.
//   qcoeff_ptr   receives the signed quantized values.
//   dqcoeff_ptr  receives the quantized values multiplied by dequant (low 16
//                bits of each product).
//   dequant_ptr  handed to load_fp_values() to obtain the dequant vector.
//   eob_ptr      receives accumulate_eob() of the largest scan_for_eob()
//                result seen over all processed groups.
//   scan_order   only its iscan table is used, by scan_for_eob().
//
// NOTE(review): the lane layout produced by load_fp_values() is not visible
// here; the code below treats the upper half of round/quant/dequant as the
// AC values and presumably lane 0 as the DC value - confirm against the
// helper.
void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          const struct macroblock_plane *const mb_plane,
                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
                          const struct ScanOrder *const scan_order) {
  const __m128i zero = _mm_setzero_si128();
  const int16_t *const iscan = scan_order->iscan;
  __m128i round, quant, dequant;
  __m128i half_dequant;
  __m128i in_lo, in_hi, sign_lo, sign_hi;
  __m128i q_lo, q_hi;
  __m128i eob_max;
  int pos;

  // Fetch the per-plane rounding, quantizer and dequantizer vectors.
  load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant);

  // ---- First group: DC plus the first 15 AC coefficients. ----
  in_lo = load_tran_low(coeff_ptr);
  in_hi = load_tran_low(coeff_ptr + 8);

  // An arithmetic shift by 15 yields 0 or -1 per lane; invert_sign_sse2()
  // uses that mask to take the absolute value and, later, to restore the
  // sign.
  sign_lo = _mm_srai_epi16(in_lo, 15);
  sign_hi = _mm_srai_epi16(in_hi, 15);

  // Lanes 0..7 use round/quant exactly as loaded.
  q_lo = invert_sign_sse2(in_lo, sign_lo);
  q_lo = _mm_mulhi_epi16(_mm_adds_epi16(q_lo, round), quant);

  // From here on only the upper-half values are needed: copy them into both
  // halves of the register.
  round = _mm_unpackhi_epi64(round, round);
  quant = _mm_unpackhi_epi64(quant, quant);

  // Lanes 8..15 use the broadcast values.
  q_hi = invert_sign_sse2(in_hi, sign_hi);
  q_hi = _mm_mulhi_epi16(_mm_adds_epi16(q_hi, round), quant);

  // Put the original signs back and emit the quantized coefficients.
  q_lo = invert_sign_sse2(q_lo, sign_lo);
  q_hi = invert_sign_sse2(q_hi, sign_hi);
  store_tran_low(q_lo, qcoeff_ptr);
  store_tran_low(q_hi, qcoeff_ptr + 8);

  // Dequantize: the low half still uses dequant as loaded, the high half
  // uses the broadcast upper-half value.
  q_lo = _mm_mullo_epi16(q_lo, dequant);
  dequant = _mm_unpackhi_epi64(dequant, dequant);
  q_hi = _mm_mullo_epi16(q_hi, dequant);
  store_tran_low(q_lo, dqcoeff_ptr);
  store_tran_low(q_hi, dqcoeff_ptr + 8);

  // The end-of-block scan is fed the dequantized vectors.
  eob_max = scan_for_eob(&q_lo, &q_hi, iscan, 0, zero);

  // Groups in which no magnitude exceeds dequant / 2 are written out as
  // zeros without being quantized.
  half_dequant = _mm_srai_epi16(dequant, 1);

  // ---- Remaining groups: AC coefficients only. ----
  for (pos = 16; pos < n_coeffs; pos += 16) {
    int any_above;
    __m128i group_eob;

    in_lo = load_tran_low(coeff_ptr + pos);
    in_hi = load_tran_low(coeff_ptr + pos + 8);

    // Magnitudes via the sign-mask trick.
    sign_lo = _mm_srai_epi16(in_lo, 15);
    sign_hi = _mm_srai_epi16(in_hi, 15);
    q_lo = invert_sign_sse2(in_lo, sign_lo);
    q_hi = invert_sign_sse2(in_hi, sign_hi);

    any_above = _mm_movemask_epi8(_mm_cmpgt_epi16(q_lo, half_dequant)) |
                _mm_movemask_epi8(_mm_cmpgt_epi16(q_hi, half_dequant));

    if (!any_above) {
      // Nothing in this group exceeds the threshold: both outputs are zero
      // and the end-of-block position is unaffected.
      store_zero_tran_low(qcoeff_ptr + pos);
      store_zero_tran_low(qcoeff_ptr + pos + 8);

      store_zero_tran_low(dqcoeff_ptr + pos);
      store_zero_tran_low(dqcoeff_ptr + pos + 8);
      continue;
    }

    // Saturating add of the rounding term, then keep the high 16 bits of
    // the product with the quantizer.
    q_lo = _mm_mulhi_epi16(_mm_adds_epi16(q_lo, round), quant);
    q_hi = _mm_mulhi_epi16(_mm_adds_epi16(q_hi, round), quant);

    // Restore signs and emit the quantized coefficients.
    q_lo = invert_sign_sse2(q_lo, sign_lo);
    q_hi = invert_sign_sse2(q_hi, sign_hi);
    store_tran_low(q_lo, qcoeff_ptr + pos);
    store_tran_low(q_hi, qcoeff_ptr + pos + 8);

    // Dequantize and emit.
    q_lo = _mm_mullo_epi16(q_lo, dequant);
    q_hi = _mm_mullo_epi16(q_hi, dequant);
    store_tran_low(q_lo, dqcoeff_ptr + pos);
    store_tran_low(q_hi, dqcoeff_ptr + pos + 8);

    // Fold this group's end-of-block candidates into the running maximum.
    group_eob = scan_for_eob(&q_lo, &q_hi, iscan, pos, zero);
    eob_max = _mm_max_epi16(eob_max, group_eob);
  }

  *eob_ptr = accumulate_eob(eob_max);
}
127