xref: /aosp_15_r20/external/libgav1/src/utils/entropy_decoder.cc (revision 095378508e87ed692bf8dfeb34008b65b3735891)
1*09537850SAkhilesh Sanikop // Copyright 2019 The libgav1 Authors
2*09537850SAkhilesh Sanikop //
3*09537850SAkhilesh Sanikop // Licensed under the Apache License, Version 2.0 (the "License");
4*09537850SAkhilesh Sanikop // you may not use this file except in compliance with the License.
5*09537850SAkhilesh Sanikop // You may obtain a copy of the License at
6*09537850SAkhilesh Sanikop //
7*09537850SAkhilesh Sanikop //      http://www.apache.org/licenses/LICENSE-2.0
8*09537850SAkhilesh Sanikop //
9*09537850SAkhilesh Sanikop // Unless required by applicable law or agreed to in writing, software
10*09537850SAkhilesh Sanikop // distributed under the License is distributed on an "AS IS" BASIS,
11*09537850SAkhilesh Sanikop // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*09537850SAkhilesh Sanikop // See the License for the specific language governing permissions and
13*09537850SAkhilesh Sanikop // limitations under the License.
14*09537850SAkhilesh Sanikop 
15*09537850SAkhilesh Sanikop #include "src/utils/entropy_decoder.h"
16*09537850SAkhilesh Sanikop 
17*09537850SAkhilesh Sanikop #include <cassert>
18*09537850SAkhilesh Sanikop #include <cstring>
19*09537850SAkhilesh Sanikop 
20*09537850SAkhilesh Sanikop #include "src/utils/common.h"
21*09537850SAkhilesh Sanikop #include "src/utils/compiler_attributes.h"
22*09537850SAkhilesh Sanikop #include "src/utils/constants.h"
23*09537850SAkhilesh Sanikop #include "src/utils/cpu.h"
24*09537850SAkhilesh Sanikop 
25*09537850SAkhilesh Sanikop #if defined(__ARM_NEON__) || defined(__aarch64__) || \
26*09537850SAkhilesh Sanikop     (defined(_MSC_VER) && defined(_M_ARM))
27*09537850SAkhilesh Sanikop #define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 1
28*09537850SAkhilesh Sanikop #else
29*09537850SAkhilesh Sanikop #define LIBGAV1_ENTROPY_DECODER_ENABLE_NEON 0
30*09537850SAkhilesh Sanikop #endif
31*09537850SAkhilesh Sanikop 
32*09537850SAkhilesh Sanikop #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
33*09537850SAkhilesh Sanikop #include <arm_neon.h>
34*09537850SAkhilesh Sanikop #endif
35*09537850SAkhilesh Sanikop 
36*09537850SAkhilesh Sanikop #if defined(__SSE2__) || defined(LIBGAV1_X86_MSVC)
37*09537850SAkhilesh Sanikop #define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 1
38*09537850SAkhilesh Sanikop #else
39*09537850SAkhilesh Sanikop #define LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2 0
40*09537850SAkhilesh Sanikop #endif
41*09537850SAkhilesh Sanikop 
42*09537850SAkhilesh Sanikop #if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
43*09537850SAkhilesh Sanikop #include <emmintrin.h>
44*09537850SAkhilesh Sanikop #endif
45*09537850SAkhilesh Sanikop 
46*09537850SAkhilesh Sanikop namespace libgav1 {
47*09537850SAkhilesh Sanikop namespace {
48*09537850SAkhilesh Sanikop 
// All bits set except the low 8 (~255 == 0xFFFFFF00 as a uint32_t).
constexpr uint32_t kReadBitMask = ~255;
// Number of low-order bits dropped from each CDF entry before it is scaled in
// ScaleCdf().
constexpr int kCdfPrecision = 6;
// Amount added in ScaleCdf() for each symbol from |index| up to
// |symbol_count|.
constexpr int kMinimumProbabilityPerSymbol = 4;

// This function computes the "cur" variable as specified inside the do-while
// loop in Section 8.2.6 of the spec. This function is monotonically
// decreasing as the values of index increases (note that the |cdf| array is
// sorted in decreasing order).
//
// |values_in_range_shifted| is multiplied by the top bits of cdf[index]
// (cdf[index] >> kCdfPrecision), the product is halved, and
// kMinimumProbabilityPerSymbol * (symbol_count - index) is added.
// |cdf| must be readable at |index|.
uint32_t ScaleCdf(uint32_t values_in_range_shifted, const uint16_t* const cdf,
                  int index, int symbol_count) {
  return ((values_in_range_shifted * (cdf[index] >> kCdfPrecision)) >> 1) +
         (kMinimumProbabilityPerSymbol * (symbol_count - index));
}
62*09537850SAkhilesh Sanikop 
UpdateCdf(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol_count,const int symbol)63*09537850SAkhilesh Sanikop void UpdateCdf(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol_count,
64*09537850SAkhilesh Sanikop                const int symbol) {
65*09537850SAkhilesh Sanikop   const uint16_t count = cdf[symbol_count];
66*09537850SAkhilesh Sanikop   // rate is computed in the spec as:
67*09537850SAkhilesh Sanikop   //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
68*09537850SAkhilesh Sanikop   // In this case cdf[N] is |count|.
69*09537850SAkhilesh Sanikop   // Min(FloorLog2(N), 2) is 1 for symbol_count == {2, 3} and 2 for all
70*09537850SAkhilesh Sanikop   // symbol_count > 3. So the equation becomes:
71*09537850SAkhilesh Sanikop   //  4 + (count > 15) + (count > 31) + (symbol_count > 3).
72*09537850SAkhilesh Sanikop   // Note that the largest value for count is 32 (it is not incremented beyond
73*09537850SAkhilesh Sanikop   // 32). So using that information:
74*09537850SAkhilesh Sanikop   //  count >> 4 is 0 for count from 0 to 15.
75*09537850SAkhilesh Sanikop   //  count >> 4 is 1 for count from 16 to 31.
76*09537850SAkhilesh Sanikop   //  count >> 4 is 2 for count == 31.
77*09537850SAkhilesh Sanikop   // Now, the equation becomes:
78*09537850SAkhilesh Sanikop   //  4 + (count >> 4) + (symbol_count > 3).
79*09537850SAkhilesh Sanikop   // Since (count >> 4) can only be 0 or 1 or 2, the addition could be replaced
80*09537850SAkhilesh Sanikop   // with bitwise or:
81*09537850SAkhilesh Sanikop   //  (4 | (count >> 4)) + (symbol_count > 3).
82*09537850SAkhilesh Sanikop   // but using addition will allow the compiler to eliminate an operation when
83*09537850SAkhilesh Sanikop   // symbol_count is known and this function is inlined.
84*09537850SAkhilesh Sanikop   const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count > 3);
85*09537850SAkhilesh Sanikop   // Hints for further optimizations:
86*09537850SAkhilesh Sanikop   //
87*09537850SAkhilesh Sanikop   // 1. clang can vectorize this for loop with width 4, even though the loop
88*09537850SAkhilesh Sanikop   // contains an if-else statement. Therefore, it may be advantageous to use
89*09537850SAkhilesh Sanikop   // "i < symbol_count" as the loop condition when symbol_count is 8, 12, or 16
90*09537850SAkhilesh Sanikop   // (a multiple of 4 that's not too small).
91*09537850SAkhilesh Sanikop   //
92*09537850SAkhilesh Sanikop   // 2. The for loop can be rewritten in the following form, which would enable
93*09537850SAkhilesh Sanikop   // clang to vectorize the loop with width 8:
94*09537850SAkhilesh Sanikop   //
95*09537850SAkhilesh Sanikop   //   const int rounding = (1 << rate) - 1;
96*09537850SAkhilesh Sanikop   //   for (int i = 0; i < symbol_count - 1; ++i) {
97*09537850SAkhilesh Sanikop   //     const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
98*09537850SAkhilesh Sanikop   //     cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
99*09537850SAkhilesh Sanikop   //   }
100*09537850SAkhilesh Sanikop   //
101*09537850SAkhilesh Sanikop   // The subtraction (a - cdf[i]) relies on the overflow semantics of unsigned
102*09537850SAkhilesh Sanikop   // integer arithmetic. The result of the unsigned subtraction is cast to a
103*09537850SAkhilesh Sanikop   // signed integer and right-shifted. This requires the right shift of a
104*09537850SAkhilesh Sanikop   // signed integer be an arithmetic shift, which is true for clang, gcc, and
105*09537850SAkhilesh Sanikop   // Visual C++.
106*09537850SAkhilesh Sanikop   assert(symbol_count - 1 > 0);
107*09537850SAkhilesh Sanikop   int i = 0;
108*09537850SAkhilesh Sanikop   do {
109*09537850SAkhilesh Sanikop     if (i < symbol) {
110*09537850SAkhilesh Sanikop       cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
111*09537850SAkhilesh Sanikop     } else {
112*09537850SAkhilesh Sanikop       cdf[i] -= cdf[i] >> rate;
113*09537850SAkhilesh Sanikop     }
114*09537850SAkhilesh Sanikop   } while (++i < symbol_count - 1);
115*09537850SAkhilesh Sanikop   cdf[symbol_count] += static_cast<uint16_t>(count < 32);
116*09537850SAkhilesh Sanikop }
117*09537850SAkhilesh Sanikop 
// Define the UpdateCdfN functions. UpdateCdfN is a specialized implementation
// of UpdateCdf based on the fact that symbol_count == N. UpdateCdfN uses the
// SIMD instruction sets if available.
121*09537850SAkhilesh Sanikop 
122*09537850SAkhilesh Sanikop #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
123*09537850SAkhilesh Sanikop 
// The UpdateCdf() method contains the following for loop:
//
//   for (int i = 0; i < symbol_count - 1; ++i) {
//     if (i < symbol) {
//       cdf[i] += (kCdfMaxProbability - cdf[i]) >> rate;
//     } else {
//       cdf[i] -= cdf[i] >> rate;
//     }
//   }
//
// It can be rewritten in the following two forms, which are amenable to SIMD
// implementations:
//
//   const int rounding = (1 << rate) - 1;
//   for (int i = 0; i < symbol_count - 1; ++i) {
//     const uint16_t a = (i < symbol) ? kCdfMaxProbability : rounding;
//     cdf[i] += static_cast<int16_t>(a - cdf[i]) >> rate;
//   }
//
// or:
//
//   const int rounding = (1 << rate) - 1;
//   for (int i = 0; i < symbol_count - 1; ++i) {
//     const uint16_t a = (i < symbol) ? (kCdfMaxProbability - rounding) : 0;
//     cdf[i] -= static_cast<int16_t>(cdf[i] - a) >> rate;
//   }
//
// The following ARM NEON implementations use a modified version of the first
// form, using the comparison mask and unsigned rollover to avoid the need to
// calculate rounding.
//
// The cdf array has symbol_count + 1 elements. The first symbol_count elements
// are the CDF. The last element is a count that is initialized to 0 and may
// grow up to 32. The for loop in UpdateCdf updates the CDF in the array. Since
// cdf[symbol_count - 1] is always 0, the for loop does not update
// cdf[symbol_count - 1]. However, it would be correct to have the for loop
// update cdf[symbol_count - 1] anyway: since symbol_count - 1 >= symbol, the
// for loop would take the else branch when i is symbol_count - 1:
//      cdf[i] -= cdf[i] >> rate;
// Since cdf[symbol_count - 1] is 0, cdf[symbol_count - 1] would still be 0
// after the update. The ARM NEON implementations take advantage of this in the
// following two cases:
// 1. When symbol_count is 8 or 16, the vectorized code updates the first
//    symbol_count elements in the array.
// 2. When symbol_count is 7, the vectorized code updates all the 8 elements in
//    the cdf array. Since an invalid CDF value is written into cdf[7], the
//    count in cdf[7] needs to be fixed up after the vectorized code.
171*09537850SAkhilesh Sanikop 
// NEON version of UpdateCdf() for symbol_count == 5.
//
// A single 4-lane vector updates cdf[0..3]; cdf[4] is not touched and cdf[5]
// is the adaptation counter. The rate is (count >> 4) + 5 because
// symbol_count > 3. Lanes with index >= |symbol| get an all-ones mask, which
// is used both as the "65535" operand of the subtraction and as the "-1"
// offset, so no separate rounding constant is needed.
void UpdateCdf5(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
  uint16x4_t cdf_vec = vld1_u16(cdf);
  const uint16_t count = cdf[5];
  const int rate = (count >> 4) + 5;
  const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
  // Lane indices 0, 1, 2, 3.
  const uint16x4_t index = vcreate_u16(0x0003000200010000);
  const uint16x4_t symbol_vec = vdup_n_u16(symbol);
  // All ones in lanes where i >= symbol, zero elsewhere.
  const uint16x4_t mask = vcge_u16(index, symbol_vec);
  // i < symbol: 32768, i >= symbol: 65535.
  const uint16x4_t a = vorr_u16(mask, cdf_max_probability);
  // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
  const int16x4_t diff = vreinterpret_s16_u16(vsub_u16(a, cdf_vec));
  // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
  const uint16x4_t cdf_offset = vsub_u16(cdf_vec, mask);
  // A negative shift count makes vshl_s16 shift right (arithmetically).
  const int16x4_t negative_rate = vdup_n_s16(-rate);
  // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
  const uint16x4_t delta = vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
  // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
  // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
  cdf_vec = vadd_u16(cdf_offset, delta);
  vst1_u16(cdf, cdf_vec);
  // Saturating increment of the adaptation counter (stops at 32).
  cdf[5] = count + static_cast<uint16_t>(count < 32);
}
195*09537850SAkhilesh Sanikop 
// This version works for |symbol_count| = 7, 8, or 9.
// See UpdateCdf5 for implementation details.
//
// One 8-lane vector updates cdf[0..7]. The counter is read into |count| before
// the vector store and written back (incremented while below 32) afterwards,
// so for symbol_count == 7, where the store also overwrites the counter slot
// cdf[7], the final counter value is still correct. For symbol_count == 9,
// cdf[8] is not touched by the vector code.
template <int symbol_count>
void UpdateCdf7To9(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
  static_assert(symbol_count >= 7 && symbol_count <= 9, "");
  uint16x8_t cdf_vec = vld1q_u16(cdf);
  const uint16_t count = cdf[symbol_count];
  const int rate = (count >> 4) + 5;
  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
  // Lane indices 0..7.
  const uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
                                        vcreate_u16(0x0007000600050004));
  const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
  // All ones in lanes where i >= symbol.
  const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
  const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
  const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
  const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
  // A negative shift count makes vshlq_s16 shift right (arithmetically).
  const int16x8_t negative_rate = vdupq_n_s16(-rate);
  const uint16x8_t delta =
      vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
  cdf_vec = vaddq_u16(cdf_offset, delta);
  vst1q_u16(cdf, cdf_vec);
  // Written after the vector store so that it wins when symbol_count == 7.
  cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
}
219*09537850SAkhilesh Sanikop 
UpdateCdf7(uint16_t * const cdf,const int symbol)220*09537850SAkhilesh Sanikop void UpdateCdf7(uint16_t* const cdf, const int symbol) {
221*09537850SAkhilesh Sanikop   UpdateCdf7To9<7>(cdf, symbol);
222*09537850SAkhilesh Sanikop }
223*09537850SAkhilesh Sanikop 
UpdateCdf8(uint16_t * const cdf,const int symbol)224*09537850SAkhilesh Sanikop void UpdateCdf8(uint16_t* const cdf, const int symbol) {
225*09537850SAkhilesh Sanikop   UpdateCdf7To9<8>(cdf, symbol);
226*09537850SAkhilesh Sanikop }
227*09537850SAkhilesh Sanikop 
UpdateCdf9(uint16_t * const cdf,const int symbol)228*09537850SAkhilesh Sanikop void UpdateCdf9(uint16_t* const cdf, const int symbol) {
229*09537850SAkhilesh Sanikop   UpdateCdf7To9<9>(cdf, symbol);
230*09537850SAkhilesh Sanikop }
231*09537850SAkhilesh Sanikop 
// See UpdateCdf5 for implementation details.
//
// NEON version of UpdateCdf() for symbol_count == 11. cdf[0] and cdf[1] are
// updated with scalar code and cdf[2..9] with one 8-lane vector; cdf[10] is
// not touched and cdf[11] is the adaptation counter. The vector is loaded
// before any element is written.
void UpdateCdf11(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
  uint16x8_t cdf_vec = vld1q_u16(cdf + 2);
  const uint16_t count = cdf[11];
  // Saturating increment of the adaptation counter (stops at 32).
  cdf[11] = count + static_cast<uint16_t>(count < 32);
  const int rate = (count >> 4) + 5;
  if (symbol > 1) {
    // Both scalar elements are below |symbol|.
    cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
    cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
    const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
    const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
    const int16x8_t negative_rate = vdupq_n_s16(-rate);
    // Lane indices 2..9, matching the elements loaded from cdf + 2.
    const uint16x8_t index = vcombine_u16(vcreate_u16(0x0005000400030002),
                                          vcreate_u16(0x0009000800070006));
    const uint16x8_t mask = vcgeq_u16(index, symbol_vec);
    const uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
    const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
    const uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
    const uint16x8_t delta =
        vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
    cdf_vec = vaddq_u16(cdf_offset, delta);
    vst1q_u16(cdf + 2, cdf_vec);
  } else {
    if (symbol != 0) {
      cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
      cdf[1] -= cdf[1] >> rate;
    } else {
      cdf[0] -= cdf[0] >> rate;
      cdf[1] -= cdf[1] >> rate;
    }
    // symbol <= 1, so every vector lane (indices 2..9) is >= symbol and takes
    // the "cdf -= cdf >> rate" form; no mask is needed.
    const int16x8_t negative_rate = vdupq_n_s16(-rate);
    const uint16x8_t delta = vshlq_u16(cdf_vec, negative_rate);
    cdf_vec = vsubq_u16(cdf_vec, delta);
    vst1q_u16(cdf + 2, cdf_vec);
  }
}
268*09537850SAkhilesh Sanikop 
// See UpdateCdf5 for implementation details.
//
// NEON version of UpdateCdf() for symbol_count == 13. Two overlapping 8-lane
// vectors cover cdf[0..7] and cdf[4..11]. Both are loaded before either is
// stored, so the overlapping elements cdf[4..7] are computed twice from the
// same input and receive the same value from both stores. cdf[12] is not
// touched and cdf[13] is the adaptation counter.
void UpdateCdf13(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
  uint16x8_t cdf_vec0 = vld1q_u16(cdf);
  uint16x8_t cdf_vec1 = vld1q_u16(cdf + 4);
  const uint16_t count = cdf[13];
  const int rate = (count >> 4) + 5;
  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
  const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
  const int16x8_t negative_rate = vdupq_n_s16(-rate);

  // First vector: lane indices 0..7.
  uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
                                  vcreate_u16(0x0007000600050004));
  uint16x8_t mask = vcgeq_u16(index, symbol_vec);
  uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
  int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec0));
  uint16x8_t cdf_offset = vsubq_u16(cdf_vec0, mask);
  uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
  cdf_vec0 = vaddq_u16(cdf_offset, delta);
  vst1q_u16(cdf, cdf_vec0);

  // Second vector: lane indices 4..11.
  index = vcombine_u16(vcreate_u16(0x0007000600050004),
                       vcreate_u16(0x000b000a00090008));
  mask = vcgeq_u16(index, symbol_vec);
  a = vorrq_u16(mask, cdf_max_probability);
  diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec1));
  cdf_offset = vsubq_u16(cdf_vec1, mask);
  delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
  cdf_vec1 = vaddq_u16(cdf_offset, delta);
  vst1q_u16(cdf + 4, cdf_vec1);

  // Saturating increment of the adaptation counter (stops at 32).
  cdf[13] = count + static_cast<uint16_t>(count < 32);
}
301*09537850SAkhilesh Sanikop 
// See UpdateCdf5 for implementation details.
//
// NEON version of UpdateCdf() for symbol_count == 16. Two non-overlapping
// 8-lane vectors update cdf[0..7] and cdf[8..15]; cdf[16] is the adaptation
// counter.
void UpdateCdf16(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
  uint16x8_t cdf_vec = vld1q_u16(cdf);
  const uint16_t count = cdf[16];
  const int rate = (count >> 4) + 5;
  const uint16x8_t cdf_max_probability = vdupq_n_u16(kCdfMaxProbability);
  const uint16x8_t symbol_vec = vdupq_n_u16(symbol);
  const int16x8_t negative_rate = vdupq_n_s16(-rate);

  // First half: lane indices 0..7.
  uint16x8_t index = vcombine_u16(vcreate_u16(0x0003000200010000),
                                  vcreate_u16(0x0007000600050004));
  uint16x8_t mask = vcgeq_u16(index, symbol_vec);
  uint16x8_t a = vorrq_u16(mask, cdf_max_probability);
  int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
  uint16x8_t cdf_offset = vsubq_u16(cdf_vec, mask);
  uint16x8_t delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
  cdf_vec = vaddq_u16(cdf_offset, delta);
  vst1q_u16(cdf, cdf_vec);

  // Second half: lane indices 8..15.
  cdf_vec = vld1q_u16(cdf + 8);
  index = vcombine_u16(vcreate_u16(0x000b000a00090008),
                       vcreate_u16(0x000f000e000d000c));
  mask = vcgeq_u16(index, symbol_vec);
  a = vorrq_u16(mask, cdf_max_probability);
  diff = vreinterpretq_s16_u16(vsubq_u16(a, cdf_vec));
  cdf_offset = vsubq_u16(cdf_vec, mask);
  delta = vreinterpretq_u16_s16(vshlq_s16(diff, negative_rate));
  cdf_vec = vaddq_u16(cdf_offset, delta);
  vst1q_u16(cdf + 8, cdf_vec);

  // Saturating increment of the adaptation counter (stops at 32).
  cdf[16] = count + static_cast<uint16_t>(count < 32);
}
334*09537850SAkhilesh Sanikop 
335*09537850SAkhilesh Sanikop #else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
336*09537850SAkhilesh Sanikop 
337*09537850SAkhilesh Sanikop #if LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
338*09537850SAkhilesh Sanikop 
// Loads 8 bytes from |a| into the low half of an SSE register; the upper
// 8 bytes of the result are zero. |a| does not need to be aligned.
inline __m128i LoadLo8(const void* a) {
  return _mm_loadl_epi64(static_cast<const __m128i*>(a));
}
342*09537850SAkhilesh Sanikop 
// Loads 16 bytes from |a| into an SSE register. |a| does not need to be
// aligned.
inline __m128i LoadUnaligned16(const void* a) {
  return _mm_loadu_si128(static_cast<const __m128i*>(a));
}
346*09537850SAkhilesh Sanikop 
// Stores the low 8 bytes of |v| to |a|; bytes past a + 8 are not written.
// |a| does not need to be aligned.
inline void StoreLo8(void* a, const __m128i v) {
  _mm_storel_epi64(static_cast<__m128i*>(a), v);
}
350*09537850SAkhilesh Sanikop 
// Stores all 16 bytes of |v| to |a|. |a| does not need to be aligned.
inline void StoreUnaligned16(void* a, const __m128i v) {
  _mm_storeu_si128(static_cast<__m128i*>(a), v);
}
354*09537850SAkhilesh Sanikop 
UpdateCdf5(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)355*09537850SAkhilesh Sanikop void UpdateCdf5(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
356*09537850SAkhilesh Sanikop   __m128i cdf_vec = LoadLo8(cdf);
357*09537850SAkhilesh Sanikop   const uint16_t count = cdf[5];
358*09537850SAkhilesh Sanikop   const int rate = (count >> 4) + 5;
359*09537850SAkhilesh Sanikop   const __m128i cdf_max_probability =
360*09537850SAkhilesh Sanikop       _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
361*09537850SAkhilesh Sanikop   const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
362*09537850SAkhilesh Sanikop   const __m128i symbol_vec = _mm_shufflelo_epi16(_mm_cvtsi32_si128(symbol), 0);
363*09537850SAkhilesh Sanikop   // i >= symbol.
364*09537850SAkhilesh Sanikop   const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
365*09537850SAkhilesh Sanikop   // i < symbol: 32768, i >= symbol: 65535.
366*09537850SAkhilesh Sanikop   const __m128i a = _mm_or_si128(mask, cdf_max_probability);
367*09537850SAkhilesh Sanikop   // i < symbol: 32768 - cdf, i >= symbol: 65535 - cdf.
368*09537850SAkhilesh Sanikop   const __m128i diff = _mm_sub_epi16(a, cdf_vec);
369*09537850SAkhilesh Sanikop   // i < symbol: cdf - 0, i >= symbol: cdf - 65535.
370*09537850SAkhilesh Sanikop   const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
371*09537850SAkhilesh Sanikop   // i < symbol: (32768 - cdf) >> rate, i >= symbol: (65535 (-1) - cdf) >> rate.
372*09537850SAkhilesh Sanikop   const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
373*09537850SAkhilesh Sanikop   // i < symbol: (cdf - 0) + ((32768 - cdf) >> rate).
374*09537850SAkhilesh Sanikop   // i >= symbol: (cdf - 65535) + ((65535 - cdf) >> rate).
375*09537850SAkhilesh Sanikop   cdf_vec = _mm_add_epi16(cdf_offset, delta);
376*09537850SAkhilesh Sanikop   StoreLo8(cdf, cdf_vec);
377*09537850SAkhilesh Sanikop   cdf[5] = count + static_cast<uint16_t>(count < 32);
378*09537850SAkhilesh Sanikop }
379*09537850SAkhilesh Sanikop 
380*09537850SAkhilesh Sanikop // This version works for |symbol_count| = 7, 8, or 9.
381*09537850SAkhilesh Sanikop // See UpdateCdf5 for implementation details.
382*09537850SAkhilesh Sanikop template <int symbol_count>
UpdateCdf7To9(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)383*09537850SAkhilesh Sanikop void UpdateCdf7To9(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
384*09537850SAkhilesh Sanikop   static_assert(symbol_count >= 7 && symbol_count <= 9, "");
385*09537850SAkhilesh Sanikop   __m128i cdf_vec = LoadUnaligned16(cdf);
386*09537850SAkhilesh Sanikop   const uint16_t count = cdf[symbol_count];
387*09537850SAkhilesh Sanikop   const int rate = (count >> 4) + 5;
388*09537850SAkhilesh Sanikop   const __m128i cdf_max_probability =
389*09537850SAkhilesh Sanikop       _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
390*09537850SAkhilesh Sanikop   const __m128i index =
391*09537850SAkhilesh Sanikop       _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
392*09537850SAkhilesh Sanikop   const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
393*09537850SAkhilesh Sanikop   const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
394*09537850SAkhilesh Sanikop   const __m128i a = _mm_or_si128(mask, cdf_max_probability);
395*09537850SAkhilesh Sanikop   const __m128i diff = _mm_sub_epi16(a, cdf_vec);
396*09537850SAkhilesh Sanikop   const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
397*09537850SAkhilesh Sanikop   const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
398*09537850SAkhilesh Sanikop   cdf_vec = _mm_add_epi16(cdf_offset, delta);
399*09537850SAkhilesh Sanikop   StoreUnaligned16(cdf, cdf_vec);
400*09537850SAkhilesh Sanikop   cdf[symbol_count] = count + static_cast<uint16_t>(count < 32);
401*09537850SAkhilesh Sanikop }
402*09537850SAkhilesh Sanikop 
UpdateCdf7(uint16_t * const cdf,const int symbol)403*09537850SAkhilesh Sanikop void UpdateCdf7(uint16_t* const cdf, const int symbol) {
404*09537850SAkhilesh Sanikop   UpdateCdf7To9<7>(cdf, symbol);
405*09537850SAkhilesh Sanikop }
406*09537850SAkhilesh Sanikop 
UpdateCdf8(uint16_t * const cdf,const int symbol)407*09537850SAkhilesh Sanikop void UpdateCdf8(uint16_t* const cdf, const int symbol) {
408*09537850SAkhilesh Sanikop   UpdateCdf7To9<8>(cdf, symbol);
409*09537850SAkhilesh Sanikop }
410*09537850SAkhilesh Sanikop 
UpdateCdf9(uint16_t * const cdf,const int symbol)411*09537850SAkhilesh Sanikop void UpdateCdf9(uint16_t* const cdf, const int symbol) {
412*09537850SAkhilesh Sanikop   UpdateCdf7To9<9>(cdf, symbol);
413*09537850SAkhilesh Sanikop }
414*09537850SAkhilesh Sanikop 
415*09537850SAkhilesh Sanikop // See UpdateCdf5 for implementation details.
UpdateCdf11(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)416*09537850SAkhilesh Sanikop void UpdateCdf11(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
417*09537850SAkhilesh Sanikop   __m128i cdf_vec = LoadUnaligned16(cdf + 2);
418*09537850SAkhilesh Sanikop   const uint16_t count = cdf[11];
419*09537850SAkhilesh Sanikop   cdf[11] = count + static_cast<uint16_t>(count < 32);
420*09537850SAkhilesh Sanikop   const int rate = (count >> 4) + 5;
421*09537850SAkhilesh Sanikop   if (symbol > 1) {
422*09537850SAkhilesh Sanikop     cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
423*09537850SAkhilesh Sanikop     cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
424*09537850SAkhilesh Sanikop     const __m128i cdf_max_probability =
425*09537850SAkhilesh Sanikop         _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
426*09537850SAkhilesh Sanikop     const __m128i index =
427*09537850SAkhilesh Sanikop         _mm_set_epi32(0x000a0009, 0x00080007, 0x00060005, 0x00040003);
428*09537850SAkhilesh Sanikop     const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
429*09537850SAkhilesh Sanikop     const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
430*09537850SAkhilesh Sanikop     const __m128i a = _mm_or_si128(mask, cdf_max_probability);
431*09537850SAkhilesh Sanikop     const __m128i diff = _mm_sub_epi16(a, cdf_vec);
432*09537850SAkhilesh Sanikop     const __m128i cdf_offset = _mm_sub_epi16(cdf_vec, mask);
433*09537850SAkhilesh Sanikop     const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
434*09537850SAkhilesh Sanikop     cdf_vec = _mm_add_epi16(cdf_offset, delta);
435*09537850SAkhilesh Sanikop     StoreUnaligned16(cdf + 2, cdf_vec);
436*09537850SAkhilesh Sanikop   } else {
437*09537850SAkhilesh Sanikop     if (symbol != 0) {
438*09537850SAkhilesh Sanikop       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
439*09537850SAkhilesh Sanikop       cdf[1] -= cdf[1] >> rate;
440*09537850SAkhilesh Sanikop     } else {
441*09537850SAkhilesh Sanikop       cdf[0] -= cdf[0] >> rate;
442*09537850SAkhilesh Sanikop       cdf[1] -= cdf[1] >> rate;
443*09537850SAkhilesh Sanikop     }
444*09537850SAkhilesh Sanikop     const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
445*09537850SAkhilesh Sanikop     cdf_vec = _mm_sub_epi16(cdf_vec, delta);
446*09537850SAkhilesh Sanikop     StoreUnaligned16(cdf + 2, cdf_vec);
447*09537850SAkhilesh Sanikop   }
448*09537850SAkhilesh Sanikop }
449*09537850SAkhilesh Sanikop 
450*09537850SAkhilesh Sanikop // See UpdateCdf5 for implementation details.
UpdateCdf13(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)451*09537850SAkhilesh Sanikop void UpdateCdf13(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
452*09537850SAkhilesh Sanikop   __m128i cdf_vec0 = LoadLo8(cdf);
453*09537850SAkhilesh Sanikop   __m128i cdf_vec1 = LoadUnaligned16(cdf + 4);
454*09537850SAkhilesh Sanikop   const uint16_t count = cdf[13];
455*09537850SAkhilesh Sanikop   const int rate = (count >> 4) + 5;
456*09537850SAkhilesh Sanikop   const __m128i cdf_max_probability =
457*09537850SAkhilesh Sanikop       _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
458*09537850SAkhilesh Sanikop   const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
459*09537850SAkhilesh Sanikop 
460*09537850SAkhilesh Sanikop   const __m128i index = _mm_set_epi32(0x0, 0x0, 0x00040003, 0x00020001);
461*09537850SAkhilesh Sanikop   const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
462*09537850SAkhilesh Sanikop   const __m128i a = _mm_or_si128(mask, cdf_max_probability);
463*09537850SAkhilesh Sanikop   const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
464*09537850SAkhilesh Sanikop   const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
465*09537850SAkhilesh Sanikop   const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
466*09537850SAkhilesh Sanikop   cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
467*09537850SAkhilesh Sanikop   StoreLo8(cdf, cdf_vec0);
468*09537850SAkhilesh Sanikop 
469*09537850SAkhilesh Sanikop   const __m128i index1 =
470*09537850SAkhilesh Sanikop       _mm_set_epi32(0x000c000b, 0x000a0009, 0x00080007, 0x00060005);
471*09537850SAkhilesh Sanikop   const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
472*09537850SAkhilesh Sanikop   const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
473*09537850SAkhilesh Sanikop   const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
474*09537850SAkhilesh Sanikop   const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
475*09537850SAkhilesh Sanikop   const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
476*09537850SAkhilesh Sanikop   cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
477*09537850SAkhilesh Sanikop   StoreUnaligned16(cdf + 4, cdf_vec1);
478*09537850SAkhilesh Sanikop 
479*09537850SAkhilesh Sanikop   cdf[13] = count + static_cast<uint16_t>(count < 32);
480*09537850SAkhilesh Sanikop }
481*09537850SAkhilesh Sanikop 
UpdateCdf16(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol)482*09537850SAkhilesh Sanikop void UpdateCdf16(uint16_t* LIBGAV1_RESTRICT const cdf, const int symbol) {
483*09537850SAkhilesh Sanikop   __m128i cdf_vec0 = LoadUnaligned16(cdf);
484*09537850SAkhilesh Sanikop   const uint16_t count = cdf[16];
485*09537850SAkhilesh Sanikop   const int rate = (count >> 4) + 5;
486*09537850SAkhilesh Sanikop   const __m128i cdf_max_probability =
487*09537850SAkhilesh Sanikop       _mm_set1_epi16(static_cast<int16_t>(kCdfMaxProbability));
488*09537850SAkhilesh Sanikop   const __m128i symbol_vec = _mm_set1_epi16(static_cast<int16_t>(symbol));
489*09537850SAkhilesh Sanikop 
490*09537850SAkhilesh Sanikop   const __m128i index =
491*09537850SAkhilesh Sanikop       _mm_set_epi32(0x00080007, 0x00060005, 0x00040003, 0x00020001);
492*09537850SAkhilesh Sanikop   const __m128i mask = _mm_cmpgt_epi16(index, symbol_vec);
493*09537850SAkhilesh Sanikop   const __m128i a = _mm_or_si128(mask, cdf_max_probability);
494*09537850SAkhilesh Sanikop   const __m128i diff = _mm_sub_epi16(a, cdf_vec0);
495*09537850SAkhilesh Sanikop   const __m128i cdf_offset = _mm_sub_epi16(cdf_vec0, mask);
496*09537850SAkhilesh Sanikop   const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
497*09537850SAkhilesh Sanikop   cdf_vec0 = _mm_add_epi16(cdf_offset, delta);
498*09537850SAkhilesh Sanikop   StoreUnaligned16(cdf, cdf_vec0);
499*09537850SAkhilesh Sanikop 
500*09537850SAkhilesh Sanikop   __m128i cdf_vec1 = LoadUnaligned16(cdf + 8);
501*09537850SAkhilesh Sanikop   const __m128i index1 =
502*09537850SAkhilesh Sanikop       _mm_set_epi32(0x0010000f, 0x000e000d, 0x000c000b, 0x000a0009);
503*09537850SAkhilesh Sanikop   const __m128i mask1 = _mm_cmpgt_epi16(index1, symbol_vec);
504*09537850SAkhilesh Sanikop   const __m128i a1 = _mm_or_si128(mask1, cdf_max_probability);
505*09537850SAkhilesh Sanikop   const __m128i diff1 = _mm_sub_epi16(a1, cdf_vec1);
506*09537850SAkhilesh Sanikop   const __m128i cdf_offset1 = _mm_sub_epi16(cdf_vec1, mask1);
507*09537850SAkhilesh Sanikop   const __m128i delta1 = _mm_sra_epi16(diff1, _mm_cvtsi32_si128(rate));
508*09537850SAkhilesh Sanikop   cdf_vec1 = _mm_add_epi16(cdf_offset1, delta1);
509*09537850SAkhilesh Sanikop   StoreUnaligned16(cdf + 8, cdf_vec1);
510*09537850SAkhilesh Sanikop 
511*09537850SAkhilesh Sanikop   cdf[16] = count + static_cast<uint16_t>(count < 32);
512*09537850SAkhilesh Sanikop }
513*09537850SAkhilesh Sanikop 
514*09537850SAkhilesh Sanikop #else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
515*09537850SAkhilesh Sanikop 
UpdateCdf5(uint16_t * const cdf,const int symbol)516*09537850SAkhilesh Sanikop void UpdateCdf5(uint16_t* const cdf, const int symbol) {
517*09537850SAkhilesh Sanikop   UpdateCdf(cdf, 5, symbol);
518*09537850SAkhilesh Sanikop }
519*09537850SAkhilesh Sanikop 
UpdateCdf7(uint16_t * const cdf,const int symbol)520*09537850SAkhilesh Sanikop void UpdateCdf7(uint16_t* const cdf, const int symbol) {
521*09537850SAkhilesh Sanikop   UpdateCdf(cdf, 7, symbol);
522*09537850SAkhilesh Sanikop }
523*09537850SAkhilesh Sanikop 
UpdateCdf8(uint16_t * const cdf,const int symbol)524*09537850SAkhilesh Sanikop void UpdateCdf8(uint16_t* const cdf, const int symbol) {
525*09537850SAkhilesh Sanikop   UpdateCdf(cdf, 8, symbol);
526*09537850SAkhilesh Sanikop }
527*09537850SAkhilesh Sanikop 
UpdateCdf9(uint16_t * const cdf,const int symbol)528*09537850SAkhilesh Sanikop void UpdateCdf9(uint16_t* const cdf, const int symbol) {
529*09537850SAkhilesh Sanikop   UpdateCdf(cdf, 9, symbol);
530*09537850SAkhilesh Sanikop }
531*09537850SAkhilesh Sanikop 
UpdateCdf11(uint16_t * const cdf,const int symbol)532*09537850SAkhilesh Sanikop void UpdateCdf11(uint16_t* const cdf, const int symbol) {
533*09537850SAkhilesh Sanikop   UpdateCdf(cdf, 11, symbol);
534*09537850SAkhilesh Sanikop }
535*09537850SAkhilesh Sanikop 
UpdateCdf13(uint16_t * const cdf,const int symbol)536*09537850SAkhilesh Sanikop void UpdateCdf13(uint16_t* const cdf, const int symbol) {
537*09537850SAkhilesh Sanikop   UpdateCdf(cdf, 13, symbol);
538*09537850SAkhilesh Sanikop }
539*09537850SAkhilesh Sanikop 
UpdateCdf16(uint16_t * const cdf,const int symbol)540*09537850SAkhilesh Sanikop void UpdateCdf16(uint16_t* const cdf, const int symbol) {
541*09537850SAkhilesh Sanikop   UpdateCdf(cdf, 16, symbol);
542*09537850SAkhilesh Sanikop }
543*09537850SAkhilesh Sanikop 
544*09537850SAkhilesh Sanikop #endif  // LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
545*09537850SAkhilesh Sanikop #endif  // LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
546*09537850SAkhilesh Sanikop 
// Returns |x| with its bytes arranged in big endian order. The window type
// must be 4 or 8 bytes wide; the matching byte swap routine is picked from
// sizeof(x).
inline EntropyDecoder::WindowSize HostToBigEndian(
    const EntropyDecoder::WindowSize x) {
  using Window = EntropyDecoder::WindowSize;
  static_assert(sizeof(Window) == 4 || sizeof(Window) == 8, "");
#if defined(__GNUC__)
#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
  // Not a little endian target: the value is returned unchanged.
  return x;
#else
  return (sizeof(x) == 8) ? __builtin_bswap64(x) : __builtin_bswap32(x);
#endif  // __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__
#elif defined(_WIN32)
  // Windows targets are assumed to be little endian, so the bytes are always
  // swapped here.
  return static_cast<Window>(
      (sizeof(x) == 8) ? _byteswap_uint64(static_cast<unsigned __int64>(x))
                       : _byteswap_ulong(static_cast<unsigned long>(x)));
#else
#error Unknown compiler!
#endif  // defined(__GNUC__)
}
565*09537850SAkhilesh Sanikop 
566*09537850SAkhilesh Sanikop }  // namespace
567*09537850SAkhilesh Sanikop 
// Namespace-scope definition of the static constexpr member
// EntropyDecoder::kWindowSize, compiled only when LIBGAV1_CXX17 is false.
// NOTE(review): presumably LIBGAV1_CXX17 marks a C++17 (or later) build, where
// static constexpr data members are implicitly inline and need no separate
// definition -- confirm against the header that defines the macro.
568*09537850SAkhilesh Sanikop #if !LIBGAV1_CXX17
569*09537850SAkhilesh Sanikop constexpr int EntropyDecoder::kWindowSize;  // static.
570*09537850SAkhilesh Sanikop #endif
571*09537850SAkhilesh Sanikop 
EntropyDecoder(const uint8_t * data,size_t size,bool allow_update_cdf)572*09537850SAkhilesh Sanikop EntropyDecoder::EntropyDecoder(const uint8_t* data, size_t size,
573*09537850SAkhilesh Sanikop                                bool allow_update_cdf)
574*09537850SAkhilesh Sanikop     : data_(data),
575*09537850SAkhilesh Sanikop       data_end_(data + size),
576*09537850SAkhilesh Sanikop       data_memcpy_end_((size >= sizeof(WindowSize))
577*09537850SAkhilesh Sanikop                            ? data + size - sizeof(WindowSize) + 1
578*09537850SAkhilesh Sanikop                            : data),
579*09537850SAkhilesh Sanikop       allow_update_cdf_(allow_update_cdf),
580*09537850SAkhilesh Sanikop       values_in_range_(kCdfMaxProbability) {
581*09537850SAkhilesh Sanikop   if (data_ < data_memcpy_end_) {
582*09537850SAkhilesh Sanikop     // This is a simplified version of PopulateBits() which loads 8 extra bits
583*09537850SAkhilesh Sanikop     // and skips the unnecessary shifts of value and window_diff_.
584*09537850SAkhilesh Sanikop     WindowSize value;
585*09537850SAkhilesh Sanikop     memcpy(&value, data_, sizeof(value));
586*09537850SAkhilesh Sanikop     data_ += sizeof(value);
587*09537850SAkhilesh Sanikop     window_diff_ = HostToBigEndian(value) ^ -1;
588*09537850SAkhilesh Sanikop     // Note the initial value of bits_ is larger than kMaxCachedBits as it's
589*09537850SAkhilesh Sanikop     // used to restore the most significant 0 bit that would be present after
590*09537850SAkhilesh Sanikop     // PopulateBits() when we extract the first symbol value.
591*09537850SAkhilesh Sanikop     // As shown in Section 8.2.2 Initialization process for symbol decoder,
592*09537850SAkhilesh Sanikop     // which uses a fixed offset to read the symbol values, the most
593*09537850SAkhilesh Sanikop     // significant bit is always 0:
594*09537850SAkhilesh Sanikop     //   The variable numBits is set equal to Min( sz * 8, 15).
595*09537850SAkhilesh Sanikop     //   The variable buf is read using the f(numBits) parsing process.
596*09537850SAkhilesh Sanikop     //   The variable paddedBuf is set equal to ( buf << (15 - numBits) ).
597*09537850SAkhilesh Sanikop     //   The variable SymbolValue is set to ((1 << 15) - 1) ^ paddedBuf.
598*09537850SAkhilesh Sanikop     bits_ = kWindowSize - 15;
599*09537850SAkhilesh Sanikop     return;
600*09537850SAkhilesh Sanikop   }
601*09537850SAkhilesh Sanikop   window_diff_ = 0;
602*09537850SAkhilesh Sanikop   bits_ = -15;
603*09537850SAkhilesh Sanikop   PopulateBits();
604*09537850SAkhilesh Sanikop }
605*09537850SAkhilesh Sanikop 
606*09537850SAkhilesh Sanikop // This is similar to the ReadSymbol() implementation but it is optimized based
607*09537850SAkhilesh Sanikop // on the following facts:
608*09537850SAkhilesh Sanikop //   * The probability is fixed at half. So some multiplications can be replaced
609*09537850SAkhilesh Sanikop //     with bit operations.
610*09537850SAkhilesh Sanikop //   * Symbol count is fixed at 2.
ReadBit()611*09537850SAkhilesh Sanikop int EntropyDecoder::ReadBit() {
612*09537850SAkhilesh Sanikop   const uint32_t curr =
613*09537850SAkhilesh Sanikop       ((values_in_range_ & kReadBitMask) >> 1) + kMinimumProbabilityPerSymbol;
614*09537850SAkhilesh Sanikop   const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
615*09537850SAkhilesh Sanikop   int bit = 1;
616*09537850SAkhilesh Sanikop   if (symbol_value >= curr) {
617*09537850SAkhilesh Sanikop     values_in_range_ -= curr;
618*09537850SAkhilesh Sanikop     window_diff_ -= static_cast<WindowSize>(curr) << bits_;
619*09537850SAkhilesh Sanikop     bit = 0;
620*09537850SAkhilesh Sanikop   } else {
621*09537850SAkhilesh Sanikop     values_in_range_ = curr;
622*09537850SAkhilesh Sanikop   }
623*09537850SAkhilesh Sanikop   NormalizeRange();
624*09537850SAkhilesh Sanikop   return bit;
625*09537850SAkhilesh Sanikop }
626*09537850SAkhilesh Sanikop 
ReadLiteral(int num_bits)627*09537850SAkhilesh Sanikop int64_t EntropyDecoder::ReadLiteral(int num_bits) {
628*09537850SAkhilesh Sanikop   assert(num_bits <= 32);
629*09537850SAkhilesh Sanikop   assert(num_bits > 0);
630*09537850SAkhilesh Sanikop   uint32_t literal = 0;
631*09537850SAkhilesh Sanikop   int bit = num_bits - 1;
632*09537850SAkhilesh Sanikop   do {
633*09537850SAkhilesh Sanikop     // ARM can combine a shift operation with a constant number of bits with
634*09537850SAkhilesh Sanikop     // some other operations, such as the OR operation.
635*09537850SAkhilesh Sanikop     // Here is an ARM disassembly example:
636*09537850SAkhilesh Sanikop     // orr w1, w0, w1, lsl #1
637*09537850SAkhilesh Sanikop     // which left shifts register w1 by 1 bit and OR the shift result with
638*09537850SAkhilesh Sanikop     // register w0.
639*09537850SAkhilesh Sanikop     // The next 2 lines are equivalent to:
640*09537850SAkhilesh Sanikop     // literal |= static_cast<uint32_t>(ReadBit()) << bit;
641*09537850SAkhilesh Sanikop     literal <<= 1;
642*09537850SAkhilesh Sanikop     literal |= static_cast<uint32_t>(ReadBit());
643*09537850SAkhilesh Sanikop   } while (--bit >= 0);
644*09537850SAkhilesh Sanikop   return literal;
645*09537850SAkhilesh Sanikop }
646*09537850SAkhilesh Sanikop 
ReadSymbol(uint16_t * LIBGAV1_RESTRICT const cdf,int symbol_count)647*09537850SAkhilesh Sanikop int EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT const cdf,
648*09537850SAkhilesh Sanikop                                int symbol_count) {
649*09537850SAkhilesh Sanikop   const int symbol = ReadSymbolImpl(cdf, symbol_count);
650*09537850SAkhilesh Sanikop   if (allow_update_cdf_) {
651*09537850SAkhilesh Sanikop     UpdateCdf(cdf, symbol_count, symbol);
652*09537850SAkhilesh Sanikop   }
653*09537850SAkhilesh Sanikop   return symbol;
654*09537850SAkhilesh Sanikop }
655*09537850SAkhilesh Sanikop 
ReadSymbol(uint16_t * LIBGAV1_RESTRICT cdf)656*09537850SAkhilesh Sanikop bool EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT cdf) {
657*09537850SAkhilesh Sanikop   assert(cdf[1] == 0);
658*09537850SAkhilesh Sanikop   const bool symbol = ReadSymbolImpl(cdf[0]) != 0;
659*09537850SAkhilesh Sanikop   if (allow_update_cdf_) {
660*09537850SAkhilesh Sanikop     const uint16_t count = cdf[2];
661*09537850SAkhilesh Sanikop     // rate is computed in the spec as:
662*09537850SAkhilesh Sanikop     //  3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
663*09537850SAkhilesh Sanikop     // In this case N is 2 and cdf[N] is |count|. So the equation becomes:
664*09537850SAkhilesh Sanikop     //  4 + (count > 15) + (count > 31)
665*09537850SAkhilesh Sanikop     // Note that the largest value for count is 32 (it is not incremented beyond
666*09537850SAkhilesh Sanikop     // 32). So using that information:
667*09537850SAkhilesh Sanikop     //  count >> 4 is 0 for count from 0 to 15.
668*09537850SAkhilesh Sanikop     //  count >> 4 is 1 for count from 16 to 31.
669*09537850SAkhilesh Sanikop     //  count >> 4 is 2 for count == 32.
670*09537850SAkhilesh Sanikop     // Now, the equation becomes:
671*09537850SAkhilesh Sanikop     //  4 + (count >> 4).
672*09537850SAkhilesh Sanikop     // Since (count >> 4) can only be 0 or 1 or 2, the addition can be replaced
673*09537850SAkhilesh Sanikop     // with bitwise or. So the final equation is:
674*09537850SAkhilesh Sanikop     //  4 | (count >> 4).
675*09537850SAkhilesh Sanikop     const int rate = 4 | (count >> 4);
676*09537850SAkhilesh Sanikop     if (symbol) {
677*09537850SAkhilesh Sanikop       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
678*09537850SAkhilesh Sanikop     } else {
679*09537850SAkhilesh Sanikop       cdf[0] -= cdf[0] >> rate;
680*09537850SAkhilesh Sanikop     }
681*09537850SAkhilesh Sanikop     cdf[2] += static_cast<uint16_t>(count < 32);
682*09537850SAkhilesh Sanikop   }
683*09537850SAkhilesh Sanikop   return symbol;
684*09537850SAkhilesh Sanikop }
685*09537850SAkhilesh Sanikop 
ReadSymbolWithoutCdfUpdate(uint16_t cdf)686*09537850SAkhilesh Sanikop bool EntropyDecoder::ReadSymbolWithoutCdfUpdate(uint16_t cdf) {
687*09537850SAkhilesh Sanikop   return ReadSymbolImpl(cdf) != 0;
688*09537850SAkhilesh Sanikop }
689*09537850SAkhilesh Sanikop 
690*09537850SAkhilesh Sanikop template <int symbol_count>
ReadSymbol(uint16_t * LIBGAV1_RESTRICT const cdf)691*09537850SAkhilesh Sanikop int EntropyDecoder::ReadSymbol(uint16_t* LIBGAV1_RESTRICT const cdf) {
692*09537850SAkhilesh Sanikop   static_assert(symbol_count >= 3 && symbol_count <= 16, "");
693*09537850SAkhilesh Sanikop   if (symbol_count == 3 || symbol_count == 4) {
694*09537850SAkhilesh Sanikop     return ReadSymbol3Or4(cdf, symbol_count);
695*09537850SAkhilesh Sanikop   }
696*09537850SAkhilesh Sanikop   int symbol;
697*09537850SAkhilesh Sanikop   if (symbol_count == 8) {
698*09537850SAkhilesh Sanikop     symbol = ReadSymbolImpl8(cdf);
699*09537850SAkhilesh Sanikop   } else if (symbol_count <= 13) {
700*09537850SAkhilesh Sanikop     symbol = ReadSymbolImpl(cdf, symbol_count);
701*09537850SAkhilesh Sanikop   } else {
702*09537850SAkhilesh Sanikop     symbol = ReadSymbolImplBinarySearch(cdf, symbol_count);
703*09537850SAkhilesh Sanikop   }
704*09537850SAkhilesh Sanikop   if (allow_update_cdf_) {
705*09537850SAkhilesh Sanikop     if (symbol_count == 5) {
706*09537850SAkhilesh Sanikop       UpdateCdf5(cdf, symbol);
707*09537850SAkhilesh Sanikop     } else if (symbol_count == 7) {
708*09537850SAkhilesh Sanikop       UpdateCdf7(cdf, symbol);
709*09537850SAkhilesh Sanikop     } else if (symbol_count == 8) {
710*09537850SAkhilesh Sanikop       UpdateCdf8(cdf, symbol);
711*09537850SAkhilesh Sanikop     } else if (symbol_count == 9) {
712*09537850SAkhilesh Sanikop       UpdateCdf9(cdf, symbol);
713*09537850SAkhilesh Sanikop     } else if (symbol_count == 11) {
714*09537850SAkhilesh Sanikop       UpdateCdf11(cdf, symbol);
715*09537850SAkhilesh Sanikop     } else if (symbol_count == 13) {
716*09537850SAkhilesh Sanikop       UpdateCdf13(cdf, symbol);
717*09537850SAkhilesh Sanikop     } else if (symbol_count == 16) {
718*09537850SAkhilesh Sanikop       UpdateCdf16(cdf, symbol);
719*09537850SAkhilesh Sanikop     } else {
720*09537850SAkhilesh Sanikop       UpdateCdf(cdf, symbol_count, symbol);
721*09537850SAkhilesh Sanikop     }
722*09537850SAkhilesh Sanikop   }
723*09537850SAkhilesh Sanikop   return symbol;
724*09537850SAkhilesh Sanikop }
725*09537850SAkhilesh Sanikop 
ReadSymbolImpl(const uint16_t * LIBGAV1_RESTRICT const cdf,int symbol_count)726*09537850SAkhilesh Sanikop int EntropyDecoder::ReadSymbolImpl(const uint16_t* LIBGAV1_RESTRICT const cdf,
727*09537850SAkhilesh Sanikop                                    int symbol_count) {
728*09537850SAkhilesh Sanikop   assert(cdf[symbol_count - 1] == 0);
729*09537850SAkhilesh Sanikop   --symbol_count;
730*09537850SAkhilesh Sanikop   uint32_t curr = values_in_range_;
731*09537850SAkhilesh Sanikop   int symbol = -1;
732*09537850SAkhilesh Sanikop   uint32_t prev;
733*09537850SAkhilesh Sanikop   const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
734*09537850SAkhilesh Sanikop   uint32_t delta = kMinimumProbabilityPerSymbol * symbol_count;
735*09537850SAkhilesh Sanikop   // Search through the |cdf| array to determine where the scaled cdf value and
736*09537850SAkhilesh Sanikop   // |symbol_value| cross over.
737*09537850SAkhilesh Sanikop   do {
738*09537850SAkhilesh Sanikop     prev = curr;
739*09537850SAkhilesh Sanikop     curr = (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1) +
740*09537850SAkhilesh Sanikop            delta;
741*09537850SAkhilesh Sanikop     delta -= kMinimumProbabilityPerSymbol;
742*09537850SAkhilesh Sanikop   } while (symbol_value < curr);
743*09537850SAkhilesh Sanikop   values_in_range_ = prev - curr;
744*09537850SAkhilesh Sanikop   window_diff_ -= static_cast<WindowSize>(curr) << bits_;
745*09537850SAkhilesh Sanikop   NormalizeRange();
746*09537850SAkhilesh Sanikop   return symbol;
747*09537850SAkhilesh Sanikop }
748*09537850SAkhilesh Sanikop 
ReadSymbolImplBinarySearch(const uint16_t * LIBGAV1_RESTRICT const cdf,int symbol_count)749*09537850SAkhilesh Sanikop int EntropyDecoder::ReadSymbolImplBinarySearch(
750*09537850SAkhilesh Sanikop     const uint16_t* LIBGAV1_RESTRICT const cdf, int symbol_count) {
751*09537850SAkhilesh Sanikop   assert(cdf[symbol_count - 1] == 0);
752*09537850SAkhilesh Sanikop   assert(symbol_count > 1 && symbol_count <= 16);
753*09537850SAkhilesh Sanikop   --symbol_count;
754*09537850SAkhilesh Sanikop   const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
755*09537850SAkhilesh Sanikop   // Search through the |cdf| array to determine where the scaled cdf value and
756*09537850SAkhilesh Sanikop   // |symbol_value| cross over. Since the CDFs are sorted, we can use binary
757*09537850SAkhilesh Sanikop   // search to do this. Let |symbol| be the index of the first |cdf| array
758*09537850SAkhilesh Sanikop   // entry whose scaled cdf value is less than or equal to |symbol_value|. The
759*09537850SAkhilesh Sanikop   // binary search maintains the invariant:
760*09537850SAkhilesh Sanikop   //   low <= symbol <= high + 1
761*09537850SAkhilesh Sanikop   // and terminates when low == high + 1.
762*09537850SAkhilesh Sanikop   int low = 0;
763*09537850SAkhilesh Sanikop   int high = symbol_count - 1;
764*09537850SAkhilesh Sanikop   // The binary search maintains the invariants that |prev| is the scaled cdf
765*09537850SAkhilesh Sanikop   // value for low - 1 and |curr| is the scaled cdf value for high + 1. (By
766*09537850SAkhilesh Sanikop   // convention, the scaled cdf value for -1 is values_in_range_.) When the
767*09537850SAkhilesh Sanikop   // binary search terminates, |prev| is the scaled cdf value for symbol - 1
768*09537850SAkhilesh Sanikop   // and |curr| is the scaled cdf value for |symbol|.
769*09537850SAkhilesh Sanikop   uint32_t prev = values_in_range_;
770*09537850SAkhilesh Sanikop   uint32_t curr = 0;
771*09537850SAkhilesh Sanikop   const uint32_t values_in_range_shifted = values_in_range_ >> 8;
772*09537850SAkhilesh Sanikop   do {
773*09537850SAkhilesh Sanikop     const int mid = DivideBy2(low + high);
774*09537850SAkhilesh Sanikop     const uint32_t scaled_cdf =
775*09537850SAkhilesh Sanikop         ScaleCdf(values_in_range_shifted, cdf, mid, symbol_count);
776*09537850SAkhilesh Sanikop     if (symbol_value < scaled_cdf) {
777*09537850SAkhilesh Sanikop       low = mid + 1;
778*09537850SAkhilesh Sanikop       prev = scaled_cdf;
779*09537850SAkhilesh Sanikop     } else {
780*09537850SAkhilesh Sanikop       high = mid - 1;
781*09537850SAkhilesh Sanikop       curr = scaled_cdf;
782*09537850SAkhilesh Sanikop     }
783*09537850SAkhilesh Sanikop   } while (low <= high);
784*09537850SAkhilesh Sanikop   assert(low == high + 1);
785*09537850SAkhilesh Sanikop   // At this point, |low| is the symbol that has been decoded.
786*09537850SAkhilesh Sanikop   values_in_range_ = prev - curr;
787*09537850SAkhilesh Sanikop   window_diff_ -= static_cast<WindowSize>(curr) << bits_;
788*09537850SAkhilesh Sanikop   NormalizeRange();
789*09537850SAkhilesh Sanikop   return low;
790*09537850SAkhilesh Sanikop }
791*09537850SAkhilesh Sanikop 
ReadSymbolImpl(uint16_t cdf)792*09537850SAkhilesh Sanikop int EntropyDecoder::ReadSymbolImpl(uint16_t cdf) {
793*09537850SAkhilesh Sanikop   const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
794*09537850SAkhilesh Sanikop   const uint32_t curr =
795*09537850SAkhilesh Sanikop       (((values_in_range_ >> 8) * (cdf >> kCdfPrecision)) >> 1) +
796*09537850SAkhilesh Sanikop       kMinimumProbabilityPerSymbol;
797*09537850SAkhilesh Sanikop   const int symbol = static_cast<int>(symbol_value < curr);
798*09537850SAkhilesh Sanikop   if (symbol == 1) {
799*09537850SAkhilesh Sanikop     values_in_range_ = curr;
800*09537850SAkhilesh Sanikop   } else {
801*09537850SAkhilesh Sanikop     values_in_range_ -= curr;
802*09537850SAkhilesh Sanikop     window_diff_ -= static_cast<WindowSize>(curr) << bits_;
803*09537850SAkhilesh Sanikop   }
804*09537850SAkhilesh Sanikop   NormalizeRange();
805*09537850SAkhilesh Sanikop   return symbol;
806*09537850SAkhilesh Sanikop }
807*09537850SAkhilesh Sanikop 
808*09537850SAkhilesh Sanikop // Equivalent to ReadSymbol(cdf, [3,4]), with the ReadSymbolImpl and UpdateCdf
809*09537850SAkhilesh Sanikop // calls inlined.
ReadSymbol3Or4(uint16_t * LIBGAV1_RESTRICT const cdf,const int symbol_count)810*09537850SAkhilesh Sanikop int EntropyDecoder::ReadSymbol3Or4(uint16_t* LIBGAV1_RESTRICT const cdf,
811*09537850SAkhilesh Sanikop                                    const int symbol_count) {
812*09537850SAkhilesh Sanikop   assert(cdf[symbol_count - 1] == 0);
813*09537850SAkhilesh Sanikop   uint32_t curr = values_in_range_;
814*09537850SAkhilesh Sanikop   uint32_t prev;
815*09537850SAkhilesh Sanikop   const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
816*09537850SAkhilesh Sanikop   uint32_t delta = kMinimumProbabilityPerSymbol * (symbol_count - 1);
817*09537850SAkhilesh Sanikop   const uint32_t values_in_range_shifted = values_in_range_ >> 8;
818*09537850SAkhilesh Sanikop 
819*09537850SAkhilesh Sanikop   // Search through the |cdf| array to determine where the scaled cdf value and
820*09537850SAkhilesh Sanikop   // |symbol_value| cross over. If allow_update_cdf_ is true, update the |cdf|
821*09537850SAkhilesh Sanikop   // array.
822*09537850SAkhilesh Sanikop   //
823*09537850SAkhilesh Sanikop   // The original code is:
824*09537850SAkhilesh Sanikop   //
825*09537850SAkhilesh Sanikop   //  int symbol = -1;
826*09537850SAkhilesh Sanikop   //  do {
827*09537850SAkhilesh Sanikop   //    prev = curr;
828*09537850SAkhilesh Sanikop   //    curr =
829*09537850SAkhilesh Sanikop   //        ((values_in_range_shifted * (cdf[++symbol] >> kCdfPrecision)) >> 1)
830*09537850SAkhilesh Sanikop   //        + delta;
831*09537850SAkhilesh Sanikop   //    delta -= kMinimumProbabilityPerSymbol;
832*09537850SAkhilesh Sanikop   //  } while (symbol_value < curr);
833*09537850SAkhilesh Sanikop   //  if (allow_update_cdf_) {
834*09537850SAkhilesh Sanikop   //    UpdateCdf(cdf, [3,4], symbol);
835*09537850SAkhilesh Sanikop   //  }
836*09537850SAkhilesh Sanikop   //
837*09537850SAkhilesh Sanikop   // The do-while loop is unrolled with three or four iterations, and the
838*09537850SAkhilesh Sanikop   // UpdateCdf call is inlined and merged into the iterations.
839*09537850SAkhilesh Sanikop   int symbol = 0;
840*09537850SAkhilesh Sanikop   // Iteration 0.
841*09537850SAkhilesh Sanikop   prev = curr;
842*09537850SAkhilesh Sanikop   curr =
843*09537850SAkhilesh Sanikop       ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
844*09537850SAkhilesh Sanikop   if (symbol_value >= curr) {
845*09537850SAkhilesh Sanikop     // symbol == 0.
846*09537850SAkhilesh Sanikop     if (allow_update_cdf_) {
847*09537850SAkhilesh Sanikop       // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/0).
848*09537850SAkhilesh Sanikop       const uint16_t count = cdf[symbol_count];
849*09537850SAkhilesh Sanikop       cdf[symbol_count] += static_cast<uint16_t>(count < 32);
850*09537850SAkhilesh Sanikop       const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
851*09537850SAkhilesh Sanikop       if (symbol_count == 4) {
852*09537850SAkhilesh Sanikop #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
853*09537850SAkhilesh Sanikop         // 1. On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM
854*09537850SAkhilesh Sanikop         // NEON code is slower. Consider using the C version if __arm__ is
855*09537850SAkhilesh Sanikop         // defined.
856*09537850SAkhilesh Sanikop         // 2. The ARM NEON code (compiled for arm64) is slightly slower on
857*09537850SAkhilesh Sanikop         // Samsung Galaxy S8+ (SM-G955FD).
858*09537850SAkhilesh Sanikop         uint16x4_t cdf_vec = vld1_u16(cdf);
859*09537850SAkhilesh Sanikop         const int16x4_t negative_rate = vdup_n_s16(-rate);
860*09537850SAkhilesh Sanikop         const uint16x4_t delta = vshl_u16(cdf_vec, negative_rate);
861*09537850SAkhilesh Sanikop         cdf_vec = vsub_u16(cdf_vec, delta);
862*09537850SAkhilesh Sanikop         vst1_u16(cdf, cdf_vec);
863*09537850SAkhilesh Sanikop #elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
864*09537850SAkhilesh Sanikop         __m128i cdf_vec = LoadLo8(cdf);
865*09537850SAkhilesh Sanikop         const __m128i delta = _mm_sra_epi16(cdf_vec, _mm_cvtsi32_si128(rate));
866*09537850SAkhilesh Sanikop         cdf_vec = _mm_sub_epi16(cdf_vec, delta);
867*09537850SAkhilesh Sanikop         StoreLo8(cdf, cdf_vec);
868*09537850SAkhilesh Sanikop #else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
869*09537850SAkhilesh Sanikop         cdf[0] -= cdf[0] >> rate;
870*09537850SAkhilesh Sanikop         cdf[1] -= cdf[1] >> rate;
871*09537850SAkhilesh Sanikop         cdf[2] -= cdf[2] >> rate;
872*09537850SAkhilesh Sanikop #endif
873*09537850SAkhilesh Sanikop       } else {  // symbol_count == 3.
874*09537850SAkhilesh Sanikop         cdf[0] -= cdf[0] >> rate;
875*09537850SAkhilesh Sanikop         cdf[1] -= cdf[1] >> rate;
876*09537850SAkhilesh Sanikop       }
877*09537850SAkhilesh Sanikop     }
878*09537850SAkhilesh Sanikop     goto found;
879*09537850SAkhilesh Sanikop   }
880*09537850SAkhilesh Sanikop   ++symbol;
881*09537850SAkhilesh Sanikop   delta -= kMinimumProbabilityPerSymbol;
882*09537850SAkhilesh Sanikop   // Iteration 1.
883*09537850SAkhilesh Sanikop   prev = curr;
884*09537850SAkhilesh Sanikop   curr =
885*09537850SAkhilesh Sanikop       ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
886*09537850SAkhilesh Sanikop   if (symbol_value >= curr) {
887*09537850SAkhilesh Sanikop     // symbol == 1.
888*09537850SAkhilesh Sanikop     if (allow_update_cdf_) {
889*09537850SAkhilesh Sanikop       // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/1).
890*09537850SAkhilesh Sanikop       const uint16_t count = cdf[symbol_count];
891*09537850SAkhilesh Sanikop       cdf[symbol_count] += static_cast<uint16_t>(count < 32);
892*09537850SAkhilesh Sanikop       const int rate = (count >> 4) + 4 + static_cast<int>(symbol_count == 4);
893*09537850SAkhilesh Sanikop       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
894*09537850SAkhilesh Sanikop       cdf[1] -= cdf[1] >> rate;
895*09537850SAkhilesh Sanikop       if (symbol_count == 4) cdf[2] -= cdf[2] >> rate;
896*09537850SAkhilesh Sanikop     }
897*09537850SAkhilesh Sanikop     goto found;
898*09537850SAkhilesh Sanikop   }
899*09537850SAkhilesh Sanikop   ++symbol;
900*09537850SAkhilesh Sanikop   if (symbol_count == 4) {
901*09537850SAkhilesh Sanikop     delta -= kMinimumProbabilityPerSymbol;
902*09537850SAkhilesh Sanikop     // Iteration 2.
903*09537850SAkhilesh Sanikop     prev = curr;
904*09537850SAkhilesh Sanikop     curr = ((values_in_range_shifted * (cdf[symbol] >> kCdfPrecision)) >> 1) +
905*09537850SAkhilesh Sanikop            delta;
906*09537850SAkhilesh Sanikop     if (symbol_value >= curr) {
907*09537850SAkhilesh Sanikop       // symbol == 2.
908*09537850SAkhilesh Sanikop       if (allow_update_cdf_) {
909*09537850SAkhilesh Sanikop         // Inlined version of UpdateCdf(cdf, 4, /*symbol=*/2).
910*09537850SAkhilesh Sanikop         const uint16_t count = cdf[4];
911*09537850SAkhilesh Sanikop         cdf[4] += static_cast<uint16_t>(count < 32);
912*09537850SAkhilesh Sanikop         const int rate = (count >> 4) + 5;
913*09537850SAkhilesh Sanikop         cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
914*09537850SAkhilesh Sanikop         cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
915*09537850SAkhilesh Sanikop         cdf[2] -= cdf[2] >> rate;
916*09537850SAkhilesh Sanikop       }
917*09537850SAkhilesh Sanikop       goto found;
918*09537850SAkhilesh Sanikop     }
919*09537850SAkhilesh Sanikop     ++symbol;
920*09537850SAkhilesh Sanikop   }
921*09537850SAkhilesh Sanikop   // |delta| is 0 for the last iteration.
922*09537850SAkhilesh Sanikop   // Iteration 2 (symbol_count == 3) or 3 (symbol_count == 4).
923*09537850SAkhilesh Sanikop   prev = curr;
924*09537850SAkhilesh Sanikop   // Since cdf[symbol_count - 1] is 0 and |delta| is 0, |curr| is also 0.
925*09537850SAkhilesh Sanikop   curr = 0;
926*09537850SAkhilesh Sanikop   // symbol == [2,3].
927*09537850SAkhilesh Sanikop   if (allow_update_cdf_) {
928*09537850SAkhilesh Sanikop     // Inlined version of UpdateCdf(cdf, [3,4], /*symbol=*/[2,3]).
929*09537850SAkhilesh Sanikop     const uint16_t count = cdf[symbol_count];
930*09537850SAkhilesh Sanikop     cdf[symbol_count] += static_cast<uint16_t>(count < 32);
931*09537850SAkhilesh Sanikop     const int rate = (4 | (count >> 4)) + static_cast<int>(symbol_count == 4);
932*09537850SAkhilesh Sanikop     if (symbol_count == 4) {
933*09537850SAkhilesh Sanikop #if LIBGAV1_ENTROPY_DECODER_ENABLE_NEON
934*09537850SAkhilesh Sanikop       // On Motorola Moto G5 Plus (running 32-bit Android 8.1.0), the ARM NEON
935*09537850SAkhilesh Sanikop       // code is a tiny bit slower. Consider using the C version if __arm__ is
936*09537850SAkhilesh Sanikop       // defined.
937*09537850SAkhilesh Sanikop       uint16x4_t cdf_vec = vld1_u16(cdf);
938*09537850SAkhilesh Sanikop       const uint16x4_t cdf_max_probability = vdup_n_u16(kCdfMaxProbability);
939*09537850SAkhilesh Sanikop       const int16x4_t diff =
940*09537850SAkhilesh Sanikop           vreinterpret_s16_u16(vsub_u16(cdf_max_probability, cdf_vec));
941*09537850SAkhilesh Sanikop       const int16x4_t negative_rate = vdup_n_s16(-rate);
942*09537850SAkhilesh Sanikop       const uint16x4_t delta =
943*09537850SAkhilesh Sanikop           vreinterpret_u16_s16(vshl_s16(diff, negative_rate));
944*09537850SAkhilesh Sanikop       cdf_vec = vadd_u16(cdf_vec, delta);
945*09537850SAkhilesh Sanikop       vst1_u16(cdf, cdf_vec);
946*09537850SAkhilesh Sanikop       cdf[3] = 0;
947*09537850SAkhilesh Sanikop #elif LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
948*09537850SAkhilesh Sanikop       __m128i cdf_vec = LoadLo8(cdf);
949*09537850SAkhilesh Sanikop       const __m128i cdf_max_probability =
950*09537850SAkhilesh Sanikop           _mm_shufflelo_epi16(_mm_cvtsi32_si128(kCdfMaxProbability), 0);
951*09537850SAkhilesh Sanikop       const __m128i diff = _mm_sub_epi16(cdf_max_probability, cdf_vec);
952*09537850SAkhilesh Sanikop       const __m128i delta = _mm_sra_epi16(diff, _mm_cvtsi32_si128(rate));
953*09537850SAkhilesh Sanikop       cdf_vec = _mm_add_epi16(cdf_vec, delta);
954*09537850SAkhilesh Sanikop       StoreLo8(cdf, cdf_vec);
955*09537850SAkhilesh Sanikop       cdf[3] = 0;
956*09537850SAkhilesh Sanikop #else  // !LIBGAV1_ENTROPY_DECODER_ENABLE_SSE2
957*09537850SAkhilesh Sanikop       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
958*09537850SAkhilesh Sanikop       cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
959*09537850SAkhilesh Sanikop       cdf[2] += (kCdfMaxProbability - cdf[2]) >> rate;
960*09537850SAkhilesh Sanikop #endif
961*09537850SAkhilesh Sanikop     } else {  // symbol_count == 3.
962*09537850SAkhilesh Sanikop       cdf[0] += (kCdfMaxProbability - cdf[0]) >> rate;
963*09537850SAkhilesh Sanikop       cdf[1] += (kCdfMaxProbability - cdf[1]) >> rate;
964*09537850SAkhilesh Sanikop     }
965*09537850SAkhilesh Sanikop   }
966*09537850SAkhilesh Sanikop found:
967*09537850SAkhilesh Sanikop   // End of unrolled do-while loop.
968*09537850SAkhilesh Sanikop 
969*09537850SAkhilesh Sanikop   values_in_range_ = prev - curr;
970*09537850SAkhilesh Sanikop   window_diff_ -= static_cast<WindowSize>(curr) << bits_;
971*09537850SAkhilesh Sanikop   NormalizeRange();
972*09537850SAkhilesh Sanikop   return symbol;
973*09537850SAkhilesh Sanikop }
974*09537850SAkhilesh Sanikop 
ReadSymbolImpl8(const uint16_t * LIBGAV1_RESTRICT const cdf)975*09537850SAkhilesh Sanikop int EntropyDecoder::ReadSymbolImpl8(
976*09537850SAkhilesh Sanikop     const uint16_t* LIBGAV1_RESTRICT const cdf) {
977*09537850SAkhilesh Sanikop   assert(cdf[7] == 0);
978*09537850SAkhilesh Sanikop   uint32_t curr = values_in_range_;
979*09537850SAkhilesh Sanikop   uint32_t prev;
980*09537850SAkhilesh Sanikop   const auto symbol_value = static_cast<uint16_t>(window_diff_ >> bits_);
981*09537850SAkhilesh Sanikop   uint32_t delta = kMinimumProbabilityPerSymbol * 7;
982*09537850SAkhilesh Sanikop   // Search through the |cdf| array to determine where the scaled cdf value and
983*09537850SAkhilesh Sanikop   // |symbol_value| cross over.
984*09537850SAkhilesh Sanikop   //
985*09537850SAkhilesh Sanikop   // The original code is:
986*09537850SAkhilesh Sanikop   //
987*09537850SAkhilesh Sanikop   // int symbol = -1;
988*09537850SAkhilesh Sanikop   // do {
989*09537850SAkhilesh Sanikop   //   prev = curr;
990*09537850SAkhilesh Sanikop   //   curr =
991*09537850SAkhilesh Sanikop   //       (((values_in_range_ >> 8) * (cdf[++symbol] >> kCdfPrecision)) >> 1)
992*09537850SAkhilesh Sanikop   //       + delta;
993*09537850SAkhilesh Sanikop   //   delta -= kMinimumProbabilityPerSymbol;
994*09537850SAkhilesh Sanikop   // } while (symbol_value < curr);
995*09537850SAkhilesh Sanikop   //
996*09537850SAkhilesh Sanikop   // The do-while loop is unrolled with eight iterations.
997*09537850SAkhilesh Sanikop   int symbol = 0;
998*09537850SAkhilesh Sanikop 
999*09537850SAkhilesh Sanikop #define READ_SYMBOL_ITERATION                                                \
1000*09537850SAkhilesh Sanikop   prev = curr;                                                               \
1001*09537850SAkhilesh Sanikop   curr = (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + \
1002*09537850SAkhilesh Sanikop          delta;                                                              \
1003*09537850SAkhilesh Sanikop   if (symbol_value >= curr) goto found;                                      \
1004*09537850SAkhilesh Sanikop   ++symbol;                                                                  \
1005*09537850SAkhilesh Sanikop   delta -= kMinimumProbabilityPerSymbol
1006*09537850SAkhilesh Sanikop 
1007*09537850SAkhilesh Sanikop   READ_SYMBOL_ITERATION;  // Iteration 0.
1008*09537850SAkhilesh Sanikop   READ_SYMBOL_ITERATION;  // Iteration 1.
1009*09537850SAkhilesh Sanikop   READ_SYMBOL_ITERATION;  // Iteration 2.
1010*09537850SAkhilesh Sanikop   READ_SYMBOL_ITERATION;  // Iteration 3.
1011*09537850SAkhilesh Sanikop   READ_SYMBOL_ITERATION;  // Iteration 4.
1012*09537850SAkhilesh Sanikop   READ_SYMBOL_ITERATION;  // Iteration 5.
1013*09537850SAkhilesh Sanikop 
1014*09537850SAkhilesh Sanikop   // The last two iterations can be simplified, so they don't use the
1015*09537850SAkhilesh Sanikop   // READ_SYMBOL_ITERATION macro.
1016*09537850SAkhilesh Sanikop #undef READ_SYMBOL_ITERATION
1017*09537850SAkhilesh Sanikop 
1018*09537850SAkhilesh Sanikop   // Iteration 6.
1019*09537850SAkhilesh Sanikop   prev = curr;
1020*09537850SAkhilesh Sanikop   curr =
1021*09537850SAkhilesh Sanikop       (((values_in_range_ >> 8) * (cdf[symbol] >> kCdfPrecision)) >> 1) + delta;
1022*09537850SAkhilesh Sanikop   if (symbol_value >= curr) goto found;  // symbol == 6.
1023*09537850SAkhilesh Sanikop   ++symbol;
1024*09537850SAkhilesh Sanikop   // |delta| is 0 for the last iteration.
1025*09537850SAkhilesh Sanikop   // Iteration 7.
1026*09537850SAkhilesh Sanikop   prev = curr;
1027*09537850SAkhilesh Sanikop   // Since cdf[7] is 0 and |delta| is 0, |curr| is also 0.
1028*09537850SAkhilesh Sanikop   curr = 0;
1029*09537850SAkhilesh Sanikop   // symbol == 7.
1030*09537850SAkhilesh Sanikop found:
1031*09537850SAkhilesh Sanikop   // End of unrolled do-while loop.
1032*09537850SAkhilesh Sanikop 
1033*09537850SAkhilesh Sanikop   values_in_range_ = prev - curr;
1034*09537850SAkhilesh Sanikop   window_diff_ -= static_cast<WindowSize>(curr) << bits_;
1035*09537850SAkhilesh Sanikop   NormalizeRange();
1036*09537850SAkhilesh Sanikop   return symbol;
1037*09537850SAkhilesh Sanikop }
1038*09537850SAkhilesh Sanikop 
PopulateBits()1039*09537850SAkhilesh Sanikop void EntropyDecoder::PopulateBits() {
1040*09537850SAkhilesh Sanikop   constexpr int kMaxCachedBits = kWindowSize - 16;
1041*09537850SAkhilesh Sanikop #if defined(__aarch64__)
1042*09537850SAkhilesh Sanikop   // Fast path: read eight bytes and add the first six bytes to window_diff_.
1043*09537850SAkhilesh Sanikop   // This fast path makes the following assumptions.
1044*09537850SAkhilesh Sanikop   // 1. We assume that unaligned load of uint64_t is fast.
1045*09537850SAkhilesh Sanikop   // 2. When there are enough bytes in data_, the for loop below reads 6 or 7
1046*09537850SAkhilesh Sanikop   //    bytes depending on the value of bits_. This fast path always reads 6
1047*09537850SAkhilesh Sanikop   //    bytes, which results in more calls to PopulateBits(). We assume that
1048*09537850SAkhilesh Sanikop   //    making more calls to a faster PopulateBits() is overall a win.
1049*09537850SAkhilesh Sanikop   // NOTE: Although this fast path could also be used on x86_64, it hurts
1050*09537850SAkhilesh Sanikop   // performance (measured on Lenovo ThinkStation P920 running Linux). (The
1051*09537850SAkhilesh Sanikop   // reason is still unknown.) Therefore this fast path is only used on arm64.
1052*09537850SAkhilesh Sanikop   static_assert(kWindowSize == 64, "");
1053*09537850SAkhilesh Sanikop   if (data_ < data_memcpy_end_) {
1054*09537850SAkhilesh Sanikop     uint64_t value;
1055*09537850SAkhilesh Sanikop     // arm64 supports unaligned loads, so this memcpy call is compiled to a
1056*09537850SAkhilesh Sanikop     // single ldr instruction.
1057*09537850SAkhilesh Sanikop     memcpy(&value, data_, sizeof(value));
1058*09537850SAkhilesh Sanikop     data_ += kMaxCachedBits >> 3;
1059*09537850SAkhilesh Sanikop     value = HostToBigEndian(value) ^ -1;
1060*09537850SAkhilesh Sanikop     value >>= kWindowSize - kMaxCachedBits;
1061*09537850SAkhilesh Sanikop     window_diff_ = value | (window_diff_ << kMaxCachedBits);
1062*09537850SAkhilesh Sanikop     bits_ += kMaxCachedBits;
1063*09537850SAkhilesh Sanikop     return;
1064*09537850SAkhilesh Sanikop   }
1065*09537850SAkhilesh Sanikop #endif
1066*09537850SAkhilesh Sanikop 
1067*09537850SAkhilesh Sanikop   const uint8_t* data = data_;
1068*09537850SAkhilesh Sanikop   int bits = bits_;
1069*09537850SAkhilesh Sanikop   WindowSize window_diff = window_diff_;
1070*09537850SAkhilesh Sanikop 
1071*09537850SAkhilesh Sanikop   int count = kWindowSize - 9 - (bits + 15);
1072*09537850SAkhilesh Sanikop   // The fast path above, if compiled, would cause clang 8.0.7 to vectorize
1073*09537850SAkhilesh Sanikop   // this loop. Since -15 <= bits_ <= -1, this loop has at most 6 or 7
1074*09537850SAkhilesh Sanikop   // iterations when WindowSize is 64 bits. So it is not profitable to
1075*09537850SAkhilesh Sanikop   // vectorize this loop. Note that clang 8.0.7 does not vectorize this loop if
1076*09537850SAkhilesh Sanikop   // the fast path above is not compiled.
1077*09537850SAkhilesh Sanikop 
1078*09537850SAkhilesh Sanikop #ifdef __clang__
1079*09537850SAkhilesh Sanikop #pragma clang loop vectorize(disable) interleave(disable)
1080*09537850SAkhilesh Sanikop #endif
1081*09537850SAkhilesh Sanikop   for (; count >= 0 && data < data_end_; count -= 8) {
1082*09537850SAkhilesh Sanikop     const uint8_t value = *data++ ^ -1;
1083*09537850SAkhilesh Sanikop     window_diff = static_cast<WindowSize>(value) | (window_diff << 8);
1084*09537850SAkhilesh Sanikop     bits += 8;
1085*09537850SAkhilesh Sanikop   }
1086*09537850SAkhilesh Sanikop   assert(bits <= kMaxCachedBits);
1087*09537850SAkhilesh Sanikop   if (data == data_end_) {
1088*09537850SAkhilesh Sanikop     // Shift in some 1s. This is equivalent to providing fake 0 data bits.
1089*09537850SAkhilesh Sanikop     window_diff = ((window_diff + 1) << (kMaxCachedBits - bits)) - 1;
1090*09537850SAkhilesh Sanikop     bits = kMaxCachedBits;
1091*09537850SAkhilesh Sanikop   }
1092*09537850SAkhilesh Sanikop 
1093*09537850SAkhilesh Sanikop   data_ = data;
1094*09537850SAkhilesh Sanikop   bits_ = bits;
1095*09537850SAkhilesh Sanikop   window_diff_ = window_diff;
1096*09537850SAkhilesh Sanikop }
1097*09537850SAkhilesh Sanikop 
NormalizeRange()1098*09537850SAkhilesh Sanikop void EntropyDecoder::NormalizeRange() {
1099*09537850SAkhilesh Sanikop   const int bits_used = 15 ^ FloorLog2(values_in_range_);
1100*09537850SAkhilesh Sanikop   bits_ -= bits_used;
1101*09537850SAkhilesh Sanikop   values_in_range_ <<= bits_used;
1102*09537850SAkhilesh Sanikop   if (bits_ < 0) PopulateBits();
1103*09537850SAkhilesh Sanikop }
1104*09537850SAkhilesh Sanikop 
1105*09537850SAkhilesh Sanikop // Explicit instantiations.
1106*09537850SAkhilesh Sanikop template int EntropyDecoder::ReadSymbol<3>(uint16_t* cdf);
1107*09537850SAkhilesh Sanikop template int EntropyDecoder::ReadSymbol<4>(uint16_t* cdf);
1108*09537850SAkhilesh Sanikop template int EntropyDecoder::ReadSymbol<5>(uint16_t* cdf);
1109*09537850SAkhilesh Sanikop template int EntropyDecoder::ReadSymbol<6>(uint16_t* cdf);
1110*09537850SAkhilesh Sanikop template int EntropyDecoder::ReadSymbol<7>(uint16_t* cdf);
1111*09537850SAkhilesh Sanikop template int EntropyDecoder::ReadSymbol<8>(uint16_t* cdf);
1112*09537850SAkhilesh Sanikop template int EntropyDecoder::ReadSymbol<9>(uint16_t* cdf);
1113*09537850SAkhilesh Sanikop template int EntropyDecoder::ReadSymbol<10>(uint16_t* cdf);
1114*09537850SAkhilesh Sanikop template int EntropyDecoder::ReadSymbol<11>(uint16_t* cdf);
1115*09537850SAkhilesh Sanikop template int EntropyDecoder::ReadSymbol<12>(uint16_t* cdf);
1116*09537850SAkhilesh Sanikop template int EntropyDecoder::ReadSymbol<13>(uint16_t* cdf);
1117*09537850SAkhilesh Sanikop template int EntropyDecoder::ReadSymbol<14>(uint16_t* cdf);
1118*09537850SAkhilesh Sanikop template int EntropyDecoder::ReadSymbol<16>(uint16_t* cdf);
1119*09537850SAkhilesh Sanikop 
1120*09537850SAkhilesh Sanikop }  // namespace libgav1
1121