xref: /aosp_15_r20/external/lzma/C/ZstdDec.c (revision f6dc9357d832569d4d1f5d24eacdb3935a1ae8e6)
1 /* ZstdDec.c -- Zstd Decoder
2 2024-06-18 : the code was developed by Igor Pavlov, using Zstandard format
3              specification and original zstd decoder code as reference code.
4 original zstd decoder code: Copyright (c) Facebook, Inc. All rights reserved.
5 This source code is licensed under BSD 3-Clause License.
6 */
7 
8 #include "Precomp.h"
9 
10 #include <string.h>
11 #include <stdlib.h>
12 // #include <stdio.h>
13 
14 #include "Alloc.h"
15 #include "Xxh64.h"
16 #include "ZstdDec.h"
17 #include "CpuArch.h"
18 
19 #if defined(MY_CPU_ARM64)
20 #include <arm_neon.h>
21 #endif
22 
/* original-zstd still doesn't support window larger than 2 GiB.
   So we also limit our decoder for 2 GiB window: */
#if defined(MY_CPU_64BIT) && 0 == 1
  #define MAX_WINDOW_SIZE_LOG  41
#else
  #define MAX_WINDOW_SIZE_LOG  31
#endif

// offset type that can address any position inside the sliding window;
// 32-bit is sufficient while MAX_WINDOW_SIZE_LOG < 32
typedef
  #if MAX_WINDOW_SIZE_LOG < 32
    UInt32
  #else
    size_t
  #endif
    CZstdDecOffset;
38 
// for debug: simpler and smaller code but slow:
// #define Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS

// Optional decoder statistics (debug only): with SHOW_STAT defined the
// g_Num_* counters below are updated through STAT_INC / STAT_UPDATE;
// otherwise both macros expand to nothing.
// #define SHOW_STAT
#ifdef SHOW_STAT
#include <stdio.h>
static unsigned g_Num_Blocks_Compressed = 0;
static unsigned g_Num_Blocks_memcpy = 0;
static unsigned g_Num_Wrap_memmove_Num = 0;
static unsigned g_Num_Wrap_memmove_Bytes = 0;
static unsigned g_NumSeqs_total = 0;
// static unsigned g_NumCopy = 0;
static unsigned g_NumOver = 0;
static unsigned g_NumOver2 = 0;
static unsigned g_Num_Match = 0;
static unsigned g_Num_Lits = 0;
static unsigned g_Num_LitsBig = 0;
static unsigned g_Num_Lit0 = 0;
static unsigned g_Num_Rep0 = 0;
static unsigned g_Num_Rep1 = 0;
static unsigned g_Num_Rep2 = 0;
static unsigned g_Num_Rep3 = 0;
static unsigned g_Num_Threshold_0 = 0;
static unsigned g_Num_Threshold_1 = 0;
static unsigned g_Num_Threshold_0sum = 0;
static unsigned g_Num_Threshold_1sum = 0;
#define STAT_UPDATE(v) v
#else
#define STAT_UPDATE(v)
#endif
#define STAT_INC(v)  STAT_UPDATE(v++;)
70 
71 
// input stream descriptor: current read position and remaining byte count
typedef struct
{
  const Byte *ptr;
  size_t len;
}
CInBufPair;
78 
79 
// Feature detection for a fast highest-set-bit primitive:
// Z7_ZSTD_DEC_USE_BSR is defined when the compiler provides a
// count-leading-zeros / bit-scan intrinsic.
#if defined(MY_CPU_ARM_OR_ARM64) || defined(MY_CPU_X86_OR_AMD64)
  #if (defined(__clang__) && (__clang_major__ >= 6)) \
   || (defined(__GNUC__) && (__GNUC__ >= 6))
    // disable for debug:
    #define Z7_ZSTD_DEC_USE_BSR
  #elif defined(_MSC_VER) && (_MSC_VER >= 1300)
    // #if defined(MY_CPU_ARM_OR_ARM64)
    #if (_MSC_VER >= 1600)
      #include <intrin.h>
    #endif
    // disable for debug:
    #define Z7_ZSTD_DEC_USE_BSR
  #endif
#endif

#ifdef Z7_ZSTD_DEC_USE_BSR
  #if defined(__clang__) || defined(__GNUC__)
    #define MY_clz(x)  ((unsigned)__builtin_clz((UInt32)x))
  #else  // #if defined(_MSC_VER)
    #ifdef MY_CPU_ARM_OR_ARM64
      #define MY_clz  _CountLeadingZeros
    #endif // MY_CPU_ARM_OR_ARM64
  #endif // _MSC_VER
#elif !defined(Z7_ZSTD_DEC_USE_LOG_TABLE)
  // no intrinsic available: fall back to the lookup-table variant below
  #define Z7_ZSTD_DEC_USE_LOG_TABLE
#endif
106 
107 
108 static
109 Z7_FORCE_INLINE
GetHighestSetBit_32_nonzero_big(UInt32 num)110 unsigned GetHighestSetBit_32_nonzero_big(UInt32 num)
111 {
112   // (num != 0)
113   #ifdef MY_clz
114     return 31 - MY_clz(num);
115   #elif defined(Z7_ZSTD_DEC_USE_BSR)
116   {
117     unsigned long zz;
118     _BitScanReverse(&zz, num);
119     return zz;
120   }
121   #else
122   {
123     int i = -1;
124     for (;;)
125     {
126       i++;
127       num >>= 1;
128       if (num == 0)
129         return (unsigned)i;
130     }
131   }
132   #endif
133 }
134 
#ifdef Z7_ZSTD_DEC_USE_LOG_TABLE

// R<n>(a) expands to (1 << n) copies of (a):
#define R1(a)  a, a
#define R2(a)  R1(a), R1(a)
#define R3(a)  R2(a), R2(a)
#define R4(a)  R3(a), R3(a)
#define R5(a)  R4(a), R4(a)
#define R6(a)  R5(a), R5(a)
#define R7(a)  R6(a), R6(a)
#define R8(a)  R7(a), R7(a)
#define R9(a)  R8(a), R8(a)

#define Z7_ZSTD_FSE_MAX_ACCURACY  9
// states[] values in FSE_Generate() can use (Z7_ZSTD_FSE_MAX_ACCURACY + 1) bits.
// lookup table: k_zstd_LogTable[i] == floor(log2(i)) for (i != 0)
static const Byte k_zstd_LogTable[2 << Z7_ZSTD_FSE_MAX_ACCURACY] =
{
  R1(0), R1(1), R2(2), R3(3), R4(4), R5(5), R6(6), R7(7), R8(8), R9(9)
};

#define GetHighestSetBit_32_nonzero_small(num)  (k_zstd_LogTable[num])
#else
#define GetHighestSetBit_32_nonzero_small  GetHighestSetBit_32_nonzero_big
#endif
158 
159 
// UPDATE_BIT_OFFSET_FOR_PADDING locates the highest set bit of the last
// byte (b) of a bit stream (the padding marker) and moves (bitOffset)
// down so it points just below that marker.
// (b) is a byte value (b < 256), so MY_clz(b) >= 24 and the adjustment
// (MY_clz(b) - 23) is in the range [1, 8] bits.
#ifdef MY_clz
  #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \
    bitOffset -= (CBitCtr)(MY_clz(b) - 23);
#elif defined(Z7_ZSTD_DEC_USE_BSR)
  #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \
    { unsigned long zz;  _BitScanReverse(&zz, b);  bitOffset -= 8;  bitOffset += zz; }
#else
  #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \
    for (;;) { bitOffset--;  if (b & 0x80) { break; }  b <<= 1; }
#endif

// sets (bitOffset) to the bit position (from the start of src) of the
// padding marker at the end of the stream.
// a zero last byte has no marker, so the stream is rejected as invalid.
#define SET_bitOffset_TO_PAD(bitOffset, src, srcLen) \
{ \
  unsigned lastByte = (src)[(size_t)(srcLen) - 1]; \
  if (lastByte == 0) return SZ_ERROR_DATA; \
  bitOffset = (CBitCtr)((srcLen) * 8); \
  UPDATE_BIT_OFFSET_FOR_PADDING(lastByte, bitOffset) \
}

#ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS

// same as SET_bitOffset_TO_PAD, but (srcLen_res) is also converted to bits
#define SET_bitOffset_TO_PAD_and_SET_BIT_SIZE(bitOffset, src, srcLen_res) \
{ \
  unsigned lastByte = (src)[(size_t)(srcLen_res) - 1]; \
  if (lastByte == 0) return SZ_ERROR_DATA; \
  srcLen_res *= 8; \
  bitOffset = (CBitCtr)srcLen_res; \
  UPDATE_BIT_OFFSET_FOR_PADDING(lastByte, bitOffset) \
}

#endif

/*
typedef Int32 CBitCtr_signed;
typedef Int32 CBitCtr;
*/
// /*
// bit counters can go negative during decoding, so signed types are used
typedef ptrdiff_t CBitCtr_signed;
typedef ptrdiff_t CBitCtr;
// */
200 
201 
#define MATCH_LEN_MIN  3
#define kBlockSizeMax  (1u << 17)

// #define Z7_ZSTD_DEC_PRINT_TABLE

#ifdef Z7_ZSTD_DEC_PRINT_TABLE
#define NUM_OFFSET_SYMBOLS_PREDEF 29
#endif
#define NUM_OFFSET_SYMBOLS_MAX    (MAX_WINDOW_SIZE_LOG + 1)  // 32
#define NUM_LL_SYMBOLS            36
#define NUM_ML_SYMBOLS            53
#define FSE_NUM_SYMBOLS_MAX       53  // NUM_ML_SYMBOLS

// Placement of the SEQ_*_EXTRA / SEQ_*_BASES tables:
//   default          : global const tables (GLOBAL_TABLE)
//   ..._IN_OBJECT    : per-decoder copies inside the object (p->m_*)
//   ..._LOCAL (debug): on-stack copies (a_*)
// /*
#if !defined(MY_CPU_X86) || defined(__PIC__) || defined(MY_CPU_64BIT)
#define Z7_ZSTD_DEC_USE_BASES_IN_OBJECT
#endif
// */
// for debug:
// #define Z7_ZSTD_DEC_USE_BASES_LOCAL
// #define Z7_ZSTD_DEC_USE_BASES_IN_OBJECT

#define GLOBAL_TABLE(n)  k_ ## n

#if defined(Z7_ZSTD_DEC_USE_BASES_LOCAL)
  #define BASES_TABLE(n)  a_ ## n
#elif defined(Z7_ZSTD_DEC_USE_BASES_IN_OBJECT)
  #define BASES_TABLE(n)  p->m_ ## n
#else
  #define BASES_TABLE(n)  GLOBAL_TABLE(n)
#endif

#define Z7_ZSTD_DEC_USE_ML_PLUS3

#if defined(Z7_ZSTD_DEC_USE_BASES_LOCAL) || \
    defined(Z7_ZSTD_DEC_USE_BASES_IN_OBJECT)

// declares the non-global copies of the EXTRA/BASES tables
#define SEQ_EXTRA_TABLES(n) \
  Byte   n ## SEQ_LL_EXTRA [NUM_LL_SYMBOLS]; \
  Byte   n ## SEQ_ML_EXTRA [NUM_ML_SYMBOLS]; \
  UInt32 n ## SEQ_LL_BASES [NUM_LL_SYMBOLS]; \
  UInt32 n ## SEQ_ML_BASES [NUM_ML_SYMBOLS]; \

#define Z7_ZSTD_DEC_USE_BASES_CALC

#ifdef Z7_ZSTD_DEC_USE_BASES_CALC

  // computes the BASES copy at runtime from the EXTRA table:
  // base[i + 1] == base[i] + (1 << extra[i]), starting from (startSum)
  #define FILL_LOC_BASES(n, startSum) \
    { unsigned i; UInt32 sum = startSum; \
      for (i = 0; i != Z7_ARRAY_SIZE(GLOBAL_TABLE(n ## _EXTRA)); i++) \
      { const unsigned a = GLOBAL_TABLE(n ## _EXTRA)[i]; \
        BASES_TABLE(n ## _BASES)[i] = sum; \
        /* if (sum != GLOBAL_TABLE(n ## _BASES)[i]) exit(1); */ \
        sum += (UInt32)1 << a; \
        BASES_TABLE(n ## _EXTRA)[i] = (Byte)a; }}

  #define FILL_LOC_BASES_ALL \
      FILL_LOC_BASES (SEQ_LL, 0) \
      FILL_LOC_BASES (SEQ_ML, MATCH_LEN_MIN) \

#else
  // just copies the precomputed global tables
  #define COPY_GLOBAL_ARR(n)  \
    memcpy(BASES_TABLE(n), GLOBAL_TABLE(n), sizeof(GLOBAL_TABLE(n)));
  #define FILL_LOC_BASES_ALL \
    COPY_GLOBAL_ARR (SEQ_LL_EXTRA) \
    COPY_GLOBAL_ARR (SEQ_ML_EXTRA) \
    COPY_GLOBAL_ARR (SEQ_LL_BASES) \
    COPY_GLOBAL_ARR (SEQ_ML_BASES) \

#endif

#endif
274 
275 
276 
/// The sequence decoding baseline and number of additional bits to read/add
#if !defined(Z7_ZSTD_DEC_USE_BASES_CALC)
// literal-length baseline per LL code
static const UInt32 GLOBAL_TABLE(SEQ_LL_BASES) [NUM_LL_SYMBOLS] =
{
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
  16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
  0x2000, 0x4000, 0x8000, 0x10000
};
#endif

// number of extra bits to read per LL code
static const Byte GLOBAL_TABLE(SEQ_LL_EXTRA) [NUM_LL_SYMBOLS] =
{
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12,
  13, 14, 15, 16
};

#if !defined(Z7_ZSTD_DEC_USE_BASES_CALC)
// match-length baseline per ML code (minimum match length is 3)
static const UInt32 GLOBAL_TABLE(SEQ_ML_BASES) [NUM_ML_SYMBOLS] =
{
  3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
  19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
  35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
  0x1003, 0x2003, 0x4003, 0x8003, 0x10003
};
#endif

// number of extra bits to read per ML code
static const Byte GLOBAL_TABLE(SEQ_ML_EXTRA) [NUM_ML_SYMBOLS] =
{
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11,
  12, 13, 14, 15, 16
};


#ifdef Z7_ZSTD_DEC_PRINT_TABLE

// predefined normalized frequency distributions for the default FSE
// tables; used only by Print_Predef() to regenerate and verify the
// hard-coded k_PredefRecords_* tables below.
static const Int16 SEQ_LL_PREDEF_DIST [NUM_LL_SYMBOLS] =
{
  4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
 -1,-1,-1,-1
};
static const Int16 SEQ_OFFSET_PREDEF_DIST [NUM_OFFSET_SYMBOLS_PREDEF] =
{
  1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1,-1,-1,-1,-1,-1
};
static const Int16 SEQ_ML_PREDEF_DIST [NUM_ML_SYMBOLS] =
{
  1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,
 -1,-1,-1,-1,-1
};

#endif
335 
// typedef int FastInt;
// typedef Int32 FastInt32;
typedef unsigned FastInt;
typedef UInt32 FastInt32;
typedef FastInt32 CFseRecord;


// CFseRecord bit layout:
//   bits  0.. 7 : symbol
//   bits  8..15 : number of bits to read for the next state
//   bits 16..31 : next-state base
#define FSE_REC_LEN_OFFSET    8
#define FSE_REC_STATE_OFFSET  16
#define GET_FSE_REC_SYM(st)   ((Byte)(st))
#define GET_FSE_REC_LEN(st)   ((Byte)((st) >> FSE_REC_LEN_OFFSET))
#define GET_FSE_REC_STATE(st) ((st) >> FSE_REC_STATE_OFFSET)

// #define FSE_REC_SYM_MASK      (0xff)
// #define GET_FSE_REC_SYM(st)   (st & FSE_REC_SYM_MASK)

// W() builds one precomputed decode record from (state, len, sym).
// The k_PredefRecords_* tables below are the decode tables for the
// zstd predefined (default) FSE distributions; they can be regenerated
// and verified with Print_Predef() (Z7_ZSTD_DEC_PRINT_TABLE).
#define W_BASE(state, len, sym) \
    (((UInt32)state << (4 + FSE_REC_STATE_OFFSET)) + \
    (len << FSE_REC_LEN_OFFSET) + (sym))
#define W(state, len, sym)  W_BASE(state, len, sym)
static const CFseRecord k_PredefRecords_LL[1 << 6] = {
W(0,4, 0),W(1,4, 0),W(2,5, 1),W(0,5, 3),W(0,5, 4),W(0,5, 6),W(0,5, 7),W(0,5, 9),
W(0,5,10),W(0,5,12),W(0,6,14),W(0,5,16),W(0,5,18),W(0,5,19),W(0,5,21),W(0,5,22),
W(0,5,24),W(2,5,25),W(0,5,26),W(0,6,27),W(0,6,29),W(0,6,31),W(2,4, 0),W(0,4, 1),
W(0,5, 2),W(2,5, 4),W(0,5, 5),W(2,5, 7),W(0,5, 8),W(2,5,10),W(0,5,11),W(0,6,13),
W(2,5,16),W(0,5,17),W(2,5,19),W(0,5,20),W(2,5,22),W(0,5,23),W(0,4,25),W(1,4,25),
W(2,5,26),W(0,6,28),W(0,6,30),W(3,4, 0),W(1,4, 1),W(2,5, 2),W(2,5, 3),W(2,5, 5),
W(2,5, 6),W(2,5, 8),W(2,5, 9),W(2,5,11),W(2,5,12),W(0,6,15),W(2,5,17),W(2,5,18),
W(2,5,20),W(2,5,21),W(2,5,23),W(2,5,24),W(0,6,35),W(0,6,34),W(0,6,33),W(0,6,32)
};
static const CFseRecord k_PredefRecords_OF[1 << 5] = {
W(0,5, 0),W(0,4, 6),W(0,5, 9),W(0,5,15),W(0,5,21),W(0,5, 3),W(0,4, 7),W(0,5,12),
W(0,5,18),W(0,5,23),W(0,5, 5),W(0,4, 8),W(0,5,14),W(0,5,20),W(0,5, 2),W(1,4, 7),
W(0,5,11),W(0,5,17),W(0,5,22),W(0,5, 4),W(1,4, 8),W(0,5,13),W(0,5,19),W(0,5, 1),
W(1,4, 6),W(0,5,10),W(0,5,16),W(0,5,28),W(0,5,27),W(0,5,26),W(0,5,25),W(0,5,24)
};
#if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
// ML records store (sym + MATCH_LEN_MIN) so the decoder can skip the add
#undef W
#define W(state, len, sym)  W_BASE(state, len, (sym + MATCH_LEN_MIN))
#endif
static const CFseRecord k_PredefRecords_ML[1 << 6] = {
W(0,6, 0),W(0,4, 1),W(2,5, 2),W(0,5, 3),W(0,5, 5),W(0,5, 6),W(0,5, 8),W(0,6,10),
W(0,6,13),W(0,6,16),W(0,6,19),W(0,6,22),W(0,6,25),W(0,6,28),W(0,6,31),W(0,6,33),
W(0,6,35),W(0,6,37),W(0,6,39),W(0,6,41),W(0,6,43),W(0,6,45),W(1,4, 1),W(0,4, 2),
W(2,5, 3),W(0,5, 4),W(2,5, 6),W(0,5, 7),W(0,6, 9),W(0,6,12),W(0,6,15),W(0,6,18),
W(0,6,21),W(0,6,24),W(0,6,27),W(0,6,30),W(0,6,32),W(0,6,34),W(0,6,36),W(0,6,38),
W(0,6,40),W(0,6,42),W(0,6,44),W(2,4, 1),W(3,4, 1),W(1,4, 2),W(2,5, 4),W(2,5, 5),
W(2,5, 7),W(2,5, 8),W(0,6,11),W(0,6,14),W(0,6,17),W(0,6,20),W(0,6,23),W(0,6,26),
W(0,6,29),W(0,6,52),W(0,6,51),W(0,6,50),W(0,6,49),W(0,6,48),W(0,6,47),W(0,6,46)
};
386 
387 
// sum of freqs[] must be correct
// (numSyms != 0)
// (accuracy >= 5)
/* Builds the FSE decode table of (1 << accuracy) CFseRecord entries
   from the normalized frequencies freqs[0 .. numSyms - 1].
   (delta) is added to the symbol value stored in each record
   (MATCH_LEN_MIN for ML symbols when Z7_ZSTD_DEC_USE_ML_PLUS3 is defined,
   0 otherwise). */
static
Z7_NO_INLINE
// Z7_FORCE_INLINE
void FSE_Generate(CFseRecord *table,
    const Int16 *const freqs, const size_t numSyms,
    const unsigned accuracy, UInt32 delta)
{
  size_t size = (size_t)1 << accuracy;
  // max value in states[x] is ((1 << accuracy) * 2)
  UInt16 states[FSE_NUM_SYMBOLS_MAX];
  {
    /* Symbols with "less than 1" probability get a single cell,
       starting from the end of the table.
       These symbols define a full state reset, reading (accuracy) bits. */
    size_t threshold = size;
    {
      size_t s = 0;
      do
        if (freqs[s] == -1)
        {
          table[--threshold] = (CFseRecord)s;
          states[s] = 1;
        }
      while (++s != numSyms);
    }

    #ifdef SHOW_STAT
    if (threshold == size)
    {
      STAT_INC(g_Num_Threshold_0)
      STAT_UPDATE(g_Num_Threshold_0sum += (unsigned)size;)
    }
    else
    {
      STAT_INC(g_Num_Threshold_1)
      STAT_UPDATE(g_Num_Threshold_1sum += (unsigned)size;)
    }
    #endif

    // { unsigned uuu; for (uuu = 0; uuu < 400; uuu++)
    {
      // Each (symbol) gets freqs[symbol] cells.
      // Cell allocation is spread, not linear.
      const size_t step = (size >> 1) + (size >> 3) + 3;
      size_t pos = 0;
      // const unsigned mask = size - 1;
      /*
      if (threshold == size)
      {
        size_t s = 0;
        size--;
        do
        {
          int freq = freqs[s];
          if (freq <= 0)
            continue;
          states[s] = (UInt16)freq;
          do
          {
            table[pos] (CFseRecord)s;
            pos = (pos + step) & size; // & mask;
          }
          while (--freq);
        }
        while (++s != numSyms);
      }
      else
      */
      {
        size_t s = 0;
        size--;   // (size) is used as mask (size is a power of 2 here)
        do
        {
          int freq = freqs[s];
          if (freq <= 0)
            continue;
          states[s] = (UInt16)freq;
          do
          {
            table[pos] = (CFseRecord)s;
            // we skip position, if it's already occupied by a "less than 1" probability symbol.
            // (step) is coprime to table size, so the cycle will visit each position exactly once
            do
              pos = (pos + step) & size; // & mask;
            while (pos >= threshold);
          }
          while (--freq);
        }
        while (++s != numSyms);
      }
      size++;
      // (pos != 0) is unexpected case that means that freqs[] are not correct.
      // so it's some failure in code (for example, incorrect predefined freq[] table)
      // if (pos != 0) return SZ_ERROR_FAIL;
    }
    // }
  }
  {
    // Second pass: convert the raw symbol indices written above into
    // packed CFseRecord values (symbol, bit count, next-state base).
    const CFseRecord * const limit = table + size;
    delta = ((UInt32)size << FSE_REC_STATE_OFFSET) - delta;
    /* State increases by symbol over time, decreasing number of bits.
       Baseline increases until the bit threshold is passed, at which point it resets to 0 */
    do
    {
      #define TABLE_ITER(a) \
      { \
        const FastInt sym = (FastInt)table[a]; \
        const unsigned nextState = states[sym]; \
        unsigned nb; \
        states[sym] = (UInt16)(nextState + 1); \
        nb = accuracy - GetHighestSetBit_32_nonzero_small(nextState); \
        table[a] = (CFseRecord)(sym - delta \
            + ((UInt32)nb << FSE_REC_LEN_OFFSET) \
            + ((UInt32)nextState << FSE_REC_STATE_OFFSET << nb)); \
      }
      TABLE_ITER(0)
      TABLE_ITER(1)
      table += 2;
    }
    while (table != limit);
  }
}
513 
514 
#ifdef Z7_ZSTD_DEC_PRINT_TABLE

/* Debug helper: regenerates an FSE decode table from a predefined
   frequency distribution, verifies it matches the hard-coded
   (checkTable), and prints it in W(state,len,sym) source form.
   exit(1) on any mismatch. */
static void Print_Predef(unsigned predefAccuracy,
    const unsigned numSymsPredef,
    const Int16 * const predefFreqs,
    const CFseRecord *checkTable)
{
  CFseRecord table[1 << 6];
  unsigned i;
  FSE_Generate(table, predefFreqs, numSymsPredef, predefAccuracy,
        #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
          numSymsPredef == NUM_ML_SYMBOLS ? MATCH_LEN_MIN :
        #endif
          0
    );
  if (memcmp(table, checkTable, sizeof(UInt32) << predefAccuracy) != 0)
    exit(1);
  for (i = 0; i < (1u << predefAccuracy); i++)
  {
    const UInt32 v = table[i];
    const unsigned state = (unsigned)(GET_FSE_REC_STATE(v));
    // state base must be a multiple of 16 for the W() encoding
    if (state & 0xf)
      exit(1);
    if (i != 0)
    {
      printf(",");
      if (i % 8 == 0)
        printf("\n");
    }
    printf("W(%d,%d,%2d)",
        (unsigned)(state >> 4),
        (unsigned)((v >> FSE_REC_LEN_OFFSET) & 0xff),
        (unsigned)GET_FSE_REC_SYM(v));
  }
  printf("\n\n");
}

#endif
553 
554 
// unaligned little-endian reads
#define GET16(dest, p)  { const Byte *ptr = p;  dest = GetUi16(ptr); }
#define GET32(dest, p)  { const Byte *ptr = p;  dest = GetUi32(ptr); }

// Forward bit reader used by FSE_DecodeHeader():
// (src) points to the last input byte, (bitOffset) is negative and
// relative to it; reading past the end makes (bitOffset >> 3) >= 0,
// which is reported as SZ_ERROR_DATA.
// (1 <= numBits <= 9)
#define FORWARD_READ_BITS(destVal, numBits, mask) \
  { const CBitCtr_signed bos3 = (bitOffset) >> 3; \
    if (bos3 >= 0) return SZ_ERROR_DATA; \
    GET16(destVal, src + bos3) \
    destVal >>= (bitOffset) & 7; \
    bitOffset += (CBitCtr_signed)(numBits); \
    mask = (1u << (numBits)) - 1; \
    destVal &= mask; \
  }

// reads a single bit with the same bounds check
#define FORWARD_READ_1BIT(destVal) \
  { const CBitCtr_signed bos3 = (bitOffset) >> 3; \
    if (bos3 >= 0) return SZ_ERROR_DATA; \
    destVal = *(src + bos3); \
    destVal >>= (bitOffset) & 7; \
    (bitOffset)++; \
    destVal &= 1; \
  }
577 
578 
// in: (accuracyMax <= 9)
// at least 2 bytes will be processed from (in) stream.
// at return: (in->len > 0)
/* Parses an FSE table description from the input stream:
   reads the 4-bit accuracy_log (stored as accuracy - 5), then the
   variable-bit-width symbol probabilities, and builds the decode table
   via FSE_Generate().  (*accuracyRes) receives the accuracy read.
   (in) is advanced past the consumed header bytes.
   Returns SZ_ERROR_DATA for any malformed header. */
static
Z7_NO_INLINE
SRes FSE_DecodeHeader(CFseRecord *const table,
    CInBufPair *const in,
    const unsigned accuracyMax,
    Byte *const accuracyRes,
    unsigned numSymbolsMax)
{
  unsigned accuracy;
  unsigned remain1;
  unsigned syms;
  Int16 freqs[FSE_NUM_SYMBOLS_MAX + 3]; // +3 for overwrite (repeat)
  const Byte *src = in->ptr;
  CBitCtr_signed bitOffset = (CBitCtr_signed)in->len - 1;
  if (bitOffset <= 0)
    return SZ_ERROR_DATA;
  accuracy = *src & 0xf;
  accuracy += 5;
  if (accuracy > accuracyMax)
    return SZ_ERROR_DATA;
  *accuracyRes = (Byte)accuracy;
  remain1 = (1u << accuracy) + 1; // (it's remain_freqs_sum + 1)
  syms = 0;
  src += bitOffset;  // src points to last byte
  bitOffset = 4 - (bitOffset << 3);

  for (;;)
  {
    // (2 <= remain1)
    const unsigned bits = GetHighestSetBit_32_nonzero_small((unsigned)remain1);
    // (1 <= bits <= accuracy)
    unsigned val; // it must be unsigned or int
    unsigned mask;
    FORWARD_READ_BITS(val, bits, mask)
    {
      // values near the top of the range need one extra bit
      const unsigned val2 = remain1 + val - mask;
      if (val2 > mask)
      {
        unsigned bit;
        FORWARD_READ_1BIT(bit)
        if (bit)
          val = val2;
      }
    }
    {
      // (remain1 >= 2)
      // (0 <= (int)val <= remain1)
      val = (unsigned)((int)val - 1);
      // val now is "probability" of symbol
      // (probability == -1) means "less than 1" frequency.
      // (-1 <= (int)val <= remain1 - 1)
      freqs[syms++] = (Int16)(int)val;
      if (val != 0)
      {
        remain1 -= (int)val < 0 ? 1u : (unsigned)val;
        // remain1 -= val;
        // val >>= (sizeof(val) * 8 - 2);
        // remain1 -= val & 2;
        // freqs[syms++] = (Int16)(int)val;
        // syms++;
        if (remain1 == 1)
          break;
        if (syms >= FSE_NUM_SYMBOLS_MAX)
          return SZ_ERROR_DATA;
      }
      else // if (val == 0)
      {
        // a zero probability is followed by 2-bit repeat counts:
        // each value 0..2 adds that many more zero-probability symbols;
        // value 3 adds three and continues with another repeat field.
        // freqs[syms++] = 0;
        // syms++;
        for (;;)
        {
          unsigned repeat;
          FORWARD_READ_BITS(repeat, 2, mask)
          freqs[syms    ] = 0;
          freqs[syms + 1] = 0;
          freqs[syms + 2] = 0;
          syms += repeat;
          if (syms >= FSE_NUM_SYMBOLS_MAX)
            return SZ_ERROR_DATA;
          if (repeat != 3)
            break;
        }
      }
    }
  }

  if (syms > numSymbolsMax)
    return SZ_ERROR_DATA;
  // round the consumed bit count up to whole bytes and advance (in)
  bitOffset += 7;
  bitOffset >>= 3;
  if (bitOffset > 0)
    return SZ_ERROR_DATA;
  in->ptr = src + bitOffset;
  in->len = (size_t)(1 - bitOffset);
  {
    // unsigned uuu; for (uuu = 0; uuu < 50; uuu++)
    FSE_Generate(table, freqs, syms, accuracy,
        #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
          numSymbolsMax == NUM_ML_SYMBOLS ? MATCH_LEN_MIN :
        #endif
          0
        );
  }
  return SZ_OK;
}
687 
688 
// ---------- HUFFMAN ----------

#define HUF_MAX_BITS    12
#define HUF_MAX_SYMBS   256
// gap between the lengths area and the symbols area of the table
#define HUF_DUMMY_SIZE  (128 + 8 * 2)  // it must multiple of 8
// #define HUF_DUMMY_SIZE 0
#define HUF_TABLE_SIZE  ((2 << HUF_MAX_BITS) + HUF_DUMMY_SIZE)
#define HUF_GET_SYMBOLS(table)  ((table) + (1 << HUF_MAX_BITS) + HUF_DUMMY_SIZE)
// #define HUF_GET_LENS(table)  (table)

// Single-level Huffman decode table, indexed by a HUF_MAX_BITS-bit state:
//   first (1 << HUF_MAX_BITS) bytes : code length per state
//   symbols area (HUF_GET_SYMBOLS)  : decoded symbol per state
// stored as UInt64[] to guarantee 8-byte alignment for the fill loops.
typedef struct
{
  // Byte table[HUF_TABLE_SIZE];
  UInt64 table64[HUF_TABLE_SIZE / sizeof(UInt64)];
}
CZstdDecHufTable;
705 
/*
Input:
  numSyms != 0
  (bits) array size must be aligned for 2
  if (numSyms & 1), then bits[numSyms] == 0,
  Huffman tree must be correct before Huf_Build() call:
    (sum (1/2^bits[i]) == 1).
    && (bits[i] <= HUF_MAX_BITS)
Fills the CZstdDecHufTable contents (see layout above): for every
HUF_MAX_BITS-bit state, table[state] = code length and
symbols[state] = decoded symbol.
*/
static
Z7_FORCE_INLINE
void Huf_Build(Byte * const table,
    const Byte *bits, const unsigned numSyms)
{
  unsigned counts0[HUF_MAX_BITS + 1];
  unsigned counts1[HUF_MAX_BITS + 1];
  const Byte * const bitsEnd = bits + numSyms;
  // /*
  {
    unsigned t;
    for (t = 0; t < Z7_ARRAY_SIZE(counts0); t++) counts0[t] = 0;
    for (t = 0; t < Z7_ARRAY_SIZE(counts1); t++) counts1[t] = 0;
  }
  // */
  // memset(counts0, 0, sizeof(counts0));
  // memset(counts1, 0, sizeof(counts1));
  {
    // Pass 1: histogram of code lengths (two counters to reduce
    // dependency stalls; merged below).
    const Byte *bits2 = bits;
    // we access additional bits[symbol] if (numSyms & 1)
    do
    {
      counts0[bits2[0]]++;
      counts1[bits2[1]]++;
    }
    while ((bits2 += 2) < bitsEnd);
  }
  {
    // Pass 2: compute the start offset per code length (longest codes
    // first) and fill the lengths area of the table.
    unsigned r = 0;
    unsigned i = HUF_MAX_BITS;
    // Byte *lens = HUF_GET_LENS(symbols);
    do
    {
      const unsigned num = (counts0[i] + counts1[i]) << (HUF_MAX_BITS - i);
      counts0[i] = r;
      if (num)
      {
        Byte *lens = &table[r];
        r += num;
        memset(lens, (int)i, num);
      }
    }
    while (--i);
    counts0[0] = 0; // for speculated loads
    // no need for check:
    // if (r != (UInt32)1 << HUF_MAX_BITS) exit(0);
  }
  {
    // Pass 3: fill the symbols area; each symbol with length nb
    // occupies ((1 << HUF_MAX_BITS) >> nb) consecutive states.
    // (v) holds the symbol value replicated into every byte, so whole
    // words can be stored at once.
    #ifdef MY_CPU_64BIT
      UInt64
    #else
      UInt32
    #endif
        v = 0;
    Byte *symbols = HUF_GET_SYMBOLS(table);
    do
    {
      const unsigned nb = *bits++;
      if (nb)
      {
        const unsigned code = counts0[nb];
        const unsigned num = (1u << HUF_MAX_BITS) >> nb;
        counts0[nb] = code + num;
        // memset(&symbols[code], i, num);
        // /*
        {
          // overlapping stores: (num) is a power of 2, so the two
          // stores cover the range exactly for the small cases
          Byte *s2 = &symbols[code];
          if (num <= 2)
          {
            s2[0] = (Byte)v;
            s2[(size_t)num - 1] = (Byte)v;
          }
          else if (num <= 8)
          {
            *(UInt32 *)(void *)s2 = (UInt32)v;
            *(UInt32 *)(void *)(s2 + (size_t)num - 4) = (UInt32)v;
          }
          else
          {
            #ifdef MY_CPU_64BIT
              UInt64 *s = (UInt64 *)(void *)s2;
              const UInt64 *lim = (UInt64 *)(void *)(s2 + num);
              do
              {
                s[0] = v;  s[1] = v;  s += 2;
              }
              while (s != lim);
            #else
              UInt32 *s = (UInt32 *)(void *)s2;
              const UInt32 *lim = (const UInt32 *)(const void *)(s2 + num);
              do
              {
                s[0] = v;  s[1] = v;  s += 2;
                s[0] = v;  s[1] = v;  s += 2;
              }
              while (s != lim);
            #endif
          }
        }
        // */
      }
      // advance the replicated symbol value by 1 in every byte lane
      v +=
        #ifdef MY_CPU_64BIT
          0x0101010101010101;
        #else
          0x01010101;
        #endif
    }
    while (bits != bitsEnd);
  }
}
826 
827 
828 
// how many bytes (src) was moved back from original value.
// we need (HUF_SRC_OFFSET == 3) for optimized 32-bit memory access
#define HUF_SRC_OFFSET  3

// loads the next HUF_MAX_BITS-bit decoder state from the bit stream
// at (bitOffset), after skipping (numBits) consumed bits:
// v <<= 8 - (bitOffset & 7) + numBits;
// v >>= 32 - HUF_MAX_BITS;
#define HUF_GET_STATE(v, bitOffset, numBits) \
  GET32(v, src + (HUF_SRC_OFFSET - 3) + ((CBitCtr_signed)bitOffset >> 3)) \
  v >>= 32 - HUF_MAX_BITS - 8 + ((unsigned)bitOffset & 7) - numBits; \
  v &= (1u << HUF_MAX_BITS) - 1; \


#ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
#if defined(MY_CPU_AMD64) && defined(_MSC_VER) && _MSC_VER == 1400 \
  || !defined(MY_CPU_X86_OR_AMD64) \
  // || 1 == 1 /* for debug : to force STREAM4_PRELOAD mode */
  // we need big number (>=16) of registers for PRELOAD4
  #define Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4
  // #define Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2 // for debug
#endif
#endif

// for debug: simpler and smaller code but slow:
// #define Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE

#if  defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE) || \
    !defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS)

// decodes one symbol: table[v] is the code length (bits consumed),
// symbols[v] is the decoded byte; a negative bitOffset means the
// stream ran out of bits (corrupt data).
#define HUF_DECODE(bitOffset, dest) \
{ \
  UInt32 v; \
  HUF_GET_STATE(v, bitOffset, 0) \
  bitOffset -= table[v]; \
  *(dest) = symbols[v]; \
  if ((CBitCtr_signed)bitOffset < 0) return SZ_ERROR_DATA; \
}

#endif

#if !defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE) || \
     defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2) || \
     defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4) \

// variant that keeps the next state preloaded in (v) across iterations
#define HUF_DECODE_2_INIT(v, bitOffset) \
  HUF_GET_STATE(v, bitOffset, 0)

#define HUF_DECODE_2(v, bitOffset, dest) \
{ \
  unsigned numBits; \
  numBits = table[v]; \
  *(dest) = symbols[v]; \
  HUF_GET_STATE(v, bitOffset, numBits) \
  bitOffset -= (CBitCtr)numBits; \
  if ((CBitCtr_signed)bitOffset < 0) return SZ_ERROR_DATA; \
}

#endif
886 
887 
// src == ptr - HUF_SRC_OFFSET
// we are allowed to access 3 bytes before start of input buffer
/* Decodes one Huffman bit stream of (srcLen) bytes into exactly
   (destLen) output bytes using the decode table built by Huf_Build().
   Returns SZ_ERROR_DATA if the stream is malformed or not consumed
   exactly (bitOffset must end at 0). */
static
Z7_NO_INLINE
SRes Huf_Decompress_1stream(const Byte * const table,
    const Byte *src, const size_t srcLen,
    Byte *dest, const size_t destLen)
{
  CBitCtr bitOffset;
  if (srcLen == 0)
    return SZ_ERROR_DATA;
  SET_bitOffset_TO_PAD (bitOffset, src + HUF_SRC_OFFSET, srcLen)
  if (destLen)
  {
    const Byte *symbols = HUF_GET_SYMBOLS(table);
    const Byte *destLim = dest + destLen;
    #ifdef Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE
    {
      do
      {
        HUF_DECODE (bitOffset, dest)
      }
      while (++dest != destLim);
    }
    #else
    {
      UInt32 v;
      HUF_DECODE_2_INIT (v, bitOffset)
      do
      {
        HUF_DECODE_2 (v, bitOffset, dest)
      }
      while (++dest != destLim);
    }
    #endif
  }
  // the stream is valid only if every bit below the padding marker was used
  return bitOffset == 0 ? SZ_OK : SZ_ERROR_DATA;
}
926 
927 
928 // for debug : it reduces register pressure : by array copy can be slow :
929 // #define Z7_ZSTD_DEC_USE_HUF_LOCAL
930 
931 // src == ptr + (6 - HUF_SRC_OFFSET)
932 // srcLen >= 10
933 // we are allowed to access 3 bytes before start of input buffer
934 static
935 Z7_NO_INLINE
Huf_Decompress_4stream(const Byte * const table2,const Byte * src,size_t srcLen,Byte * dest,size_t destLen)936 SRes Huf_Decompress_4stream(const Byte * const
937   #ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL
938     table2,
939   #else
940     table,
941   #endif
942     const Byte *src, size_t srcLen,
943     Byte *dest, size_t destLen)
944 {
945  #ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL
946   Byte table[HUF_TABLE_SIZE];
947  #endif
948   UInt32 sizes[3];
949   const size_t delta = (destLen + 3) / 4;
950   if ((sizes[0] = GetUi16(src + (0 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA;
951   if ((sizes[1] = GetUi16(src + (2 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA;
952   sizes[1] += sizes[0];
953   if ((sizes[2] = GetUi16(src + (4 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA;
954   sizes[2] += sizes[1];
955   srcLen -= 6;
956   if (srcLen <= sizes[2])
957     return SZ_ERROR_DATA;
958 
959  #ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL
960   {
961     // unsigned i = 0; for(; i < 1000; i++)
962     memcpy(table, table2, HUF_TABLE_SIZE);
963   }
964  #endif
965 
966   #ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
967   {
968     CBitCtr bitOffset_0,
969             bitOffset_1,
970             bitOffset_2,
971             bitOffset_3;
972     {
973       SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_0, src + HUF_SRC_OFFSET, sizes[0])
974       SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_1, src + HUF_SRC_OFFSET, sizes[1])
975       SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_2, src + HUF_SRC_OFFSET, sizes[2])
976       SET_bitOffset_TO_PAD                  (bitOffset_3, src + HUF_SRC_OFFSET, srcLen)
977     }
978     {
979       const Byte * const symbols = HUF_GET_SYMBOLS(table);
980       Byte *destLim = dest + destLen - delta * 3;
981 
982       if (dest != destLim)
983     #ifdef Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4
984       {
985         UInt32 v_0, v_1, v_2, v_3;
986         HUF_DECODE_2_INIT (v_0, bitOffset_0)
987         HUF_DECODE_2_INIT (v_1, bitOffset_1)
988         HUF_DECODE_2_INIT (v_2, bitOffset_2)
989         HUF_DECODE_2_INIT (v_3, bitOffset_3)
990         // #define HUF_DELTA (1 << 17) / 4
991         do
992         {
993           HUF_DECODE_2 (v_3, bitOffset_3, dest + delta * 3)
994           HUF_DECODE_2 (v_2, bitOffset_2, dest + delta * 2)
995           HUF_DECODE_2 (v_1, bitOffset_1, dest + delta)
996           HUF_DECODE_2 (v_0, bitOffset_0, dest)
997         }
998         while (++dest != destLim);
999         /*
1000         {// unsigned y = 0; for (;y < 1; y++)
1001         {
1002           const size_t num = destLen - delta * 3;
1003           Byte *orig = dest - num;
1004           memmove (orig + delta    , orig + HUF_DELTA,     num);
1005           memmove (orig + delta * 2, orig + HUF_DELTA * 2, num);
1006           memmove (orig + delta * 3, orig + HUF_DELTA * 3, num);
1007         }}
1008         */
1009       }
1010     #elif defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2)
1011       {
1012         UInt32 v_0, v_1, v_2, v_3;
1013         HUF_DECODE_2_INIT (v_0, bitOffset_0)
1014         HUF_DECODE_2_INIT (v_1, bitOffset_1)
1015         do
1016         {
1017           HUF_DECODE_2 (v_0, bitOffset_0, dest)
1018           HUF_DECODE_2 (v_1, bitOffset_1, dest + delta)
1019         }
1020         while (++dest != destLim);
1021         dest = destLim - (destLen - delta * 3);
1022         dest += delta * 2;
1023         destLim += delta * 2;
1024         HUF_DECODE_2_INIT (v_2, bitOffset_2)
1025         HUF_DECODE_2_INIT (v_3, bitOffset_3)
1026         do
1027         {
1028           HUF_DECODE_2 (v_2, bitOffset_2, dest)
1029           HUF_DECODE_2 (v_3, bitOffset_3, dest + delta)
1030         }
1031         while (++dest != destLim);
1032         dest -= delta * 2;
1033         destLim -= delta * 2;
1034       }
1035     #else
1036       {
1037         do
1038         {
1039           HUF_DECODE (bitOffset_3, dest + delta * 3)
1040           HUF_DECODE (bitOffset_2, dest + delta * 2)
1041           HUF_DECODE (bitOffset_1, dest + delta)
1042           HUF_DECODE (bitOffset_0, dest)
1043         }
1044         while (++dest != destLim);
1045       }
1046     #endif
1047 
1048       if (bitOffset_3 != (CBitCtr)sizes[2])
1049         return SZ_ERROR_DATA;
1050       if (destLen &= 3)
1051       {
1052         destLim = dest + 4 - destLen;
1053         do
1054         {
1055           HUF_DECODE (bitOffset_2, dest + delta * 2)
1056           HUF_DECODE (bitOffset_1, dest + delta)
1057           HUF_DECODE (bitOffset_0, dest)
1058         }
1059         while (++dest != destLim);
1060       }
1061       if (   bitOffset_0 != 0
1062           || bitOffset_1 != (CBitCtr)sizes[0]
1063           || bitOffset_2 != (CBitCtr)sizes[1])
1064         return SZ_ERROR_DATA;
1065     }
1066   }
1067   #else // Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
1068   {
1069     unsigned i;
1070     for (i = 0; i < 4; i++)
1071     {
1072       size_t d = destLen;
1073       size_t size = srcLen;
1074       if (i != 3)
1075       {
1076         d = delta;
1077         size = sizes[i];
1078       }
1079       if (i != 0)
1080         size -= sizes[i - 1];
1081       destLen -= d;
1082       RINOK(Huf_Decompress_1stream(table, src, size, dest, d))
1083       dest += d;
1084       src += size;
1085     }
1086   }
1087   #endif
1088 
1089   return SZ_OK;
1090 }
1091 
1092 
1093 
1094 // (in->len != 0)
1095 // we are allowed to access in->ptr[-3]
1096 // at least 2 bytes in (in->ptr) will be processed
// Reads a zstd huffman table description from (in) and builds the
// decoding table into (p).
// Two encodings are supported (selected by the first header byte):
//   header >= 128 : direct representation, 4-bit weights, (header - 127) symbols
//   header <  128 : FSE-compressed weights, (header) bytes of payload
// After the weights are read, the weight of the last symbol is derived
// from the remaining probability space, weights are converted to code
// lengths, and Huf_Build() fills the decode table.
// Caller contract (see comments above): in->len != 0, and reading
// in->ptr[-3] is allowed.
static SRes Huf_DecodeTable(CZstdDecHufTable *const p, CInBufPair *const in)
{
  Byte weights[HUF_MAX_SYMBS + 1];  // +1 for extra write for loop unroll
  unsigned numSyms;
  const unsigned header = *(in->ptr)++;
  in->len--;
  // memset(weights, 0, sizeof(weights));
  if (header >= 128)
  {
    // direct representation: 4 bits field (0-15) per weight
    numSyms = header - 127;
    // numSyms != 0
    {
      const size_t numBytes = (numSyms + 1) / 2;
      const Byte *const ws = in->ptr;
      size_t i = 0;
      if (in->len < numBytes)
        return SZ_ERROR_DATA;
      in->ptr += numBytes;
      in->len -= numBytes;
      // unpack two 4-bit weights per input byte, high nibble first
      do
      {
        const unsigned b = ws[i];
        weights[i * 2    ] = (Byte)(b >> 4);
        weights[i * 2 + 1] = (Byte)(b & 0xf);
      }
      while (++i != numBytes);
      /* 7ZIP: we can restore correct zero value for weights[numSyms],
         if we want to use zero values starting from numSyms in code below. */
      // weights[numSyms] = 0;
    }
  }
  else
  {
    // FSE-compressed weights: (header) bytes contain an FSE table
    // description followed by a backward bit stream of weights
    #define MAX_ACCURACY_LOG_FOR_WEIGHTS 6
    CFseRecord table[1 << MAX_ACCURACY_LOG_FOR_WEIGHTS];

    Byte accuracy;
    const Byte *src;
    size_t srcLen;
    if (in->len < header)
      return SZ_ERROR_DATA;
    {
      CInBufPair fse_stream;
      fse_stream.len = header;
      fse_stream.ptr = in->ptr;
      in->ptr += header;
      in->len -= header;
      RINOK(FSE_DecodeHeader(table, &fse_stream,
          MAX_ACCURACY_LOG_FOR_WEIGHTS,
          &accuracy,
          16 // num weight symbols max (max-symbol is 15)
          ))
      // at least 2 bytes were processed in fse_stream.
      // (srcLen > 0) after FSE_DecodeHeader()
      // if (srcLen == 0) return SZ_ERROR_DATA;
      src = fse_stream.ptr;
      srcLen = fse_stream.len;
    }
    // we are allowed to access src[-5]
    {
      // two interleaved FSE states decode alternating weights
      // unsigned yyy = 200; do {
      CBitCtr bitOffset;
      FastInt32 state1, state2;
      SET_bitOffset_TO_PAD (bitOffset, src, srcLen)
      state1 = accuracy;
      src -= state1 >> 2;  // src -= 1; // for GET16() optimization
      state1 <<= FSE_REC_LEN_OFFSET;
      state2 = state1;
      numSyms = 0;
      for (;;)
      {
        // reads (bits) bits backwards, advances (st), stores one weight;
        // breaks out of the enclosing loop when the stream is exhausted
        // exactly (error if the last read overshoots)
        #define FSE_WEIGHT_DECODE(st) \
        { \
          const unsigned bits = GET_FSE_REC_LEN(st); \
          FastInt r; \
          GET16(r, src + (bitOffset >> 3)) \
          r >>= (unsigned)bitOffset & 7; \
          if ((CBitCtr_signed)(bitOffset -= (CBitCtr)bits) < 0) \
            { if (bitOffset + (CBitCtr)bits != 0) \
                return SZ_ERROR_DATA; \
              break; } \
          r &= 0xff; \
          r >>= 8 - bits; \
          st = table[GET_FSE_REC_STATE(st) + r]; \
          weights[numSyms++] = (Byte)GET_FSE_REC_SYM(st); \
        }
        FSE_WEIGHT_DECODE (state1)
        FSE_WEIGHT_DECODE (state2)
        if (numSyms == HUF_MAX_SYMBS)
          return SZ_ERROR_DATA;
      }
      // src += (unsigned)accuracy >> 2; } while (--yyy);
    }
  }

  // Build using weights:
  {
    // sum of 2^weight over all non-zero weights
    // (weight 0 contributes nothing: (1 << 0) & ~1 == 0)
    UInt32 sum = 0;
    {
      // numSyms >= 1
      unsigned i = 0;
      weights[numSyms] = 0;
      do
      {
        sum += ((UInt32)1 << weights[i    ]) & ~(UInt32)1;
        sum += ((UInt32)1 << weights[i + 1]) & ~(UInt32)1;
        i += 2;
      }
      while (i < numSyms);
      if (sum == 0)
        return SZ_ERROR_DATA;
    }
    {
      const unsigned maxBits = GetHighestSetBit_32_nonzero_big(sum) + 1;
      {
        // the last symbol's weight is implicit: it must exactly fill
        // the remaining space up to a power of two
        const UInt32 left = ((UInt32)1 << maxBits) - sum;
        // (left != 0)
        // (left) must be power of 2 in correct stream
        if (left & (left - 1))
          return SZ_ERROR_DATA;
        weights[numSyms++] = (Byte)GetHighestSetBit_32_nonzero_big(left);
      }
      // if (numSyms & 1)
        weights[numSyms] = 0; // for loop unroll
      // numSyms >= 2
      {
        // convert weights to code lengths: len = maxBits - weight
        // (weight 0 means "symbol not present" and stays 0)
        unsigned i = 0;
        do
        {
          /*
          #define WEIGHT_ITER(a) \
            { unsigned w = weights[i + (a)]; \
              const unsigned t = maxBits - w; \
              w = w ? t: w; \
              if (w > HUF_MAX_BITS) return SZ_ERROR_DATA; \
              weights[i + (a)] = (Byte)w; }
          */
          // /*
          #define WEIGHT_ITER(a) \
            { unsigned w = weights[i + (a)]; \
              if (w) {  \
                w = maxBits - w; \
                if (w > HUF_MAX_BITS) return SZ_ERROR_DATA; \
                weights[i + (a)] = (Byte)w; }}
          // */
          WEIGHT_ITER(0)
          // WEIGHT_ITER(1)
          // i += 2;
        }
        while (++i != numSyms);
      }
    }
  }
  {
    // unsigned yyy; for (yyy = 0; yyy < 100; yyy++)
    Huf_Build((Byte *)(void *)p->table64, weights, numSyms);
  }
  return SZ_OK;
}
1257 
1258 
// Compression modes of the three FSE tables in the Sequences_Section
// header (2-bit fields for literal-length / offset / match-length codes).
typedef enum
{
  k_SeqMode_Predef = 0,  // use the predefined distribution table
  k_SeqMode_RLE    = 1,  // one symbol (1 byte) repeated for all sequences
  k_SeqMode_FSE    = 2,  // table description is FSE-compressed in the stream
  k_SeqMode_Repeat = 3   // reuse the table from the previous block
}
z7_zstd_enum_SeqMode;
1267 
1268 // predefAccuracy == 5 for OFFSET symbols
1269 // predefAccuracy == 6 for MATCH/LIT LEN symbols
1270 static
1271 SRes
1272 Z7_NO_INLINE
1273 // Z7_FORCE_INLINE
FSE_Decode_SeqTable(CFseRecord * const table,CInBufPair * const in,unsigned predefAccuracy,Byte * const accuracyRes,unsigned numSymbolsMax,const CFseRecord * const predefs,const unsigned seqMode)1274 FSE_Decode_SeqTable(CFseRecord * const table,
1275     CInBufPair * const in,
1276     unsigned predefAccuracy,
1277     Byte * const accuracyRes,
1278     unsigned numSymbolsMax,
1279     const CFseRecord * const predefs,
1280     const unsigned seqMode)
1281 {
1282   // UNUSED_VAR(numSymsPredef)
1283   // UNUSED_VAR(predefFreqs)
1284   if (seqMode == k_SeqMode_FSE)
1285   {
1286     // unsigned y = 50; CInBufPair in2 = *in; do { *in = in2; RINOK(
1287     return
1288     FSE_DecodeHeader(table, in,
1289         predefAccuracy + 3, // accuracyMax
1290         accuracyRes,
1291         numSymbolsMax)
1292     ;
1293     // )} while (--y); return SZ_OK;
1294   }
1295   // numSymsMax = numSymsPredef + ((predefAccuracy & 1) * (32 - 29))); // numSymsMax
1296   // numSymsMax == 32 for offsets
1297 
1298   if (seqMode == k_SeqMode_Predef)
1299   {
1300     *accuracyRes = (Byte)predefAccuracy;
1301     memcpy(table, predefs, sizeof(UInt32) << predefAccuracy);
1302     return SZ_OK;
1303   }
1304 
1305   // (seqMode == k_SeqMode_RLE)
1306   if (in->len == 0)
1307     return SZ_ERROR_DATA;
1308   in->len--;
1309   {
1310     const Byte *ptr = in->ptr;
1311     const unsigned sym = ptr[0];
1312     in->ptr = ptr + 1;
1313     if (sym >= numSymbolsMax)
1314       return SZ_ERROR_DATA;
1315     table[0] = (FastInt32)sym
1316       #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
1317         + (numSymbolsMax == NUM_ML_SYMBOLS ? MATCH_LEN_MIN : 0)
1318       #endif
1319       ;
1320     *accuracyRes = 0;
1321   }
1322   return SZ_OK;
1323 }
1324 
1325 
// The three FSE decoding tables for the sequences section.
// Sizes follow the maximum accuracies passed to FSE_Decode_SeqTable():
// offsets: 5 + 3 = 8 bits; literal/match lengths: 6 + 3 = 9 bits.
typedef struct
{
  CFseRecord of[1 << 8];  // offset codes
  CFseRecord ll[1 << 9];  // literal-length codes
  CFseRecord ml[1 << 9];  // match-length codes
}
CZstdDecFseTables;
1333 
1334 
// State of a single zstd decoder instance (one frame at a time).
typedef struct
{
  Byte *win;       // output window (cyclic buffer or caller's buffer)
  SizeT cycSize;   // allocated/usable size of (win):
  /*
    if (outBuf_fromCaller)  : cycSize = outBufSize_fromCaller
    else {
      if ( isCyclicMode) : cycSize = cyclic_buffer_size = (winSize + extra_space)
      if (!isCyclicMode) : cycSize = ContentSize,
      (isCyclicMode == true) if (ContentSize >= winSize) or ContentSize is unknown
    }
  */
  SizeT winPos;    // current write position in (win)

  CZstdDecOffset reps[3];  // recent match offsets history

  // accuracies (log2 table sizes) of the current fse.* tables:
  Byte ll_accuracy;
  Byte of_accuracy;
  Byte ml_accuracy;
  // Byte seqTables_wereSet;
  Byte litHuf_wasSet;  // (huf) table was initialized for this frame

  Byte *literalsBase;  // buffer for decoded literals of current block

  size_t winSize;        // from header
  size_t totalOutCheck;  // totalOutCheck <= winSize

  #ifdef Z7_ZSTD_DEC_USE_BASES_IN_OBJECT
  SEQ_EXTRA_TABLES(m_)
  #endif
  // UInt64 _pad_Alignment;  // is not required now
  CZstdDecFseTables fse;  // sequence decoding tables
  CZstdDecHufTable huf;   // literals huffman table
}
CZstdDec1;
1370 
// A block cannot be larger than min(winSize, kBlockSizeMax):
#define ZstdDec1_GET_BLOCK_SIZE_LIMIT(p) \
  ((p)->winSize < kBlockSizeMax ? (UInt32)(p)->winSize : kBlockSizeMax)

// (ml_accuracy == 1) is used as a sentinel meaning "sequence tables
// were not set yet for this frame":
#define SEQ_TABLES_WERE_NOT_SET_ml_accuracy  1  // accuracy=1 is not used by zstd
#define IS_SEQ_TABLES_WERE_SET(p)  (((p)->ml_accuracy != SEQ_TABLES_WERE_NOT_SET_ml_accuracy))
// #define IS_SEQ_TABLES_WERE_SET(p)  ((p)->seqTables_wereSet)
1377 
1378 
// One-time construction: clears buffer pointers so that later
// allocation/free logic sees a well-defined empty state.
// Does NOT reset per-frame state (see ZstdDec1_Init()).
static void ZstdDec1_Construct(CZstdDec1 *p)
{
  #ifdef Z7_ZSTD_DEC_PRINT_TABLE
  Print_Predef(6, NUM_LL_SYMBOLS, SEQ_LL_PREDEF_DIST, k_PredefRecords_LL);
  Print_Predef(5, NUM_OFFSET_SYMBOLS_PREDEF, SEQ_OFFSET_PREDEF_DIST, k_PredefRecords_OF);
  Print_Predef(6, NUM_ML_SYMBOLS, SEQ_ML_PREDEF_DIST, k_PredefRecords_ML);
  #endif

  p->win = NULL;
  p->cycSize = 0;
  p->literalsBase = NULL;
  #ifdef Z7_ZSTD_DEC_USE_BASES_IN_OBJECT
  FILL_LOC_BASES_ALL
  #endif
}
1394 
1395 
// Resets per-frame decoder state to the values required at the start
// of a new zstd frame.
static void ZstdDec1_Init(CZstdDec1 *p)
{
  // initial repeat-offset history defined by the zstd format:
  p->reps[0] = 1;
  p->reps[1] = 4;
  p->reps[2] = 8;
  p->totalOutCheck = 0;
  p->litHuf_wasSet = False;
  // mark sequence tables as "not set" via an accuracy value that a
  // valid stream never produces:
  p->ml_accuracy = SEQ_TABLES_WERE_NOT_SET_ml_accuracy;
}
1406 
1407 
1408 
// ---- chunked copy primitives ----
// COPY_CHUNK(dest, src) copies COPY_CHUNK_SIZE bytes and advances both
// pointers; it may `break` out of the enclosing loop (ARM64/SSE2 forms).
// COPY_OFFSET_MIN is the smallest match offset for which COPY_CHUNK
// is safe on overlapping regions.
#ifdef MY_CPU_LE_UNALIGN
  #define Z7_ZSTD_DEC_USE_UNALIGNED_COPY
#endif

#ifdef Z7_ZSTD_DEC_USE_UNALIGNED_COPY

  #define COPY_CHUNK_SIZE 16

    #define COPY_CHUNK_4_2(dest, src) \
    { \
      ((UInt32 *)(void *)dest)[0] = ((const UInt32 *)(const void *)src)[0]; \
      ((UInt32 *)(void *)dest)[1] = ((const UInt32 *)(const void *)src)[1]; \
      src += 4 * 2; \
      dest += 4 * 2; \
    }

  /* sse2 doesn't help here in GCC and CLANG.
     so we disabled sse2 here */
  /*
  #if defined(MY_CPU_AMD64)
    #define Z7_ZSTD_DEC_USE_SSE2
  #elif defined(MY_CPU_X86)
    #if defined(_MSC_VER) && _MSC_VER >= 1300 && defined(_M_IX86_FP) && (_M_IX86_FP >= 2) \
      || defined(__SSE2__) \
      // || 1 == 1  // for debug only
      #define Z7_ZSTD_DEC_USE_SSE2
    #endif
  #endif
  */

  #if defined(MY_CPU_ARM64)
    // NEON 16-byte load/store
    #define COPY_OFFSET_MIN  16
    #define COPY_CHUNK1(dest, src) \
    { \
      vst1q_u8((uint8_t *)(void *)dest, \
      vld1q_u8((const uint8_t *)(const void *)src)); \
      src += 16; \
      dest += 16; \
    }

    #define COPY_CHUNK(dest, src) \
    { \
      COPY_CHUNK1(dest, src) \
      if ((len -= COPY_CHUNK_SIZE) == 0) break; \
      COPY_CHUNK1(dest, src) \
    }

  #elif defined(Z7_ZSTD_DEC_USE_SSE2)
    #include <emmintrin.h> // sse2
    #define COPY_OFFSET_MIN  16

    #define COPY_CHUNK1(dest, src) \
    { \
      _mm_storeu_si128((__m128i *)(void *)dest, \
      _mm_loadu_si128((const __m128i *)(const void *)src)); \
      src += 16; \
      dest += 16; \
    }

    #define COPY_CHUNK(dest, src) \
    { \
      COPY_CHUNK1(dest, src) \
      if ((len -= COPY_CHUNK_SIZE) == 0) break; \
      COPY_CHUNK1(dest, src) \
    }

  #elif defined(MY_CPU_64BIT)
    // two unaligned 64-bit moves per chunk
    #define COPY_OFFSET_MIN  8

    #define COPY_CHUNK(dest, src) \
    { \
      ((UInt64 *)(void *)dest)[0] = ((const UInt64 *)(const void *)src)[0]; \
      ((UInt64 *)(void *)dest)[1] = ((const UInt64 *)(const void *)src)[1]; \
      src += 8 * 2; \
      dest += 8 * 2; \
    }

  #else
    // 32-bit unaligned moves
    #define COPY_OFFSET_MIN  4

    #define COPY_CHUNK(dest, src) \
    { \
      COPY_CHUNK_4_2(dest, src); \
      COPY_CHUNK_4_2(dest, src); \
    }

  #endif
#endif


// portable byte-wise fallback (no unaligned access allowed):
#ifndef COPY_CHUNK_SIZE
    #define COPY_OFFSET_MIN  4
    #define COPY_CHUNK_SIZE  8
    #define COPY_CHUNK_2(dest, src) \
    { \
      const Byte a0 = src[0]; \
      const Byte a1 = src[1]; \
      dest[0] = a0; \
      dest[1] = a1; \
      src += 2; \
      dest += 2; \
    }
    #define COPY_CHUNK(dest, src) \
    { \
      COPY_CHUNK_2(dest, src) \
      COPY_CHUNK_2(dest, src) \
      COPY_CHUNK_2(dest, src) \
      COPY_CHUNK_2(dest, src) \
    }
#endif
1519 
1520 
// Rounds (len) up to a multiple of COPY_CHUNK_SIZE so chunked copy can
// overshoot; if that would exceed (rem) — the space left in the output
// buffer — the tail that doesn't fit a whole chunk is copied byte-wise
// first (and the macro may `return` from the enclosing function when
// nothing is left for chunked copy).
#define COPY_PREPARE \
  len += (COPY_CHUNK_SIZE - 1); \
  len &= ~(size_t)(COPY_CHUNK_SIZE - 1); \
  { if (len > rem) \
  { len = rem; \
    rem &= (COPY_CHUNK_SIZE - 1); \
    if (rem) {  \
      len -= rem; \
      Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \
      do *dest++ = *src++; while (--rem); \
      if (len == 0) return; }}}

// Copies (len) bytes in COPY_CHUNK_SIZE chunks; requires
// (len != 0) and (len % COPY_CHUNK_SIZE == 0) after COPY_PREPARE.
#define COPY_CHUNKS \
{ \
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \
  do { COPY_CHUNK(dest, src) } \
  while (len -= COPY_CHUNK_SIZE); \
}
1539 
1540 // (len != 0)
1541 // (len <= rem)
// Copies (len) literal bytes from (src) to (dest) using chunked copy.
// Preconditions (from callers): len != 0, len <= rem, where (rem) is
// the writable space remaining after (dest); chunked copy may overshoot
// (len) up to the next COPY_CHUNK_SIZE boundary but never past (rem).
static
Z7_FORCE_INLINE
// Z7_ATTRIB_NO_VECTOR
void CopyLiterals(Byte *dest, Byte const *src, size_t len, size_t rem)
{
  COPY_PREPARE
  COPY_CHUNKS
}
1550 
1551 
1552 /* we can define Z7_STD_DEC_USE_AFTER_CYC_BUF, if we want to use additional
1553    space after cycSize that can be used to reduce the code in CopyMatch(): */
1554 // for debug:
1555 // #define Z7_STD_DEC_USE_AFTER_CYC_BUF
1556 
1557 /*
1558 CopyMatch()
1559 if wrap (offset > winPos)
1560 {
1561   then we have at least (COPY_CHUNK_SIZE) avail in (dest) before we will overwrite (src):
1562   (cycSize >= offset + COPY_CHUNK_SIZE)
1563   if defined(Z7_STD_DEC_USE_AFTER_CYC_BUF)
1564     we are allowed to read win[cycSize + COPY_CHUNK_SIZE - 1],
1565 }
1566 (len != 0)
1567 */
// Copies a match of (len) bytes at distance (offset) back from
// win[winPos], handling window wrap-around and overlapping copies.
// Preconditions (see comment above): len != 0; if (offset > winPos)
// then (cycSize >= offset + COPY_CHUNK_SIZE), so there are at least
// COPY_CHUNK_SIZE writable bytes before (dest) would overwrite (src).
static
Z7_FORCE_INLINE
// Z7_ATTRIB_NO_VECTOR
void CopyMatch(size_t offset, size_t len,
    Byte *win, size_t winPos, size_t rem, const size_t cycSize)
{
  Byte *dest = win + winPos;
  const Byte *src;
  // STAT_INC(g_NumCopy)

  if (offset > winPos)
  {
    // the match source wraps around the end of the cyclic buffer:
    // the first (back) bytes come from the tail of the window
    size_t back = offset - winPos;
    // src = win + cycSize - back;
    // cycSize -= offset;
    STAT_INC(g_NumOver)
    src = dest + (cycSize - offset);
    // (src >= dest) here
   #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF
    if (back < len)
    {
   #else
    if (back < len + (COPY_CHUNK_SIZE - 1))
    {
      if (back >= len)
      {
        // whole match fits in the tail part; plain byte copy
        // (chunked copy could read past cycSize here)
        Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
        do
          *dest++ = *src++;
        while (--len);
        return;
      }
   #endif
      // back < len
      // copy the tail part byte-wise, then continue from the window
      // start with the main chunked copy
      STAT_INC(g_NumOver2)
      len -= back;
      rem -= back;
      Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
      do
        *dest++ = *src++;
      while (--back);
      src = dest - offset;
      // src = win;
      // we go to MAIN-COPY
    }
  }
  else
    src = dest - offset;

  // len != 0
  // do *dest++ = *src++; while (--len); return;

  // --- MAIN COPY ---
  // if (src >= dest), then ((size_t)(src - dest) >= COPY_CHUNK_SIZE)
  //   so we have at least COPY_CHUNK_SIZE space before overlap for writing.
  COPY_PREPARE

  /* now (len == COPY_CHUNK_SIZE * x)
     so we can unroll for aligned copy */
  {
    // const unsigned b0 = src[0];
    // (COPY_OFFSET_MIN >= 4)

    if (offset >= COPY_OFFSET_MIN)
    {
      // non-overlapping (per chunk): fast chunked copy
      COPY_CHUNKS
      // return;
    }
    else
  #if (COPY_OFFSET_MIN > 4)
    #if COPY_CHUNK_SIZE < 8
      #error Stop_Compiling_Bad_COPY_CHUNK_SIZE
    #endif
    if (offset >= 4)
    {
      // overlap period >= 4: 8-byte steps via two 4-byte moves are safe
      Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
      do
      {
        COPY_CHUNK_4_2(dest, src)
        #if COPY_CHUNK_SIZE != 16
          if (len == 8) break;
        #endif
        COPY_CHUNK_4_2(dest, src)
      }
      while (len -= 16);
      // return;
    }
    else
  #endif
    {
      // (offset < 4) : replicate a 1-3 byte pattern
      const unsigned b0 = src[0];
      if (offset < 2)
      {
        // offset == 1 : fill with a single byte value
      #if defined(Z7_ZSTD_DEC_USE_UNALIGNED_COPY) && (COPY_CHUNK_SIZE == 16)
        #if defined(MY_CPU_64BIT)
        {
          const UInt64 v64 = (UInt64)b0 * 0x0101010101010101;
          Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
          do
          {
            ((UInt64 *)(void *)dest)[0] = v64;
            ((UInt64 *)(void *)dest)[1] = v64;
            dest += 16;
          }
          while (len -= 16);
        }
        #else
        {
          UInt32 v = b0;
          v |= v << 8;
          v |= v << 16;
          do
          {
            ((UInt32 *)(void *)dest)[0] = v;
            ((UInt32 *)(void *)dest)[1] = v;
            dest += 8;
            ((UInt32 *)(void *)dest)[0] = v;
            ((UInt32 *)(void *)dest)[1] = v;
            dest += 8;
          }
          while (len -= 16);
        }
        #endif
      #else
        do
        {
          dest[0] = (Byte)b0;
          dest[1] = (Byte)b0;
          dest += 2;
          dest[0] = (Byte)b0;
          dest[1] = (Byte)b0;
          dest += 2;
        }
        while (len -= 4);
      #endif
      }
      else if (offset == 2)
      {
        // repeat a 2-byte pattern
        const Byte b1 = src[1];
        {
          do
          {
            dest[0] = (Byte)b0;
            dest[1] = b1;
            dest += 2;
          }
          while (len -= 2);
        }
      }
      else // (offset == 3)
      {
        // repeat a 3-byte pattern; (len) is a multiple of
        // COPY_CHUNK_SIZE, so up to 2 trailing bytes are finished
        // after the loop
        const Byte *lim = dest + len - 2;
        const Byte b1 = src[1];
        const Byte b2 = src[2];
        do
        {
          dest[0] = (Byte)b0;
          dest[1] = b1;
          dest[2] = b2;
          dest += 3;
        }
        while (dest < lim);
        lim++; // points to last byte that must be written
        if (dest <= lim)
        {
          *dest = (Byte)b0;
          if (dest != lim)
            dest[1] = b1;
        }
      }
    }
  }
}
1742 
1743 
1744 
// Accumulates output size, saturating at winSize (totalOutCheck is used
// to validate match offsets against the amount of data actually produced).
#define UPDATE_TOTAL_OUT(p, size) \
{ \
  size_t _toc = (p)->totalOutCheck + (size); \
  const size_t _ws = (p)->winSize; \
  if (_toc >= _ws) _toc = _ws; \
  (p)->totalOutCheck = _toc; \
}


#if defined(MY_CPU_64BIT) && defined(MY_CPU_LE_UNALIGN)
// we can disable it for debug:
#define Z7_ZSTD_DEC_USE_64BIT_LOADS
#endif
// #define Z7_ZSTD_DEC_USE_64BIT_LOADS // for debug : slow in 32-bit

// SEQ_SRC_OFFSET: how many bytes (src) (seqSrc) was moved back from original value.
// we need (SEQ_SRC_OFFSET != 0) for optimized memory access
#ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
  #define SEQ_SRC_OFFSET 7
#else
  #define SEQ_SRC_OFFSET 3
#endif
// byte address of the 32-bit word containing the current bit position:
#define SRC_PLUS_FOR_4BYTES(bitOffset)  (SEQ_SRC_OFFSET - 3) + ((CBitCtr_signed)(bitOffset) >> 3)
#define BIT_OFFSET_7BITS(bitOffset)  ((unsigned)(bitOffset) & 7)
/*
  if (BIT_OFFSET_DELTA_BITS == 0) : bitOffset == number_of_unprocessed_bits
  if (BIT_OFFSET_DELTA_BITS == 1) : bitOffset == number_of_unprocessed_bits - 1
      and we can read 1 bit more in that mode : (8 * n + 1).
*/
// #define BIT_OFFSET_DELTA_BITS 0
#define BIT_OFFSET_DELTA_BITS 1
#if BIT_OFFSET_DELTA_BITS == 1
  // (7 ^ boff7) == (7 - boff7) for boff7 in [0, 7]
  #define GET_SHIFT_FROM_BOFFS7(boff7)  (7 ^ (boff7))
#else
  #define GET_SHIFT_FROM_BOFFS7(boff7)  (8 - BIT_OFFSET_DELTA_BITS - (boff7))
#endif

#define UPDATE_BIT_OFFSET(bitOffset, numBits) \
    (bitOffset) -= (CBitCtr)(numBits);

#define GET_SHIFT(bitOffset)  GET_SHIFT_FROM_BOFFS7(BIT_OFFSET_7BITS(bitOffset))


#if defined(Z7_ZSTD_DEC_USE_64BIT_LOADS)
  #if (NUM_OFFSET_SYMBOLS_MAX - BIT_OFFSET_DELTA_BITS < 32)
    /* if (NUM_OFFSET_SYMBOLS_MAX == 32 && BIT_OFFSET_DELTA_BITS == 1),
       we have depth 31 + 9 + 9 + 8 = 57 bits that can be read with single read. */
    #define Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF
  #endif
  #ifndef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF
    #if (BIT_OFFSET_DELTA_BITS == 1)
    /* if (winLimit - winPos <= (kBlockSizeMax = (1 << 17)))
       {
         the case (16 bits literal extra + 16 match extra) is not possible
         in correct stream. So error will be detected for (16 + 16) case.
         And longest correct sequence after offset reading is (31 + 9 + 9 + 8 = 57 bits).
         So we can use just one 64-bit load here in that case.
       }
    */
    #define Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML
    #endif
  #endif
#endif


#if !defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) || \
    (!defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) && \
     !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML))
// in : (0 < bits <= (24 or 25)):
#define STREAM_READ_BITS(dest, bits) \
{ \
  GET32(dest, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \
  dest <<= GET_SHIFT(bitOffset); \
  UPDATE_BIT_OFFSET(bitOffset, bits) \
  dest >>= 32 - bits; \
}
#endif
1822 
1823 
// reads the FSE record for (state) from (table):
#define FSE_Peek_1(table, state)  table[state]

#define STATE_VAR(name)  state_ ## name

// Initializes an FSE state by reading (accuracy) bits from the
// backward bit stream.
// in : (0 <= accuracy <= (24 or 25))
#define FSE_INIT_STATE(name, cond) \
{ \
  UInt32 r; \
  const unsigned bits = p->name ## _accuracy; \
  GET32(r, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \
  r <<= GET_SHIFT(bitOffset); \
  r >>= 1; \
  r >>= 31 ^ bits; \
  UPDATE_BIT_OFFSET(bitOffset, bits) \
  cond \
  STATE_VAR(name) = FSE_Peek_1(FSE_TABLE(name), r); \
  /* STATE_VAR(name) = dest << 16; */ \
}


// advances a state: next = table[baseState(current) + r]
#define FSE_Peek_Plus(name, r)  \
  STATE_VAR(name) = FSE_Peek_1(FSE_TABLE(name), \
    GET_FSE_REC_STATE(STATE_VAR(name)) + r);

#define LZ_LOOP_ERROR_EXIT  { return SZ_ERROR_DATA; }

// a negative bit offset means we read past the start of the stream:
#define BO_OVERFLOW_CHECK \
  { if ((CBitCtr_signed)bitOffset < 0) LZ_LOOP_ERROR_EXIT }
1853 
#ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS

#define GET64(dest, p)  { const Byte *ptr = p;  dest = GetUi64(ptr); }

// loads 64 bits around the current bit position into (v),
// pre-shifted so the next bits are at the top:
#define FSE_PRELOAD \
{ \
  GET64(v, src - 4 + SRC_PLUS_FOR_4BYTES(bitOffset)) \
  v <<= GET_SHIFT(bitOffset); \
}

// consumes (bits) from the preloaded (v) and advances one FSE state:
#define FSE_UPDATE_STATE_2(name, cond) \
{ \
  const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \
  UInt64 r = v; \
  v <<= bits; \
  r >>= 1; \
  UPDATE_BIT_OFFSET(bitOffset, bits) \
  cond \
  r >>= 63 ^ bits; \
  FSE_Peek_Plus(name, r); \
}

#define FSE_UPDATE_STATES \
  FSE_UPDATE_STATE_2 (ll, {} ) \
  FSE_UPDATE_STATE_2 (ml, {} ) \
  FSE_UPDATE_STATE_2 (of, BO_OVERFLOW_CHECK) \

#else // Z7_ZSTD_DEC_USE_64BIT_LOADS

// it supports 8 bits accuracy for any code
// it supports 9 bits accuracy, if (BIT_OFFSET_DELTA_BITS == 1)
#define FSE_UPDATE_STATE_0(name, cond) \
{ \
  UInt32 r; \
  const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \
  GET16(r, src + 2 + SRC_PLUS_FOR_4BYTES(bitOffset)) \
  r >>= (bitOffset & 7); \
  r &= (1 << (8 + BIT_OFFSET_DELTA_BITS)) - 1; \
  UPDATE_BIT_OFFSET(bitOffset, bits) \
  cond \
  r >>= (8 + BIT_OFFSET_DELTA_BITS) - bits; \
  FSE_Peek_Plus(name, r); \
}

// for debug (slow):
// #define Z7_ZSTD_DEC_USE_FSE_FUSION_FORCE
#if BIT_OFFSET_DELTA_BITS == 0 || defined(Z7_ZSTD_DEC_USE_FSE_FUSION_FORCE)
  #define Z7_ZSTD_DEC_USE_FSE_FUSION
#endif

#ifdef Z7_ZSTD_DEC_USE_FSE_FUSION
// NOTE: STATE_1 opens a scope (rest2) that STATE_3 closes — the two
// macros must be used as a pair, fusing one 32-bit load for two states.
#define FSE_UPDATE_STATE_1(name) \
{ UInt32 rest2; \
{ \
  UInt32 r; \
  unsigned bits; \
  GET32(r, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \
  bits = GET_FSE_REC_LEN(STATE_VAR(name)); \
  r <<= GET_SHIFT(bitOffset); \
  rest2 = r << bits; \
  r >>= 1; \
  UPDATE_BIT_OFFSET(bitOffset, bits) \
  r >>= 31 ^ bits; \
  FSE_Peek_Plus(name, r); \
}

#define FSE_UPDATE_STATE_3(name) \
{ \
  const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \
  rest2 >>= 1; \
  UPDATE_BIT_OFFSET(bitOffset, bits) \
  rest2 >>= 31 ^ bits; \
  FSE_Peek_Plus(name, rest2); \
}}

#define FSE_UPDATE_STATES \
  FSE_UPDATE_STATE_1 (ll) \
  FSE_UPDATE_STATE_3 (ml) \
  FSE_UPDATE_STATE_0 (of, BO_OVERFLOW_CHECK) \

#else // Z7_ZSTD_DEC_USE_FSE_FUSION

#define FSE_UPDATE_STATES \
  FSE_UPDATE_STATE_0 (ll, {} ) \
  FSE_UPDATE_STATE_0 (ml, {} ) \
  FSE_UPDATE_STATE_0 (of, BO_OVERFLOW_CHECK) \

#endif // Z7_ZSTD_DEC_USE_FSE_FUSION
#endif // Z7_ZSTD_DEC_USE_64BIT_LOADS
1944 
1945 
// Per-block parameters passed to Decompress_Sequences().
typedef struct
{
  UInt32 numSeqs;       // number of sequences in the block
  UInt32 literalsLen;   // number of decoded literal bytes available
  const Byte *literals; // decoded literals buffer
}
CZstdDec1_Vars;


// if (BIT_OFFSET_DELTA_BITS != 0), we need (BIT_OFFSET_DELTA_BYTES > 0)
#define BIT_OFFSET_DELTA_BYTES   BIT_OFFSET_DELTA_BITS

/* if (NUM_OFFSET_SYMBOLS_MAX == 32)
     max_seq_bit_length = (31) + 16 + 16 + 9 + 8 + 9 = 89 bits
   if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) we have longest backward
     lookahead offset, and we read UInt64 after literal_len reading.
   if (BIT_OFFSET_DELTA_BITS == 1 && NUM_OFFSET_SYMBOLS_MAX == 32)
     MAX_BACKWARD_DEPTH = 16 bytes
*/
// maximum number of bytes the bit reader may look backwards past the
// current bit position:
#define MAX_BACKWARD_DEPTH  \
    ((NUM_OFFSET_SYMBOLS_MAX - 1 + 16 + 16 + 7) / 8 + 7 + BIT_OFFSET_DELTA_BYTES)
1967 
/* Decodes all (vars->numSeqs) sequences of a compressed block and executes
   them: literal runs are copied from vars->literals and matches are copied
   from the (possibly cyclic) window. Decoded data is appended to p->win at
   p->winPos, never past winLimit. On success updates p->winPos,
   p->totalOutCheck and the three repeat-offset slots p->reps[].
   Returns SZ_OK, or SZ_ERROR_DATA on any bitstream / bounds violation.

   Preconditions (from the caller):
   srcLen != 0
   src == real_data_ptr - SEQ_SRC_OFFSET - BIT_OFFSET_DELTA_BYTES
   if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML) then
     (winLimit - p->winPos <= (1 << 17)) is required
*/
static
Z7_NO_INLINE
// Z7_ATTRIB_NO_VECTOR
SRes Decompress_Sequences(CZstdDec1 * const p,
    const Byte *src, const size_t srcLen,
    const size_t winLimit,
    const CZstdDec1_Vars * const vars)
{
#ifdef Z7_ZSTD_DEC_USE_BASES_LOCAL
  SEQ_EXTRA_TABLES(a_)
#endif

  // for debug:
  // #define Z7_ZSTD_DEC_USE_LOCAL_FSE_TABLES
#ifdef Z7_ZSTD_DEC_USE_LOCAL_FSE_TABLES
  #define FSE_TABLE(n)  fse. n
  const CZstdDecFseTables fse = p->fse;
  /*
  CZstdDecFseTables fse;
  #define COPY_FSE_TABLE(n) \
    memcpy(fse. n, p->fse. n, (size_t)4 << p-> n ## _accuracy);
  COPY_FSE_TABLE(of)
  COPY_FSE_TABLE(ll)
  COPY_FSE_TABLE(ml)
  */
#else
  #define FSE_TABLE(n)  (p->fse.  n)
#endif

#ifdef Z7_ZSTD_DEC_USE_BASES_LOCAL
  FILL_LOC_BASES_ALL
#endif

  {
    // hot-loop state is mirrored into locals here and written back
    // to (*p) only on successful exit
    unsigned numSeqs = vars->numSeqs;
    const Byte *literals = vars->literals;
    ptrdiff_t literalsLen = (ptrdiff_t)vars->literalsLen;
    Byte * const win = p->win;
    size_t winPos = p->winPos;
    const size_t cycSize = p->cycSize;
    size_t totalOutCheck = p->totalOutCheck;
    const size_t winSize = p->winSize;
    size_t reps_0 = p->reps[0];
    size_t reps_1 = p->reps[1];
    size_t reps_2 = p->reps[2];
    UInt32 STATE_VAR(ll), STATE_VAR(of), STATE_VAR(ml);
    CBitCtr bitOffset;

    SET_bitOffset_TO_PAD (bitOffset, src + SEQ_SRC_OFFSET, srcLen + BIT_OFFSET_DELTA_BYTES)

    bitOffset -= BIT_OFFSET_DELTA_BITS;

    // initial FSE states are read in LL, OF, ML order
    FSE_INIT_STATE(ll, {} )
    FSE_INIT_STATE(of, {} )
    FSE_INIT_STATE(ml, BO_OVERFLOW_CHECK)

    for (;;)
    {
      size_t matchLen;
    #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
      UInt64 v;
    #endif

      #ifdef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF
        FSE_PRELOAD
      #endif

      // ---- offset decoding: of_code selects repeat-offset or new offset ----
      // if (of_code == 0)
      if ((Byte)STATE_VAR(of) == 0)
      {
        if (GET_FSE_REC_SYM(STATE_VAR(ll)) == 0)
        {
          // (of_code == 0 && litLen == 0) means "use rep[1]" per zstd spec
          const size_t offset = reps_1;
          reps_1 = reps_0;
          reps_0 = offset;
          STAT_INC(g_Num_Rep1)
        }
        STAT_UPDATE(else g_Num_Rep0++;)
      }
      else
      {
        const unsigned of_code = (Byte)STATE_VAR(of);

      #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
        #if !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF)
          FSE_PRELOAD
        #endif
      #else
        UInt32 v;
        {
          const Byte *src4 = src + SRC_PLUS_FOR_4BYTES(bitOffset);
          const unsigned skip = GET_SHIFT(bitOffset);
          GET32(v, src4)
          v <<= skip;
          v |= (UInt32)src4[-1] >> (8 - skip);
        }
      #endif

        UPDATE_BIT_OFFSET(bitOffset, of_code)

        if (of_code == 1)
        {
          // read 1 bit
          #if defined(Z7_MSC_VER_ORIGINAL) || defined(MY_CPU_X86_OR_AMD64)
            #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
              #define CHECK_HIGH_BIT_64(a)  ((Int64)(UInt64)(a) < 0)
            #else
              #define CHECK_HIGH_BIT_32(a)  ((Int32)(UInt32)(a) < 0)
            #endif
          #else
            #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
              #define CHECK_HIGH_BIT_64(a)  ((UInt64)(a) & ((UInt64)1 << 63))
            #else
              #define CHECK_HIGH_BIT_32(a)  ((UInt32)(a) & ((UInt32)1 << 31))
            #endif
          #endif

          // the XOR folds (litLen == 0) and the 1 lookahead bit into a
          // single sign test to pick among rep1/rep2/rep3 cases
          if
            #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
              CHECK_HIGH_BIT_64 (((UInt64)GET_FSE_REC_SYM(STATE_VAR(ll)) - 1) ^ v)
            #else
              CHECK_HIGH_BIT_32 (((UInt32)GET_FSE_REC_SYM(STATE_VAR(ll)) - 1) ^ v)
            #endif
          {
            v <<= 1;
            {
              const size_t offset = reps_2;
              reps_2 = reps_1;
              reps_1 = reps_0;
              reps_0 = offset;
              STAT_INC(g_Num_Rep2)
            }
          }
          else
          {
            if (GET_FSE_REC_SYM(STATE_VAR(ll)) == 0)
            {
              // litLen == 0 && bit == 1
              STAT_INC(g_Num_Rep3)
              v <<= 1;
              reps_2 = reps_1;
              reps_1 = reps_0;
              if (--reps_0 == 0)
              {
                // LZ_LOOP_ERROR_EXIT
                // original-zstd decoder : input is corrupted; force offset to 1
                // reps_0 = 1;
                reps_0++;
              }
            }
            else
            {
              // litLen != 0 && bit == 0
              v <<= 1;
              {
                const size_t offset = reps_1;
                reps_1 = reps_0;
                reps_0 = offset;
                STAT_INC(g_Num_Rep1)
              }
            }
          }
        }
        else
        {
          // (2 <= of_code)
          // if (of_code >= 32) LZ_LOOP_ERROR_EXIT // optional check
          // we don't allow (of_code >= 32) cases in another code
          // new offset: (1 << of_code) + of_code extra bits - 3 (rep bias)
          reps_2 = reps_1;
          reps_1 = reps_0;
          reps_0 = ((size_t)1 << of_code) - 3 + (size_t)
            #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
              (v >> (64 - of_code));
              v <<= of_code;
            #else
              (v >> (32 - of_code));
            #endif
        }
      }

      #ifdef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML
        FSE_PRELOAD
      #endif

      // ---- match length: base symbol, plus extra bits for symbols >= 32 ----
      matchLen = (size_t)GET_FSE_REC_SYM(STATE_VAR(ml))
          #ifndef Z7_ZSTD_DEC_USE_ML_PLUS3
            + MATCH_LEN_MIN
          #endif
          ;
      {
        {
          if (matchLen >= 32 + MATCH_LEN_MIN) // if (state_ml & 0x20)
          {
            const unsigned extra = BASES_TABLE(SEQ_ML_EXTRA) [(size_t)matchLen - MATCH_LEN_MIN];
            matchLen = BASES_TABLE(SEQ_ML_BASES) [(size_t)matchLen - MATCH_LEN_MIN];
            #if defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) && \
               (defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML) || \
                defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF))
            {
              UPDATE_BIT_OFFSET(bitOffset, extra)
              matchLen += (size_t)(v >> (64 - extra));
              #if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF)
                FSE_PRELOAD
              #else
                v <<= extra;
              #endif
            }
            #else
            {
              UInt32 v32;
              STREAM_READ_BITS(v32, extra)
              matchLen += v32;
            }
            #endif
            STAT_INC(g_Num_Match)
          }
        }
      }

      #if  defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) && \
          !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) && \
          !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML)
        FSE_PRELOAD
      #endif

      // ---- literal length: base symbol, plus extra bits for symbols >= 16;
      //      then copy the literal run into the window ----
      {
        size_t litLen = GET_FSE_REC_SYM(STATE_VAR(ll));
        if (litLen)
        {
          // if (STATE_VAR(ll) & 0x70)
          if (litLen >= 16)
          {
            const unsigned extra = BASES_TABLE(SEQ_LL_EXTRA) [litLen];
            litLen = BASES_TABLE(SEQ_LL_BASES) [litLen];
            #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
            {
              UPDATE_BIT_OFFSET(bitOffset, extra)
              litLen += (size_t)(v >> (64 - extra));
              #if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF)
                FSE_PRELOAD
              #else
                v <<= extra;
              #endif
            }
            #else
            {
              UInt32 v32;
              STREAM_READ_BITS(v32, extra)
              litLen += v32;
            }
            #endif
            STAT_INC(g_Num_LitsBig)
          }

          // the sequences must not consume more literals than were decoded
          if ((literalsLen -= (ptrdiff_t)litLen) < 0)
            LZ_LOOP_ERROR_EXIT
          totalOutCheck += litLen;
          {
            const size_t rem = winLimit - winPos;
            if (litLen > rem)
              LZ_LOOP_ERROR_EXIT
            {
              const Byte *literals_temp = literals;
              Byte *d = win + winPos;
              literals += litLen;
              winPos += litLen;
              CopyLiterals(d, literals_temp, litLen, rem);
            }
          }
        }
        STAT_UPDATE(else g_Num_Lit0++;)
      }

      // validate offset against window size and produced-output count,
      // then copy the match (handles cyclic-window wrap via cycSize)
      #define COPY_MATCH \
        { if (reps_0 > winSize || reps_0 > totalOutCheck) LZ_LOOP_ERROR_EXIT \
        totalOutCheck += matchLen; \
        { const size_t rem = winLimit - winPos; \
        if (matchLen > rem) LZ_LOOP_ERROR_EXIT \
        { const size_t winPos_temp = winPos; \
        winPos += matchLen; \
        CopyMatch(reps_0, matchLen, win, winPos_temp, rem, cycSize); }}}

      if (--numSeqs == 0)
      {
        // last sequence: no FSE state update needed
        COPY_MATCH
        break;
      }
      FSE_UPDATE_STATES
      COPY_MATCH
    } // for

    // the bit stream must be consumed exactly down to its initial padding
    if ((CBitCtr_signed)bitOffset != BIT_OFFSET_DELTA_BYTES * 8 - BIT_OFFSET_DELTA_BITS)
      return SZ_ERROR_DATA;

    // flush the literals remaining after the last sequence
    if (literalsLen)
    {
      const size_t rem = winLimit - winPos;
      if ((size_t)literalsLen > rem)
        return SZ_ERROR_DATA;
      {
        Byte *d = win + winPos;
        winPos += (size_t)literalsLen;
        totalOutCheck += (size_t)literalsLen;
        CopyLiterals
        // memcpy
          (d, literals, (size_t)literalsLen, rem);
      }
    }
    // totalOutCheck saturates at winSize: it only needs to bound rep offsets
    if (totalOutCheck >= winSize)
      totalOutCheck = winSize;
    p->totalOutCheck = totalOutCheck;
    p->winPos = winPos;
    p->reps[0] = (CZstdDecOffset)reps_0;
    p->reps[1] = (CZstdDecOffset)reps_1;
    p->reps[2] = (CZstdDecOffset)reps_2;
  }
  return SZ_OK;
}
2291 
2292 
// for debug: define to check that ZstdDec1_NeedTempBufferForInput() works correctly:
// #define Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP // define it for debug only
#ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
static unsigned g_numSeqs;
#endif


/* flags for the 2 low bits of the literals-section header byte (b0):
   bit 1 set -> Huffman-compressed literals (Compressed / Treeless),
   bit 0 set -> RLE or Treeless variant of the base type */
#define k_LitBlockType_Flag_RLE_or_Treeless  1
#define k_LitBlockType_Flag_Compressed       2
2302 
/* Decodes one compressed zstd block: parses the literals-section header,
   decodes/copies the literals (raw, RLE, or Huffman 1/4-stream), parses
   the sequences-section header (numSeqs + FSE table modes), then runs
   Decompress_Sequences(). Output is appended at p->winPos.
   Returns SZ_OK or SZ_ERROR_DATA / SZ_ERROR_MEM-style codes from callees.

   outLimit : is strong limit
   outLimit <= ZstdDec1_GET_BLOCK_SIZE_LIMIT(p)
   inSize != 0
   afterAvail : number of readable bytes available after (src + inSize) */
static
Z7_NO_INLINE
SRes ZstdDec1_DecodeBlock(CZstdDec1 *p,
    const Byte *src, SizeT inSize, SizeT afterAvail,
    const size_t outLimit)
{
  CZstdDec1_Vars vars;
  vars.literals = p->literalsBase;
  {
    const unsigned b0 = *src++;  // literals-section header byte
    UInt32 numLits, compressedSize;
    const Byte *litStream;
    Byte *literalsDest;
    inSize--;

    if ((b0 & k_LitBlockType_Flag_Compressed) == 0)
    {
      // Raw or RLE literals
      // we need at least one additional byte for (numSeqs).
      // so we check for that additional byte in conditions.
      numLits = b0 >> 3;
      if (b0 & 4)
      {
        // 2- or 3-byte size format
        UInt32 v;
        if (inSize < 1 + 1) // we need at least 1 byte here and 1 byte for (numSeqs).
          return SZ_ERROR_DATA;
        numLits >>= 1;
        v = GetUi16(src);
        src += 2;
        inSize -= 2;
        if ((b0 & 8) == 0)
        {
          // 2-byte format: only one extra byte was actually used
          src--;
          inSize++;
          v = (Byte)v;
        }
        numLits += v << 4;
      }
      compressedSize = 1;  // RLE: one source byte
      if ((b0 & k_LitBlockType_Flag_RLE_or_Treeless) == 0)
        compressedSize = numLits;  // Raw: stored verbatim
    }
    else if (inSize < 4)
      return SZ_ERROR_DATA;
    else
    {
      // Compressed / Treeless literals: 3-5 byte header encodes
      // (numLits, compressedSize) in 10/14/18-bit fields
      const unsigned mode4Streams = b0 & 0xc;
      const unsigned numBytes = (3 * mode4Streams + 32) >> 4;
      const unsigned numBits = 4 * numBytes - 2;
      const UInt32 mask = ((UInt32)16 << numBits) - 1;
      compressedSize = GetUi32(src);
      numLits = ((
          #ifdef MY_CPU_LE_UNALIGN
            GetUi32(src - 1)
          #else
            ((compressedSize << 8) + b0)
          #endif
          ) >> 4) & mask;
      src += numBytes;
      inSize -= numBytes;
      compressedSize >>= numBits;
      compressedSize &= mask;
      /* debug trace (note: stray brace belongs to the commented-out code):
      if (numLits != 0) printf("inSize = %7u num_lits=%7u compressed=%7u ratio = %u  ratio2 = %u\n",
          i1, numLits, (unsigned)compressedSize * 1, (unsigned)compressedSize * 100 / numLits,
          (unsigned)numLits * 100 / (unsigned)inSize);
      }
      */
      if (compressedSize == 0)
        return SZ_ERROR_DATA; // (compressedSize == 0) is not allowed
    }

    STAT_UPDATE(g_Num_Lits += numLits;)

    vars.literalsLen = numLits;

    // strict inequality: at least 1 byte must remain for (numSeqs)
    if (compressedSize >= inSize)
      return SZ_ERROR_DATA;
    litStream = src;
    src += compressedSize;
    inSize -= compressedSize;
    // inSize != 0
    {
      // numSeqs uses a 1-3 byte variable-length encoding
      UInt32 numSeqs = *src++;
      inSize--;
      if (numSeqs > 127)
      {
        UInt32 b1;
        if (inSize == 0)
          return SZ_ERROR_DATA;
        numSeqs -= 128;
        b1 = *src++;
        inSize--;
        if (numSeqs == 127)
        {
          if (inSize == 0)
            return SZ_ERROR_DATA;
          numSeqs = (UInt32)(*src++) + 127;
          inSize--;
        }
        numSeqs = (numSeqs << 8) + b1;
      }
      // each sequence produces at least MATCH_LEN_MIN bytes of match
      if (numSeqs * MATCH_LEN_MIN + numLits > outLimit)
        return SZ_ERROR_DATA;
      vars.numSeqs = numSeqs;

      STAT_UPDATE(g_NumSeqs_total += numSeqs;)
      /*
        #ifdef SHOW_STAT
        printf("\n %5u : %8u, %8u : %5u", (int)g_Num_Blocks_Compressed, (int)numSeqs, (int)g_NumSeqs_total,
          (int)g_NumSeqs_total / g_Num_Blocks_Compressed);
        #endif
        // printf("\nnumSeqs2 = %d", numSeqs);
      */
    #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
      if (numSeqs != g_numSeqs) return SZ_ERROR_DATA; // for debug
    #endif
      if (numSeqs == 0)
      {
        // literals-only block: decode literals straight into the window
        if (inSize != 0)
          return SZ_ERROR_DATA;
        literalsDest = p->win + p->winPos;
      }
      else
        literalsDest = p->literalsBase;
    }

    if ((b0 & k_LitBlockType_Flag_Compressed) == 0)
    {
      if (b0 & k_LitBlockType_Flag_RLE_or_Treeless)
      {
        // RLE literals: replicate the single stored byte
        memset(literalsDest, litStream[0], numLits);
        if (vars.numSeqs)
        {
          // literalsDest == p->literalsBase == vars.literals
          #if COPY_CHUNK_SIZE > 1
            memset(p->literalsBase + numLits, 0, COPY_CHUNK_SIZE);
          #endif
        }
      }
      else
      {
        // unsigned y;
        // for (y = 0; y < 10000; y++)
        memcpy(literalsDest, litStream, numLits);
        if (vars.numSeqs)
        {
          /* we need up to (15 == COPY_CHUNK_SIZE - 1) space for optimized CopyLiterals().
             If we have additional space in input stream after literals stream,
             we use direct copy of raw literals in input stream */
          if ((size_t)(src + inSize - litStream) - numLits + afterAvail >= (COPY_CHUNK_SIZE - 1))
            vars.literals = litStream;
          else
          {
            // literalsDest == p->literalsBase == vars.literals
            #if COPY_CHUNK_SIZE > 1
            /* CopyLiterals():
                1) we don't want reading non-initialized data
                2) we will copy only zero byte after literals buffer */
              memset(p->literalsBase + numLits, 0, COPY_CHUNK_SIZE);
            #endif
          }
        }
      }
    }
    else
    {
      // Huffman-coded literals
      CInBufPair hufStream;
      hufStream.ptr = litStream;
      hufStream.len = compressedSize;

      if ((b0 & k_LitBlockType_Flag_RLE_or_Treeless) == 0)
      {
        // Compressed: a new Huffman table precedes the literal data
        // unsigned y = 100; CInBufPair hs2 = hufStream; do { hufStream = hs2;
        RINOK(Huf_DecodeTable(&p->huf, &hufStream))
        p->litHuf_wasSet = True;
        // } while (--y);
      }
      else if (!p->litHuf_wasSet)
        return SZ_ERROR_DATA;  // Treeless requires a table from an earlier block

      {
        // int yyy; for (yyy = 0; yyy < 34; yyy++) {
        SRes sres;
        if ((b0 & 0xc) == 0) // mode4Streams
          sres = Huf_Decompress_1stream((const Byte *)(const void *)p->huf.table64,
              hufStream.ptr - HUF_SRC_OFFSET, hufStream.len, literalsDest, numLits);
        else
        {
          // 6 bytes for the jump table + 4x1 bytes of end-padding Bytes)
          if (hufStream.len < 6 + 4)
            return SZ_ERROR_DATA;
          // the condition from original-zstd decoder:
          #define Z7_ZSTD_MIN_LITERALS_FOR_4_STREAMS 6
          if (numLits < Z7_ZSTD_MIN_LITERALS_FOR_4_STREAMS)
            return SZ_ERROR_DATA;
          sres = Huf_Decompress_4stream((const Byte *)(const void *)p->huf.table64,
              hufStream.ptr + (6 - HUF_SRC_OFFSET), hufStream.len, literalsDest, numLits);
        }
        RINOK(sres)
        // }
      }
    }

    if (vars.numSeqs == 0)
    {
      // literals were already written directly into the window
      p->winPos += numLits;
      UPDATE_TOTAL_OUT(p, numLits)
      return SZ_OK;
    }
  }
  {
    // ---- sequences section: mode byte + up to 3 FSE table descriptions ----
    CInBufPair in;
    unsigned mode;
    unsigned seqMode;

    in.ptr = src;
    in.len = inSize;
    if (in.len == 0)
      return SZ_ERROR_DATA;
    in.len--;
    mode = *in.ptr++;
    if (mode & 3) // Reserved bits
      return SZ_ERROR_DATA;

    // literal-length table (bits 7-6 of mode)
    seqMode = (mode >> 6);
    if (seqMode == k_SeqMode_Repeat)
      { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; }
    else RINOK(FSE_Decode_SeqTable(
        p->fse.ll,
        &in,
        6, // predefAccuracy
        &p->ll_accuracy,
        NUM_LL_SYMBOLS,
        k_PredefRecords_LL,
        seqMode))

    // offset table (bits 5-4 of mode)
    seqMode = (mode >> 4) & 3;
    if (seqMode == k_SeqMode_Repeat)
      { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; }
    else RINOK(FSE_Decode_SeqTable(
        p->fse.of,
        &in,
        5, // predefAccuracy
        &p->of_accuracy,
        NUM_OFFSET_SYMBOLS_MAX,
        k_PredefRecords_OF,
        seqMode))

    // match-length table (bits 3-2 of mode)
    seqMode = (mode >> 2) & 3;
    if (seqMode == k_SeqMode_Repeat)
      { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; }
    else
    {
      RINOK(FSE_Decode_SeqTable(
        p->fse.ml,
        &in,
        6, // predefAccuracy
        &p->ml_accuracy,
        NUM_ML_SYMBOLS,
        k_PredefRecords_ML,
        seqMode))
      /*
      #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
        // { unsigned y = 1 << 10; do
      {
        const unsigned accuracy = p->ml_accuracy;
        if (accuracy == 0)
          p->fse.ml[0] += 3;
        else
        #ifdef MY_CPU_64BIT
        {
          // alignemt (UInt64 _pad_Alignment) in fse.ml is required for that code
          UInt64 *table = (UInt64 *)(void *)p->fse.ml;
          const UInt64 *end = (const UInt64 *)(const void *)
            ((const Byte *)(const void *)table + ((size_t)sizeof(CFseRecord) << accuracy));
          do
          {
            table[0] += ((UInt64)MATCH_LEN_MIN << 32) + MATCH_LEN_MIN;
            table[1] += ((UInt64)MATCH_LEN_MIN << 32) + MATCH_LEN_MIN;
            table += 2;
          }
          while (table != end);
        }
        #else
        {
          UInt32 *table = p->fse.ml;
          const UInt32 *end = (const UInt32 *)(const void *)
            ((const Byte *)(const void *)table + ((size_t)sizeof(CFseRecord) << accuracy));
          do
          {
            table[0] += MATCH_LEN_MIN;
            table[1] += MATCH_LEN_MIN;
            table += 2;
            table[0] += MATCH_LEN_MIN;
            table[1] += MATCH_LEN_MIN;
            table += 2;
          }
          while (table != end);
        }
        #endif
      }
      // while (--y); }
      #endif
      */
    }

    // p->seqTables_wereSet = True;
    if (in.len == 0)
      return SZ_ERROR_DATA;
    return Decompress_Sequences(p,
        in.ptr - SEQ_SRC_OFFSET - BIT_OFFSET_DELTA_BYTES, in.len,
        p->winPos + outLimit, &vars);
  }
}
2620 
2621 
2622 
2623 
/* Decides whether block input must be staged through the temp buffer so
   that the sequence bit-reader's backward reads (up to MAX_BACKWARD_DEPTH
   bytes before the sequences stream) stay inside readable memory.
   Returns 1 if the temp buffer is needed, 0 otherwise (also 0 on any
   parse problem or when the block has no sequences).

   inSize != 0
   it must do similar to ZstdDec1_DecodeBlock() */
static size_t ZstdDec1_NeedTempBufferForInput(
    const SizeT beforeSize, const Byte * const src, const SizeT inSize)
{
  unsigned b0;
  UInt32 pos;

  #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
    g_numSeqs = 1 << 24;
  #else
  // we have at least 3 bytes before seq data: litBlockType, numSeqs, seqMode
  #define MIN_BLOCK_LZ_HEADERS_SIZE 3
  if (beforeSize >= MAX_BACKWARD_DEPTH - MIN_BLOCK_LZ_HEADERS_SIZE)
    return 0;
  #endif

  b0 = src[0];

  // mirror of the literals-section header parsing in ZstdDec1_DecodeBlock():
  // compute (pos) = offset of the numSeqs field within the block
  if ((b0 & k_LitBlockType_Flag_Compressed) == 0)
  {
    UInt32 numLits = b0 >> 3;
    pos = 1;
    if (b0 & 4)
    {
      UInt32 v;
      if (inSize < 3)
        return 0;
      numLits >>= 1;
      v = GetUi16(src + 1);
      pos = 3;
      if ((b0 & 8) == 0)
      {
        pos = 2;
        v = (Byte)v;
      }
      numLits += v << 4;
    }
    if (b0 & k_LitBlockType_Flag_RLE_or_Treeless)
      numLits = 1;  // RLE stores a single byte
    pos += numLits;
  }
  else if (inSize < 5)
    return 0;
  else
  {
    const unsigned mode4Streams = b0 & 0xc;
    // (numBytes) here includes the b0 byte itself, unlike in DecodeBlock()
    const unsigned numBytes = (3 * mode4Streams + 48) >> 4;
    const unsigned numBits = 4 * numBytes - 6;
    UInt32 cs = GetUi32(src + 1);
    cs >>= numBits;
    cs &= ((UInt32)16 << numBits) - 1;
    if (cs == 0)
      return 0;
    pos = numBytes + cs;
  }

  if (pos >= inSize)
    return 0;
  {
    // same 1-3 byte numSeqs encoding as in ZstdDec1_DecodeBlock()
    UInt32 numSeqs = src[pos++];
    if (numSeqs > 127)
    {
      UInt32 b1;
      if (pos >= inSize)
        return 0;
      numSeqs -= 128;
      b1 = src[pos++];
      if (numSeqs == 127)
      {
        if (pos >= inSize)
          return 0;
        numSeqs = (UInt32)(src[pos++]) + 127;
      }
      numSeqs = (numSeqs << 8) + b1;
    }
    #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
      g_numSeqs = numSeqs; // for debug
    #endif
    if (numSeqs == 0)
      return 0;  // no sequences -> no backward bit reads
  }
  /*
  if (pos >= inSize)
    return 0;
  pos++;
  */
  // we will have one additional byte for seqMode:
  if (beforeSize + pos >= MAX_BACKWARD_DEPTH - 1)
    return 0;
  return 1;
}
2716 
2717 
2718 
// ---------- ZSTD FRAME ----------

// block types from the 2-bit Block_Type field of the block header
#define kBlockType_Raw          0
#define kBlockType_RLE          1
#define kBlockType_Compressed   2
#define kBlockType_Reserved     3

/* Frame-level parser state. NOTE: the relative order of the first four
   members is significant: code compares against ZSTD2_STATE_SKIP_DATA and
   ZSTD2_STATE_HASH to detect the "needs 4 bytes" states. */
typedef enum
{
  // begin: states that require 4 bytes:
  ZSTD2_STATE_SIGNATURE,    // expecting a 4-byte frame magic number
  ZSTD2_STATE_HASH,         // expecting the 4-byte XXH64-low32 content checksum
  ZSTD2_STATE_SKIP_HEADER,  // expecting the 4-byte size of a skippable frame
  // end of states that require 4 bytes

  ZSTD2_STATE_SKIP_DATA,
  ZSTD2_STATE_FRAME_HEADER,
  ZSTD2_STATE_AFTER_HEADER,
  ZSTD2_STATE_BLOCK,        // parsing a 3-byte block header
  ZSTD2_STATE_DATA,
  ZSTD2_STATE_FINISHED
} EZstd2State;
2741 
2742 
/* Top-level streaming decoder object: frame parser state plus the
   block decoder (CZstdDec1) and the allocators that own its buffers. */
struct CZstdDec
{
  EZstd2State frameState;  // current frame-parsing state
  unsigned tempSize;       // number of valid bytes in temp[]

  Byte temp[14]; // 14 is required

  Byte descriptor;         // frame header descriptor byte (see DESCRIPTOR_* macros)
  Byte windowDescriptor;
  Byte isLastBlock;        // last-block flag from current block header
  Byte blockType;          // kBlockType_* of current block
  Byte isErrorState;       // sticky: set on data/unsupported errors
  Byte hashError;          // sticky: set when XXH64 checksum mismatched
  Byte disableHash;        // when set, checksum verification is skipped
  Byte isCyclicMode;

  UInt32 blockSize;            // payload size of current block
  UInt32 dictionaryId;
  UInt32 curBlockUnpackRem; // for compressed blocks only
  UInt32 inTempPos;            // fill position inside inTemp

  UInt64 contentSize;          // declared content size (valid only if descriptor defines it)
  UInt64 contentProcessed;     // bytes of content produced so far
  CXxh64State xxh64;           // running checksum of produced content

  Byte *inTemp;                // staging buffer for block input (see ZstdDec_AllocateMisc)
  SizeT winBufSize_Allocated;  // current allocation size of win_Base
  Byte *win_Base;              // window buffer, allocated from alloc_Big

  ISzAllocPtr alloc_Small;     // allocator for the object and small buffers
  ISzAllocPtr alloc_Big;       // allocator for the (potentially large) window

  CZstdDec1 decoder;           // per-block decoder state
};

// bytes of produced content not yet absorbed into a full XXH64 block;
// they still reside at the tail of the window before winPos
#define ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p) \
  ((unsigned)(p)->contentProcessed & (Z7_XXH64_BLOCK_SIZE - 1))

#define ZSTD_DEC_IS_LAST_BLOCK(p) ((p)->isLastBlock)
2782 
2783 
2784 static void ZstdDec_FreeWindow(CZstdDec * const p)
2785 {
2786   if (p->win_Base)
2787   {
2788     ISzAlloc_Free(p->alloc_Big, p->win_Base);
2789     p->win_Base = NULL;
2790     // p->decoder.win = NULL;
2791     p->winBufSize_Allocated = 0;
2792   }
2793 }
2794 
2795 
2796 CZstdDecHandle ZstdDec_Create(ISzAllocPtr alloc_Small, ISzAllocPtr alloc_Big)
2797 {
2798   CZstdDec *p = (CZstdDec *)ISzAlloc_Alloc(alloc_Small, sizeof(CZstdDec));
2799   if (!p)
2800     return NULL;
2801   p->alloc_Small = alloc_Small;
2802   p->alloc_Big = alloc_Big;
2803   // ZstdDec_CONSTRUCT(p)
2804   p->inTemp = NULL;
2805   p->win_Base = NULL;
2806   p->winBufSize_Allocated = 0;
2807   p->disableHash = False;
2808   ZstdDec1_Construct(&p->decoder);
2809   return p;
2810 }
2811 
/* Frees all buffers owned by the decoder and the decoder object itself.
   In SHOW_STAT builds, first prints the global decoding counters. */
void ZstdDec_Destroy(CZstdDecHandle p)
{
  #ifdef SHOW_STAT
    #define PRINT_STAT1(name, v) \
      printf("\n%25s = %9u", name, v);
  PRINT_STAT1("g_Num_Blocks_Compressed", g_Num_Blocks_Compressed)
  PRINT_STAT1("g_Num_Blocks_memcpy", g_Num_Blocks_memcpy)
  PRINT_STAT1("g_Num_Wrap_memmove_Num", g_Num_Wrap_memmove_Num)
  PRINT_STAT1("g_Num_Wrap_memmove_Bytes", g_Num_Wrap_memmove_Bytes)
  if (g_Num_Blocks_Compressed)
  {
    #define PRINT_STAT(name, v) \
      printf("\n%17s = %9u, per_block = %8u", name, v, v / g_Num_Blocks_Compressed);
    PRINT_STAT("g_NumSeqs", g_NumSeqs_total)
    // PRINT_STAT("g_NumCopy", g_NumCopy)
    PRINT_STAT("g_NumOver", g_NumOver)
    PRINT_STAT("g_NumOver2", g_NumOver2)
    PRINT_STAT("g_Num_Match", g_Num_Match)
    PRINT_STAT("g_Num_Lits", g_Num_Lits)
    PRINT_STAT("g_Num_LitsBig", g_Num_LitsBig)
    PRINT_STAT("g_Num_Lit0", g_Num_Lit0)
    PRINT_STAT("g_Num_Rep_0", g_Num_Rep0)
    PRINT_STAT("g_Num_Rep_1", g_Num_Rep1)
    PRINT_STAT("g_Num_Rep_2", g_Num_Rep2)
    PRINT_STAT("g_Num_Rep_3", g_Num_Rep3)
    PRINT_STAT("g_Num_Threshold_0", g_Num_Threshold_0)
    PRINT_STAT("g_Num_Threshold_1", g_Num_Threshold_1)
    PRINT_STAT("g_Num_Threshold_0sum", g_Num_Threshold_0sum)
    PRINT_STAT("g_Num_Threshold_1sum", g_Num_Threshold_1sum)
  }
  printf("\n");
  #endif

  ISzAlloc_Free(p->alloc_Small, p->decoder.literalsBase);
  // p->decoder.literalsBase = NULL;
  ISzAlloc_Free(p->alloc_Small, p->inTemp);
  // p->inTemp = NULL;
  ZstdDec_FreeWindow(p);
  ISzAlloc_Free(p->alloc_Small, p);
}
2852 
2853 
2854 
// bytes reserved before the block data in inTemp, so that the sequence
// bit-reader's backward reads stay inside the buffer
#define kTempBuffer_PreSize  (1u << 6)
#if kTempBuffer_PreSize < MAX_BACKWARD_DEPTH
  #error Stop_Compiling_Bad_kTempBuffer_PreSize
#endif
2859 
2860 static SRes ZstdDec_AllocateMisc(CZstdDec *p)
2861 {
2862   #define k_Lit_AfterAvail  (1u << 6)
2863   #if k_Lit_AfterAvail < (COPY_CHUNK_SIZE - 1)
2864     #error Stop_Compiling_Bad_k_Lit_AfterAvail
2865   #endif
2866   // return ZstdDec1_Allocate(&p->decoder, p->alloc_Small);
2867   if (!p->decoder.literalsBase)
2868   {
2869     p->decoder.literalsBase = (Byte *)ISzAlloc_Alloc(p->alloc_Small,
2870         kBlockSizeMax + k_Lit_AfterAvail);
2871     if (!p->decoder.literalsBase)
2872       return SZ_ERROR_MEM;
2873   }
2874   if (!p->inTemp)
2875   {
2876     // we need k_Lit_AfterAvail here for owerread from raw literals stream
2877     p->inTemp = (Byte *)ISzAlloc_Alloc(p->alloc_Small,
2878         kBlockSizeMax + kTempBuffer_PreSize + k_Lit_AfterAvail);
2879     if (!p->inTemp)
2880       return SZ_ERROR_MEM;
2881   }
2882   return SZ_OK;
2883 }
2884 
2885 
2886 static void ZstdDec_Init_ForNewFrame(CZstdDec *p)
2887 {
2888   p->frameState = ZSTD2_STATE_SIGNATURE;
2889   p->tempSize = 0;
2890 
2891   p->isErrorState = False;
2892   p->hashError = False;
2893   p->isCyclicMode = False;
2894   p->contentProcessed = 0;
2895   Xxh64State_Init(&p->xxh64);
2896   ZstdDec1_Init(&p->decoder);
2897 }
2898 
2899 
2900 void ZstdDec_Init(CZstdDec *p)
2901 {
2902   ZstdDec_Init_ForNewFrame(p);
2903   p->decoder.winPos = 0;
2904   memset(p->temp, 0, sizeof(p->temp));
2905 }
2906 
2907 
/* Accessors for the zstd frame header descriptor byte (p->descriptor). */
#define DESCRIPTOR_Get_DictionaryId_Flag(d)   ((d) & 3)
#define DESCRIPTOR_FLAG_CHECKSUM              (1 << 2)
#define DESCRIPTOR_FLAG_RESERVED              (1 << 3)
// #define DESCRIPTOR_FLAG_UNUSED                (1 << 4)
#define DESCRIPTOR_FLAG_SINGLE                (1 << 5)
#define DESCRIPTOR_Get_ContentSize_Flag3(d)   ((d) >> 5)
#define DESCRIPTOR_Is_ContentSize_Defined(d)  (((d) & 0xe0) != 0)
2915 
2916 
2917 static EZstd2State ZstdDec_UpdateState(CZstdDec * const p, const Byte b, CZstdDecInfo * const info)
2918 {
2919   unsigned tempSize = p->tempSize;
2920   p->temp[tempSize++] = b;
2921   p->tempSize = tempSize;
2922 
2923   if (p->frameState == ZSTD2_STATE_BLOCK)
2924   {
2925     if (tempSize < 3)
2926       return ZSTD2_STATE_BLOCK;
2927     {
2928       UInt32 b0 = GetUi32(p->temp);
2929       const unsigned type = ((unsigned)b0 >> 1) & 3;
2930       if (type == kBlockType_RLE && tempSize == 3)
2931         return ZSTD2_STATE_BLOCK;
2932       // info->num_Blocks_forType[type]++;
2933       info->num_Blocks++;
2934       if (type == kBlockType_Reserved)
2935       {
2936         p->isErrorState = True; // SZ_ERROR_UNSUPPORTED
2937         return ZSTD2_STATE_BLOCK;
2938       }
2939       p->blockType = (Byte)type;
2940       p->isLastBlock = (Byte)(b0 & 1);
2941       p->inTempPos = 0;
2942       p->tempSize = 0;
2943       b0 >>= 3;
2944       b0 &= 0x1fffff;
2945       // info->num_BlockBytes_forType[type] += b0;
2946       if (b0 == 0)
2947       {
2948         // empty RAW/RLE blocks are allowed in original-zstd decoder
2949         if (type == kBlockType_Compressed)
2950         {
2951           p->isErrorState = True;
2952           return ZSTD2_STATE_BLOCK;
2953         }
2954         if (!ZSTD_DEC_IS_LAST_BLOCK(p))
2955           return ZSTD2_STATE_BLOCK;
2956         if (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM)
2957           return ZSTD2_STATE_HASH;
2958         return ZSTD2_STATE_FINISHED;
2959       }
2960       p->blockSize = b0;
2961       {
2962         UInt32 blockLim = ZstdDec1_GET_BLOCK_SIZE_LIMIT(&p->decoder);
2963         // compressed and uncompressed block sizes cannot be larger than min(kBlockSizeMax, window_size)
2964         if (b0 > blockLim)
2965         {
2966           p->isErrorState = True; // SZ_ERROR_UNSUPPORTED;
2967           return ZSTD2_STATE_BLOCK;
2968         }
2969         if (DESCRIPTOR_Is_ContentSize_Defined(p->descriptor))
2970         {
2971           const UInt64 rem = p->contentSize - p->contentProcessed;
2972           if (blockLim > rem)
2973               blockLim = (UInt32)rem;
2974         }
2975         p->curBlockUnpackRem = blockLim;
2976         // uncompressed block size cannot be larger than remain data size:
2977         if (type != kBlockType_Compressed)
2978         {
2979           if (b0 > blockLim)
2980           {
2981             p->isErrorState = True; // SZ_ERROR_UNSUPPORTED;
2982             return ZSTD2_STATE_BLOCK;
2983           }
2984         }
2985       }
2986     }
2987     return ZSTD2_STATE_DATA;
2988   }
2989 
2990   if ((unsigned)p->frameState < ZSTD2_STATE_SKIP_DATA)
2991   {
2992     UInt32 v;
2993     if (tempSize != 4)
2994       return p->frameState;
2995     v = GetUi32(p->temp);
2996     if ((unsigned)p->frameState < ZSTD2_STATE_HASH) // == ZSTD2_STATE_SIGNATURE
2997     {
2998       if (v == 0xfd2fb528)
2999       {
3000         p->tempSize = 0;
3001         info->num_DataFrames++;
3002         return ZSTD2_STATE_FRAME_HEADER;
3003       }
3004       if ((v & 0xfffffff0) == 0x184d2a50)
3005       {
3006         p->tempSize = 0;
3007         info->num_SkipFrames++;
3008         return ZSTD2_STATE_SKIP_HEADER;
3009       }
3010       p->isErrorState = True;
3011       return ZSTD2_STATE_SIGNATURE;
3012       // return ZSTD2_STATE_ERROR; // is not ZSTD stream
3013     }
3014     if (p->frameState == ZSTD2_STATE_HASH)
3015     {
3016       info->checksum_Defined = True;
3017       info->checksum = v;
3018       // #ifndef DISABLE_XXH_CHECK
3019       if (!p->disableHash)
3020       {
3021         if (p->decoder.winPos < ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p))
3022         {
3023           // unexpected code failure
3024           p->isErrorState = True;
3025           // SZ_ERROR_FAIL;
3026         }
3027         else
3028         if ((UInt32)Xxh64State_Digest(&p->xxh64,
3029             p->decoder.win + (p->decoder.winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p)),
3030             p->contentProcessed) != v)
3031         {
3032           p->hashError = True;
3033           // return ZSTD2_STATE_ERROR; // hash error
3034         }
3035       }
3036       // #endif
3037       return ZSTD2_STATE_FINISHED;
3038     }
3039     // (p->frameState == ZSTD2_STATE_SKIP_HEADER)
3040     {
3041       p->blockSize = v;
3042       info->skipFrames_Size += v;
3043       p->tempSize = 0;
      /* we want the caller to be able to know that a frame was
         finished. So we allow the case where
         we have ZSTD2_STATE_SKIP_DATA state with (blockSize == 0).
      */
3048       // if (v == 0) return ZSTD2_STATE_SIGNATURE;
3049       return ZSTD2_STATE_SKIP_DATA;
3050     }
3051   }
3052 
3053   // if (p->frameState == ZSTD2_STATE_FRAME_HEADER)
3054   {
3055     unsigned descriptor;
3056     const Byte *h;
3057     descriptor = p->temp[0];
3058     p->descriptor = (Byte)descriptor;
3059     if (descriptor & DESCRIPTOR_FLAG_RESERVED) // reserved bit
3060     {
3061       p->isErrorState = True;
3062       return ZSTD2_STATE_FRAME_HEADER;
3063       // return ZSTD2_STATE_ERROR;
3064     }
3065     {
3066       const unsigned n = DESCRIPTOR_Get_ContentSize_Flag3(descriptor);
3067       // tempSize -= 1 + ((1u << (n >> 1)) | ((n + 1) & 1));
3068       tempSize -= (0x9a563422u >> (n * 4)) & 0xf;
3069     }
3070     if (tempSize != (4u >> (3 - DESCRIPTOR_Get_DictionaryId_Flag(descriptor))))
3071       return ZSTD2_STATE_FRAME_HEADER;
3072 
3073     info->descriptor_OR     = (Byte)(info->descriptor_OR     |  descriptor);
3074     info->descriptor_NOT_OR = (Byte)(info->descriptor_NOT_OR | ~descriptor);
3075 
3076     h = &p->temp[1];
3077     {
3078       Byte w = 0;
3079       if ((descriptor & DESCRIPTOR_FLAG_SINGLE) == 0)
3080       {
3081         w = *h++;
3082         if (info->windowDescriptor_MAX < w)
3083             info->windowDescriptor_MAX = w;
3084         // info->are_WindowDescriptors = True;
3085         // info->num_WindowDescriptors++;
3086       }
3087       else
3088       {
3089         // info->are_SingleSegments = True;
3090         // info->num_SingleSegments++;
3091       }
3092       p->windowDescriptor = w;
3093     }
3094     {
3095       unsigned n = DESCRIPTOR_Get_DictionaryId_Flag(descriptor);
3096       UInt32 d = 0;
3097       if (n)
3098       {
3099         n = 1u << (n - 1);
3100         d = GetUi32(h) & ((UInt32)(Int32)-1 >> (32 - 8u * n));
3101         h += n;
3102       }
3103       p->dictionaryId = d;
3104       // info->dictionaryId_Cur = d;
3105       if (d != 0)
3106       {
3107         if (info->dictionaryId == 0)
3108           info->dictionaryId = d;
3109         else if (info->dictionaryId != d)
3110           info->are_DictionaryId_Different = True;
3111       }
3112     }
3113     {
3114       unsigned n = DESCRIPTOR_Get_ContentSize_Flag3(descriptor);
3115       UInt64 v = 0;
3116       if (n)
3117       {
3118         n >>= 1;
3119         if (n == 1)
3120           v = 256;
3121         v += GetUi64(h) & ((UInt64)(Int64)-1 >> (64 - (8u << n)));
3122         // info->are_ContentSize_Known = True;
3123         // info->num_Frames_with_ContentSize++;
3124         if (info->contentSize_MAX < v)
3125             info->contentSize_MAX = v;
3126         info->contentSize_Total += v;
3127       }
3128       else
3129       {
3130         info->are_ContentSize_Unknown = True;
3131         // info->num_Frames_without_ContentSize++;
3132       }
3133       p->contentSize = v;
3134     }
3135     // if ((size_t)(h - p->temp) != headerSize) return ZSTD2_STATE_ERROR; // it's unexpected internal code failure
3136     p->tempSize = 0;
3137 
3138     info->checksum_Defined = False;
3139     /*
3140     if (descriptor & DESCRIPTOR_FLAG_CHECKSUM)
3141       info->are_Checksums = True;
3142     else
3143       info->are_Non_Checksums = True;
3144     */
3145 
3146     return ZSTD2_STATE_AFTER_HEADER; // ZSTD2_STATE_BLOCK;
3147   }
3148 }
3149 
3150 
3151 static void ZstdDec_Update_XXH(CZstdDec * const p, size_t xxh64_winPos)
3152 {
3153  /*
3154  #ifdef DISABLE_XXH_CHECK
3155   UNUSED_VAR(data)
3156  #else
3157  */
3158   if (!p->disableHash && (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM))
3159   {
3160     // const size_t pos = p->xxh64_winPos;
3161     const size_t size = (p->decoder.winPos - xxh64_winPos) & ~(size_t)31;
3162     if (size)
3163     {
3164       // p->xxh64_winPos = pos + size;
3165       Xxh64State_UpdateBlocks(&p->xxh64,
3166           p->decoder.win + xxh64_winPos,
3167           p->decoder.win + xxh64_winPos + size);
3168     }
3169   }
3170 }
3171 
3172 
3173 /*
3174 in:
3175   (winLimit) : is relaxed limit, where this function is allowed to stop writing of decoded data (if possible).
3176     - this function uses (winLimit) for RAW/RLE blocks only,
3177         because this function can decode single RAW/RLE block in several different calls.
3178     - this function DOESN'T use (winLimit) for Compressed blocks,
3179         because this function decodes full compressed block in single call.
3180   (CZstdDec1::winPos <= winLimit)
3181   (winLimit <= CZstdDec1::cycSize).
3182   Note: if (ds->outBuf_fromCaller) mode is used, then
3183   {
3184     (strong_limit) is stored in CZstdDec1::cycSize.
3185     So (winLimit) is more strong than (strong_limit).
3186   }
3187 
3188 exit:
3189   Note: (CZstdDecState::winPos) will be set by caller after exit of this function.
3190 
3191   This function can exit for any of these conditions:
3192     - (frameState == ZSTD2_STATE_AFTER_HEADER)
3193     - (frameState == ZSTD2_STATE_FINISHED) : frame was finished : (status == ZSTD_STATUS_FINISHED_FRAME) is set
3194     - finished non-empty non-last block. So (CZstdDec1::winPos_atExit != winPos_atFuncStart).
3195     - ZSTD_STATUS_NEEDS_MORE_INPUT in src
3196     - (CZstdDec1::winPos) have reached (winLimit) in non-finished RAW/RLE block
3197 
3198   This function decodes no more than one non-empty block.
3199   So it fulfills the condition at exit:
3200     (CZstdDec1::winPos_atExit - winPos_atFuncStart <= block_size_max)
3201   Note: (winPos_atExit > winLimit) is possible in some cases after compressed block decoding.
3202 
  if (ds->outBuf_fromCaller) mode (useAdditionalWinLimit mode)
3204   {
3205     then this function uses additional strong limit from (CZstdDec1::cycSize).
3206     So this function will not write any data after (CZstdDec1::cycSize)
3207     And it fulfills the condition at exit:
3208       (CZstdDec1::winPos_atExit <= CZstdDec1::cycSize)
3209   }
3210 */
static SRes ZstdDec_DecodeBlock(CZstdDec * const p, CZstdDecState * const ds,
    SizeT winLimitAdd)
{
  const Byte *src = ds->inBuf;
  SizeT * const srcLen = &ds->inPos;
  const SizeT inSize = ds->inLim;
  // const int useAdditionalWinLimit = ds->outBuf_fromCaller ? 1 : 0;
  enum_ZstdStatus * const status = &ds->status;
  CZstdDecInfo * const info = &ds->info;
  SizeT winLimit;

  const SizeT winPos_atFuncStart = p->decoder.winPos;
  src += *srcLen;
  *status = ZSTD_STATUS_NOT_SPECIFIED;

  // finishMode = ZSTD_FINISH_ANY;
  if (ds->outSize_Defined)
  {
    if (ds->outSize < ds->outProcessed)
    {
      // p->isAfterSizeMode = 2; // we have extra bytes already
      *status = ZSTD_STATUS_OUT_REACHED;
      return SZ_OK;
      // size = 0;
    }
    else
    {
      // p->outSize >= p->outProcessed
      // clamp the relaxed limit so we don't decode past the requested output size:
      const UInt64 rem = ds->outSize - ds->outProcessed;
      /*
      if (rem == 0)
      p->isAfterSizeMode = 1; // we have reached exact required size
      */
      if (winLimitAdd >= rem)
      {
        winLimitAdd = (SizeT)rem;
        // if (p->finishMode) finishMode = ZSTD_FINISH_END;
      }
    }
  }

  winLimit = p->decoder.winPos + winLimitAdd;
  // (p->decoder.winPos <= winLimit)

  // while (p->frameState != ZSTD2_STATE_ERROR)
  while (!p->isErrorState)
  {
    SizeT inCur = inSize - *srcLen;

    if (p->frameState == ZSTD2_STATE_DATA)
    {
      /* (p->decoder.winPos == winPos_atFuncStart) is expected,
         because this function doesn't start a new block,
         if it has finished some non-empty block in this call. */
      if (p->decoder.winPos != winPos_atFuncStart)
        return SZ_ERROR_FAIL; // it's unexpected

      /*
      if (p->decoder.winPos > winLimit)
      {
        // we can be here, if in this function call
        //      - we have extracted non-empty compressed block, and (winPos > winLimit) after that.
        //      - we have started new block decoding after that.
        // It's unexpected case, because we exit after non-empty non-last block.
        *status = (inSize == *srcLen) ?
            ZSTD_STATUS_NEEDS_MORE_INPUT :
            ZSTD_STATUS_NOT_FINISHED;
        return SZ_OK;
      }
      */
      // p->decoder.winPos <= winLimit

      if (p->blockType != kBlockType_Compressed)
      {
        // it's RLE or RAW block.
        // p->blockSize != 0
        // winLimit <= p->decoder.cycSize
        /* So here we use more strong (winLimit), even for
           (ds->outBuf_fromCaller) mode. */
        // RAW/RLE blocks can be processed incrementally across multiple calls:
        SizeT outCur = winLimit - p->decoder.winPos;
        {
          const UInt32 rem = p->blockSize;
          if (outCur > rem)
              outCur = rem;
        }
        if (p->blockType == kBlockType_Raw)
        {
          if (outCur > inCur)
              outCur = inCur;
          /* output buffer is better aligned for XXH code.
             So we use hash for output buffer data */
          // ZstdDec_Update_XXH(p, src, outCur); // for debug:
          memcpy(p->decoder.win + p->decoder.winPos, src, outCur);
          src += outCur;
          *srcLen += outCur;
        }
        else // kBlockType_RLE
        {
          // the RLE byte was stored in temp[] by the block-header parser:
          #define RLE_BYTE_INDEX_IN_temp  3
          memset(p->decoder.win + p->decoder.winPos,
              p->temp[RLE_BYTE_INDEX_IN_temp], outCur);
        }
        {
          const SizeT xxh64_winPos = p->decoder.winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p);
          p->decoder.winPos += outCur;
          UPDATE_TOTAL_OUT(&p->decoder, outCur)
          p->contentProcessed += outCur;
          ZstdDec_Update_XXH(p, xxh64_winPos);
        }
        // ds->winPos = p->decoder.winPos;  // the caller does it instead. for debug:
        ds->outProcessed += outCur;
        // (p->blockSize != 0) after subtraction means the RAW/RLE block is not finished yet:
        if (p->blockSize -= (UInt32)outCur)
        {
          /*
          if (ds->outSize_Defined)
          {
            if (ds->outSize <= ds->outProcessed) ds->isAfterSizeMode = (enum_ZstdStatus)
               (ds->outSize == ds->outProcessed ? 1u: 2u);
          }
          */
          *status = (enum_ZstdStatus)
              (ds->outSize_Defined && ds->outSize <= ds->outProcessed ?
                ZSTD_STATUS_OUT_REACHED : (p->blockType == kBlockType_Raw && inSize == *srcLen) ?
                ZSTD_STATUS_NEEDS_MORE_INPUT :
                ZSTD_STATUS_NOT_FINISHED);
          return SZ_OK;
        }
      }
      else // kBlockType_Compressed
      {
        // p->blockSize != 0
        // (uncompressed_size_of_block == 0) is allowed
        // (p->curBlockUnpackRem == 0) is allowed
        /*
        if (p->decoder.winPos >= winLimit)
        {
          if (p->decoder.winPos != winPos_atFuncStart)
          {
            // it's unexpected case
            // We already have some data in finished blocks in this function call.
            //   So we don't decompress new block after (>=winLimit),
            //   even if it's empty block.
            *status = (inSize == *srcLen) ?
                ZSTD_STATUS_NEEDS_MORE_INPUT :
                ZSTD_STATUS_NOT_FINISHED;
            return SZ_OK;
          }
          // (p->decoder.winPos == winLimit == winPos_atFuncStart)
          // we will decode current block, because that current
          //   block can be empty block and we want to make some visible
          //   change of (src) stream after function start.
        }
        */
        /*
        if (ds->outSize_Defined && ds->outSize < ds->outProcessed)
        {
          // we don't want to start new block, if we have more extra decoded bytes already
          *status = ZSTD_STATUS_OUT_REACHED;
          return SZ_OK;
        }
        */
        {
          // A compressed block is decoded in one shot, either directly from (src)
          // or from the staging buffer (inTemp) once the full block has arrived.
          const Byte *comprStream;
          size_t afterAvail;
          UInt32 inTempPos = p->inTempPos;
          const UInt32 rem = p->blockSize - inTempPos;
          // rem != 0
          if (inTempPos != 0  // (inTemp) buffer already contains some input data
              || inCur < rem  // available input data size is smaller than compressed block size
              || ZstdDec1_NeedTempBufferForInput(*srcLen, src, rem))
          {
            if (inCur > rem)
                inCur = rem;
            if (inCur)
            {
              STAT_INC(g_Num_Blocks_memcpy)
              // we clear data for backward lookahead reading
              if (inTempPos == 0)
                memset(p->inTemp + kTempBuffer_PreSize - MAX_BACKWARD_DEPTH, 0, MAX_BACKWARD_DEPTH);
              // { unsigned y = 0; for(;y < 1000; y++)
              memcpy(p->inTemp + inTempPos + kTempBuffer_PreSize, src, inCur);
              // }
              src += inCur;
              *srcLen += inCur;
              inTempPos += (UInt32)inCur;
              p->inTempPos = inTempPos;
            }
            if (inTempPos != p->blockSize)
            {
              // full compressed block is not in (inTemp) yet:
              *status = ZSTD_STATUS_NEEDS_MORE_INPUT;
              return SZ_OK;
            }
            #if COPY_CHUNK_SIZE > 1
              memset(p->inTemp + kTempBuffer_PreSize + inTempPos, 0, COPY_CHUNK_SIZE);
            #endif
            comprStream = p->inTemp + kTempBuffer_PreSize;
            afterAvail = k_Lit_AfterAvail;
            // we don't want to read non-initialized data or junk in CopyMatch():
          }
          else
          {
            // inCur >= rem
            // we use direct decoding from (src) buffer:
            afterAvail = inCur - rem;
            comprStream = src;
            src += rem;
            *srcLen += rem;
          }

          #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
            ZstdDec1_NeedTempBufferForInput(*srcLen, comprStream, p->blockSize);
          #endif
          // printf("\nblockSize=%u", p->blockSize);
          // printf("%x\n", (unsigned)p->contentProcessed);
          STAT_INC(g_Num_Blocks_Compressed)
          {
            SRes sres;
            const size_t winPos = p->decoder.winPos;
            /*
               if ( useAdditionalWinLimit), we use strong unpack limit: smallest from
                  - limit from stream : (curBlockUnpackRem)
                  - limit from caller : (cycSize - winPos)
               if (!useAdditionalWinLimit), we use only relaxed limit:
                  - limit from stream : (curBlockUnpackRem)
            */
            SizeT outLimit = p->curBlockUnpackRem;
            if (ds->outBuf_fromCaller)
            // if (useAdditionalWinLimit)
            {
              const size_t limit = p->decoder.cycSize - winPos;
              if (outLimit > limit)
                  outLimit = limit;
            }
            sres = ZstdDec1_DecodeBlock(&p->decoder,
                comprStream, p->blockSize, afterAvail, outLimit);
            // ds->winPos = p->decoder.winPos;  // the caller does it instead. for debug:
            if (sres)
            {
              p->isErrorState = True;
              return sres;
            }
            {
              const SizeT xxh64_winPos = winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p);
              const size_t num = p->decoder.winPos - winPos;
              ds->outProcessed += num;
              p->contentProcessed += num;
              ZstdDec_Update_XXH(p, xxh64_winPos);
            }
          }
          // printf("\nwinPos=%x", (int)(unsigned)p->decoder.winPos);
        }
      }

      /*
      if (ds->outSize_Defined)
      {
        if (ds->outSize <= ds->outProcessed) ds->isAfterSizeMode = (enum_ZstdStatus)
           (ds->outSize == ds->outProcessed ? 1u: 2u);
      }
      */

      if (!ZSTD_DEC_IS_LAST_BLOCK(p))
      {
        p->frameState = ZSTD2_STATE_BLOCK;
        if (ds->outSize_Defined && ds->outSize < ds->outProcessed)
        {
          *status = ZSTD_STATUS_OUT_REACHED;
          return SZ_OK;
        }
        // we exit only if (winPos) was changed in this function call:
        if (p->decoder.winPos != winPos_atFuncStart)
        {
          // decoded block was not empty. So we exit:
          *status = (enum_ZstdStatus)(
              (inSize == *srcLen) ?
                ZSTD_STATUS_NEEDS_MORE_INPUT :
                ZSTD_STATUS_NOT_FINISHED);
          return SZ_OK;
        }
        // (p->decoder.winPos == winPos_atFuncStart)
        // so current decoded block was empty.
        // we will try to decode more blocks in this function.
        continue;
      }

      // decoded block was last in frame
      if (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM)
      {
        p->frameState = ZSTD2_STATE_HASH;
        if (ds->outSize_Defined && ds->outSize < ds->outProcessed)
        {
          *status = ZSTD_STATUS_OUT_REACHED;
          return SZ_OK; // disable if want to
          /* We want to get same return codes for any input buffer sizes.
             We want to get faster ZSTD_STATUS_OUT_REACHED status.
             So we exit with ZSTD_STATUS_OUT_REACHED here,
             instead of ZSTD2_STATE_HASH and ZSTD2_STATE_FINISHED processing,
             that depends on input buffer size and that can set
             ZSTD_STATUS_NEEDS_MORE_INPUT or return SZ_ERROR_DATA or SZ_ERROR_CRC.
          */
        }
      }
      else
      {
        /* ZSTD2_STATE_FINISHED processing doesn't depend on input buffer */
        p->frameState = ZSTD2_STATE_FINISHED;
      }
      /*
      p->frameState = (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM) ?
          ZSTD2_STATE_HASH :
          ZSTD2_STATE_FINISHED;
      */
      /* it's required to process ZSTD2_STATE_FINISHED state in this function call,
         because we must check contentSize and hashError in ZSTD2_STATE_FINISHED code,
         while the caller can reinit full state for ZSTD2_STATE_FINISHED
         So we can't exit from function here. */
      continue;
    }

    if (p->frameState == ZSTD2_STATE_FINISHED)
    {
      *status = ZSTD_STATUS_FINISHED_FRAME;
      if (DESCRIPTOR_Is_ContentSize_Defined(p->descriptor)
          && p->contentSize != p->contentProcessed)
        return SZ_ERROR_DATA;
      if (p->hashError) // for debug
        return SZ_ERROR_CRC;
      return SZ_OK;
      // p->frameState = ZSTD2_STATE_SIGNATURE;
      // continue;
    }

    if (p->frameState == ZSTD2_STATE_AFTER_HEADER)
      return SZ_OK; // we need memory allocation for that state

    if (p->frameState == ZSTD2_STATE_SKIP_DATA)
    {
      UInt32 blockSize = p->blockSize;
      // (blockSize == 0) is possible
      if (inCur > blockSize)
          inCur = blockSize;
      src += inCur;
      *srcLen += inCur;
      blockSize -= (UInt32)inCur;
      p->blockSize = blockSize;
      if (blockSize == 0)
      {
        p->frameState = ZSTD2_STATE_SIGNATURE;
        // continue; // for debug: we can continue without return to caller.
        // we notify the caller that skip frame was finished:
        *status = ZSTD_STATUS_FINISHED_FRAME;
        return SZ_OK;
      }
      // blockSize != 0
      // (inCur) was smaller than previous value of p->blockSize.
      // (inSize == *srcLen) now
      *status = ZSTD_STATUS_NEEDS_MORE_INPUT;
      return SZ_OK;
    }

    if (inCur == 0)
    {
      *status = ZSTD_STATUS_NEEDS_MORE_INPUT;
      return SZ_OK;
    }

    {
      // consume one header byte and advance the frame-level state machine:
      (*srcLen)++;
      p->frameState = ZstdDec_UpdateState(p, *src++, info);
    }
  }

  // (p->isErrorState) : ZstdDec_UpdateState() or block decoding detected bad data:
  *status = ZSTD_STATUS_NOT_SPECIFIED;
  p->isErrorState = True;
  // p->frameState = ZSTD2_STATE_ERROR;
  // if (p->frameState = ZSTD2_STATE_SIGNATURE)  return SZ_ERROR_NO_ARCHIVE
  return SZ_ERROR_DATA;
}
3589 
3590 
3591 
3592 
3593 SRes ZstdDec_Decode(CZstdDecHandle dec, CZstdDecState *p)
3594 {
3595   p->needWrite_Size = 0;
3596   p->status = ZSTD_STATUS_NOT_SPECIFIED;
3597   dec->disableHash = p->disableHash;
3598 
3599   if (p->outBuf_fromCaller)
3600   {
3601     dec->decoder.win = p->outBuf_fromCaller;
3602     dec->decoder.cycSize = p->outBufSize_fromCaller;
3603   }
3604 
3605   // p->winPos = dec->decoder.winPos;
3606 
3607   for (;;)
3608   {
3609     SizeT winPos, size;
3610     // SizeT outProcessed;
3611     SRes res;
3612 
3613     if (p->wrPos > dec->decoder.winPos)
3614       return SZ_ERROR_FAIL;
3615 
3616     if (dec->frameState == ZSTD2_STATE_FINISHED)
3617     {
3618       if (!p->outBuf_fromCaller)
3619       {
3620         // we need to set positions to zero for new frame.
3621         if (p->wrPos != dec->decoder.winPos)
3622         {
3623           /* We have already asked the caller to flush all data
3624              with (p->needWrite_Size) and (ZSTD_STATUS_FINISHED_FRAME) status.
3625              So it's unexpected case */
3626           // p->winPos = dec->decoder.winPos;
3627           // p->needWrite_Size = dec->decoder.winPos - p->wrPos; // flush size asking
3628           // return SZ_OK; // ask to flush again
3629           return SZ_ERROR_FAIL;
3630         }
3631         // (p->wrPos == dec->decoder.winPos), and we wrap to zero:
3632         dec->decoder.winPos = 0;
3633         p->winPos = 0;
3634         p->wrPos = 0;
3635       }
3636       ZstdDec_Init_ForNewFrame(dec);
3637       // continue;
3638     }
3639 
3640     winPos = dec->decoder.winPos;
3641     {
3642       SizeT next = dec->decoder.cycSize;
3643       /* cycSize == 0, if no buffer was allocated still,
3644          or, if (outBuf_fromCaller) mode and (outBufSize_fromCaller == 0) */
3645       if (!p->outBuf_fromCaller
3646           && next
3647           && next <= winPos
3648           && dec->isCyclicMode)
3649       {
3650         // (0 < decoder.cycSize <= winPos) in isCyclicMode.
3651         // so we need to wrap (winPos) and (wrPos) over (cycSize).
3652         const size_t delta = next;
3653         // (delta) is how many bytes we remove from buffer.
3654         /*
3655         // we don't need data older than last (cycSize) bytes.
3656         size_t delta = winPos - next; // num bytes after (cycSize)
3657         if (delta <= next) // it's expected case
3658           delta = next;
3659         // delta == Max(cycSize, winPos - cycSize)
3660         */
3661         if (p->wrPos < delta)
3662         {
3663           // (wrPos < decoder.cycSize)
3664           // We have asked already the caller to flush required data
3665           // p->status = ZSTD_STATUS_NOT_SPECIFIED;
3666           // p->winPos = winPos;
3667           // p->needWrite_Size = delta - p->wrPos; // flush size asking
3668           // return SZ_OK; // ask to flush again
3669           return SZ_ERROR_FAIL;
3670         }
3671         // p->wrPos >= decoder.cycSize
3672         // we move extra data after (decoder.cycSize) to start of cyclic buffer:
3673         winPos -= delta;
3674         if (winPos)
3675         {
3676           if (winPos >= delta)
3677             return SZ_ERROR_FAIL;
3678           memmove(dec->decoder.win, dec->decoder.win + delta, winPos);
3679           // printf("\nmemmove processed=%8x winPos=%8x\n", (unsigned)p->outProcessed, (unsigned)dec->decoder.winPos);
3680           STAT_INC(g_Num_Wrap_memmove_Num)
3681           STAT_UPDATE(g_Num_Wrap_memmove_Bytes += (unsigned)winPos;)
3682         }
3683         dec->decoder.winPos = winPos;
3684         p->winPos = winPos;
3685         p->wrPos -= delta;
3686         // dec->xxh64_winPos -= delta;
3687 
3688         // (winPos < delta)
3689         #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF
3690           /* we set the data after cycSize, because
3691              we don't want to read non-initialized data or junk in CopyMatch(). */
3692           memset(dec->decoder.win + next, 0, COPY_CHUNK_SIZE);
3693         #endif
3694 
3695         /*
3696         if (winPos == next)
3697         {
3698           if (winPos != p->wrPos)
3699           {
3700             // we already requested before to flush full data for that case.
3701             //   but we give the caller a second chance to flush data:
3702             p->needWrite_Size = winPos - p->wrPos;
3703             return SZ_OK;
3704           }
3705           // (decoder.cycSize == winPos == p->wrPos)
3706           // so we do second wrapping to zero:
3707           winPos = 0;
3708           dec->decoder.winPos = 0;
3709           p->winPos = 0;
3710           p->wrPos = 0;
3711         }
3712         */
3713         // (winPos < next)
3714       }
3715 
3716       if (winPos > next)
3717         return SZ_ERROR_FAIL; // it's unexpected case
3718       /*
3719         if (!outBuf_fromCaller && isCyclicMode && cycSize != 0)
3720           then (winPos <  cycSize)
3721           else (winPos <= cycSize)
3722       */
3723       if (!p->outBuf_fromCaller)
3724       {
3725         // that code is optional. We try to optimize write chunk sizes.
3726         /* (next2) is expected next write position in the caller,
3727            if the caller writes by kBlockSizeMax chunks.
3728         */
3729         /*
3730         const size_t next2 = (winPos + kBlockSizeMax) & (kBlockSizeMax - 1);
3731         if (winPos < next2 && next2 < next)
3732           next = next2;
3733         */
3734       }
3735       size = next - winPos;
3736     }
3737 
3738     // note: ZstdDec_DecodeBlock() uses (winLimit = winPos + size) only for RLE and RAW blocks
3739     res = ZstdDec_DecodeBlock(dec, p, size);
3740     /*
3741       after one block decoding:
3742       if (!outBuf_fromCaller && isCyclicMode && cycSize != 0)
3743         then (winPos <  cycSize + max_block_size)
3744         else (winPos <= cycSize)
3745     */
3746 
3747     if (!p->outBuf_fromCaller)
3748       p->win = dec->decoder.win;
3749     p->winPos = dec->decoder.winPos;
3750 
3751     // outProcessed = dec->decoder.winPos - winPos;
3752     // p->outProcessed += outProcessed;
3753 
3754     if (res != SZ_OK)
3755       return res;
3756 
3757     if (dec->frameState != ZSTD2_STATE_AFTER_HEADER)
3758     {
3759       if (p->outBuf_fromCaller)
3760         return SZ_OK;
3761       {
3762         // !p->outBuf_fromCaller
3763         /*
3764           if (ZSTD_STATUS_FINISHED_FRAME), we request full flushing here because
3765             1) it's simpler to work with allocation and extracting of next frame,
3766             2) it's better to start writing to next new frame with aligned memory
3767                for faster xxh 64-bit reads.
3768         */
3769         size_t end = dec->decoder.winPos;  // end pos for all data flushing
3770         if (p->status != ZSTD_STATUS_FINISHED_FRAME)
3771         {
3772           // we will request flush here only for cases when wrap in cyclic buffer can be required in next call.
3773           if (!dec->isCyclicMode)
3774             return SZ_OK;
3775           // isCyclicMode
3776           {
3777             const size_t delta = dec->decoder.cycSize;
3778             if (end < delta)
3779               return SZ_OK; // (winPos < cycSize). no need for flush
3780             // cycSize <= winPos
3781             // So we ask the caller to flush of (cycSize - wrPos) bytes,
            // and then we will wrap the cyclic buffer in next call
3783             end = delta;
3784           }
3785         }
3786         p->needWrite_Size = end - p->wrPos;
3787       }
3788       return SZ_OK;
3789     }
3790 
3791     // ZSTD2_STATE_AFTER_HEADER
3792     {
3793       BoolInt useCyclic = False;
3794       size_t cycSize;
3795 
3796       // p->status = ZSTD_STATUS_NOT_FINISHED;
3797       if (dec->dictionaryId != 0)
3798       {
3799         /* actually we can try to decode some data,
3800            because it's possible that some data doesn't use dictionary */
3801         // p->status = ZSTD_STATUS_NOT_SPECIFIED;
3802         return SZ_ERROR_UNSUPPORTED;
3803       }
3804 
3805       {
3806         UInt64 winSize = dec->contentSize;
3807         UInt64 winSize_Allocate = winSize;
3808         const unsigned descriptor = dec->descriptor;
3809 
3810         if ((descriptor & DESCRIPTOR_FLAG_SINGLE) == 0)
3811         {
3812           const Byte wd = dec->windowDescriptor;
3813           winSize = (UInt64)(8 + (wd & 7)) << ((wd >> 3) + 10 - 3);
3814           if (!DESCRIPTOR_Is_ContentSize_Defined(descriptor)
3815               || winSize_Allocate > winSize)
3816           {
3817             winSize_Allocate = winSize;
3818             useCyclic = True;
3819           }
3820         }
3821         /*
3822         else
3823         {
3824           if (p->info.singleSegment_ContentSize_MAX < winSize)
3825               p->info.singleSegment_ContentSize_MAX = winSize;
3826           // p->info.num_SingleSegments++;
3827         }
3828         */
3829         if (p->info.windowSize_MAX < winSize)
3830             p->info.windowSize_MAX = winSize;
3831         if (p->info.windowSize_Allocate_MAX < winSize_Allocate)
3832             p->info.windowSize_Allocate_MAX = winSize_Allocate;
3833         /*
3834            winSize_Allocate is MIN(content_size, window_size_from_descriptor).
           Even if (content_size < (window_size_from_descriptor))
3836              original-zstd still uses (window_size_from_descriptor) to check that decoding is allowed.
3837            We try to follow original-zstd, and here we check (winSize) instead of (winSize_Allocate))
3838         */
3839         if (
3840               // winSize_Allocate   // it's relaxed check
3841               winSize               // it's more strict check to be compatible with original-zstd
3842             > ((UInt64)1 << MAX_WINDOW_SIZE_LOG))
3843           return SZ_ERROR_UNSUPPORTED; // SZ_ERROR_MEM
3844         cycSize = (size_t)winSize_Allocate;
3845         if (cycSize != winSize_Allocate)
3846           return SZ_ERROR_MEM;
3847         // cycSize <= winSize
3848         /* later we will use (CZstdDec1::winSize) to check match offsets and check block sizes.
3849            if (there is window descriptor)
3850            {
3851              We will check block size with (window_size_from_descriptor) instead of (winSize_Allocate).
3852              Does original-zstd do it that way also?
3853            }
3854            Here we must reduce full real 64-bit (winSize) to size_t for (CZstdDec1::winSize).
3855            Also we don't want too big values for (CZstdDec1::winSize).
3856            our (CZstdDec1::winSize) will meet the condition:
3857              (CZstdDec1::winSize < kBlockSizeMax || CZstdDec1::winSize <= cycSize).
3858         */
3859         dec->decoder.winSize = (winSize < kBlockSizeMax) ? (size_t)winSize: cycSize;
3860         // note: (CZstdDec1::winSize > cycSize) is possible, if (!useCyclic)
3861       }
3862 
3863       RINOK(ZstdDec_AllocateMisc(dec))
3864 
3865       if (p->outBuf_fromCaller)
3866         dec->isCyclicMode = False;
3867       else
3868       {
3869         size_t d = cycSize;
3870 
3871         if (dec->decoder.winPos != p->wrPos)
3872           return SZ_ERROR_FAIL;
3873 
3874         dec->decoder.winPos = 0;
3875         p->wrPos = 0;
3876         p->winPos = dec->decoder.winPos;
3877 
3878         /*
3879         const size_t needWrite = dec->decoder.winPos - p->wrPos;
3880         if (!needWrite)
3881         {
3882           dec->decoder.winPos = 0;
3883           p->wrPos = 0;
3884           p->winPos = dec->decoder.winPos;
3885         }
3886         */
3887         /* if (!useCyclic) we allocate only cycSize = ContentSize.
3888            But if we want to support the case where new frame starts with winPos != 0,
3889            then we will wrap over zero, and we still need
3890            to set (useCyclic) and allocate additional buffer spaces.
3891            Now we don't allow new frame starting with (winPos != 0).
3892            so (dec->decoder->winPos == 0)
3893            can use (!useCyclic) with reduced buffer sizes.
3894         */
3895         /*
3896         if (dec->decoder->winPos != 0)
3897           useCyclic = True;
3898         */
3899 
3900         if (useCyclic)
3901         {
          /* cyclic buffer size must be at least (COPY_CHUNK_SIZE - 1) bytes
             larger than window size, because CopyMatch() can write additional
             (COPY_CHUNK_SIZE - 1) bytes and overwrite oldest data in cyclic buffer.
3905              But for performance reasons we align (cycSize) for (kBlockSizeMax).
3906              also we must provide (cycSize >= max_decoded_data_after_cycSize),
3907              because after data move wrapping over zero we must provide (winPos < cycSize).
3908           */
3909           const size_t alignSize = kBlockSizeMax;
3910           /* here we add (1 << 7) instead of (COPY_CHUNK_SIZE - 1), because
3911              we want to get same (cycSize) for different COPY_CHUNK_SIZE values. */
3912           // cycSize += (COPY_CHUNK_SIZE - 1) + (alignSize - 1); // for debug : we can get smallest (cycSize)
3913           cycSize += (1 << 7) + alignSize;
3914           cycSize &= ~(size_t)(alignSize - 1);
3915           // cycSize must be aligned for 32, because xxh requires 32-bytes blocks.
3916           // cycSize += 12345; // for debug
3917           // cycSize += 1 << 10; // for debug
3918           // cycSize += 32; // for debug
3919           // cycSize += kBlockSizeMax; // for debug
3920           if (cycSize < d)
3921             return SZ_ERROR_MEM;
3922           /*
3923              in cyclic buffer mode we allow to decode one additional block
3924              that exceeds (cycSize).
3925              So we must allocate additional (kBlockSizeMax) bytes after (cycSize).
3926              if defined(Z7_STD_DEC_USE_AFTER_CYC_BUF)
3927              {
3928                we can read (COPY_CHUNK_SIZE - 1) bytes after (cycSize)
               but we already allocate additional kBlockSizeMax that
3930                is larger than COPY_CHUNK_SIZE.
3931                So we don't need additional space of COPY_CHUNK_SIZE after (cycSize).
3932              }
3933           */
3934           /*
3935           #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF
3936           d = cycSize + (1 << 7); // we must add at least (COPY_CHUNK_SIZE - 1)
3937           #endif
3938           */
3939           d = cycSize + kBlockSizeMax;
3940           if (d < cycSize)
3941             return SZ_ERROR_MEM;
3942         }
3943 
3944         {
3945           const size_t kMinWinAllocSize = 1 << 12;
3946           if (d < kMinWinAllocSize)
3947               d = kMinWinAllocSize;
3948         }
3949 
3950         if (d > dec->winBufSize_Allocated)
3951         {
3952           /*
3953           if (needWrite)
3954           {
3955             p->needWrite_Size = needWrite;
3956             return SZ_OK;
3957             // return SZ_ERROR_FAIL;
3958           }
3959           */
3960 
3961           if (dec->winBufSize_Allocated != 0)
3962           {
3963             const size_t k_extra = (useCyclic || d >= (1u << 20)) ?
3964                 2 * kBlockSizeMax : 0;
3965             unsigned i = useCyclic ? 17 : 12;
3966             for (; i < sizeof(size_t) * 8; i++)
3967             {
3968               const size_t d2 = ((size_t)1 << i) + k_extra;
3969               if (d2 >= d)
3970               {
3971                 d = d2;
3972                 break;
3973               }
3974             }
3975           }
3976           // RINOK(ZstdDec_AllocateWindow(dec, d))
3977           ZstdDec_FreeWindow(dec);
3978           dec->win_Base = (Byte *)ISzAlloc_Alloc(dec->alloc_Big, d);
3979           if (!dec->win_Base)
3980             return SZ_ERROR_MEM;
3981           dec->decoder.win = dec->win_Base;
3982           dec->winBufSize_Allocated = d;
3983         }
3984         /*
3985         else
3986         {
          // for non-cyclicMode we want to flush data, and set winPos = 0
3988           if (needWrite)
3989           {
3990             if (!useCyclic || dec->decoder.winPos >= cycSize)
3991             {
3992               p->needWrite_Size = needWrite;
3993               return SZ_OK;
3994               // return SZ_ERROR_FAIL;
3995             }
3996           }
3997         }
3998         */
3999 
4000         dec->decoder.cycSize = cycSize;
4001         p->win = dec->decoder.win;
4002         // p->cycSize = dec->decoder.cycSize;
4003         dec->isCyclicMode = (Byte)useCyclic;
4004       } // (!p->outBuf_fromCaller) end
4005 
4006       // p->winPos = dec->decoder.winPos;
4007       dec->frameState = ZSTD2_STATE_BLOCK;
4008       // continue;
4009     } // ZSTD2_STATE_AFTER_HEADER end
4010   }
4011 }
4012 
4013 
4014 void ZstdDec_GetResInfo(const CZstdDec *dec,
4015     const CZstdDecState *p,
4016     SRes res,
4017     CZstdDecResInfo *stat)
4018 {
4019   // ZstdDecInfo_CLEAR(stat);
4020   stat->extraSize = 0;
4021   stat->is_NonFinishedFrame = False;
4022   if (dec->frameState != ZSTD2_STATE_FINISHED)
4023   {
4024     if (dec->frameState == ZSTD2_STATE_SIGNATURE)
4025     {
4026       stat->extraSize = (Byte)dec->tempSize;
4027       if (ZstdDecInfo_GET_NUM_FRAMES(&p->info) == 0)
4028         res = SZ_ERROR_NO_ARCHIVE;
4029     }
4030     else
4031     {
4032       stat->is_NonFinishedFrame = True;
4033       if (res == SZ_OK && p->status == ZSTD_STATUS_NEEDS_MORE_INPUT)
4034         res = SZ_ERROR_INPUT_EOF;
4035     }
4036   }
4037   stat->decode_SRes = res;
4038 }
4039 
4040 
4041 size_t ZstdDec_ReadUnusedFromInBuf(
4042     CZstdDecHandle dec,
4043     size_t afterDecoding_tempPos,
4044     void *data, size_t size)
4045 {
4046   size_t processed = 0;
4047   if (dec->frameState == ZSTD2_STATE_SIGNATURE)
4048   {
4049     Byte *dest = (Byte *)data;
4050     const size_t tempSize = dec->tempSize;
4051     while (afterDecoding_tempPos < tempSize)
4052     {
4053       if (size == 0)
4054         break;
4055       size--;
4056       *dest++ = dec->temp[afterDecoding_tempPos++];
4057       processed++;
4058     }
4059   }
4060   return processed;
4061 }
4062 
4063 
4064 void ZstdDecState_Clear(CZstdDecState *p)
4065 {
4066   memset(p, 0 , sizeof(*p));
4067 }
4068