1 /* ZstdDec.c -- Zstd Decoder
2 2024-06-18 : the code was developed by Igor Pavlov, using Zstandard format
3 specification and original zstd decoder code as reference code.
4 original zstd decoder code: Copyright (c) Facebook, Inc. All rights reserved.
5 This source code is licensed under BSD 3-Clause License.
6 */
7
8 #include "Precomp.h"
9
10 #include <string.h>
11 #include <stdlib.h>
12 // #include <stdio.h>
13
14 #include "Alloc.h"
15 #include "Xxh64.h"
16 #include "ZstdDec.h"
17 #include "CpuArch.h"
18
19 #if defined(MY_CPU_ARM64)
20 #include <arm_neon.h>
21 #endif
22
23 /* original-zstd still doesn't support window larger than 2 GiB.
24 So we also limit our decoder for 2 GiB window: */
25 #if defined(MY_CPU_64BIT) && 0 == 1
26 #define MAX_WINDOW_SIZE_LOG 41
27 #else
28 #define MAX_WINDOW_SIZE_LOG 31
29 #endif
30
31 typedef
32 #if MAX_WINDOW_SIZE_LOG < 32
33 UInt32
34 #else
35 size_t
36 #endif
37 CZstdDecOffset;
38
39 // for debug: simpler and smaller code but slow:
40 // #define Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
41
42 // #define SHOW_STAT
43 #ifdef SHOW_STAT
44 #include <stdio.h>
45 static unsigned g_Num_Blocks_Compressed = 0;
46 static unsigned g_Num_Blocks_memcpy = 0;
47 static unsigned g_Num_Wrap_memmove_Num = 0;
48 static unsigned g_Num_Wrap_memmove_Bytes = 0;
49 static unsigned g_NumSeqs_total = 0;
50 // static unsigned g_NumCopy = 0;
51 static unsigned g_NumOver = 0;
52 static unsigned g_NumOver2 = 0;
53 static unsigned g_Num_Match = 0;
54 static unsigned g_Num_Lits = 0;
55 static unsigned g_Num_LitsBig = 0;
56 static unsigned g_Num_Lit0 = 0;
57 static unsigned g_Num_Rep0 = 0;
58 static unsigned g_Num_Rep1 = 0;
59 static unsigned g_Num_Rep2 = 0;
60 static unsigned g_Num_Rep3 = 0;
61 static unsigned g_Num_Threshold_0 = 0;
62 static unsigned g_Num_Threshold_1 = 0;
63 static unsigned g_Num_Threshold_0sum = 0;
64 static unsigned g_Num_Threshold_1sum = 0;
65 #define STAT_UPDATE(v) v
66 #else
67 #define STAT_UPDATE(v)
68 #endif
69 #define STAT_INC(v) STAT_UPDATE(v++;)
70
71
72 typedef struct
73 {
74 const Byte *ptr;
75 size_t len;
76 }
77 CInBufPair;
78
79
80 #if defined(MY_CPU_ARM_OR_ARM64) || defined(MY_CPU_X86_OR_AMD64)
81 #if (defined(__clang__) && (__clang_major__ >= 6)) \
82 || (defined(__GNUC__) && (__GNUC__ >= 6))
83 // disable for debug:
84 #define Z7_ZSTD_DEC_USE_BSR
85 #elif defined(_MSC_VER) && (_MSC_VER >= 1300)
86 // #if defined(MY_CPU_ARM_OR_ARM64)
87 #if (_MSC_VER >= 1600)
88 #include <intrin.h>
89 #endif
90 // disable for debug:
91 #define Z7_ZSTD_DEC_USE_BSR
92 #endif
93 #endif
94
95 #ifdef Z7_ZSTD_DEC_USE_BSR
96 #if defined(__clang__) || defined(__GNUC__)
97 #define MY_clz(x) ((unsigned)__builtin_clz((UInt32)x))
98 #else // #if defined(_MSC_VER)
99 #ifdef MY_CPU_ARM_OR_ARM64
100 #define MY_clz _CountLeadingZeros
101 #endif // MY_CPU_X86_OR_AMD64
102 #endif // _MSC_VER
103 #elif !defined(Z7_ZSTD_DEC_USE_LOG_TABLE)
104 #define Z7_ZSTD_DEC_USE_LOG_TABLE
105 #endif
106
107
108 static
109 Z7_FORCE_INLINE
GetHighestSetBit_32_nonzero_big(UInt32 num)110 unsigned GetHighestSetBit_32_nonzero_big(UInt32 num)
111 {
112 // (num != 0)
113 #ifdef MY_clz
114 return 31 - MY_clz(num);
115 #elif defined(Z7_ZSTD_DEC_USE_BSR)
116 {
117 unsigned long zz;
118 _BitScanReverse(&zz, num);
119 return zz;
120 }
121 #else
122 {
123 int i = -1;
124 for (;;)
125 {
126 i++;
127 num >>= 1;
128 if (num == 0)
129 return (unsigned)i;
130 }
131 }
132 #endif
133 }
134
135 #ifdef Z7_ZSTD_DEC_USE_LOG_TABLE
136
137 #define R1(a) a, a
138 #define R2(a) R1(a), R1(a)
139 #define R3(a) R2(a), R2(a)
140 #define R4(a) R3(a), R3(a)
141 #define R5(a) R4(a), R4(a)
142 #define R6(a) R5(a), R5(a)
143 #define R7(a) R6(a), R6(a)
144 #define R8(a) R7(a), R7(a)
145 #define R9(a) R8(a), R8(a)
146
147 #define Z7_ZSTD_FSE_MAX_ACCURACY 9
148 // states[] values in FSE_Generate() can use (Z7_ZSTD_FSE_MAX_ACCURACY + 1) bits.
149 static const Byte k_zstd_LogTable[2 << Z7_ZSTD_FSE_MAX_ACCURACY] =
150 {
151 R1(0), R1(1), R2(2), R3(3), R4(4), R5(5), R6(6), R7(7), R8(8), R9(9)
152 };
153
154 #define GetHighestSetBit_32_nonzero_small(num) (k_zstd_LogTable[num])
155 #else
156 #define GetHighestSetBit_32_nonzero_small GetHighestSetBit_32_nonzero_big
157 #endif
158
159
160 #ifdef MY_clz
161 #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \
162 bitOffset -= (CBitCtr)(MY_clz(b) - 23);
163 #elif defined(Z7_ZSTD_DEC_USE_BSR)
164 #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \
165 { unsigned long zz; _BitScanReverse(&zz, b); bitOffset -= 8; bitOffset += zz; }
166 #else
167 #define UPDATE_BIT_OFFSET_FOR_PADDING(b, bitOffset) \
168 for (;;) { bitOffset--; if (b & 0x80) { break; } b <<= 1; }
169 #endif
170
171 #define SET_bitOffset_TO_PAD(bitOffset, src, srcLen) \
172 { \
173 unsigned lastByte = (src)[(size_t)(srcLen) - 1]; \
174 if (lastByte == 0) return SZ_ERROR_DATA; \
175 bitOffset = (CBitCtr)((srcLen) * 8); \
176 UPDATE_BIT_OFFSET_FOR_PADDING(lastByte, bitOffset) \
177 }
178
179 #ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
180
181 #define SET_bitOffset_TO_PAD_and_SET_BIT_SIZE(bitOffset, src, srcLen_res) \
182 { \
183 unsigned lastByte = (src)[(size_t)(srcLen_res) - 1]; \
184 if (lastByte == 0) return SZ_ERROR_DATA; \
185 srcLen_res *= 8; \
186 bitOffset = (CBitCtr)srcLen_res; \
187 UPDATE_BIT_OFFSET_FOR_PADDING(lastByte, bitOffset) \
188 }
189
190 #endif
191
192 /*
193 typedef Int32 CBitCtr_signed;
194 typedef Int32 CBitCtr;
195 */
196 // /*
197 typedef ptrdiff_t CBitCtr_signed;
198 typedef ptrdiff_t CBitCtr;
199 // */
200
201
202 #define MATCH_LEN_MIN 3
203 #define kBlockSizeMax (1u << 17)
204
205 // #define Z7_ZSTD_DEC_PRINT_TABLE
206
207 #ifdef Z7_ZSTD_DEC_PRINT_TABLE
208 #define NUM_OFFSET_SYMBOLS_PREDEF 29
209 #endif
210 #define NUM_OFFSET_SYMBOLS_MAX (MAX_WINDOW_SIZE_LOG + 1) // 32
211 #define NUM_LL_SYMBOLS 36
212 #define NUM_ML_SYMBOLS 53
213 #define FSE_NUM_SYMBOLS_MAX 53 // NUM_ML_SYMBOLS
214
215 // /*
216 #if !defined(MY_CPU_X86) || defined(__PIC__) || defined(MY_CPU_64BIT)
217 #define Z7_ZSTD_DEC_USE_BASES_IN_OBJECT
218 #endif
219 // */
220 // for debug:
221 // #define Z7_ZSTD_DEC_USE_BASES_LOCAL
222 // #define Z7_ZSTD_DEC_USE_BASES_IN_OBJECT
223
224 #define GLOBAL_TABLE(n) k_ ## n
225
226 #if defined(Z7_ZSTD_DEC_USE_BASES_LOCAL)
227 #define BASES_TABLE(n) a_ ## n
228 #elif defined(Z7_ZSTD_DEC_USE_BASES_IN_OBJECT)
229 #define BASES_TABLE(n) p->m_ ## n
230 #else
231 #define BASES_TABLE(n) GLOBAL_TABLE(n)
232 #endif
233
234 #define Z7_ZSTD_DEC_USE_ML_PLUS3
235
236 #if defined(Z7_ZSTD_DEC_USE_BASES_LOCAL) || \
237 defined(Z7_ZSTD_DEC_USE_BASES_IN_OBJECT)
238
239 #define SEQ_EXTRA_TABLES(n) \
240 Byte n ## SEQ_LL_EXTRA [NUM_LL_SYMBOLS]; \
241 Byte n ## SEQ_ML_EXTRA [NUM_ML_SYMBOLS]; \
242 UInt32 n ## SEQ_LL_BASES [NUM_LL_SYMBOLS]; \
243 UInt32 n ## SEQ_ML_BASES [NUM_ML_SYMBOLS]; \
244
245 #define Z7_ZSTD_DEC_USE_BASES_CALC
246
247 #ifdef Z7_ZSTD_DEC_USE_BASES_CALC
248
249 #define FILL_LOC_BASES(n, startSum) \
250 { unsigned i; UInt32 sum = startSum; \
251 for (i = 0; i != Z7_ARRAY_SIZE(GLOBAL_TABLE(n ## _EXTRA)); i++) \
252 { const unsigned a = GLOBAL_TABLE(n ## _EXTRA)[i]; \
253 BASES_TABLE(n ## _BASES)[i] = sum; \
254 /* if (sum != GLOBAL_TABLE(n ## _BASES)[i]) exit(1); */ \
255 sum += (UInt32)1 << a; \
256 BASES_TABLE(n ## _EXTRA)[i] = (Byte)a; }}
257
258 #define FILL_LOC_BASES_ALL \
259 FILL_LOC_BASES (SEQ_LL, 0) \
260 FILL_LOC_BASES (SEQ_ML, MATCH_LEN_MIN) \
261
262 #else
263 #define COPY_GLOBAL_ARR(n) \
264 memcpy(BASES_TABLE(n), GLOBAL_TABLE(n), sizeof(GLOBAL_TABLE(n)));
265 #define FILL_LOC_BASES_ALL \
266 COPY_GLOBAL_ARR (SEQ_LL_EXTRA) \
267 COPY_GLOBAL_ARR (SEQ_ML_EXTRA) \
268 COPY_GLOBAL_ARR (SEQ_LL_BASES) \
269 COPY_GLOBAL_ARR (SEQ_ML_BASES) \
270
271 #endif
272
273 #endif
274
275
276
277 /// The sequence decoding baseline and number of additional bits to read/add
278 #if !defined(Z7_ZSTD_DEC_USE_BASES_CALC)
279 static const UInt32 GLOBAL_TABLE(SEQ_LL_BASES) [NUM_LL_SYMBOLS] =
280 {
281 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
282 16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000,
283 0x2000, 0x4000, 0x8000, 0x10000
284 };
285 #endif
286
287 static const Byte GLOBAL_TABLE(SEQ_LL_EXTRA) [NUM_LL_SYMBOLS] =
288 {
289 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
290 1, 1, 1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12,
291 13, 14, 15, 16
292 };
293
294 #if !defined(Z7_ZSTD_DEC_USE_BASES_CALC)
295 static const UInt32 GLOBAL_TABLE(SEQ_ML_BASES) [NUM_ML_SYMBOLS] =
296 {
297 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
298 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
299 35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803,
300 0x1003, 0x2003, 0x4003, 0x8003, 0x10003
301 };
302 #endif
303
304 static const Byte GLOBAL_TABLE(SEQ_ML_EXTRA) [NUM_ML_SYMBOLS] =
305 {
306 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
307 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
308 1, 1, 1, 1, 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11,
309 12, 13, 14, 15, 16
310 };
311
312
313 #ifdef Z7_ZSTD_DEC_PRINT_TABLE
314
315 static const Int16 SEQ_LL_PREDEF_DIST [NUM_LL_SYMBOLS] =
316 {
317 4, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
318 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 1, 1, 1, 1, 1,
319 -1,-1,-1,-1
320 };
321 static const Int16 SEQ_OFFSET_PREDEF_DIST [NUM_OFFSET_SYMBOLS_PREDEF] =
322 {
323 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
324 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,-1,-1,-1
325 };
326 static const Int16 SEQ_ML_PREDEF_DIST [NUM_ML_SYMBOLS] =
327 {
328 1, 4, 3, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1,
329 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
330 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,-1,-1,
331 -1,-1,-1,-1,-1
332 };
333
334 #endif
335
336 // typedef int FastInt;
337 // typedef Int32 FastInt32;
338 typedef unsigned FastInt;
339 typedef UInt32 FastInt32;
340 typedef FastInt32 CFseRecord;
341
342
343 #define FSE_REC_LEN_OFFSET 8
344 #define FSE_REC_STATE_OFFSET 16
345 #define GET_FSE_REC_SYM(st) ((Byte)(st))
346 #define GET_FSE_REC_LEN(st) ((Byte)((st) >> FSE_REC_LEN_OFFSET))
347 #define GET_FSE_REC_STATE(st) ((st) >> FSE_REC_STATE_OFFSET)
348
349 // #define FSE_REC_SYM_MASK (0xff)
350 // #define GET_FSE_REC_SYM(st) (st & FSE_REC_SYM_MASK)
351
352 #define W_BASE(state, len, sym) \
353 (((UInt32)state << (4 + FSE_REC_STATE_OFFSET)) + \
354 (len << FSE_REC_LEN_OFFSET) + (sym))
355 #define W(state, len, sym) W_BASE(state, len, sym)
356 static const CFseRecord k_PredefRecords_LL[1 << 6] = {
357 W(0,4, 0),W(1,4, 0),W(2,5, 1),W(0,5, 3),W(0,5, 4),W(0,5, 6),W(0,5, 7),W(0,5, 9),
358 W(0,5,10),W(0,5,12),W(0,6,14),W(0,5,16),W(0,5,18),W(0,5,19),W(0,5,21),W(0,5,22),
359 W(0,5,24),W(2,5,25),W(0,5,26),W(0,6,27),W(0,6,29),W(0,6,31),W(2,4, 0),W(0,4, 1),
360 W(0,5, 2),W(2,5, 4),W(0,5, 5),W(2,5, 7),W(0,5, 8),W(2,5,10),W(0,5,11),W(0,6,13),
361 W(2,5,16),W(0,5,17),W(2,5,19),W(0,5,20),W(2,5,22),W(0,5,23),W(0,4,25),W(1,4,25),
362 W(2,5,26),W(0,6,28),W(0,6,30),W(3,4, 0),W(1,4, 1),W(2,5, 2),W(2,5, 3),W(2,5, 5),
363 W(2,5, 6),W(2,5, 8),W(2,5, 9),W(2,5,11),W(2,5,12),W(0,6,15),W(2,5,17),W(2,5,18),
364 W(2,5,20),W(2,5,21),W(2,5,23),W(2,5,24),W(0,6,35),W(0,6,34),W(0,6,33),W(0,6,32)
365 };
366 static const CFseRecord k_PredefRecords_OF[1 << 5] = {
367 W(0,5, 0),W(0,4, 6),W(0,5, 9),W(0,5,15),W(0,5,21),W(0,5, 3),W(0,4, 7),W(0,5,12),
368 W(0,5,18),W(0,5,23),W(0,5, 5),W(0,4, 8),W(0,5,14),W(0,5,20),W(0,5, 2),W(1,4, 7),
369 W(0,5,11),W(0,5,17),W(0,5,22),W(0,5, 4),W(1,4, 8),W(0,5,13),W(0,5,19),W(0,5, 1),
370 W(1,4, 6),W(0,5,10),W(0,5,16),W(0,5,28),W(0,5,27),W(0,5,26),W(0,5,25),W(0,5,24)
371 };
372 #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
373 #undef W
374 #define W(state, len, sym) W_BASE(state, len, (sym + MATCH_LEN_MIN))
375 #endif
376 static const CFseRecord k_PredefRecords_ML[1 << 6] = {
377 W(0,6, 0),W(0,4, 1),W(2,5, 2),W(0,5, 3),W(0,5, 5),W(0,5, 6),W(0,5, 8),W(0,6,10),
378 W(0,6,13),W(0,6,16),W(0,6,19),W(0,6,22),W(0,6,25),W(0,6,28),W(0,6,31),W(0,6,33),
379 W(0,6,35),W(0,6,37),W(0,6,39),W(0,6,41),W(0,6,43),W(0,6,45),W(1,4, 1),W(0,4, 2),
380 W(2,5, 3),W(0,5, 4),W(2,5, 6),W(0,5, 7),W(0,6, 9),W(0,6,12),W(0,6,15),W(0,6,18),
381 W(0,6,21),W(0,6,24),W(0,6,27),W(0,6,30),W(0,6,32),W(0,6,34),W(0,6,36),W(0,6,38),
382 W(0,6,40),W(0,6,42),W(0,6,44),W(2,4, 1),W(3,4, 1),W(1,4, 2),W(2,5, 4),W(2,5, 5),
383 W(2,5, 7),W(2,5, 8),W(0,6,11),W(0,6,14),W(0,6,17),W(0,6,20),W(0,6,23),W(0,6,26),
384 W(0,6,29),W(0,6,52),W(0,6,51),W(0,6,50),W(0,6,49),W(0,6,48),W(0,6,47),W(0,6,46)
385 };
386
387
388 // sum of freqs[] must be correct
389 // (numSyms != 0)
390 // (accuracy >= 5)
391 static
392 Z7_NO_INLINE
393 // Z7_FORCE_INLINE
FSE_Generate(CFseRecord * table,const Int16 * const freqs,const size_t numSyms,const unsigned accuracy,UInt32 delta)394 void FSE_Generate(CFseRecord *table,
395 const Int16 *const freqs, const size_t numSyms,
396 const unsigned accuracy, UInt32 delta)
397 {
398 size_t size = (size_t)1 << accuracy;
399 // max value in states[x] is ((1 << accuracy) * 2)
400 UInt16 states[FSE_NUM_SYMBOLS_MAX];
401 {
402 /* Symbols with "less than 1" probability get a single cell,
403 starting from the end of the table.
404 These symbols define a full state reset, reading (accuracy) bits. */
405 size_t threshold = size;
406 {
407 size_t s = 0;
408 do
409 if (freqs[s] == -1)
410 {
411 table[--threshold] = (CFseRecord)s;
412 states[s] = 1;
413 }
414 while (++s != numSyms);
415 }
416
417 #ifdef SHOW_STAT
418 if (threshold == size)
419 {
420 STAT_INC(g_Num_Threshold_0)
421 STAT_UPDATE(g_Num_Threshold_0sum += (unsigned)size;)
422 }
423 else
424 {
425 STAT_INC(g_Num_Threshold_1)
426 STAT_UPDATE(g_Num_Threshold_1sum += (unsigned)size;)
427 }
428 #endif
429
430 // { unsigned uuu; for (uuu = 0; uuu < 400; uuu++)
431 {
432 // Each (symbol) gets freqs[symbol] cells.
433 // Cell allocation is spread, not linear.
434 const size_t step = (size >> 1) + (size >> 3) + 3;
435 size_t pos = 0;
436 // const unsigned mask = size - 1;
437 /*
438 if (threshold == size)
439 {
440 size_t s = 0;
441 size--;
442 do
443 {
444 int freq = freqs[s];
445 if (freq <= 0)
446 continue;
447 states[s] = (UInt16)freq;
448 do
449 {
450 table[pos] (CFseRecord)s;
451 pos = (pos + step) & size; // & mask;
452 }
453 while (--freq);
454 }
455 while (++s != numSyms);
456 }
457 else
458 */
459 {
460 size_t s = 0;
461 size--;
462 do
463 {
464 int freq = freqs[s];
465 if (freq <= 0)
466 continue;
467 states[s] = (UInt16)freq;
468 do
469 {
470 table[pos] = (CFseRecord)s;
471 // we skip position, if it's already occupied by a "less than 1" probability symbol.
472 // (step) is coprime to table size, so the cycle will visit each position exactly once
473 do
474 pos = (pos + step) & size; // & mask;
475 while (pos >= threshold);
476 }
477 while (--freq);
478 }
479 while (++s != numSyms);
480 }
481 size++;
482 // (pos != 0) is unexpected case that means that freqs[] are not correct.
483 // so it's some failure in code (for example, incorrect predefined freq[] table)
484 // if (pos != 0) return SZ_ERROR_FAIL;
485 }
486 // }
487 }
488 {
489 const CFseRecord * const limit = table + size;
490 delta = ((UInt32)size << FSE_REC_STATE_OFFSET) - delta;
491 /* State increases by symbol over time, decreasing number of bits.
492 Baseline increases until the bit threshold is passed, at which point it resets to 0 */
493 do
494 {
495 #define TABLE_ITER(a) \
496 { \
497 const FastInt sym = (FastInt)table[a]; \
498 const unsigned nextState = states[sym]; \
499 unsigned nb; \
500 states[sym] = (UInt16)(nextState + 1); \
501 nb = accuracy - GetHighestSetBit_32_nonzero_small(nextState); \
502 table[a] = (CFseRecord)(sym - delta \
503 + ((UInt32)nb << FSE_REC_LEN_OFFSET) \
504 + ((UInt32)nextState << FSE_REC_STATE_OFFSET << nb)); \
505 }
506 TABLE_ITER(0)
507 TABLE_ITER(1)
508 table += 2;
509 }
510 while (table != limit);
511 }
512 }
513
514
515 #ifdef Z7_ZSTD_DEC_PRINT_TABLE
516
Print_Predef(unsigned predefAccuracy,const unsigned numSymsPredef,const Int16 * const predefFreqs,const CFseRecord * checkTable)517 static void Print_Predef(unsigned predefAccuracy,
518 const unsigned numSymsPredef,
519 const Int16 * const predefFreqs,
520 const CFseRecord *checkTable)
521 {
522 CFseRecord table[1 << 6];
523 unsigned i;
524 FSE_Generate(table, predefFreqs, numSymsPredef, predefAccuracy,
525 #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
526 numSymsPredef == NUM_ML_SYMBOLS ? MATCH_LEN_MIN :
527 #endif
528 0
529 );
530 if (memcmp(table, checkTable, sizeof(UInt32) << predefAccuracy) != 0)
531 exit(1);
532 for (i = 0; i < (1u << predefAccuracy); i++)
533 {
534 const UInt32 v = table[i];
535 const unsigned state = (unsigned)(GET_FSE_REC_STATE(v));
536 if (state & 0xf)
537 exit(1);
538 if (i != 0)
539 {
540 printf(",");
541 if (i % 8 == 0)
542 printf("\n");
543 }
544 printf("W(%d,%d,%2d)",
545 (unsigned)(state >> 4),
546 (unsigned)((v >> FSE_REC_LEN_OFFSET) & 0xff),
547 (unsigned)GET_FSE_REC_SYM(v));
548 }
549 printf("\n\n");
550 }
551
552 #endif
553
554
555 #define GET16(dest, p) { const Byte *ptr = p; dest = GetUi16(ptr); }
556 #define GET32(dest, p) { const Byte *ptr = p; dest = GetUi32(ptr); }
557
558 // (1 <= numBits <= 9)
559 #define FORWARD_READ_BITS(destVal, numBits, mask) \
560 { const CBitCtr_signed bos3 = (bitOffset) >> 3; \
561 if (bos3 >= 0) return SZ_ERROR_DATA; \
562 GET16(destVal, src + bos3) \
563 destVal >>= (bitOffset) & 7; \
564 bitOffset += (CBitCtr_signed)(numBits); \
565 mask = (1u << (numBits)) - 1; \
566 destVal &= mask; \
567 }
568
569 #define FORWARD_READ_1BIT(destVal) \
570 { const CBitCtr_signed bos3 = (bitOffset) >> 3; \
571 if (bos3 >= 0) return SZ_ERROR_DATA; \
572 destVal = *(src + bos3); \
573 destVal >>= (bitOffset) & 7; \
574 (bitOffset)++; \
575 destVal &= 1; \
576 }
577
578
579 // in: (accuracyMax <= 9)
580 // at least 2 bytes will be processed from (in) stream.
581 // at return: (in->len > 0)
582 static
583 Z7_NO_INLINE
FSE_DecodeHeader(CFseRecord * const table,CInBufPair * const in,const unsigned accuracyMax,Byte * const accuracyRes,unsigned numSymbolsMax)584 SRes FSE_DecodeHeader(CFseRecord *const table,
585 CInBufPair *const in,
586 const unsigned accuracyMax,
587 Byte *const accuracyRes,
588 unsigned numSymbolsMax)
589 {
590 unsigned accuracy;
591 unsigned remain1;
592 unsigned syms;
593 Int16 freqs[FSE_NUM_SYMBOLS_MAX + 3]; // +3 for overwrite (repeat)
594 const Byte *src = in->ptr;
595 CBitCtr_signed bitOffset = (CBitCtr_signed)in->len - 1;
596 if (bitOffset <= 0)
597 return SZ_ERROR_DATA;
598 accuracy = *src & 0xf;
599 accuracy += 5;
600 if (accuracy > accuracyMax)
601 return SZ_ERROR_DATA;
602 *accuracyRes = (Byte)accuracy;
603 remain1 = (1u << accuracy) + 1; // (it's remain_freqs_sum + 1)
604 syms = 0;
605 src += bitOffset; // src points to last byte
606 bitOffset = 4 - (bitOffset << 3);
607
608 for (;;)
609 {
610 // (2 <= remain1)
611 const unsigned bits = GetHighestSetBit_32_nonzero_small((unsigned)remain1);
612 // (1 <= bits <= accuracy)
613 unsigned val; // it must be unsigned or int
614 unsigned mask;
615 FORWARD_READ_BITS(val, bits, mask)
616 {
617 const unsigned val2 = remain1 + val - mask;
618 if (val2 > mask)
619 {
620 unsigned bit;
621 FORWARD_READ_1BIT(bit)
622 if (bit)
623 val = val2;
624 }
625 }
626 {
627 // (remain1 >= 2)
628 // (0 <= (int)val <= remain1)
629 val = (unsigned)((int)val - 1);
630 // val now is "probability" of symbol
631 // (probability == -1) means "less than 1" frequency.
632 // (-1 <= (int)val <= remain1 - 1)
633 freqs[syms++] = (Int16)(int)val;
634 if (val != 0)
635 {
636 remain1 -= (int)val < 0 ? 1u : (unsigned)val;
637 // remain1 -= val;
638 // val >>= (sizeof(val) * 8 - 2);
639 // remain1 -= val & 2;
640 // freqs[syms++] = (Int16)(int)val;
641 // syms++;
642 if (remain1 == 1)
643 break;
644 if (syms >= FSE_NUM_SYMBOLS_MAX)
645 return SZ_ERROR_DATA;
646 }
647 else // if (val == 0)
648 {
649 // freqs[syms++] = 0;
650 // syms++;
651 for (;;)
652 {
653 unsigned repeat;
654 FORWARD_READ_BITS(repeat, 2, mask)
655 freqs[syms ] = 0;
656 freqs[syms + 1] = 0;
657 freqs[syms + 2] = 0;
658 syms += repeat;
659 if (syms >= FSE_NUM_SYMBOLS_MAX)
660 return SZ_ERROR_DATA;
661 if (repeat != 3)
662 break;
663 }
664 }
665 }
666 }
667
668 if (syms > numSymbolsMax)
669 return SZ_ERROR_DATA;
670 bitOffset += 7;
671 bitOffset >>= 3;
672 if (bitOffset > 0)
673 return SZ_ERROR_DATA;
674 in->ptr = src + bitOffset;
675 in->len = (size_t)(1 - bitOffset);
676 {
677 // unsigned uuu; for (uuu = 0; uuu < 50; uuu++)
678 FSE_Generate(table, freqs, syms, accuracy,
679 #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
680 numSymbolsMax == NUM_ML_SYMBOLS ? MATCH_LEN_MIN :
681 #endif
682 0
683 );
684 }
685 return SZ_OK;
686 }
687
688
689 // ---------- HUFFMAN ----------
690
691 #define HUF_MAX_BITS 12
692 #define HUF_MAX_SYMBS 256
693 #define HUF_DUMMY_SIZE (128 + 8 * 2) // it must multiple of 8
694 // #define HUF_DUMMY_SIZE 0
695 #define HUF_TABLE_SIZE ((2 << HUF_MAX_BITS) + HUF_DUMMY_SIZE)
696 #define HUF_GET_SYMBOLS(table) ((table) + (1 << HUF_MAX_BITS) + HUF_DUMMY_SIZE)
697 // #define HUF_GET_LENS(table) (table)
698
699 typedef struct
700 {
701 // Byte table[HUF_TABLE_SIZE];
702 UInt64 table64[HUF_TABLE_SIZE / sizeof(UInt64)];
703 }
704 CZstdDecHufTable;
705
706 /*
707 Input:
708 numSyms != 0
709 (bits) array size must be aligned for 2
710 if (numSyms & 1), then bits[numSyms] == 0,
711 Huffman tree must be correct before Huf_Build() call:
712 (sum (1/2^bits[i]) == 1).
713 && (bits[i] <= HUF_MAX_BITS)
714 */
715 static
716 Z7_FORCE_INLINE
Huf_Build(Byte * const table,const Byte * bits,const unsigned numSyms)717 void Huf_Build(Byte * const table,
718 const Byte *bits, const unsigned numSyms)
719 {
720 unsigned counts0[HUF_MAX_BITS + 1];
721 unsigned counts1[HUF_MAX_BITS + 1];
722 const Byte * const bitsEnd = bits + numSyms;
723 // /*
724 {
725 unsigned t;
726 for (t = 0; t < Z7_ARRAY_SIZE(counts0); t++) counts0[t] = 0;
727 for (t = 0; t < Z7_ARRAY_SIZE(counts1); t++) counts1[t] = 0;
728 }
729 // */
730 // memset(counts0, 0, sizeof(counts0));
731 // memset(counts1, 0, sizeof(counts1));
732 {
733 const Byte *bits2 = bits;
734 // we access additional bits[symbol] if (numSyms & 1)
735 do
736 {
737 counts0[bits2[0]]++;
738 counts1[bits2[1]]++;
739 }
740 while ((bits2 += 2) < bitsEnd);
741 }
742 {
743 unsigned r = 0;
744 unsigned i = HUF_MAX_BITS;
745 // Byte *lens = HUF_GET_LENS(symbols);
746 do
747 {
748 const unsigned num = (counts0[i] + counts1[i]) << (HUF_MAX_BITS - i);
749 counts0[i] = r;
750 if (num)
751 {
752 Byte *lens = &table[r];
753 r += num;
754 memset(lens, (int)i, num);
755 }
756 }
757 while (--i);
758 counts0[0] = 0; // for speculated loads
759 // no need for check:
760 // if (r != (UInt32)1 << HUF_MAX_BITS) exit(0);
761 }
762 {
763 #ifdef MY_CPU_64BIT
764 UInt64
765 #else
766 UInt32
767 #endif
768 v = 0;
769 Byte *symbols = HUF_GET_SYMBOLS(table);
770 do
771 {
772 const unsigned nb = *bits++;
773 if (nb)
774 {
775 const unsigned code = counts0[nb];
776 const unsigned num = (1u << HUF_MAX_BITS) >> nb;
777 counts0[nb] = code + num;
778 // memset(&symbols[code], i, num);
779 // /*
780 {
781 Byte *s2 = &symbols[code];
782 if (num <= 2)
783 {
784 s2[0] = (Byte)v;
785 s2[(size_t)num - 1] = (Byte)v;
786 }
787 else if (num <= 8)
788 {
789 *(UInt32 *)(void *)s2 = (UInt32)v;
790 *(UInt32 *)(void *)(s2 + (size_t)num - 4) = (UInt32)v;
791 }
792 else
793 {
794 #ifdef MY_CPU_64BIT
795 UInt64 *s = (UInt64 *)(void *)s2;
796 const UInt64 *lim = (UInt64 *)(void *)(s2 + num);
797 do
798 {
799 s[0] = v; s[1] = v; s += 2;
800 }
801 while (s != lim);
802 #else
803 UInt32 *s = (UInt32 *)(void *)s2;
804 const UInt32 *lim = (const UInt32 *)(const void *)(s2 + num);
805 do
806 {
807 s[0] = v; s[1] = v; s += 2;
808 s[0] = v; s[1] = v; s += 2;
809 }
810 while (s != lim);
811 #endif
812 }
813 }
814 // */
815 }
816 v +=
817 #ifdef MY_CPU_64BIT
818 0x0101010101010101;
819 #else
820 0x01010101;
821 #endif
822 }
823 while (bits != bitsEnd);
824 }
825 }
826
827
828
829 // how many bytes (src) was moved back from original value.
830 // we need (HUF_SRC_OFFSET == 3) for optimized 32-bit memory access
831 #define HUF_SRC_OFFSET 3
832
833 // v <<= 8 - (bitOffset & 7) + numBits;
834 // v >>= 32 - HUF_MAX_BITS;
835 #define HUF_GET_STATE(v, bitOffset, numBits) \
836 GET32(v, src + (HUF_SRC_OFFSET - 3) + ((CBitCtr_signed)bitOffset >> 3)) \
837 v >>= 32 - HUF_MAX_BITS - 8 + ((unsigned)bitOffset & 7) - numBits; \
838 v &= (1u << HUF_MAX_BITS) - 1; \
839
840
841 #ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
842 #if defined(MY_CPU_AMD64) && defined(_MSC_VER) && _MSC_VER == 1400 \
843 || !defined(MY_CPU_X86_OR_AMD64) \
844 // || 1 == 1 /* for debug : to force STREAM4_PRELOAD mode */
845 // we need big number (>=16) of registers for PRELOAD4
846 #define Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4
847 // #define Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2 // for debug
848 #endif
849 #endif
850
851 // for debug: simpler and smaller code but slow:
852 // #define Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE
853
854 #if defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE) || \
855 !defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS)
856
857 #define HUF_DECODE(bitOffset, dest) \
858 { \
859 UInt32 v; \
860 HUF_GET_STATE(v, bitOffset, 0) \
861 bitOffset -= table[v]; \
862 *(dest) = symbols[v]; \
863 if ((CBitCtr_signed)bitOffset < 0) return SZ_ERROR_DATA; \
864 }
865
866 #endif
867
868 #if !defined(Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE) || \
869 defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4) || \
870 defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2) \
871
872 #define HUF_DECODE_2_INIT(v, bitOffset) \
873 HUF_GET_STATE(v, bitOffset, 0)
874
875 #define HUF_DECODE_2(v, bitOffset, dest) \
876 { \
877 unsigned numBits; \
878 numBits = table[v]; \
879 *(dest) = symbols[v]; \
880 HUF_GET_STATE(v, bitOffset, numBits) \
881 bitOffset -= (CBitCtr)numBits; \
882 if ((CBitCtr_signed)bitOffset < 0) return SZ_ERROR_DATA; \
883 }
884
885 #endif
886
887
888 // src == ptr - HUF_SRC_OFFSET
889 // we are allowed to access 3 bytes before start of input buffer
890 static
891 Z7_NO_INLINE
Huf_Decompress_1stream(const Byte * const table,const Byte * src,const size_t srcLen,Byte * dest,const size_t destLen)892 SRes Huf_Decompress_1stream(const Byte * const table,
893 const Byte *src, const size_t srcLen,
894 Byte *dest, const size_t destLen)
895 {
896 CBitCtr bitOffset;
897 if (srcLen == 0)
898 return SZ_ERROR_DATA;
899 SET_bitOffset_TO_PAD (bitOffset, src + HUF_SRC_OFFSET, srcLen)
900 if (destLen)
901 {
902 const Byte *symbols = HUF_GET_SYMBOLS(table);
903 const Byte *destLim = dest + destLen;
904 #ifdef Z7_ZSTD_DEC_USE_HUF_STREAM1_SIMPLE
905 {
906 do
907 {
908 HUF_DECODE (bitOffset, dest)
909 }
910 while (++dest != destLim);
911 }
912 #else
913 {
914 UInt32 v;
915 HUF_DECODE_2_INIT (v, bitOffset)
916 do
917 {
918 HUF_DECODE_2 (v, bitOffset, dest)
919 }
920 while (++dest != destLim);
921 }
922 #endif
923 }
924 return bitOffset == 0 ? SZ_OK : SZ_ERROR_DATA;
925 }
926
927
928 // for debug : it reduces register pressure : by array copy can be slow :
929 // #define Z7_ZSTD_DEC_USE_HUF_LOCAL
930
931 // src == ptr + (6 - HUF_SRC_OFFSET)
932 // srcLen >= 10
933 // we are allowed to access 3 bytes before start of input buffer
934 static
935 Z7_NO_INLINE
Huf_Decompress_4stream(const Byte * const table2,const Byte * src,size_t srcLen,Byte * dest,size_t destLen)936 SRes Huf_Decompress_4stream(const Byte * const
937 #ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL
938 table2,
939 #else
940 table,
941 #endif
942 const Byte *src, size_t srcLen,
943 Byte *dest, size_t destLen)
944 {
945 #ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL
946 Byte table[HUF_TABLE_SIZE];
947 #endif
948 UInt32 sizes[3];
949 const size_t delta = (destLen + 3) / 4;
950 if ((sizes[0] = GetUi16(src + (0 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA;
951 if ((sizes[1] = GetUi16(src + (2 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA;
952 sizes[1] += sizes[0];
953 if ((sizes[2] = GetUi16(src + (4 + HUF_SRC_OFFSET - 6))) == 0) return SZ_ERROR_DATA;
954 sizes[2] += sizes[1];
955 srcLen -= 6;
956 if (srcLen <= sizes[2])
957 return SZ_ERROR_DATA;
958
959 #ifdef Z7_ZSTD_DEC_USE_HUF_LOCAL
960 {
961 // unsigned i = 0; for(; i < 1000; i++)
962 memcpy(table, table2, HUF_TABLE_SIZE);
963 }
964 #endif
965
966 #ifndef Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
967 {
968 CBitCtr bitOffset_0,
969 bitOffset_1,
970 bitOffset_2,
971 bitOffset_3;
972 {
973 SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_0, src + HUF_SRC_OFFSET, sizes[0])
974 SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_1, src + HUF_SRC_OFFSET, sizes[1])
975 SET_bitOffset_TO_PAD_and_SET_BIT_SIZE (bitOffset_2, src + HUF_SRC_OFFSET, sizes[2])
976 SET_bitOffset_TO_PAD (bitOffset_3, src + HUF_SRC_OFFSET, srcLen)
977 }
978 {
979 const Byte * const symbols = HUF_GET_SYMBOLS(table);
980 Byte *destLim = dest + destLen - delta * 3;
981
982 if (dest != destLim)
983 #ifdef Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD4
984 {
985 UInt32 v_0, v_1, v_2, v_3;
986 HUF_DECODE_2_INIT (v_0, bitOffset_0)
987 HUF_DECODE_2_INIT (v_1, bitOffset_1)
988 HUF_DECODE_2_INIT (v_2, bitOffset_2)
989 HUF_DECODE_2_INIT (v_3, bitOffset_3)
990 // #define HUF_DELTA (1 << 17) / 4
991 do
992 {
993 HUF_DECODE_2 (v_3, bitOffset_3, dest + delta * 3)
994 HUF_DECODE_2 (v_2, bitOffset_2, dest + delta * 2)
995 HUF_DECODE_2 (v_1, bitOffset_1, dest + delta)
996 HUF_DECODE_2 (v_0, bitOffset_0, dest)
997 }
998 while (++dest != destLim);
999 /*
1000 {// unsigned y = 0; for (;y < 1; y++)
1001 {
1002 const size_t num = destLen - delta * 3;
1003 Byte *orig = dest - num;
1004 memmove (orig + delta , orig + HUF_DELTA, num);
1005 memmove (orig + delta * 2, orig + HUF_DELTA * 2, num);
1006 memmove (orig + delta * 3, orig + HUF_DELTA * 3, num);
1007 }}
1008 */
1009 }
1010 #elif defined(Z7_ZSTD_DEC_USE_HUF_STREAM4_PRELOAD2)
1011 {
1012 UInt32 v_0, v_1, v_2, v_3;
1013 HUF_DECODE_2_INIT (v_0, bitOffset_0)
1014 HUF_DECODE_2_INIT (v_1, bitOffset_1)
1015 do
1016 {
1017 HUF_DECODE_2 (v_0, bitOffset_0, dest)
1018 HUF_DECODE_2 (v_1, bitOffset_1, dest + delta)
1019 }
1020 while (++dest != destLim);
1021 dest = destLim - (destLen - delta * 3);
1022 dest += delta * 2;
1023 destLim += delta * 2;
1024 HUF_DECODE_2_INIT (v_2, bitOffset_2)
1025 HUF_DECODE_2_INIT (v_3, bitOffset_3)
1026 do
1027 {
1028 HUF_DECODE_2 (v_2, bitOffset_2, dest)
1029 HUF_DECODE_2 (v_3, bitOffset_3, dest + delta)
1030 }
1031 while (++dest != destLim);
1032 dest -= delta * 2;
1033 destLim -= delta * 2;
1034 }
1035 #else
1036 {
1037 do
1038 {
1039 HUF_DECODE (bitOffset_3, dest + delta * 3)
1040 HUF_DECODE (bitOffset_2, dest + delta * 2)
1041 HUF_DECODE (bitOffset_1, dest + delta)
1042 HUF_DECODE (bitOffset_0, dest)
1043 }
1044 while (++dest != destLim);
1045 }
1046 #endif
1047
1048 if (bitOffset_3 != (CBitCtr)sizes[2])
1049 return SZ_ERROR_DATA;
1050 if (destLen &= 3)
1051 {
1052 destLim = dest + 4 - destLen;
1053 do
1054 {
1055 HUF_DECODE (bitOffset_2, dest + delta * 2)
1056 HUF_DECODE (bitOffset_1, dest + delta)
1057 HUF_DECODE (bitOffset_0, dest)
1058 }
1059 while (++dest != destLim);
1060 }
1061 if ( bitOffset_0 != 0
1062 || bitOffset_1 != (CBitCtr)sizes[0]
1063 || bitOffset_2 != (CBitCtr)sizes[1])
1064 return SZ_ERROR_DATA;
1065 }
1066 }
1067 #else // Z7_ZSTD_DEC_USE_HUF_STREAM1_ALWAYS
1068 {
1069 unsigned i;
1070 for (i = 0; i < 4; i++)
1071 {
1072 size_t d = destLen;
1073 size_t size = srcLen;
1074 if (i != 3)
1075 {
1076 d = delta;
1077 size = sizes[i];
1078 }
1079 if (i != 0)
1080 size -= sizes[i - 1];
1081 destLen -= d;
1082 RINOK(Huf_Decompress_1stream(table, src, size, dest, d))
1083 dest += d;
1084 src += size;
1085 }
1086 }
1087 #endif
1088
1089 return SZ_OK;
1090 }
1091
1092
1093
1094 // (in->len != 0)
1095 // we are allowed to access in->ptr[-3]
1096 // at least 2 bytes in (in->ptr) will be processed
Huf_DecodeTable(CZstdDecHufTable * const p,CInBufPair * const in)1097 static SRes Huf_DecodeTable(CZstdDecHufTable *const p, CInBufPair *const in)
1098 {
1099 Byte weights[HUF_MAX_SYMBS + 1]; // +1 for extra write for loop unroll
1100 unsigned numSyms;
1101 const unsigned header = *(in->ptr)++;
1102 in->len--;
1103 // memset(weights, 0, sizeof(weights));
1104 if (header >= 128)
1105 {
1106 // direct representation: 4 bits field (0-15) per weight
1107 numSyms = header - 127;
1108 // numSyms != 0
1109 {
1110 const size_t numBytes = (numSyms + 1) / 2;
1111 const Byte *const ws = in->ptr;
1112 size_t i = 0;
1113 if (in->len < numBytes)
1114 return SZ_ERROR_DATA;
1115 in->ptr += numBytes;
1116 in->len -= numBytes;
1117 do
1118 {
1119 const unsigned b = ws[i];
1120 weights[i * 2 ] = (Byte)(b >> 4);
1121 weights[i * 2 + 1] = (Byte)(b & 0xf);
1122 }
1123 while (++i != numBytes);
1124 /* 7ZIP: we can restore correct zero value for weights[numSyms],
1125 if we want to use zero values starting from numSyms in code below. */
1126 // weights[numSyms] = 0;
1127 }
1128 }
1129 else
1130 {
1131 #define MAX_ACCURACY_LOG_FOR_WEIGHTS 6
1132 CFseRecord table[1 << MAX_ACCURACY_LOG_FOR_WEIGHTS];
1133
1134 Byte accuracy;
1135 const Byte *src;
1136 size_t srcLen;
1137 if (in->len < header)
1138 return SZ_ERROR_DATA;
1139 {
1140 CInBufPair fse_stream;
1141 fse_stream.len = header;
1142 fse_stream.ptr = in->ptr;
1143 in->ptr += header;
1144 in->len -= header;
1145 RINOK(FSE_DecodeHeader(table, &fse_stream,
1146 MAX_ACCURACY_LOG_FOR_WEIGHTS,
1147 &accuracy,
1148 16 // num weight symbols max (max-symbol is 15)
1149 ))
1150 // at least 2 bytes were processed in fse_stream.
1151 // (srcLen > 0) after FSE_DecodeHeader()
1152 // if (srcLen == 0) return SZ_ERROR_DATA;
1153 src = fse_stream.ptr;
1154 srcLen = fse_stream.len;
1155 }
1156 // we are allowed to access src[-5]
1157 {
1158 // unsigned yyy = 200; do {
1159 CBitCtr bitOffset;
1160 FastInt32 state1, state2;
1161 SET_bitOffset_TO_PAD (bitOffset, src, srcLen)
1162 state1 = accuracy;
1163 src -= state1 >> 2; // src -= 1; // for GET16() optimization
1164 state1 <<= FSE_REC_LEN_OFFSET;
1165 state2 = state1;
1166 numSyms = 0;
1167 for (;;)
1168 {
1169 #define FSE_WEIGHT_DECODE(st) \
1170 { \
1171 const unsigned bits = GET_FSE_REC_LEN(st); \
1172 FastInt r; \
1173 GET16(r, src + (bitOffset >> 3)) \
1174 r >>= (unsigned)bitOffset & 7; \
1175 if ((CBitCtr_signed)(bitOffset -= (CBitCtr)bits) < 0) \
1176 { if (bitOffset + (CBitCtr)bits != 0) \
1177 return SZ_ERROR_DATA; \
1178 break; } \
1179 r &= 0xff; \
1180 r >>= 8 - bits; \
1181 st = table[GET_FSE_REC_STATE(st) + r]; \
1182 weights[numSyms++] = (Byte)GET_FSE_REC_SYM(st); \
1183 }
1184 FSE_WEIGHT_DECODE (state1)
1185 FSE_WEIGHT_DECODE (state2)
1186 if (numSyms == HUF_MAX_SYMBS)
1187 return SZ_ERROR_DATA;
1188 }
1189 // src += (unsigned)accuracy >> 2; } while (--yyy);
1190 }
1191 }
1192
1193 // Build using weights:
1194 {
1195 UInt32 sum = 0;
1196 {
1197 // numSyms >= 1
1198 unsigned i = 0;
1199 weights[numSyms] = 0;
1200 do
1201 {
1202 sum += ((UInt32)1 << weights[i ]) & ~(UInt32)1;
1203 sum += ((UInt32)1 << weights[i + 1]) & ~(UInt32)1;
1204 i += 2;
1205 }
1206 while (i < numSyms);
1207 if (sum == 0)
1208 return SZ_ERROR_DATA;
1209 }
1210 {
1211 const unsigned maxBits = GetHighestSetBit_32_nonzero_big(sum) + 1;
1212 {
1213 const UInt32 left = ((UInt32)1 << maxBits) - sum;
1214 // (left != 0)
1215 // (left) must be power of 2 in correct stream
1216 if (left & (left - 1))
1217 return SZ_ERROR_DATA;
1218 weights[numSyms++] = (Byte)GetHighestSetBit_32_nonzero_big(left);
1219 }
1220 // if (numSyms & 1)
1221 weights[numSyms] = 0; // for loop unroll
1222 // numSyms >= 2
1223 {
1224 unsigned i = 0;
1225 do
1226 {
1227 /*
1228 #define WEIGHT_ITER(a) \
1229 { unsigned w = weights[i + (a)]; \
1230 const unsigned t = maxBits - w; \
1231 w = w ? t: w; \
1232 if (w > HUF_MAX_BITS) return SZ_ERROR_DATA; \
1233 weights[i + (a)] = (Byte)w; }
1234 */
1235 // /*
1236 #define WEIGHT_ITER(a) \
1237 { unsigned w = weights[i + (a)]; \
1238 if (w) { \
1239 w = maxBits - w; \
1240 if (w > HUF_MAX_BITS) return SZ_ERROR_DATA; \
1241 weights[i + (a)] = (Byte)w; }}
1242 // */
1243 WEIGHT_ITER(0)
1244 // WEIGHT_ITER(1)
1245 // i += 2;
1246 }
1247 while (++i != numSyms);
1248 }
1249 }
1250 }
1251 {
1252 // unsigned yyy; for (yyy = 0; yyy < 100; yyy++)
1253 Huf_Build((Byte *)(void *)p->table64, weights, numSyms);
1254 }
1255 return SZ_OK;
1256 }
1257
1258
1259 typedef enum
1260 {
1261 k_SeqMode_Predef = 0,
1262 k_SeqMode_RLE = 1,
1263 k_SeqMode_FSE = 2,
1264 k_SeqMode_Repeat = 3
1265 }
1266 z7_zstd_enum_SeqMode;
1267
1268 // predefAccuracy == 5 for OFFSET symbols
1269 // predefAccuracy == 6 for MATCH/LIT LEN symbols
1270 static
1271 SRes
1272 Z7_NO_INLINE
1273 // Z7_FORCE_INLINE
FSE_Decode_SeqTable(CFseRecord * const table,CInBufPair * const in,unsigned predefAccuracy,Byte * const accuracyRes,unsigned numSymbolsMax,const CFseRecord * const predefs,const unsigned seqMode)1274 FSE_Decode_SeqTable(CFseRecord * const table,
1275 CInBufPair * const in,
1276 unsigned predefAccuracy,
1277 Byte * const accuracyRes,
1278 unsigned numSymbolsMax,
1279 const CFseRecord * const predefs,
1280 const unsigned seqMode)
1281 {
1282 // UNUSED_VAR(numSymsPredef)
1283 // UNUSED_VAR(predefFreqs)
1284 if (seqMode == k_SeqMode_FSE)
1285 {
1286 // unsigned y = 50; CInBufPair in2 = *in; do { *in = in2; RINOK(
1287 return
1288 FSE_DecodeHeader(table, in,
1289 predefAccuracy + 3, // accuracyMax
1290 accuracyRes,
1291 numSymbolsMax)
1292 ;
1293 // )} while (--y); return SZ_OK;
1294 }
1295 // numSymsMax = numSymsPredef + ((predefAccuracy & 1) * (32 - 29))); // numSymsMax
1296 // numSymsMax == 32 for offsets
1297
1298 if (seqMode == k_SeqMode_Predef)
1299 {
1300 *accuracyRes = (Byte)predefAccuracy;
1301 memcpy(table, predefs, sizeof(UInt32) << predefAccuracy);
1302 return SZ_OK;
1303 }
1304
1305 // (seqMode == k_SeqMode_RLE)
1306 if (in->len == 0)
1307 return SZ_ERROR_DATA;
1308 in->len--;
1309 {
1310 const Byte *ptr = in->ptr;
1311 const unsigned sym = ptr[0];
1312 in->ptr = ptr + 1;
1313 if (sym >= numSymbolsMax)
1314 return SZ_ERROR_DATA;
1315 table[0] = (FastInt32)sym
1316 #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
1317 + (numSymbolsMax == NUM_ML_SYMBOLS ? MATCH_LEN_MIN : 0)
1318 #endif
1319 ;
1320 *accuracyRes = 0;
1321 }
1322 return SZ_OK;
1323 }
1324
1325
1326 typedef struct
1327 {
1328 CFseRecord of[1 << 8];
1329 CFseRecord ll[1 << 9];
1330 CFseRecord ml[1 << 9];
1331 }
1332 CZstdDecFseTables;
1333
1334
1335 typedef struct
1336 {
1337 Byte *win;
1338 SizeT cycSize;
1339 /*
1340 if (outBuf_fromCaller) : cycSize = outBufSize_fromCaller
1341 else {
1342 if ( isCyclicMode) : cycSize = cyclic_buffer_size = (winSize + extra_space)
1343 if (!isCyclicMode) : cycSize = ContentSize,
1344 (isCyclicMode == true) if (ContetSize >= winSize) or ContetSize is unknown
1345 }
1346 */
1347 SizeT winPos;
1348
1349 CZstdDecOffset reps[3];
1350
1351 Byte ll_accuracy;
1352 Byte of_accuracy;
1353 Byte ml_accuracy;
1354 // Byte seqTables_wereSet;
1355 Byte litHuf_wasSet;
1356
1357 Byte *literalsBase;
1358
1359 size_t winSize; // from header
1360 size_t totalOutCheck; // totalOutCheck <= winSize
1361
1362 #ifdef Z7_ZSTD_DEC_USE_BASES_IN_OBJECT
1363 SEQ_EXTRA_TABLES(m_)
1364 #endif
1365 // UInt64 _pad_Alignment; // is not required now
1366 CZstdDecFseTables fse;
1367 CZstdDecHufTable huf;
1368 }
1369 CZstdDec1;
1370
1371 #define ZstdDec1_GET_BLOCK_SIZE_LIMIT(p) \
1372 ((p)->winSize < kBlockSizeMax ? (UInt32)(p)->winSize : kBlockSizeMax)
1373
1374 #define SEQ_TABLES_WERE_NOT_SET_ml_accuracy 1 // accuracy=1 is not used by zstd
1375 #define IS_SEQ_TABLES_WERE_SET(p) (((p)->ml_accuracy != SEQ_TABLES_WERE_NOT_SET_ml_accuracy))
1376 // #define IS_SEQ_TABLES_WERE_SET(p) ((p)->seqTables_wereSet)
1377
1378
ZstdDec1_Construct(CZstdDec1 * p)1379 static void ZstdDec1_Construct(CZstdDec1 *p)
1380 {
1381 #ifdef Z7_ZSTD_DEC_PRINT_TABLE
1382 Print_Predef(6, NUM_LL_SYMBOLS, SEQ_LL_PREDEF_DIST, k_PredefRecords_LL);
1383 Print_Predef(5, NUM_OFFSET_SYMBOLS_PREDEF, SEQ_OFFSET_PREDEF_DIST, k_PredefRecords_OF);
1384 Print_Predef(6, NUM_ML_SYMBOLS, SEQ_ML_PREDEF_DIST, k_PredefRecords_ML);
1385 #endif
1386
1387 p->win = NULL;
1388 p->cycSize = 0;
1389 p->literalsBase = NULL;
1390 #ifdef Z7_ZSTD_DEC_USE_BASES_IN_OBJECT
1391 FILL_LOC_BASES_ALL
1392 #endif
1393 }
1394
1395
ZstdDec1_Init(CZstdDec1 * p)1396 static void ZstdDec1_Init(CZstdDec1 *p)
1397 {
1398 p->reps[0] = 1;
1399 p->reps[1] = 4;
1400 p->reps[2] = 8;
1401 // p->seqTables_wereSet = False;
1402 p->ml_accuracy = SEQ_TABLES_WERE_NOT_SET_ml_accuracy;
1403 p->litHuf_wasSet = False;
1404 p->totalOutCheck = 0;
1405 }
1406
1407
1408
1409 #ifdef MY_CPU_LE_UNALIGN
1410 #define Z7_ZSTD_DEC_USE_UNALIGNED_COPY
1411 #endif
1412
1413 #ifdef Z7_ZSTD_DEC_USE_UNALIGNED_COPY
1414
1415 #define COPY_CHUNK_SIZE 16
1416
1417 #define COPY_CHUNK_4_2(dest, src) \
1418 { \
1419 ((UInt32 *)(void *)dest)[0] = ((const UInt32 *)(const void *)src)[0]; \
1420 ((UInt32 *)(void *)dest)[1] = ((const UInt32 *)(const void *)src)[1]; \
1421 src += 4 * 2; \
1422 dest += 4 * 2; \
1423 }
1424
1425 /* sse2 doesn't help here in GCC and CLANG.
1426 so we disabled sse2 here */
1427 /*
1428 #if defined(MY_CPU_AMD64)
1429 #define Z7_ZSTD_DEC_USE_SSE2
1430 #elif defined(MY_CPU_X86)
1431 #if defined(_MSC_VER) && _MSC_VER >= 1300 && defined(_M_IX86_FP) && (_M_IX86_FP >= 2) \
1432 || defined(__SSE2__) \
1433 // || 1 == 1 // for debug only
1434 #define Z7_ZSTD_DEC_USE_SSE2
1435 #endif
1436 #endif
1437 */
1438
1439 #if defined(MY_CPU_ARM64)
1440 #define COPY_OFFSET_MIN 16
1441 #define COPY_CHUNK1(dest, src) \
1442 { \
1443 vst1q_u8((uint8_t *)(void *)dest, \
1444 vld1q_u8((const uint8_t *)(const void *)src)); \
1445 src += 16; \
1446 dest += 16; \
1447 }
1448
1449 #define COPY_CHUNK(dest, src) \
1450 { \
1451 COPY_CHUNK1(dest, src) \
1452 if ((len -= COPY_CHUNK_SIZE) == 0) break; \
1453 COPY_CHUNK1(dest, src) \
1454 }
1455
1456 #elif defined(Z7_ZSTD_DEC_USE_SSE2)
1457 #include <emmintrin.h> // sse2
1458 #define COPY_OFFSET_MIN 16
1459
1460 #define COPY_CHUNK1(dest, src) \
1461 { \
1462 _mm_storeu_si128((__m128i *)(void *)dest, \
1463 _mm_loadu_si128((const __m128i *)(const void *)src)); \
1464 src += 16; \
1465 dest += 16; \
1466 }
1467
1468 #define COPY_CHUNK(dest, src) \
1469 { \
1470 COPY_CHUNK1(dest, src) \
1471 if ((len -= COPY_CHUNK_SIZE) == 0) break; \
1472 COPY_CHUNK1(dest, src) \
1473 }
1474
1475 #elif defined(MY_CPU_64BIT)
1476 #define COPY_OFFSET_MIN 8
1477
1478 #define COPY_CHUNK(dest, src) \
1479 { \
1480 ((UInt64 *)(void *)dest)[0] = ((const UInt64 *)(const void *)src)[0]; \
1481 ((UInt64 *)(void *)dest)[1] = ((const UInt64 *)(const void *)src)[1]; \
1482 src += 8 * 2; \
1483 dest += 8 * 2; \
1484 }
1485
1486 #else
1487 #define COPY_OFFSET_MIN 4
1488
1489 #define COPY_CHUNK(dest, src) \
1490 { \
1491 COPY_CHUNK_4_2(dest, src); \
1492 COPY_CHUNK_4_2(dest, src); \
1493 }
1494
1495 #endif
1496 #endif
1497
1498
1499 #ifndef COPY_CHUNK_SIZE
1500 #define COPY_OFFSET_MIN 4
1501 #define COPY_CHUNK_SIZE 8
1502 #define COPY_CHUNK_2(dest, src) \
1503 { \
1504 const Byte a0 = src[0]; \
1505 const Byte a1 = src[1]; \
1506 dest[0] = a0; \
1507 dest[1] = a1; \
1508 src += 2; \
1509 dest += 2; \
1510 }
1511 #define COPY_CHUNK(dest, src) \
1512 { \
1513 COPY_CHUNK_2(dest, src) \
1514 COPY_CHUNK_2(dest, src) \
1515 COPY_CHUNK_2(dest, src) \
1516 COPY_CHUNK_2(dest, src) \
1517 }
1518 #endif
1519
1520
1521 #define COPY_PREPARE \
1522 len += (COPY_CHUNK_SIZE - 1); \
1523 len &= ~(size_t)(COPY_CHUNK_SIZE - 1); \
1524 { if (len > rem) \
1525 { len = rem; \
1526 rem &= (COPY_CHUNK_SIZE - 1); \
1527 if (rem) { \
1528 len -= rem; \
1529 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \
1530 do *dest++ = *src++; while (--rem); \
1531 if (len == 0) return; }}}
1532
1533 #define COPY_CHUNKS \
1534 { \
1535 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \
1536 do { COPY_CHUNK(dest, src) } \
1537 while (len -= COPY_CHUNK_SIZE); \
1538 }
1539
1540 // (len != 0)
1541 // (len <= rem)
1542 static
1543 Z7_FORCE_INLINE
1544 // Z7_ATTRIB_NO_VECTOR
CopyLiterals(Byte * dest,Byte const * src,size_t len,size_t rem)1545 void CopyLiterals(Byte *dest, Byte const *src, size_t len, size_t rem)
1546 {
1547 COPY_PREPARE
1548 COPY_CHUNKS
1549 }
1550
1551
1552 /* we can define Z7_STD_DEC_USE_AFTER_CYC_BUF, if we want to use additional
1553 space after cycSize that can be used to reduce the code in CopyMatch(): */
1554 // for debug:
1555 // #define Z7_STD_DEC_USE_AFTER_CYC_BUF
1556
1557 /*
1558 CopyMatch()
1559 if wrap (offset > winPos)
1560 {
1561 then we have at least (COPY_CHUNK_SIZE) avail in (dest) before we will overwrite (src):
1562 (cycSize >= offset + COPY_CHUNK_SIZE)
1563 if defined(Z7_STD_DEC_USE_AFTER_CYC_BUF)
1564 we are allowed to read win[cycSize + COPY_CHUNK_SIZE - 1],
1565 }
1566 (len != 0)
1567 */
1568 static
1569 Z7_FORCE_INLINE
1570 // Z7_ATTRIB_NO_VECTOR
CopyMatch(size_t offset,size_t len,Byte * win,size_t winPos,size_t rem,const size_t cycSize)1571 void CopyMatch(size_t offset, size_t len,
1572 Byte *win, size_t winPos, size_t rem, const size_t cycSize)
1573 {
1574 Byte *dest = win + winPos;
1575 const Byte *src;
1576 // STAT_INC(g_NumCopy)
1577
1578 if (offset > winPos)
1579 {
1580 size_t back = offset - winPos;
1581 // src = win + cycSize - back;
1582 // cycSize -= offset;
1583 STAT_INC(g_NumOver)
1584 src = dest + (cycSize - offset);
1585 // (src >= dest) here
1586 #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF
1587 if (back < len)
1588 {
1589 #else
1590 if (back < len + (COPY_CHUNK_SIZE - 1))
1591 {
1592 if (back >= len)
1593 {
1594 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
1595 do
1596 *dest++ = *src++;
1597 while (--len);
1598 return;
1599 }
1600 #endif
1601 // back < len
1602 STAT_INC(g_NumOver2)
1603 len -= back;
1604 rem -= back;
1605 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
1606 do
1607 *dest++ = *src++;
1608 while (--back);
1609 src = dest - offset;
1610 // src = win;
1611 // we go to MAIN-COPY
1612 }
1613 }
1614 else
1615 src = dest - offset;
1616
1617 // len != 0
1618 // do *dest++ = *src++; while (--len); return;
1619
1620 // --- MAIN COPY ---
1621 // if (src >= dest), then ((size_t)(src - dest) >= COPY_CHUNK_SIZE)
1622 // so we have at least COPY_CHUNK_SIZE space before overlap for writing.
1623 COPY_PREPARE
1624
1625 /* now (len == COPY_CHUNK_SIZE * x)
1626 so we can unroll for aligned copy */
1627 {
1628 // const unsigned b0 = src[0];
1629 // (COPY_OFFSET_MIN >= 4)
1630
1631 if (offset >= COPY_OFFSET_MIN)
1632 {
1633 COPY_CHUNKS
1634 // return;
1635 }
1636 else
1637 #if (COPY_OFFSET_MIN > 4)
1638 #if COPY_CHUNK_SIZE < 8
1639 #error Stop_Compiling_Bad_COPY_CHUNK_SIZE
1640 #endif
1641 if (offset >= 4)
1642 {
1643 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
1644 do
1645 {
1646 COPY_CHUNK_4_2(dest, src)
1647 #if COPY_CHUNK_SIZE != 16
1648 if (len == 8) break;
1649 #endif
1650 COPY_CHUNK_4_2(dest, src)
1651 }
1652 while (len -= 16);
1653 // return;
1654 }
1655 else
1656 #endif
1657 {
1658 // (offset < 4)
1659 const unsigned b0 = src[0];
1660 if (offset < 2)
1661 {
1662 #if defined(Z7_ZSTD_DEC_USE_UNALIGNED_COPY) && (COPY_CHUNK_SIZE == 16)
1663 #if defined(MY_CPU_64BIT)
1664 {
1665 const UInt64 v64 = (UInt64)b0 * 0x0101010101010101;
1666 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
1667 do
1668 {
1669 ((UInt64 *)(void *)dest)[0] = v64;
1670 ((UInt64 *)(void *)dest)[1] = v64;
1671 dest += 16;
1672 }
1673 while (len -= 16);
1674 }
1675 #else
1676 {
1677 UInt32 v = b0;
1678 v |= v << 8;
1679 v |= v << 16;
1680 do
1681 {
1682 ((UInt32 *)(void *)dest)[0] = v;
1683 ((UInt32 *)(void *)dest)[1] = v;
1684 dest += 8;
1685 ((UInt32 *)(void *)dest)[0] = v;
1686 ((UInt32 *)(void *)dest)[1] = v;
1687 dest += 8;
1688 }
1689 while (len -= 16);
1690 }
1691 #endif
1692 #else
1693 do
1694 {
1695 dest[0] = (Byte)b0;
1696 dest[1] = (Byte)b0;
1697 dest += 2;
1698 dest[0] = (Byte)b0;
1699 dest[1] = (Byte)b0;
1700 dest += 2;
1701 }
1702 while (len -= 4);
1703 #endif
1704 }
1705 else if (offset == 2)
1706 {
1707 const Byte b1 = src[1];
1708 {
1709 do
1710 {
1711 dest[0] = (Byte)b0;
1712 dest[1] = b1;
1713 dest += 2;
1714 }
1715 while (len -= 2);
1716 }
1717 }
1718 else // (offset == 3)
1719 {
1720 const Byte *lim = dest + len - 2;
1721 const Byte b1 = src[1];
1722 const Byte b2 = src[2];
1723 do
1724 {
1725 dest[0] = (Byte)b0;
1726 dest[1] = b1;
1727 dest[2] = b2;
1728 dest += 3;
1729 }
1730 while (dest < lim);
1731 lim++; // points to last byte that must be written
1732 if (dest <= lim)
1733 {
1734 *dest = (Byte)b0;
1735 if (dest != lim)
1736 dest[1] = b1;
1737 }
1738 }
1739 }
1740 }
1741 }
1742
1743
1744
1745 #define UPDATE_TOTAL_OUT(p, size) \
1746 { \
1747 size_t _toc = (p)->totalOutCheck + (size); \
1748 const size_t _ws = (p)->winSize; \
1749 if (_toc >= _ws) _toc = _ws; \
1750 (p)->totalOutCheck = _toc; \
1751 }
1752
1753
1754 #if defined(MY_CPU_64BIT) && defined(MY_CPU_LE_UNALIGN)
1755 // we can disable it for debug:
1756 #define Z7_ZSTD_DEC_USE_64BIT_LOADS
1757 #endif
1758 // #define Z7_ZSTD_DEC_USE_64BIT_LOADS // for debug : slow in 32-bit
1759
1760 // SEQ_SRC_OFFSET: how many bytes (src) (seqSrc) was moved back from original value.
1761 // we need (SEQ_SRC_OFFSET != 0) for optimized memory access
1762 #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
1763 #define SEQ_SRC_OFFSET 7
1764 #else
1765 #define SEQ_SRC_OFFSET 3
1766 #endif
1767 #define SRC_PLUS_FOR_4BYTES(bitOffset) (SEQ_SRC_OFFSET - 3) + ((CBitCtr_signed)(bitOffset) >> 3)
1768 #define BIT_OFFSET_7BITS(bitOffset) ((unsigned)(bitOffset) & 7)
1769 /*
1770 if (BIT_OFFSET_DELTA_BITS == 0) : bitOffset == number_of_unprocessed_bits
1771 if (BIT_OFFSET_DELTA_BITS == 1) : bitOffset == number_of_unprocessed_bits - 1
1772 and we can read 1 bit more in that mode : (8 * n + 1).
1773 */
1774 // #define BIT_OFFSET_DELTA_BITS 0
1775 #define BIT_OFFSET_DELTA_BITS 1
1776 #if BIT_OFFSET_DELTA_BITS == 1
1777 #define GET_SHIFT_FROM_BOFFS7(boff7) (7 ^ (boff7))
1778 #else
1779 #define GET_SHIFT_FROM_BOFFS7(boff7) (8 - BIT_OFFSET_DELTA_BITS - (boff7))
1780 #endif
1781
1782 #define UPDATE_BIT_OFFSET(bitOffset, numBits) \
1783 (bitOffset) -= (CBitCtr)(numBits);
1784
1785 #define GET_SHIFT(bitOffset) GET_SHIFT_FROM_BOFFS7(BIT_OFFSET_7BITS(bitOffset))
1786
1787
1788 #if defined(Z7_ZSTD_DEC_USE_64BIT_LOADS)
1789 #if (NUM_OFFSET_SYMBOLS_MAX - BIT_OFFSET_DELTA_BITS < 32)
1790 /* if (NUM_OFFSET_SYMBOLS_MAX == 32 && BIT_OFFSET_DELTA_BITS == 1),
1791 we have depth 31 + 9 + 9 + 8 = 57 bits that can b read with single read. */
1792 #define Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF
1793 #endif
1794 #ifndef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF
1795 #if (BIT_OFFSET_DELTA_BITS == 1)
1796 /* if (winLimit - winPos <= (kBlockSizeMax = (1 << 17)))
1797 {
1798 the case (16 bits literal extra + 16 match extra) is not possible
1799 in correct stream. So error will be detected for (16 + 16) case.
1800 And longest correct sequence after offset reading is (31 + 9 + 9 + 8 = 57 bits).
1801 So we can use just one 64-bit load here in that case.
1802 }
1803 */
1804 #define Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML
1805 #endif
1806 #endif
1807 #endif
1808
1809
1810 #if !defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) || \
1811 (!defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) && \
1812 !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML))
1813 // in : (0 < bits <= (24 or 25)):
1814 #define STREAM_READ_BITS(dest, bits) \
1815 { \
1816 GET32(dest, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \
1817 dest <<= GET_SHIFT(bitOffset); \
1818 UPDATE_BIT_OFFSET(bitOffset, bits) \
1819 dest >>= 32 - bits; \
1820 }
1821 #endif
1822
1823
1824 #define FSE_Peek_1(table, state) table[state]
1825
1826 #define STATE_VAR(name) state_ ## name
1827
1828 // in : (0 <= accuracy <= (24 or 25))
1829 #define FSE_INIT_STATE(name, cond) \
1830 { \
1831 UInt32 r; \
1832 const unsigned bits = p->name ## _accuracy; \
1833 GET32(r, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \
1834 r <<= GET_SHIFT(bitOffset); \
1835 r >>= 1; \
1836 r >>= 31 ^ bits; \
1837 UPDATE_BIT_OFFSET(bitOffset, bits) \
1838 cond \
1839 STATE_VAR(name) = FSE_Peek_1(FSE_TABLE(name), r); \
1840 /* STATE_VAR(name) = dest << 16; */ \
1841 }
1842
1843
1844 #define FSE_Peek_Plus(name, r) \
1845 STATE_VAR(name) = FSE_Peek_1(FSE_TABLE(name), \
1846 GET_FSE_REC_STATE(STATE_VAR(name)) + r);
1847
1848 #define LZ_LOOP_ERROR_EXIT { return SZ_ERROR_DATA; }
1849
1850 #define BO_OVERFLOW_CHECK \
1851 { if ((CBitCtr_signed)bitOffset < 0) LZ_LOOP_ERROR_EXIT }
1852
1853
1854 #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
1855
1856 #define GET64(dest, p) { const Byte *ptr = p; dest = GetUi64(ptr); }
1857
1858 #define FSE_PRELOAD \
1859 { \
1860 GET64(v, src - 4 + SRC_PLUS_FOR_4BYTES(bitOffset)) \
1861 v <<= GET_SHIFT(bitOffset); \
1862 }
1863
1864 #define FSE_UPDATE_STATE_2(name, cond) \
1865 { \
1866 const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \
1867 UInt64 r = v; \
1868 v <<= bits; \
1869 r >>= 1; \
1870 UPDATE_BIT_OFFSET(bitOffset, bits) \
1871 cond \
1872 r >>= 63 ^ bits; \
1873 FSE_Peek_Plus(name, r); \
1874 }
1875
1876 #define FSE_UPDATE_STATES \
1877 FSE_UPDATE_STATE_2 (ll, {} ) \
1878 FSE_UPDATE_STATE_2 (ml, {} ) \
1879 FSE_UPDATE_STATE_2 (of, BO_OVERFLOW_CHECK) \
1880
1881 #else // Z7_ZSTD_DEC_USE_64BIT_LOADS
1882
1883 // it supports 8 bits accuracy for any code
1884 // it supports 9 bits accuracy, if (BIT_OFFSET_DELTA_BITS == 1)
1885 #define FSE_UPDATE_STATE_0(name, cond) \
1886 { \
1887 UInt32 r; \
1888 const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \
1889 GET16(r, src + 2 + SRC_PLUS_FOR_4BYTES(bitOffset)) \
1890 r >>= (bitOffset & 7); \
1891 r &= (1 << (8 + BIT_OFFSET_DELTA_BITS)) - 1; \
1892 UPDATE_BIT_OFFSET(bitOffset, bits) \
1893 cond \
1894 r >>= (8 + BIT_OFFSET_DELTA_BITS) - bits; \
1895 FSE_Peek_Plus(name, r); \
1896 }
1897
1898 // for debug (slow):
1899 // #define Z7_ZSTD_DEC_USE_FSE_FUSION_FORCE
1900 #if BIT_OFFSET_DELTA_BITS == 0 || defined(Z7_ZSTD_DEC_USE_FSE_FUSION_FORCE)
1901 #define Z7_ZSTD_DEC_USE_FSE_FUSION
1902 #endif
1903
1904 #ifdef Z7_ZSTD_DEC_USE_FSE_FUSION
1905 #define FSE_UPDATE_STATE_1(name) \
1906 { UInt32 rest2; \
1907 { \
1908 UInt32 r; \
1909 unsigned bits; \
1910 GET32(r, src + SRC_PLUS_FOR_4BYTES(bitOffset)) \
1911 bits = GET_FSE_REC_LEN(STATE_VAR(name)); \
1912 r <<= GET_SHIFT(bitOffset); \
1913 rest2 = r << bits; \
1914 r >>= 1; \
1915 UPDATE_BIT_OFFSET(bitOffset, bits) \
1916 r >>= 31 ^ bits; \
1917 FSE_Peek_Plus(name, r); \
1918 }
1919
1920 #define FSE_UPDATE_STATE_3(name) \
1921 { \
1922 const unsigned bits = GET_FSE_REC_LEN(STATE_VAR(name)); \
1923 rest2 >>= 1; \
1924 UPDATE_BIT_OFFSET(bitOffset, bits) \
1925 rest2 >>= 31 ^ bits; \
1926 FSE_Peek_Plus(name, rest2); \
1927 }}
1928
1929 #define FSE_UPDATE_STATES \
1930 FSE_UPDATE_STATE_1 (ll) \
1931 FSE_UPDATE_STATE_3 (ml) \
1932 FSE_UPDATE_STATE_0 (of, BO_OVERFLOW_CHECK) \
1933
1934 #else // Z7_ZSTD_DEC_USE_64BIT_LOADS
1935
1936 #define FSE_UPDATE_STATES \
1937 FSE_UPDATE_STATE_0 (ll, {} ) \
1938 FSE_UPDATE_STATE_0 (ml, {} ) \
1939 FSE_UPDATE_STATE_0 (of, BO_OVERFLOW_CHECK) \
1940
1941 #endif // Z7_ZSTD_DEC_USE_FSE_FUSION
1942 #endif // Z7_ZSTD_DEC_USE_64BIT_LOADS
1943
1944
1945
1946 typedef struct
1947 {
1948 UInt32 numSeqs;
1949 UInt32 literalsLen;
1950 const Byte *literals;
1951 }
1952 CZstdDec1_Vars;
1953
1954
1955 // if (BIT_OFFSET_DELTA_BITS != 0), we need (BIT_OFFSET_DELTA_BYTES > 0)
1956 #define BIT_OFFSET_DELTA_BYTES BIT_OFFSET_DELTA_BITS
1957
1958 /* if (NUM_OFFSET_SYMBOLS_MAX == 32)
1959 max_seq_bit_length = (31) + 16 + 16 + 9 + 8 + 9 = 89 bits
1960 if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) we have longest backward
1961 lookahead offset, and we read UInt64 after literal_len reading.
1962 if (BIT_OFFSET_DELTA_BITS == 1 && NUM_OFFSET_SYMBOLS_MAX == 32)
1963 MAX_BACKWARD_DEPTH = 16 bytes
1964 */
1965 #define MAX_BACKWARD_DEPTH \
1966 ((NUM_OFFSET_SYMBOLS_MAX - 1 + 16 + 16 + 7) / 8 + 7 + BIT_OFFSET_DELTA_BYTES)
1967
1968 /* srcLen != 0
1969 src == real_data_ptr - SEQ_SRC_OFFSET - BIT_OFFSET_DELTA_BYTES
1970 if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML) then
1971 (winLimit - p->winPos <= (1 << 17)) is required
1972 */
1973 static
1974 Z7_NO_INLINE
1975 // Z7_ATTRIB_NO_VECTOR
1976 SRes Decompress_Sequences(CZstdDec1 * const p,
1977 const Byte *src, const size_t srcLen,
1978 const size_t winLimit,
1979 const CZstdDec1_Vars * const vars)
1980 {
1981 #ifdef Z7_ZSTD_DEC_USE_BASES_LOCAL
1982 SEQ_EXTRA_TABLES(a_)
1983 #endif
1984
1985 // for debug:
1986 // #define Z7_ZSTD_DEC_USE_LOCAL_FSE_TABLES
1987 #ifdef Z7_ZSTD_DEC_USE_LOCAL_FSE_TABLES
1988 #define FSE_TABLE(n) fse. n
1989 const CZstdDecFseTables fse = p->fse;
1990 /*
1991 CZstdDecFseTables fse;
1992 #define COPY_FSE_TABLE(n) \
1993 memcpy(fse. n, p->fse. n, (size_t)4 << p-> n ## _accuracy);
1994 COPY_FSE_TABLE(of)
1995 COPY_FSE_TABLE(ll)
1996 COPY_FSE_TABLE(ml)
1997 */
1998 #else
1999 #define FSE_TABLE(n) (p->fse. n)
2000 #endif
2001
2002 #ifdef Z7_ZSTD_DEC_USE_BASES_LOCAL
2003 FILL_LOC_BASES_ALL
2004 #endif
2005
2006 {
2007 unsigned numSeqs = vars->numSeqs;
2008 const Byte *literals = vars->literals;
2009 ptrdiff_t literalsLen = (ptrdiff_t)vars->literalsLen;
2010 Byte * const win = p->win;
2011 size_t winPos = p->winPos;
2012 const size_t cycSize = p->cycSize;
2013 size_t totalOutCheck = p->totalOutCheck;
2014 const size_t winSize = p->winSize;
2015 size_t reps_0 = p->reps[0];
2016 size_t reps_1 = p->reps[1];
2017 size_t reps_2 = p->reps[2];
2018 UInt32 STATE_VAR(ll), STATE_VAR(of), STATE_VAR(ml);
2019 CBitCtr bitOffset;
2020
2021 SET_bitOffset_TO_PAD (bitOffset, src + SEQ_SRC_OFFSET, srcLen + BIT_OFFSET_DELTA_BYTES)
2022
2023 bitOffset -= BIT_OFFSET_DELTA_BITS;
2024
2025 FSE_INIT_STATE(ll, {} )
2026 FSE_INIT_STATE(of, {} )
2027 FSE_INIT_STATE(ml, BO_OVERFLOW_CHECK)
2028
2029 for (;;)
2030 {
2031 size_t matchLen;
2032 #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
2033 UInt64 v;
2034 #endif
2035
2036 #ifdef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF
2037 FSE_PRELOAD
2038 #endif
2039
2040 // if (of_code == 0)
2041 if ((Byte)STATE_VAR(of) == 0)
2042 {
2043 if (GET_FSE_REC_SYM(STATE_VAR(ll)) == 0)
2044 {
2045 const size_t offset = reps_1;
2046 reps_1 = reps_0;
2047 reps_0 = offset;
2048 STAT_INC(g_Num_Rep1)
2049 }
2050 STAT_UPDATE(else g_Num_Rep0++;)
2051 }
2052 else
2053 {
2054 const unsigned of_code = (Byte)STATE_VAR(of);
2055
2056 #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
2057 #if !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF)
2058 FSE_PRELOAD
2059 #endif
2060 #else
2061 UInt32 v;
2062 {
2063 const Byte *src4 = src + SRC_PLUS_FOR_4BYTES(bitOffset);
2064 const unsigned skip = GET_SHIFT(bitOffset);
2065 GET32(v, src4)
2066 v <<= skip;
2067 v |= (UInt32)src4[-1] >> (8 - skip);
2068 }
2069 #endif
2070
2071 UPDATE_BIT_OFFSET(bitOffset, of_code)
2072
2073 if (of_code == 1)
2074 {
2075 // read 1 bit
2076 #if defined(Z7_MSC_VER_ORIGINAL) || defined(MY_CPU_X86_OR_AMD64)
2077 #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
2078 #define CHECK_HIGH_BIT_64(a) ((Int64)(UInt64)(a) < 0)
2079 #else
2080 #define CHECK_HIGH_BIT_32(a) ((Int32)(UInt32)(a) < 0)
2081 #endif
2082 #else
2083 #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
2084 #define CHECK_HIGH_BIT_64(a) ((UInt64)(a) & ((UInt64)1 << 63))
2085 #else
2086 #define CHECK_HIGH_BIT_32(a) ((UInt32)(a) & ((UInt32)1 << 31))
2087 #endif
2088 #endif
2089
2090 if
2091 #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
2092 CHECK_HIGH_BIT_64 (((UInt64)GET_FSE_REC_SYM(STATE_VAR(ll)) - 1) ^ v)
2093 #else
2094 CHECK_HIGH_BIT_32 (((UInt32)GET_FSE_REC_SYM(STATE_VAR(ll)) - 1) ^ v)
2095 #endif
2096 {
2097 v <<= 1;
2098 {
2099 const size_t offset = reps_2;
2100 reps_2 = reps_1;
2101 reps_1 = reps_0;
2102 reps_0 = offset;
2103 STAT_INC(g_Num_Rep2)
2104 }
2105 }
2106 else
2107 {
2108 if (GET_FSE_REC_SYM(STATE_VAR(ll)) == 0)
2109 {
2110 // litLen == 0 && bit == 1
2111 STAT_INC(g_Num_Rep3)
2112 v <<= 1;
2113 reps_2 = reps_1;
2114 reps_1 = reps_0;
2115 if (--reps_0 == 0)
2116 {
2117 // LZ_LOOP_ERROR_EXIT
2118 // original-zstd decoder : input is corrupted; force offset to 1
2119 // reps_0 = 1;
2120 reps_0++;
2121 }
2122 }
2123 else
2124 {
2125 // litLen != 0 && bit == 0
2126 v <<= 1;
2127 {
2128 const size_t offset = reps_1;
2129 reps_1 = reps_0;
2130 reps_0 = offset;
2131 STAT_INC(g_Num_Rep1)
2132 }
2133 }
2134 }
2135 }
2136 else
2137 {
2138 // (2 <= of_code)
2139 // if (of_code >= 32) LZ_LOOP_ERROR_EXIT // optional check
2140 // we don't allow (of_code >= 32) cases in another code
2141 reps_2 = reps_1;
2142 reps_1 = reps_0;
2143 reps_0 = ((size_t)1 << of_code) - 3 + (size_t)
2144 #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
2145 (v >> (64 - of_code));
2146 v <<= of_code;
2147 #else
2148 (v >> (32 - of_code));
2149 #endif
2150 }
2151 }
2152
2153 #ifdef Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML
2154 FSE_PRELOAD
2155 #endif
2156
2157 matchLen = (size_t)GET_FSE_REC_SYM(STATE_VAR(ml))
2158 #ifndef Z7_ZSTD_DEC_USE_ML_PLUS3
2159 + MATCH_LEN_MIN
2160 #endif
2161 ;
2162 {
2163 {
2164 if (matchLen >= 32 + MATCH_LEN_MIN) // if (state_ml & 0x20)
2165 {
2166 const unsigned extra = BASES_TABLE(SEQ_ML_EXTRA) [(size_t)matchLen - MATCH_LEN_MIN];
2167 matchLen = BASES_TABLE(SEQ_ML_BASES) [(size_t)matchLen - MATCH_LEN_MIN];
2168 #if defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) && \
2169 (defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML) || \
2170 defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF))
2171 {
2172 UPDATE_BIT_OFFSET(bitOffset, extra)
2173 matchLen += (size_t)(v >> (64 - extra));
2174 #if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF)
2175 FSE_PRELOAD
2176 #else
2177 v <<= extra;
2178 #endif
2179 }
2180 #else
2181 {
2182 UInt32 v32;
2183 STREAM_READ_BITS(v32, extra)
2184 matchLen += v32;
2185 }
2186 #endif
2187 STAT_INC(g_Num_Match)
2188 }
2189 }
2190 }
2191
2192 #if defined(Z7_ZSTD_DEC_USE_64BIT_LOADS) && \
2193 !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF) && \
2194 !defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_ML)
2195 FSE_PRELOAD
2196 #endif
2197
2198 {
2199 size_t litLen = GET_FSE_REC_SYM(STATE_VAR(ll));
2200 if (litLen)
2201 {
2202 // if (STATE_VAR(ll) & 0x70)
2203 if (litLen >= 16)
2204 {
2205 const unsigned extra = BASES_TABLE(SEQ_LL_EXTRA) [litLen];
2206 litLen = BASES_TABLE(SEQ_LL_BASES) [litLen];
2207 #ifdef Z7_ZSTD_DEC_USE_64BIT_LOADS
2208 {
2209 UPDATE_BIT_OFFSET(bitOffset, extra)
2210 litLen += (size_t)(v >> (64 - extra));
2211 #if defined(Z7_ZSTD_DEC_USE_64BIT_PRELOAD_OF)
2212 FSE_PRELOAD
2213 #else
2214 v <<= extra;
2215 #endif
2216 }
2217 #else
2218 {
2219 UInt32 v32;
2220 STREAM_READ_BITS(v32, extra)
2221 litLen += v32;
2222 }
2223 #endif
2224 STAT_INC(g_Num_LitsBig)
2225 }
2226
2227 if ((literalsLen -= (ptrdiff_t)litLen) < 0)
2228 LZ_LOOP_ERROR_EXIT
2229 totalOutCheck += litLen;
2230 {
2231 const size_t rem = winLimit - winPos;
2232 if (litLen > rem)
2233 LZ_LOOP_ERROR_EXIT
2234 {
2235 const Byte *literals_temp = literals;
2236 Byte *d = win + winPos;
2237 literals += litLen;
2238 winPos += litLen;
2239 CopyLiterals(d, literals_temp, litLen, rem);
2240 }
2241 }
2242 }
2243 STAT_UPDATE(else g_Num_Lit0++;)
2244 }
2245
2246 #define COPY_MATCH \
2247 { if (reps_0 > winSize || reps_0 > totalOutCheck) LZ_LOOP_ERROR_EXIT \
2248 totalOutCheck += matchLen; \
2249 { const size_t rem = winLimit - winPos; \
2250 if (matchLen > rem) LZ_LOOP_ERROR_EXIT \
2251 { const size_t winPos_temp = winPos; \
2252 winPos += matchLen; \
2253 CopyMatch(reps_0, matchLen, win, winPos_temp, rem, cycSize); }}}
2254
2255 if (--numSeqs == 0)
2256 {
2257 COPY_MATCH
2258 break;
2259 }
2260 FSE_UPDATE_STATES
2261 COPY_MATCH
2262 } // for
2263
2264 if ((CBitCtr_signed)bitOffset != BIT_OFFSET_DELTA_BYTES * 8 - BIT_OFFSET_DELTA_BITS)
2265 return SZ_ERROR_DATA;
2266
2267 if (literalsLen)
2268 {
2269 const size_t rem = winLimit - winPos;
2270 if ((size_t)literalsLen > rem)
2271 return SZ_ERROR_DATA;
2272 {
2273 Byte *d = win + winPos;
2274 winPos += (size_t)literalsLen;
2275 totalOutCheck += (size_t)literalsLen;
2276 CopyLiterals
2277 // memcpy
2278 (d, literals, (size_t)literalsLen, rem);
2279 }
2280 }
2281 if (totalOutCheck >= winSize)
2282 totalOutCheck = winSize;
2283 p->totalOutCheck = totalOutCheck;
2284 p->winPos = winPos;
2285 p->reps[0] = (CZstdDecOffset)reps_0;
2286 p->reps[1] = (CZstdDecOffset)reps_1;
2287 p->reps[2] = (CZstdDecOffset)reps_2;
2288 }
2289 return SZ_OK;
2290 }
2291
2292
2293 // for debug: define to check that ZstdDec1_NeedTempBufferForInput() works correctly:
2294 // #define Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP // define it for debug only
2295 #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
2296 static unsigned g_numSeqs;
2297 #endif
2298
2299
2300 #define k_LitBlockType_Flag_RLE_or_Treeless 1
2301 #define k_LitBlockType_Flag_Compressed 2
2302
2303 // outLimit : is strong limit
2304 // outLimit <= ZstdDec1_GET_BLOCK_SIZE_LIMIT(p)
2305 // inSize != 0
2306 static
2307 Z7_NO_INLINE
2308 SRes ZstdDec1_DecodeBlock(CZstdDec1 *p,
2309 const Byte *src, SizeT inSize, SizeT afterAvail,
2310 const size_t outLimit)
2311 {
2312 CZstdDec1_Vars vars;
2313 vars.literals = p->literalsBase;
2314 {
2315 const unsigned b0 = *src++;
2316 UInt32 numLits, compressedSize;
2317 const Byte *litStream;
2318 Byte *literalsDest;
2319 inSize--;
2320
2321 if ((b0 & k_LitBlockType_Flag_Compressed) == 0)
2322 {
2323 // we need at least one additional byte for (numSeqs).
2324 // so we check for that additional byte in conditions.
2325 numLits = b0 >> 3;
2326 if (b0 & 4)
2327 {
2328 UInt32 v;
2329 if (inSize < 1 + 1) // we need at least 1 byte here and 1 byte for (numSeqs).
2330 return SZ_ERROR_DATA;
2331 numLits >>= 1;
2332 v = GetUi16(src);
2333 src += 2;
2334 inSize -= 2;
2335 if ((b0 & 8) == 0)
2336 {
2337 src--;
2338 inSize++;
2339 v = (Byte)v;
2340 }
2341 numLits += v << 4;
2342 }
2343 compressedSize = 1;
2344 if ((b0 & k_LitBlockType_Flag_RLE_or_Treeless) == 0)
2345 compressedSize = numLits;
2346 }
2347 else if (inSize < 4)
2348 return SZ_ERROR_DATA;
2349 else
2350 {
2351 const unsigned mode4Streams = b0 & 0xc;
2352 const unsigned numBytes = (3 * mode4Streams + 32) >> 4;
2353 const unsigned numBits = 4 * numBytes - 2;
2354 const UInt32 mask = ((UInt32)16 << numBits) - 1;
2355 compressedSize = GetUi32(src);
2356 numLits = ((
2357 #ifdef MY_CPU_LE_UNALIGN
2358 GetUi32(src - 1)
2359 #else
2360 ((compressedSize << 8) + b0)
2361 #endif
2362 ) >> 4) & mask;
2363 src += numBytes;
2364 inSize -= numBytes;
2365 compressedSize >>= numBits;
2366 compressedSize &= mask;
2367 /*
2368 if (numLits != 0) printf("inSize = %7u num_lits=%7u compressed=%7u ratio = %u ratio2 = %u\n",
2369 i1, numLits, (unsigned)compressedSize * 1, (unsigned)compressedSize * 100 / numLits,
2370 (unsigned)numLits * 100 / (unsigned)inSize);
2371 }
2372 */
2373 if (compressedSize == 0)
2374 return SZ_ERROR_DATA; // (compressedSize == 0) is not allowed
2375 }
2376
2377 STAT_UPDATE(g_Num_Lits += numLits;)
2378
2379 vars.literalsLen = numLits;
2380
2381 if (compressedSize >= inSize)
2382 return SZ_ERROR_DATA;
2383 litStream = src;
2384 src += compressedSize;
2385 inSize -= compressedSize;
2386 // inSize != 0
2387 {
2388 UInt32 numSeqs = *src++;
2389 inSize--;
2390 if (numSeqs > 127)
2391 {
2392 UInt32 b1;
2393 if (inSize == 0)
2394 return SZ_ERROR_DATA;
2395 numSeqs -= 128;
2396 b1 = *src++;
2397 inSize--;
2398 if (numSeqs == 127)
2399 {
2400 if (inSize == 0)
2401 return SZ_ERROR_DATA;
2402 numSeqs = (UInt32)(*src++) + 127;
2403 inSize--;
2404 }
2405 numSeqs = (numSeqs << 8) + b1;
2406 }
2407 if (numSeqs * MATCH_LEN_MIN + numLits > outLimit)
2408 return SZ_ERROR_DATA;
2409 vars.numSeqs = numSeqs;
2410
2411 STAT_UPDATE(g_NumSeqs_total += numSeqs;)
2412 /*
2413 #ifdef SHOW_STAT
2414 printf("\n %5u : %8u, %8u : %5u", (int)g_Num_Blocks_Compressed, (int)numSeqs, (int)g_NumSeqs_total,
2415 (int)g_NumSeqs_total / g_Num_Blocks_Compressed);
2416 #endif
2417 // printf("\nnumSeqs2 = %d", numSeqs);
2418 */
2419 #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
2420 if (numSeqs != g_numSeqs) return SZ_ERROR_DATA; // for debug
2421 #endif
2422 if (numSeqs == 0)
2423 {
2424 if (inSize != 0)
2425 return SZ_ERROR_DATA;
2426 literalsDest = p->win + p->winPos;
2427 }
2428 else
2429 literalsDest = p->literalsBase;
2430 }
2431
2432 if ((b0 & k_LitBlockType_Flag_Compressed) == 0)
2433 {
2434 if (b0 & k_LitBlockType_Flag_RLE_or_Treeless)
2435 {
2436 memset(literalsDest, litStream[0], numLits);
2437 if (vars.numSeqs)
2438 {
2439 // literalsDest == p->literalsBase == vars.literals
2440 #if COPY_CHUNK_SIZE > 1
2441 memset(p->literalsBase + numLits, 0, COPY_CHUNK_SIZE);
2442 #endif
2443 }
2444 }
2445 else
2446 {
2447 // unsigned y;
2448 // for (y = 0; y < 10000; y++)
2449 memcpy(literalsDest, litStream, numLits);
2450 if (vars.numSeqs)
2451 {
2452 /* we need up to (15 == COPY_CHUNK_SIZE - 1) space for optimized CopyLiterals().
2453 If we have additional space in input stream after literals stream,
2454 we use direct copy of rar literals in input stream */
2455 if ((size_t)(src + inSize - litStream) - numLits + afterAvail >= (COPY_CHUNK_SIZE - 1))
2456 vars.literals = litStream;
2457 else
2458 {
2459 // literalsDest == p->literalsBase == vars.literals
2460 #if COPY_CHUNK_SIZE > 1
2461 /* CopyLiterals():
2462 1) we don't want reading non-initialized data
2463 2) we will copy only zero byte after literals buffer */
2464 memset(p->literalsBase + numLits, 0, COPY_CHUNK_SIZE);
2465 #endif
2466 }
2467 }
2468 }
2469 }
2470 else
2471 {
2472 CInBufPair hufStream;
2473 hufStream.ptr = litStream;
2474 hufStream.len = compressedSize;
2475
2476 if ((b0 & k_LitBlockType_Flag_RLE_or_Treeless) == 0)
2477 {
2478 // unsigned y = 100; CInBufPair hs2 = hufStream; do { hufStream = hs2;
2479 RINOK(Huf_DecodeTable(&p->huf, &hufStream))
2480 p->litHuf_wasSet = True;
2481 // } while (--y);
2482 }
2483 else if (!p->litHuf_wasSet)
2484 return SZ_ERROR_DATA;
2485
2486 {
2487 // int yyy; for (yyy = 0; yyy < 34; yyy++) {
2488 SRes sres;
2489 if ((b0 & 0xc) == 0) // mode4Streams
2490 sres = Huf_Decompress_1stream((const Byte *)(const void *)p->huf.table64,
2491 hufStream.ptr - HUF_SRC_OFFSET, hufStream.len, literalsDest, numLits);
2492 else
2493 {
2494 // 6 bytes for the jump table + 4x1 bytes of end-padding Bytes)
2495 if (hufStream.len < 6 + 4)
2496 return SZ_ERROR_DATA;
2497 // the condition from original-zstd decoder:
2498 #define Z7_ZSTD_MIN_LITERALS_FOR_4_STREAMS 6
2499 if (numLits < Z7_ZSTD_MIN_LITERALS_FOR_4_STREAMS)
2500 return SZ_ERROR_DATA;
2501 sres = Huf_Decompress_4stream((const Byte *)(const void *)p->huf.table64,
2502 hufStream.ptr + (6 - HUF_SRC_OFFSET), hufStream.len, literalsDest, numLits);
2503 }
2504 RINOK(sres)
2505 // }
2506 }
2507 }
2508
2509 if (vars.numSeqs == 0)
2510 {
2511 p->winPos += numLits;
2512 UPDATE_TOTAL_OUT(p, numLits)
2513 return SZ_OK;
2514 }
2515 }
2516 {
2517 CInBufPair in;
2518 unsigned mode;
2519 unsigned seqMode;
2520
2521 in.ptr = src;
2522 in.len = inSize;
2523 if (in.len == 0)
2524 return SZ_ERROR_DATA;
2525 in.len--;
2526 mode = *in.ptr++;
2527 if (mode & 3) // Reserved bits
2528 return SZ_ERROR_DATA;
2529
2530 seqMode = (mode >> 6);
2531 if (seqMode == k_SeqMode_Repeat)
2532 { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; }
2533 else RINOK(FSE_Decode_SeqTable(
2534 p->fse.ll,
2535 &in,
2536 6, // predefAccuracy
2537 &p->ll_accuracy,
2538 NUM_LL_SYMBOLS,
2539 k_PredefRecords_LL,
2540 seqMode))
2541
2542 seqMode = (mode >> 4) & 3;
2543 if (seqMode == k_SeqMode_Repeat)
2544 { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; }
2545 else RINOK(FSE_Decode_SeqTable(
2546 p->fse.of,
2547 &in,
2548 5, // predefAccuracy
2549 &p->of_accuracy,
2550 NUM_OFFSET_SYMBOLS_MAX,
2551 k_PredefRecords_OF,
2552 seqMode))
2553
2554 seqMode = (mode >> 2) & 3;
2555 if (seqMode == k_SeqMode_Repeat)
2556 { if (!IS_SEQ_TABLES_WERE_SET(p)) return SZ_ERROR_DATA; }
2557 else
2558 {
2559 RINOK(FSE_Decode_SeqTable(
2560 p->fse.ml,
2561 &in,
2562 6, // predefAccuracy
2563 &p->ml_accuracy,
2564 NUM_ML_SYMBOLS,
2565 k_PredefRecords_ML,
2566 seqMode))
2567 /*
2568 #if defined(Z7_ZSTD_DEC_USE_ML_PLUS3)
2569 // { unsigned y = 1 << 10; do
2570 {
2571 const unsigned accuracy = p->ml_accuracy;
2572 if (accuracy == 0)
2573 p->fse.ml[0] += 3;
2574 else
2575 #ifdef MY_CPU_64BIT
2576 {
2577 // alignemt (UInt64 _pad_Alignment) in fse.ml is required for that code
2578 UInt64 *table = (UInt64 *)(void *)p->fse.ml;
2579 const UInt64 *end = (const UInt64 *)(const void *)
2580 ((const Byte *)(const void *)table + ((size_t)sizeof(CFseRecord) << accuracy));
2581 do
2582 {
2583 table[0] += ((UInt64)MATCH_LEN_MIN << 32) + MATCH_LEN_MIN;
2584 table[1] += ((UInt64)MATCH_LEN_MIN << 32) + MATCH_LEN_MIN;
2585 table += 2;
2586 }
2587 while (table != end);
2588 }
2589 #else
2590 {
2591 UInt32 *table = p->fse.ml;
2592 const UInt32 *end = (const UInt32 *)(const void *)
2593 ((const Byte *)(const void *)table + ((size_t)sizeof(CFseRecord) << accuracy));
2594 do
2595 {
2596 table[0] += MATCH_LEN_MIN;
2597 table[1] += MATCH_LEN_MIN;
2598 table += 2;
2599 table[0] += MATCH_LEN_MIN;
2600 table[1] += MATCH_LEN_MIN;
2601 table += 2;
2602 }
2603 while (table != end);
2604 }
2605 #endif
2606 }
2607 // while (--y); }
2608 #endif
2609 */
2610 }
2611
2612 // p->seqTables_wereSet = True;
2613 if (in.len == 0)
2614 return SZ_ERROR_DATA;
2615 return Decompress_Sequences(p,
2616 in.ptr - SEQ_SRC_OFFSET - BIT_OFFSET_DELTA_BYTES, in.len,
2617 p->winPos + outLimit, &vars);
2618 }
2619 }
2620
2621
2622
2623
2624 // inSize != 0
2625 // it must do similar to ZstdDec1_DecodeBlock()
2626 static size_t ZstdDec1_NeedTempBufferForInput(
2627 const SizeT beforeSize, const Byte * const src, const SizeT inSize)
2628 {
2629 unsigned b0;
2630 UInt32 pos;
2631
2632 #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
2633 g_numSeqs = 1 << 24;
2634 #else
2635 // we have at least 3 bytes before seq data: litBlockType, numSeqs, seqMode
2636 #define MIN_BLOCK_LZ_HEADERS_SIZE 3
2637 if (beforeSize >= MAX_BACKWARD_DEPTH - MIN_BLOCK_LZ_HEADERS_SIZE)
2638 return 0;
2639 #endif
2640
2641 b0 = src[0];
2642
2643 if ((b0 & k_LitBlockType_Flag_Compressed) == 0)
2644 {
2645 UInt32 numLits = b0 >> 3;
2646 pos = 1;
2647 if (b0 & 4)
2648 {
2649 UInt32 v;
2650 if (inSize < 3)
2651 return 0;
2652 numLits >>= 1;
2653 v = GetUi16(src + 1);
2654 pos = 3;
2655 if ((b0 & 8) == 0)
2656 {
2657 pos = 2;
2658 v = (Byte)v;
2659 }
2660 numLits += v << 4;
2661 }
2662 if (b0 & k_LitBlockType_Flag_RLE_or_Treeless)
2663 numLits = 1;
2664 pos += numLits;
2665 }
2666 else if (inSize < 5)
2667 return 0;
2668 else
2669 {
2670 const unsigned mode4Streams = b0 & 0xc;
2671 const unsigned numBytes = (3 * mode4Streams + 48) >> 4;
2672 const unsigned numBits = 4 * numBytes - 6;
2673 UInt32 cs = GetUi32(src + 1);
2674 cs >>= numBits;
2675 cs &= ((UInt32)16 << numBits) - 1;
2676 if (cs == 0)
2677 return 0;
2678 pos = numBytes + cs;
2679 }
2680
2681 if (pos >= inSize)
2682 return 0;
2683 {
2684 UInt32 numSeqs = src[pos++];
2685 if (numSeqs > 127)
2686 {
2687 UInt32 b1;
2688 if (pos >= inSize)
2689 return 0;
2690 numSeqs -= 128;
2691 b1 = src[pos++];
2692 if (numSeqs == 127)
2693 {
2694 if (pos >= inSize)
2695 return 0;
2696 numSeqs = (UInt32)(src[pos++]) + 127;
2697 }
2698 numSeqs = (numSeqs << 8) + b1;
2699 }
2700 #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
2701 g_numSeqs = numSeqs; // for debug
2702 #endif
2703 if (numSeqs == 0)
2704 return 0;
2705 }
2706 /*
2707 if (pos >= inSize)
2708 return 0;
2709 pos++;
2710 */
2711 // we will have one additional byte for seqMode:
2712 if (beforeSize + pos >= MAX_BACKWARD_DEPTH - 1)
2713 return 0;
2714 return 1;
2715 }
2716
2717
2718
2719 // ---------- ZSTD FRAME ----------
2720
2721 #define kBlockType_Raw 0
2722 #define kBlockType_RLE 1
2723 #define kBlockType_Compressed 2
2724 #define kBlockType_Reserved 3
2725
2726 typedef enum
2727 {
2728 // begin: states that require 4 bytes:
2729 ZSTD2_STATE_SIGNATURE,
2730 ZSTD2_STATE_HASH,
2731 ZSTD2_STATE_SKIP_HEADER,
2732 // end of states that require 4 bytes
2733
2734 ZSTD2_STATE_SKIP_DATA,
2735 ZSTD2_STATE_FRAME_HEADER,
2736 ZSTD2_STATE_AFTER_HEADER,
2737 ZSTD2_STATE_BLOCK,
2738 ZSTD2_STATE_DATA,
2739 ZSTD2_STATE_FINISHED
2740 } EZstd2State;
2741
2742
2743 struct CZstdDec
2744 {
2745 EZstd2State frameState;
2746 unsigned tempSize;
2747
2748 Byte temp[14]; // 14 is required
2749
2750 Byte descriptor;
2751 Byte windowDescriptor;
2752 Byte isLastBlock;
2753 Byte blockType;
2754 Byte isErrorState;
2755 Byte hashError;
2756 Byte disableHash;
2757 Byte isCyclicMode;
2758
2759 UInt32 blockSize;
2760 UInt32 dictionaryId;
2761 UInt32 curBlockUnpackRem; // for compressed blocks only
2762 UInt32 inTempPos;
2763
2764 UInt64 contentSize;
2765 UInt64 contentProcessed;
2766 CXxh64State xxh64;
2767
2768 Byte *inTemp;
2769 SizeT winBufSize_Allocated;
2770 Byte *win_Base;
2771
2772 ISzAllocPtr alloc_Small;
2773 ISzAllocPtr alloc_Big;
2774
2775 CZstdDec1 decoder;
2776 };
2777
2778 #define ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p) \
2779 ((unsigned)(p)->contentProcessed & (Z7_XXH64_BLOCK_SIZE - 1))
2780
2781 #define ZSTD_DEC_IS_LAST_BLOCK(p) ((p)->isLastBlock)
2782
2783
2784 static void ZstdDec_FreeWindow(CZstdDec * const p)
2785 {
2786 if (p->win_Base)
2787 {
2788 ISzAlloc_Free(p->alloc_Big, p->win_Base);
2789 p->win_Base = NULL;
2790 // p->decoder.win = NULL;
2791 p->winBufSize_Allocated = 0;
2792 }
2793 }
2794
2795
2796 CZstdDecHandle ZstdDec_Create(ISzAllocPtr alloc_Small, ISzAllocPtr alloc_Big)
2797 {
2798 CZstdDec *p = (CZstdDec *)ISzAlloc_Alloc(alloc_Small, sizeof(CZstdDec));
2799 if (!p)
2800 return NULL;
2801 p->alloc_Small = alloc_Small;
2802 p->alloc_Big = alloc_Big;
2803 // ZstdDec_CONSTRUCT(p)
2804 p->inTemp = NULL;
2805 p->win_Base = NULL;
2806 p->winBufSize_Allocated = 0;
2807 p->disableHash = False;
2808 ZstdDec1_Construct(&p->decoder);
2809 return p;
2810 }
2811
2812 void ZstdDec_Destroy(CZstdDecHandle p)
2813 {
2814 #ifdef SHOW_STAT
2815 #define PRINT_STAT1(name, v) \
2816 printf("\n%25s = %9u", name, v);
2817 PRINT_STAT1("g_Num_Blocks_Compressed", g_Num_Blocks_Compressed)
2818 PRINT_STAT1("g_Num_Blocks_memcpy", g_Num_Blocks_memcpy)
2819 PRINT_STAT1("g_Num_Wrap_memmove_Num", g_Num_Wrap_memmove_Num)
2820 PRINT_STAT1("g_Num_Wrap_memmove_Bytes", g_Num_Wrap_memmove_Bytes)
2821 if (g_Num_Blocks_Compressed)
2822 {
2823 #define PRINT_STAT(name, v) \
2824 printf("\n%17s = %9u, per_block = %8u", name, v, v / g_Num_Blocks_Compressed);
2825 PRINT_STAT("g_NumSeqs", g_NumSeqs_total)
2826 // PRINT_STAT("g_NumCopy", g_NumCopy)
2827 PRINT_STAT("g_NumOver", g_NumOver)
2828 PRINT_STAT("g_NumOver2", g_NumOver2)
2829 PRINT_STAT("g_Num_Match", g_Num_Match)
2830 PRINT_STAT("g_Num_Lits", g_Num_Lits)
2831 PRINT_STAT("g_Num_LitsBig", g_Num_LitsBig)
2832 PRINT_STAT("g_Num_Lit0", g_Num_Lit0)
2833 PRINT_STAT("g_Num_Rep_0", g_Num_Rep0)
2834 PRINT_STAT("g_Num_Rep_1", g_Num_Rep1)
2835 PRINT_STAT("g_Num_Rep_2", g_Num_Rep2)
2836 PRINT_STAT("g_Num_Rep_3", g_Num_Rep3)
2837 PRINT_STAT("g_Num_Threshold_0", g_Num_Threshold_0)
2838 PRINT_STAT("g_Num_Threshold_1", g_Num_Threshold_1)
2839 PRINT_STAT("g_Num_Threshold_0sum", g_Num_Threshold_0sum)
2840 PRINT_STAT("g_Num_Threshold_1sum", g_Num_Threshold_1sum)
2841 }
2842 printf("\n");
2843 #endif
2844
2845 ISzAlloc_Free(p->alloc_Small, p->decoder.literalsBase);
2846 // p->->decoder.literalsBase = NULL;
2847 ISzAlloc_Free(p->alloc_Small, p->inTemp);
2848 // p->inTemp = NULL;
2849 ZstdDec_FreeWindow(p);
2850 ISzAlloc_Free(p->alloc_Small, p);
2851 }
2852
2853
2854
2855 #define kTempBuffer_PreSize (1u << 6)
2856 #if kTempBuffer_PreSize < MAX_BACKWARD_DEPTH
2857 #error Stop_Compiling_Bad_kTempBuffer_PreSize
2858 #endif
2859
2860 static SRes ZstdDec_AllocateMisc(CZstdDec *p)
2861 {
2862 #define k_Lit_AfterAvail (1u << 6)
2863 #if k_Lit_AfterAvail < (COPY_CHUNK_SIZE - 1)
2864 #error Stop_Compiling_Bad_k_Lit_AfterAvail
2865 #endif
2866 // return ZstdDec1_Allocate(&p->decoder, p->alloc_Small);
2867 if (!p->decoder.literalsBase)
2868 {
2869 p->decoder.literalsBase = (Byte *)ISzAlloc_Alloc(p->alloc_Small,
2870 kBlockSizeMax + k_Lit_AfterAvail);
2871 if (!p->decoder.literalsBase)
2872 return SZ_ERROR_MEM;
2873 }
2874 if (!p->inTemp)
2875 {
2876 // we need k_Lit_AfterAvail here for owerread from raw literals stream
2877 p->inTemp = (Byte *)ISzAlloc_Alloc(p->alloc_Small,
2878 kBlockSizeMax + kTempBuffer_PreSize + k_Lit_AfterAvail);
2879 if (!p->inTemp)
2880 return SZ_ERROR_MEM;
2881 }
2882 return SZ_OK;
2883 }
2884
2885
2886 static void ZstdDec_Init_ForNewFrame(CZstdDec *p)
2887 {
2888 p->frameState = ZSTD2_STATE_SIGNATURE;
2889 p->tempSize = 0;
2890
2891 p->isErrorState = False;
2892 p->hashError = False;
2893 p->isCyclicMode = False;
2894 p->contentProcessed = 0;
2895 Xxh64State_Init(&p->xxh64);
2896 ZstdDec1_Init(&p->decoder);
2897 }
2898
2899
2900 void ZstdDec_Init(CZstdDec *p)
2901 {
2902 ZstdDec_Init_ForNewFrame(p);
2903 p->decoder.winPos = 0;
2904 memset(p->temp, 0, sizeof(p->temp));
2905 }
2906
2907
2908 #define DESCRIPTOR_Get_DictionaryId_Flag(d) ((d) & 3)
2909 #define DESCRIPTOR_FLAG_CHECKSUM (1 << 2)
2910 #define DESCRIPTOR_FLAG_RESERVED (1 << 3)
2911 // #define DESCRIPTOR_FLAG_UNUSED (1 << 4)
2912 #define DESCRIPTOR_FLAG_SINGLE (1 << 5)
2913 #define DESCRIPTOR_Get_ContentSize_Flag3(d) ((d) >> 5)
2914 #define DESCRIPTOR_Is_ContentSize_Defined(d) (((d) & 0xe0) != 0)
2915
2916
2917 static EZstd2State ZstdDec_UpdateState(CZstdDec * const p, const Byte b, CZstdDecInfo * const info)
2918 {
2919 unsigned tempSize = p->tempSize;
2920 p->temp[tempSize++] = b;
2921 p->tempSize = tempSize;
2922
2923 if (p->frameState == ZSTD2_STATE_BLOCK)
2924 {
2925 if (tempSize < 3)
2926 return ZSTD2_STATE_BLOCK;
2927 {
2928 UInt32 b0 = GetUi32(p->temp);
2929 const unsigned type = ((unsigned)b0 >> 1) & 3;
2930 if (type == kBlockType_RLE && tempSize == 3)
2931 return ZSTD2_STATE_BLOCK;
2932 // info->num_Blocks_forType[type]++;
2933 info->num_Blocks++;
2934 if (type == kBlockType_Reserved)
2935 {
2936 p->isErrorState = True; // SZ_ERROR_UNSUPPORTED
2937 return ZSTD2_STATE_BLOCK;
2938 }
2939 p->blockType = (Byte)type;
2940 p->isLastBlock = (Byte)(b0 & 1);
2941 p->inTempPos = 0;
2942 p->tempSize = 0;
2943 b0 >>= 3;
2944 b0 &= 0x1fffff;
2945 // info->num_BlockBytes_forType[type] += b0;
2946 if (b0 == 0)
2947 {
2948 // empty RAW/RLE blocks are allowed in original-zstd decoder
2949 if (type == kBlockType_Compressed)
2950 {
2951 p->isErrorState = True;
2952 return ZSTD2_STATE_BLOCK;
2953 }
2954 if (!ZSTD_DEC_IS_LAST_BLOCK(p))
2955 return ZSTD2_STATE_BLOCK;
2956 if (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM)
2957 return ZSTD2_STATE_HASH;
2958 return ZSTD2_STATE_FINISHED;
2959 }
2960 p->blockSize = b0;
2961 {
2962 UInt32 blockLim = ZstdDec1_GET_BLOCK_SIZE_LIMIT(&p->decoder);
2963 // compressed and uncompressed block sizes cannot be larger than min(kBlockSizeMax, window_size)
2964 if (b0 > blockLim)
2965 {
2966 p->isErrorState = True; // SZ_ERROR_UNSUPPORTED;
2967 return ZSTD2_STATE_BLOCK;
2968 }
2969 if (DESCRIPTOR_Is_ContentSize_Defined(p->descriptor))
2970 {
2971 const UInt64 rem = p->contentSize - p->contentProcessed;
2972 if (blockLim > rem)
2973 blockLim = (UInt32)rem;
2974 }
2975 p->curBlockUnpackRem = blockLim;
2976 // uncompressed block size cannot be larger than remain data size:
2977 if (type != kBlockType_Compressed)
2978 {
2979 if (b0 > blockLim)
2980 {
2981 p->isErrorState = True; // SZ_ERROR_UNSUPPORTED;
2982 return ZSTD2_STATE_BLOCK;
2983 }
2984 }
2985 }
2986 }
2987 return ZSTD2_STATE_DATA;
2988 }
2989
2990 if ((unsigned)p->frameState < ZSTD2_STATE_SKIP_DATA)
2991 {
2992 UInt32 v;
2993 if (tempSize != 4)
2994 return p->frameState;
2995 v = GetUi32(p->temp);
2996 if ((unsigned)p->frameState < ZSTD2_STATE_HASH) // == ZSTD2_STATE_SIGNATURE
2997 {
2998 if (v == 0xfd2fb528)
2999 {
3000 p->tempSize = 0;
3001 info->num_DataFrames++;
3002 return ZSTD2_STATE_FRAME_HEADER;
3003 }
3004 if ((v & 0xfffffff0) == 0x184d2a50)
3005 {
3006 p->tempSize = 0;
3007 info->num_SkipFrames++;
3008 return ZSTD2_STATE_SKIP_HEADER;
3009 }
3010 p->isErrorState = True;
3011 return ZSTD2_STATE_SIGNATURE;
3012 // return ZSTD2_STATE_ERROR; // is not ZSTD stream
3013 }
3014 if (p->frameState == ZSTD2_STATE_HASH)
3015 {
3016 info->checksum_Defined = True;
3017 info->checksum = v;
3018 // #ifndef DISABLE_XXH_CHECK
3019 if (!p->disableHash)
3020 {
3021 if (p->decoder.winPos < ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p))
3022 {
3023 // unexpected code failure
3024 p->isErrorState = True;
3025 // SZ_ERROR_FAIL;
3026 }
3027 else
3028 if ((UInt32)Xxh64State_Digest(&p->xxh64,
3029 p->decoder.win + (p->decoder.winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p)),
3030 p->contentProcessed) != v)
3031 {
3032 p->hashError = True;
3033 // return ZSTD2_STATE_ERROR; // hash error
3034 }
3035 }
3036 // #endif
3037 return ZSTD2_STATE_FINISHED;
3038 }
3039 // (p->frameState == ZSTD2_STATE_SKIP_HEADER)
3040 {
3041 p->blockSize = v;
3042 info->skipFrames_Size += v;
3043 p->tempSize = 0;
3044 /* we want the caller could know that there was finished frame
3045 finished frame. So we allow the case where
3046 we have ZSTD2_STATE_SKIP_DATA state with (blockSize == 0).
3047 */
3048 // if (v == 0) return ZSTD2_STATE_SIGNATURE;
3049 return ZSTD2_STATE_SKIP_DATA;
3050 }
3051 }
3052
3053 // if (p->frameState == ZSTD2_STATE_FRAME_HEADER)
3054 {
3055 unsigned descriptor;
3056 const Byte *h;
3057 descriptor = p->temp[0];
3058 p->descriptor = (Byte)descriptor;
3059 if (descriptor & DESCRIPTOR_FLAG_RESERVED) // reserved bit
3060 {
3061 p->isErrorState = True;
3062 return ZSTD2_STATE_FRAME_HEADER;
3063 // return ZSTD2_STATE_ERROR;
3064 }
3065 {
3066 const unsigned n = DESCRIPTOR_Get_ContentSize_Flag3(descriptor);
3067 // tempSize -= 1 + ((1u << (n >> 1)) | ((n + 1) & 1));
3068 tempSize -= (0x9a563422u >> (n * 4)) & 0xf;
3069 }
3070 if (tempSize != (4u >> (3 - DESCRIPTOR_Get_DictionaryId_Flag(descriptor))))
3071 return ZSTD2_STATE_FRAME_HEADER;
3072
3073 info->descriptor_OR = (Byte)(info->descriptor_OR | descriptor);
3074 info->descriptor_NOT_OR = (Byte)(info->descriptor_NOT_OR | ~descriptor);
3075
3076 h = &p->temp[1];
3077 {
3078 Byte w = 0;
3079 if ((descriptor & DESCRIPTOR_FLAG_SINGLE) == 0)
3080 {
3081 w = *h++;
3082 if (info->windowDescriptor_MAX < w)
3083 info->windowDescriptor_MAX = w;
3084 // info->are_WindowDescriptors = True;
3085 // info->num_WindowDescriptors++;
3086 }
3087 else
3088 {
3089 // info->are_SingleSegments = True;
3090 // info->num_SingleSegments++;
3091 }
3092 p->windowDescriptor = w;
3093 }
3094 {
3095 unsigned n = DESCRIPTOR_Get_DictionaryId_Flag(descriptor);
3096 UInt32 d = 0;
3097 if (n)
3098 {
3099 n = 1u << (n - 1);
3100 d = GetUi32(h) & ((UInt32)(Int32)-1 >> (32 - 8u * n));
3101 h += n;
3102 }
3103 p->dictionaryId = d;
3104 // info->dictionaryId_Cur = d;
3105 if (d != 0)
3106 {
3107 if (info->dictionaryId == 0)
3108 info->dictionaryId = d;
3109 else if (info->dictionaryId != d)
3110 info->are_DictionaryId_Different = True;
3111 }
3112 }
3113 {
3114 unsigned n = DESCRIPTOR_Get_ContentSize_Flag3(descriptor);
3115 UInt64 v = 0;
3116 if (n)
3117 {
3118 n >>= 1;
3119 if (n == 1)
3120 v = 256;
3121 v += GetUi64(h) & ((UInt64)(Int64)-1 >> (64 - (8u << n)));
3122 // info->are_ContentSize_Known = True;
3123 // info->num_Frames_with_ContentSize++;
3124 if (info->contentSize_MAX < v)
3125 info->contentSize_MAX = v;
3126 info->contentSize_Total += v;
3127 }
3128 else
3129 {
3130 info->are_ContentSize_Unknown = True;
3131 // info->num_Frames_without_ContentSize++;
3132 }
3133 p->contentSize = v;
3134 }
3135 // if ((size_t)(h - p->temp) != headerSize) return ZSTD2_STATE_ERROR; // it's unexpected internal code failure
3136 p->tempSize = 0;
3137
3138 info->checksum_Defined = False;
3139 /*
3140 if (descriptor & DESCRIPTOR_FLAG_CHECKSUM)
3141 info->are_Checksums = True;
3142 else
3143 info->are_Non_Checksums = True;
3144 */
3145
3146 return ZSTD2_STATE_AFTER_HEADER; // ZSTD2_STATE_BLOCK;
3147 }
3148 }
3149
3150
3151 static void ZstdDec_Update_XXH(CZstdDec * const p, size_t xxh64_winPos)
3152 {
3153 /*
3154 #ifdef DISABLE_XXH_CHECK
3155 UNUSED_VAR(data)
3156 #else
3157 */
3158 if (!p->disableHash && (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM))
3159 {
3160 // const size_t pos = p->xxh64_winPos;
3161 const size_t size = (p->decoder.winPos - xxh64_winPos) & ~(size_t)31;
3162 if (size)
3163 {
3164 // p->xxh64_winPos = pos + size;
3165 Xxh64State_UpdateBlocks(&p->xxh64,
3166 p->decoder.win + xxh64_winPos,
3167 p->decoder.win + xxh64_winPos + size);
3168 }
3169 }
3170 }
3171
3172
3173 /*
3174 in:
3175 (winLimit) : is relaxed limit, where this function is allowed to stop writing of decoded data (if possible).
3176 - this function uses (winLimit) for RAW/RLE blocks only,
3177 because this function can decode single RAW/RLE block in several different calls.
3178 - this function DOESN'T use (winLimit) for Compressed blocks,
3179 because this function decodes full compressed block in single call.
3180 (CZstdDec1::winPos <= winLimit)
3181 (winLimit <= CZstdDec1::cycSize).
3182 Note: if (ds->outBuf_fromCaller) mode is used, then
3183 {
3184 (strong_limit) is stored in CZstdDec1::cycSize.
3185 So (winLimit) is more strong than (strong_limit).
3186 }
3187
3188 exit:
3189 Note: (CZstdDecState::winPos) will be set by caller after exit of this function.
3190
3191 This function can exit for any of these conditions:
3192 - (frameState == ZSTD2_STATE_AFTER_HEADER)
3193 - (frameState == ZSTD2_STATE_FINISHED) : frame was finished : (status == ZSTD_STATUS_FINISHED_FRAME) is set
3194 - finished non-empty non-last block. So (CZstdDec1::winPos_atExit != winPos_atFuncStart).
3195 - ZSTD_STATUS_NEEDS_MORE_INPUT in src
3196 - (CZstdDec1::winPos) have reached (winLimit) in non-finished RAW/RLE block
3197
3198 This function decodes no more than one non-empty block.
3199 So it fulfills the condition at exit:
3200 (CZstdDec1::winPos_atExit - winPos_atFuncStart <= block_size_max)
3201 Note: (winPos_atExit > winLimit) is possible in some cases after compressed block decoding.
3202
3203 if (ds->outBuf_fromCaller) mode (useAdditionalWinLimit medo)
3204 {
3205 then this function uses additional strong limit from (CZstdDec1::cycSize).
3206 So this function will not write any data after (CZstdDec1::cycSize)
3207 And it fulfills the condition at exit:
3208 (CZstdDec1::winPos_atExit <= CZstdDec1::cycSize)
3209 }
3210 */
3211 static SRes ZstdDec_DecodeBlock(CZstdDec * const p, CZstdDecState * const ds,
3212 SizeT winLimitAdd)
3213 {
3214 const Byte *src = ds->inBuf;
3215 SizeT * const srcLen = &ds->inPos;
3216 const SizeT inSize = ds->inLim;
3217 // const int useAdditionalWinLimit = ds->outBuf_fromCaller ? 1 : 0;
3218 enum_ZstdStatus * const status = &ds->status;
3219 CZstdDecInfo * const info = &ds->info;
3220 SizeT winLimit;
3221
3222 const SizeT winPos_atFuncStart = p->decoder.winPos;
3223 src += *srcLen;
3224 *status = ZSTD_STATUS_NOT_SPECIFIED;
3225
3226 // finishMode = ZSTD_FINISH_ANY;
3227 if (ds->outSize_Defined)
3228 {
3229 if (ds->outSize < ds->outProcessed)
3230 {
3231 // p->isAfterSizeMode = 2; // we have extra bytes already
3232 *status = ZSTD_STATUS_OUT_REACHED;
3233 return SZ_OK;
3234 // size = 0;
3235 }
3236 else
3237 {
3238 // p->outSize >= p->outProcessed
3239 const UInt64 rem = ds->outSize - ds->outProcessed;
3240 /*
3241 if (rem == 0)
3242 p->isAfterSizeMode = 1; // we have reached exact required size
3243 */
3244 if (winLimitAdd >= rem)
3245 {
3246 winLimitAdd = (SizeT)rem;
3247 // if (p->finishMode) finishMode = ZSTD_FINISH_END;
3248 }
3249 }
3250 }
3251
3252 winLimit = p->decoder.winPos + winLimitAdd;
3253 // (p->decoder.winPos <= winLimit)
3254
3255 // while (p->frameState != ZSTD2_STATE_ERROR)
3256 while (!p->isErrorState)
3257 {
3258 SizeT inCur = inSize - *srcLen;
3259
3260 if (p->frameState == ZSTD2_STATE_DATA)
3261 {
3262 /* (p->decoder.winPos == winPos_atFuncStart) is expected,
3263 because this function doesn't start new block.
3264 if it have finished some non-empty block in this call. */
3265 if (p->decoder.winPos != winPos_atFuncStart)
3266 return SZ_ERROR_FAIL; // it's unexpected
3267
3268 /*
3269 if (p->decoder.winPos > winLimit)
3270 {
3271 // we can be here, if in this function call
3272 // - we have extracted non-empty compressed block, and (winPos > winLimit) after that.
3273 // - we have started new block decoding after that.
3274 // It's unexpected case, because we exit after non-empty non-last block.
3275 *status = (inSize == *srcLen) ?
3276 ZSTD_STATUS_NEEDS_MORE_INPUT :
3277 ZSTD_STATUS_NOT_FINISHED;
3278 return SZ_OK;
3279 }
3280 */
3281 // p->decoder.winPos <= winLimit
3282
3283 if (p->blockType != kBlockType_Compressed)
3284 {
3285 // it's RLE or RAW block.
3286 // p->BlockSize != 0_
3287 // winLimit <= p->decoder.cycSize
3288 /* So here we use more strong (winLimit), even for
3289 (ds->outBuf_fromCaller) mode. */
3290 SizeT outCur = winLimit - p->decoder.winPos;
3291 {
3292 const UInt32 rem = p->blockSize;
3293 if (outCur > rem)
3294 outCur = rem;
3295 }
3296 if (p->blockType == kBlockType_Raw)
3297 {
3298 if (outCur > inCur)
3299 outCur = inCur;
3300 /* output buffer is better aligned for XXH code.
3301 So we use hash for output buffer data */
3302 // ZstdDec_Update_XXH(p, src, outCur); // for debug:
3303 memcpy(p->decoder.win + p->decoder.winPos, src, outCur);
3304 src += outCur;
3305 *srcLen += outCur;
3306 }
3307 else // kBlockType_RLE
3308 {
3309 #define RLE_BYTE_INDEX_IN_temp 3
3310 memset(p->decoder.win + p->decoder.winPos,
3311 p->temp[RLE_BYTE_INDEX_IN_temp], outCur);
3312 }
3313 {
3314 const SizeT xxh64_winPos = p->decoder.winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p);
3315 p->decoder.winPos += outCur;
3316 UPDATE_TOTAL_OUT(&p->decoder, outCur)
3317 p->contentProcessed += outCur;
3318 ZstdDec_Update_XXH(p, xxh64_winPos);
3319 }
3320 // ds->winPos = p->decoder.winPos; // the caller does it instead. for debug:
3321 ds->outProcessed += outCur;
3322 if (p->blockSize -= (UInt32)outCur)
3323 {
3324 /*
3325 if (ds->outSize_Defined)
3326 {
3327 if (ds->outSize <= ds->outProcessed) ds->isAfterSizeMode = (enum_ZstdStatus)
3328 (ds->outSize == ds->outProcessed ? 1u: 2u);
3329 }
3330 */
3331 *status = (enum_ZstdStatus)
3332 (ds->outSize_Defined && ds->outSize <= ds->outProcessed ?
3333 ZSTD_STATUS_OUT_REACHED : (p->blockType == kBlockType_Raw && inSize == *srcLen) ?
3334 ZSTD_STATUS_NEEDS_MORE_INPUT :
3335 ZSTD_STATUS_NOT_FINISHED);
3336 return SZ_OK;
3337 }
3338 }
3339 else // kBlockType_Compressed
3340 {
3341 // p->blockSize != 0
3342 // (uncompressed_size_of_block == 0) is allowed
3343 // (p->curBlockUnpackRem == 0) is allowed
3344 /*
3345 if (p->decoder.winPos >= winLimit)
3346 {
3347 if (p->decoder.winPos != winPos_atFuncStart)
3348 {
3349 // it's unexpected case
3350 // We already have some data in finished blocks in this function call.
3351 // So we don't decompress new block after (>=winLimit),
3352 // even if it's empty block.
3353 *status = (inSize == *srcLen) ?
3354 ZSTD_STATUS_NEEDS_MORE_INPUT :
3355 ZSTD_STATUS_NOT_FINISHED;
3356 return SZ_OK;
3357 }
3358 // (p->decoder.winPos == winLimit == winPos_atFuncStart)
3359 // we will decode current block, because that current
3360 // block can be empty block and we want to make some visible
3361 // change of (src) stream after function start.
3362 }
3363 */
3364 /*
3365 if (ds->outSize_Defined && ds->outSize < ds->outProcessed)
3366 {
3367 // we don't want to start new block, if we have more extra decoded bytes already
3368 *status = ZSTD_STATUS_OUT_REACHED;
3369 return SZ_OK;
3370 }
3371 */
3372 {
3373 const Byte *comprStream;
3374 size_t afterAvail;
3375 UInt32 inTempPos = p->inTempPos;
3376 const UInt32 rem = p->blockSize - inTempPos;
3377 // rem != 0
3378 if (inTempPos != 0 // (inTemp) buffer already contains some input data
3379 || inCur < rem // available input data size is smaller than compressed block size
3380 || ZstdDec1_NeedTempBufferForInput(*srcLen, src, rem))
3381 {
3382 if (inCur > rem)
3383 inCur = rem;
3384 if (inCur)
3385 {
3386 STAT_INC(g_Num_Blocks_memcpy)
3387 // we clear data for backward lookahead reading
3388 if (inTempPos == 0)
3389 memset(p->inTemp + kTempBuffer_PreSize - MAX_BACKWARD_DEPTH, 0, MAX_BACKWARD_DEPTH);
3390 // { unsigned y = 0; for(;y < 1000; y++)
3391 memcpy(p->inTemp + inTempPos + kTempBuffer_PreSize, src, inCur);
3392 // }
3393 src += inCur;
3394 *srcLen += inCur;
3395 inTempPos += (UInt32)inCur;
3396 p->inTempPos = inTempPos;
3397 }
3398 if (inTempPos != p->blockSize)
3399 {
3400 *status = ZSTD_STATUS_NEEDS_MORE_INPUT;
3401 return SZ_OK;
3402 }
3403 #if COPY_CHUNK_SIZE > 1
3404 memset(p->inTemp + kTempBuffer_PreSize + inTempPos, 0, COPY_CHUNK_SIZE);
3405 #endif
3406 comprStream = p->inTemp + kTempBuffer_PreSize;
3407 afterAvail = k_Lit_AfterAvail;
3408 // we don't want to read non-initialized data or junk in CopyMatch():
3409 }
3410 else
3411 {
3412 // inCur >= rem
3413 // we use direct decoding from (src) buffer:
3414 afterAvail = inCur - rem;
3415 comprStream = src;
3416 src += rem;
3417 *srcLen += rem;
3418 }
3419
3420 #ifdef Z7_ZSTD_DEC_USE_CHECK_OF_NEED_TEMP
3421 ZstdDec1_NeedTempBufferForInput(*srcLen, comprStream, p->blockSize);
3422 #endif
3423 // printf("\nblockSize=%u", p->blockSize);
3424 // printf("%x\n", (unsigned)p->contentProcessed);
3425 STAT_INC(g_Num_Blocks_Compressed)
3426 {
3427 SRes sres;
3428 const size_t winPos = p->decoder.winPos;
3429 /*
3430 if ( useAdditionalWinLimit), we use strong unpack limit: smallest from
3431 - limit from stream : (curBlockUnpackRem)
3432 - limit from caller : (cycSize - winPos)
3433 if (!useAdditionalWinLimit), we use only relaxed limit:
3434 - limit from stream : (curBlockUnpackRem)
3435 */
3436 SizeT outLimit = p->curBlockUnpackRem;
3437 if (ds->outBuf_fromCaller)
3438 // if (useAdditionalWinLimit)
3439 {
3440 const size_t limit = p->decoder.cycSize - winPos;
3441 if (outLimit > limit)
3442 outLimit = limit;
3443 }
3444 sres = ZstdDec1_DecodeBlock(&p->decoder,
3445 comprStream, p->blockSize, afterAvail, outLimit);
3446 // ds->winPos = p->decoder.winPos; // the caller does it instead. for debug:
3447 if (sres)
3448 {
3449 p->isErrorState = True;
3450 return sres;
3451 }
3452 {
3453 const SizeT xxh64_winPos = winPos - ZstdDec_GET_UNPROCESSED_XXH64_SIZE(p);
3454 const size_t num = p->decoder.winPos - winPos;
3455 ds->outProcessed += num;
3456 p->contentProcessed += num;
3457 ZstdDec_Update_XXH(p, xxh64_winPos);
3458 }
3459 }
3460 // printf("\nwinPos=%x", (int)(unsigned)p->decoder.winPos);
3461 }
3462 }
3463
3464 /*
3465 if (ds->outSize_Defined)
3466 {
3467 if (ds->outSize <= ds->outProcessed) ds->isAfterSizeMode = (enum_ZstdStatus)
3468 (ds->outSize == ds->outProcessed ? 1u: 2u);
3469 }
3470 */
3471
3472 if (!ZSTD_DEC_IS_LAST_BLOCK(p))
3473 {
3474 p->frameState = ZSTD2_STATE_BLOCK;
3475 if (ds->outSize_Defined && ds->outSize < ds->outProcessed)
3476 {
3477 *status = ZSTD_STATUS_OUT_REACHED;
3478 return SZ_OK;
3479 }
3480 // we exit only if (winPos) was changed in this function call:
3481 if (p->decoder.winPos != winPos_atFuncStart)
3482 {
3483 // decoded block was not empty. So we exit:
3484 *status = (enum_ZstdStatus)(
3485 (inSize == *srcLen) ?
3486 ZSTD_STATUS_NEEDS_MORE_INPUT :
3487 ZSTD_STATUS_NOT_FINISHED);
3488 return SZ_OK;
3489 }
3490 // (p->decoder.winPos == winPos_atFuncStart)
3491 // so current decoded block was empty.
3492 // we will try to decode more blocks in this function.
3493 continue;
3494 }
3495
3496 // decoded block was last in frame
3497 if (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM)
3498 {
3499 p->frameState = ZSTD2_STATE_HASH;
3500 if (ds->outSize_Defined && ds->outSize < ds->outProcessed)
3501 {
3502 *status = ZSTD_STATUS_OUT_REACHED;
3503 return SZ_OK; // disable if want to
3504 /* We want to get same return codes for any input buffer sizes.
3505 We want to get faster ZSTD_STATUS_OUT_REACHED status.
3506 So we exit with ZSTD_STATUS_OUT_REACHED here,
3507 instead of ZSTD2_STATE_HASH and ZSTD2_STATE_FINISHED processing.
3508 that depends from input buffer size and that can set
3509 ZSTD_STATUS_NEEDS_MORE_INPUT or return SZ_ERROR_DATA or SZ_ERROR_CRC.
3510 */
3511 }
3512 }
3513 else
3514 {
3515 /* ZSTD2_STATE_FINISHED proccesing doesn't depend from input buffer */
3516 p->frameState = ZSTD2_STATE_FINISHED;
3517 }
3518 /*
3519 p->frameState = (p->descriptor & DESCRIPTOR_FLAG_CHECKSUM) ?
3520 ZSTD2_STATE_HASH :
3521 ZSTD2_STATE_FINISHED;
3522 */
3523 /* it's required to process ZSTD2_STATE_FINISHED state in this function call,
3524 because we must check contentSize and hashError in ZSTD2_STATE_FINISHED code,
3525 while the caller can reinit full state for ZSTD2_STATE_FINISHED
3526 So we can't exit from function here. */
3527 continue;
3528 }
3529
3530 if (p->frameState == ZSTD2_STATE_FINISHED)
3531 {
3532 *status = ZSTD_STATUS_FINISHED_FRAME;
3533 if (DESCRIPTOR_Is_ContentSize_Defined(p->descriptor)
3534 && p->contentSize != p->contentProcessed)
3535 return SZ_ERROR_DATA;
3536 if (p->hashError) // for debug
3537 return SZ_ERROR_CRC;
3538 return SZ_OK;
3539 // p->frameState = ZSTD2_STATE_SIGNATURE;
3540 // continue;
3541 }
3542
3543 if (p->frameState == ZSTD2_STATE_AFTER_HEADER)
3544 return SZ_OK; // we need memory allocation for that state
3545
3546 if (p->frameState == ZSTD2_STATE_SKIP_DATA)
3547 {
3548 UInt32 blockSize = p->blockSize;
3549 // (blockSize == 0) is possible
3550 if (inCur > blockSize)
3551 inCur = blockSize;
3552 src += inCur;
3553 *srcLen += inCur;
3554 blockSize -= (UInt32)inCur;
3555 p->blockSize = blockSize;
3556 if (blockSize == 0)
3557 {
3558 p->frameState = ZSTD2_STATE_SIGNATURE;
3559 // continue; // for debug: we can continue without return to caller.
3560 // we notify the caller that skip frame was finished:
3561 *status = ZSTD_STATUS_FINISHED_FRAME;
3562 return SZ_OK;
3563 }
3564 // blockSize != 0
3565 // (inCur) was smaller than previous value of p->blockSize.
3566 // (inSize == *srcLen) now
3567 *status = ZSTD_STATUS_NEEDS_MORE_INPUT;
3568 return SZ_OK;
3569 }
3570
3571 if (inCur == 0)
3572 {
3573 *status = ZSTD_STATUS_NEEDS_MORE_INPUT;
3574 return SZ_OK;
3575 }
3576
3577 {
3578 (*srcLen)++;
3579 p->frameState = ZstdDec_UpdateState(p, *src++, info);
3580 }
3581 }
3582
3583 *status = ZSTD_STATUS_NOT_SPECIFIED;
3584 p->isErrorState = True;
3585 // p->frameState = ZSTD2_STATE_ERROR;
3586 // if (p->frameState = ZSTD2_STATE_SIGNATURE) return SZ_ERROR_NO_ARCHIVE
3587 return SZ_ERROR_DATA;
3588 }
3589
3590
3591
3592
3593 SRes ZstdDec_Decode(CZstdDecHandle dec, CZstdDecState *p)
3594 {
3595 p->needWrite_Size = 0;
3596 p->status = ZSTD_STATUS_NOT_SPECIFIED;
3597 dec->disableHash = p->disableHash;
3598
3599 if (p->outBuf_fromCaller)
3600 {
3601 dec->decoder.win = p->outBuf_fromCaller;
3602 dec->decoder.cycSize = p->outBufSize_fromCaller;
3603 }
3604
3605 // p->winPos = dec->decoder.winPos;
3606
3607 for (;;)
3608 {
3609 SizeT winPos, size;
3610 // SizeT outProcessed;
3611 SRes res;
3612
3613 if (p->wrPos > dec->decoder.winPos)
3614 return SZ_ERROR_FAIL;
3615
3616 if (dec->frameState == ZSTD2_STATE_FINISHED)
3617 {
3618 if (!p->outBuf_fromCaller)
3619 {
3620 // we need to set positions to zero for new frame.
3621 if (p->wrPos != dec->decoder.winPos)
3622 {
3623 /* We have already asked the caller to flush all data
3624 with (p->needWrite_Size) and (ZSTD_STATUS_FINISHED_FRAME) status.
3625 So it's unexpected case */
3626 // p->winPos = dec->decoder.winPos;
3627 // p->needWrite_Size = dec->decoder.winPos - p->wrPos; // flush size asking
3628 // return SZ_OK; // ask to flush again
3629 return SZ_ERROR_FAIL;
3630 }
3631 // (p->wrPos == dec->decoder.winPos), and we wrap to zero:
3632 dec->decoder.winPos = 0;
3633 p->winPos = 0;
3634 p->wrPos = 0;
3635 }
3636 ZstdDec_Init_ForNewFrame(dec);
3637 // continue;
3638 }
3639
3640 winPos = dec->decoder.winPos;
3641 {
3642 SizeT next = dec->decoder.cycSize;
3643 /* cycSize == 0, if no buffer was allocated still,
3644 or, if (outBuf_fromCaller) mode and (outBufSize_fromCaller == 0) */
3645 if (!p->outBuf_fromCaller
3646 && next
3647 && next <= winPos
3648 && dec->isCyclicMode)
3649 {
3650 // (0 < decoder.cycSize <= winPos) in isCyclicMode.
3651 // so we need to wrap (winPos) and (wrPos) over (cycSize).
3652 const size_t delta = next;
3653 // (delta) is how many bytes we remove from buffer.
3654 /*
3655 // we don't need data older than last (cycSize) bytes.
3656 size_t delta = winPos - next; // num bytes after (cycSize)
3657 if (delta <= next) // it's expected case
3658 delta = next;
3659 // delta == Max(cycSize, winPos - cycSize)
3660 */
3661 if (p->wrPos < delta)
3662 {
3663 // (wrPos < decoder.cycSize)
3664 // We have asked already the caller to flush required data
3665 // p->status = ZSTD_STATUS_NOT_SPECIFIED;
3666 // p->winPos = winPos;
3667 // p->needWrite_Size = delta - p->wrPos; // flush size asking
3668 // return SZ_OK; // ask to flush again
3669 return SZ_ERROR_FAIL;
3670 }
3671 // p->wrPos >= decoder.cycSize
3672 // we move extra data after (decoder.cycSize) to start of cyclic buffer:
3673 winPos -= delta;
3674 if (winPos)
3675 {
3676 if (winPos >= delta)
3677 return SZ_ERROR_FAIL;
3678 memmove(dec->decoder.win, dec->decoder.win + delta, winPos);
3679 // printf("\nmemmove processed=%8x winPos=%8x\n", (unsigned)p->outProcessed, (unsigned)dec->decoder.winPos);
3680 STAT_INC(g_Num_Wrap_memmove_Num)
3681 STAT_UPDATE(g_Num_Wrap_memmove_Bytes += (unsigned)winPos;)
3682 }
3683 dec->decoder.winPos = winPos;
3684 p->winPos = winPos;
3685 p->wrPos -= delta;
3686 // dec->xxh64_winPos -= delta;
3687
3688 // (winPos < delta)
3689 #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF
3690 /* we set the data after cycSize, because
3691 we don't want to read non-initialized data or junk in CopyMatch(). */
3692 memset(dec->decoder.win + next, 0, COPY_CHUNK_SIZE);
3693 #endif
3694
3695 /*
3696 if (winPos == next)
3697 {
3698 if (winPos != p->wrPos)
3699 {
3700 // we already requested before to flush full data for that case.
3701 // but we give the caller a second chance to flush data:
3702 p->needWrite_Size = winPos - p->wrPos;
3703 return SZ_OK;
3704 }
3705 // (decoder.cycSize == winPos == p->wrPos)
3706 // so we do second wrapping to zero:
3707 winPos = 0;
3708 dec->decoder.winPos = 0;
3709 p->winPos = 0;
3710 p->wrPos = 0;
3711 }
3712 */
3713 // (winPos < next)
3714 }
3715
3716 if (winPos > next)
3717 return SZ_ERROR_FAIL; // it's unexpected case
3718 /*
3719 if (!outBuf_fromCaller && isCyclicMode && cycSize != 0)
3720 then (winPos < cycSize)
3721 else (winPos <= cycSize)
3722 */
3723 if (!p->outBuf_fromCaller)
3724 {
3725 // that code is optional. We try to optimize write chunk sizes.
3726 /* (next2) is expected next write position in the caller,
3727 if the caller writes by kBlockSizeMax chunks.
3728 */
3729 /*
3730 const size_t next2 = (winPos + kBlockSizeMax) & (kBlockSizeMax - 1);
3731 if (winPos < next2 && next2 < next)
3732 next = next2;
3733 */
3734 }
3735 size = next - winPos;
3736 }
3737
3738 // note: ZstdDec_DecodeBlock() uses (winLimit = winPos + size) only for RLE and RAW blocks
3739 res = ZstdDec_DecodeBlock(dec, p, size);
3740 /*
3741 after one block decoding:
3742 if (!outBuf_fromCaller && isCyclicMode && cycSize != 0)
3743 then (winPos < cycSize + max_block_size)
3744 else (winPos <= cycSize)
3745 */
3746
3747 if (!p->outBuf_fromCaller)
3748 p->win = dec->decoder.win;
3749 p->winPos = dec->decoder.winPos;
3750
3751 // outProcessed = dec->decoder.winPos - winPos;
3752 // p->outProcessed += outProcessed;
3753
3754 if (res != SZ_OK)
3755 return res;
3756
3757 if (dec->frameState != ZSTD2_STATE_AFTER_HEADER)
3758 {
3759 if (p->outBuf_fromCaller)
3760 return SZ_OK;
3761 {
3762 // !p->outBuf_fromCaller
3763 /*
3764 if (ZSTD_STATUS_FINISHED_FRAME), we request full flushing here because
3765 1) it's simpler to work with allocation and extracting of next frame,
3766 2) it's better to start writing to next new frame with aligned memory
3767 for faster xxh 64-bit reads.
3768 */
3769 size_t end = dec->decoder.winPos; // end pos for all data flushing
3770 if (p->status != ZSTD_STATUS_FINISHED_FRAME)
3771 {
3772 // we will request flush here only for cases when wrap in cyclic buffer can be required in next call.
3773 if (!dec->isCyclicMode)
3774 return SZ_OK;
3775 // isCyclicMode
3776 {
3777 const size_t delta = dec->decoder.cycSize;
3778 if (end < delta)
3779 return SZ_OK; // (winPos < cycSize). no need for flush
3780 // cycSize <= winPos
3781 // So we ask the caller to flush of (cycSize - wrPos) bytes,
3782 // and then we will wrap cylicBuffer in next call
3783 end = delta;
3784 }
3785 }
3786 p->needWrite_Size = end - p->wrPos;
3787 }
3788 return SZ_OK;
3789 }
3790
3791 // ZSTD2_STATE_AFTER_HEADER
3792 {
3793 BoolInt useCyclic = False;
3794 size_t cycSize;
3795
3796 // p->status = ZSTD_STATUS_NOT_FINISHED;
3797 if (dec->dictionaryId != 0)
3798 {
3799 /* actually we can try to decode some data,
3800 because it's possible that some data doesn't use dictionary */
3801 // p->status = ZSTD_STATUS_NOT_SPECIFIED;
3802 return SZ_ERROR_UNSUPPORTED;
3803 }
3804
3805 {
3806 UInt64 winSize = dec->contentSize;
3807 UInt64 winSize_Allocate = winSize;
3808 const unsigned descriptor = dec->descriptor;
3809
3810 if ((descriptor & DESCRIPTOR_FLAG_SINGLE) == 0)
3811 {
3812 const Byte wd = dec->windowDescriptor;
3813 winSize = (UInt64)(8 + (wd & 7)) << ((wd >> 3) + 10 - 3);
3814 if (!DESCRIPTOR_Is_ContentSize_Defined(descriptor)
3815 || winSize_Allocate > winSize)
3816 {
3817 winSize_Allocate = winSize;
3818 useCyclic = True;
3819 }
3820 }
3821 /*
3822 else
3823 {
3824 if (p->info.singleSegment_ContentSize_MAX < winSize)
3825 p->info.singleSegment_ContentSize_MAX = winSize;
3826 // p->info.num_SingleSegments++;
3827 }
3828 */
3829 if (p->info.windowSize_MAX < winSize)
3830 p->info.windowSize_MAX = winSize;
3831 if (p->info.windowSize_Allocate_MAX < winSize_Allocate)
3832 p->info.windowSize_Allocate_MAX = winSize_Allocate;
3833 /*
3834 winSize_Allocate is MIN(content_size, window_size_from_descriptor).
3835 Wven if (content_size < (window_size_from_descriptor))
3836 original-zstd still uses (window_size_from_descriptor) to check that decoding is allowed.
3837 We try to follow original-zstd, and here we check (winSize) instead of (winSize_Allocate))
3838 */
3839 if (
3840 // winSize_Allocate // it's relaxed check
3841 winSize // it's more strict check to be compatible with original-zstd
3842 > ((UInt64)1 << MAX_WINDOW_SIZE_LOG))
3843 return SZ_ERROR_UNSUPPORTED; // SZ_ERROR_MEM
3844 cycSize = (size_t)winSize_Allocate;
3845 if (cycSize != winSize_Allocate)
3846 return SZ_ERROR_MEM;
3847 // cycSize <= winSize
3848 /* later we will use (CZstdDec1::winSize) to check match offsets and check block sizes.
3849 if (there is window descriptor)
3850 {
3851 We will check block size with (window_size_from_descriptor) instead of (winSize_Allocate).
3852 Does original-zstd do it that way also?
3853 }
3854 Here we must reduce full real 64-bit (winSize) to size_t for (CZstdDec1::winSize).
3855 Also we don't want too big values for (CZstdDec1::winSize).
3856 our (CZstdDec1::winSize) will meet the condition:
3857 (CZstdDec1::winSize < kBlockSizeMax || CZstdDec1::winSize <= cycSize).
3858 */
3859 dec->decoder.winSize = (winSize < kBlockSizeMax) ? (size_t)winSize: cycSize;
3860 // note: (CZstdDec1::winSize > cycSize) is possible, if (!useCyclic)
3861 }
3862
3863 RINOK(ZstdDec_AllocateMisc(dec))
3864
3865 if (p->outBuf_fromCaller)
3866 dec->isCyclicMode = False;
3867 else
3868 {
3869 size_t d = cycSize;
3870
3871 if (dec->decoder.winPos != p->wrPos)
3872 return SZ_ERROR_FAIL;
3873
3874 dec->decoder.winPos = 0;
3875 p->wrPos = 0;
3876 p->winPos = dec->decoder.winPos;
3877
3878 /*
3879 const size_t needWrite = dec->decoder.winPos - p->wrPos;
3880 if (!needWrite)
3881 {
3882 dec->decoder.winPos = 0;
3883 p->wrPos = 0;
3884 p->winPos = dec->decoder.winPos;
3885 }
3886 */
3887 /* if (!useCyclic) we allocate only cycSize = ContentSize.
3888 But if we want to support the case where new frame starts with winPos != 0,
3889 then we will wrap over zero, and we still need
3890 to set (useCyclic) and allocate additional buffer spaces.
3891 Now we don't allow new frame starting with (winPos != 0).
3892 so (dec->decoder->winPos == 0)
3893 can use (!useCyclic) with reduced buffer sizes.
3894 */
3895 /*
3896 if (dec->decoder->winPos != 0)
3897 useCyclic = True;
3898 */
3899
3900 if (useCyclic)
3901 {
3902 /* cyclyc buffer size must be at least (COPY_CHUNK_SIZE - 1) bytes
3903 larger than window size, because CopyMatch() can write additional
3904 (COPY_CHUNK_SIZE - 1) bytes and overwrite oldests data in cyclyc buffer.
3905 But for performance reasons we align (cycSize) for (kBlockSizeMax).
3906 also we must provide (cycSize >= max_decoded_data_after_cycSize),
3907 because after data move wrapping over zero we must provide (winPos < cycSize).
3908 */
3909 const size_t alignSize = kBlockSizeMax;
3910 /* here we add (1 << 7) instead of (COPY_CHUNK_SIZE - 1), because
3911 we want to get same (cycSize) for different COPY_CHUNK_SIZE values. */
3912 // cycSize += (COPY_CHUNK_SIZE - 1) + (alignSize - 1); // for debug : we can get smallest (cycSize)
3913 cycSize += (1 << 7) + alignSize;
3914 cycSize &= ~(size_t)(alignSize - 1);
3915 // cycSize must be aligned for 32, because xxh requires 32-bytes blocks.
3916 // cycSize += 12345; // for debug
3917 // cycSize += 1 << 10; // for debug
3918 // cycSize += 32; // for debug
3919 // cycSize += kBlockSizeMax; // for debug
3920 if (cycSize < d)
3921 return SZ_ERROR_MEM;
3922 /*
3923 in cyclic buffer mode we allow to decode one additional block
3924 that exceeds (cycSize).
3925 So we must allocate additional (kBlockSizeMax) bytes after (cycSize).
3926 if defined(Z7_STD_DEC_USE_AFTER_CYC_BUF)
3927 {
3928 we can read (COPY_CHUNK_SIZE - 1) bytes after (cycSize)
3929 but we aready allocate additional kBlockSizeMax that
3930 is larger than COPY_CHUNK_SIZE.
3931 So we don't need additional space of COPY_CHUNK_SIZE after (cycSize).
3932 }
3933 */
3934 /*
3935 #ifdef Z7_STD_DEC_USE_AFTER_CYC_BUF
3936 d = cycSize + (1 << 7); // we must add at least (COPY_CHUNK_SIZE - 1)
3937 #endif
3938 */
3939 d = cycSize + kBlockSizeMax;
3940 if (d < cycSize)
3941 return SZ_ERROR_MEM;
3942 }
3943
3944 {
3945 const size_t kMinWinAllocSize = 1 << 12;
3946 if (d < kMinWinAllocSize)
3947 d = kMinWinAllocSize;
3948 }
3949
3950 if (d > dec->winBufSize_Allocated)
3951 {
3952 /*
3953 if (needWrite)
3954 {
3955 p->needWrite_Size = needWrite;
3956 return SZ_OK;
3957 // return SZ_ERROR_FAIL;
3958 }
3959 */
3960
3961 if (dec->winBufSize_Allocated != 0)
3962 {
3963 const size_t k_extra = (useCyclic || d >= (1u << 20)) ?
3964 2 * kBlockSizeMax : 0;
3965 unsigned i = useCyclic ? 17 : 12;
3966 for (; i < sizeof(size_t) * 8; i++)
3967 {
3968 const size_t d2 = ((size_t)1 << i) + k_extra;
3969 if (d2 >= d)
3970 {
3971 d = d2;
3972 break;
3973 }
3974 }
3975 }
3976 // RINOK(ZstdDec_AllocateWindow(dec, d))
3977 ZstdDec_FreeWindow(dec);
3978 dec->win_Base = (Byte *)ISzAlloc_Alloc(dec->alloc_Big, d);
3979 if (!dec->win_Base)
3980 return SZ_ERROR_MEM;
3981 dec->decoder.win = dec->win_Base;
3982 dec->winBufSize_Allocated = d;
3983 }
3984 /*
3985 else
3986 {
3987 // for non-cyclycMode we want flush data, and set winPos = 0
3988 if (needWrite)
3989 {
3990 if (!useCyclic || dec->decoder.winPos >= cycSize)
3991 {
3992 p->needWrite_Size = needWrite;
3993 return SZ_OK;
3994 // return SZ_ERROR_FAIL;
3995 }
3996 }
3997 }
3998 */
3999
4000 dec->decoder.cycSize = cycSize;
4001 p->win = dec->decoder.win;
4002 // p->cycSize = dec->decoder.cycSize;
4003 dec->isCyclicMode = (Byte)useCyclic;
4004 } // (!p->outBuf_fromCaller) end
4005
4006 // p->winPos = dec->decoder.winPos;
4007 dec->frameState = ZSTD2_STATE_BLOCK;
4008 // continue;
4009 } // ZSTD2_STATE_AFTER_HEADER end
4010 }
4011 }
4012
4013
4014 void ZstdDec_GetResInfo(const CZstdDec *dec,
4015 const CZstdDecState *p,
4016 SRes res,
4017 CZstdDecResInfo *stat)
4018 {
4019 // ZstdDecInfo_CLEAR(stat);
4020 stat->extraSize = 0;
4021 stat->is_NonFinishedFrame = False;
4022 if (dec->frameState != ZSTD2_STATE_FINISHED)
4023 {
4024 if (dec->frameState == ZSTD2_STATE_SIGNATURE)
4025 {
4026 stat->extraSize = (Byte)dec->tempSize;
4027 if (ZstdDecInfo_GET_NUM_FRAMES(&p->info) == 0)
4028 res = SZ_ERROR_NO_ARCHIVE;
4029 }
4030 else
4031 {
4032 stat->is_NonFinishedFrame = True;
4033 if (res == SZ_OK && p->status == ZSTD_STATUS_NEEDS_MORE_INPUT)
4034 res = SZ_ERROR_INPUT_EOF;
4035 }
4036 }
4037 stat->decode_SRes = res;
4038 }
4039
4040
4041 size_t ZstdDec_ReadUnusedFromInBuf(
4042 CZstdDecHandle dec,
4043 size_t afterDecoding_tempPos,
4044 void *data, size_t size)
4045 {
4046 size_t processed = 0;
4047 if (dec->frameState == ZSTD2_STATE_SIGNATURE)
4048 {
4049 Byte *dest = (Byte *)data;
4050 const size_t tempSize = dec->tempSize;
4051 while (afterDecoding_tempPos < tempSize)
4052 {
4053 if (size == 0)
4054 break;
4055 size--;
4056 *dest++ = dec->temp[afterDecoding_tempPos++];
4057 processed++;
4058 }
4059 }
4060 return processed;
4061 }
4062
4063
4064 void ZstdDecState_Clear(CZstdDecState *p)
4065 {
4066 memset(p, 0 , sizeof(*p));
4067 }
4068