1 // XpressDecoder.cpp
2
3 #include "StdAfx.h"
4
5 #include "../../../C/CpuArch.h"
6 #include "../../../C/RotateDefs.h"
7
8 #include "HuffmanDecoder.h"
9 #include "XpressDecoder.h"
10
// Build-time selection of the word-at-a-time copy primitives used by the
// match-copy helpers later in this file.
//
// The word-based variants are enabled only when MY_CPU_LE_UNALIGN is defined
// (presumably by CpuArch.h for little-endian targets that tolerate unaligned
// access - confirm there). When enabled, this section defines:
//   COPY_CHUNK_SIZE      : 16
//   COPY_OFFSET_MIN      : smallest match offset for which COPY_CHUNK is used
//   COPY_CHUNK_4_2(d, s) : copy 8 bytes as two 32-bit words, advance d and s
//   COPY_CHUNK(d, s)     : copy one "big" chunk, advance d and s
//
// NOTE: source and destination of a match may overlap, so the order of the
// individual loads and stores inside every macro is significant.
#if defined(MY_CPU_LE_UNALIGN)
  #define Z7_XPRESS_DEC_USE_UNALIGNED_COPY
#endif

#if defined(Z7_XPRESS_DEC_USE_UNALIGNED_COPY)

  #define COPY_CHUNK_SIZE 16

  // 8 bytes: load word 0, store word 0, load word 1, store word 1.
  #define COPY_CHUNK_4_2(dest, src) \
  { \
    ((UInt32 *)(void *)(dest))[0] = ((const UInt32 *)(const void *)(src))[0]; \
    ((UInt32 *)(void *)(dest))[1] = ((const UInt32 *)(const void *)(src))[1]; \
    (src) += 4 * 2; \
    (dest) += 4 * 2; \
  }

  /* sse2 doesn't help here in GCC and CLANG.
     so we disabled sse2 here */
#if 0
  #if defined(MY_CPU_AMD64)
    #define Z7_XPRESS_DEC_USE_SSE2
  #elif defined(MY_CPU_X86)
    #if defined(_MSC_VER) && _MSC_VER >= 1300 && defined(_M_IX86_FP) && (_M_IX86_FP >= 2) \
      || defined(__SSE2__) \
      // || 1 == 1 // for debug only
      #define Z7_XPRESS_DEC_USE_SSE2
    #endif
  #endif
#endif

  #if defined(MY_CPU_ARM64)

    #include <arm_neon.h>
    #define COPY_OFFSET_MIN 16

    // One 16-byte NEON load followed by one 16-byte store.
    #define COPY_CHUNK1(dest, src) \
    { \
      vst1q_u8((uint8_t *)(void *)(dest), \
      vld1q_u8((const uint8_t *)(const void *)(src))); \
      (src) += 16; \
      (dest) += 16; \
    }

    // Up to two 16-byte copies. The (break) leaves the loop that encloses the
    // macro expansion, and it compares against a variable named (dest_lim)
    // that must exist at the expansion site.
    #define COPY_CHUNK(dest, src) \
    { \
      COPY_CHUNK1(dest, src) \
      if ((dest) >= dest_lim) break; \
      COPY_CHUNK1(dest, src) \
    }

  #elif defined(Z7_XPRESS_DEC_USE_SSE2)

    #include <emmintrin.h> // sse2
    #define COPY_OFFSET_MIN 16

    // One unaligned 16-byte SSE2 load followed by one unaligned 16-byte store.
    #define COPY_CHUNK1(dest, src) \
    { \
      _mm_storeu_si128((__m128i *)(void *)(dest), \
      _mm_loadu_si128((const __m128i *)(const void *)(src))); \
      (src) += 16; \
      (dest) += 16; \
    }

    // Same shape as the ARM64 variant: relies on an enclosing loop and on a
    // (dest_lim) variable at the expansion site.
    #define COPY_CHUNK(dest, src) \
    { \
      COPY_CHUNK1(dest, src) \
      if ((dest) >= dest_lim) break; \
      COPY_CHUNK1(dest, src) \
    }

  #elif defined(MY_CPU_64BIT)

    #define COPY_OFFSET_MIN 8

    // 16 bytes as two 64-bit words: load 0, store 0, load 1, store 1.
    #define COPY_CHUNK(dest, src) \
    { \
      ((UInt64 *)(void *)(dest))[0] = ((const UInt64 *)(const void *)(src))[0]; \
      ((UInt64 *)(void *)(dest))[1] = ((const UInt64 *)(const void *)(src))[1]; \
      (src) += 8 * 2; \
      (dest) += 8 * 2; \
    }

  #else

    #define COPY_OFFSET_MIN 4

    // 16 bytes as four 32-bit words.
    #define COPY_CHUNK(dest, src) \
    { \
      COPY_CHUNK_4_2(dest, src); \
      COPY_CHUNK_4_2(dest, src); \
    }

  #endif
#endif
100
101
// Portable fallback: used when the word-at-a-time section earlier in this
// file did not define COPY_CHUNK_SIZE. Everything is done with plain byte
// accesses, so there are no alignment or endianness requirements.
//
//   COPY_OFFSET_MIN : smallest match offset for which COPY_CHUNK is used
//   COPY_CHUNK_SIZE : number of bytes moved by one COPY_CHUNK
#ifndef COPY_CHUNK_SIZE
  #define COPY_OFFSET_MIN 4
  #define COPY_CHUNK_SIZE 8

  // Copies 2 bytes from (src) to (dest) and advances both by 2.
  // Both source bytes are read before either destination byte is written.
  // (dest) and (src) must be modifiable pointer lvalues; they are
  // parenthesized so that arguments such as (*pp) index correctly.
  // The locals a0 / a1 are visible to the argument expressions, so do not
  // pass arguments that use those names.
  #define COPY_CHUNK_2(dest, src) \
  { \
    const Byte a0 = (src)[0]; \
    const Byte a1 = (src)[1]; \
    (dest)[0] = a0; \
    (dest)[1] = a1; \
    (src) += 2; \
    (dest) += 2; \
  }

  // Copies COPY_CHUNK_SIZE (8) bytes as four consecutive 2-byte steps.
  #define COPY_CHUNK(dest, src) \
  { \
    COPY_CHUNK_2(dest, src) \
    COPY_CHUNK_2(dest, src) \
    COPY_CHUNK_2(dest, src) \
    COPY_CHUNK_2(dest, src) \
  }
#endif


// Repeats COPY_CHUNK until (dest) reaches (dest_lim).
// Expects variables named (dest), (src) and (dest_lim) at the expansion site.
// It is a do/while loop: at least one whole chunk is always copied, and the
// final chunk may store bytes at or beyond (dest_lim).
#define COPY_CHUNKS \
{ \
  Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE \
  do { COPY_CHUNK(dest, src) } \
  while (dest < dest_lim); \
}
130
131
132 static
133 Z7_FORCE_INLINE
134 // Z7_ATTRIB_NO_VECTOR
CopyMatch_1(Byte * dest,const Byte * dest_lim)135 void CopyMatch_1(Byte *dest, const Byte *dest_lim)
136 {
137 const unsigned b0 = dest[-1];
138 {
139 #if defined(Z7_XPRESS_DEC_USE_UNALIGNED_COPY) && (COPY_CHUNK_SIZE == 16)
140 #if defined(MY_CPU_64BIT)
141 {
142 const UInt64 v64 = (UInt64)b0 * 0x0101010101010101;
143 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
144 do
145 {
146 ((UInt64 *)(void *)dest)[0] = v64;
147 ((UInt64 *)(void *)dest)[1] = v64;
148 dest += 16;
149 }
150 while (dest < dest_lim);
151 }
152 #else
153 {
154 UInt32 v = b0;
155 v |= v << 8;
156 v |= v << 16;
157 do
158 {
159 ((UInt32 *)(void *)dest)[0] = v;
160 ((UInt32 *)(void *)dest)[1] = v;
161 dest += 8;
162 ((UInt32 *)(void *)dest)[0] = v;
163 ((UInt32 *)(void *)dest)[1] = v;
164 dest += 8;
165 }
166 while (dest < dest_lim);
167 }
168 #endif
169 #else
170 do
171 {
172 dest[0] = (Byte)b0;
173 dest[1] = (Byte)b0;
174 dest += 2;
175 dest[0] = (Byte)b0;
176 dest[1] = (Byte)b0;
177 dest += 2;
178 }
179 while (dest < dest_lim);
180 #endif
181 }
182 }
183
184
185 // (offset != 1)
186 static
187 Z7_FORCE_INLINE
188 // Z7_ATTRIB_NO_VECTOR
CopyMatch_Non1(Byte * dest,size_t offset,const Byte * dest_lim)189 void CopyMatch_Non1(Byte *dest, size_t offset, const Byte *dest_lim)
190 {
191 const Byte *src = dest - offset;
192 {
193 // (COPY_OFFSET_MIN >= 4)
194 if (offset >= COPY_OFFSET_MIN)
195 {
196 COPY_CHUNKS
197 // return;
198 }
199 else
200 #if (COPY_OFFSET_MIN > 4)
201 #if COPY_CHUNK_SIZE < 8
202 #error Stop_Compiling_Bad_COPY_CHUNK_SIZE
203 #endif
204 if (offset >= 4)
205 {
206 Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
207 do
208 {
209 COPY_CHUNK_4_2(dest, src)
210 #if COPY_CHUNK_SIZE != 16
211 if (dest >= dest_lim) break;
212 #endif
213 COPY_CHUNK_4_2(dest, src)
214 }
215 while (dest < dest_lim);
216 // return;
217 }
218 else
219 #endif
220 {
221 // (offset < 4)
222 if (offset == 2)
223 {
224 #if defined(Z7_XPRESS_DEC_USE_UNALIGNED_COPY)
225 UInt32 w0 = GetUi16(src);
226 w0 += w0 << 16;
227 do
228 {
229 SetUi32(dest, w0)
230 dest += 4;
231 }
232 while (dest < dest_lim);
233 #else
234 const unsigned b0 = src[0];
235 const Byte b1 = src[1];
236 do
237 {
238 dest[0] = (Byte)b0;
239 dest[1] = b1;
240 dest += 2;
241 }
242 while (dest < dest_lim);
243 #endif
244 }
245 else // (offset == 3)
246 {
247 const unsigned b0 = src[0];
248 #if defined(Z7_XPRESS_DEC_USE_UNALIGNED_COPY)
249 const unsigned w1 = GetUi16(src + 1);
250 do
251 {
252 dest[0] = (Byte)b0;
253 SetUi16(dest + 1, (UInt16)w1)
254 dest += 3;
255 }
256 while (dest < dest_lim);
257 #else
258 const Byte b1 = src[1];
259 const Byte b2 = src[2];
260 do
261 {
262 dest[0] = (Byte)b0;
263 dest[1] = b1;
264 dest[2] = b2;
265 dest += 3;
266 }
267 while (dest < dest_lim);
268 #endif
269 }
270 }
271 }
272 }
273
274
275 namespace NCompress {
276 namespace NXpress {
277
278 #define BIT_STREAM_NORMALIZE \
279 if (BitPos > 16) { \
280 if (in >= lim) return S_FALSE; \
281 BitPos -= 16; \
282 Value |= (UInt32)GetUi16(in) << BitPos; \
283 in += 2; }
284
285 #define MOVE_POS(bs, numBits) \
286 BitPos += (unsigned)numBits; \
287 Value <<= numBits; \
288
289
290 static const unsigned kNumHuffBits = 15;
291 static const unsigned kNumTableBits = 10;
292 static const unsigned kNumLenBits = 4;
293 static const unsigned kLenMask = (1 << kNumLenBits) - 1;
294 static const unsigned kNumPosSlots = 16;
295 static const unsigned kNumSyms = 256 + (kNumPosSlots << kNumLenBits);
296
Decode_WithExceedWrite(const Byte * in,size_t inSize,Byte * out,size_t outSize)297 HRESULT Decode_WithExceedWrite(const Byte *in, size_t inSize, Byte *out, size_t outSize)
298 {
299 NCompress::NHuffman::CDecoder<kNumHuffBits, kNumSyms, kNumTableBits> huff;
300
301 if (inSize < kNumSyms / 2 + 4)
302 return S_FALSE;
303 {
304 Byte levels[kNumSyms];
305 for (unsigned i = 0; i < kNumSyms / 2; i++)
306 {
307 const unsigned b = in[i];
308 levels[(size_t)i * 2 ] = (Byte)(b & 0xf);
309 levels[(size_t)i * 2 + 1] = (Byte)(b >> 4);
310 }
311 if (!huff.Build(levels, NHuffman::k_BuildMode_Full))
312 return S_FALSE;
313 }
314
315 UInt32 Value;
316 unsigned BitPos; // how many bits in (Value) were processed
317
318 const Byte *lim = in + inSize - 1; // points to last byte
319 in += kNumSyms / 2;
320 #ifdef MY_CPU_LE_UNALIGN
321 Value = GetUi32(in);
322 Value = rotlFixed(Value, 16);
323 #else
324 Value = ((UInt32)GetUi16(in) << 16) | GetUi16(in + 2);
325 #endif
326
327 in += 4;
328 BitPos = 0;
329 Byte *dest = out;
330 const Byte *outLim = out + outSize;
331
332 for (;;)
333 {
334 unsigned sym;
335 Z7_HUFF_DECODE_VAL_IN_HIGH32(sym, &huff, kNumHuffBits, kNumTableBits,
336 Value, Z7_HUFF_DECODE_ERROR_SYM_CHECK_NO, {}, MOVE_POS, {}, bs)
337 // 0 < BitPos <= 31
338 BIT_STREAM_NORMALIZE
339 // 0 < BitPos <= 16
340
341 if (dest >= outLim)
342 return (sym == 256 && Value == 0 && in == lim + 1) ? S_OK : S_FALSE;
343
344 if (sym < 256)
345 *dest++ = (Byte)sym;
346 else
347 {
348 const unsigned distBits = (unsigned)(Byte)sym >> kNumLenBits; // (sym - 256) >> kNumLenBits;
349 UInt32 len = (UInt32)(sym & kLenMask);
350
351 if (len == kLenMask)
352 {
353 if (in > lim)
354 return S_FALSE;
355 // here we read input bytes in out-of-order related to main input stream (bits in Value):
356 len = *in++;
357 if (len == 0xff)
358 {
359 if (in >= lim)
360 return S_FALSE;
361 len = GetUi16(in);
362 in += 2;
363 }
364 else
365 len += kLenMask;
366 }
367
368 len += 3;
369 if (len > (size_t)(outLim - dest))
370 return S_FALSE;
371
372 if (distBits == 0)
373 {
374 // d == 1
375 if (dest == out)
376 return S_FALSE;
377 Byte *destTemp = dest;
378 dest += len;
379 CopyMatch_1(destTemp, dest);
380 }
381 else
382 {
383 unsigned d = (unsigned)(Value >> (32 - distBits));
384 MOVE_POS(bs, distBits)
385 d += 1u << distBits;
386 // 0 < BitPos <= 31
387 BIT_STREAM_NORMALIZE
388 // 0 < BitPos <= 16
389 if (d > (size_t)(dest - out))
390 return S_FALSE;
391 Byte *destTemp = dest;
392 dest += len;
393 CopyMatch_Non1(destTemp, d, dest);
394 }
395 }
396 }
397 }
398
399 }}
400