xref: /aosp_15_r20/external/zstd/programs/dibio.c (revision 01826a4963a0d8a59bc3812d29bdf0fb76416722)
1 /*
2  * Copyright (c) Meta Platforms, Inc. and affiliates.
3  * All rights reserved.
4  *
5  * This source code is licensed under both the BSD-style license (found in the
6  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7  * in the COPYING file in the root directory of this source tree).
8  * You may select, at your option, one of the above-listed licenses.
9  */
10 
11 
12 
13 /* **************************************
14 *  Compiler Warnings
15 ****************************************/
16 #ifdef _MSC_VER
17 #  pragma warning(disable : 4127)    /* disable: C4127: conditional expression is constant */
18 #endif
19 
20 
21 /*-*************************************
22 *  Includes
23 ***************************************/
24 #include "platform.h"       /* Large Files support */
25 #include "util.h"           /* UTIL_getFileSize, UTIL_getTotalFileSize */
26 #include <stdlib.h>         /* malloc, free */
27 #include <string.h>         /* memset */
28 #include <stdio.h>          /* fprintf, fopen, ftello64 */
29 #include <errno.h>          /* errno */
30 
31 #include "timefn.h"         /* UTIL_time_t, UTIL_clockSpanMicro, UTIL_getTime */
32 #include "../lib/common/debug.h" /* assert */
33 #include "../lib/common/mem.h"  /* read */
34 #include "../lib/zstd_errors.h"
35 #include "dibio.h"
36 
37 
38 /*-*************************************
39 *  Constants
40 ***************************************/
41 #define KB *(1 <<10)
42 #define MB *(1 <<20)
43 #define GB *(1U<<30)
44 
45 #define SAMPLESIZE_MAX (128 KB)
46 #define MEMMULT 11    /* rough estimation : memory cost to analyze 1 byte of sample */
47 #define COVER_MEMMULT 9    /* rough estimation : memory cost to analyze 1 byte of sample */
48 #define FASTCOVER_MEMMULT 1    /* rough estimation : memory cost to analyze 1 byte of sample */
49 static const size_t g_maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
50 
51 #define NOISELENGTH 32
52 #define MAX_SAMPLES_SIZE (2 GB) /* training dataset limited to 2GB */
53 
54 
55 /*-*************************************
56 *  Console display
57 ***************************************/
58 #define DISPLAY(...)         fprintf(stderr, __VA_ARGS__)
59 #define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); }
60 
61 static const U64 g_refreshRate = SEC_TO_MICRO / 6;
62 static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER;
63 
64 #define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \
65             if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \
66             { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \
67             if (displayLevel>=4) fflush(stderr); } } }
68 
69 /*-*************************************
70 *  Exceptions
71 ***************************************/
72 #ifndef DEBUG
73 #  define DEBUG 0
74 #endif
75 #define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__);
76 #define EXM_THROW(error, ...)                                             \
77 {                                                                         \
78     DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \
79     DISPLAY("Error %i : ", error);                                        \
80     DISPLAY(__VA_ARGS__);                                                 \
81     DISPLAY("\n");                                                        \
82     exit(error);                                                          \
83 }
84 
85 
86 /* ********************************************************
87 *  Helper functions
88 **********************************************************/
89 #undef MIN
90 #define MIN(a,b)    ((a) < (b) ? (a) : (b))
91 
92 /**
93   Returns the size of a file.
94   If error returns -1.
95 */
DiB_getFileSize(const char * fileName)96 static S64 DiB_getFileSize (const char * fileName)
97 {
98     U64 const fileSize = UTIL_getFileSize(fileName);
99     return (fileSize == UTIL_FILESIZE_UNKNOWN) ? -1 : (S64)fileSize;
100 }
101 
102 /* ********************************************************
103 *  File related operations
104 **********************************************************/
105 /** DiB_loadFiles() :
106  *  load samples from files listed in fileNamesTable into buffer.
107  *  works even if buffer is too small to load all samples.
108  *  Also provides the size of each sample into sampleSizes table
109  *  which must be sized correctly, using DiB_fileStats().
110  * @return : nb of samples effectively loaded into `buffer`
111  * *bufferSizePtr is modified, it provides the amount data loaded within buffer.
112  *  sampleSizes is filled with the size of each sample.
113  */
DiB_loadFiles(void * buffer,size_t * bufferSizePtr,size_t * sampleSizes,int sstSize,const char ** fileNamesTable,int nbFiles,size_t targetChunkSize,int displayLevel)114 static int DiB_loadFiles(
115     void* buffer, size_t* bufferSizePtr,
116     size_t* sampleSizes, int sstSize,
117     const char** fileNamesTable, int nbFiles,
118     size_t targetChunkSize, int displayLevel )
119 {
120     char* const buff = (char*)buffer;
121     size_t totalDataLoaded = 0;
122     int nbSamplesLoaded = 0;
123     int fileIndex = 0;
124     FILE * f = NULL;
125 
126     assert(targetChunkSize <= SAMPLESIZE_MAX);
127 
128     while ( nbSamplesLoaded < sstSize && fileIndex < nbFiles ) {
129         size_t fileDataLoaded;
130         S64 const fileSize = DiB_getFileSize(fileNamesTable[fileIndex]);
131         if (fileSize <= 0) {
132             /* skip if zero-size or file error */
133             ++fileIndex;
134             continue;
135         }
136 
137         f = fopen( fileNamesTable[fileIndex], "rb");
138         if (f == NULL)
139             EXM_THROW(10, "zstd: dictBuilder: %s %s ", fileNamesTable[fileIndex], strerror(errno));
140         DISPLAYUPDATE(2, "Loading %s...       \r", fileNamesTable[fileIndex]);
141 
142         /* Load the first chunk of data from the file */
143         fileDataLoaded = targetChunkSize > 0 ?
144                             (size_t)MIN(fileSize, (S64)targetChunkSize) :
145                             (size_t)MIN(fileSize, SAMPLESIZE_MAX );
146         if (totalDataLoaded + fileDataLoaded > *bufferSizePtr)
147             break;
148         if (fread( buff+totalDataLoaded, 1, fileDataLoaded, f ) != fileDataLoaded)
149             EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
150         sampleSizes[nbSamplesLoaded++] = fileDataLoaded;
151         totalDataLoaded += fileDataLoaded;
152 
153         /* If file-chunking is enabled, load the rest of the file as more samples */
154         if (targetChunkSize > 0) {
155             while( (S64)fileDataLoaded < fileSize && nbSamplesLoaded < sstSize ) {
156                 size_t const chunkSize = MIN((size_t)(fileSize-fileDataLoaded), targetChunkSize);
157                 if (totalDataLoaded + chunkSize > *bufferSizePtr) /* buffer is full */
158                     break;
159 
160                 if (fread( buff+totalDataLoaded, 1, chunkSize, f ) != chunkSize)
161                     EXM_THROW(11, "Pb reading %s", fileNamesTable[fileIndex]);
162                 sampleSizes[nbSamplesLoaded++] = chunkSize;
163                 totalDataLoaded += chunkSize;
164                 fileDataLoaded += chunkSize;
165             }
166         }
167         fileIndex += 1;
168         fclose(f); f = NULL;
169     }
170     if (f != NULL)
171         fclose(f);
172 
173     DISPLAYLEVEL(2, "\r%79s\r", "");
174     DISPLAYLEVEL(4, "Loaded %d KB total training data, %d nb samples \n",
175         (int)(totalDataLoaded / (1 KB)), nbSamplesLoaded );
176     *bufferSizePtr = totalDataLoaded;
177     return nbSamplesLoaded;
178 }
179 
180 #define DiB_rotl32(x,r) ((x << r) | (x >> (32 - r)))
DiB_rand(U32 * src)181 static U32 DiB_rand(U32* src)
182 {
183     static const U32 prime1 = 2654435761U;
184     static const U32 prime2 = 2246822519U;
185     U32 rand32 = *src;
186     rand32 *= prime1;
187     rand32 ^= prime2;
188     rand32  = DiB_rotl32(rand32, 13);
189     *src = rand32;
190     return rand32 >> 5;
191 }
192 
193 /* DiB_shuffle() :
194  * shuffle a table of file names in a semi-random way
195  * It improves dictionary quality by reducing "locality" impact, so if sample set is very large,
196  * it will load random elements from it, instead of just the first ones. */
DiB_shuffle(const char ** fileNamesTable,unsigned nbFiles)197 static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
198     U32 seed = 0xFD2FB528;
199     unsigned i;
200     if (nbFiles == 0)
201         return;
202     for (i = nbFiles - 1; i > 0; --i) {
203         unsigned const j = DiB_rand(&seed) % (i + 1);
204         const char* const tmp = fileNamesTable[j];
205         fileNamesTable[j] = fileNamesTable[i];
206         fileNamesTable[i] = tmp;
207     }
208 }
209 
210 
211 /*-********************************************************
212 *  Dictionary training functions
213 **********************************************************/
DiB_findMaxMem(unsigned long long requiredMem)214 static size_t DiB_findMaxMem(unsigned long long requiredMem)
215 {
216     size_t const step = 8 MB;
217     void* testmem = NULL;
218 
219     requiredMem = (((requiredMem >> 23) + 1) << 23);
220     requiredMem += step;
221     if (requiredMem > g_maxMemory) requiredMem = g_maxMemory;
222 
223     while (!testmem) {
224         testmem = malloc((size_t)requiredMem);
225         requiredMem -= step;
226     }
227 
228     free(testmem);
229     return (size_t)requiredMem;
230 }
231 
232 
DiB_fillNoise(void * buffer,size_t length)233 static void DiB_fillNoise(void* buffer, size_t length)
234 {
235     unsigned const prime1 = 2654435761U;
236     unsigned const prime2 = 2246822519U;
237     unsigned acc = prime1;
238     size_t p=0;
239 
240     for (p=0; p<length; p++) {
241         acc *= prime2;
242         ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
243     }
244 }
245 
246 
DiB_saveDict(const char * dictFileName,const void * buff,size_t buffSize)247 static void DiB_saveDict(const char* dictFileName,
248                          const void* buff, size_t buffSize)
249 {
250     FILE* const f = fopen(dictFileName, "wb");
251     if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
252 
253     { size_t const n = fwrite(buff, 1, buffSize, f);
254       if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName) }
255 
256     { size_t const n = (size_t)fclose(f);
257       if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName) }
258 }
259 
260 typedef struct {
261     S64 totalSizeToLoad;
262     int nbSamples;
263     int oneSampleTooLarge;
264 } fileStats;
265 
266 /*! DiB_fileStats() :
267  *  Given a list of files, and a chunkSize (0 == no chunk, whole files)
268  *  provides the amount of data to be loaded and the resulting nb of samples.
269  *  This is useful primarily for allocation purpose => sample buffer, and sample sizes table.
270  */
DiB_fileStats(const char ** fileNamesTable,int nbFiles,size_t chunkSize,int displayLevel)271 static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t chunkSize, int displayLevel)
272 {
273     fileStats fs;
274     int n;
275     memset(&fs, 0, sizeof(fs));
276 
277     /* We assume that if chunking is requested, the chunk size is < SAMPLESIZE_MAX */
278     assert( chunkSize <= SAMPLESIZE_MAX );
279 
280     for (n=0; n<nbFiles; n++) {
281       S64 const fileSize = DiB_getFileSize(fileNamesTable[n]);
282       /* TODO: is there a minimum sample size? What if the file is 1-byte? */
283       if (fileSize == 0) {
284         DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]);
285         continue;
286       }
287 
288       /* the case where we are breaking up files in sample chunks */
289       if (chunkSize > 0) {
290         /* TODO: is there a minimum sample size? Can we have a 1-byte sample? */
291         fs.nbSamples += (int)((fileSize + chunkSize-1) / chunkSize);
292         fs.totalSizeToLoad += fileSize;
293       }
294       else {
295       /* the case where one file is one sample */
296         if (fileSize > SAMPLESIZE_MAX) {
297           /* flag excessively large sample files */
298           fs.oneSampleTooLarge |= (fileSize > 2*SAMPLESIZE_MAX);
299 
300           /* Limit to the first SAMPLESIZE_MAX (128kB) of the file */
301           DISPLAYLEVEL(3, "Sample file '%s' is too large, limiting to %d KB",
302               fileNamesTable[n], SAMPLESIZE_MAX / (1 KB));
303         }
304         fs.nbSamples += 1;
305         fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX);
306       }
307     }
308     DISPLAYLEVEL(4, "Found training data %d files, %d KB, %d samples\n", nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples);
309     return fs;
310 }
311 
DiB_trainFromFiles(const char * dictFileName,size_t maxDictSize,const char ** fileNamesTable,int nbFiles,size_t chunkSize,ZDICT_legacy_params_t * params,ZDICT_cover_params_t * coverParams,ZDICT_fastCover_params_t * fastCoverParams,int optimize,unsigned memLimit)312 int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
313                        const char** fileNamesTable, int nbFiles, size_t chunkSize,
314                        ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
315                        ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
316 {
317     fileStats fs;
318     size_t* sampleSizes; /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
319     int nbSamplesLoaded; /* nb of samples effectively loaded in srcBuffer */
320     size_t loadedSize; /* total data loaded in srcBuffer for all samples */
321     void* srcBuffer /* contiguous buffer with training data/samples */;
322     void* const dictBuffer = malloc(maxDictSize);
323     int result = 0;
324 
325     int const displayLevel = params ? params->zParams.notificationLevel :
326         coverParams ? coverParams->zParams.notificationLevel :
327         fastCoverParams ? fastCoverParams->zParams.notificationLevel : 0;
328 
329     /* Shuffle input files before we start assessing how much sample datA to load.
330        The purpose of the shuffle is to pick random samples when the sample
331        set is larger than what we can load in memory. */
332     DISPLAYLEVEL(3, "Shuffling input files\n");
333     DiB_shuffle(fileNamesTable, nbFiles);
334 
335     /* Figure out how much sample data to load with how many samples */
336     fs = DiB_fileStats(fileNamesTable, nbFiles, chunkSize, displayLevel);
337 
338     {
339         int const memMult = params ? MEMMULT :
340                             coverParams ? COVER_MEMMULT:
341                             FASTCOVER_MEMMULT;
342         size_t const maxMem =  DiB_findMaxMem(fs.totalSizeToLoad * memMult) / memMult;
343         /* Limit the size of the training data to the free memory */
344         /* Limit the size of the training data to 2GB */
345         /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
346         loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
347         if (memLimit != 0) {
348             DISPLAYLEVEL(2, "!  Warning : setting manual memory limit for dictionary training data at %u MB \n",
349                 (unsigned)(memLimit / (1 MB)));
350             loadedSize = (size_t)MIN(loadedSize, memLimit);
351         }
352         srcBuffer = malloc(loadedSize+NOISELENGTH);
353         sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
354     }
355 
356     /* Checks */
357     if ((fs.nbSamples && !sampleSizes) || (!srcBuffer) || (!dictBuffer))
358         EXM_THROW(12, "not enough memory for DiB_trainFiles");   /* should not happen */
359     if (fs.oneSampleTooLarge) {
360         DISPLAYLEVEL(2, "!  Warning : some sample(s) are very large \n");
361         DISPLAYLEVEL(2, "!  Note that dictionary is only useful for small samples. \n");
362         DISPLAYLEVEL(2, "!  As a consequence, only the first %u bytes of each sample are loaded \n", SAMPLESIZE_MAX);
363     }
364     if (fs.nbSamples < 5) {
365         DISPLAYLEVEL(2, "!  Warning : nb of samples too low for proper processing ! \n");
366         DISPLAYLEVEL(2, "!  Please provide _one file per sample_. \n");
367         DISPLAYLEVEL(2, "!  Alternatively, split files into fixed-size blocks representative of samples, with -B# \n");
368         EXM_THROW(14, "nb of samples too low");   /* we now clearly forbid this case */
369     }
370     if (fs.totalSizeToLoad < (S64)maxDictSize * 8) {
371         DISPLAYLEVEL(2, "!  Warning : data size of samples too small for target dictionary size \n");
372         DISPLAYLEVEL(2, "!  Samples should be about 100x larger than target dictionary size \n");
373     }
374 
375     /* init */
376     if ((S64)loadedSize < fs.totalSizeToLoad)
377         DISPLAYLEVEL(1, "Training samples set too large (%u MB); training on %u MB only...\n",
378             (unsigned)(fs.totalSizeToLoad / (1 MB)),
379             (unsigned)(loadedSize / (1 MB)));
380 
381     /* Load input buffer */
382     nbSamplesLoaded = DiB_loadFiles(
383         srcBuffer, &loadedSize, sampleSizes, fs.nbSamples, fileNamesTable,
384         nbFiles, chunkSize, displayLevel);
385 
386     {   size_t dictSize = ZSTD_error_GENERIC;
387         if (params) {
388             DiB_fillNoise((char*)srcBuffer + loadedSize, NOISELENGTH);   /* guard band, for end of buffer condition */
389             dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize,
390                                                     srcBuffer, sampleSizes, nbSamplesLoaded,
391                                                     *params);
392         } else if (coverParams) {
393             if (optimize) {
394               dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize,
395                                                              srcBuffer, sampleSizes, nbSamplesLoaded,
396                                                              coverParams);
397               if (!ZDICT_isError(dictSize)) {
398                   unsigned splitPercentage = (unsigned)(coverParams->splitPoint * 100);
399                   DISPLAYLEVEL(2, "k=%u\nd=%u\nsteps=%u\nsplit=%u\n", coverParams->k, coverParams->d,
400                               coverParams->steps, splitPercentage);
401               }
402             } else {
403               dictSize = ZDICT_trainFromBuffer_cover(dictBuffer, maxDictSize, srcBuffer,
404                                                      sampleSizes, nbSamplesLoaded, *coverParams);
405             }
406         } else if (fastCoverParams != NULL) {
407             if (optimize) {
408               dictSize = ZDICT_optimizeTrainFromBuffer_fastCover(dictBuffer, maxDictSize,
409                                                               srcBuffer, sampleSizes, nbSamplesLoaded,
410                                                               fastCoverParams);
411               if (!ZDICT_isError(dictSize)) {
412                 unsigned splitPercentage = (unsigned)(fastCoverParams->splitPoint * 100);
413                 DISPLAYLEVEL(2, "k=%u\nd=%u\nf=%u\nsteps=%u\nsplit=%u\naccel=%u\n", fastCoverParams->k,
414                             fastCoverParams->d, fastCoverParams->f, fastCoverParams->steps, splitPercentage,
415                             fastCoverParams->accel);
416               }
417             } else {
418               dictSize = ZDICT_trainFromBuffer_fastCover(dictBuffer, maxDictSize, srcBuffer,
419                                                         sampleSizes, nbSamplesLoaded, *fastCoverParams);
420             }
421         } else {
422             assert(0 /* Impossible */);
423         }
424         if (ZDICT_isError(dictSize)) {
425             DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize));   /* should not happen */
426             result = 1;
427             goto _cleanup;
428         }
429         /* save dict */
430         DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (unsigned)dictSize, dictFileName);
431         DiB_saveDict(dictFileName, dictBuffer, dictSize);
432     }
433 
434     /* clean up */
435 _cleanup:
436     free(srcBuffer);
437     free(sampleSizes);
438     free(dictBuffer);
439     return result;
440 }
441