/* xref: /aosp_15_r20/external/zstd/tests/regression/data.c (revision 01826a4963a0d8a59bc3812d29bdf0fb76416722) */
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */
10*01826a49SYabin Cui 
11*01826a49SYabin Cui #include "data.h"
12*01826a49SYabin Cui 
13*01826a49SYabin Cui #include <assert.h>
14*01826a49SYabin Cui #include <errno.h>
15*01826a49SYabin Cui #include <stdio.h>
16*01826a49SYabin Cui #include <string.h>
17*01826a49SYabin Cui #include <stdlib.h>   /* free() */
18*01826a49SYabin Cui 
19*01826a49SYabin Cui #include <sys/stat.h>
20*01826a49SYabin Cui 
21*01826a49SYabin Cui #include <curl/curl.h>
22*01826a49SYabin Cui 
23*01826a49SYabin Cui #include "mem.h"
24*01826a49SYabin Cui #include "util.h"
25*01826a49SYabin Cui #define XXH_STATIC_LINKING_ONLY
26*01826a49SYabin Cui #include "xxhash.h"
27*01826a49SYabin Cui 
/**
 * Data objects
 */
31*01826a49SYabin Cui 
/** Base URL under which the regression-data release assets are published. */
#define REGRESSION_RELEASE_BASE \
    "https://github.com/facebook/zstd/releases/download/regression-data/"

/**
 * Expands to the URL of the release asset `x`.
 * `x` must be a string literal: it is joined to the base URL by adjacent
 * string-literal concatenation.
 */
#define REGRESSION_RELEASE(x) REGRESSION_RELEASE_BASE x
34*01826a49SYabin Cui 
35*01826a49SYabin Cui data_t silesia = {
36*01826a49SYabin Cui     .name = "silesia",
37*01826a49SYabin Cui     .type = data_type_dir,
38*01826a49SYabin Cui     .data =
39*01826a49SYabin Cui         {
40*01826a49SYabin Cui             .url = REGRESSION_RELEASE("silesia.tar.zst"),
41*01826a49SYabin Cui             .xxhash64 = 0x48a199f92f93e977LL,
42*01826a49SYabin Cui         },
43*01826a49SYabin Cui };
44*01826a49SYabin Cui 
45*01826a49SYabin Cui data_t silesia_tar = {
46*01826a49SYabin Cui     .name = "silesia.tar",
47*01826a49SYabin Cui     .type = data_type_file,
48*01826a49SYabin Cui     .data =
49*01826a49SYabin Cui         {
50*01826a49SYabin Cui             .url = REGRESSION_RELEASE("silesia.tar.zst"),
51*01826a49SYabin Cui             .xxhash64 = 0x48a199f92f93e977LL,
52*01826a49SYabin Cui         },
53*01826a49SYabin Cui };
54*01826a49SYabin Cui 
55*01826a49SYabin Cui data_t github = {
56*01826a49SYabin Cui     .name = "github",
57*01826a49SYabin Cui     .type = data_type_dir,
58*01826a49SYabin Cui     .data =
59*01826a49SYabin Cui         {
60*01826a49SYabin Cui             .url = REGRESSION_RELEASE("github.tar.zst"),
61*01826a49SYabin Cui             .xxhash64 = 0xa9b1b44b020df292LL,
62*01826a49SYabin Cui         },
63*01826a49SYabin Cui     .dict =
64*01826a49SYabin Cui         {
65*01826a49SYabin Cui             .url = REGRESSION_RELEASE("github.dict.zst"),
66*01826a49SYabin Cui             .xxhash64 = 0x1eddc6f737d3cb53LL,
67*01826a49SYabin Cui 
68*01826a49SYabin Cui         },
69*01826a49SYabin Cui };
70*01826a49SYabin Cui 
71*01826a49SYabin Cui data_t github_tar = {
72*01826a49SYabin Cui     .name = "github.tar",
73*01826a49SYabin Cui     .type = data_type_file,
74*01826a49SYabin Cui     .data =
75*01826a49SYabin Cui         {
76*01826a49SYabin Cui             .url = REGRESSION_RELEASE("github.tar.zst"),
77*01826a49SYabin Cui             .xxhash64 = 0xa9b1b44b020df292LL,
78*01826a49SYabin Cui         },
79*01826a49SYabin Cui     .dict =
80*01826a49SYabin Cui         {
81*01826a49SYabin Cui             .url = REGRESSION_RELEASE("github.dict.zst"),
82*01826a49SYabin Cui             .xxhash64 = 0x1eddc6f737d3cb53LL,
83*01826a49SYabin Cui 
84*01826a49SYabin Cui         },
85*01826a49SYabin Cui };
86*01826a49SYabin Cui 
87*01826a49SYabin Cui static data_t* g_data[] = {
88*01826a49SYabin Cui     &silesia,
89*01826a49SYabin Cui     &silesia_tar,
90*01826a49SYabin Cui     &github,
91*01826a49SYabin Cui     &github_tar,
92*01826a49SYabin Cui     NULL,
93*01826a49SYabin Cui };
94*01826a49SYabin Cui 
95*01826a49SYabin Cui data_t const* const* data = (data_t const* const*)g_data;
96*01826a49SYabin Cui 
/**
 * data helpers.
 */
100*01826a49SYabin Cui 
/** Returns 1 when `data` has a dictionary resource (its dict URL is set), else 0. */
int data_has_dict(data_t const* data) {
    if (data->dict.url == NULL)
        return 0;
    return 1;
}
104*01826a49SYabin Cui 
/**
 * data buffer helper functions (documented in header).
 */
108*01826a49SYabin Cui 
data_buffer_create(size_t const capacity)109*01826a49SYabin Cui data_buffer_t data_buffer_create(size_t const capacity) {
110*01826a49SYabin Cui     data_buffer_t buffer = {};
111*01826a49SYabin Cui 
112*01826a49SYabin Cui     buffer.data = (uint8_t*)malloc(capacity);
113*01826a49SYabin Cui     if (buffer.data == NULL)
114*01826a49SYabin Cui         return buffer;
115*01826a49SYabin Cui     buffer.capacity = capacity;
116*01826a49SYabin Cui     return buffer;
117*01826a49SYabin Cui }
118*01826a49SYabin Cui 
data_buffer_read(char const * filename)119*01826a49SYabin Cui data_buffer_t data_buffer_read(char const* filename) {
120*01826a49SYabin Cui     data_buffer_t buffer = {};
121*01826a49SYabin Cui 
122*01826a49SYabin Cui     uint64_t const size = UTIL_getFileSize(filename);
123*01826a49SYabin Cui     if (size == UTIL_FILESIZE_UNKNOWN) {
124*01826a49SYabin Cui         fprintf(stderr, "unknown size for %s\n", filename);
125*01826a49SYabin Cui         return buffer;
126*01826a49SYabin Cui     }
127*01826a49SYabin Cui 
128*01826a49SYabin Cui     buffer.data = (uint8_t*)malloc(size);
129*01826a49SYabin Cui     if (buffer.data == NULL) {
130*01826a49SYabin Cui         fprintf(stderr, "malloc failed\n");
131*01826a49SYabin Cui         return buffer;
132*01826a49SYabin Cui     }
133*01826a49SYabin Cui     buffer.capacity = size;
134*01826a49SYabin Cui 
135*01826a49SYabin Cui     FILE* file = fopen(filename, "rb");
136*01826a49SYabin Cui     if (file == NULL) {
137*01826a49SYabin Cui         fprintf(stderr, "file null\n");
138*01826a49SYabin Cui         goto err;
139*01826a49SYabin Cui     }
140*01826a49SYabin Cui     buffer.size = fread(buffer.data, 1, buffer.capacity, file);
141*01826a49SYabin Cui     fclose(file);
142*01826a49SYabin Cui     if (buffer.size != buffer.capacity) {
143*01826a49SYabin Cui         fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity);
144*01826a49SYabin Cui         goto err;
145*01826a49SYabin Cui     }
146*01826a49SYabin Cui 
147*01826a49SYabin Cui     return buffer;
148*01826a49SYabin Cui err:
149*01826a49SYabin Cui     free(buffer.data);
150*01826a49SYabin Cui     memset(&buffer, 0, sizeof(buffer));
151*01826a49SYabin Cui     return buffer;
152*01826a49SYabin Cui }
153*01826a49SYabin Cui 
data_buffer_get_data(data_t const * data)154*01826a49SYabin Cui data_buffer_t data_buffer_get_data(data_t const* data) {
155*01826a49SYabin Cui     data_buffer_t const kEmptyBuffer = {};
156*01826a49SYabin Cui 
157*01826a49SYabin Cui     if (data->type != data_type_file)
158*01826a49SYabin Cui         return kEmptyBuffer;
159*01826a49SYabin Cui 
160*01826a49SYabin Cui     return data_buffer_read(data->data.path);
161*01826a49SYabin Cui }
162*01826a49SYabin Cui 
data_buffer_get_dict(data_t const * data)163*01826a49SYabin Cui data_buffer_t data_buffer_get_dict(data_t const* data) {
164*01826a49SYabin Cui     data_buffer_t const kEmptyBuffer = {};
165*01826a49SYabin Cui 
166*01826a49SYabin Cui     if (!data_has_dict(data))
167*01826a49SYabin Cui         return kEmptyBuffer;
168*01826a49SYabin Cui 
169*01826a49SYabin Cui     return data_buffer_read(data->dict.path);
170*01826a49SYabin Cui }
171*01826a49SYabin Cui 
/**
 * Orders two buffers: first by the bytes of their common prefix (memcmp
 * order), then by length when one is a prefix of the other.
 *
 * @return <0, 0 or >0 when buffer1 sorts before, equal to, or after buffer2.
 */
int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) {
    size_t const size =
        buffer1.size < buffer2.size ? buffer1.size : buffer2.size;
    /* Skip memcmp for an empty prefix: an empty buffer may carry a NULL data
     * pointer, and passing NULL to memcmp is undefined even with length 0. */
    if (size != 0) {
        int const cmp = memcmp(buffer1.data, buffer2.data, size);
        if (cmp != 0)
            return cmp;
    }
    if (buffer1.size < buffer2.size)
        return -1;
    if (buffer1.size == buffer2.size)
        return 0;
    assert(buffer1.size > buffer2.size);
    return 1;
}
185*01826a49SYabin Cui 
/** Releases the memory owned by `buffer`; a buffer with data == NULL is fine. */
void data_buffer_free(data_buffer_t buffer)
{
    free(buffer.data);
}
189*01826a49SYabin Cui 
/**
 * data filenames helpers.
 */
193*01826a49SYabin Cui 
data_filenames_get(data_t const * data)194*01826a49SYabin Cui FileNamesTable* data_filenames_get(data_t const* data)
195*01826a49SYabin Cui {
196*01826a49SYabin Cui     char const* const path = data->data.path;
197*01826a49SYabin Cui     return UTIL_createExpandedFNT(&path, 1, 0 /* followLinks */ );
198*01826a49SYabin Cui }
199*01826a49SYabin Cui 
/**
 * data buffers helpers.
 */
203*01826a49SYabin Cui 
data_buffers_get(data_t const * data)204*01826a49SYabin Cui data_buffers_t data_buffers_get(data_t const* data) {
205*01826a49SYabin Cui     data_buffers_t buffers = {.size = 0};
206*01826a49SYabin Cui     FileNamesTable* const filenames = data_filenames_get(data);
207*01826a49SYabin Cui     if (filenames == NULL) return buffers;
208*01826a49SYabin Cui     if (filenames->tableSize == 0) {
209*01826a49SYabin Cui         UTIL_freeFileNamesTable(filenames);
210*01826a49SYabin Cui         return buffers;
211*01826a49SYabin Cui     }
212*01826a49SYabin Cui 
213*01826a49SYabin Cui     data_buffer_t* buffersPtr =
214*01826a49SYabin Cui         (data_buffer_t*)malloc(filenames->tableSize * sizeof(*buffersPtr));
215*01826a49SYabin Cui     if (buffersPtr == NULL) {
216*01826a49SYabin Cui         UTIL_freeFileNamesTable(filenames);
217*01826a49SYabin Cui         return buffers;
218*01826a49SYabin Cui     }
219*01826a49SYabin Cui     buffers.buffers = (data_buffer_t const*)buffersPtr;
220*01826a49SYabin Cui     buffers.size = filenames->tableSize;
221*01826a49SYabin Cui 
222*01826a49SYabin Cui     for (size_t i = 0; i < filenames->tableSize; ++i) {
223*01826a49SYabin Cui         buffersPtr[i] = data_buffer_read(filenames->fileNames[i]);
224*01826a49SYabin Cui         if (buffersPtr[i].data == NULL) {
225*01826a49SYabin Cui             data_buffers_t const kEmptyBuffer = {};
226*01826a49SYabin Cui             data_buffers_free(buffers);
227*01826a49SYabin Cui             UTIL_freeFileNamesTable(filenames);
228*01826a49SYabin Cui             return kEmptyBuffer;
229*01826a49SYabin Cui         }
230*01826a49SYabin Cui     }
231*01826a49SYabin Cui 
232*01826a49SYabin Cui     UTIL_freeFileNamesTable(filenames);
233*01826a49SYabin Cui     return buffers;
234*01826a49SYabin Cui }
235*01826a49SYabin Cui 
236*01826a49SYabin Cui /**
237*01826a49SYabin Cui  * Frees the data buffers.
238*01826a49SYabin Cui  */
data_buffers_free(data_buffers_t buffers)239*01826a49SYabin Cui void data_buffers_free(data_buffers_t buffers) {
240*01826a49SYabin Cui     free((data_buffer_t*)buffers.buffers);
241*01826a49SYabin Cui }
242*01826a49SYabin Cui 
/**
 * Initialization and download functions.
 */
246*01826a49SYabin Cui 
/** Heap copy of the cache directory; set by data_init(), freed by data_finish(). */
static char* g_data_dir = NULL;
248*01826a49SYabin Cui 
/**
 * Creates `indir` and every missing parent directory (like `mkdir -p`),
 * using mode S_IRWXU for each directory it creates.
 *
 * @param indir Path to create; must be a non-empty string.
 * @return 0 on success, EINVAL for a NULL or empty path, ENOMEM if the path
 *         cannot be duplicated, otherwise the errno reported by mkdir().
 */
static int ensure_directory_exists(char const* indir) {
    /* The scan below always steps past the first character, so an empty
     * string would be read beyond its terminator. */
    if (indir == NULL || indir[0] == '\0')
        return EINVAL;

    char* const dir = strdup(indir);
    if (dir == NULL)
        return ENOMEM;

    int ret = 0;
    char* end = dir;
    do {
        /* Find the next directory level. The pre-increment skips a leading
         * '/' as well as the separator handled by the previous iteration. */
        for (++end; *end != '\0' && *end != '/'; ++end)
            ;
        /* End the string there, make the directory, and restore the string. */
        char const save = *end;
        *end = '\0';
        int const isdir = UTIL_isDirectory(dir);
        int const mkdir_ret = mkdir(dir, S_IRWXU);
        /* Capture errno right away, before any later call can overwrite it. */
        int const mkdir_errno = errno;
        *end = save;
        /* It's okay if the directory already exists. */
        if (mkdir_ret == 0 || (mkdir_errno == EEXIST && isdir))
            continue;
        ret = mkdir_errno;
        fprintf(stderr, "mkdir() failed\n");
        break;
    } while (*end != '\0');

    free(dir);
    return ret;
}
281*01826a49SYabin Cui 
/**
 * Concatenate up to 3 strings into a newly allocated buffer.
 * `str1` and `str2` must be non-NULL; `str3` may be NULL and is then treated
 * as empty. Returns NULL if the allocation fails; the caller frees the result.
 */
static char* cat3(char const* str1, char const* str2, char const* str3) {
    size_t const len1 = strlen(str1);
    size_t const len2 = strlen(str2);
    size_t const len3 = (str3 != NULL) ? strlen(str3) : 0;
    size_t const total = len1 + len2 + len3;

    char* const out = (char*)malloc(total + 1);
    if (out == NULL)
        return NULL;

    /* Lay the pieces out back to back, then terminate once at the end. */
    memcpy(out, str1, len1);
    memcpy(out + len1, str2, len2);
    if (len3 != 0)
        memcpy(out + len1 + len2, str3, len3);
    out[total] = '\0';

    assert(strlen(out) == total);
    return out;
}
298*01826a49SYabin Cui 
/** Concatenate 2 strings into a new buffer (cat3 with no third string). */
static char* cat2(char const* str1, char const* str2) {
    char* const joined = cat3(str1, str2, NULL);
    return joined;
}
302*01826a49SYabin Cui 
303*01826a49SYabin Cui /**
304*01826a49SYabin Cui  * State needed by the curl callback.
305*01826a49SYabin Cui  * It takes data from curl, hashes it, and writes it to the file.
306*01826a49SYabin Cui  */
307*01826a49SYabin Cui typedef struct {
308*01826a49SYabin Cui     FILE* file;
309*01826a49SYabin Cui     XXH64_state_t xxhash64;
310*01826a49SYabin Cui     int error;
311*01826a49SYabin Cui } curl_data_t;
312*01826a49SYabin Cui 
313*01826a49SYabin Cui /** Create the curl state. */
curl_data_create(data_resource_t const * resource,data_type_t type)314*01826a49SYabin Cui static curl_data_t curl_data_create(
315*01826a49SYabin Cui     data_resource_t const* resource,
316*01826a49SYabin Cui     data_type_t type) {
317*01826a49SYabin Cui     curl_data_t cdata = {};
318*01826a49SYabin Cui 
319*01826a49SYabin Cui     XXH64_reset(&cdata.xxhash64, 0);
320*01826a49SYabin Cui 
321*01826a49SYabin Cui     assert(UTIL_isDirectory(g_data_dir));
322*01826a49SYabin Cui 
323*01826a49SYabin Cui     if (type == data_type_file) {
324*01826a49SYabin Cui         /* Decompress the resource and store to the path. */
325*01826a49SYabin Cui         char* cmd = cat3("zstd -dqfo '", resource->path, "'");
326*01826a49SYabin Cui         if (cmd == NULL) {
327*01826a49SYabin Cui             cdata.error = ENOMEM;
328*01826a49SYabin Cui             return cdata;
329*01826a49SYabin Cui         }
330*01826a49SYabin Cui         cdata.file = popen(cmd, "w");
331*01826a49SYabin Cui         free(cmd);
332*01826a49SYabin Cui     } else {
333*01826a49SYabin Cui         /* Decompress and extract the resource to the cache directory. */
334*01826a49SYabin Cui         char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'");
335*01826a49SYabin Cui         if (cmd == NULL) {
336*01826a49SYabin Cui             cdata.error = ENOMEM;
337*01826a49SYabin Cui             return cdata;
338*01826a49SYabin Cui         }
339*01826a49SYabin Cui         cdata.file = popen(cmd, "w");
340*01826a49SYabin Cui         free(cmd);
341*01826a49SYabin Cui     }
342*01826a49SYabin Cui     if (cdata.file == NULL) {
343*01826a49SYabin Cui         cdata.error = errno;
344*01826a49SYabin Cui     }
345*01826a49SYabin Cui 
346*01826a49SYabin Cui     return cdata;
347*01826a49SYabin Cui }
348*01826a49SYabin Cui 
349*01826a49SYabin Cui /** Free the curl state. */
curl_data_free(curl_data_t cdata)350*01826a49SYabin Cui static int curl_data_free(curl_data_t cdata) {
351*01826a49SYabin Cui     return pclose(cdata.file);
352*01826a49SYabin Cui }
353*01826a49SYabin Cui 
354*01826a49SYabin Cui /** curl callback. Updates the hash, and writes to the file. */
curl_write(void * data,size_t size,size_t count,void * ptr)355*01826a49SYabin Cui static size_t curl_write(void* data, size_t size, size_t count, void* ptr) {
356*01826a49SYabin Cui     curl_data_t* cdata = (curl_data_t*)ptr;
357*01826a49SYabin Cui     size_t const written = fwrite(data, size, count, cdata->file);
358*01826a49SYabin Cui     XXH64_update(&cdata->xxhash64, data, written * size);
359*01826a49SYabin Cui     return written;
360*01826a49SYabin Cui }
361*01826a49SYabin Cui 
/**
 * Downloads one resource through `curl`, streaming it into the command
 * opened by curl_data_create(), then verifies the result.
 *
 * @return 0 on success; EINVAL if a curl option cannot be set or the checksum
 *         differs; the creation error if the state cannot be created; EIO if
 *         the transfer fails, the pipe reports an error on close, or the
 *         expected output file/directory is missing.
 */
static int curl_download_resource(
    CURL* curl,
    data_resource_t const* resource,
    data_type_t type) {
    curl_data_t cdata;

    /* Only the address of cdata is registered here; it is filled in below,
     * before the transfer starts. */
    if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0)
        return EINVAL;
    if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, &cdata) != 0)
        return EINVAL;

    cdata = curl_data_create(resource, type);
    if (cdata.error != 0)
        return cdata.error;

    /* Download the data, then always close the pipe. */
    int const curl_err = curl_easy_perform(curl);
    int const close_err = curl_data_free(cdata);
    if (curl_err != 0) {
        fprintf(
            stderr,
            "downloading '%s' for '%s' failed\n",
            resource->url,
            resource->path);
        return EIO;
    }
    if (close_err != 0) {
        fprintf(stderr, "writing data to '%s' failed\n", resource->path);
        return EIO;
    }

    /* Check that the expected output exists. */
    if (type == data_type_file) {
        if (!UTIL_isRegularFile(resource->path)) {
            fprintf(
                stderr, "output file '%s' does not exist\n", resource->path);
            return EIO;
        }
    } else if (type == data_type_dir) {
        if (!UTIL_isDirectory(resource->path)) {
            fprintf(
                stderr,
                "output directory '%s' does not exist\n",
                resource->path);
            return EIO;
        }
    }

    /* Check that the hash matches. */
    unsigned long long const actual =
        (unsigned long long)XXH64_digest(&cdata.xxhash64);
    if (actual != resource->xxhash64) {
        fprintf(
            stderr,
            "checksum does not match: 0x%llxLL != 0x%llxLL\n",
            actual,
            (unsigned long long)resource->xxhash64);
        return EINVAL;
    }

    return 0;
}
411*01826a49SYabin Cui 
412*01826a49SYabin Cui /** Download a single data object. */
curl_download_datum(CURL * curl,data_t const * data)413*01826a49SYabin Cui static int curl_download_datum(CURL* curl, data_t const* data) {
414*01826a49SYabin Cui     int ret;
415*01826a49SYabin Cui     ret = curl_download_resource(curl, &data->data, data->type);
416*01826a49SYabin Cui     if (ret != 0)
417*01826a49SYabin Cui         return ret;
418*01826a49SYabin Cui     if (data_has_dict(data)) {
419*01826a49SYabin Cui         ret = curl_download_resource(curl, &data->dict, data_type_file);
420*01826a49SYabin Cui         if (ret != 0)
421*01826a49SYabin Cui             return ret;
422*01826a49SYabin Cui     }
423*01826a49SYabin Cui     return ret;
424*01826a49SYabin Cui }
425*01826a49SYabin Cui 
426*01826a49SYabin Cui /** Download all the data. */
curl_download_data(data_t const * const * data)427*01826a49SYabin Cui static int curl_download_data(data_t const* const* data) {
428*01826a49SYabin Cui     if (curl_global_init(CURL_GLOBAL_ALL) != 0)
429*01826a49SYabin Cui         return EFAULT;
430*01826a49SYabin Cui 
431*01826a49SYabin Cui     curl_data_t cdata = {};
432*01826a49SYabin Cui     CURL* curl = curl_easy_init();
433*01826a49SYabin Cui     int err = EFAULT;
434*01826a49SYabin Cui 
435*01826a49SYabin Cui     if (curl == NULL)
436*01826a49SYabin Cui         return EFAULT;
437*01826a49SYabin Cui 
438*01826a49SYabin Cui     if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0)
439*01826a49SYabin Cui         goto out;
440*01826a49SYabin Cui     if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0)
441*01826a49SYabin Cui         goto out;
442*01826a49SYabin Cui     if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0)
443*01826a49SYabin Cui         goto out;
444*01826a49SYabin Cui 
445*01826a49SYabin Cui     assert(data != NULL);
446*01826a49SYabin Cui     for (; *data != NULL; ++data) {
447*01826a49SYabin Cui         if (curl_download_datum(curl, *data) != 0)
448*01826a49SYabin Cui             goto out;
449*01826a49SYabin Cui     }
450*01826a49SYabin Cui 
451*01826a49SYabin Cui     err = 0;
452*01826a49SYabin Cui out:
453*01826a49SYabin Cui     curl_easy_cleanup(curl);
454*01826a49SYabin Cui     curl_global_cleanup();
455*01826a49SYabin Cui     return err;
456*01826a49SYabin Cui }
457*01826a49SYabin Cui 
458*01826a49SYabin Cui /** Fill the path member variable of the data objects. */
data_create_paths(data_t * const * data,char const * dir)459*01826a49SYabin Cui static int data_create_paths(data_t* const* data, char const* dir) {
460*01826a49SYabin Cui     size_t const dirlen = strlen(dir);
461*01826a49SYabin Cui     assert(data != NULL);
462*01826a49SYabin Cui     for (; *data != NULL; ++data) {
463*01826a49SYabin Cui         data_t* const datum = *data;
464*01826a49SYabin Cui         datum->data.path = cat3(dir, "/", datum->name);
465*01826a49SYabin Cui         if (datum->data.path == NULL)
466*01826a49SYabin Cui             return ENOMEM;
467*01826a49SYabin Cui         if (data_has_dict(datum)) {
468*01826a49SYabin Cui             datum->dict.path = cat2(datum->data.path, ".dict");
469*01826a49SYabin Cui             if (datum->dict.path == NULL)
470*01826a49SYabin Cui                 return ENOMEM;
471*01826a49SYabin Cui         }
472*01826a49SYabin Cui     }
473*01826a49SYabin Cui     return 0;
474*01826a49SYabin Cui }
475*01826a49SYabin Cui 
476*01826a49SYabin Cui /** Free the path member variable of the data objects. */
data_free_paths(data_t * const * data)477*01826a49SYabin Cui static void data_free_paths(data_t* const* data) {
478*01826a49SYabin Cui     assert(data != NULL);
479*01826a49SYabin Cui     for (; *data != NULL; ++data) {
480*01826a49SYabin Cui         data_t* datum = *data;
481*01826a49SYabin Cui         free((void*)datum->data.path);
482*01826a49SYabin Cui         free((void*)datum->dict.path);
483*01826a49SYabin Cui         datum->data.path = NULL;
484*01826a49SYabin Cui         datum->dict.path = NULL;
485*01826a49SYabin Cui     }
486*01826a49SYabin Cui }
487*01826a49SYabin Cui 
/** File name of the stamp stored inside the cache directory. */
static char const kStampName[] = "STAMP";
489*01826a49SYabin Cui 
/**
 * Feeds `data` into the hash as 8 little-endian bytes: the value is
 * byte-swapped first on hosts that are not little-endian.
 */
static void xxh_update_le(XXH64_state_t* state, uint64_t data) {
    uint64_t const le = MEM_isLittleEndian() ? data : MEM_swap64(data);
    XXH64_update(state, &le, sizeof(le));
}
495*01826a49SYabin Cui 
496*01826a49SYabin Cui /** Hash the data to create the stamp. */
stamp_hash(data_t const * const * data)497*01826a49SYabin Cui static uint64_t stamp_hash(data_t const* const* data) {
498*01826a49SYabin Cui     XXH64_state_t state;
499*01826a49SYabin Cui 
500*01826a49SYabin Cui     XXH64_reset(&state, 0);
501*01826a49SYabin Cui     assert(data != NULL);
502*01826a49SYabin Cui     for (; *data != NULL; ++data) {
503*01826a49SYabin Cui         data_t const* datum = *data;
504*01826a49SYabin Cui         /* We don't care about the URL that we fetch from. */
505*01826a49SYabin Cui         /* The path is derived from the name. */
506*01826a49SYabin Cui         XXH64_update(&state, datum->name, strlen(datum->name));
507*01826a49SYabin Cui         xxh_update_le(&state, datum->data.xxhash64);
508*01826a49SYabin Cui         xxh_update_le(&state, datum->dict.xxhash64);
509*01826a49SYabin Cui         xxh_update_le(&state, datum->type);
510*01826a49SYabin Cui     }
511*01826a49SYabin Cui     return XXH64_digest(&state);
512*01826a49SYabin Cui }
513*01826a49SYabin Cui 
514*01826a49SYabin Cui /** Check if the stamp matches the stamp in the cache directory. */
stamp_check(char const * dir,data_t const * const * data)515*01826a49SYabin Cui static int stamp_check(char const* dir, data_t const* const* data) {
516*01826a49SYabin Cui     char* stamp = cat3(dir, "/", kStampName);
517*01826a49SYabin Cui     uint64_t const expected = stamp_hash(data);
518*01826a49SYabin Cui     XXH64_canonical_t actual;
519*01826a49SYabin Cui     FILE* stampfile = NULL;
520*01826a49SYabin Cui     int matches = 0;
521*01826a49SYabin Cui 
522*01826a49SYabin Cui     if (stamp == NULL)
523*01826a49SYabin Cui         goto out;
524*01826a49SYabin Cui     if (!UTIL_isRegularFile(stamp)) {
525*01826a49SYabin Cui         fprintf(stderr, "stamp does not exist: recreating the data cache\n");
526*01826a49SYabin Cui         goto out;
527*01826a49SYabin Cui     }
528*01826a49SYabin Cui 
529*01826a49SYabin Cui     stampfile = fopen(stamp, "rb");
530*01826a49SYabin Cui     if (stampfile == NULL) {
531*01826a49SYabin Cui         fprintf(stderr, "could not open stamp: recreating the data cache\n");
532*01826a49SYabin Cui         goto out;
533*01826a49SYabin Cui     }
534*01826a49SYabin Cui 
535*01826a49SYabin Cui     size_t b;
536*01826a49SYabin Cui     if ((b = fread(&actual, sizeof(actual), 1, stampfile)) != 1) {
537*01826a49SYabin Cui         fprintf(stderr, "invalid stamp: recreating the data cache\n");
538*01826a49SYabin Cui         goto out;
539*01826a49SYabin Cui     }
540*01826a49SYabin Cui 
541*01826a49SYabin Cui     matches = (expected == XXH64_hashFromCanonical(&actual));
542*01826a49SYabin Cui     if (matches)
543*01826a49SYabin Cui         fprintf(stderr, "stamp matches: reusing the cached data\n");
544*01826a49SYabin Cui     else
545*01826a49SYabin Cui         fprintf(stderr, "stamp does not match: recreating the data cache\n");
546*01826a49SYabin Cui 
547*01826a49SYabin Cui out:
548*01826a49SYabin Cui     free(stamp);
549*01826a49SYabin Cui     if (stampfile != NULL)
550*01826a49SYabin Cui         fclose(stampfile);
551*01826a49SYabin Cui     return matches;
552*01826a49SYabin Cui }
553*01826a49SYabin Cui 
554*01826a49SYabin Cui /** On success write a new stamp, on failure delete the old stamp. */
555*01826a49SYabin Cui static int
stamp_write(char const * dir,data_t const * const * data,int const data_err)556*01826a49SYabin Cui stamp_write(char const* dir, data_t const* const* data, int const data_err) {
557*01826a49SYabin Cui     char* stamp = cat3(dir, "/", kStampName);
558*01826a49SYabin Cui     FILE* stampfile = NULL;
559*01826a49SYabin Cui     int err = EIO;
560*01826a49SYabin Cui 
561*01826a49SYabin Cui     if (stamp == NULL)
562*01826a49SYabin Cui         return ENOMEM;
563*01826a49SYabin Cui 
564*01826a49SYabin Cui     if (data_err != 0) {
565*01826a49SYabin Cui         err = data_err;
566*01826a49SYabin Cui         goto out;
567*01826a49SYabin Cui     }
568*01826a49SYabin Cui     XXH64_canonical_t hash;
569*01826a49SYabin Cui 
570*01826a49SYabin Cui     XXH64_canonicalFromHash(&hash, stamp_hash(data));
571*01826a49SYabin Cui 
572*01826a49SYabin Cui     stampfile = fopen(stamp, "wb");
573*01826a49SYabin Cui     if (stampfile == NULL)
574*01826a49SYabin Cui         goto out;
575*01826a49SYabin Cui     if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1)
576*01826a49SYabin Cui         goto out;
577*01826a49SYabin Cui     err = 0;
578*01826a49SYabin Cui     fprintf(stderr, "stamped new data cache\n");
579*01826a49SYabin Cui out:
580*01826a49SYabin Cui     if (err != 0)
581*01826a49SYabin Cui         /* Ignore errors. */
582*01826a49SYabin Cui         unlink(stamp);
583*01826a49SYabin Cui     free(stamp);
584*01826a49SYabin Cui     if (stampfile != NULL)
585*01826a49SYabin Cui         fclose(stampfile);
586*01826a49SYabin Cui     return err;
587*01826a49SYabin Cui }
588*01826a49SYabin Cui 
data_init(char const * dir)589*01826a49SYabin Cui int data_init(char const* dir) {
590*01826a49SYabin Cui     int err;
591*01826a49SYabin Cui 
592*01826a49SYabin Cui     if (dir == NULL)
593*01826a49SYabin Cui         return EINVAL;
594*01826a49SYabin Cui 
595*01826a49SYabin Cui     /* This must be first to simplify logic. */
596*01826a49SYabin Cui     err = ensure_directory_exists(dir);
597*01826a49SYabin Cui     if (err != 0)
598*01826a49SYabin Cui         return err;
599*01826a49SYabin Cui 
600*01826a49SYabin Cui     /* Save the cache directory. */
601*01826a49SYabin Cui     g_data_dir = strdup(dir);
602*01826a49SYabin Cui     if (g_data_dir == NULL)
603*01826a49SYabin Cui         return ENOMEM;
604*01826a49SYabin Cui 
605*01826a49SYabin Cui     err = data_create_paths(g_data, dir);
606*01826a49SYabin Cui     if (err != 0)
607*01826a49SYabin Cui         return err;
608*01826a49SYabin Cui 
609*01826a49SYabin Cui     /* If the stamp matches then we are good to go.
610*01826a49SYabin Cui      * This must be called before any modifications to the data cache.
611*01826a49SYabin Cui      * After this point, we MUST call stamp_write() to update the STAMP,
612*01826a49SYabin Cui      * since we've updated the data cache.
613*01826a49SYabin Cui      */
614*01826a49SYabin Cui     if (stamp_check(dir, data))
615*01826a49SYabin Cui         return 0;
616*01826a49SYabin Cui 
617*01826a49SYabin Cui     err = curl_download_data(data);
618*01826a49SYabin Cui     if (err != 0)
619*01826a49SYabin Cui         goto out;
620*01826a49SYabin Cui 
621*01826a49SYabin Cui out:
622*01826a49SYabin Cui     /* This must be last, since it must know if data_init() succeeded. */
623*01826a49SYabin Cui     stamp_write(dir, data, err);
624*01826a49SYabin Cui     return err;
625*01826a49SYabin Cui }
626*01826a49SYabin Cui 
data_finish(void)627*01826a49SYabin Cui void data_finish(void) {
628*01826a49SYabin Cui     data_free_paths(g_data);
629*01826a49SYabin Cui     free(g_data_dir);
630*01826a49SYabin Cui     g_data_dir = NULL;
631*01826a49SYabin Cui }
632