/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */
10*01826a49SYabin Cui
11*01826a49SYabin Cui #include "data.h"
12*01826a49SYabin Cui
13*01826a49SYabin Cui #include <assert.h>
14*01826a49SYabin Cui #include <errno.h>
15*01826a49SYabin Cui #include <stdio.h>
16*01826a49SYabin Cui #include <string.h>
17*01826a49SYabin Cui #include <stdlib.h> /* free() */
18*01826a49SYabin Cui
19*01826a49SYabin Cui #include <sys/stat.h>
20*01826a49SYabin Cui
21*01826a49SYabin Cui #include <curl/curl.h>
22*01826a49SYabin Cui
23*01826a49SYabin Cui #include "mem.h"
24*01826a49SYabin Cui #include "util.h"
25*01826a49SYabin Cui #define XXH_STATIC_LINKING_ONLY
26*01826a49SYabin Cui #include "xxhash.h"
27*01826a49SYabin Cui
28*01826a49SYabin Cui /**
29*01826a49SYabin Cui * Data objects
30*01826a49SYabin Cui */
31*01826a49SYabin Cui
/** Common URL prefix of every asset in the "regression-data" release. */
#define REGRESSION_RELEASE_URL \
    "https://github.com/facebook/zstd/releases/download/regression-data/"

/** Expands to the download URL of the asset named by the string literal x. */
#define REGRESSION_RELEASE(x) REGRESSION_RELEASE_URL x
34*01826a49SYabin Cui
35*01826a49SYabin Cui data_t silesia = {
36*01826a49SYabin Cui .name = "silesia",
37*01826a49SYabin Cui .type = data_type_dir,
38*01826a49SYabin Cui .data =
39*01826a49SYabin Cui {
40*01826a49SYabin Cui .url = REGRESSION_RELEASE("silesia.tar.zst"),
41*01826a49SYabin Cui .xxhash64 = 0x48a199f92f93e977LL,
42*01826a49SYabin Cui },
43*01826a49SYabin Cui };
44*01826a49SYabin Cui
45*01826a49SYabin Cui data_t silesia_tar = {
46*01826a49SYabin Cui .name = "silesia.tar",
47*01826a49SYabin Cui .type = data_type_file,
48*01826a49SYabin Cui .data =
49*01826a49SYabin Cui {
50*01826a49SYabin Cui .url = REGRESSION_RELEASE("silesia.tar.zst"),
51*01826a49SYabin Cui .xxhash64 = 0x48a199f92f93e977LL,
52*01826a49SYabin Cui },
53*01826a49SYabin Cui };
54*01826a49SYabin Cui
55*01826a49SYabin Cui data_t github = {
56*01826a49SYabin Cui .name = "github",
57*01826a49SYabin Cui .type = data_type_dir,
58*01826a49SYabin Cui .data =
59*01826a49SYabin Cui {
60*01826a49SYabin Cui .url = REGRESSION_RELEASE("github.tar.zst"),
61*01826a49SYabin Cui .xxhash64 = 0xa9b1b44b020df292LL,
62*01826a49SYabin Cui },
63*01826a49SYabin Cui .dict =
64*01826a49SYabin Cui {
65*01826a49SYabin Cui .url = REGRESSION_RELEASE("github.dict.zst"),
66*01826a49SYabin Cui .xxhash64 = 0x1eddc6f737d3cb53LL,
67*01826a49SYabin Cui
68*01826a49SYabin Cui },
69*01826a49SYabin Cui };
70*01826a49SYabin Cui
71*01826a49SYabin Cui data_t github_tar = {
72*01826a49SYabin Cui .name = "github.tar",
73*01826a49SYabin Cui .type = data_type_file,
74*01826a49SYabin Cui .data =
75*01826a49SYabin Cui {
76*01826a49SYabin Cui .url = REGRESSION_RELEASE("github.tar.zst"),
77*01826a49SYabin Cui .xxhash64 = 0xa9b1b44b020df292LL,
78*01826a49SYabin Cui },
79*01826a49SYabin Cui .dict =
80*01826a49SYabin Cui {
81*01826a49SYabin Cui .url = REGRESSION_RELEASE("github.dict.zst"),
82*01826a49SYabin Cui .xxhash64 = 0x1eddc6f737d3cb53LL,
83*01826a49SYabin Cui
84*01826a49SYabin Cui },
85*01826a49SYabin Cui };
86*01826a49SYabin Cui
87*01826a49SYabin Cui static data_t* g_data[] = {
88*01826a49SYabin Cui &silesia,
89*01826a49SYabin Cui &silesia_tar,
90*01826a49SYabin Cui &github,
91*01826a49SYabin Cui &github_tar,
92*01826a49SYabin Cui NULL,
93*01826a49SYabin Cui };
94*01826a49SYabin Cui
95*01826a49SYabin Cui data_t const* const* data = (data_t const* const*)g_data;
96*01826a49SYabin Cui
97*01826a49SYabin Cui /**
98*01826a49SYabin Cui * data helpers.
99*01826a49SYabin Cui */
100*01826a49SYabin Cui
/**
 * Reports whether a data object comes with a dictionary.
 *
 * @param data The data object to inspect; it is dereferenced unconditionally.
 * @returns 1 when `data->dict.url` is set, 0 otherwise.
 */
int data_has_dict(data_t const* data) {
    if (data->dict.url == NULL)
        return 0;
    return 1;
}
104*01826a49SYabin Cui
105*01826a49SYabin Cui /**
106*01826a49SYabin Cui * data buffer helper functions (documented in header).
107*01826a49SYabin Cui */
108*01826a49SYabin Cui
data_buffer_create(size_t const capacity)109*01826a49SYabin Cui data_buffer_t data_buffer_create(size_t const capacity) {
110*01826a49SYabin Cui data_buffer_t buffer = {};
111*01826a49SYabin Cui
112*01826a49SYabin Cui buffer.data = (uint8_t*)malloc(capacity);
113*01826a49SYabin Cui if (buffer.data == NULL)
114*01826a49SYabin Cui return buffer;
115*01826a49SYabin Cui buffer.capacity = capacity;
116*01826a49SYabin Cui return buffer;
117*01826a49SYabin Cui }
118*01826a49SYabin Cui
data_buffer_read(char const * filename)119*01826a49SYabin Cui data_buffer_t data_buffer_read(char const* filename) {
120*01826a49SYabin Cui data_buffer_t buffer = {};
121*01826a49SYabin Cui
122*01826a49SYabin Cui uint64_t const size = UTIL_getFileSize(filename);
123*01826a49SYabin Cui if (size == UTIL_FILESIZE_UNKNOWN) {
124*01826a49SYabin Cui fprintf(stderr, "unknown size for %s\n", filename);
125*01826a49SYabin Cui return buffer;
126*01826a49SYabin Cui }
127*01826a49SYabin Cui
128*01826a49SYabin Cui buffer.data = (uint8_t*)malloc(size);
129*01826a49SYabin Cui if (buffer.data == NULL) {
130*01826a49SYabin Cui fprintf(stderr, "malloc failed\n");
131*01826a49SYabin Cui return buffer;
132*01826a49SYabin Cui }
133*01826a49SYabin Cui buffer.capacity = size;
134*01826a49SYabin Cui
135*01826a49SYabin Cui FILE* file = fopen(filename, "rb");
136*01826a49SYabin Cui if (file == NULL) {
137*01826a49SYabin Cui fprintf(stderr, "file null\n");
138*01826a49SYabin Cui goto err;
139*01826a49SYabin Cui }
140*01826a49SYabin Cui buffer.size = fread(buffer.data, 1, buffer.capacity, file);
141*01826a49SYabin Cui fclose(file);
142*01826a49SYabin Cui if (buffer.size != buffer.capacity) {
143*01826a49SYabin Cui fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity);
144*01826a49SYabin Cui goto err;
145*01826a49SYabin Cui }
146*01826a49SYabin Cui
147*01826a49SYabin Cui return buffer;
148*01826a49SYabin Cui err:
149*01826a49SYabin Cui free(buffer.data);
150*01826a49SYabin Cui memset(&buffer, 0, sizeof(buffer));
151*01826a49SYabin Cui return buffer;
152*01826a49SYabin Cui }
153*01826a49SYabin Cui
data_buffer_get_data(data_t const * data)154*01826a49SYabin Cui data_buffer_t data_buffer_get_data(data_t const* data) {
155*01826a49SYabin Cui data_buffer_t const kEmptyBuffer = {};
156*01826a49SYabin Cui
157*01826a49SYabin Cui if (data->type != data_type_file)
158*01826a49SYabin Cui return kEmptyBuffer;
159*01826a49SYabin Cui
160*01826a49SYabin Cui return data_buffer_read(data->data.path);
161*01826a49SYabin Cui }
162*01826a49SYabin Cui
data_buffer_get_dict(data_t const * data)163*01826a49SYabin Cui data_buffer_t data_buffer_get_dict(data_t const* data) {
164*01826a49SYabin Cui data_buffer_t const kEmptyBuffer = {};
165*01826a49SYabin Cui
166*01826a49SYabin Cui if (!data_has_dict(data))
167*01826a49SYabin Cui return kEmptyBuffer;
168*01826a49SYabin Cui
169*01826a49SYabin Cui return data_buffer_read(data->dict.path);
170*01826a49SYabin Cui }
171*01826a49SYabin Cui
/**
 * Orders two buffers lexicographically by content, then by length.
 *
 * @param buffer1 Left-hand buffer.
 * @param buffer2 Right-hand buffer.
 * @returns 0 when both buffers hold the same bytes. Otherwise the memcmp()
 *          result for the common prefix when it differs, or -1 / 1 when one
 *          buffer is a strict prefix of the other (the shorter sorts first).
 */
int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) {
    size_t const common =
            buffer1.size < buffer2.size ? buffer1.size : buffer2.size;

    /* Empty buffers carry data == NULL, and passing NULL to memcmp() is
     * undefined even for a zero length, so skip the call in that case. */
    if (common != 0) {
        int const cmp = memcmp(buffer1.data, buffer2.data, common);
        if (cmp != 0)
            return cmp;
    }

    if (buffer1.size < buffer2.size)
        return -1;
    if (buffer1.size > buffer2.size)
        return 1;
    return 0;
}
185*01826a49SYabin Cui
/**
 * Releases the memory owned by a buffer.
 *
 * @param buffer The buffer to release; a NULL `data` pointer is harmless.
 */
void data_buffer_free(data_buffer_t buffer) {
    void* const storage = buffer.data;
    free(storage);
}
189*01826a49SYabin Cui
190*01826a49SYabin Cui /**
191*01826a49SYabin Cui * data filenames helpers.
192*01826a49SYabin Cui */
193*01826a49SYabin Cui
data_filenames_get(data_t const * data)194*01826a49SYabin Cui FileNamesTable* data_filenames_get(data_t const* data)
195*01826a49SYabin Cui {
196*01826a49SYabin Cui char const* const path = data->data.path;
197*01826a49SYabin Cui return UTIL_createExpandedFNT(&path, 1, 0 /* followLinks */ );
198*01826a49SYabin Cui }
199*01826a49SYabin Cui
200*01826a49SYabin Cui /**
201*01826a49SYabin Cui * data buffers helpers.
202*01826a49SYabin Cui */
203*01826a49SYabin Cui
data_buffers_get(data_t const * data)204*01826a49SYabin Cui data_buffers_t data_buffers_get(data_t const* data) {
205*01826a49SYabin Cui data_buffers_t buffers = {.size = 0};
206*01826a49SYabin Cui FileNamesTable* const filenames = data_filenames_get(data);
207*01826a49SYabin Cui if (filenames == NULL) return buffers;
208*01826a49SYabin Cui if (filenames->tableSize == 0) {
209*01826a49SYabin Cui UTIL_freeFileNamesTable(filenames);
210*01826a49SYabin Cui return buffers;
211*01826a49SYabin Cui }
212*01826a49SYabin Cui
213*01826a49SYabin Cui data_buffer_t* buffersPtr =
214*01826a49SYabin Cui (data_buffer_t*)malloc(filenames->tableSize * sizeof(*buffersPtr));
215*01826a49SYabin Cui if (buffersPtr == NULL) {
216*01826a49SYabin Cui UTIL_freeFileNamesTable(filenames);
217*01826a49SYabin Cui return buffers;
218*01826a49SYabin Cui }
219*01826a49SYabin Cui buffers.buffers = (data_buffer_t const*)buffersPtr;
220*01826a49SYabin Cui buffers.size = filenames->tableSize;
221*01826a49SYabin Cui
222*01826a49SYabin Cui for (size_t i = 0; i < filenames->tableSize; ++i) {
223*01826a49SYabin Cui buffersPtr[i] = data_buffer_read(filenames->fileNames[i]);
224*01826a49SYabin Cui if (buffersPtr[i].data == NULL) {
225*01826a49SYabin Cui data_buffers_t const kEmptyBuffer = {};
226*01826a49SYabin Cui data_buffers_free(buffers);
227*01826a49SYabin Cui UTIL_freeFileNamesTable(filenames);
228*01826a49SYabin Cui return kEmptyBuffer;
229*01826a49SYabin Cui }
230*01826a49SYabin Cui }
231*01826a49SYabin Cui
232*01826a49SYabin Cui UTIL_freeFileNamesTable(filenames);
233*01826a49SYabin Cui return buffers;
234*01826a49SYabin Cui }
235*01826a49SYabin Cui
236*01826a49SYabin Cui /**
237*01826a49SYabin Cui * Frees the data buffers.
238*01826a49SYabin Cui */
data_buffers_free(data_buffers_t buffers)239*01826a49SYabin Cui void data_buffers_free(data_buffers_t buffers) {
240*01826a49SYabin Cui free((data_buffer_t*)buffers.buffers);
241*01826a49SYabin Cui }
242*01826a49SYabin Cui
243*01826a49SYabin Cui /**
244*01826a49SYabin Cui * Initialization and download functions.
245*01826a49SYabin Cui */
246*01826a49SYabin Cui
/** Heap copy of the cache directory: set by data_init(), freed by data_finish(). */
static char* g_data_dir = NULL;
248*01826a49SYabin Cui
/**
 * Creates `indir` and every missing parent directory, like `mkdir -p`.
 * New directories are created with mode S_IRWXU.
 *
 * @param indir Path of the directory to create; must not be NULL.
 * @returns 0 on success, EINVAL for an empty path, ENOMEM when the working
 *          copy cannot be allocated, or the errno reported by mkdir().
 */
static int ensure_directory_exists(char const* indir) {
    char* dir;
    char* end;
    int ret = 0;

    /* The scan below always steps over the first character, so an empty
     * path would be read past its terminator. */
    if (indir[0] == '\0')
        return EINVAL;

    dir = strdup(indir);
    if (dir == NULL)
        return ENOMEM;

    end = dir;
    do {
        /* Find the next directory level. Skipping the first character keeps
         * a leading '/' from producing an empty component. */
        for (++end; *end != '\0' && *end != '/'; ++end)
            ;
        /* End the string there, make the directory, and restore the string. */
        char const save = *end;
        *end = '\0';
        int const isdir = UTIL_isDirectory(dir);
        int const rc = mkdir(dir, S_IRWXU);
        int const mkdir_errno = errno; /* Capture before anything can clobber it. */
        *end = save;
        /* It's okay if the directory already exists. */
        if (rc != 0 && !(mkdir_errno == EEXIST && isdir)) {
            ret = mkdir_errno;
            fprintf(stderr, "mkdir() failed\n");
            break;
        }
    } while (*end != '\0');

    free(dir);
    return ret;
}
281*01826a49SYabin Cui
/**
 * Concatenate up to 3 strings into a newly allocated buffer.
 *
 * @param str1 First piece; must not be NULL.
 * @param str2 Second piece; must not be NULL.
 * @param str3 Third piece, or NULL to join only the first two.
 * @returns The joined string, to be released with free(), or NULL when the
 *          allocation fails.
 */
static char* cat3(char const* str1, char const* str2, char const* str3) {
    size_t const len1 = strlen(str1);
    size_t const len2 = strlen(str2);
    size_t const len3 = (str3 != NULL) ? strlen(str3) : 0;
    size_t const total = len1 + len2 + len3;
    char* const out = (char*)malloc(total + 1);
    char* cursor = out;

    if (out == NULL)
        return NULL;

    /* Each copy writes its own terminator, which the next copy overwrites. */
    strcpy(cursor, str1);
    cursor += len1;
    strcpy(cursor, str2);
    cursor += len2;
    if (str3 != NULL)
        strcpy(cursor, str3);

    assert(strlen(out) == total);
    return out;
}
298*01826a49SYabin Cui
/**
 * Concatenate 2 strings into a new buffer.
 *
 * @returns The joined string, to be released with free(), or NULL when the
 *          allocation fails.
 */
static char* cat2(char const* str1, char const* str2) {
    char* const joined = cat3(str1, str2, NULL);
    return joined;
}
302*01826a49SYabin Cui
303*01826a49SYabin Cui /**
304*01826a49SYabin Cui * State needed by the curl callback.
305*01826a49SYabin Cui * It takes data from curl, hashes it, and writes it to the file.
306*01826a49SYabin Cui */
307*01826a49SYabin Cui typedef struct {
308*01826a49SYabin Cui FILE* file;
309*01826a49SYabin Cui XXH64_state_t xxhash64;
310*01826a49SYabin Cui int error;
311*01826a49SYabin Cui } curl_data_t;
312*01826a49SYabin Cui
313*01826a49SYabin Cui /** Create the curl state. */
curl_data_create(data_resource_t const * resource,data_type_t type)314*01826a49SYabin Cui static curl_data_t curl_data_create(
315*01826a49SYabin Cui data_resource_t const* resource,
316*01826a49SYabin Cui data_type_t type) {
317*01826a49SYabin Cui curl_data_t cdata = {};
318*01826a49SYabin Cui
319*01826a49SYabin Cui XXH64_reset(&cdata.xxhash64, 0);
320*01826a49SYabin Cui
321*01826a49SYabin Cui assert(UTIL_isDirectory(g_data_dir));
322*01826a49SYabin Cui
323*01826a49SYabin Cui if (type == data_type_file) {
324*01826a49SYabin Cui /* Decompress the resource and store to the path. */
325*01826a49SYabin Cui char* cmd = cat3("zstd -dqfo '", resource->path, "'");
326*01826a49SYabin Cui if (cmd == NULL) {
327*01826a49SYabin Cui cdata.error = ENOMEM;
328*01826a49SYabin Cui return cdata;
329*01826a49SYabin Cui }
330*01826a49SYabin Cui cdata.file = popen(cmd, "w");
331*01826a49SYabin Cui free(cmd);
332*01826a49SYabin Cui } else {
333*01826a49SYabin Cui /* Decompress and extract the resource to the cache directory. */
334*01826a49SYabin Cui char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'");
335*01826a49SYabin Cui if (cmd == NULL) {
336*01826a49SYabin Cui cdata.error = ENOMEM;
337*01826a49SYabin Cui return cdata;
338*01826a49SYabin Cui }
339*01826a49SYabin Cui cdata.file = popen(cmd, "w");
340*01826a49SYabin Cui free(cmd);
341*01826a49SYabin Cui }
342*01826a49SYabin Cui if (cdata.file == NULL) {
343*01826a49SYabin Cui cdata.error = errno;
344*01826a49SYabin Cui }
345*01826a49SYabin Cui
346*01826a49SYabin Cui return cdata;
347*01826a49SYabin Cui }
348*01826a49SYabin Cui
349*01826a49SYabin Cui /** Free the curl state. */
curl_data_free(curl_data_t cdata)350*01826a49SYabin Cui static int curl_data_free(curl_data_t cdata) {
351*01826a49SYabin Cui return pclose(cdata.file);
352*01826a49SYabin Cui }
353*01826a49SYabin Cui
354*01826a49SYabin Cui /** curl callback. Updates the hash, and writes to the file. */
curl_write(void * data,size_t size,size_t count,void * ptr)355*01826a49SYabin Cui static size_t curl_write(void* data, size_t size, size_t count, void* ptr) {
356*01826a49SYabin Cui curl_data_t* cdata = (curl_data_t*)ptr;
357*01826a49SYabin Cui size_t const written = fwrite(data, size, count, cdata->file);
358*01826a49SYabin Cui XXH64_update(&cdata->xxhash64, data, written * size);
359*01826a49SYabin Cui return written;
360*01826a49SYabin Cui }
361*01826a49SYabin Cui
/**
 * Downloads one resource, feeds it to the consumer command, and verifies it.
 *
 * @param curl     Easy handle already configured with the write callback.
 * @param resource The resource to fetch (url, path, expected xxhash64).
 * @param type     Whether the resource ends up as a file or a directory.
 * @returns 0 on success. EINVAL when a curl option cannot be set or the
 *          checksum differs, the error from curl_data_create() when the
 *          sink cannot be opened, and EIO when the transfer, the consumer
 *          command, or the existence check of the output fails.
 */
static int curl_download_resource(
        CURL* curl,
        data_resource_t const* resource,
        data_type_t type) {
    curl_data_t cdata;

    /* Point curl at the URL and at our (not yet initialised) state. */
    if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0)
        return EINVAL;
    if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, &cdata) != 0)
        return EINVAL;

    cdata = curl_data_create(resource, type);
    if (cdata.error != 0)
        return cdata.error;

    /* Always close the pipe, even when the transfer itself failed. */
    int const curl_err = curl_easy_perform(curl);
    int const close_err = curl_data_free(cdata);

    if (curl_err != 0) {
        fprintf(
                stderr,
                "downloading '%s' for '%s' failed\n",
                resource->url,
                resource->path);
        return EIO;
    }
    if (close_err != 0) {
        fprintf(stderr, "writing data to '%s' failed\n", resource->path);
        return EIO;
    }

    /* Check that the expected output exists. */
    if (type == data_type_file) {
        if (!UTIL_isRegularFile(resource->path)) {
            fprintf(stderr, "output file '%s' does not exist\n", resource->path);
            return EIO;
        }
    } else if (type == data_type_dir) {
        if (!UTIL_isDirectory(resource->path)) {
            fprintf(
                    stderr,
                    "output directory '%s' does not exist\n",
                    resource->path);
            return EIO;
        }
    }

    /* Check that the hash of the downloaded bytes matches. */
    unsigned long long const actual = XXH64_digest(&cdata.xxhash64);
    if (actual != resource->xxhash64) {
        fprintf(
                stderr,
                "checksum does not match: 0x%llxLL != 0x%llxLL\n",
                actual,
                (unsigned long long)resource->xxhash64);
        return EINVAL;
    }

    return 0;
}
411*01826a49SYabin Cui
412*01826a49SYabin Cui /** Download a single data object. */
curl_download_datum(CURL * curl,data_t const * data)413*01826a49SYabin Cui static int curl_download_datum(CURL* curl, data_t const* data) {
414*01826a49SYabin Cui int ret;
415*01826a49SYabin Cui ret = curl_download_resource(curl, &data->data, data->type);
416*01826a49SYabin Cui if (ret != 0)
417*01826a49SYabin Cui return ret;
418*01826a49SYabin Cui if (data_has_dict(data)) {
419*01826a49SYabin Cui ret = curl_download_resource(curl, &data->dict, data_type_file);
420*01826a49SYabin Cui if (ret != 0)
421*01826a49SYabin Cui return ret;
422*01826a49SYabin Cui }
423*01826a49SYabin Cui return ret;
424*01826a49SYabin Cui }
425*01826a49SYabin Cui
426*01826a49SYabin Cui /** Download all the data. */
curl_download_data(data_t const * const * data)427*01826a49SYabin Cui static int curl_download_data(data_t const* const* data) {
428*01826a49SYabin Cui if (curl_global_init(CURL_GLOBAL_ALL) != 0)
429*01826a49SYabin Cui return EFAULT;
430*01826a49SYabin Cui
431*01826a49SYabin Cui curl_data_t cdata = {};
432*01826a49SYabin Cui CURL* curl = curl_easy_init();
433*01826a49SYabin Cui int err = EFAULT;
434*01826a49SYabin Cui
435*01826a49SYabin Cui if (curl == NULL)
436*01826a49SYabin Cui return EFAULT;
437*01826a49SYabin Cui
438*01826a49SYabin Cui if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0)
439*01826a49SYabin Cui goto out;
440*01826a49SYabin Cui if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0)
441*01826a49SYabin Cui goto out;
442*01826a49SYabin Cui if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0)
443*01826a49SYabin Cui goto out;
444*01826a49SYabin Cui
445*01826a49SYabin Cui assert(data != NULL);
446*01826a49SYabin Cui for (; *data != NULL; ++data) {
447*01826a49SYabin Cui if (curl_download_datum(curl, *data) != 0)
448*01826a49SYabin Cui goto out;
449*01826a49SYabin Cui }
450*01826a49SYabin Cui
451*01826a49SYabin Cui err = 0;
452*01826a49SYabin Cui out:
453*01826a49SYabin Cui curl_easy_cleanup(curl);
454*01826a49SYabin Cui curl_global_cleanup();
455*01826a49SYabin Cui return err;
456*01826a49SYabin Cui }
457*01826a49SYabin Cui
458*01826a49SYabin Cui /** Fill the path member variable of the data objects. */
data_create_paths(data_t * const * data,char const * dir)459*01826a49SYabin Cui static int data_create_paths(data_t* const* data, char const* dir) {
460*01826a49SYabin Cui size_t const dirlen = strlen(dir);
461*01826a49SYabin Cui assert(data != NULL);
462*01826a49SYabin Cui for (; *data != NULL; ++data) {
463*01826a49SYabin Cui data_t* const datum = *data;
464*01826a49SYabin Cui datum->data.path = cat3(dir, "/", datum->name);
465*01826a49SYabin Cui if (datum->data.path == NULL)
466*01826a49SYabin Cui return ENOMEM;
467*01826a49SYabin Cui if (data_has_dict(datum)) {
468*01826a49SYabin Cui datum->dict.path = cat2(datum->data.path, ".dict");
469*01826a49SYabin Cui if (datum->dict.path == NULL)
470*01826a49SYabin Cui return ENOMEM;
471*01826a49SYabin Cui }
472*01826a49SYabin Cui }
473*01826a49SYabin Cui return 0;
474*01826a49SYabin Cui }
475*01826a49SYabin Cui
476*01826a49SYabin Cui /** Free the path member variable of the data objects. */
data_free_paths(data_t * const * data)477*01826a49SYabin Cui static void data_free_paths(data_t* const* data) {
478*01826a49SYabin Cui assert(data != NULL);
479*01826a49SYabin Cui for (; *data != NULL; ++data) {
480*01826a49SYabin Cui data_t* datum = *data;
481*01826a49SYabin Cui free((void*)datum->data.path);
482*01826a49SYabin Cui free((void*)datum->dict.path);
483*01826a49SYabin Cui datum->data.path = NULL;
484*01826a49SYabin Cui datum->dict.path = NULL;
485*01826a49SYabin Cui }
486*01826a49SYabin Cui }
487*01826a49SYabin Cui
/** File name of the stamp stored in the cache directory. */
static const char kStampName[] = "STAMP";
489*01826a49SYabin Cui
/**
 * Feeds a 64-bit value into the hash in little-endian byte order, so the
 * result does not depend on the host's endianness.
 */
static void xxh_update_le(XXH64_state_t* state, uint64_t data) {
    uint64_t const le = MEM_isLittleEndian() ? data : MEM_swap64(data);
    XXH64_update(state, &le, sizeof(le));
}
495*01826a49SYabin Cui
496*01826a49SYabin Cui /** Hash the data to create the stamp. */
stamp_hash(data_t const * const * data)497*01826a49SYabin Cui static uint64_t stamp_hash(data_t const* const* data) {
498*01826a49SYabin Cui XXH64_state_t state;
499*01826a49SYabin Cui
500*01826a49SYabin Cui XXH64_reset(&state, 0);
501*01826a49SYabin Cui assert(data != NULL);
502*01826a49SYabin Cui for (; *data != NULL; ++data) {
503*01826a49SYabin Cui data_t const* datum = *data;
504*01826a49SYabin Cui /* We don't care about the URL that we fetch from. */
505*01826a49SYabin Cui /* The path is derived from the name. */
506*01826a49SYabin Cui XXH64_update(&state, datum->name, strlen(datum->name));
507*01826a49SYabin Cui xxh_update_le(&state, datum->data.xxhash64);
508*01826a49SYabin Cui xxh_update_le(&state, datum->dict.xxhash64);
509*01826a49SYabin Cui xxh_update_le(&state, datum->type);
510*01826a49SYabin Cui }
511*01826a49SYabin Cui return XXH64_digest(&state);
512*01826a49SYabin Cui }
513*01826a49SYabin Cui
514*01826a49SYabin Cui /** Check if the stamp matches the stamp in the cache directory. */
stamp_check(char const * dir,data_t const * const * data)515*01826a49SYabin Cui static int stamp_check(char const* dir, data_t const* const* data) {
516*01826a49SYabin Cui char* stamp = cat3(dir, "/", kStampName);
517*01826a49SYabin Cui uint64_t const expected = stamp_hash(data);
518*01826a49SYabin Cui XXH64_canonical_t actual;
519*01826a49SYabin Cui FILE* stampfile = NULL;
520*01826a49SYabin Cui int matches = 0;
521*01826a49SYabin Cui
522*01826a49SYabin Cui if (stamp == NULL)
523*01826a49SYabin Cui goto out;
524*01826a49SYabin Cui if (!UTIL_isRegularFile(stamp)) {
525*01826a49SYabin Cui fprintf(stderr, "stamp does not exist: recreating the data cache\n");
526*01826a49SYabin Cui goto out;
527*01826a49SYabin Cui }
528*01826a49SYabin Cui
529*01826a49SYabin Cui stampfile = fopen(stamp, "rb");
530*01826a49SYabin Cui if (stampfile == NULL) {
531*01826a49SYabin Cui fprintf(stderr, "could not open stamp: recreating the data cache\n");
532*01826a49SYabin Cui goto out;
533*01826a49SYabin Cui }
534*01826a49SYabin Cui
535*01826a49SYabin Cui size_t b;
536*01826a49SYabin Cui if ((b = fread(&actual, sizeof(actual), 1, stampfile)) != 1) {
537*01826a49SYabin Cui fprintf(stderr, "invalid stamp: recreating the data cache\n");
538*01826a49SYabin Cui goto out;
539*01826a49SYabin Cui }
540*01826a49SYabin Cui
541*01826a49SYabin Cui matches = (expected == XXH64_hashFromCanonical(&actual));
542*01826a49SYabin Cui if (matches)
543*01826a49SYabin Cui fprintf(stderr, "stamp matches: reusing the cached data\n");
544*01826a49SYabin Cui else
545*01826a49SYabin Cui fprintf(stderr, "stamp does not match: recreating the data cache\n");
546*01826a49SYabin Cui
547*01826a49SYabin Cui out:
548*01826a49SYabin Cui free(stamp);
549*01826a49SYabin Cui if (stampfile != NULL)
550*01826a49SYabin Cui fclose(stampfile);
551*01826a49SYabin Cui return matches;
552*01826a49SYabin Cui }
553*01826a49SYabin Cui
554*01826a49SYabin Cui /** On success write a new stamp, on failure delete the old stamp. */
555*01826a49SYabin Cui static int
stamp_write(char const * dir,data_t const * const * data,int const data_err)556*01826a49SYabin Cui stamp_write(char const* dir, data_t const* const* data, int const data_err) {
557*01826a49SYabin Cui char* stamp = cat3(dir, "/", kStampName);
558*01826a49SYabin Cui FILE* stampfile = NULL;
559*01826a49SYabin Cui int err = EIO;
560*01826a49SYabin Cui
561*01826a49SYabin Cui if (stamp == NULL)
562*01826a49SYabin Cui return ENOMEM;
563*01826a49SYabin Cui
564*01826a49SYabin Cui if (data_err != 0) {
565*01826a49SYabin Cui err = data_err;
566*01826a49SYabin Cui goto out;
567*01826a49SYabin Cui }
568*01826a49SYabin Cui XXH64_canonical_t hash;
569*01826a49SYabin Cui
570*01826a49SYabin Cui XXH64_canonicalFromHash(&hash, stamp_hash(data));
571*01826a49SYabin Cui
572*01826a49SYabin Cui stampfile = fopen(stamp, "wb");
573*01826a49SYabin Cui if (stampfile == NULL)
574*01826a49SYabin Cui goto out;
575*01826a49SYabin Cui if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1)
576*01826a49SYabin Cui goto out;
577*01826a49SYabin Cui err = 0;
578*01826a49SYabin Cui fprintf(stderr, "stamped new data cache\n");
579*01826a49SYabin Cui out:
580*01826a49SYabin Cui if (err != 0)
581*01826a49SYabin Cui /* Ignore errors. */
582*01826a49SYabin Cui unlink(stamp);
583*01826a49SYabin Cui free(stamp);
584*01826a49SYabin Cui if (stampfile != NULL)
585*01826a49SYabin Cui fclose(stampfile);
586*01826a49SYabin Cui return err;
587*01826a49SYabin Cui }
588*01826a49SYabin Cui
data_init(char const * dir)589*01826a49SYabin Cui int data_init(char const* dir) {
590*01826a49SYabin Cui int err;
591*01826a49SYabin Cui
592*01826a49SYabin Cui if (dir == NULL)
593*01826a49SYabin Cui return EINVAL;
594*01826a49SYabin Cui
595*01826a49SYabin Cui /* This must be first to simplify logic. */
596*01826a49SYabin Cui err = ensure_directory_exists(dir);
597*01826a49SYabin Cui if (err != 0)
598*01826a49SYabin Cui return err;
599*01826a49SYabin Cui
600*01826a49SYabin Cui /* Save the cache directory. */
601*01826a49SYabin Cui g_data_dir = strdup(dir);
602*01826a49SYabin Cui if (g_data_dir == NULL)
603*01826a49SYabin Cui return ENOMEM;
604*01826a49SYabin Cui
605*01826a49SYabin Cui err = data_create_paths(g_data, dir);
606*01826a49SYabin Cui if (err != 0)
607*01826a49SYabin Cui return err;
608*01826a49SYabin Cui
609*01826a49SYabin Cui /* If the stamp matches then we are good to go.
610*01826a49SYabin Cui * This must be called before any modifications to the data cache.
611*01826a49SYabin Cui * After this point, we MUST call stamp_write() to update the STAMP,
612*01826a49SYabin Cui * since we've updated the data cache.
613*01826a49SYabin Cui */
614*01826a49SYabin Cui if (stamp_check(dir, data))
615*01826a49SYabin Cui return 0;
616*01826a49SYabin Cui
617*01826a49SYabin Cui err = curl_download_data(data);
618*01826a49SYabin Cui if (err != 0)
619*01826a49SYabin Cui goto out;
620*01826a49SYabin Cui
621*01826a49SYabin Cui out:
622*01826a49SYabin Cui /* This must be last, since it must know if data_init() succeeded. */
623*01826a49SYabin Cui stamp_write(dir, data, err);
624*01826a49SYabin Cui return err;
625*01826a49SYabin Cui }
626*01826a49SYabin Cui
data_finish(void)627*01826a49SYabin Cui void data_finish(void) {
628*01826a49SYabin Cui data_free_paths(g_data);
629*01826a49SYabin Cui free(g_data_dir);
630*01826a49SYabin Cui g_data_dir = NULL;
631*01826a49SYabin Cui }
632