2 * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
20 #include <curl/curl.h>
24 #define XXH_STATIC_LINKING_ONLY
/* Builds the download URL of a regression-data release asset: the string
 * literal x is joined to the release prefix by string-literal concatenation,
 * so x must itself be a string literal. */
31 #define REGRESSION_RELEASE(x) \
32 "https://github.com/facebook/zstd/releases/download/regression-data/" x
36 .type
= data_type_dir
,
39 .url
= REGRESSION_RELEASE("silesia.tar.zst"),
40 .xxhash64
= 0x48a199f92f93e977LL
,
/* Data object named "silesia.tar" of type data_type_file. Its resource is
 * fetched from the silesia.tar.zst release asset; the xxhash64 value is what
 * the download code compares against the XXH64 digest of the bytes received.
 * NOTE(review): the sub-object that holds .url/.xxhash64 and the closing of
 * this initializer are not visible in this excerpt. */
44 data_t silesia_tar
= {
45 .name
= "silesia.tar",
46 .type
= data_type_file
,
49 .url
= REGRESSION_RELEASE("silesia.tar.zst"),
50 .xxhash64
= 0x48a199f92f93e977LL
,
56 .type
= data_type_dir
,
59 .url
= REGRESSION_RELEASE("github.tar.zst"),
60 .xxhash64
= 0xa9b1b44b020df292LL
,
64 .url
= REGRESSION_RELEASE("github.dict.zst"),
65 .xxhash64
= 0x1eddc6f737d3cb53LL
,
/* File-local, mutable table of pointers to the data objects handled here.
 * NOTE(review): the element list is not visible in this excerpt; the loops
 * that walk this table stop at a NULL entry, so it is presumably
 * NULL-terminated - confirm. */
70 static data_t
* g_data
[] = {
/* Externally visible, read-only view of the g_data table. */
77 data_t
const* const* data
= (data_t
const* const*)g_data
;
83 int data_has_dict(data_t
const* data
) {
84 return data
->dict
.url
!= NULL
;
88 * data buffer helper functions (documented in header).
91 data_buffer_t
data_buffer_create(size_t const capacity
) {
92 data_buffer_t buffer
= {};
94 buffer
.data
= (uint8_t*)malloc(capacity
);
95 if (buffer
.data
== NULL
)
97 buffer
.capacity
= capacity
;
/* Reads the file `filename` into a newly malloc'd data buffer.
 *
 * The size is obtained from UTIL_getFileSize(); that many bytes are
 * allocated and recorded as the capacity, the file is opened in binary mode,
 * and fread() is asked for `capacity` bytes. Each failure (unknown size,
 * allocation failure, file not opened, short read) prints a message to
 * stderr.
 *
 * NOTE(review): the statements that follow each error message, and whether
 * the FILE stream is closed, are not visible in this excerpt. The memset()
 * near the end suggests the buffer is zeroed before being returned on
 * failure - confirm, and confirm buffer.data is released on that path.
 * NOTE(review): `size` is a uint64_t handed straight to malloc() and stored
 * in capacity; where size_t is narrower than 64 bits the value would be
 * truncated silently.
 */
101 data_buffer_t
data_buffer_read(char const* filename
) {
102 data_buffer_t buffer
= {};
104 uint64_t const size
= UTIL_getFileSize(filename
);
105 if (size
== UTIL_FILESIZE_UNKNOWN
) {
106 fprintf(stderr
, "unknown size for %s\n", filename
);
110 buffer
.data
= (uint8_t*)malloc(size
);
111 if (buffer
.data
== NULL
) {
112 fprintf(stderr
, "malloc failed\n");
115 buffer
.capacity
= size
;
117 FILE* file
= fopen(filename
, "rb");
119 fprintf(stderr
, "file null\n");
/* The whole file is expected in a single fread(); anything less is an error. */
122 buffer
.size
= fread(buffer
.data
, 1, buffer
.capacity
, file
);
124 if (buffer
.size
!= buffer
.capacity
) {
125 fprintf(stderr
, "read %zu != %zu\n", buffer
.size
, buffer
.capacity
);
132 memset(&buffer
, 0, sizeof(buffer
));
136 data_buffer_t
data_buffer_get_data(data_t
const* data
) {
137 data_buffer_t
const kEmptyBuffer
= {};
139 if (data
->type
!= data_type_file
)
142 return data_buffer_read(data
->data
.path
);
145 data_buffer_t
data_buffer_get_dict(data_t
const* data
) {
146 data_buffer_t
const kEmptyBuffer
= {};
148 if (!data_has_dict(data
))
151 return data_buffer_read(data
->dict
.path
);
154 int data_buffer_compare(data_buffer_t buffer1
, data_buffer_t buffer2
) {
156 buffer1
.size
< buffer2
.size
? buffer1
.size
: buffer2
.size
;
157 int const cmp
= memcmp(buffer1
.data
, buffer2
.data
, size
);
160 if (buffer1
.size
< buffer2
.size
)
162 if (buffer1
.size
== buffer2
.size
)
164 assert(buffer1
.size
> buffer2
.size
);
/* Releases a data buffer passed by value.
 * NOTE(review): the body is not visible in this excerpt; presumably it frees
 * buffer.data - confirm. */
168 void data_buffer_free(data_buffer_t buffer
) {
173 * data filenames helpers.
/* Builds the list of file names for a data object.
 *
 * Starts from an empty descriptor (buffer NULL, size 0), takes the object's
 * data path and stores the result of UTIL_createFileList() in
 * filenames.filenames; symbolic links are not followed (followLinks is 0).
 *
 * NOTE(review): most of the UTIL_createFileList() argument list and the
 * return statement are not visible in this excerpt; presumably `path` is the
 * single input and filenames.buffer / filenames.size are filled by the call.
 */
176 data_filenames_t
data_filenames_get(data_t
const* data
) {
177 data_filenames_t filenames
= {.buffer
= NULL
, .size
= 0};
178 char const* path
= data
->data
.path
;
180 filenames
.filenames
= UTIL_createFileList(
185 /* followLinks */ 0);
/* Releases a file-name list by handing both its name table and its backing
 * buffer to UTIL_freeFileList(). */
189 void data_filenames_free(data_filenames_t filenames
) {
190 UTIL_freeFileList(filenames
.filenames
, filenames
.buffer
);
194 * data buffers helpers.
/* Loads every file that belongs to a data object into its own buffer.
 *
 * The file names come from data_filenames_get(). One data_buffer_t slot is
 * malloc'd per name and each slot is filled with data_buffer_read(). If a
 * read yields a NULL data pointer, data_buffers_free() is called on the set
 * built so far.
 *
 * NOTE(review): the return statements and the release of `filenames` are not
 * visible in this excerpt; presumably an empty set is returned for an empty
 * list, a failed allocation, or a failed read - confirm.
 * NOTE(review): the slot array comes from malloc(), so slots after the one
 * that failed are uninitialized when data_buffers_free() runs, and
 * filenames.size * sizeof(data_buffer_t) is not checked for overflow.
 */
197 data_buffers_t
data_buffers_get(data_t
const* data
) {
198 data_buffers_t buffers
= {.size
= 0};
199 data_filenames_t filenames
= data_filenames_get(data
);
200 if (filenames
.size
== 0)
203 data_buffer_t
* buffersPtr
=
204 (data_buffer_t
*)malloc(filenames
.size
* sizeof(data_buffer_t
));
205 if (buffersPtr
== NULL
)
/* buffers exposes the array as const; buffersPtr keeps write access. */
207 buffers
.buffers
= (data_buffer_t
const*)buffersPtr
;
208 buffers
.size
= filenames
.size
;
210 for (size_t i
= 0; i
< filenames
.size
; ++i
) {
211 buffersPtr
[i
] = data_buffer_read(filenames
.filenames
[i
]);
212 if (buffersPtr
[i
].data
== NULL
) {
213 data_buffers_t
const kEmptyBuffer
= {};
214 data_buffers_free(buffers
);
223 * Frees the data buffers.
225 void data_buffers_free(data_buffers_t buffers
) {
226 free((data_buffer_t
*)buffers
.buffers
);
230 * Initialization and download functions.
/* Heap copy of the cache directory path; NULL until data_init() stores a
 * strdup() of its argument here. Read by curl_data_create(). */
233 static char* g_data_dir
= NULL
;
/* Creates the directory `indir`, one path level at a time.
 *
 * Works on a strdup() copy of the path. Each pass advances `end` to the next
 * '/' or to the terminator, remembers the character found there, and calls
 * mkdir() with S_IRWXU on the path up to that point. A pass succeeds when
 * mkdir() returns 0, or when it fails with EEXIST and the path was already a
 * directory. The loop repeats until the end of the string is reached.
 *
 * NOTE(review): the declarations of `end` and `ret`, the temporary
 * termination and restoration of the string, the failure branch after the
 * "mkdir() failed" message, the release of the copy and the return value are
 * not visible in this excerpt. A NULL result from strdup() is not checked in
 * the visible lines.
 */
236 static int ensure_directory_exists(char const* indir
) {
237 char* const dir
= strdup(indir
);
245 /* Find the next directory level. */
246 for (++end
; *end
!= '\0' && *end
!= '/'; ++end
)
248 /* End the string there, make the directory, and restore the string. */
249 char const save
= *end
;
/* Sampled before mkdir() so that EEXIST can be told apart from a non-directory. */
251 int const isdir
= UTIL_isDirectory(dir
);
252 ret
= mkdir(dir
, S_IRWXU
);
254 /* It's okay if the directory already exists. */
255 if (ret
== 0 || (errno
== EEXIST
&& isdir
))
258 fprintf(stderr
, "mkdir() failed\n");
260 } while (*end
!= '\0');
/**
 * Concatenate up to 3 strings into a newly allocated buffer.
 *
 * @param str1 First string; must not be NULL.
 * @param str2 Second string; must not be NULL.
 * @param str3 Third string, or NULL to join only str1 and str2.
 * @returns A NUL-terminated string that the caller must free(), or NULL if
 *          the allocation fails.
 */
static char* cat3(char const* str1, char const* str2, char const* str3) {
    size_t const size1 = strlen(str1);
    size_t const size2 = strlen(str2);
    size_t const size3 = str3 == NULL ? 0 : strlen(str3);
    size_t const size = size1 + size2 + size3 + 1;
    char* const dst = (char*)malloc(size);
    if (dst == NULL)
        return NULL;

    /* The lengths are already known, so copy by length instead of rescanning. */
    memcpy(dst, str1, size1);
    memcpy(dst + size1, str2, size2);
    /* str3 may be NULL (see cat2()); never hand a NULL source to memcpy(). */
    if (size3 != 0)
        memcpy(dst + size1 + size2, str3, size3);
    dst[size - 1] = '\0';

    assert(strlen(dst) == size1 + size2 + size3);
    return dst;
}
/**
 * Concatenate 2 strings into a new buffer.
 *
 * Delegates to cat3() with no third string, so the result follows cat3()'s
 * contract: a heap string for the caller to free(), or NULL on failure.
 */
static char* cat2(char const* str1, char const* str2) {
    char const* const no_third_part = NULL;
    return cat3(str1, str2, no_third_part);
}
290 * State needed by the curl callback.
291 * It takes data from curl, hashes it, and writes it to the file.
295 XXH64_state_t xxhash64
;
299 /** Create the curl state.
 *
 * Resets the XXH64 state with seed 0, asserts that g_data_dir is a
 * directory, then opens a write pipe with popen() to a shell command:
 *  - for data_type_file: zstd -dqfo '<resource->path>'
 *  - in the other branch: zstd -dc | tar -x -C '<g_data_dir>'
 * The command string is built with cat3(). cdata.error is set to ENOMEM in
 * each branch and cdata.file receives the pipe.
 *
 * NOTE(review): the `type` parameter, the conditions guarding the ENOMEM
 * assignments (presumably cmd == NULL), the release of cmd, the handling of
 * a NULL pipe and the return statement are not visible in this excerpt.
 * NOTE(review): the paths are placed between single quotes without
 * escaping, so a path that contains a single quote would break the command.
 */
300 static curl_data_t
curl_data_create(
301 data_resource_t
const* resource
,
303 curl_data_t cdata
= {};
305 XXH64_reset(&cdata
.xxhash64
, 0);
307 assert(UTIL_isDirectory(g_data_dir
));
309 if (type
== data_type_file
) {
310 /* Decompress the resource and store to the path. */
311 char* cmd
= cat3("zstd -dqfo '", resource
->path
, "'");
313 cdata
.error
= ENOMEM
;
316 cdata
.file
= popen(cmd
, "w");
319 /* Decompress and extract the resource to the cache directory. */
320 char* cmd
= cat3("zstd -dc | tar -x -C '", g_data_dir
, "'");
322 cdata
.error
= ENOMEM
;
325 cdata
.file
= popen(cmd
, "w");
328 if (cdata
.file
== NULL
) {
335 /** Free the curl state. */
336 static int curl_data_free(curl_data_t cdata
) {
337 return pclose(cdata
.file
);
340 /** curl callback. Updates the hash, and writes to the file. */
341 static size_t curl_write(void* data
, size_t size
, size_t count
, void* ptr
) {
342 curl_data_t
* cdata
= (curl_data_t
*)ptr
;
343 size_t const written
= fwrite(data
, size
, count
, cdata
->file
);
344 XXH64_update(&cdata
->xxhash64
, data
, written
* size
);
/* Downloads one resource through `curl` and verifies the result.
 *
 * Visible steps, in order: set the URL and the write target (&cdata) on the
 * handle, create the curl state for the resource, run the transfer, close
 * the state, check that the expected output (a regular file for
 * data_type_file, a directory for data_type_dir) exists at resource->path,
 * and compare the XXH64 digest of the received bytes with
 * resource->xxhash64. Failures are reported on stderr.
 *
 * NOTE(review): the full parameter list, the declaration of cdata, the
 * conditions around the two failure messages and every return statement are
 * not visible in this excerpt.
 */
348 static int curl_download_resource(
350 data_resource_t
const* resource
,
353 /* Download the data. */
354 if (curl_easy_setopt(curl
, CURLOPT_URL
, resource
->url
) != 0)
356 if (curl_easy_setopt(curl
, CURLOPT_WRITEDATA
, &cdata
) != 0)
358 cdata
= curl_data_create(resource
, type
);
359 if (cdata
.error
!= 0)
361 int const curl_err
= curl_easy_perform(curl
);
/* The pipe is closed before the results are inspected. */
362 int const close_err
= curl_data_free(cdata
);
366 "downloading '%s' for '%s' failed\n",
372 fprintf(stderr
, "writing data to '%s' failed\n", resource
->path
);
375 /* Check that the output exists. */
376 if (type
== data_type_file
&& !UTIL_isRegularFile(resource
->path
)) {
377 fprintf(stderr
, "output file '%s' does not exist\n", resource
->path
);
380 if (type
== data_type_dir
&& !UTIL_isDirectory(resource
->path
)) {
382 stderr
, "output directory '%s' does not exist\n", resource
->path
);
385 /* Check that the hash matches. */
386 if (XXH64_digest(&cdata
.xxhash64
) != resource
->xxhash64
) {
389 "checksum does not match: 0x%llxLL != 0x%llxLL\n",
390 (unsigned long long)XXH64_digest(&cdata
.xxhash64
),
391 (unsigned long long)resource
->xxhash64
);
398 /** Download a single data object.
 *
 * Downloads the object's data resource with the object's own type and, when
 * data_has_dict() is true, its dictionary resource as a data_type_file.
 *
 * NOTE(review): the declaration of ret, the checks made on it after each
 * download and the return statement are not visible in this excerpt.
 */
399 static int curl_download_datum(CURL
* curl
, data_t
const* data
) {
401 ret
= curl_download_resource(curl
, &data
->data
, data
->type
);
404 if (data_has_dict(data
)) {
405 ret
= curl_download_resource(curl
, &data
->dict
, data_type_file
);
412 /** Download all the data.
 *
 * Initializes libcurl globally, creates one easy handle, turns the progress
 * meter off, enables redirect following, installs curl_write() as the write
 * callback, and downloads each entry of the NULL-terminated `data` array
 * with curl_download_datum(). The handle and the global libcurl state are
 * cleaned up at the end.
 *
 * NOTE(review): the statements taken when a call fails and the return value
 * are not visible in this excerpt. The local `cdata` is not used in any
 * visible line.
 */
413 static int curl_download_data(data_t
const* const* data
) {
414 if (curl_global_init(CURL_GLOBAL_ALL
) != 0)
417 curl_data_t cdata
= {};
418 CURL
* curl
= curl_easy_init();
424 if (curl_easy_setopt(curl
, CURLOPT_NOPROGRESS
, 1L) != 0)
426 if (curl_easy_setopt(curl
, CURLOPT_FOLLOWLOCATION
, 1L) != 0)
428 if (curl_easy_setopt(curl
, CURLOPT_WRITEFUNCTION
, curl_write
) != 0)
431 assert(data
!= NULL
);
432 for (; *data
!= NULL
; ++data
) {
433 if (curl_download_datum(curl
, *data
) != 0)
439 curl_easy_cleanup(curl
);
440 curl_global_cleanup();
444 /** Fill the path member variable of the data objects.
 *
 * For every entry of the NULL-terminated `data` array the data path becomes
 * "<dir>/<name>" (built with cat3()); entries that have a dictionary also
 * get a dict path of "<data path>.dict" (built with cat2()). Both strings
 * are heap allocations stored in the data objects.
 *
 * NOTE(review): the values returned when cat3()/cat2() yield NULL and on
 * success are not visible in this excerpt. `dirlen` is not used in any
 * visible line.
 */
445 static int data_create_paths(data_t
* const* data
, char const* dir
) {
446 size_t const dirlen
= strlen(dir
);
447 assert(data
!= NULL
);
448 for (; *data
!= NULL
; ++data
) {
449 data_t
* const datum
= *data
;
450 datum
->data
.path
= cat3(dir
, "/", datum
->name
);
451 if (datum
->data
.path
== NULL
)
453 if (data_has_dict(datum
)) {
454 datum
->dict
.path
= cat2(datum
->data
.path
, ".dict");
455 if (datum
->dict
.path
== NULL
)
462 /** Free the path member variable of the data objects. */
463 static void data_free_paths(data_t
* const* data
) {
464 assert(data
!= NULL
);
465 for (; *data
!= NULL
; ++data
) {
466 data_t
* datum
= *data
;
467 free((void*)datum
->data
.path
);
468 free((void*)datum
->dict
.path
);
469 datum
->data
.path
= NULL
;
470 datum
->dict
.path
= NULL
;
/* File name of the stamp kept inside the cache directory; stamp_check() and
 * stamp_write() join it to the directory with a "/" separator. */
474 static char const kStampName
[] = "STAMP";
476 static void xxh_update_le(XXH64_state_t
* state
, uint64_t data
) {
477 if (!MEM_isLittleEndian())
478 data
= MEM_swap64(data
);
479 XXH64_update(state
, &data
, sizeof(data
));
482 /** Hash the data to create the stamp. */
483 static uint64_t stamp_hash(data_t
const* const* data
) {
486 XXH64_reset(&state
, 0);
487 assert(data
!= NULL
);
488 for (; *data
!= NULL
; ++data
) {
489 data_t
const* datum
= *data
;
490 /* We don't care about the URL that we fetch from. */
491 /* The path is derived from the name. */
492 XXH64_update(&state
, datum
->name
, strlen(datum
->name
));
493 xxh_update_le(&state
, datum
->data
.xxhash64
);
494 xxh_update_le(&state
, datum
->dict
.xxhash64
);
495 xxh_update_le(&state
, datum
->type
);
497 return XXH64_digest(&state
);
500 /** Check if the stamp matches the stamp in the cache directory.
 *
 * Builds "<dir>/STAMP", computes the expected value with stamp_hash(), and
 * reads one XXH64_canonical_t from the stamp file. The stamp matches when
 * the expected value equals XXH64_hashFromCanonical() of what was read. A
 * missing, unopenable or short stamp file is reported on stderr as a reason
 * to recreate the data cache.
 *
 * NOTE(review): the declarations of `b` and `matches`, the jumps taken after
 * each message, the closing of the file, the release of `stamp` and the
 * return statement are not visible in this excerpt. A NULL result from
 * cat3() is not checked in the visible lines.
 */
501 static int stamp_check(char const* dir
, data_t
const* const* data
) {
502 char* stamp
= cat3(dir
, "/", kStampName
);
503 uint64_t const expected
= stamp_hash(data
);
504 XXH64_canonical_t actual
;
505 FILE* stampfile
= NULL
;
510 if (!UTIL_isRegularFile(stamp
)) {
511 fprintf(stderr
, "stamp does not exist: recreating the data cache\n");
515 stampfile
= fopen(stamp
, "rb");
516 if (stampfile
== NULL
) {
517 fprintf(stderr
, "could not open stamp: recreating the data cache\n");
522 if ((b
= fread(&actual
, sizeof(actual
), 1, stampfile
)) != 1) {
523 fprintf(stderr
, "invalid stamp: recreating the data cache\n");
527 matches
= (expected
== XXH64_hashFromCanonical(&actual
));
529 fprintf(stderr
, "stamp matches: reusing the cached data\n");
531 fprintf(stderr
, "stamp does not match: recreating the data cache\n");
535 if (stampfile
!= NULL
)
540 /** On success write a new stamp, on failure delete the old stamp.
 *
 * Builds "<dir>/STAMP". On the visible write path the stamp_hash() of `data`
 * is converted to canonical form, the stamp file is opened for binary
 * writing, the canonical hash is written as a single item, and
 * "stamped new data cache" is printed to stderr.
 *
 * NOTE(review): the return type, the use of data_err, the deletion of the
 * old stamp, the error branches, the closing of the file and the release of
 * `stamp` are not visible in this excerpt.
 */
542 stamp_write(char const* dir
, data_t
const* const* data
, int const data_err
) {
543 char* stamp
= cat3(dir
, "/", kStampName
);
544 FILE* stampfile
= NULL
;
554 XXH64_canonical_t hash
;
556 XXH64_canonicalFromHash(&hash
, stamp_hash(data
));
558 stampfile
= fopen(stamp
, "wb");
559 if (stampfile
== NULL
)
561 if (fwrite(&hash
, sizeof(hash
), 1, stampfile
) != 1)
564 fprintf(stderr
, "stamped new data cache\n");
570 if (stampfile
!= NULL
)
/* Prepares the data cache rooted at `dir`.
 *
 * Visible steps, in order: make sure the directory exists, keep a strdup()
 * copy of `dir` in g_data_dir, fill in the paths of every object in g_data,
 * consult the stamp, download the data with curl_download_data(), and
 * finally call stamp_write() with the resulting error code. `data` here is
 * the file-level table, since this function has no parameter of that name.
 *
 * NOTE(review): the declaration of err, the checks made on it between
 * steps, the branch taken when stamp_check() is true (presumably the
 * download is skipped) and the return statement are not visible in this
 * excerpt.
 */
575 int data_init(char const* dir
) {
581 /* This must be first to simplify logic. */
582 err
= ensure_directory_exists(dir
);
586 /* Save the cache directory. */
587 g_data_dir
= strdup(dir
);
588 if (g_data_dir
== NULL
)
591 err
= data_create_paths(g_data
, dir
);
595 /* If the stamp matches then we are good to go.
596 * This must be called before any modifications to the data cache.
597 * After this point, we MUST call stamp_write() to update the STAMP,
598 * since we've updated the data cache.
600 if (stamp_check(dir
, data
))
603 err
= curl_download_data(data
);
608 /* This must be last, since it must know if data_init() succeeded. */
609 stamp_write(dir
, data
, err
);
/* Releases what data_init() set up: the path strings stored in the g_data
 * objects are freed via data_free_paths().
 * NOTE(review): the function may continue past the end of this excerpt
 * (for instance to release g_data_dir) - confirm. */
613 void data_finish(void) {
614 data_free_paths(g_data
);