ceph/src/zstd/tests/regression/data.c

   1 /*
   2  * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
   3  * All rights reserved.
   4  *
   5  * This source code is licensed under both the BSD-style license (found in the
   6  * LICENSE file in the root directory of this source tree) and the GPLv2 (found
   7  * in the COPYING file in the root directory of this source tree).
   8  * You may select, at your option, one of the above-listed licenses.
   9  */
  10
  11 #include "data.h"
  12
  13 #include <assert.h>
  14 #include <errno.h>
  15 #include <stdio.h>
  16 #include <string.h>
  17
  18 #include <sys/stat.h>
  19
  20 #include <curl/curl.h>
  21
  22 #include "mem.h"
  23 #include "util.h"
  24 #define XXH_STATIC_LINKING_ONLY
  25 #include "xxhash.h"
  26
  27 /**
  28  * Data objects
  29  */
  30
  31 #define REGRESSION_RELEASE(x) \
  32     "https://github.com/facebook/zstd/releases/download/regression-data/" x
  33
  34 data_t silesia = {
  35     .name = "silesia",
  36     .type = data_type_dir,
  37     .data =
  38         {
  39             .url = REGRESSION_RELEASE("silesia.tar.zst"),
  40             .xxhash64 = 0x48a199f92f93e977LL,
  41         },
  42 };
  43
  44 data_t silesia_tar = {
  45     .name = "silesia.tar",
  46     .type = data_type_file,
  47     .data =
  48         {
  49             .url = REGRESSION_RELEASE("silesia.tar.zst"),
  50             .xxhash64 = 0x48a199f92f93e977LL,
  51         },
  52 };
  53
  54 data_t github = {
  55     .name = "github",
  56     .type = data_type_dir,
  57     .data =
  58         {
  59             .url = REGRESSION_RELEASE("github.tar.zst"),
  60             .xxhash64 = 0xa9b1b44b020df292LL,
  61         },
  62     .dict =
  63         {
  64             .url = REGRESSION_RELEASE("github.dict.zst"),
  65             .xxhash64 = 0x1eddc6f737d3cb53LL,
  66
  67         },
  68 };
  69
  70 static data_t* g_data[] = {
  71     &silesia,
  72     &silesia_tar,
  73     &github,
  74     NULL,
  75 };
  76
  77 data_t const* const* data = (data_t const* const*)g_data;
  78
  79 /**
  80  * data helpers.
  81  */
  82
  83 int data_has_dict(data_t const* data) {
  84     return data->dict.url != NULL;
  85 }
  86
  87 /**
  88  * data buffer helper functions (documented in header).
  89  */
  90
  91 data_buffer_t data_buffer_create(size_t const capacity) {
  92     data_buffer_t buffer = {};
  93
  94     buffer.data = (uint8_t*)malloc(capacity);
  95     if (buffer.data == NULL)
  96         return buffer;
  97     buffer.capacity = capacity;
  98     return buffer;
  99 }
 100
 101 data_buffer_t data_buffer_read(char const* filename) {
 102     data_buffer_t buffer = {};
 103
 104     uint64_t const size = UTIL_getFileSize(filename);
 105     if (size == UTIL_FILESIZE_UNKNOWN) {
 106         fprintf(stderr, "unknown size for %s\n", filename);
 107         return buffer;
 108     }
 109
 110     buffer.data = (uint8_t*)malloc(size);
 111     if (buffer.data == NULL) {
 112         fprintf(stderr, "malloc failed\n");
 113         return buffer;
 114     }
 115     buffer.capacity = size;
 116
 117     FILE* file = fopen(filename, "rb");
 118     if (file == NULL) {
 119         fprintf(stderr, "file null\n");
 120         goto err;
 121     }
 122     buffer.size = fread(buffer.data, 1, buffer.capacity, file);
 123     fclose(file);
 124     if (buffer.size != buffer.capacity) {
 125         fprintf(stderr, "read %zu != %zu\n", buffer.size, buffer.capacity);
 126         goto err;
 127     }
 128
 129     return buffer;
 130 err:
 131     free(buffer.data);
 132     memset(&buffer, 0, sizeof(buffer));
 133     return buffer;
 134 }
 135
 136 data_buffer_t data_buffer_get_data(data_t const* data) {
 137     data_buffer_t const kEmptyBuffer = {};
 138
 139     if (data->type != data_type_file)
 140         return kEmptyBuffer;
 141
 142     return data_buffer_read(data->data.path);
 143 }
 144
 145 data_buffer_t data_buffer_get_dict(data_t const* data) {
 146     data_buffer_t const kEmptyBuffer = {};
 147
 148     if (!data_has_dict(data))
 149         return kEmptyBuffer;
 150
 151     return data_buffer_read(data->dict.path);
 152 }
 153
 154 int data_buffer_compare(data_buffer_t buffer1, data_buffer_t buffer2) {
 155     size_t const size =
 156         buffer1.size < buffer2.size ? buffer1.size : buffer2.size;
 157     int const cmp = memcmp(buffer1.data, buffer2.data, size);
 158     if (cmp != 0)
 159         return cmp;
 160     if (buffer1.size < buffer2.size)
 161         return -1;
 162     if (buffer1.size == buffer2.size)
 163         return 0;
 164     assert(buffer1.size > buffer2.size);
 165     return 1;
 166 }
 167
 168 void data_buffer_free(data_buffer_t buffer) {
 169     free(buffer.data);
 170 }
 171
 172 /**
 173  * data filenames helpers.
 174  */
 175
 176 data_filenames_t data_filenames_get(data_t const* data) {
 177     data_filenames_t filenames = {.buffer = NULL, .size = 0};
 178     char const* path = data->data.path;
 179
 180     filenames.filenames = UTIL_createFileList(
 181         &path,
 182         1,
 183         &filenames.buffer,
 184         &filenames.size,
 185         /* followLinks */ 0);
 186     return filenames;
 187 }
 188
 189 void data_filenames_free(data_filenames_t filenames) {
 190     UTIL_freeFileList(filenames.filenames, filenames.buffer);
 191 }
 192
 193 /**
 194  * data buffers helpers.
 195  */
 196
 197 data_buffers_t data_buffers_get(data_t const* data) {
 198     data_buffers_t buffers = {.size = 0};
 199     data_filenames_t filenames = data_filenames_get(data);
 200     if (filenames.size == 0)
 201         return buffers;
 202
 203     data_buffer_t* buffersPtr =
 204         (data_buffer_t*)malloc(filenames.size * sizeof(data_buffer_t));
 205     if (buffersPtr == NULL)
 206         return buffers;
 207     buffers.buffers = (data_buffer_t const*)buffersPtr;
 208     buffers.size = filenames.size;
 209
 210     for (size_t i = 0; i < filenames.size; ++i) {
 211         buffersPtr[i] = data_buffer_read(filenames.filenames[i]);
 212         if (buffersPtr[i].data == NULL) {
 213             data_buffers_t const kEmptyBuffer = {};
 214             data_buffers_free(buffers);
 215             return kEmptyBuffer;
 216         }
 217     }
 218
 219     return buffers;
 220 }
 221
 222 /**
 223  * Frees the data buffers.
 224  */
 225 void data_buffers_free(data_buffers_t buffers) {
 226     free((data_buffer_t*)buffers.buffers);
 227 }
 228
 229 /**
 230  * Initialization and download functions.
 231  */
 232
/* Heap copy of the cache directory: set by data_init(), freed and reset to
 * NULL by data_finish(). */
static char* g_data_dir = NULL;
 234
 235 /* mkdir -p */
 236 static int ensure_directory_exists(char const* indir) {
 237     char* const dir = strdup(indir);
 238     char* end = dir;
 239     int ret = 0;
 240     if (dir == NULL) {
 241         ret = EINVAL;
 242         goto out;
 243     }
 244     do {
 245         /* Find the next directory level. */
 246         for (++end; *end != '\0' && *end != '/'; ++end)
 247             ;
 248         /* End the string there, make the directory, and restore the string. */
 249         char const save = *end;
 250         *end = '\0';
 251         int const isdir = UTIL_isDirectory(dir);
 252         ret = mkdir(dir, S_IRWXU);
 253         *end = save;
 254         /* Its okay if the directory already exists. */
 255         if (ret == 0 || (errno == EEXIST && isdir))
 256             continue;
 257         ret = errno;
 258         fprintf(stderr, "mkdir() failed\n");
 259         goto out;
 260     } while (*end != '\0');
 261
 262     ret = 0;
 263 out:
 264     free(dir);
 265     return ret;
 266 }
 267
 268 /** Concatenate 3 strings into a new buffer. */
 269 static char* cat3(char const* str1, char const* str2, char const* str3) {
 270     size_t const size1 = strlen(str1);
 271     size_t const size2 = strlen(str2);
 272     size_t const size3 = str3 == NULL ? 0 : strlen(str3);
 273     size_t const size = size1 + size2 + size3 + 1;
 274     char* const dst = (char*)malloc(size);
 275     if (dst == NULL)
 276         return NULL;
 277     strcpy(dst, str1);
 278     strcpy(dst + size1, str2);
 279     if (str3 != NULL)
 280         strcpy(dst + size1 + size2, str3);
 281     assert(strlen(dst) == size1 + size2 + size3);
 282     return dst;
 283 }
 284
 285 static char* cat2(char const* str1, char const* str2) {
 286     return cat3(str1, str2, NULL);
 287 }
 288
 289 /**
 290  * State needed by the curl callback.
 291  * It takes data from curl, hashes it, and writes it to the file.
 292  */
typedef struct {
    /* Write end of the popen()ed command that consumes the download. */
    FILE* file;
    /* Running XXH64 state over the bytes written to `file`. */
    XXH64_state_t xxhash64;
    /* 0 when the state is usable, otherwise an errno-style code set by
     * curl_data_create(). */
    int error;
} curl_data_t;
 298
 299 /** Create the curl state. */
 300 static curl_data_t curl_data_create(
 301     data_resource_t const* resource,
 302     data_type_t type) {
 303     curl_data_t cdata = {};
 304
 305     XXH64_reset(&cdata.xxhash64, 0);
 306
 307     assert(UTIL_isDirectory(g_data_dir));
 308
 309     if (type == data_type_file) {
 310         /* Decompress the resource and store to the path. */
 311         char* cmd = cat3("zstd -dqfo '", resource->path, "'");
 312         if (cmd == NULL) {
 313             cdata.error = ENOMEM;
 314             return cdata;
 315         }
 316         cdata.file = popen(cmd, "w");
 317         free(cmd);
 318     } else {
 319         /* Decompress and extract the resource to the cache directory. */
 320         char* cmd = cat3("zstd -dc | tar -x -C '", g_data_dir, "'");
 321         if (cmd == NULL) {
 322             cdata.error = ENOMEM;
 323             return cdata;
 324         }
 325         cdata.file = popen(cmd, "w");
 326         free(cmd);
 327     }
 328     if (cdata.file == NULL) {
 329         cdata.error = errno;
 330     }
 331
 332     return cdata;
 333 }
 334
 335 /** Free the curl state. */
 336 static int curl_data_free(curl_data_t cdata) {
 337     return pclose(cdata.file);
 338 }
 339
 340 /** curl callback. Updates the hash, and writes to the file. */
 341 static size_t curl_write(void* data, size_t size, size_t count, void* ptr) {
 342     curl_data_t* cdata = (curl_data_t*)ptr;
 343     size_t const written = fwrite(data, size, count, cdata->file);
 344     XXH64_update(&cdata->xxhash64, data, written * size);
 345     return written;
 346 }
 347
 348 static int curl_download_resource(
 349     CURL* curl,
 350     data_resource_t const* resource,
 351     data_type_t type) {
 352     curl_data_t cdata;
 353     /* Download the data. */
 354     if (curl_easy_setopt(curl, CURLOPT_URL, resource->url) != 0)
 355         return EINVAL;
 356     if (curl_easy_setopt(curl, CURLOPT_WRITEDATA, &cdata) != 0)
 357         return EINVAL;
 358     cdata = curl_data_create(resource, type);
 359     if (cdata.error != 0)
 360         return cdata.error;
 361     int const curl_err = curl_easy_perform(curl);
 362     int const close_err = curl_data_free(cdata);
 363     if (curl_err) {
 364         fprintf(
 365             stderr,
 366             "downloading '%s' for '%s' failed\n",
 367             resource->url,
 368             resource->path);
 369         return EIO;
 370     }
 371     if (close_err) {
 372         fprintf(stderr, "writing data to '%s' failed\n", resource->path);
 373         return EIO;
 374     }
 375     /* check that the file exists. */
 376     if (type == data_type_file && !UTIL_isRegularFile(resource->path)) {
 377         fprintf(stderr, "output file '%s' does not exist\n", resource->path);
 378         return EIO;
 379     }
 380     if (type == data_type_dir && !UTIL_isDirectory(resource->path)) {
 381         fprintf(
 382             stderr, "output directory '%s' does not exist\n", resource->path);
 383         return EIO;
 384     }
 385     /* Check that the hash matches. */
 386     if (XXH64_digest(&cdata.xxhash64) != resource->xxhash64) {
 387         fprintf(
 388             stderr,
 389             "checksum does not match: 0x%llxLL != 0x%llxLL\n",
 390             (unsigned long long)XXH64_digest(&cdata.xxhash64),
 391             (unsigned long long)resource->xxhash64);
 392         return EINVAL;
 393     }
 394
 395     return 0;
 396 }
 397
 398 /** Download a single data object. */
 399 static int curl_download_datum(CURL* curl, data_t const* data) {
 400     int ret;
 401     ret = curl_download_resource(curl, &data->data, data->type);
 402     if (ret != 0)
 403         return ret;
 404     if (data_has_dict(data)) {
 405         ret = curl_download_resource(curl, &data->dict, data_type_file);
 406         if (ret != 0)
 407             return ret;
 408     }
 409     return ret;
 410 }
 411
 412 /** Download all the data. */
 413 static int curl_download_data(data_t const* const* data) {
 414     if (curl_global_init(CURL_GLOBAL_ALL) != 0)
 415         return EFAULT;
 416
 417     curl_data_t cdata = {};
 418     CURL* curl = curl_easy_init();
 419     int err = EFAULT;
 420
 421     if (curl == NULL)
 422         return EFAULT;
 423
 424     if (curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L) != 0)
 425         goto out;
 426     if (curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L) != 0)
 427         goto out;
 428     if (curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, curl_write) != 0)
 429         goto out;
 430
 431     assert(data != NULL);
 432     for (; *data != NULL; ++data) {
 433         if (curl_download_datum(curl, *data) != 0)
 434             goto out;
 435     }
 436
 437     err = 0;
 438 out:
 439     curl_easy_cleanup(curl);
 440     curl_global_cleanup();
 441     return err;
 442 }
 443
 444 /** Fill the path member variable of the data objects. */
 445 static int data_create_paths(data_t* const* data, char const* dir) {
 446     size_t const dirlen = strlen(dir);
 447     assert(data != NULL);
 448     for (; *data != NULL; ++data) {
 449         data_t* const datum = *data;
 450         datum->data.path = cat3(dir, "/", datum->name);
 451         if (datum->data.path == NULL)
 452             return ENOMEM;
 453         if (data_has_dict(datum)) {
 454             datum->dict.path = cat2(datum->data.path, ".dict");
 455             if (datum->dict.path == NULL)
 456                 return ENOMEM;
 457         }
 458     }
 459     return 0;
 460 }
 461
 462 /** Free the path member variable of the data objects. */
 463 static void data_free_paths(data_t* const* data) {
 464     assert(data != NULL);
 465     for (; *data != NULL; ++data) {
 466         data_t* datum = *data;
 467         free((void*)datum->data.path);
 468         free((void*)datum->dict.path);
 469         datum->data.path = NULL;
 470         datum->dict.path = NULL;
 471     }
 472 }
 473
/* File name of the stamp, stored directly inside the cache directory. */
static char const kStampName[] = "STAMP";
 475
 476 static void xxh_update_le(XXH64_state_t* state, uint64_t data) {
 477     if (!MEM_isLittleEndian())
 478         data = MEM_swap64(data);
 479     XXH64_update(state, &data, sizeof(data));
 480 }
 481
 482 /** Hash the data to create the stamp. */
 483 static uint64_t stamp_hash(data_t const* const* data) {
 484     XXH64_state_t state;
 485
 486     XXH64_reset(&state, 0);
 487     assert(data != NULL);
 488     for (; *data != NULL; ++data) {
 489         data_t const* datum = *data;
 490         /* We don't care about the URL that we fetch from. */
 491         /* The path is derived from the name. */
 492         XXH64_update(&state, datum->name, strlen(datum->name));
 493         xxh_update_le(&state, datum->data.xxhash64);
 494         xxh_update_le(&state, datum->dict.xxhash64);
 495         xxh_update_le(&state, datum->type);
 496     }
 497     return XXH64_digest(&state);
 498 }
 499
 500 /** Check if the stamp matches the stamp in the cache directory. */
 501 static int stamp_check(char const* dir, data_t const* const* data) {
 502     char* stamp = cat3(dir, "/", kStampName);
 503     uint64_t const expected = stamp_hash(data);
 504     XXH64_canonical_t actual;
 505     FILE* stampfile = NULL;
 506     int matches = 0;
 507
 508     if (stamp == NULL)
 509         goto out;
 510     if (!UTIL_isRegularFile(stamp)) {
 511         fprintf(stderr, "stamp does not exist: recreating the data cache\n");
 512         goto out;
 513     }
 514
 515     stampfile = fopen(stamp, "rb");
 516     if (stampfile == NULL) {
 517         fprintf(stderr, "could not open stamp: recreating the data cache\n");
 518         goto out;
 519     }
 520
 521     size_t b;
 522     if ((b = fread(&actual, sizeof(actual), 1, stampfile)) != 1) {
 523         fprintf(stderr, "invalid stamp: recreating the data cache\n");
 524         goto out;
 525     }
 526
 527     matches = (expected == XXH64_hashFromCanonical(&actual));
 528     if (matches)
 529         fprintf(stderr, "stamp matches: reusing the cached data\n");
 530     else
 531         fprintf(stderr, "stamp does not match: recreating the data cache\n");
 532
 533 out:
 534     free(stamp);
 535     if (stampfile != NULL)
 536         fclose(stampfile);
 537     return matches;
 538 }
 539
 540 /** On success write a new stamp, on failure delete the old stamp. */
 541 static int
 542 stamp_write(char const* dir, data_t const* const* data, int const data_err) {
 543     char* stamp = cat3(dir, "/", kStampName);
 544     FILE* stampfile = NULL;
 545     int err = EIO;
 546
 547     if (stamp == NULL)
 548         return ENOMEM;
 549
 550     if (data_err != 0) {
 551         err = data_err;
 552         goto out;
 553     }
 554     XXH64_canonical_t hash;
 555
 556     XXH64_canonicalFromHash(&hash, stamp_hash(data));
 557
 558     stampfile = fopen(stamp, "wb");
 559     if (stampfile == NULL)
 560         goto out;
 561     if (fwrite(&hash, sizeof(hash), 1, stampfile) != 1)
 562         goto out;
 563     err = 0;
 564     fprintf(stderr, "stamped new data cache\n");
 565 out:
 566     if (err != 0)
 567         /* Ignore errors. */
 568         unlink(stamp);
 569     free(stamp);
 570     if (stampfile != NULL)
 571         fclose(stampfile);
 572     return err;
 573 }
 574
 575 int data_init(char const* dir) {
 576     int err;
 577
 578     if (dir == NULL)
 579         return EINVAL;
 580
 581     /* This must be first to simplify logic. */
 582     err = ensure_directory_exists(dir);
 583     if (err != 0)
 584         return err;
 585
 586     /* Save the cache directory. */
 587     g_data_dir = strdup(dir);
 588     if (g_data_dir == NULL)
 589         return ENOMEM;
 590
 591     err = data_create_paths(g_data, dir);
 592     if (err != 0)
 593         return err;
 594
 595     /* If the stamp matches then we are good to go.
 596      * This must be called before any modifications to the data cache.
 597      * After this point, we MUST call stamp_write() to update the STAMP,
 598      * since we've updated the data cache.
 599      */
 600     if (stamp_check(dir, data))
 601         return 0;
 602
 603     err = curl_download_data(data);
 604     if (err != 0)
 605         goto out;
 606
 607 out:
 608     /* This must be last, since it must know if data_init() succeeded. */
 609     stamp_write(dir, data, err);
 610     return err;
 611 }
 612
 613 void data_finish(void) {
 614     data_free_paths(g_data);
 615     free(g_data_dir);
 616     g_data_dir = NULL;
 617 }